unicodeobject.c revision dc724d6e35aaf76a291f4b17b59516adcf3c9e98
1/* 2 3Unicode implementation based on original code by Fredrik Lundh, 4modified by Marc-Andre Lemburg <mal@lemburg.com> according to the 5Unicode Integration Proposal (see file Misc/unicode.txt). 6 7Copyright (c) Corporation for National Research Initiatives. 8 9-------------------------------------------------------------------- 10The original string type implementation is: 11 12 Copyright (c) 1999 by Secret Labs AB 13 Copyright (c) 1999 by Fredrik Lundh 14 15By obtaining, using, and/or copying this software and/or its 16associated documentation, you agree that you have read, understood, 17and will comply with the following terms and conditions: 18 19Permission to use, copy, modify, and distribute this software and its 20associated documentation for any purpose and without fee is hereby 21granted, provided that the above copyright notice appears in all 22copies, and that both that copyright notice and this permission notice 23appear in supporting documentation, and that the name of Secret Labs 24AB or the author not be used in advertising or publicity pertaining to 25distribution of the software without specific, written prior 26permission. 27 28SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO 29THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND 30FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR 31ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 32WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 33ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT 34OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 35-------------------------------------------------------------------- 36 37*/ 38 39#include "Python.h" 40 41#include "unicodeobject.h" 42#include "ucnhash.h" 43 44#ifdef MS_WIN32 45#include <windows.h> 46#endif 47 48/* Limit for the Unicode object free list */ 49 50#define MAX_UNICODE_FREELIST_SIZE 1024 51 52/* Limit for the Unicode object free list stay alive optimization. 53 54 The implementation will keep allocated Unicode memory intact for 55 all objects on the free list having a size less than this 56 limit. This reduces malloc() overhead for small Unicode objects. 57 58 At worst this will result in MAX_UNICODE_FREELIST_SIZE * 59 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT + 60 malloc()-overhead) bytes of unused garbage. 61 62 Setting the limit to 0 effectively turns the feature off. 63 64 Note: This is an experimental feature ! If you get core dumps when 65 using Unicode objects, turn this feature off. 66 67*/ 68 69#define KEEPALIVE_SIZE_LIMIT 9 70 71/* Endianness switches; defaults to little endian */ 72 73#ifdef WORDS_BIGENDIAN 74# define BYTEORDER_IS_BIG_ENDIAN 75#else 76# define BYTEORDER_IS_LITTLE_ENDIAN 77#endif 78 79/* --- Globals ------------------------------------------------------------ 80 81 The globals are initialized by the _PyUnicode_Init() API and should 82 not be used before calling that API. 83 84*/ 85 86/* Free list for Unicode objects */ 87static PyUnicodeObject *unicode_freelist; 88static int unicode_freelist_size; 89 90/* The empty Unicode object is shared to improve performance. */ 91static PyUnicodeObject *unicode_empty; 92 93/* Single character Unicode strings in the Latin-1 range are being 94 shared as well. */ 95static PyUnicodeObject *unicode_latin1[256]; 96 97/* Default encoding to use and assume when NULL is passed as encoding 98 parameter; it is initialized by _PyUnicode_Init(). 99 100 Always use the PyUnicode_SetDefaultEncoding() and 101 PyUnicode_GetDefaultEncoding() APIs to access this global. 102 103*/ 104static char unicode_default_encoding[100]; 105 106Py_UNICODE 107PyUnicode_GetMax(void) 108{ 109#ifdef Py_UNICODE_WIDE 110 return 0x10FFFF; 111#else 112 /* This is actually an illegal character, so it should 113 not be passed to unichr. */ 114 return 0xFFFF; 115#endif 116} 117 118/* --- Unicode Object ----------------------------------------------------- */ 119 120static 121int unicode_resize(register PyUnicodeObject *unicode, 122 int length) 123{ 124 void *oldstr; 125 126 /* Shortcut if there's nothing much to do. */ 127 if (unicode->length == length) 128 goto reset; 129 130 /* Resizing shared object (unicode_empty or single character 131 objects) in-place is not allowed. Use PyUnicode_Resize() 132 instead ! */ 133 if (unicode == unicode_empty || 134 (unicode->length == 1 && 135 unicode->str[0] < 256 && 136 unicode_latin1[unicode->str[0]] == unicode)) { 137 PyErr_SetString(PyExc_SystemError, 138 "can't resize shared unicode objects"); 139 return -1; 140 } 141 142 /* We allocate one more byte to make sure the string is 143 Ux0000 terminated -- XXX is this needed ? */ 144 oldstr = unicode->str; 145 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1); 146 if (!unicode->str) { 147 unicode->str = oldstr; 148 PyErr_NoMemory(); 149 return -1; 150 } 151 unicode->str[length] = 0; 152 unicode->length = length; 153 154 reset: 155 /* Reset the object caches */ 156 if (unicode->defenc) { 157 Py_DECREF(unicode->defenc); 158 unicode->defenc = NULL; 159 } 160 unicode->hash = -1; 161 162 return 0; 163} 164 165/* We allocate one more byte to make sure the string is 166 Ux0000 terminated -- XXX is this needed ? 167 168 XXX This allocator could further be enhanced by assuring that the 169 free list never reduces its size below 1. 170 171*/ 172 173static 174PyUnicodeObject *_PyUnicode_New(int length) 175{ 176 register PyUnicodeObject *unicode; 177 178 /* Optimization for empty strings */ 179 if (length == 0 && unicode_empty != NULL) { 180 Py_INCREF(unicode_empty); 181 return unicode_empty; 182 } 183 184 /* Unicode freelist & memory allocation */ 185 if (unicode_freelist) { 186 unicode = unicode_freelist; 187 unicode_freelist = *(PyUnicodeObject **)unicode; 188 unicode_freelist_size--; 189 if (unicode->str) { 190 /* Keep-Alive optimization: we only upsize the buffer, 191 never downsize it. */ 192 if ((unicode->length < length) && 193 unicode_resize(unicode, length)) { 194 PyMem_DEL(unicode->str); 195 goto onError; 196 } 197 } 198 else { 199 unicode->str = PyMem_NEW(Py_UNICODE, length + 1); 200 } 201 PyObject_INIT(unicode, &PyUnicode_Type); 202 } 203 else { 204 unicode = PyObject_NEW(PyUnicodeObject, &PyUnicode_Type); 205 if (unicode == NULL) 206 return NULL; 207 unicode->str = PyMem_NEW(Py_UNICODE, length + 1); 208 } 209 210 if (!unicode->str) { 211 PyErr_NoMemory(); 212 goto onError; 213 } 214 unicode->str[length] = 0; 215 unicode->length = length; 216 unicode->hash = -1; 217 unicode->defenc = NULL; 218 return unicode; 219 220 onError: 221 _Py_ForgetReference((PyObject *)unicode); 222 PyObject_DEL(unicode); 223 return NULL; 224} 225 226static 227void unicode_dealloc(register PyUnicodeObject *unicode) 228{ 229 if (PyUnicode_CheckExact(unicode) && 230 unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) { 231 /* Keep-Alive optimization */ 232 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) { 233 PyMem_DEL(unicode->str); 234 unicode->str = NULL; 235 unicode->length = 0; 236 } 237 if (unicode->defenc) { 238 Py_DECREF(unicode->defenc); 239 unicode->defenc = NULL; 240 } 241 /* Add to free list */ 242 *(PyUnicodeObject **)unicode = unicode_freelist; 243 unicode_freelist = unicode; 244 unicode_freelist_size++; 245 } 246 else { 247 PyMem_DEL(unicode->str); 248 Py_XDECREF(unicode->defenc); 249 unicode->ob_type->tp_free((PyObject *)unicode); 250 } 251} 252 253int PyUnicode_Resize(PyObject **unicode, 254 int length) 255{ 256 register PyUnicodeObject *v; 257 258 /* Argument checks */ 259 if (unicode == NULL) { 260 PyErr_BadInternalCall(); 261 return -1; 262 } 263 v = (PyUnicodeObject *)*unicode; 264 if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1) { 265 PyErr_BadInternalCall(); 266 return -1; 267 } 268 269 /* Resizing unicode_empty and single character objects is not 270 possible since these are being shared. We simply return a fresh 271 copy with the same Unicode content. */ 272 if (v->length != length && 273 (v == unicode_empty || v->length == 1)) { 274 PyUnicodeObject *w = _PyUnicode_New(length); 275 if (w == NULL) 276 return -1; 277 Py_UNICODE_COPY(w->str, v->str, 278 length < v->length ? length : v->length); 279 *unicode = (PyObject *)w; 280 return 0; 281 } 282 283 /* Note that we don't have to modify *unicode for unshared Unicode 284 objects, since we can modify them in-place. */ 285 return unicode_resize(v, length); 286} 287 288/* Internal API for use in unicodeobject.c only ! */ 289#define _PyUnicode_Resize(unicodevar, length) \ 290 PyUnicode_Resize(((PyObject **)(unicodevar)), length) 291 292PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u, 293 int size) 294{ 295 PyUnicodeObject *unicode; 296 297 /* If the Unicode data is known at construction time, we can apply 298 some optimizations which share commonly used objects. */ 299 if (u != NULL) { 300 301 /* Optimization for empty strings */ 302 if (size == 0 && unicode_empty != NULL) { 303 Py_INCREF(unicode_empty); 304 return (PyObject *)unicode_empty; 305 } 306 307 /* Single character Unicode objects in the Latin-1 range are 308 shared when using this constructor */ 309 if (size == 1 && *u < 256) { 310 unicode = unicode_latin1[*u]; 311 if (!unicode) { 312 unicode = _PyUnicode_New(1); 313 if (!unicode) 314 return NULL; 315 unicode->str[0] = *u; 316 unicode_latin1[*u] = unicode; 317 } 318 Py_INCREF(unicode); 319 return (PyObject *)unicode; 320 } 321 } 322 323 unicode = _PyUnicode_New(size); 324 if (!unicode) 325 return NULL; 326 327 /* Copy the Unicode data into the new object */ 328 if (u != NULL) 329 Py_UNICODE_COPY(unicode->str, u, size); 330 331 return (PyObject *)unicode; 332} 333 334#ifdef HAVE_WCHAR_H 335 336PyObject *PyUnicode_FromWideChar(register const wchar_t *w, 337 int size) 338{ 339 PyUnicodeObject *unicode; 340 341 if (w == NULL) { 342 PyErr_BadInternalCall(); 343 return NULL; 344 } 345 346 unicode = _PyUnicode_New(size); 347 if (!unicode) 348 return NULL; 349 350 /* Copy the wchar_t data into the new object */ 351#ifdef HAVE_USABLE_WCHAR_T 352 memcpy(unicode->str, w, size * sizeof(wchar_t)); 353#else 354 { 355 register Py_UNICODE *u; 356 register int i; 357 u = PyUnicode_AS_UNICODE(unicode); 358 for (i = size; i >= 0; i--) 359 *u++ = *w++; 360 } 361#endif 362 363 return (PyObject *)unicode; 364} 365 366int PyUnicode_AsWideChar(PyUnicodeObject *unicode, 367 register wchar_t *w, 368 int size) 369{ 370 if (unicode == NULL) { 371 PyErr_BadInternalCall(); 372 return -1; 373 } 374 if (size > PyUnicode_GET_SIZE(unicode)) 375 size = PyUnicode_GET_SIZE(unicode); 376#ifdef HAVE_USABLE_WCHAR_T 377 memcpy(w, unicode->str, size * sizeof(wchar_t)); 378#else 379 { 380 register Py_UNICODE *u; 381 register int i; 382 u = PyUnicode_AS_UNICODE(unicode); 383 for (i = size; i >= 0; i--) 384 *w++ = *u++; 385 } 386#endif 387 388 return size; 389} 390 391#endif 392 393PyObject *PyUnicode_FromObject(register PyObject *obj) 394{ 395 /* XXX Perhaps we should make this API an alias of 396 PyObject_Unicode() instead ?! */ 397 if (PyUnicode_CheckExact(obj)) { 398 Py_INCREF(obj); 399 return obj; 400 } 401 if (PyUnicode_Check(obj)) { 402 /* For a Unicode subtype that's not a Unicode object, 403 return a true Unicode object with the same data. */ 404 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj), 405 PyUnicode_GET_SIZE(obj)); 406 } 407 return PyUnicode_FromEncodedObject(obj, NULL, "strict"); 408} 409 410PyObject *PyUnicode_FromEncodedObject(register PyObject *obj, 411 const char *encoding, 412 const char *errors) 413{ 414 const char *s = NULL; 415 int len; 416 int owned = 0; 417 PyObject *v; 418 419 if (obj == NULL) { 420 PyErr_BadInternalCall(); 421 return NULL; 422 } 423 424#if 0 425 /* For b/w compatibility we also accept Unicode objects provided 426 that no encodings is given and then redirect to 427 PyObject_Unicode() which then applies the additional logic for 428 Unicode subclasses. 429 430 NOTE: This API should really only be used for object which 431 represent *encoded* Unicode ! 432 433 */ 434 if (PyUnicode_Check(obj)) { 435 if (encoding) { 436 PyErr_SetString(PyExc_TypeError, 437 "decoding Unicode is not supported"); 438 return NULL; 439 } 440 return PyObject_Unicode(obj); 441 } 442#else 443 if (PyUnicode_Check(obj)) { 444 PyErr_SetString(PyExc_TypeError, 445 "decoding Unicode is not supported"); 446 return NULL; 447 } 448#endif 449 450 /* Coerce object */ 451 if (PyString_Check(obj)) { 452 s = PyString_AS_STRING(obj); 453 len = PyString_GET_SIZE(obj); 454 } 455 else if (PyObject_AsCharBuffer(obj, &s, &len)) { 456 /* Overwrite the error message with something more useful in 457 case of a TypeError. */ 458 if (PyErr_ExceptionMatches(PyExc_TypeError)) 459 PyErr_Format(PyExc_TypeError, 460 "coercing to Unicode: need string or buffer, " 461 "%.80s found", 462 obj->ob_type->tp_name); 463 goto onError; 464 } 465 466 /* Convert to Unicode */ 467 if (len == 0) { 468 Py_INCREF(unicode_empty); 469 v = (PyObject *)unicode_empty; 470 } 471 else 472 v = PyUnicode_Decode(s, len, encoding, errors); 473 474 if (owned) { 475 Py_DECREF(obj); 476 } 477 return v; 478 479 onError: 480 if (owned) { 481 Py_DECREF(obj); 482 } 483 return NULL; 484} 485 486PyObject *PyUnicode_Decode(const char *s, 487 int size, 488 const char *encoding, 489 const char *errors) 490{ 491 PyObject *buffer = NULL, *unicode; 492 493 if (encoding == NULL) 494 encoding = PyUnicode_GetDefaultEncoding(); 495 496 /* Shortcuts for common default encodings */ 497 if (strcmp(encoding, "utf-8") == 0) 498 return PyUnicode_DecodeUTF8(s, size, errors); 499 else if (strcmp(encoding, "latin-1") == 0) 500 return PyUnicode_DecodeLatin1(s, size, errors); 501 else if (strcmp(encoding, "ascii") == 0) 502 return PyUnicode_DecodeASCII(s, size, errors); 503 504 /* Decode via the codec registry */ 505 buffer = PyBuffer_FromMemory((void *)s, size); 506 if (buffer == NULL) 507 goto onError; 508 unicode = PyCodec_Decode(buffer, encoding, errors); 509 if (unicode == NULL) 510 goto onError; 511 if (!PyUnicode_Check(unicode)) { 512 PyErr_Format(PyExc_TypeError, 513 "decoder did not return an unicode object (type=%.400s)", 514 unicode->ob_type->tp_name); 515 Py_DECREF(unicode); 516 goto onError; 517 } 518 Py_DECREF(buffer); 519 return unicode; 520 521 onError: 522 Py_XDECREF(buffer); 523 return NULL; 524} 525 526PyObject *PyUnicode_Encode(const Py_UNICODE *s, 527 int size, 528 const char *encoding, 529 const char *errors) 530{ 531 PyObject *v, *unicode; 532 533 unicode = PyUnicode_FromUnicode(s, size); 534 if (unicode == NULL) 535 return NULL; 536 v = PyUnicode_AsEncodedString(unicode, encoding, errors); 537 Py_DECREF(unicode); 538 return v; 539} 540 541PyObject *PyUnicode_AsEncodedString(PyObject *unicode, 542 const char *encoding, 543 const char *errors) 544{ 545 PyObject *v; 546 547 if (!PyUnicode_Check(unicode)) { 548 PyErr_BadArgument(); 549 goto onError; 550 } 551 552 if (encoding == NULL) 553 encoding = PyUnicode_GetDefaultEncoding(); 554 555 /* Shortcuts for common default encodings */ 556 if (errors == NULL) { 557 if (strcmp(encoding, "utf-8") == 0) 558 return PyUnicode_AsUTF8String(unicode); 559 else if (strcmp(encoding, "latin-1") == 0) 560 return PyUnicode_AsLatin1String(unicode); 561 else if (strcmp(encoding, "ascii") == 0) 562 return PyUnicode_AsASCIIString(unicode); 563 } 564 565 /* Encode via the codec registry */ 566 v = PyCodec_Encode(unicode, encoding, errors); 567 if (v == NULL) 568 goto onError; 569 /* XXX Should we really enforce this ? */ 570 if (!PyString_Check(v)) { 571 PyErr_Format(PyExc_TypeError, 572 "encoder did not return a string object (type=%.400s)", 573 v->ob_type->tp_name); 574 Py_DECREF(v); 575 goto onError; 576 } 577 return v; 578 579 onError: 580 return NULL; 581} 582 583PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode, 584 const char *errors) 585{ 586 PyObject *v = ((PyUnicodeObject *)unicode)->defenc; 587 588 if (v) 589 return v; 590 v = PyUnicode_AsEncodedString(unicode, NULL, errors); 591 if (v && errors == NULL) 592 ((PyUnicodeObject *)unicode)->defenc = v; 593 return v; 594} 595 596Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode) 597{ 598 if (!PyUnicode_Check(unicode)) { 599 PyErr_BadArgument(); 600 goto onError; 601 } 602 return PyUnicode_AS_UNICODE(unicode); 603 604 onError: 605 return NULL; 606} 607 608int PyUnicode_GetSize(PyObject *unicode) 609{ 610 if (!PyUnicode_Check(unicode)) { 611 PyErr_BadArgument(); 612 goto onError; 613 } 614 return PyUnicode_GET_SIZE(unicode); 615 616 onError: 617 return -1; 618} 619 620const char *PyUnicode_GetDefaultEncoding(void) 621{ 622 return unicode_default_encoding; 623} 624 625int PyUnicode_SetDefaultEncoding(const char *encoding) 626{ 627 PyObject *v; 628 629 /* Make sure the encoding is valid. As side effect, this also 630 loads the encoding into the codec registry cache. */ 631 v = _PyCodec_Lookup(encoding); 632 if (v == NULL) 633 goto onError; 634 Py_DECREF(v); 635 strncpy(unicode_default_encoding, 636 encoding, 637 sizeof(unicode_default_encoding)); 638 return 0; 639 640 onError: 641 return -1; 642} 643 644/* --- UTF-7 Codec -------------------------------------------------------- */ 645 646/* see RFC2152 for details */ 647 648static 649char utf7_special[128] = { 650 /* indicate whether a UTF-7 character is special i.e. cannot be directly 651 encoded: 652 0 - not special 653 1 - special 654 2 - whitespace (optional) 655 3 - RFC2152 Set O (optional) */ 656 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1, 657 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 658 2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1, 659 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0, 660 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 661 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3, 662 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 663 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1, 664 665}; 666 667#define SPECIAL(c, encodeO, encodeWS) \ 668 (((c)>127 || utf7_special[(c)] == 1) || \ 669 (encodeWS && (utf7_special[(c)] == 2)) || \ 670 (encodeO && (utf7_special[(c)] == 3))) 671 672#define B64(n) ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f]) 673#define B64CHAR(c) (isalnum(c) || (c) == '+' || (c) == '/') 674#define UB64(c) ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ? \ 675 (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4) 676 677#define ENCODE(out, ch, bits) \ 678 while (bits >= 6) { \ 679 *out++ = B64(ch >> (bits-6)); \ 680 bits -= 6; \ 681 } 682 683#define DECODE(out, ch, bits, surrogate) \ 684 while (bits >= 16) { \ 685 Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff); \ 686 bits -= 16; \ 687 if (surrogate) { \ 688 /* We have already generated an error for the high surrogate 689 so let's not bother seeing if the low surrogate is correct or not */\ 690 surrogate = 0; \ 691 } else if (0xDC00 <= outCh && outCh <= 0xDFFF) { \ 692 /* This is a surrogate pair. Unfortunately we can't represent \ 693 it in a 16-bit character */ \ 694 surrogate = 1; \ 695 errmsg = "code pairs are not supported"; \ 696 goto utf7Error; \ 697 } else { \ 698 *out++ = outCh; \ 699 } \ 700 } \ 701 702static 703int utf7_decoding_error(Py_UNICODE **dest, 704 const char *errors, 705 const char *details) 706{ 707 if ((errors == NULL) || 708 (strcmp(errors,"strict") == 0)) { 709 PyErr_Format(PyExc_UnicodeError, 710 "UTF-7 decoding error: %.400s", 711 details); 712 return -1; 713 } 714 else if (strcmp(errors,"ignore") == 0) { 715 return 0; 716 } 717 else if (strcmp(errors,"replace") == 0) { 718 if (dest != NULL) { 719 **dest = Py_UNICODE_REPLACEMENT_CHARACTER; 720 (*dest)++; 721 } 722 return 0; 723 } 724 else { 725 PyErr_Format(PyExc_ValueError, 726 "UTF-7 decoding error; unknown error handling code: %.400s", 727 errors); 728 return -1; 729 } 730} 731 732PyObject *PyUnicode_DecodeUTF7(const char *s, 733 int size, 734 const char *errors) 735{ 736 const char *e; 737 PyUnicodeObject *unicode; 738 Py_UNICODE *p; 739 const char *errmsg = ""; 740 int inShift = 0; 741 unsigned int bitsleft = 0; 742 unsigned long charsleft = 0; 743 int surrogate = 0; 744 745 unicode = _PyUnicode_New(size); 746 if (!unicode) 747 return NULL; 748 if (size == 0) 749 return (PyObject *)unicode; 750 751 p = unicode->str; 752 e = s + size; 753 754 while (s < e) { 755 Py_UNICODE ch = *s; 756 757 if (inShift) { 758 if ((ch == '-') || !B64CHAR(ch)) { 759 inShift = 0; 760 s++; 761 762 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate); 763 if (bitsleft >= 6) { 764 /* The shift sequence has a partial character in it. If 765 bitsleft < 6 then we could just classify it as padding 766 but that is not the case here */ 767 768 errmsg = "partial character in shift sequence"; 769 goto utf7Error; 770 } 771 /* According to RFC2152 the remaining bits should be zero. We 772 choose to signal an error/insert a replacement character 773 here so indicate the potential of a misencoded character. */ 774 775 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */ 776 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) { 777 errmsg = "non-zero padding bits in shift sequence"; 778 goto utf7Error; 779 } 780 781 if (ch == '-') { 782 if ((s < e) && (*(s) == '-')) { 783 *p++ = '-'; 784 inShift = 1; 785 } 786 } else if (SPECIAL(ch,0,0)) { 787 errmsg = "unexpected special character"; 788 goto utf7Error; 789 } else { 790 *p++ = ch; 791 } 792 } else { 793 charsleft = (charsleft << 6) | UB64(ch); 794 bitsleft += 6; 795 s++; 796 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate); 797 } 798 } 799 else if ( ch == '+' ) { 800 s++; 801 if (s < e && *s == '-') { 802 s++; 803 *p++ = '+'; 804 } else 805 { 806 inShift = 1; 807 bitsleft = 0; 808 } 809 } 810 else if (SPECIAL(ch,0,0)) { 811 errmsg = "unexpected special character"; 812 s++; 813 goto utf7Error; 814 } 815 else { 816 *p++ = ch; 817 s++; 818 } 819 continue; 820 utf7Error: 821 if (utf7_decoding_error(&p, errors, errmsg)) 822 goto onError; 823 } 824 825 if (inShift) { 826 if (utf7_decoding_error(&p, errors, "unterminated shift sequence")) 827 goto onError; 828 } 829 830 if (_PyUnicode_Resize(&unicode, p - unicode->str)) 831 goto onError; 832 833 return (PyObject *)unicode; 834 835onError: 836 Py_DECREF(unicode); 837 return NULL; 838} 839 840 841PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s, 842 int size, 843 int encodeSetO, 844 int encodeWhiteSpace, 845 const char *errors) 846{ 847 PyObject *v; 848 /* It might be possible to tighten this worst case */ 849 unsigned int cbAllocated = 5 * size; 850 int inShift = 0; 851 int i = 0; 852 unsigned int bitsleft = 0; 853 unsigned long charsleft = 0; 854 char * out; 855 char * start; 856 857 if (size == 0) 858 return PyString_FromStringAndSize(NULL, 0); 859 860 v = PyString_FromStringAndSize(NULL, cbAllocated); 861 if (v == NULL) 862 return NULL; 863 864 start = out = PyString_AS_STRING(v); 865 for (;i < size; ++i) { 866 Py_UNICODE ch = s[i]; 867 868 if (!inShift) { 869 if (ch == '+') { 870 *out++ = '+'; 871 *out++ = '-'; 872 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) { 873 charsleft = ch; 874 bitsleft = 16; 875 *out++ = '+'; 876 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft); 877 inShift = bitsleft > 0; 878 } else { 879 *out++ = (char) ch; 880 } 881 } else { 882 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) { 883 *out++ = B64(charsleft << (6-bitsleft)); 884 charsleft = 0; 885 bitsleft = 0; 886 /* Characters not in the BASE64 set implicitly unshift the sequence 887 so no '-' is required, except if the character is itself a '-' */ 888 if (B64CHAR(ch) || ch == '-') { 889 *out++ = '-'; 890 } 891 inShift = 0; 892 *out++ = (char) ch; 893 } else { 894 bitsleft += 16; 895 charsleft = (charsleft << 16) | ch; 896 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft); 897 898 /* If the next character is special then we dont' need to terminate 899 the shift sequence. If the next character is not a BASE64 character 900 or '-' then the shift sequence will be terminated implicitly and we 901 don't have to insert a '-'. */ 902 903 if (bitsleft == 0) { 904 if (i + 1 < size) { 905 Py_UNICODE ch2 = s[i+1]; 906 907 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) { 908 909 } else if (B64CHAR(ch2) || ch2 == '-') { 910 *out++ = '-'; 911 inShift = 0; 912 } else { 913 inShift = 0; 914 } 915 916 } 917 else { 918 *out++ = '-'; 919 inShift = 0; 920 } 921 } 922 } 923 } 924 } 925 if (bitsleft) { 926 *out++= B64(charsleft << (6-bitsleft) ); 927 *out++ = '-'; 928 } 929 930 if (_PyString_Resize(&v, out - start)) { 931 Py_DECREF(v); 932 return NULL; 933 } 934 return v; 935} 936 937#undef SPECIAL 938#undef B64 939#undef B64CHAR 940#undef UB64 941#undef ENCODE 942#undef DECODE 943 944/* --- UTF-8 Codec -------------------------------------------------------- */ 945 946static 947char utf8_code_length[256] = { 948 /* Map UTF-8 encoded prefix byte to sequence length. zero means 949 illegal prefix. see RFC 2279 for details */ 950 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 951 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 952 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 953 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 954 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 955 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 956 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 957 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 958 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 959 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 960 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 961 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 962 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 963 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 964 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 965 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0 966}; 967 968static 969int utf8_decoding_error(const char **source, 970 Py_UNICODE **dest, 971 const char *errors, 972 const char *details) 973{ 974 if ((errors == NULL) || 975 (strcmp(errors,"strict") == 0)) { 976 PyErr_Format(PyExc_UnicodeError, 977 "UTF-8 decoding error: %.400s", 978 details); 979 return -1; 980 } 981 else if (strcmp(errors,"ignore") == 0) { 982 (*source)++; 983 return 0; 984 } 985 else if (strcmp(errors,"replace") == 0) { 986 (*source)++; 987 **dest = Py_UNICODE_REPLACEMENT_CHARACTER; 988 (*dest)++; 989 return 0; 990 } 991 else { 992 PyErr_Format(PyExc_ValueError, 993 "UTF-8 decoding error; unknown error handling code: %.400s", 994 errors); 995 return -1; 996 } 997} 998 999PyObject *PyUnicode_DecodeUTF8(const char *s, 1000 int size, 1001 const char *errors) 1002{ 1003 int n; 1004 const char *e; 1005 PyUnicodeObject *unicode; 1006 Py_UNICODE *p; 1007 const char *errmsg = ""; 1008 1009 /* Note: size will always be longer than the resulting Unicode 1010 character count */ 1011 unicode = _PyUnicode_New(size); 1012 if (!unicode) 1013 return NULL; 1014 if (size == 0) 1015 return (PyObject *)unicode; 1016 1017 /* Unpack UTF-8 encoded data */ 1018 p = unicode->str; 1019 e = s + size; 1020 1021 while (s < e) { 1022 Py_UCS4 ch = (unsigned char)*s; 1023 1024 if (ch < 0x80) { 1025 *p++ = (Py_UNICODE)ch; 1026 s++; 1027 continue; 1028 } 1029 1030 n = utf8_code_length[ch]; 1031 1032 if (s + n > e) { 1033 errmsg = "unexpected end of data"; 1034 goto utf8Error; 1035 } 1036 1037 switch (n) { 1038 1039 case 0: 1040 errmsg = "unexpected code byte"; 1041 goto utf8Error; 1042 1043 case 1: 1044 errmsg = "internal error"; 1045 goto utf8Error; 1046 1047 case 2: 1048 if ((s[1] & 0xc0) != 0x80) { 1049 errmsg = "invalid data"; 1050 goto utf8Error; 1051 } 1052 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f); 1053 if (ch < 0x80) { 1054 errmsg = "illegal encoding"; 1055 goto utf8Error; 1056 } 1057 else 1058 *p++ = (Py_UNICODE)ch; 1059 break; 1060 1061 case 3: 1062 if ((s[1] & 0xc0) != 0x80 || 1063 (s[2] & 0xc0) != 0x80) { 1064 errmsg = "invalid data"; 1065 goto utf8Error; 1066 } 1067 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f); 1068 if (ch < 0x800 || (ch >= 0xd800 && ch < 0xe000)) { 1069 errmsg = "illegal encoding"; 1070 goto utf8Error; 1071 } 1072 else 1073 *p++ = (Py_UNICODE)ch; 1074 break; 1075 1076 case 4: 1077 if ((s[1] & 0xc0) != 0x80 || 1078 (s[2] & 0xc0) != 0x80 || 1079 (s[3] & 0xc0) != 0x80) { 1080 errmsg = "invalid data"; 1081 goto utf8Error; 1082 } 1083 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) + 1084 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f); 1085 /* validate and convert to UTF-16 */ 1086 if ((ch < 0x10000) /* minimum value allowed for 4 1087 byte encoding */ 1088 || (ch > 0x10ffff)) /* maximum value allowed for 1089 UTF-16 */ 1090 { 1091 errmsg = "illegal encoding"; 1092 goto utf8Error; 1093 } 1094#ifdef Py_UNICODE_WIDE 1095 *p++ = (Py_UNICODE)ch; 1096#else 1097 /* compute and append the two surrogates: */ 1098 1099 /* translate from 10000..10FFFF to 0..FFFF */ 1100 ch -= 0x10000; 1101 1102 /* high surrogate = top 10 bits added to D800 */ 1103 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10)); 1104 1105 /* low surrogate = bottom 10 bits added to DC00 */ 1106 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF)); 1107#endif 1108 break; 1109 1110 default: 1111 /* Other sizes are only needed for UCS-4 */ 1112 errmsg = "unsupported Unicode code range"; 1113 goto utf8Error; 1114 } 1115 s += n; 1116 continue; 1117 1118 utf8Error: 1119 if (utf8_decoding_error(&s, &p, errors, errmsg)) 1120 goto onError; 1121 } 1122 1123 /* Adjust length */ 1124 if (_PyUnicode_Resize(&unicode, p - unicode->str)) 1125 goto onError; 1126 1127 return (PyObject *)unicode; 1128 1129onError: 1130 Py_DECREF(unicode); 1131 return NULL; 1132} 1133 1134/* Not used anymore, now that the encoder supports UTF-16 1135 surrogates. */ 1136#if 0 1137static 1138int utf8_encoding_error(const Py_UNICODE **source, 1139 char **dest, 1140 const char *errors, 1141 const char *details) 1142{ 1143 if ((errors == NULL) || 1144 (strcmp(errors,"strict") == 0)) { 1145 PyErr_Format(PyExc_UnicodeError, 1146 "UTF-8 encoding error: %.400s", 1147 details); 1148 return -1; 1149 } 1150 else if (strcmp(errors,"ignore") == 0) { 1151 return 0; 1152 } 1153 else if (strcmp(errors,"replace") == 0) { 1154 **dest = '?'; 1155 (*dest)++; 1156 return 0; 1157 } 1158 else { 1159 PyErr_Format(PyExc_ValueError, 1160 "UTF-8 encoding error; " 1161 "unknown error handling code: %.400s", 1162 errors); 1163 return -1; 1164 } 1165} 1166#endif 1167 1168PyObject *PyUnicode_EncodeUTF8(const Py_UNICODE *s, 1169 int size, 1170 const char *errors) 1171{ 1172 PyObject *v; 1173 char *p; 1174 unsigned int cbAllocated = 2 * size; 1175 unsigned int cbWritten = 0; 1176 int i = 0; 1177 1178 v = PyString_FromStringAndSize(NULL, cbAllocated + 4); 1179 if (v == NULL) 1180 return NULL; 1181 if (size == 0) 1182 return v; 1183 1184 p = PyString_AS_STRING(v); 1185 while (i < size) { 1186 Py_UCS4 ch = s[i++]; 1187 1188 if (ch < 0x80) { 1189 *p++ = (char) ch; 1190 cbWritten++; 1191 } 1192 1193 else if (ch < 0x0800) { 1194 *p++ = (char)(0xc0 | (ch >> 6)); 1195 *p++ = (char)(0x80 | (ch & 0x3f)); 1196 cbWritten += 2; 1197 } 1198 1199 else { 1200 1201 /* Assure that we have enough room for high order Unicode 1202 ordinals */ 1203 if (cbWritten >= cbAllocated) { 1204 cbAllocated += 4 * 10; 1205 if (_PyString_Resize(&v, cbAllocated + 4)) 1206 goto onError; 1207 p = PyString_AS_STRING(v) + cbWritten; 1208 } 1209 1210 if (ch < 0x10000) { 1211 /* Check for high surrogate */ 1212 if (0xD800 <= ch && ch <= 0xDBFF && i != size) { 1213 Py_UCS4 ch2 = s[i]; 1214 /* Check for low surrogate */ 1215 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) { 1216 ch = ((ch - 0xD800)<<10 | (ch2-0xDC00))+0x10000; 1217 *p++ = (char)((ch >> 18) | 0xf0); 1218 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f)); 1219 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); 1220 *p++ = (char)(0x80 | (ch & 0x3f)); 1221 i++; 1222 cbWritten += 4; 1223 continue; 1224 } 1225 /* Fall through: handles isolated high surrogates */ 1226 } 1227 *p++ = (char)(0xe0 | (ch >> 12)); 1228 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); 1229 *p++ = (char)(0x80 | (ch & 0x3f)); 1230 cbWritten += 3; 1231 1232 } else { 1233 *p++ = (char)(0xf0 | (ch>>18)); 1234 *p++ = (char)(0x80 | ((ch>>12) & 0x3f)); 1235 *p++ = (char)(0x80 | ((ch>>6) & 0x3f)); 1236 *p++ = (char)(0x80 | (ch & 0x3f)); 1237 cbWritten += 4; 1238 } 1239 } 1240 } 1241 *p = '\0'; 1242 if (_PyString_Resize(&v, cbWritten)) 1243 goto onError; 1244 return v; 1245 1246 onError: 1247 Py_DECREF(v); 1248 return NULL; 1249} 1250 1251PyObject *PyUnicode_AsUTF8String(PyObject *unicode) 1252{ 1253 if (!PyUnicode_Check(unicode)) { 1254 PyErr_BadArgument(); 1255 return NULL; 1256 } 1257 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode), 1258 PyUnicode_GET_SIZE(unicode), 1259 NULL); 1260} 1261 1262/* --- UTF-16 Codec ------------------------------------------------------- */ 1263 1264static 1265int utf16_decoding_error(Py_UNICODE **dest, 1266 const char *errors, 1267 const char *details) 1268{ 1269 if ((errors == NULL) || 1270 (strcmp(errors,"strict") == 0)) { 1271 PyErr_Format(PyExc_UnicodeError, 1272 "UTF-16 decoding error: %.400s", 1273 details); 1274 return -1; 1275 } 1276 else if (strcmp(errors,"ignore") == 0) { 1277 return 0; 1278 } 1279 else if (strcmp(errors,"replace") == 0) { 1280 if (dest) { 1281 **dest = Py_UNICODE_REPLACEMENT_CHARACTER; 1282 (*dest)++; 1283 } 1284 return 0; 1285 } 1286 else { 1287 PyErr_Format(PyExc_ValueError, 1288 "UTF-16 decoding error; " 1289 "unknown error handling code: %.400s", 1290 errors); 1291 return -1; 1292 } 1293} 1294 1295PyObject * 1296PyUnicode_DecodeUTF16(const char *s, 1297 int size, 1298 const char *errors, 1299 int *byteorder) 1300{ 1301 PyUnicodeObject *unicode; 1302 Py_UNICODE *p; 1303 const unsigned char *q, *e; 1304 int bo = 0; /* assume native ordering by default */ 1305 const char *errmsg = ""; 1306 /* Offsets from q for retrieving byte pairs in the right order. */ 1307#ifdef BYTEORDER_IS_LITTLE_ENDIAN 1308 int ihi = 1, ilo = 0; 1309#else 1310 int ihi = 0, ilo = 1; 1311#endif 1312 1313 /* size should be an even number */ 1314 if (size & 1) { 1315 if (utf16_decoding_error(NULL, errors, "truncated data")) 1316 return NULL; 1317 --size; /* else ignore the oddball byte */ 1318 } 1319 1320 /* Note: size will always be longer than the resulting Unicode 1321 character count */ 1322 unicode = _PyUnicode_New(size); 1323 if (!unicode) 1324 return NULL; 1325 if (size == 0) 1326 return (PyObject *)unicode; 1327 1328 /* Unpack UTF-16 encoded data */ 1329 p = unicode->str; 1330 q = (unsigned char *)s; 1331 e = q + size; 1332 1333 if (byteorder) 1334 bo = *byteorder; 1335 1336 /* Check for BOM marks (U+FEFF) in the input and adjust current 1337 byte order setting accordingly. In native mode, the leading BOM 1338 mark is skipped, in all other modes, it is copied to the output 1339 stream as-is (giving a ZWNBSP character). */ 1340 if (bo == 0) { 1341 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo]; 1342#ifdef BYTEORDER_IS_LITTLE_ENDIAN 1343 if (bom == 0xFEFF) { 1344 q += 2; 1345 bo = -1; 1346 } 1347 else if (bom == 0xFFFE) { 1348 q += 2; 1349 bo = 1; 1350 } 1351#else 1352 if (bom == 0xFEFF) { 1353 q += 2; 1354 bo = 1; 1355 } 1356 else if (bom == 0xFFFE) { 1357 q += 2; 1358 bo = -1; 1359 } 1360#endif 1361 } 1362 1363 if (bo == -1) { 1364 /* force LE */ 1365 ihi = 1; 1366 ilo = 0; 1367 } 1368 else if (bo == 1) { 1369 /* force BE */ 1370 ihi = 0; 1371 ilo = 1; 1372 } 1373 1374 while (q < e) { 1375 Py_UNICODE ch = (q[ihi] << 8) | q[ilo]; 1376 q += 2; 1377 1378 if (ch < 0xD800 || ch > 0xDFFF) { 1379 *p++ = ch; 1380 continue; 1381 } 1382 1383 /* UTF-16 code pair: */ 1384 if (q >= e) { 1385 errmsg = "unexpected end of data"; 1386 goto utf16Error; 1387 } 1388 if (0xD800 <= ch && ch <= 0xDBFF) { 1389 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo]; 1390 q += 2; 1391 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) { 1392#ifndef Py_UNICODE_WIDE 1393 *p++ = ch; 1394 *p++ = ch2; 1395#else 1396 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000; 1397#endif 1398 continue; 1399 } 1400 else { 1401 errmsg = "illegal UTF-16 surrogate"; 1402 goto utf16Error; 1403 } 1404 1405 } 1406 errmsg = "illegal encoding"; 1407 /* Fall through to report the error */ 1408 1409 utf16Error: 1410 if (utf16_decoding_error(&p, errors, errmsg)) 1411 goto onError; 1412 } 1413 1414 if (byteorder) 1415 *byteorder = bo; 1416 1417 /* Adjust length */ 1418 if (_PyUnicode_Resize(&unicode, p - unicode->str)) 1419 goto onError; 1420 1421 return (PyObject *)unicode; 1422 1423onError: 1424 Py_DECREF(unicode); 1425 return NULL; 1426} 1427 1428PyObject * 1429PyUnicode_EncodeUTF16(const Py_UNICODE *s, 1430 int size, 1431 const char *errors, 1432 int byteorder) 1433{ 1434 PyObject *v; 1435 unsigned char *p; 1436 int i, pairs; 1437 /* Offsets from p for storing byte pairs in the right order. */ 1438#ifdef BYTEORDER_IS_LITTLE_ENDIAN 1439 int ihi = 1, ilo = 0; 1440#else 1441 int ihi = 0, ilo = 1; 1442#endif 1443 1444#define STORECHAR(CH) \ 1445 do { \ 1446 p[ihi] = ((CH) >> 8) & 0xff; \ 1447 p[ilo] = (CH) & 0xff; \ 1448 p += 2; \ 1449 } while(0) 1450 1451 for (i = pairs = 0; i < size; i++) 1452 if (s[i] >= 0x10000) 1453 pairs++; 1454 v = PyString_FromStringAndSize(NULL, 1455 2 * (size + pairs + (byteorder == 0))); 1456 if (v == NULL) 1457 return NULL; 1458 1459 p = (unsigned char *)PyString_AS_STRING(v); 1460 if (byteorder == 0) 1461 STORECHAR(0xFEFF); 1462 if (size == 0) 1463 return v; 1464 1465 if (byteorder == -1) { 1466 /* force LE */ 1467 ihi = 1; 1468 ilo = 0; 1469 } 1470 else if (byteorder == 1) { 1471 /* force BE */ 1472 ihi = 0; 1473 ilo = 1; 1474 } 1475 1476 while (size-- > 0) { 1477 Py_UNICODE ch = *s++; 1478 Py_UNICODE ch2 = 0; 1479 if (ch >= 0x10000) { 1480 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF); 1481 ch = 0xD800 | ((ch-0x10000) >> 10); 1482 } 1483 STORECHAR(ch); 1484 if (ch2) 1485 STORECHAR(ch2); 1486 } 1487 return v; 1488#undef STORECHAR 1489} 1490 1491PyObject *PyUnicode_AsUTF16String(PyObject *unicode) 1492{ 1493 if (!PyUnicode_Check(unicode)) { 1494 PyErr_BadArgument(); 1495 return NULL; 1496 } 1497 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode), 1498 PyUnicode_GET_SIZE(unicode), 1499 NULL, 1500 0); 1501} 1502 1503/* --- Unicode Escape Codec ----------------------------------------------- */ 1504 1505static 1506int unicodeescape_decoding_error(const char **source, 1507 Py_UNICODE *x, 1508 const char *errors, 1509 const char *details) 1510{ 1511 if ((errors == NULL) || 1512 (strcmp(errors,"strict") == 0)) { 1513 PyErr_Format(PyExc_UnicodeError, 1514 "Unicode-Escape decoding error: %.400s", 1515 details); 1516 return -1; 1517 } 1518 else if (strcmp(errors,"ignore") == 0) { 1519 return 0; 1520 } 1521 else if (strcmp(errors,"replace") == 0) { 1522 *x = Py_UNICODE_REPLACEMENT_CHARACTER; 1523 return 0; 1524 } 1525 else { 1526 PyErr_Format(PyExc_ValueError, 1527 "Unicode-Escape decoding error; " 1528 "unknown error handling code: %.400s", 1529 errors); 1530 return -1; 1531 } 1532} 1533 1534static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL; 1535 1536PyObject *PyUnicode_DecodeUnicodeEscape(const char *s, 1537 int size, 1538 const char *errors) 1539{ 1540 PyUnicodeObject *v; 1541 Py_UNICODE *p, *buf; 1542 const char *end; 1543 char* message; 1544 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */ 1545 1546 /* Escaped strings will always be longer than the resulting 1547 Unicode string, so we start with size here and then reduce the 1548 length after conversion to the true value. */ 1549 v = _PyUnicode_New(size); 1550 if (v == NULL) 1551 goto onError; 1552 if (size == 0) 1553 return (PyObject *)v; 1554 1555 p = buf = PyUnicode_AS_UNICODE(v); 1556 end = s + size; 1557 1558 while (s < end) { 1559 unsigned char c; 1560 Py_UNICODE x; 1561 int i, digits; 1562 1563 /* Non-escape characters are interpreted as Unicode ordinals */ 1564 if (*s != '\\') { 1565 *p++ = (unsigned char) *s++; 1566 continue; 1567 } 1568 1569 /* \ - Escapes */ 1570 s++; 1571 switch (*s++) { 1572 1573 /* \x escapes */ 1574 case '\n': break; 1575 case '\\': *p++ = '\\'; break; 1576 case '\'': *p++ = '\''; break; 1577 case '\"': *p++ = '\"'; break; 1578 case 'b': *p++ = '\b'; break; 1579 case 'f': *p++ = '\014'; break; /* FF */ 1580 case 't': *p++ = '\t'; break; 1581 case 'n': *p++ = '\n'; break; 1582 case 'r': *p++ = '\r'; break; 1583 case 'v': *p++ = '\013'; break; /* VT */ 1584 case 'a': *p++ = '\007'; break; /* BEL, not classic C */ 1585 1586 /* \OOO (octal) escapes */ 1587 case '0': case '1': case '2': case '3': 1588 case '4': case '5': case '6': case '7': 1589 x = s[-1] - '0'; 1590 if ('0' <= *s && *s <= '7') { 1591 x = (x<<3) + *s++ - '0'; 1592 if ('0' <= *s && *s <= '7') 1593 x = (x<<3) + *s++ - '0'; 1594 } 1595 *p++ = x; 1596 break; 1597 1598 /* hex escapes */ 1599 /* \xXX */ 1600 case 'x': 1601 digits = 2; 1602 message = "truncated \\xXX escape"; 1603 goto hexescape; 1604 1605 /* \uXXXX */ 1606 case 'u': 1607 digits = 4; 1608 message = "truncated \\uXXXX escape"; 1609 goto hexescape; 1610 1611 /* \UXXXXXXXX */ 1612 case 'U': 1613 digits = 8; 1614 message = "truncated \\UXXXXXXXX escape"; 1615 hexescape: 1616 chr = 0; 1617 for (i = 0; i < digits; i++) { 1618 c = (unsigned char) s[i]; 1619 if (!isxdigit(c)) { 1620 if (unicodeescape_decoding_error(&s, &x, errors, message)) 1621 goto onError; 1622 chr = x; 1623 i++; 1624 break; 1625 } 1626 chr = (chr<<4) & ~0xF; 1627 if (c >= '0' && c <= '9') 1628 chr += c - '0'; 1629 else if (c >= 'a' && c <= 'f') 1630 chr += 10 + c - 'a'; 1631 else 1632 chr += 10 + c - 'A'; 1633 } 1634 s += i; 1635 store: 1636 /* when we get here, chr is a 32-bit unicode character */ 1637 if (chr <= 0xffff) 1638 /* UCS-2 character */ 1639 *p++ = (Py_UNICODE) chr; 1640 else if (chr <= 0x10ffff) { 1641 /* UCS-4 character. Either store directly, or as 1642 surrogate pair. */ 1643#ifdef Py_UNICODE_WIDE 1644 *p++ = chr; 1645#else 1646 chr -= 0x10000L; 1647 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10); 1648 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF); 1649#endif 1650 } else { 1651 if (unicodeescape_decoding_error( 1652 &s, &x, errors, 1653 "illegal Unicode character") 1654 ) 1655 goto onError; 1656 *p++ = x; /* store replacement character */ 1657 } 1658 break; 1659 1660 /* \N{name} */ 1661 case 'N': 1662 message = "malformed \\N character escape"; 1663 if (ucnhash_CAPI == NULL) { 1664 /* load the unicode data module */ 1665 PyObject *m, *v; 1666 m = PyImport_ImportModule("unicodedata"); 1667 if (m == NULL) 1668 goto ucnhashError; 1669 v = PyObject_GetAttrString(m, "ucnhash_CAPI"); 1670 Py_DECREF(m); 1671 if (v == NULL) 1672 goto ucnhashError; 1673 ucnhash_CAPI = PyCObject_AsVoidPtr(v); 1674 Py_DECREF(v); 1675 if (ucnhash_CAPI == NULL) 1676 goto ucnhashError; 1677 } 1678 if (*s == '{') { 1679 const char *start = s+1; 1680 /* look for the closing brace */ 1681 while (*s != '}' && s < end) 1682 s++; 1683 if (s > start && s < end && *s == '}') { 1684 /* found a name. look it up in the unicode database */ 1685 message = "unknown Unicode character name"; 1686 s++; 1687 if (ucnhash_CAPI->getcode(start, s-start-1, &chr)) 1688 goto store; 1689 } 1690 } 1691 if (unicodeescape_decoding_error(&s, &x, errors, message)) 1692 goto onError; 1693 *p++ = x; 1694 break; 1695 1696 default: 1697 *p++ = '\\'; 1698 *p++ = (unsigned char)s[-1]; 1699 break; 1700 } 1701 } 1702 if (_PyUnicode_Resize(&v, (int)(p - buf))) 1703 goto onError; 1704 return (PyObject *)v; 1705 1706ucnhashError: 1707 PyErr_SetString( 1708 PyExc_UnicodeError, 1709 "\\N escapes not supported (can't load unicodedata module)" 1710 ); 1711 return NULL; 1712 1713onError: 1714 Py_XDECREF(v); 1715 return NULL; 1716} 1717 1718/* Return a Unicode-Escape string version of the Unicode object. 1719 1720 If quotes is true, the string is enclosed in u"" or u'' quotes as 1721 appropriate. 1722 1723*/ 1724 1725static const Py_UNICODE *findchar(const Py_UNICODE *s, 1726 int size, 1727 Py_UNICODE ch); 1728 1729static 1730PyObject *unicodeescape_string(const Py_UNICODE *s, 1731 int size, 1732 int quotes) 1733{ 1734 PyObject *repr; 1735 char *p; 1736 1737 static const char *hexdigit = "0123456789abcdef"; 1738 1739 repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1); 1740 if (repr == NULL) 1741 return NULL; 1742 1743 p = PyString_AS_STRING(repr); 1744 1745 if (quotes) { 1746 *p++ = 'u'; 1747 *p++ = (findchar(s, size, '\'') && 1748 !findchar(s, size, '"')) ? '"' : '\''; 1749 } 1750 while (size-- > 0) { 1751 Py_UNICODE ch = *s++; 1752 1753 /* Escape quotes */ 1754 if (quotes && 1755 (ch == (Py_UNICODE) PyString_AS_STRING(repr)[1] || ch == '\\')) { 1756 *p++ = '\\'; 1757 *p++ = (char) ch; 1758 continue; 1759 } 1760 1761#ifdef Py_UNICODE_WIDE 1762 /* Map 21-bit characters to '\U00xxxxxx' */ 1763 else if (ch >= 0x10000) { 1764 int offset = p - PyString_AS_STRING(repr); 1765 1766 /* Resize the string if necessary */ 1767 if (offset + 12 > PyString_GET_SIZE(repr)) { 1768 if (_PyString_Resize(&repr, PyString_GET_SIZE(repr) + 100)) 1769 goto onError; 1770 p = PyString_AS_STRING(repr) + offset; 1771 } 1772 1773 *p++ = '\\'; 1774 *p++ = 'U'; 1775 *p++ = hexdigit[(ch >> 28) & 0x0000000F]; 1776 *p++ = hexdigit[(ch >> 24) & 0x0000000F]; 1777 *p++ = hexdigit[(ch >> 20) & 0x0000000F]; 1778 *p++ = hexdigit[(ch >> 16) & 0x0000000F]; 1779 *p++ = hexdigit[(ch >> 12) & 0x0000000F]; 1780 *p++ = hexdigit[(ch >> 8) & 0x0000000F]; 1781 *p++ = hexdigit[(ch >> 4) & 0x0000000F]; 1782 *p++ = hexdigit[ch & 0x0000000F]; 1783 continue; 1784 } 1785#endif 1786 /* Map UTF-16 surrogate pairs to Unicode \UXXXXXXXX escapes */ 1787 else if (ch >= 0xD800 && ch < 0xDC00) { 1788 Py_UNICODE ch2; 1789 Py_UCS4 ucs; 1790 1791 ch2 = *s++; 1792 size--; 1793 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) { 1794 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000; 1795 *p++ = '\\'; 1796 *p++ = 'U'; 1797 *p++ = hexdigit[(ucs >> 28) & 0x0000000F]; 1798 *p++ = hexdigit[(ucs >> 24) & 0x0000000F]; 1799 *p++ = hexdigit[(ucs >> 20) & 0x0000000F]; 1800 *p++ = hexdigit[(ucs >> 16) & 0x0000000F]; 1801 *p++ = hexdigit[(ucs >> 12) & 0x0000000F]; 1802 *p++ = hexdigit[(ucs >> 8) & 0x0000000F]; 1803 *p++ = hexdigit[(ucs >> 4) & 0x0000000F]; 1804 *p++ = hexdigit[ucs & 0x0000000F]; 1805 continue; 1806 } 1807 /* Fall through: isolated surrogates are copied as-is */ 1808 s--; 1809 size++; 1810 } 1811 1812 /* Map 16-bit characters to '\uxxxx' */ 1813 if (ch >= 256) { 1814 *p++ = '\\'; 1815 *p++ = 'u'; 1816 *p++ = hexdigit[(ch >> 12) & 0x000F]; 1817 *p++ = hexdigit[(ch >> 8) & 0x000F]; 1818 *p++ = hexdigit[(ch >> 4) & 0x000F]; 1819 *p++ = hexdigit[ch & 0x000F]; 1820 } 1821 1822 /* Map special whitespace to '\t', \n', '\r' */ 1823 else if (ch == '\t') { 1824 *p++ = '\\'; 1825 *p++ = 't'; 1826 } 1827 else if (ch == '\n') { 1828 *p++ = '\\'; 1829 *p++ = 'n'; 1830 } 1831 else if (ch == '\r') { 1832 *p++ = '\\'; 1833 *p++ = 'r'; 1834 } 1835 1836 /* Map non-printable US ASCII to '\xhh' */ 1837 else if (ch < ' ' || ch >= 0x7F) { 1838 *p++ = '\\'; 1839 *p++ = 'x'; 1840 *p++ = hexdigit[(ch >> 4) & 0x000F]; 1841 *p++ = hexdigit[ch & 0x000F]; 1842 } 1843 1844 /* Copy everything else as-is */ 1845 else 1846 *p++ = (char) ch; 1847 } 1848 if (quotes) 1849 *p++ = PyString_AS_STRING(repr)[1]; 1850 1851 *p = '\0'; 1852 if (_PyString_Resize(&repr, p - PyString_AS_STRING(repr))) 1853 goto onError; 1854 1855 return repr; 1856 1857 onError: 1858 Py_DECREF(repr); 1859 return NULL; 1860} 1861 1862PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s, 1863 int size) 1864{ 1865 return unicodeescape_string(s, size, 0); 1866} 1867 1868PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode) 1869{ 1870 if (!PyUnicode_Check(unicode)) { 1871 PyErr_BadArgument(); 1872 return NULL; 1873 } 1874 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode), 1875 PyUnicode_GET_SIZE(unicode)); 1876} 1877 1878/* --- Raw Unicode Escape Codec ------------------------------------------- */ 1879 1880PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s, 1881 int size, 1882 const char *errors) 1883{ 1884 PyUnicodeObject *v; 1885 Py_UNICODE *p, *buf; 1886 const char *end; 1887 const char *bs; 1888 1889 /* Escaped strings will always be longer than the resulting 1890 Unicode string, so we start with size here and then reduce the 1891 length after conversion to the true value. */ 1892 v = _PyUnicode_New(size); 1893 if (v == NULL) 1894 goto onError; 1895 if (size == 0) 1896 return (PyObject *)v; 1897 p = buf = PyUnicode_AS_UNICODE(v); 1898 end = s + size; 1899 while (s < end) { 1900 unsigned char c; 1901 Py_UNICODE x; 1902 int i; 1903 1904 /* Non-escape characters are interpreted as Unicode ordinals */ 1905 if (*s != '\\') { 1906 *p++ = (unsigned char)*s++; 1907 continue; 1908 } 1909 1910 /* \u-escapes are only interpreted iff the number of leading 1911 backslashes if odd */ 1912 bs = s; 1913 for (;s < end;) { 1914 if (*s != '\\') 1915 break; 1916 *p++ = (unsigned char)*s++; 1917 } 1918 if (((s - bs) & 1) == 0 || 1919 s >= end || 1920 *s != 'u') { 1921 continue; 1922 } 1923 p--; 1924 s++; 1925 1926 /* \uXXXX with 4 hex digits */ 1927 for (x = 0, i = 0; i < 4; i++) { 1928 c = (unsigned char)s[i]; 1929 if (!isxdigit(c)) { 1930 if (unicodeescape_decoding_error(&s, &x, errors, 1931 "truncated \\uXXXX")) 1932 goto onError; 1933 i++; 1934 break; 1935 } 1936 x = (x<<4) & ~0xF; 1937 if (c >= '0' && c <= '9') 1938 x += c - '0'; 1939 else if (c >= 'a' && c <= 'f') 1940 x += 10 + c - 'a'; 1941 else 1942 x += 10 + c - 'A'; 1943 } 1944 s += i; 1945 *p++ = x; 1946 } 1947 if (_PyUnicode_Resize(&v, (int)(p - buf))) 1948 goto onError; 1949 return (PyObject *)v; 1950 1951 onError: 1952 Py_XDECREF(v); 1953 return NULL; 1954} 1955 1956PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s, 1957 int size) 1958{ 1959 PyObject *repr; 1960 char *p; 1961 char *q; 1962 1963 static const char *hexdigit = "0123456789abcdef"; 1964 1965 repr = PyString_FromStringAndSize(NULL, 6 * size); 1966 if (repr == NULL) 1967 return NULL; 1968 if (size == 0) 1969 return repr; 1970 1971 p = q = PyString_AS_STRING(repr); 1972 while (size-- > 0) { 1973 Py_UNICODE ch = *s++; 1974 /* Map 16-bit characters to '\uxxxx' */ 1975 if (ch >= 256) { 1976 *p++ = '\\'; 1977 *p++ = 'u'; 1978 *p++ = hexdigit[(ch >> 12) & 0xf]; 1979 *p++ = hexdigit[(ch >> 8) & 0xf]; 1980 *p++ = hexdigit[(ch >> 4) & 0xf]; 1981 *p++ = hexdigit[ch & 15]; 1982 } 1983 /* Copy everything else as-is */ 1984 else 1985 *p++ = (char) ch; 1986 } 1987 *p = '\0'; 1988 if (_PyString_Resize(&repr, p - q)) 1989 goto onError; 1990 1991 return repr; 1992 1993 onError: 1994 Py_DECREF(repr); 1995 return NULL; 1996} 1997 1998PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode) 1999{ 2000 if (!PyUnicode_Check(unicode)) { 2001 PyErr_BadArgument(); 2002 return NULL; 2003 } 2004 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode), 2005 PyUnicode_GET_SIZE(unicode)); 2006} 2007 2008/* --- Latin-1 Codec ------------------------------------------------------ */ 2009 2010PyObject *PyUnicode_DecodeLatin1(const char *s, 2011 int size, 2012 const char *errors) 2013{ 2014 PyUnicodeObject *v; 2015 Py_UNICODE *p; 2016 2017 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */ 2018 if (size == 1 && *(unsigned char*)s < 256) { 2019 Py_UNICODE r = *(unsigned char*)s; 2020 return PyUnicode_FromUnicode(&r, 1); 2021 } 2022 2023 v = _PyUnicode_New(size); 2024 if (v == NULL) 2025 goto onError; 2026 if (size == 0) 2027 return (PyObject *)v; 2028 p = PyUnicode_AS_UNICODE(v); 2029 while (size-- > 0) 2030 *p++ = (unsigned char)*s++; 2031 return (PyObject *)v; 2032 2033 onError: 2034 Py_XDECREF(v); 2035 return NULL; 2036} 2037 2038static 2039int latin1_encoding_error(const Py_UNICODE **source, 2040 char **dest, 2041 const char *errors, 2042 const char *details) 2043{ 2044 if ((errors == NULL) || 2045 (strcmp(errors,"strict") == 0)) { 2046 PyErr_Format(PyExc_UnicodeError, 2047 "Latin-1 encoding error: %.400s", 2048 details); 2049 return -1; 2050 } 2051 else if (strcmp(errors,"ignore") == 0) { 2052 return 0; 2053 } 2054 else if (strcmp(errors,"replace") == 0) { 2055 **dest = '?'; 2056 (*dest)++; 2057 return 0; 2058 } 2059 else { 2060 PyErr_Format(PyExc_ValueError, 2061 "Latin-1 encoding error; " 2062 "unknown error handling code: %.400s", 2063 errors); 2064 return -1; 2065 } 2066} 2067 2068PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p, 2069 int size, 2070 const char *errors) 2071{ 2072 PyObject *repr; 2073 char *s, *start; 2074 2075 repr = PyString_FromStringAndSize(NULL, size); 2076 if (repr == NULL) 2077 return NULL; 2078 if (size == 0) 2079 return repr; 2080 2081 s = PyString_AS_STRING(repr); 2082 start = s; 2083 while (size-- > 0) { 2084 Py_UNICODE ch = *p++; 2085 if (ch >= 256) { 2086 if (latin1_encoding_error(&p, &s, errors, 2087 "ordinal not in range(256)")) 2088 goto onError; 2089 } 2090 else 2091 *s++ = (char)ch; 2092 } 2093 /* Resize if error handling skipped some characters */ 2094 if (s - start < PyString_GET_SIZE(repr)) 2095 if (_PyString_Resize(&repr, s - start)) 2096 goto onError; 2097 return repr; 2098 2099 onError: 2100 Py_DECREF(repr); 2101 return NULL; 2102} 2103 2104PyObject *PyUnicode_AsLatin1String(PyObject *unicode) 2105{ 2106 if (!PyUnicode_Check(unicode)) { 2107 PyErr_BadArgument(); 2108 return NULL; 2109 } 2110 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode), 2111 PyUnicode_GET_SIZE(unicode), 2112 NULL); 2113} 2114 2115/* --- 7-bit ASCII Codec -------------------------------------------------- */ 2116 2117static 2118int ascii_decoding_error(const char **source, 2119 Py_UNICODE **dest, 2120 const char *errors, 2121 const char *details) 2122{ 2123 if ((errors == NULL) || 2124 (strcmp(errors,"strict") == 0)) { 2125 PyErr_Format(PyExc_UnicodeError, 2126 "ASCII decoding error: %.400s", 2127 details); 2128 return -1; 2129 } 2130 else if (strcmp(errors,"ignore") == 0) { 2131 return 0; 2132 } 2133 else if (strcmp(errors,"replace") == 0) { 2134 **dest = Py_UNICODE_REPLACEMENT_CHARACTER; 2135 (*dest)++; 2136 return 0; 2137 } 2138 else { 2139 PyErr_Format(PyExc_ValueError, 2140 "ASCII decoding error; " 2141 "unknown error handling code: %.400s", 2142 errors); 2143 return -1; 2144 } 2145} 2146 2147PyObject *PyUnicode_DecodeASCII(const char *s, 2148 int size, 2149 const char *errors) 2150{ 2151 PyUnicodeObject *v; 2152 Py_UNICODE *p; 2153 2154 /* ASCII is equivalent to the first 128 ordinals in Unicode. */ 2155 if (size == 1 && *(unsigned char*)s < 128) { 2156 Py_UNICODE r = *(unsigned char*)s; 2157 return PyUnicode_FromUnicode(&r, 1); 2158 } 2159 2160 v = _PyUnicode_New(size); 2161 if (v == NULL) 2162 goto onError; 2163 if (size == 0) 2164 return (PyObject *)v; 2165 p = PyUnicode_AS_UNICODE(v); 2166 while (size-- > 0) { 2167 register unsigned char c; 2168 2169 c = (unsigned char)*s++; 2170 if (c < 128) 2171 *p++ = c; 2172 else if (ascii_decoding_error(&s, &p, errors, 2173 "ordinal not in range(128)")) 2174 goto onError; 2175 } 2176 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v)) 2177 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v)))) 2178 goto onError; 2179 return (PyObject *)v; 2180 2181 onError: 2182 Py_XDECREF(v); 2183 return NULL; 2184} 2185 2186static 2187int ascii_encoding_error(const Py_UNICODE **source, 2188 char **dest, 2189 const char *errors, 2190 const char *details) 2191{ 2192 if ((errors == NULL) || 2193 (strcmp(errors,"strict") == 0)) { 2194 PyErr_Format(PyExc_UnicodeError, 2195 "ASCII encoding error: %.400s", 2196 details); 2197 return -1; 2198 } 2199 else if (strcmp(errors,"ignore") == 0) { 2200 return 0; 2201 } 2202 else if (strcmp(errors,"replace") == 0) { 2203 **dest = '?'; 2204 (*dest)++; 2205 return 0; 2206 } 2207 else { 2208 PyErr_Format(PyExc_ValueError, 2209 "ASCII encoding error; " 2210 "unknown error handling code: %.400s", 2211 errors); 2212 return -1; 2213 } 2214} 2215 2216PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p, 2217 int size, 2218 const char *errors) 2219{ 2220 PyObject *repr; 2221 char *s, *start; 2222 2223 repr = PyString_FromStringAndSize(NULL, size); 2224 if (repr == NULL) 2225 return NULL; 2226 if (size == 0) 2227 return repr; 2228 2229 s = PyString_AS_STRING(repr); 2230 start = s; 2231 while (size-- > 0) { 2232 Py_UNICODE ch = *p++; 2233 if (ch >= 128) { 2234 if (ascii_encoding_error(&p, &s, errors, 2235 "ordinal not in range(128)")) 2236 goto onError; 2237 } 2238 else 2239 *s++ = (char)ch; 2240 } 2241 /* Resize if error handling skipped some characters */ 2242 if (s - start < PyString_GET_SIZE(repr)) 2243 if (_PyString_Resize(&repr, s - start)) 2244 goto onError; 2245 return repr; 2246 2247 onError: 2248 Py_DECREF(repr); 2249 return NULL; 2250} 2251 2252PyObject *PyUnicode_AsASCIIString(PyObject *unicode) 2253{ 2254 if (!PyUnicode_Check(unicode)) { 2255 PyErr_BadArgument(); 2256 return NULL; 2257 } 2258 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode), 2259 PyUnicode_GET_SIZE(unicode), 2260 NULL); 2261} 2262 2263#if defined(MS_WIN32) && defined(HAVE_USABLE_WCHAR_T) 2264 2265/* --- MBCS codecs for Windows -------------------------------------------- */ 2266 2267PyObject *PyUnicode_DecodeMBCS(const char *s, 2268 int size, 2269 const char *errors) 2270{ 2271 PyUnicodeObject *v; 2272 Py_UNICODE *p; 2273 2274 /* First get the size of the result */ 2275 DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0); 2276 if (size > 0 && usize==0) 2277 return PyErr_SetFromWindowsErrWithFilename(0, NULL); 2278 2279 v = _PyUnicode_New(usize); 2280 if (v == NULL) 2281 return NULL; 2282 if (usize == 0) 2283 return (PyObject *)v; 2284 p = PyUnicode_AS_UNICODE(v); 2285 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) { 2286 Py_DECREF(v); 2287 return PyErr_SetFromWindowsErrWithFilename(0, NULL); 2288 } 2289 2290 return (PyObject *)v; 2291} 2292 2293PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p, 2294 int size, 2295 const char *errors) 2296{ 2297 PyObject *repr; 2298 char *s; 2299 DWORD mbcssize; 2300 2301 /* If there are no characters, bail now! */ 2302 if (size==0) 2303 return PyString_FromString(""); 2304 2305 /* First get the size of the result */ 2306 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL); 2307 if (mbcssize==0) 2308 return PyErr_SetFromWindowsErrWithFilename(0, NULL); 2309 2310 repr = PyString_FromStringAndSize(NULL, mbcssize); 2311 if (repr == NULL) 2312 return NULL; 2313 if (mbcssize == 0) 2314 return repr; 2315 2316 /* Do the conversion */ 2317 s = PyString_AS_STRING(repr); 2318 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) { 2319 Py_DECREF(repr); 2320 return PyErr_SetFromWindowsErrWithFilename(0, NULL); 2321 } 2322 return repr; 2323} 2324 2325#endif /* MS_WIN32 */ 2326 2327/* --- Character Mapping Codec -------------------------------------------- */ 2328 2329static 2330int charmap_decoding_error(const char **source, 2331 Py_UNICODE **dest, 2332 const char *errors, 2333 const char *details) 2334{ 2335 if ((errors == NULL) || 2336 (strcmp(errors,"strict") == 0)) { 2337 PyErr_Format(PyExc_UnicodeError, 2338 "charmap decoding error: %.400s", 2339 details); 2340 return -1; 2341 } 2342 else if (strcmp(errors,"ignore") == 0) { 2343 return 0; 2344 } 2345 else if (strcmp(errors,"replace") == 0) { 2346 **dest = Py_UNICODE_REPLACEMENT_CHARACTER; 2347 (*dest)++; 2348 return 0; 2349 } 2350 else { 2351 PyErr_Format(PyExc_ValueError, 2352 "charmap decoding error; " 2353 "unknown error handling code: %.400s", 2354 errors); 2355 return -1; 2356 } 2357} 2358 2359PyObject *PyUnicode_DecodeCharmap(const char *s, 2360 int size, 2361 PyObject *mapping, 2362 const char *errors) 2363{ 2364 PyUnicodeObject *v; 2365 Py_UNICODE *p; 2366 int extrachars = 0; 2367 2368 /* Default to Latin-1 */ 2369 if (mapping == NULL) 2370 return PyUnicode_DecodeLatin1(s, size, errors); 2371 2372 v = _PyUnicode_New(size); 2373 if (v == NULL) 2374 goto onError; 2375 if (size == 0) 2376 return (PyObject *)v; 2377 p = PyUnicode_AS_UNICODE(v); 2378 while (size-- > 0) { 2379 unsigned char ch = *s++; 2380 PyObject *w, *x; 2381 2382 /* Get mapping (char ordinal -> integer, Unicode char or None) */ 2383 w = PyInt_FromLong((long)ch); 2384 if (w == NULL) 2385 goto onError; 2386 x = PyObject_GetItem(mapping, w); 2387 Py_DECREF(w); 2388 if (x == NULL) { 2389 if (PyErr_ExceptionMatches(PyExc_LookupError)) { 2390 /* No mapping found means: mapping is undefined. */ 2391 PyErr_Clear(); 2392 x = Py_None; 2393 Py_INCREF(x); 2394 } else 2395 goto onError; 2396 } 2397 2398 /* Apply mapping */ 2399 if (PyInt_Check(x)) { 2400 long value = PyInt_AS_LONG(x); 2401 if (value < 0 || value > 65535) { 2402 PyErr_SetString(PyExc_TypeError, 2403 "character mapping must be in range(65536)"); 2404 Py_DECREF(x); 2405 goto onError; 2406 } 2407 *p++ = (Py_UNICODE)value; 2408 } 2409 else if (x == Py_None) { 2410 /* undefined mapping */ 2411 if (charmap_decoding_error(&s, &p, errors, 2412 "character maps to <undefined>")) { 2413 Py_DECREF(x); 2414 goto onError; 2415 } 2416 } 2417 else if (PyUnicode_Check(x)) { 2418 int targetsize = PyUnicode_GET_SIZE(x); 2419 2420 if (targetsize == 1) 2421 /* 1-1 mapping */ 2422 *p++ = *PyUnicode_AS_UNICODE(x); 2423 2424 else if (targetsize > 1) { 2425 /* 1-n mapping */ 2426 if (targetsize > extrachars) { 2427 /* resize first */ 2428 int oldpos = (int)(p - PyUnicode_AS_UNICODE(v)); 2429 int needed = (targetsize - extrachars) + \ 2430 (targetsize << 2); 2431 extrachars += needed; 2432 if (_PyUnicode_Resize(&v, 2433 PyUnicode_GET_SIZE(v) + needed)) { 2434 Py_DECREF(x); 2435 goto onError; 2436 } 2437 p = PyUnicode_AS_UNICODE(v) + oldpos; 2438 } 2439 Py_UNICODE_COPY(p, 2440 PyUnicode_AS_UNICODE(x), 2441 targetsize); 2442 p += targetsize; 2443 extrachars -= targetsize; 2444 } 2445 /* 1-0 mapping: skip the character */ 2446 } 2447 else { 2448 /* wrong return value */ 2449 PyErr_SetString(PyExc_TypeError, 2450 "character mapping must return integer, None or unicode"); 2451 Py_DECREF(x); 2452 goto onError; 2453 } 2454 Py_DECREF(x); 2455 } 2456 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v)) 2457 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v)))) 2458 goto onError; 2459 return (PyObject *)v; 2460 2461 onError: 2462 Py_XDECREF(v); 2463 return NULL; 2464} 2465 2466static 2467int charmap_encoding_error(const Py_UNICODE **source, 2468 char **dest, 2469 const char *errors, 2470 const char *details) 2471{ 2472 if ((errors == NULL) || 2473 (strcmp(errors,"strict") == 0)) { 2474 PyErr_Format(PyExc_UnicodeError, 2475 "charmap encoding error: %.400s", 2476 details); 2477 return -1; 2478 } 2479 else if (strcmp(errors,"ignore") == 0) { 2480 return 0; 2481 } 2482 else if (strcmp(errors,"replace") == 0) { 2483 **dest = '?'; 2484 (*dest)++; 2485 return 0; 2486 } 2487 else { 2488 PyErr_Format(PyExc_ValueError, 2489 "charmap encoding error; " 2490 "unknown error handling code: %.400s", 2491 errors); 2492 return -1; 2493 } 2494} 2495 2496PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p, 2497 int size, 2498 PyObject *mapping, 2499 const char *errors) 2500{ 2501 PyObject *v; 2502 char *s; 2503 int extrachars = 0; 2504 2505 /* Default to Latin-1 */ 2506 if (mapping == NULL) 2507 return PyUnicode_EncodeLatin1(p, size, errors); 2508 2509 v = PyString_FromStringAndSize(NULL, size); 2510 if (v == NULL) 2511 return NULL; 2512 if (size == 0) 2513 return v; 2514 s = PyString_AS_STRING(v); 2515 while (size-- > 0) { 2516 Py_UNICODE ch = *p++; 2517 PyObject *w, *x; 2518 2519 /* Get mapping (Unicode ordinal -> string char, integer or None) */ 2520 w = PyInt_FromLong((long)ch); 2521 if (w == NULL) 2522 goto onError; 2523 x = PyObject_GetItem(mapping, w); 2524 Py_DECREF(w); 2525 if (x == NULL) { 2526 if (PyErr_ExceptionMatches(PyExc_LookupError)) { 2527 /* No mapping found means: mapping is undefined. */ 2528 PyErr_Clear(); 2529 x = Py_None; 2530 Py_INCREF(x); 2531 } else 2532 goto onError; 2533 } 2534 2535 /* Apply mapping */ 2536 if (PyInt_Check(x)) { 2537 long value = PyInt_AS_LONG(x); 2538 if (value < 0 || value > 255) { 2539 PyErr_SetString(PyExc_TypeError, 2540 "character mapping must be in range(256)"); 2541 Py_DECREF(x); 2542 goto onError; 2543 } 2544 *s++ = (char)value; 2545 } 2546 else if (x == Py_None) { 2547 /* undefined mapping */ 2548 if (charmap_encoding_error(&p, &s, errors, 2549 "character maps to <undefined>")) { 2550 Py_DECREF(x); 2551 goto onError; 2552 } 2553 } 2554 else if (PyString_Check(x)) { 2555 int targetsize = PyString_GET_SIZE(x); 2556 2557 if (targetsize == 1) 2558 /* 1-1 mapping */ 2559 *s++ = *PyString_AS_STRING(x); 2560 2561 else if (targetsize > 1) { 2562 /* 1-n mapping */ 2563 if (targetsize > extrachars) { 2564 /* resize first */ 2565 int oldpos = (int)(s - PyString_AS_STRING(v)); 2566 int needed = (targetsize - extrachars) + \ 2567 (targetsize << 2); 2568 extrachars += needed; 2569 if (_PyString_Resize(&v, PyString_GET_SIZE(v) + needed)) { 2570 Py_DECREF(x); 2571 goto onError; 2572 } 2573 s = PyString_AS_STRING(v) + oldpos; 2574 } 2575 memcpy(s, PyString_AS_STRING(x), targetsize); 2576 s += targetsize; 2577 extrachars -= targetsize; 2578 } 2579 /* 1-0 mapping: skip the character */ 2580 } 2581 else { 2582 /* wrong return value */ 2583 PyErr_SetString(PyExc_TypeError, 2584 "character mapping must return integer, None or unicode"); 2585 Py_DECREF(x); 2586 goto onError; 2587 } 2588 Py_DECREF(x); 2589 } 2590 if (s - PyString_AS_STRING(v) < PyString_GET_SIZE(v)) 2591 if (_PyString_Resize(&v, (int)(s - PyString_AS_STRING(v)))) 2592 goto onError; 2593 return v; 2594 2595 onError: 2596 Py_DECREF(v); 2597 return NULL; 2598} 2599 2600PyObject *PyUnicode_AsCharmapString(PyObject *unicode, 2601 PyObject *mapping) 2602{ 2603 if (!PyUnicode_Check(unicode) || mapping == NULL) { 2604 PyErr_BadArgument(); 2605 return NULL; 2606 } 2607 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode), 2608 PyUnicode_GET_SIZE(unicode), 2609 mapping, 2610 NULL); 2611} 2612 2613static 2614int translate_error(const Py_UNICODE **source, 2615 Py_UNICODE **dest, 2616 const char *errors, 2617 const char *details) 2618{ 2619 if ((errors == NULL) || 2620 (strcmp(errors,"strict") == 0)) { 2621 PyErr_Format(PyExc_UnicodeError, 2622 "translate error: %.400s", 2623 details); 2624 return -1; 2625 } 2626 else if (strcmp(errors,"ignore") == 0) { 2627 return 0; 2628 } 2629 else if (strcmp(errors,"replace") == 0) { 2630 **dest = '?'; 2631 (*dest)++; 2632 return 0; 2633 } 2634 else { 2635 PyErr_Format(PyExc_ValueError, 2636 "translate error; " 2637 "unknown error handling code: %.400s", 2638 errors); 2639 return -1; 2640 } 2641} 2642 2643PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *s, 2644 int size, 2645 PyObject *mapping, 2646 const char *errors) 2647{ 2648 PyUnicodeObject *v; 2649 Py_UNICODE *p; 2650 2651 if (mapping == NULL) { 2652 PyErr_BadArgument(); 2653 return NULL; 2654 } 2655 2656 /* Output will never be longer than input */ 2657 v = _PyUnicode_New(size); 2658 if (v == NULL) 2659 goto onError; 2660 if (size == 0) 2661 goto done; 2662 p = PyUnicode_AS_UNICODE(v); 2663 while (size-- > 0) { 2664 Py_UNICODE ch = *s++; 2665 PyObject *w, *x; 2666 2667 /* Get mapping */ 2668 w = PyInt_FromLong(ch); 2669 if (w == NULL) 2670 goto onError; 2671 x = PyObject_GetItem(mapping, w); 2672 Py_DECREF(w); 2673 if (x == NULL) { 2674 if (PyErr_ExceptionMatches(PyExc_LookupError)) { 2675 /* No mapping found: default to 1-1 mapping */ 2676 PyErr_Clear(); 2677 *p++ = ch; 2678 continue; 2679 } 2680 goto onError; 2681 } 2682 2683 /* Apply mapping */ 2684 if (PyInt_Check(x)) 2685 *p++ = (Py_UNICODE)PyInt_AS_LONG(x); 2686 else if (x == Py_None) { 2687 /* undefined mapping */ 2688 if (translate_error(&s, &p, errors, 2689 "character maps to <undefined>")) { 2690 Py_DECREF(x); 2691 goto onError; 2692 } 2693 } 2694 else if (PyUnicode_Check(x)) { 2695 if (PyUnicode_GET_SIZE(x) != 1) { 2696 /* 1-n mapping */ 2697 PyErr_SetString(PyExc_NotImplementedError, 2698 "1-n mappings are currently not implemented"); 2699 Py_DECREF(x); 2700 goto onError; 2701 } 2702 *p++ = *PyUnicode_AS_UNICODE(x); 2703 } 2704 else { 2705 /* wrong return value */ 2706 PyErr_SetString(PyExc_TypeError, 2707 "translate mapping must return integer, None or unicode"); 2708 Py_DECREF(x); 2709 goto onError; 2710 } 2711 Py_DECREF(x); 2712 } 2713 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v)) 2714 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v)))) 2715 goto onError; 2716 2717 done: 2718 return (PyObject *)v; 2719 2720 onError: 2721 Py_XDECREF(v); 2722 return NULL; 2723} 2724 2725PyObject *PyUnicode_Translate(PyObject *str, 2726 PyObject *mapping, 2727 const char *errors) 2728{ 2729 PyObject *result; 2730 2731 str = PyUnicode_FromObject(str); 2732 if (str == NULL) 2733 goto onError; 2734 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str), 2735 PyUnicode_GET_SIZE(str), 2736 mapping, 2737 errors); 2738 Py_DECREF(str); 2739 return result; 2740 2741 onError: 2742 Py_XDECREF(str); 2743 return NULL; 2744} 2745 2746/* --- Decimal Encoder ---------------------------------------------------- */ 2747 2748int PyUnicode_EncodeDecimal(Py_UNICODE *s, 2749 int length, 2750 char *output, 2751 const char *errors) 2752{ 2753 Py_UNICODE *p, *end; 2754 2755 if (output == NULL) { 2756 PyErr_BadArgument(); 2757 return -1; 2758 } 2759 2760 p = s; 2761 end = s + length; 2762 while (p < end) { 2763 register Py_UNICODE ch = *p++; 2764 int decimal; 2765 2766 if (Py_UNICODE_ISSPACE(ch)) { 2767 *output++ = ' '; 2768 continue; 2769 } 2770 decimal = Py_UNICODE_TODECIMAL(ch); 2771 if (decimal >= 0) { 2772 *output++ = '0' + decimal; 2773 continue; 2774 } 2775 if (0 < ch && ch < 256) { 2776 *output++ = (char)ch; 2777 continue; 2778 } 2779 /* All other characters are considered invalid */ 2780 if (errors == NULL || strcmp(errors, "strict") == 0) { 2781 PyErr_SetString(PyExc_ValueError, 2782 "invalid decimal Unicode string"); 2783 goto onError; 2784 } 2785 else if (strcmp(errors, "ignore") == 0) 2786 continue; 2787 else if (strcmp(errors, "replace") == 0) { 2788 *output++ = '?'; 2789 continue; 2790 } 2791 } 2792 /* 0-terminate the output string */ 2793 *output++ = '\0'; 2794 return 0; 2795 2796 onError: 2797 return -1; 2798} 2799 2800/* --- Helpers ------------------------------------------------------------ */ 2801 2802static 2803int count(PyUnicodeObject *self, 2804 int start, 2805 int end, 2806 PyUnicodeObject *substring) 2807{ 2808 int count = 0; 2809 2810 if (start < 0) 2811 start += self->length; 2812 if (start < 0) 2813 start = 0; 2814 if (end > self->length) 2815 end = self->length; 2816 if (end < 0) 2817 end += self->length; 2818 if (end < 0) 2819 end = 0; 2820 2821 if (substring->length == 0) 2822 return (end - start + 1); 2823 2824 end -= substring->length; 2825 2826 while (start <= end) 2827 if (Py_UNICODE_MATCH(self, start, substring)) { 2828 count++; 2829 start += substring->length; 2830 } else 2831 start++; 2832 2833 return count; 2834} 2835 2836int PyUnicode_Count(PyObject *str, 2837 PyObject *substr, 2838 int start, 2839 int end) 2840{ 2841 int result; 2842 2843 str = PyUnicode_FromObject(str); 2844 if (str == NULL) 2845 return -1; 2846 substr = PyUnicode_FromObject(substr); 2847 if (substr == NULL) { 2848 Py_DECREF(str); 2849 return -1; 2850 } 2851 2852 result = count((PyUnicodeObject *)str, 2853 start, end, 2854 (PyUnicodeObject *)substr); 2855 2856 Py_DECREF(str); 2857 Py_DECREF(substr); 2858 return result; 2859} 2860 2861static 2862int findstring(PyUnicodeObject *self, 2863 PyUnicodeObject *substring, 2864 int start, 2865 int end, 2866 int direction) 2867{ 2868 if (start < 0) 2869 start += self->length; 2870 if (start < 0) 2871 start = 0; 2872 2873 if (substring->length == 0) 2874 return start; 2875 2876 if (end > self->length) 2877 end = self->length; 2878 if (end < 0) 2879 end += self->length; 2880 if (end < 0) 2881 end = 0; 2882 2883 end -= substring->length; 2884 2885 if (direction < 0) { 2886 for (; end >= start; end--) 2887 if (Py_UNICODE_MATCH(self, end, substring)) 2888 return end; 2889 } else { 2890 for (; start <= end; start++) 2891 if (Py_UNICODE_MATCH(self, start, substring)) 2892 return start; 2893 } 2894 2895 return -1; 2896} 2897 2898int PyUnicode_Find(PyObject *str, 2899 PyObject *substr, 2900 int start, 2901 int end, 2902 int direction) 2903{ 2904 int result; 2905 2906 str = PyUnicode_FromObject(str); 2907 if (str == NULL) 2908 return -1; 2909 substr = PyUnicode_FromObject(substr); 2910 if (substr == NULL) { 2911 Py_DECREF(substr); 2912 return -1; 2913 } 2914 2915 result = findstring((PyUnicodeObject *)str, 2916 (PyUnicodeObject *)substr, 2917 start, end, direction); 2918 Py_DECREF(str); 2919 Py_DECREF(substr); 2920 return result; 2921} 2922 2923static 2924int tailmatch(PyUnicodeObject *self, 2925 PyUnicodeObject *substring, 2926 int start, 2927 int end, 2928 int direction) 2929{ 2930 if (start < 0) 2931 start += self->length; 2932 if (start < 0) 2933 start = 0; 2934 2935 if (substring->length == 0) 2936 return 1; 2937 2938 if (end > self->length) 2939 end = self->length; 2940 if (end < 0) 2941 end += self->length; 2942 if (end < 0) 2943 end = 0; 2944 2945 end -= substring->length; 2946 if (end < start) 2947 return 0; 2948 2949 if (direction > 0) { 2950 if (Py_UNICODE_MATCH(self, end, substring)) 2951 return 1; 2952 } else { 2953 if (Py_UNICODE_MATCH(self, start, substring)) 2954 return 1; 2955 } 2956 2957 return 0; 2958} 2959 2960int PyUnicode_Tailmatch(PyObject *str, 2961 PyObject *substr, 2962 int start, 2963 int end, 2964 int direction) 2965{ 2966 int result; 2967 2968 str = PyUnicode_FromObject(str); 2969 if (str == NULL) 2970 return -1; 2971 substr = PyUnicode_FromObject(substr); 2972 if (substr == NULL) { 2973 Py_DECREF(substr); 2974 return -1; 2975 } 2976 2977 result = tailmatch((PyUnicodeObject *)str, 2978 (PyUnicodeObject *)substr, 2979 start, end, direction); 2980 Py_DECREF(str); 2981 Py_DECREF(substr); 2982 return result; 2983} 2984 2985static 2986const Py_UNICODE *findchar(const Py_UNICODE *s, 2987 int size, 2988 Py_UNICODE ch) 2989{ 2990 /* like wcschr, but doesn't stop at NULL characters */ 2991 2992 while (size-- > 0) { 2993 if (*s == ch) 2994 return s; 2995 s++; 2996 } 2997 2998 return NULL; 2999} 3000 3001/* Apply fixfct filter to the Unicode object self and return a 3002 reference to the modified object */ 3003 3004static 3005PyObject *fixup(PyUnicodeObject *self, 3006 int (*fixfct)(PyUnicodeObject *s)) 3007{ 3008 3009 PyUnicodeObject *u; 3010 3011 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length); 3012 if (u == NULL) 3013 return NULL; 3014 3015 Py_UNICODE_COPY(u->str, self->str, self->length); 3016 3017 if (!fixfct(u) && PyUnicode_CheckExact(self)) { 3018 /* fixfct should return TRUE if it modified the buffer. If 3019 FALSE, return a reference to the original buffer instead 3020 (to save space, not time) */ 3021 Py_INCREF(self); 3022 Py_DECREF(u); 3023 return (PyObject*) self; 3024 } 3025 return (PyObject*) u; 3026} 3027 3028static 3029int fixupper(PyUnicodeObject *self) 3030{ 3031 int len = self->length; 3032 Py_UNICODE *s = self->str; 3033 int status = 0; 3034 3035 while (len-- > 0) { 3036 register Py_UNICODE ch; 3037 3038 ch = Py_UNICODE_TOUPPER(*s); 3039 if (ch != *s) { 3040 status = 1; 3041 *s = ch; 3042 } 3043 s++; 3044 } 3045 3046 return status; 3047} 3048 3049static 3050int fixlower(PyUnicodeObject *self) 3051{ 3052 int len = self->length; 3053 Py_UNICODE *s = self->str; 3054 int status = 0; 3055 3056 while (len-- > 0) { 3057 register Py_UNICODE ch; 3058 3059 ch = Py_UNICODE_TOLOWER(*s); 3060 if (ch != *s) { 3061 status = 1; 3062 *s = ch; 3063 } 3064 s++; 3065 } 3066 3067 return status; 3068} 3069 3070static 3071int fixswapcase(PyUnicodeObject *self) 3072{ 3073 int len = self->length; 3074 Py_UNICODE *s = self->str; 3075 int status = 0; 3076 3077 while (len-- > 0) { 3078 if (Py_UNICODE_ISUPPER(*s)) { 3079 *s = Py_UNICODE_TOLOWER(*s); 3080 status = 1; 3081 } else if (Py_UNICODE_ISLOWER(*s)) { 3082 *s = Py_UNICODE_TOUPPER(*s); 3083 status = 1; 3084 } 3085 s++; 3086 } 3087 3088 return status; 3089} 3090 3091static 3092int fixcapitalize(PyUnicodeObject *self) 3093{ 3094 int len = self->length; 3095 Py_UNICODE *s = self->str; 3096 int status = 0; 3097 3098 if (len == 0) 3099 return 0; 3100 if (Py_UNICODE_ISLOWER(*s)) { 3101 *s = Py_UNICODE_TOUPPER(*s); 3102 status = 1; 3103 } 3104 s++; 3105 while (--len > 0) { 3106 if (Py_UNICODE_ISUPPER(*s)) { 3107 *s = Py_UNICODE_TOLOWER(*s); 3108 status = 1; 3109 } 3110 s++; 3111 } 3112 return status; 3113} 3114 3115static 3116int fixtitle(PyUnicodeObject *self) 3117{ 3118 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 3119 register Py_UNICODE *e; 3120 int previous_is_cased; 3121 3122 /* Shortcut for single character strings */ 3123 if (PyUnicode_GET_SIZE(self) == 1) { 3124 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p); 3125 if (*p != ch) { 3126 *p = ch; 3127 return 1; 3128 } 3129 else 3130 return 0; 3131 } 3132 3133 e = p + PyUnicode_GET_SIZE(self); 3134 previous_is_cased = 0; 3135 for (; p < e; p++) { 3136 register const Py_UNICODE ch = *p; 3137 3138 if (previous_is_cased) 3139 *p = Py_UNICODE_TOLOWER(ch); 3140 else 3141 *p = Py_UNICODE_TOTITLE(ch); 3142 3143 if (Py_UNICODE_ISLOWER(ch) || 3144 Py_UNICODE_ISUPPER(ch) || 3145 Py_UNICODE_ISTITLE(ch)) 3146 previous_is_cased = 1; 3147 else 3148 previous_is_cased = 0; 3149 } 3150 return 1; 3151} 3152 3153PyObject *PyUnicode_Join(PyObject *separator, 3154 PyObject *seq) 3155{ 3156 Py_UNICODE *sep; 3157 int seplen; 3158 PyUnicodeObject *res = NULL; 3159 int reslen = 0; 3160 Py_UNICODE *p; 3161 int sz = 100; 3162 int i; 3163 PyObject *it; 3164 3165 it = PyObject_GetIter(seq); 3166 if (it == NULL) 3167 return NULL; 3168 3169 if (separator == NULL) { 3170 Py_UNICODE blank = ' '; 3171 sep = ␣ 3172 seplen = 1; 3173 } 3174 else { 3175 separator = PyUnicode_FromObject(separator); 3176 if (separator == NULL) 3177 goto onError; 3178 sep = PyUnicode_AS_UNICODE(separator); 3179 seplen = PyUnicode_GET_SIZE(separator); 3180 } 3181 3182 res = _PyUnicode_New(sz); 3183 if (res == NULL) 3184 goto onError; 3185 p = PyUnicode_AS_UNICODE(res); 3186 reslen = 0; 3187 3188 for (i = 0; ; ++i) { 3189 int itemlen; 3190 PyObject *item = PyIter_Next(it); 3191 if (item == NULL) { 3192 if (PyErr_Occurred()) 3193 goto onError; 3194 break; 3195 } 3196 if (!PyUnicode_Check(item)) { 3197 PyObject *v; 3198 if (!PyString_Check(item)) { 3199 PyErr_Format(PyExc_TypeError, 3200 "sequence item %i: expected string or Unicode," 3201 " %.80s found", 3202 i, item->ob_type->tp_name); 3203 Py_DECREF(item); 3204 goto onError; 3205 } 3206 v = PyUnicode_FromObject(item); 3207 Py_DECREF(item); 3208 item = v; 3209 if (item == NULL) 3210 goto onError; 3211 } 3212 itemlen = PyUnicode_GET_SIZE(item); 3213 while (reslen + itemlen + seplen >= sz) { 3214 if (_PyUnicode_Resize(&res, sz*2)) { 3215 Py_DECREF(item); 3216 goto onError; 3217 } 3218 sz *= 2; 3219 p = PyUnicode_AS_UNICODE(res) + reslen; 3220 } 3221 if (i > 0) { 3222 Py_UNICODE_COPY(p, sep, seplen); 3223 p += seplen; 3224 reslen += seplen; 3225 } 3226 Py_UNICODE_COPY(p, PyUnicode_AS_UNICODE(item), itemlen); 3227 p += itemlen; 3228 reslen += itemlen; 3229 Py_DECREF(item); 3230 } 3231 if (_PyUnicode_Resize(&res, reslen)) 3232 goto onError; 3233 3234 Py_XDECREF(separator); 3235 Py_DECREF(it); 3236 return (PyObject *)res; 3237 3238 onError: 3239 Py_XDECREF(separator); 3240 Py_XDECREF(res); 3241 Py_DECREF(it); 3242 return NULL; 3243} 3244 3245static 3246PyUnicodeObject *pad(PyUnicodeObject *self, 3247 int left, 3248 int right, 3249 Py_UNICODE fill) 3250{ 3251 PyUnicodeObject *u; 3252 3253 if (left < 0) 3254 left = 0; 3255 if (right < 0) 3256 right = 0; 3257 3258 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) { 3259 Py_INCREF(self); 3260 return self; 3261 } 3262 3263 u = _PyUnicode_New(left + self->length + right); 3264 if (u) { 3265 if (left) 3266 Py_UNICODE_FILL(u->str, fill, left); 3267 Py_UNICODE_COPY(u->str + left, self->str, self->length); 3268 if (right) 3269 Py_UNICODE_FILL(u->str + left + self->length, fill, right); 3270 } 3271 3272 return u; 3273} 3274 3275#define SPLIT_APPEND(data, left, right) \ 3276 str = PyUnicode_FromUnicode(data + left, right - left); \ 3277 if (!str) \ 3278 goto onError; \ 3279 if (PyList_Append(list, str)) { \ 3280 Py_DECREF(str); \ 3281 goto onError; \ 3282 } \ 3283 else \ 3284 Py_DECREF(str); 3285 3286static 3287PyObject *split_whitespace(PyUnicodeObject *self, 3288 PyObject *list, 3289 int maxcount) 3290{ 3291 register int i; 3292 register int j; 3293 int len = self->length; 3294 PyObject *str; 3295 3296 for (i = j = 0; i < len; ) { 3297 /* find a token */ 3298 while (i < len && Py_UNICODE_ISSPACE(self->str[i])) 3299 i++; 3300 j = i; 3301 while (i < len && !Py_UNICODE_ISSPACE(self->str[i])) 3302 i++; 3303 if (j < i) { 3304 if (maxcount-- <= 0) 3305 break; 3306 SPLIT_APPEND(self->str, j, i); 3307 while (i < len && Py_UNICODE_ISSPACE(self->str[i])) 3308 i++; 3309 j = i; 3310 } 3311 } 3312 if (j < len) { 3313 SPLIT_APPEND(self->str, j, len); 3314 } 3315 return list; 3316 3317 onError: 3318 Py_DECREF(list); 3319 return NULL; 3320} 3321 3322PyObject *PyUnicode_Splitlines(PyObject *string, 3323 int keepends) 3324{ 3325 register int i; 3326 register int j; 3327 int len; 3328 PyObject *list; 3329 PyObject *str; 3330 Py_UNICODE *data; 3331 3332 string = PyUnicode_FromObject(string); 3333 if (string == NULL) 3334 return NULL; 3335 data = PyUnicode_AS_UNICODE(string); 3336 len = PyUnicode_GET_SIZE(string); 3337 3338 list = PyList_New(0); 3339 if (!list) 3340 goto onError; 3341 3342 for (i = j = 0; i < len; ) { 3343 int eol; 3344 3345 /* Find a line and append it */ 3346 while (i < len && !Py_UNICODE_ISLINEBREAK(data[i])) 3347 i++; 3348 3349 /* Skip the line break reading CRLF as one line break */ 3350 eol = i; 3351 if (i < len) { 3352 if (data[i] == '\r' && i + 1 < len && 3353 data[i+1] == '\n') 3354 i += 2; 3355 else 3356 i++; 3357 if (keepends) 3358 eol = i; 3359 } 3360 SPLIT_APPEND(data, j, eol); 3361 j = i; 3362 } 3363 if (j < len) { 3364 SPLIT_APPEND(data, j, len); 3365 } 3366 3367 Py_DECREF(string); 3368 return list; 3369 3370 onError: 3371 Py_DECREF(list); 3372 Py_DECREF(string); 3373 return NULL; 3374} 3375 3376static 3377PyObject *split_char(PyUnicodeObject *self, 3378 PyObject *list, 3379 Py_UNICODE ch, 3380 int maxcount) 3381{ 3382 register int i; 3383 register int j; 3384 int len = self->length; 3385 PyObject *str; 3386 3387 for (i = j = 0; i < len; ) { 3388 if (self->str[i] == ch) { 3389 if (maxcount-- <= 0) 3390 break; 3391 SPLIT_APPEND(self->str, j, i); 3392 i = j = i + 1; 3393 } else 3394 i++; 3395 } 3396 if (j <= len) { 3397 SPLIT_APPEND(self->str, j, len); 3398 } 3399 return list; 3400 3401 onError: 3402 Py_DECREF(list); 3403 return NULL; 3404} 3405 3406static 3407PyObject *split_substring(PyUnicodeObject *self, 3408 PyObject *list, 3409 PyUnicodeObject *substring, 3410 int maxcount) 3411{ 3412 register int i; 3413 register int j; 3414 int len = self->length; 3415 int sublen = substring->length; 3416 PyObject *str; 3417 3418 for (i = j = 0; i <= len - sublen; ) { 3419 if (Py_UNICODE_MATCH(self, i, substring)) { 3420 if (maxcount-- <= 0) 3421 break; 3422 SPLIT_APPEND(self->str, j, i); 3423 i = j = i + sublen; 3424 } else 3425 i++; 3426 } 3427 if (j <= len) { 3428 SPLIT_APPEND(self->str, j, len); 3429 } 3430 return list; 3431 3432 onError: 3433 Py_DECREF(list); 3434 return NULL; 3435} 3436 3437#undef SPLIT_APPEND 3438 3439static 3440PyObject *split(PyUnicodeObject *self, 3441 PyUnicodeObject *substring, 3442 int maxcount) 3443{ 3444 PyObject *list; 3445 3446 if (maxcount < 0) 3447 maxcount = INT_MAX; 3448 3449 list = PyList_New(0); 3450 if (!list) 3451 return NULL; 3452 3453 if (substring == NULL) 3454 return split_whitespace(self,list,maxcount); 3455 3456 else if (substring->length == 1) 3457 return split_char(self,list,substring->str[0],maxcount); 3458 3459 else if (substring->length == 0) { 3460 Py_DECREF(list); 3461 PyErr_SetString(PyExc_ValueError, "empty separator"); 3462 return NULL; 3463 } 3464 else 3465 return split_substring(self,list,substring,maxcount); 3466} 3467 3468static 3469PyObject *strip(PyUnicodeObject *self, 3470 int left, 3471 int right) 3472{ 3473 Py_UNICODE *p = self->str; 3474 int start = 0; 3475 int end = self->length; 3476 3477 if (left) 3478 while (start < end && Py_UNICODE_ISSPACE(p[start])) 3479 start++; 3480 3481 if (right) 3482 while (end > start && Py_UNICODE_ISSPACE(p[end-1])) 3483 end--; 3484 3485 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) { 3486 /* couldn't strip anything off, return original string */ 3487 Py_INCREF(self); 3488 return (PyObject*) self; 3489 } 3490 3491 return (PyObject*) PyUnicode_FromUnicode( 3492 self->str + start, 3493 end - start 3494 ); 3495} 3496 3497static 3498PyObject *replace(PyUnicodeObject *self, 3499 PyUnicodeObject *str1, 3500 PyUnicodeObject *str2, 3501 int maxcount) 3502{ 3503 PyUnicodeObject *u; 3504 3505 if (maxcount < 0) 3506 maxcount = INT_MAX; 3507 3508 if (str1->length == 1 && str2->length == 1) { 3509 int i; 3510 3511 /* replace characters */ 3512 if (!findchar(self->str, self->length, str1->str[0]) && 3513 PyUnicode_CheckExact(self)) { 3514 /* nothing to replace, return original string */ 3515 Py_INCREF(self); 3516 u = self; 3517 } else { 3518 Py_UNICODE u1 = str1->str[0]; 3519 Py_UNICODE u2 = str2->str[0]; 3520 3521 u = (PyUnicodeObject*) PyUnicode_FromUnicode( 3522 NULL, 3523 self->length 3524 ); 3525 if (u != NULL) { 3526 Py_UNICODE_COPY(u->str, self->str, 3527 self->length); 3528 for (i = 0; i < u->length; i++) 3529 if (u->str[i] == u1) { 3530 if (--maxcount < 0) 3531 break; 3532 u->str[i] = u2; 3533 } 3534 } 3535 } 3536 3537 } else { 3538 int n, i; 3539 Py_UNICODE *p; 3540 3541 /* replace strings */ 3542 n = count(self, 0, self->length, str1); 3543 if (n > maxcount) 3544 n = maxcount; 3545 if (n == 0 && PyUnicode_CheckExact(self)) { 3546 /* nothing to replace, return original string */ 3547 Py_INCREF(self); 3548 u = self; 3549 } else { 3550 u = _PyUnicode_New( 3551 self->length + n * (str2->length - str1->length)); 3552 if (u) { 3553 i = 0; 3554 p = u->str; 3555 while (i <= self->length - str1->length) 3556 if (Py_UNICODE_MATCH(self, i, str1)) { 3557 /* replace string segment */ 3558 Py_UNICODE_COPY(p, str2->str, str2->length); 3559 p += str2->length; 3560 i += str1->length; 3561 if (--n <= 0) { 3562 /* copy remaining part */ 3563 Py_UNICODE_COPY(p, self->str+i, self->length-i); 3564 break; 3565 } 3566 } else 3567 *p++ = self->str[i++]; 3568 } 3569 } 3570 } 3571 3572 return (PyObject *) u; 3573} 3574 3575/* --- Unicode Object Methods --------------------------------------------- */ 3576 3577static char title__doc__[] = 3578"S.title() -> unicode\n\ 3579\n\ 3580Return a titlecased version of S, i.e. words start with title case\n\ 3581characters, all remaining cased characters have lower case."; 3582 3583static PyObject* 3584unicode_title(PyUnicodeObject *self) 3585{ 3586 return fixup(self, fixtitle); 3587} 3588 3589static char capitalize__doc__[] = 3590"S.capitalize() -> unicode\n\ 3591\n\ 3592Return a capitalized version of S, i.e. make the first character\n\ 3593have upper case."; 3594 3595static PyObject* 3596unicode_capitalize(PyUnicodeObject *self) 3597{ 3598 return fixup(self, fixcapitalize); 3599} 3600 3601#if 0 3602static char capwords__doc__[] = 3603"S.capwords() -> unicode\n\ 3604\n\ 3605Apply .capitalize() to all words in S and return the result with\n\ 3606normalized whitespace (all whitespace strings are replaced by ' ')."; 3607 3608static PyObject* 3609unicode_capwords(PyUnicodeObject *self) 3610{ 3611 PyObject *list; 3612 PyObject *item; 3613 int i; 3614 3615 /* Split into words */ 3616 list = split(self, NULL, -1); 3617 if (!list) 3618 return NULL; 3619 3620 /* Capitalize each word */ 3621 for (i = 0; i < PyList_GET_SIZE(list); i++) { 3622 item = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, i), 3623 fixcapitalize); 3624 if (item == NULL) 3625 goto onError; 3626 Py_DECREF(PyList_GET_ITEM(list, i)); 3627 PyList_SET_ITEM(list, i, item); 3628 } 3629 3630 /* Join the words to form a new string */ 3631 item = PyUnicode_Join(NULL, list); 3632 3633onError: 3634 Py_DECREF(list); 3635 return (PyObject *)item; 3636} 3637#endif 3638 3639static char center__doc__[] = 3640"S.center(width) -> unicode\n\ 3641\n\ 3642Return S centered in a Unicode string of length width. Padding is done\n\ 3643using spaces."; 3644 3645static PyObject * 3646unicode_center(PyUnicodeObject *self, PyObject *args) 3647{ 3648 int marg, left; 3649 int width; 3650 3651 if (!PyArg_ParseTuple(args, "i:center", &width)) 3652 return NULL; 3653 3654 if (self->length >= width && PyUnicode_CheckExact(self)) { 3655 Py_INCREF(self); 3656 return (PyObject*) self; 3657 } 3658 3659 marg = width - self->length; 3660 left = marg / 2 + (marg & width & 1); 3661 3662 return (PyObject*) pad(self, left, marg - left, ' '); 3663} 3664 3665#if 0 3666 3667/* This code should go into some future Unicode collation support 3668 module. The basic comparison should compare ordinals on a naive 3669 basis (this is what Java does and thus JPython too). */ 3670 3671/* speedy UTF-16 code point order comparison */ 3672/* gleaned from: */ 3673/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */ 3674 3675static short utf16Fixup[32] = 3676{ 3677 0, 0, 0, 0, 0, 0, 0, 0, 3678 0, 0, 0, 0, 0, 0, 0, 0, 3679 0, 0, 0, 0, 0, 0, 0, 0, 3680 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800 3681}; 3682 3683static int 3684unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2) 3685{ 3686 int len1, len2; 3687 3688 Py_UNICODE *s1 = str1->str; 3689 Py_UNICODE *s2 = str2->str; 3690 3691 len1 = str1->length; 3692 len2 = str2->length; 3693 3694 while (len1 > 0 && len2 > 0) { 3695 Py_UNICODE c1, c2; 3696 3697 c1 = *s1++; 3698 c2 = *s2++; 3699 3700 if (c1 > (1<<11) * 26) 3701 c1 += utf16Fixup[c1>>11]; 3702 if (c2 > (1<<11) * 26) 3703 c2 += utf16Fixup[c2>>11]; 3704 /* now c1 and c2 are in UTF-32-compatible order */ 3705 3706 if (c1 != c2) 3707 return (c1 < c2) ? -1 : 1; 3708 3709 len1--; len2--; 3710 } 3711 3712 return (len1 < len2) ? -1 : (len1 != len2); 3713} 3714 3715#else 3716 3717static int 3718unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2) 3719{ 3720 register int len1, len2; 3721 3722 Py_UNICODE *s1 = str1->str; 3723 Py_UNICODE *s2 = str2->str; 3724 3725 len1 = str1->length; 3726 len2 = str2->length; 3727 3728 while (len1 > 0 && len2 > 0) { 3729 Py_UNICODE c1, c2; 3730 3731 c1 = *s1++; 3732 c2 = *s2++; 3733 3734 if (c1 != c2) 3735 return (c1 < c2) ? -1 : 1; 3736 3737 len1--; len2--; 3738 } 3739 3740 return (len1 < len2) ? -1 : (len1 != len2); 3741} 3742 3743#endif 3744 3745int PyUnicode_Compare(PyObject *left, 3746 PyObject *right) 3747{ 3748 PyUnicodeObject *u = NULL, *v = NULL; 3749 int result; 3750 3751 /* Coerce the two arguments */ 3752 u = (PyUnicodeObject *)PyUnicode_FromObject(left); 3753 if (u == NULL) 3754 goto onError; 3755 v = (PyUnicodeObject *)PyUnicode_FromObject(right); 3756 if (v == NULL) 3757 goto onError; 3758 3759 /* Shortcut for empty or interned objects */ 3760 if (v == u) { 3761 Py_DECREF(u); 3762 Py_DECREF(v); 3763 return 0; 3764 } 3765 3766 result = unicode_compare(u, v); 3767 3768 Py_DECREF(u); 3769 Py_DECREF(v); 3770 return result; 3771 3772onError: 3773 Py_XDECREF(u); 3774 Py_XDECREF(v); 3775 return -1; 3776} 3777 3778int PyUnicode_Contains(PyObject *container, 3779 PyObject *element) 3780{ 3781 PyUnicodeObject *u = NULL, *v = NULL; 3782 int result; 3783 register const Py_UNICODE *p, *e; 3784 register Py_UNICODE ch; 3785 3786 /* Coerce the two arguments */ 3787 v = (PyUnicodeObject *)PyUnicode_FromObject(element); 3788 if (v == NULL) { 3789 PyErr_SetString(PyExc_TypeError, 3790 "'in <string>' requires character as left operand"); 3791 goto onError; 3792 } 3793 u = (PyUnicodeObject *)PyUnicode_FromObject(container); 3794 if (u == NULL) { 3795 Py_DECREF(v); 3796 goto onError; 3797 } 3798 3799 /* Check v in u */ 3800 if (PyUnicode_GET_SIZE(v) != 1) { 3801 PyErr_SetString(PyExc_TypeError, 3802 "'in <string>' requires character as left operand"); 3803 goto onError; 3804 } 3805 ch = *PyUnicode_AS_UNICODE(v); 3806 p = PyUnicode_AS_UNICODE(u); 3807 e = p + PyUnicode_GET_SIZE(u); 3808 result = 0; 3809 while (p < e) { 3810 if (*p++ == ch) { 3811 result = 1; 3812 break; 3813 } 3814 } 3815 3816 Py_DECREF(u); 3817 Py_DECREF(v); 3818 return result; 3819 3820onError: 3821 Py_XDECREF(u); 3822 Py_XDECREF(v); 3823 return -1; 3824} 3825 3826/* Concat to string or Unicode object giving a new Unicode object. */ 3827 3828PyObject *PyUnicode_Concat(PyObject *left, 3829 PyObject *right) 3830{ 3831 PyUnicodeObject *u = NULL, *v = NULL, *w; 3832 3833 /* Coerce the two arguments */ 3834 u = (PyUnicodeObject *)PyUnicode_FromObject(left); 3835 if (u == NULL) 3836 goto onError; 3837 v = (PyUnicodeObject *)PyUnicode_FromObject(right); 3838 if (v == NULL) 3839 goto onError; 3840 3841 /* Shortcuts */ 3842 if (v == unicode_empty) { 3843 Py_DECREF(v); 3844 return (PyObject *)u; 3845 } 3846 if (u == unicode_empty) { 3847 Py_DECREF(u); 3848 return (PyObject *)v; 3849 } 3850 3851 /* Concat the two Unicode strings */ 3852 w = _PyUnicode_New(u->length + v->length); 3853 if (w == NULL) 3854 goto onError; 3855 Py_UNICODE_COPY(w->str, u->str, u->length); 3856 Py_UNICODE_COPY(w->str + u->length, v->str, v->length); 3857 3858 Py_DECREF(u); 3859 Py_DECREF(v); 3860 return (PyObject *)w; 3861 3862onError: 3863 Py_XDECREF(u); 3864 Py_XDECREF(v); 3865 return NULL; 3866} 3867 3868static char count__doc__[] = 3869"S.count(sub[, start[, end]]) -> int\n\ 3870\n\ 3871Return the number of occurrences of substring sub in Unicode string\n\ 3872S[start:end]. Optional arguments start and end are\n\ 3873interpreted as in slice notation."; 3874 3875static PyObject * 3876unicode_count(PyUnicodeObject *self, PyObject *args) 3877{ 3878 PyUnicodeObject *substring; 3879 int start = 0; 3880 int end = INT_MAX; 3881 PyObject *result; 3882 3883 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring, 3884 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end)) 3885 return NULL; 3886 3887 substring = (PyUnicodeObject *)PyUnicode_FromObject( 3888 (PyObject *)substring); 3889 if (substring == NULL) 3890 return NULL; 3891 3892 if (start < 0) 3893 start += self->length; 3894 if (start < 0) 3895 start = 0; 3896 if (end > self->length) 3897 end = self->length; 3898 if (end < 0) 3899 end += self->length; 3900 if (end < 0) 3901 end = 0; 3902 3903 result = PyInt_FromLong((long) count(self, start, end, substring)); 3904 3905 Py_DECREF(substring); 3906 return result; 3907} 3908 3909static char encode__doc__[] = 3910"S.encode([encoding[,errors]]) -> string\n\ 3911\n\ 3912Return an encoded string version of S. Default encoding is the current\n\ 3913default string encoding. errors may be given to set a different error\n\ 3914handling scheme. Default is 'strict' meaning that encoding errors raise\n\ 3915a ValueError. Other possible values are 'ignore' and 'replace'."; 3916 3917static PyObject * 3918unicode_encode(PyUnicodeObject *self, PyObject *args) 3919{ 3920 char *encoding = NULL; 3921 char *errors = NULL; 3922 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors)) 3923 return NULL; 3924 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors); 3925} 3926 3927static char expandtabs__doc__[] = 3928"S.expandtabs([tabsize]) -> unicode\n\ 3929\n\ 3930Return a copy of S where all tab characters are expanded using spaces.\n\ 3931If tabsize is not given, a tab size of 8 characters is assumed."; 3932 3933static PyObject* 3934unicode_expandtabs(PyUnicodeObject *self, PyObject *args) 3935{ 3936 Py_UNICODE *e; 3937 Py_UNICODE *p; 3938 Py_UNICODE *q; 3939 int i, j; 3940 PyUnicodeObject *u; 3941 int tabsize = 8; 3942 3943 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize)) 3944 return NULL; 3945 3946 /* First pass: determine size of output string */ 3947 i = j = 0; 3948 e = self->str + self->length; 3949 for (p = self->str; p < e; p++) 3950 if (*p == '\t') { 3951 if (tabsize > 0) 3952 j += tabsize - (j % tabsize); 3953 } 3954 else { 3955 j++; 3956 if (*p == '\n' || *p == '\r') { 3957 i += j; 3958 j = 0; 3959 } 3960 } 3961 3962 /* Second pass: create output string and fill it */ 3963 u = _PyUnicode_New(i + j); 3964 if (!u) 3965 return NULL; 3966 3967 j = 0; 3968 q = u->str; 3969 3970 for (p = self->str; p < e; p++) 3971 if (*p == '\t') { 3972 if (tabsize > 0) { 3973 i = tabsize - (j % tabsize); 3974 j += i; 3975 while (i--) 3976 *q++ = ' '; 3977 } 3978 } 3979 else { 3980 j++; 3981 *q++ = *p; 3982 if (*p == '\n' || *p == '\r') 3983 j = 0; 3984 } 3985 3986 return (PyObject*) u; 3987} 3988 3989static char find__doc__[] = 3990"S.find(sub [,start [,end]]) -> int\n\ 3991\n\ 3992Return the lowest index in S where substring sub is found,\n\ 3993such that sub is contained within s[start,end]. Optional\n\ 3994arguments start and end are interpreted as in slice notation.\n\ 3995\n\ 3996Return -1 on failure."; 3997 3998static PyObject * 3999unicode_find(PyUnicodeObject *self, PyObject *args) 4000{ 4001 PyUnicodeObject *substring; 4002 int start = 0; 4003 int end = INT_MAX; 4004 PyObject *result; 4005 4006 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring, 4007 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end)) 4008 return NULL; 4009 substring = (PyUnicodeObject *)PyUnicode_FromObject( 4010 (PyObject *)substring); 4011 if (substring == NULL) 4012 return NULL; 4013 4014 result = PyInt_FromLong(findstring(self, substring, start, end, 1)); 4015 4016 Py_DECREF(substring); 4017 return result; 4018} 4019 4020static PyObject * 4021unicode_getitem(PyUnicodeObject *self, int index) 4022{ 4023 if (index < 0 || index >= self->length) { 4024 PyErr_SetString(PyExc_IndexError, "string index out of range"); 4025 return NULL; 4026 } 4027 4028 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1); 4029} 4030 4031static long 4032unicode_hash(PyUnicodeObject *self) 4033{ 4034 /* Since Unicode objects compare equal to their ASCII string 4035 counterparts, they should use the individual character values 4036 as basis for their hash value. This is needed to assure that 4037 strings and Unicode objects behave in the same way as 4038 dictionary keys. */ 4039 4040 register int len; 4041 register Py_UNICODE *p; 4042 register long x; 4043 4044 if (self->hash != -1) 4045 return self->hash; 4046 len = PyUnicode_GET_SIZE(self); 4047 p = PyUnicode_AS_UNICODE(self); 4048 x = *p << 7; 4049 while (--len >= 0) 4050 x = (1000003*x) ^ *p++; 4051 x ^= PyUnicode_GET_SIZE(self); 4052 if (x == -1) 4053 x = -2; 4054 self->hash = x; 4055 return x; 4056} 4057 4058static char index__doc__[] = 4059"S.index(sub [,start [,end]]) -> int\n\ 4060\n\ 4061Like S.find() but raise ValueError when the substring is not found."; 4062 4063static PyObject * 4064unicode_index(PyUnicodeObject *self, PyObject *args) 4065{ 4066 int result; 4067 PyUnicodeObject *substring; 4068 int start = 0; 4069 int end = INT_MAX; 4070 4071 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring, 4072 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end)) 4073 return NULL; 4074 4075 substring = (PyUnicodeObject *)PyUnicode_FromObject( 4076 (PyObject *)substring); 4077 if (substring == NULL) 4078 return NULL; 4079 4080 result = findstring(self, substring, start, end, 1); 4081 4082 Py_DECREF(substring); 4083 if (result < 0) { 4084 PyErr_SetString(PyExc_ValueError, "substring not found"); 4085 return NULL; 4086 } 4087 return PyInt_FromLong(result); 4088} 4089 4090static char islower__doc__[] = 4091"S.islower() -> int\n\ 4092\n\ 4093Return 1 if all cased characters in S are lowercase and there is\n\ 4094at least one cased character in S, 0 otherwise."; 4095 4096static PyObject* 4097unicode_islower(PyUnicodeObject *self) 4098{ 4099 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 4100 register const Py_UNICODE *e; 4101 int cased; 4102 4103 /* Shortcut for single character strings */ 4104 if (PyUnicode_GET_SIZE(self) == 1) 4105 return PyInt_FromLong(Py_UNICODE_ISLOWER(*p) != 0); 4106 4107 /* Special case for empty strings */ 4108 if (PyString_GET_SIZE(self) == 0) 4109 return PyInt_FromLong(0); 4110 4111 e = p + PyUnicode_GET_SIZE(self); 4112 cased = 0; 4113 for (; p < e; p++) { 4114 register const Py_UNICODE ch = *p; 4115 4116 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) 4117 return PyInt_FromLong(0); 4118 else if (!cased && Py_UNICODE_ISLOWER(ch)) 4119 cased = 1; 4120 } 4121 return PyInt_FromLong(cased); 4122} 4123 4124static char isupper__doc__[] = 4125"S.isupper() -> int\n\ 4126\n\ 4127Return 1 if all cased characters in S are uppercase and there is\n\ 4128at least one cased character in S, 0 otherwise."; 4129 4130static PyObject* 4131unicode_isupper(PyUnicodeObject *self) 4132{ 4133 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 4134 register const Py_UNICODE *e; 4135 int cased; 4136 4137 /* Shortcut for single character strings */ 4138 if (PyUnicode_GET_SIZE(self) == 1) 4139 return PyInt_FromLong(Py_UNICODE_ISUPPER(*p) != 0); 4140 4141 /* Special case for empty strings */ 4142 if (PyString_GET_SIZE(self) == 0) 4143 return PyInt_FromLong(0); 4144 4145 e = p + PyUnicode_GET_SIZE(self); 4146 cased = 0; 4147 for (; p < e; p++) { 4148 register const Py_UNICODE ch = *p; 4149 4150 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch)) 4151 return PyInt_FromLong(0); 4152 else if (!cased && Py_UNICODE_ISUPPER(ch)) 4153 cased = 1; 4154 } 4155 return PyInt_FromLong(cased); 4156} 4157 4158static char istitle__doc__[] = 4159"S.istitle() -> int\n\ 4160\n\ 4161Return 1 if S is a titlecased string, i.e. upper- and titlecase characters\n\ 4162may only follow uncased characters and lowercase characters only cased\n\ 4163ones. Return 0 otherwise."; 4164 4165static PyObject* 4166unicode_istitle(PyUnicodeObject *self) 4167{ 4168 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 4169 register const Py_UNICODE *e; 4170 int cased, previous_is_cased; 4171 4172 /* Shortcut for single character strings */ 4173 if (PyUnicode_GET_SIZE(self) == 1) 4174 return PyInt_FromLong((Py_UNICODE_ISTITLE(*p) != 0) || 4175 (Py_UNICODE_ISUPPER(*p) != 0)); 4176 4177 /* Special case for empty strings */ 4178 if (PyString_GET_SIZE(self) == 0) 4179 return PyInt_FromLong(0); 4180 4181 e = p + PyUnicode_GET_SIZE(self); 4182 cased = 0; 4183 previous_is_cased = 0; 4184 for (; p < e; p++) { 4185 register const Py_UNICODE ch = *p; 4186 4187 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) { 4188 if (previous_is_cased) 4189 return PyInt_FromLong(0); 4190 previous_is_cased = 1; 4191 cased = 1; 4192 } 4193 else if (Py_UNICODE_ISLOWER(ch)) { 4194 if (!previous_is_cased) 4195 return PyInt_FromLong(0); 4196 previous_is_cased = 1; 4197 cased = 1; 4198 } 4199 else 4200 previous_is_cased = 0; 4201 } 4202 return PyInt_FromLong(cased); 4203} 4204 4205static char isspace__doc__[] = 4206"S.isspace() -> int\n\ 4207\n\ 4208Return 1 if there are only whitespace characters in S,\n\ 42090 otherwise."; 4210 4211static PyObject* 4212unicode_isspace(PyUnicodeObject *self) 4213{ 4214 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 4215 register const Py_UNICODE *e; 4216 4217 /* Shortcut for single character strings */ 4218 if (PyUnicode_GET_SIZE(self) == 1 && 4219 Py_UNICODE_ISSPACE(*p)) 4220 return PyInt_FromLong(1); 4221 4222 /* Special case for empty strings */ 4223 if (PyString_GET_SIZE(self) == 0) 4224 return PyInt_FromLong(0); 4225 4226 e = p + PyUnicode_GET_SIZE(self); 4227 for (; p < e; p++) { 4228 if (!Py_UNICODE_ISSPACE(*p)) 4229 return PyInt_FromLong(0); 4230 } 4231 return PyInt_FromLong(1); 4232} 4233 4234static char isalpha__doc__[] = 4235"S.isalpha() -> int\n\ 4236\n\ 4237Return 1 if all characters in S are alphabetic\n\ 4238and there is at least one character in S, 0 otherwise."; 4239 4240static PyObject* 4241unicode_isalpha(PyUnicodeObject *self) 4242{ 4243 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 4244 register const Py_UNICODE *e; 4245 4246 /* Shortcut for single character strings */ 4247 if (PyUnicode_GET_SIZE(self) == 1 && 4248 Py_UNICODE_ISALPHA(*p)) 4249 return PyInt_FromLong(1); 4250 4251 /* Special case for empty strings */ 4252 if (PyString_GET_SIZE(self) == 0) 4253 return PyInt_FromLong(0); 4254 4255 e = p + PyUnicode_GET_SIZE(self); 4256 for (; p < e; p++) { 4257 if (!Py_UNICODE_ISALPHA(*p)) 4258 return PyInt_FromLong(0); 4259 } 4260 return PyInt_FromLong(1); 4261} 4262 4263static char isalnum__doc__[] = 4264"S.isalnum() -> int\n\ 4265\n\ 4266Return 1 if all characters in S are alphanumeric\n\ 4267and there is at least one character in S, 0 otherwise."; 4268 4269static PyObject* 4270unicode_isalnum(PyUnicodeObject *self) 4271{ 4272 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 4273 register const Py_UNICODE *e; 4274 4275 /* Shortcut for single character strings */ 4276 if (PyUnicode_GET_SIZE(self) == 1 && 4277 Py_UNICODE_ISALNUM(*p)) 4278 return PyInt_FromLong(1); 4279 4280 /* Special case for empty strings */ 4281 if (PyString_GET_SIZE(self) == 0) 4282 return PyInt_FromLong(0); 4283 4284 e = p + PyUnicode_GET_SIZE(self); 4285 for (; p < e; p++) { 4286 if (!Py_UNICODE_ISALNUM(*p)) 4287 return PyInt_FromLong(0); 4288 } 4289 return PyInt_FromLong(1); 4290} 4291 4292static char isdecimal__doc__[] = 4293"S.isdecimal() -> int\n\ 4294\n\ 4295Return 1 if there are only decimal characters in S,\n\ 42960 otherwise."; 4297 4298static PyObject* 4299unicode_isdecimal(PyUnicodeObject *self) 4300{ 4301 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 4302 register const Py_UNICODE *e; 4303 4304 /* Shortcut for single character strings */ 4305 if (PyUnicode_GET_SIZE(self) == 1 && 4306 Py_UNICODE_ISDECIMAL(*p)) 4307 return PyInt_FromLong(1); 4308 4309 /* Special case for empty strings */ 4310 if (PyString_GET_SIZE(self) == 0) 4311 return PyInt_FromLong(0); 4312 4313 e = p + PyUnicode_GET_SIZE(self); 4314 for (; p < e; p++) { 4315 if (!Py_UNICODE_ISDECIMAL(*p)) 4316 return PyInt_FromLong(0); 4317 } 4318 return PyInt_FromLong(1); 4319} 4320 4321static char isdigit__doc__[] = 4322"S.isdigit() -> int\n\ 4323\n\ 4324Return 1 if there are only digit characters in S,\n\ 43250 otherwise."; 4326 4327static PyObject* 4328unicode_isdigit(PyUnicodeObject *self) 4329{ 4330 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 4331 register const Py_UNICODE *e; 4332 4333 /* Shortcut for single character strings */ 4334 if (PyUnicode_GET_SIZE(self) == 1 && 4335 Py_UNICODE_ISDIGIT(*p)) 4336 return PyInt_FromLong(1); 4337 4338 /* Special case for empty strings */ 4339 if (PyString_GET_SIZE(self) == 0) 4340 return PyInt_FromLong(0); 4341 4342 e = p + PyUnicode_GET_SIZE(self); 4343 for (; p < e; p++) { 4344 if (!Py_UNICODE_ISDIGIT(*p)) 4345 return PyInt_FromLong(0); 4346 } 4347 return PyInt_FromLong(1); 4348} 4349 4350static char isnumeric__doc__[] = 4351"S.isnumeric() -> int\n\ 4352\n\ 4353Return 1 if there are only numeric characters in S,\n\ 43540 otherwise."; 4355 4356static PyObject* 4357unicode_isnumeric(PyUnicodeObject *self) 4358{ 4359 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 4360 register const Py_UNICODE *e; 4361 4362 /* Shortcut for single character strings */ 4363 if (PyUnicode_GET_SIZE(self) == 1 && 4364 Py_UNICODE_ISNUMERIC(*p)) 4365 return PyInt_FromLong(1); 4366 4367 /* Special case for empty strings */ 4368 if (PyString_GET_SIZE(self) == 0) 4369 return PyInt_FromLong(0); 4370 4371 e = p + PyUnicode_GET_SIZE(self); 4372 for (; p < e; p++) { 4373 if (!Py_UNICODE_ISNUMERIC(*p)) 4374 return PyInt_FromLong(0); 4375 } 4376 return PyInt_FromLong(1); 4377} 4378 4379static char join__doc__[] = 4380"S.join(sequence) -> unicode\n\ 4381\n\ 4382Return a string which is the concatenation of the strings in the\n\ 4383sequence. The separator between elements is S."; 4384 4385static PyObject* 4386unicode_join(PyObject *self, PyObject *data) 4387{ 4388 return PyUnicode_Join(self, data); 4389} 4390 4391static int 4392unicode_length(PyUnicodeObject *self) 4393{ 4394 return self->length; 4395} 4396 4397static char ljust__doc__[] = 4398"S.ljust(width) -> unicode\n\ 4399\n\ 4400Return S left justified in a Unicode string of length width. Padding is\n\ 4401done using spaces."; 4402 4403static PyObject * 4404unicode_ljust(PyUnicodeObject *self, PyObject *args) 4405{ 4406 int width; 4407 if (!PyArg_ParseTuple(args, "i:ljust", &width)) 4408 return NULL; 4409 4410 if (self->length >= width && PyUnicode_CheckExact(self)) { 4411 Py_INCREF(self); 4412 return (PyObject*) self; 4413 } 4414 4415 return (PyObject*) pad(self, 0, width - self->length, ' '); 4416} 4417 4418static char lower__doc__[] = 4419"S.lower() -> unicode\n\ 4420\n\ 4421Return a copy of the string S converted to lowercase."; 4422 4423static PyObject* 4424unicode_lower(PyUnicodeObject *self) 4425{ 4426 return fixup(self, fixlower); 4427} 4428 4429static char lstrip__doc__[] = 4430"S.lstrip() -> unicode\n\ 4431\n\ 4432Return a copy of the string S with leading whitespace removed."; 4433 4434static PyObject * 4435unicode_lstrip(PyUnicodeObject *self) 4436{ 4437 return strip(self, 1, 0); 4438} 4439 4440static PyObject* 4441unicode_repeat(PyUnicodeObject *str, int len) 4442{ 4443 PyUnicodeObject *u; 4444 Py_UNICODE *p; 4445 int nchars; 4446 size_t nbytes; 4447 4448 if (len < 0) 4449 len = 0; 4450 4451 if (len == 1 && PyUnicode_CheckExact(str)) { 4452 /* no repeat, return original string */ 4453 Py_INCREF(str); 4454 return (PyObject*) str; 4455 } 4456 4457 /* ensure # of chars needed doesn't overflow int and # of bytes 4458 * needed doesn't overflow size_t 4459 */ 4460 nchars = len * str->length; 4461 if (len && nchars / len != str->length) { 4462 PyErr_SetString(PyExc_OverflowError, 4463 "repeated string is too long"); 4464 return NULL; 4465 } 4466 nbytes = (nchars + 1) * sizeof(Py_UNICODE); 4467 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) { 4468 PyErr_SetString(PyExc_OverflowError, 4469 "repeated string is too long"); 4470 return NULL; 4471 } 4472 u = _PyUnicode_New(nchars); 4473 if (!u) 4474 return NULL; 4475 4476 p = u->str; 4477 4478 while (len-- > 0) { 4479 Py_UNICODE_COPY(p, str->str, str->length); 4480 p += str->length; 4481 } 4482 4483 return (PyObject*) u; 4484} 4485 4486PyObject *PyUnicode_Replace(PyObject *obj, 4487 PyObject *subobj, 4488 PyObject *replobj, 4489 int maxcount) 4490{ 4491 PyObject *self; 4492 PyObject *str1; 4493 PyObject *str2; 4494 PyObject *result; 4495 4496 self = PyUnicode_FromObject(obj); 4497 if (self == NULL) 4498 return NULL; 4499 str1 = PyUnicode_FromObject(subobj); 4500 if (str1 == NULL) { 4501 Py_DECREF(self); 4502 return NULL; 4503 } 4504 str2 = PyUnicode_FromObject(replobj); 4505 if (str2 == NULL) { 4506 Py_DECREF(self); 4507 Py_DECREF(str1); 4508 return NULL; 4509 } 4510 result = replace((PyUnicodeObject *)self, 4511 (PyUnicodeObject *)str1, 4512 (PyUnicodeObject *)str2, 4513 maxcount); 4514 Py_DECREF(self); 4515 Py_DECREF(str1); 4516 Py_DECREF(str2); 4517 return result; 4518} 4519 4520static char replace__doc__[] = 4521"S.replace (old, new[, maxsplit]) -> unicode\n\ 4522\n\ 4523Return a copy of S with all occurrences of substring\n\ 4524old replaced by new. If the optional argument maxsplit is\n\ 4525given, only the first maxsplit occurrences are replaced."; 4526 4527static PyObject* 4528unicode_replace(PyUnicodeObject *self, PyObject *args) 4529{ 4530 PyUnicodeObject *str1; 4531 PyUnicodeObject *str2; 4532 int maxcount = -1; 4533 PyObject *result; 4534 4535 if (!PyArg_ParseTuple(args, "OO|i:replace", &str1, &str2, &maxcount)) 4536 return NULL; 4537 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1); 4538 if (str1 == NULL) 4539 return NULL; 4540 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2); 4541 if (str2 == NULL) 4542 return NULL; 4543 4544 result = replace(self, str1, str2, maxcount); 4545 4546 Py_DECREF(str1); 4547 Py_DECREF(str2); 4548 return result; 4549} 4550 4551static 4552PyObject *unicode_repr(PyObject *unicode) 4553{ 4554 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode), 4555 PyUnicode_GET_SIZE(unicode), 4556 1); 4557} 4558 4559static char rfind__doc__[] = 4560"S.rfind(sub [,start [,end]]) -> int\n\ 4561\n\ 4562Return the highest index in S where substring sub is found,\n\ 4563such that sub is contained within s[start,end]. Optional\n\ 4564arguments start and end are interpreted as in slice notation.\n\ 4565\n\ 4566Return -1 on failure."; 4567 4568static PyObject * 4569unicode_rfind(PyUnicodeObject *self, PyObject *args) 4570{ 4571 PyUnicodeObject *substring; 4572 int start = 0; 4573 int end = INT_MAX; 4574 PyObject *result; 4575 4576 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring, 4577 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end)) 4578 return NULL; 4579 substring = (PyUnicodeObject *)PyUnicode_FromObject( 4580 (PyObject *)substring); 4581 if (substring == NULL) 4582 return NULL; 4583 4584 result = PyInt_FromLong(findstring(self, substring, start, end, -1)); 4585 4586 Py_DECREF(substring); 4587 return result; 4588} 4589 4590static char rindex__doc__[] = 4591"S.rindex(sub [,start [,end]]) -> int\n\ 4592\n\ 4593Like S.rfind() but raise ValueError when the substring is not found."; 4594 4595static PyObject * 4596unicode_rindex(PyUnicodeObject *self, PyObject *args) 4597{ 4598 int result; 4599 PyUnicodeObject *substring; 4600 int start = 0; 4601 int end = INT_MAX; 4602 4603 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring, 4604 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end)) 4605 return NULL; 4606 substring = (PyUnicodeObject *)PyUnicode_FromObject( 4607 (PyObject *)substring); 4608 if (substring == NULL) 4609 return NULL; 4610 4611 result = findstring(self, substring, start, end, -1); 4612 4613 Py_DECREF(substring); 4614 if (result < 0) { 4615 PyErr_SetString(PyExc_ValueError, "substring not found"); 4616 return NULL; 4617 } 4618 return PyInt_FromLong(result); 4619} 4620 4621static char rjust__doc__[] = 4622"S.rjust(width) -> unicode\n\ 4623\n\ 4624Return S right justified in a Unicode string of length width. Padding is\n\ 4625done using spaces."; 4626 4627static PyObject * 4628unicode_rjust(PyUnicodeObject *self, PyObject *args) 4629{ 4630 int width; 4631 if (!PyArg_ParseTuple(args, "i:rjust", &width)) 4632 return NULL; 4633 4634 if (self->length >= width && PyUnicode_CheckExact(self)) { 4635 Py_INCREF(self); 4636 return (PyObject*) self; 4637 } 4638 4639 return (PyObject*) pad(self, width - self->length, 0, ' '); 4640} 4641 4642static char rstrip__doc__[] = 4643"S.rstrip() -> unicode\n\ 4644\n\ 4645Return a copy of the string S with trailing whitespace removed."; 4646 4647static PyObject * 4648unicode_rstrip(PyUnicodeObject *self) 4649{ 4650 return strip(self, 0, 1); 4651} 4652 4653static PyObject* 4654unicode_slice(PyUnicodeObject *self, int start, int end) 4655{ 4656 /* standard clamping */ 4657 if (start < 0) 4658 start = 0; 4659 if (end < 0) 4660 end = 0; 4661 if (end > self->length) 4662 end = self->length; 4663 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) { 4664 /* full slice, return original string */ 4665 Py_INCREF(self); 4666 return (PyObject*) self; 4667 } 4668 if (start > end) 4669 start = end; 4670 /* copy slice */ 4671 return (PyObject*) PyUnicode_FromUnicode(self->str + start, 4672 end - start); 4673} 4674 4675PyObject *PyUnicode_Split(PyObject *s, 4676 PyObject *sep, 4677 int maxsplit) 4678{ 4679 PyObject *result; 4680 4681 s = PyUnicode_FromObject(s); 4682 if (s == NULL) 4683 return NULL; 4684 if (sep != NULL) { 4685 sep = PyUnicode_FromObject(sep); 4686 if (sep == NULL) { 4687 Py_DECREF(s); 4688 return NULL; 4689 } 4690 } 4691 4692 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit); 4693 4694 Py_DECREF(s); 4695 Py_XDECREF(sep); 4696 return result; 4697} 4698 4699static char split__doc__[] = 4700"S.split([sep [,maxsplit]]) -> list of strings\n\ 4701\n\ 4702Return a list of the words in S, using sep as the\n\ 4703delimiter string. If maxsplit is given, at most maxsplit\n\ 4704splits are done. If sep is not specified, any whitespace string\n\ 4705is a separator."; 4706 4707static PyObject* 4708unicode_split(PyUnicodeObject *self, PyObject *args) 4709{ 4710 PyObject *substring = Py_None; 4711 int maxcount = -1; 4712 4713 if (!PyArg_ParseTuple(args, "|Oi:split", &substring, &maxcount)) 4714 return NULL; 4715 4716 if (substring == Py_None) 4717 return split(self, NULL, maxcount); 4718 else if (PyUnicode_Check(substring)) 4719 return split(self, (PyUnicodeObject *)substring, maxcount); 4720 else 4721 return PyUnicode_Split((PyObject *)self, substring, maxcount); 4722} 4723 4724static char splitlines__doc__[] = 4725"S.splitlines([keepends]]) -> list of strings\n\ 4726\n\ 4727Return a list of the lines in S, breaking at line boundaries.\n\ 4728Line breaks are not included in the resulting list unless keepends\n\ 4729is given and true."; 4730 4731static PyObject* 4732unicode_splitlines(PyUnicodeObject *self, PyObject *args) 4733{ 4734 int keepends = 0; 4735 4736 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends)) 4737 return NULL; 4738 4739 return PyUnicode_Splitlines((PyObject *)self, keepends); 4740} 4741 4742static 4743PyObject *unicode_str(PyUnicodeObject *self) 4744{ 4745 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL); 4746} 4747 4748static char strip__doc__[] = 4749"S.strip() -> unicode\n\ 4750\n\ 4751Return a copy of S with leading and trailing whitespace removed."; 4752 4753static PyObject * 4754unicode_strip(PyUnicodeObject *self) 4755{ 4756 return strip(self, 1, 1); 4757} 4758 4759static char swapcase__doc__[] = 4760"S.swapcase() -> unicode\n\ 4761\n\ 4762Return a copy of S with uppercase characters converted to lowercase\n\ 4763and vice versa."; 4764 4765static PyObject* 4766unicode_swapcase(PyUnicodeObject *self) 4767{ 4768 return fixup(self, fixswapcase); 4769} 4770 4771static char translate__doc__[] = 4772"S.translate(table) -> unicode\n\ 4773\n\ 4774Return a copy of the string S, where all characters have been mapped\n\ 4775through the given translation table, which must be a mapping of\n\ 4776Unicode ordinals to Unicode ordinals or None. Unmapped characters\n\ 4777are left untouched. Characters mapped to None are deleted."; 4778 4779static PyObject* 4780unicode_translate(PyUnicodeObject *self, PyObject *table) 4781{ 4782 return PyUnicode_TranslateCharmap(self->str, 4783 self->length, 4784 table, 4785 "ignore"); 4786} 4787 4788static char upper__doc__[] = 4789"S.upper() -> unicode\n\ 4790\n\ 4791Return a copy of S converted to uppercase."; 4792 4793static PyObject* 4794unicode_upper(PyUnicodeObject *self) 4795{ 4796 return fixup(self, fixupper); 4797} 4798 4799#if 0 4800static char zfill__doc__[] = 4801"S.zfill(width) -> unicode\n\ 4802\n\ 4803Pad a numeric string x with zeros on the left, to fill a field\n\ 4804of the specified width. The string x is never truncated."; 4805 4806static PyObject * 4807unicode_zfill(PyUnicodeObject *self, PyObject *args) 4808{ 4809 int fill; 4810 PyUnicodeObject *u; 4811 4812 int width; 4813 if (!PyArg_ParseTuple(args, "i:zfill", &width)) 4814 return NULL; 4815 4816 if (self->length >= width) { 4817 Py_INCREF(self); 4818 return (PyObject*) self; 4819 } 4820 4821 fill = width - self->length; 4822 4823 u = pad(self, fill, 0, '0'); 4824 4825 if (u->str[fill] == '+' || u->str[fill] == '-') { 4826 /* move sign to beginning of string */ 4827 u->str[0] = u->str[fill]; 4828 u->str[fill] = '0'; 4829 } 4830 4831 return (PyObject*) u; 4832} 4833#endif 4834 4835#if 0 4836static PyObject* 4837unicode_freelistsize(PyUnicodeObject *self) 4838{ 4839 return PyInt_FromLong(unicode_freelist_size); 4840} 4841#endif 4842 4843static char startswith__doc__[] = 4844"S.startswith(prefix[, start[, end]]) -> int\n\ 4845\n\ 4846Return 1 if S starts with the specified prefix, otherwise return 0. With\n\ 4847optional start, test S beginning at that position. With optional end, stop\n\ 4848comparing S at that position."; 4849 4850static PyObject * 4851unicode_startswith(PyUnicodeObject *self, 4852 PyObject *args) 4853{ 4854 PyUnicodeObject *substring; 4855 int start = 0; 4856 int end = INT_MAX; 4857 PyObject *result; 4858 4859 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &substring, 4860 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end)) 4861 return NULL; 4862 substring = (PyUnicodeObject *)PyUnicode_FromObject( 4863 (PyObject *)substring); 4864 if (substring == NULL) 4865 return NULL; 4866 4867 result = PyInt_FromLong(tailmatch(self, substring, start, end, -1)); 4868 4869 Py_DECREF(substring); 4870 return result; 4871} 4872 4873 4874static char endswith__doc__[] = 4875"S.endswith(suffix[, start[, end]]) -> int\n\ 4876\n\ 4877Return 1 if S ends with the specified suffix, otherwise return 0. With\n\ 4878optional start, test S beginning at that position. With optional end, stop\n\ 4879comparing S at that position."; 4880 4881static PyObject * 4882unicode_endswith(PyUnicodeObject *self, 4883 PyObject *args) 4884{ 4885 PyUnicodeObject *substring; 4886 int start = 0; 4887 int end = INT_MAX; 4888 PyObject *result; 4889 4890 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &substring, 4891 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end)) 4892 return NULL; 4893 substring = (PyUnicodeObject *)PyUnicode_FromObject( 4894 (PyObject *)substring); 4895 if (substring == NULL) 4896 return NULL; 4897 4898 result = PyInt_FromLong(tailmatch(self, substring, start, end, +1)); 4899 4900 Py_DECREF(substring); 4901 return result; 4902} 4903 4904 4905static PyMethodDef unicode_methods[] = { 4906 4907 /* Order is according to common usage: often used methods should 4908 appear first, since lookup is done sequentially. */ 4909 4910 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__}, 4911 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__}, 4912 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__}, 4913 {"join", (PyCFunction) unicode_join, METH_O, join__doc__}, 4914 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__}, 4915 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__}, 4916 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__}, 4917 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__}, 4918 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__}, 4919 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__}, 4920 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__}, 4921 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__}, 4922 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__}, 4923 {"lstrip", (PyCFunction) unicode_lstrip, METH_NOARGS, lstrip__doc__}, 4924/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */ 4925 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__}, 4926 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__}, 4927 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__}, 4928 {"rstrip", (PyCFunction) unicode_rstrip, METH_NOARGS, rstrip__doc__}, 4929 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__}, 4930 {"strip", (PyCFunction) unicode_strip, METH_NOARGS, strip__doc__}, 4931 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__}, 4932 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__}, 4933 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__}, 4934 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__}, 4935 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__}, 4936 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__}, 4937 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__}, 4938 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__}, 4939 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__}, 4940 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__}, 4941 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__}, 4942 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__}, 4943 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__}, 4944 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__}, 4945#if 0 4946 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__}, 4947 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__}, 4948#endif 4949 4950#if 0 4951 /* This one is just used for debugging the implementation. */ 4952 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS}, 4953#endif 4954 4955 {NULL, NULL} 4956}; 4957 4958static PySequenceMethods unicode_as_sequence = { 4959 (inquiry) unicode_length, /* sq_length */ 4960 (binaryfunc) PyUnicode_Concat, /* sq_concat */ 4961 (intargfunc) unicode_repeat, /* sq_repeat */ 4962 (intargfunc) unicode_getitem, /* sq_item */ 4963 (intintargfunc) unicode_slice, /* sq_slice */ 4964 0, /* sq_ass_item */ 4965 0, /* sq_ass_slice */ 4966 (objobjproc)PyUnicode_Contains, /*sq_contains*/ 4967}; 4968 4969static int 4970unicode_buffer_getreadbuf(PyUnicodeObject *self, 4971 int index, 4972 const void **ptr) 4973{ 4974 if (index != 0) { 4975 PyErr_SetString(PyExc_SystemError, 4976 "accessing non-existent unicode segment"); 4977 return -1; 4978 } 4979 *ptr = (void *) self->str; 4980 return PyUnicode_GET_DATA_SIZE(self); 4981} 4982 4983static int 4984unicode_buffer_getwritebuf(PyUnicodeObject *self, int index, 4985 const void **ptr) 4986{ 4987 PyErr_SetString(PyExc_TypeError, 4988 "cannot use unicode as modifyable buffer"); 4989 return -1; 4990} 4991 4992static int 4993unicode_buffer_getsegcount(PyUnicodeObject *self, 4994 int *lenp) 4995{ 4996 if (lenp) 4997 *lenp = PyUnicode_GET_DATA_SIZE(self); 4998 return 1; 4999} 5000 5001static int 5002unicode_buffer_getcharbuf(PyUnicodeObject *self, 5003 int index, 5004 const void **ptr) 5005{ 5006 PyObject *str; 5007 5008 if (index != 0) { 5009 PyErr_SetString(PyExc_SystemError, 5010 "accessing non-existent unicode segment"); 5011 return -1; 5012 } 5013 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL); 5014 if (str == NULL) 5015 return -1; 5016 *ptr = (void *) PyString_AS_STRING(str); 5017 return PyString_GET_SIZE(str); 5018} 5019 5020/* Helpers for PyUnicode_Format() */ 5021 5022static PyObject * 5023getnextarg(PyObject *args, int arglen, int *p_argidx) 5024{ 5025 int argidx = *p_argidx; 5026 if (argidx < arglen) { 5027 (*p_argidx)++; 5028 if (arglen < 0) 5029 return args; 5030 else 5031 return PyTuple_GetItem(args, argidx); 5032 } 5033 PyErr_SetString(PyExc_TypeError, 5034 "not enough arguments for format string"); 5035 return NULL; 5036} 5037 5038#define F_LJUST (1<<0) 5039#define F_SIGN (1<<1) 5040#define F_BLANK (1<<2) 5041#define F_ALT (1<<3) 5042#define F_ZERO (1<<4) 5043 5044static 5045int usprintf(register Py_UNICODE *buffer, char *format, ...) 5046{ 5047 register int i; 5048 int len; 5049 va_list va; 5050 char *charbuffer; 5051 va_start(va, format); 5052 5053 /* First, format the string as char array, then expand to Py_UNICODE 5054 array. */ 5055 charbuffer = (char *)buffer; 5056 len = vsprintf(charbuffer, format, va); 5057 for (i = len - 1; i >= 0; i--) 5058 buffer[i] = (Py_UNICODE) charbuffer[i]; 5059 5060 va_end(va); 5061 return len; 5062} 5063 5064static int 5065formatfloat(Py_UNICODE *buf, 5066 size_t buflen, 5067 int flags, 5068 int prec, 5069 int type, 5070 PyObject *v) 5071{ 5072 /* fmt = '%#.' + `prec` + `type` 5073 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/ 5074 char fmt[20]; 5075 double x; 5076 5077 x = PyFloat_AsDouble(v); 5078 if (x == -1.0 && PyErr_Occurred()) 5079 return -1; 5080 if (prec < 0) 5081 prec = 6; 5082 if (type == 'f' && (fabs(x) / 1e25) >= 1e25) 5083 type = 'g'; 5084 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c", 5085 (flags & F_ALT) ? "#" : "", prec, type); 5086 /* worst case length calc to ensure no buffer overrun: 5087 fmt = %#.<prec>g 5088 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp 5089 for any double rep.) 5090 len = 1 + prec + 1 + 2 + 5 = 9 + prec 5091 If prec=0 the effective precision is 1 (the leading digit is 5092 always given), therefore increase by one to 10+prec. */ 5093 if (buflen <= (size_t)10 + (size_t)prec) { 5094 PyErr_SetString(PyExc_OverflowError, 5095 "formatted float is too long (precision too long?)"); 5096 return -1; 5097 } 5098 return usprintf(buf, fmt, x); 5099} 5100 5101static PyObject* 5102formatlong(PyObject *val, int flags, int prec, int type) 5103{ 5104 char *buf; 5105 int i, len; 5106 PyObject *str; /* temporary string object. */ 5107 PyUnicodeObject *result; 5108 5109 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len); 5110 if (!str) 5111 return NULL; 5112 result = _PyUnicode_New(len); 5113 for (i = 0; i < len; i++) 5114 result->str[i] = buf[i]; 5115 result->str[len] = 0; 5116 Py_DECREF(str); 5117 return (PyObject*)result; 5118} 5119 5120static int 5121formatint(Py_UNICODE *buf, 5122 size_t buflen, 5123 int flags, 5124 int prec, 5125 int type, 5126 PyObject *v) 5127{ 5128 /* fmt = '%#.' + `prec` + 'l' + `type` 5129 worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine) 5130 + 1 + 1 = 24*/ 5131 char fmt[64]; /* plenty big enough! */ 5132 long x; 5133 int use_native_c_format = 1; 5134 5135 x = PyInt_AsLong(v); 5136 if (x == -1 && PyErr_Occurred()) 5137 return -1; 5138 if (prec < 0) 5139 prec = 1; 5140 /* buf = '+'/'-'/'0'/'0x' + '[0-9]'*max(prec,len(x in octal)) 5141 worst case buf = '0x' + [0-9]*prec, where prec >= 11 */ 5142 if (buflen <= 13 || buflen <= (size_t)2+(size_t)prec) { 5143 PyErr_SetString(PyExc_OverflowError, 5144 "formatted integer is too long (precision too long?)"); 5145 return -1; 5146 } 5147 /* When converting 0 under %#x or %#X, C leaves off the base marker, 5148 * but we want it (for consistency with other %#x conversions, and 5149 * for consistency with Python's hex() function). 5150 * BUG 28-Apr-2001 tim: At least two platform Cs (Metrowerks & 5151 * Compaq Tru64) violate the std by converting 0 w/ leading 0x anyway. 5152 * So add it only if the platform doesn't already. 5153 */ 5154 if (x == 0 && (flags & F_ALT) && (type == 'x' || type == 'X')) { 5155 /* Only way to know what the platform does is to try it. */ 5156 PyOS_snprintf(fmt, sizeof(fmt), type == 'x' ? "%#x" : "%#X", 0); 5157 if (fmt[1] != (char)type) { 5158 /* Supply our own leading 0x/0X -- needed under std C */ 5159 use_native_c_format = 0; 5160 PyOS_snprintf(fmt, sizeof(fmt), "0%c%%#.%dl%c", type, prec, type); 5161 } 5162 } 5163 if (use_native_c_format) 5164 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%dl%c", 5165 (flags & F_ALT) ? "#" : "", prec, type); 5166 return usprintf(buf, fmt, x); 5167} 5168 5169static int 5170formatchar(Py_UNICODE *buf, 5171 size_t buflen, 5172 PyObject *v) 5173{ 5174 /* presume that the buffer is at least 2 characters long */ 5175 if (PyUnicode_Check(v)) { 5176 if (PyUnicode_GET_SIZE(v) != 1) 5177 goto onError; 5178 buf[0] = PyUnicode_AS_UNICODE(v)[0]; 5179 } 5180 5181 else if (PyString_Check(v)) { 5182 if (PyString_GET_SIZE(v) != 1) 5183 goto onError; 5184 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0]; 5185 } 5186 5187 else { 5188 /* Integer input truncated to a character */ 5189 long x; 5190 x = PyInt_AsLong(v); 5191 if (x == -1 && PyErr_Occurred()) 5192 goto onError; 5193 buf[0] = (char) x; 5194 } 5195 buf[1] = '\0'; 5196 return 1; 5197 5198 onError: 5199 PyErr_SetString(PyExc_TypeError, 5200 "%c requires int or char"); 5201 return -1; 5202} 5203 5204/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...) 5205 5206 FORMATBUFLEN is the length of the buffer in which the floats, ints, & 5207 chars are formatted. XXX This is a magic number. Each formatting 5208 routine does bounds checking to ensure no overflow, but a better 5209 solution may be to malloc a buffer of appropriate size for each 5210 format. For now, the current solution is sufficient. 5211*/ 5212#define FORMATBUFLEN (size_t)120 5213 5214PyObject *PyUnicode_Format(PyObject *format, 5215 PyObject *args) 5216{ 5217 Py_UNICODE *fmt, *res; 5218 int fmtcnt, rescnt, reslen, arglen, argidx; 5219 int args_owned = 0; 5220 PyUnicodeObject *result = NULL; 5221 PyObject *dict = NULL; 5222 PyObject *uformat; 5223 5224 if (format == NULL || args == NULL) { 5225 PyErr_BadInternalCall(); 5226 return NULL; 5227 } 5228 uformat = PyUnicode_FromObject(format); 5229 if (uformat == NULL) 5230 return NULL; 5231 fmt = PyUnicode_AS_UNICODE(uformat); 5232 fmtcnt = PyUnicode_GET_SIZE(uformat); 5233 5234 reslen = rescnt = fmtcnt + 100; 5235 result = _PyUnicode_New(reslen); 5236 if (result == NULL) 5237 goto onError; 5238 res = PyUnicode_AS_UNICODE(result); 5239 5240 if (PyTuple_Check(args)) { 5241 arglen = PyTuple_Size(args); 5242 argidx = 0; 5243 } 5244 else { 5245 arglen = -1; 5246 argidx = -2; 5247 } 5248 if (args->ob_type->tp_as_mapping) 5249 dict = args; 5250 5251 while (--fmtcnt >= 0) { 5252 if (*fmt != '%') { 5253 if (--rescnt < 0) { 5254 rescnt = fmtcnt + 100; 5255 reslen += rescnt; 5256 if (_PyUnicode_Resize(&result, reslen) < 0) 5257 return NULL; 5258 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt; 5259 --rescnt; 5260 } 5261 *res++ = *fmt++; 5262 } 5263 else { 5264 /* Got a format specifier */ 5265 int flags = 0; 5266 int width = -1; 5267 int prec = -1; 5268 Py_UNICODE c = '\0'; 5269 Py_UNICODE fill; 5270 PyObject *v = NULL; 5271 PyObject *temp = NULL; 5272 Py_UNICODE *pbuf; 5273 Py_UNICODE sign; 5274 int len; 5275 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */ 5276 5277 fmt++; 5278 if (*fmt == '(') { 5279 Py_UNICODE *keystart; 5280 int keylen; 5281 PyObject *key; 5282 int pcount = 1; 5283 5284 if (dict == NULL) { 5285 PyErr_SetString(PyExc_TypeError, 5286 "format requires a mapping"); 5287 goto onError; 5288 } 5289 ++fmt; 5290 --fmtcnt; 5291 keystart = fmt; 5292 /* Skip over balanced parentheses */ 5293 while (pcount > 0 && --fmtcnt >= 0) { 5294 if (*fmt == ')') 5295 --pcount; 5296 else if (*fmt == '(') 5297 ++pcount; 5298 fmt++; 5299 } 5300 keylen = fmt - keystart - 1; 5301 if (fmtcnt < 0 || pcount > 0) { 5302 PyErr_SetString(PyExc_ValueError, 5303 "incomplete format key"); 5304 goto onError; 5305 } 5306#if 0 5307 /* keys are converted to strings using UTF-8 and 5308 then looked up since Python uses strings to hold 5309 variables names etc. in its namespaces and we 5310 wouldn't want to break common idioms. */ 5311 key = PyUnicode_EncodeUTF8(keystart, 5312 keylen, 5313 NULL); 5314#else 5315 key = PyUnicode_FromUnicode(keystart, keylen); 5316#endif 5317 if (key == NULL) 5318 goto onError; 5319 if (args_owned) { 5320 Py_DECREF(args); 5321 args_owned = 0; 5322 } 5323 args = PyObject_GetItem(dict, key); 5324 Py_DECREF(key); 5325 if (args == NULL) { 5326 goto onError; 5327 } 5328 args_owned = 1; 5329 arglen = -1; 5330 argidx = -2; 5331 } 5332 while (--fmtcnt >= 0) { 5333 switch (c = *fmt++) { 5334 case '-': flags |= F_LJUST; continue; 5335 case '+': flags |= F_SIGN; continue; 5336 case ' ': flags |= F_BLANK; continue; 5337 case '#': flags |= F_ALT; continue; 5338 case '0': flags |= F_ZERO; continue; 5339 } 5340 break; 5341 } 5342 if (c == '*') { 5343 v = getnextarg(args, arglen, &argidx); 5344 if (v == NULL) 5345 goto onError; 5346 if (!PyInt_Check(v)) { 5347 PyErr_SetString(PyExc_TypeError, 5348 "* wants int"); 5349 goto onError; 5350 } 5351 width = PyInt_AsLong(v); 5352 if (width < 0) { 5353 flags |= F_LJUST; 5354 width = -width; 5355 } 5356 if (--fmtcnt >= 0) 5357 c = *fmt++; 5358 } 5359 else if (c >= '0' && c <= '9') { 5360 width = c - '0'; 5361 while (--fmtcnt >= 0) { 5362 c = *fmt++; 5363 if (c < '0' || c > '9') 5364 break; 5365 if ((width*10) / 10 != width) { 5366 PyErr_SetString(PyExc_ValueError, 5367 "width too big"); 5368 goto onError; 5369 } 5370 width = width*10 + (c - '0'); 5371 } 5372 } 5373 if (c == '.') { 5374 prec = 0; 5375 if (--fmtcnt >= 0) 5376 c = *fmt++; 5377 if (c == '*') { 5378 v = getnextarg(args, arglen, &argidx); 5379 if (v == NULL) 5380 goto onError; 5381 if (!PyInt_Check(v)) { 5382 PyErr_SetString(PyExc_TypeError, 5383 "* wants int"); 5384 goto onError; 5385 } 5386 prec = PyInt_AsLong(v); 5387 if (prec < 0) 5388 prec = 0; 5389 if (--fmtcnt >= 0) 5390 c = *fmt++; 5391 } 5392 else if (c >= '0' && c <= '9') { 5393 prec = c - '0'; 5394 while (--fmtcnt >= 0) { 5395 c = Py_CHARMASK(*fmt++); 5396 if (c < '0' || c > '9') 5397 break; 5398 if ((prec*10) / 10 != prec) { 5399 PyErr_SetString(PyExc_ValueError, 5400 "prec too big"); 5401 goto onError; 5402 } 5403 prec = prec*10 + (c - '0'); 5404 } 5405 } 5406 } /* prec */ 5407 if (fmtcnt >= 0) { 5408 if (c == 'h' || c == 'l' || c == 'L') { 5409 if (--fmtcnt >= 0) 5410 c = *fmt++; 5411 } 5412 } 5413 if (fmtcnt < 0) { 5414 PyErr_SetString(PyExc_ValueError, 5415 "incomplete format"); 5416 goto onError; 5417 } 5418 if (c != '%') { 5419 v = getnextarg(args, arglen, &argidx); 5420 if (v == NULL) 5421 goto onError; 5422 } 5423 sign = 0; 5424 fill = ' '; 5425 switch (c) { 5426 5427 case '%': 5428 pbuf = formatbuf; 5429 /* presume that buffer length is at least 1 */ 5430 pbuf[0] = '%'; 5431 len = 1; 5432 break; 5433 5434 case 's': 5435 case 'r': 5436 if (PyUnicode_Check(v) && c == 's') { 5437 temp = v; 5438 Py_INCREF(temp); 5439 } 5440 else { 5441 PyObject *unicode; 5442 if (c == 's') 5443 temp = PyObject_Str(v); 5444 else 5445 temp = PyObject_Repr(v); 5446 if (temp == NULL) 5447 goto onError; 5448 if (!PyString_Check(temp)) { 5449 /* XXX Note: this should never happen, since 5450 PyObject_Repr() and PyObject_Str() assure 5451 this */ 5452 Py_DECREF(temp); 5453 PyErr_SetString(PyExc_TypeError, 5454 "%s argument has non-string str()"); 5455 goto onError; 5456 } 5457 unicode = PyUnicode_Decode(PyString_AS_STRING(temp), 5458 PyString_GET_SIZE(temp), 5459 NULL, 5460 "strict"); 5461 Py_DECREF(temp); 5462 temp = unicode; 5463 if (temp == NULL) 5464 goto onError; 5465 } 5466 pbuf = PyUnicode_AS_UNICODE(temp); 5467 len = PyUnicode_GET_SIZE(temp); 5468 if (prec >= 0 && len > prec) 5469 len = prec; 5470 break; 5471 5472 case 'i': 5473 case 'd': 5474 case 'u': 5475 case 'o': 5476 case 'x': 5477 case 'X': 5478 if (c == 'i') 5479 c = 'd'; 5480 if (PyLong_Check(v)) { 5481 temp = formatlong(v, flags, prec, c); 5482 if (!temp) 5483 goto onError; 5484 pbuf = PyUnicode_AS_UNICODE(temp); 5485 len = PyUnicode_GET_SIZE(temp); 5486 /* unbounded ints can always produce 5487 a sign character! */ 5488 sign = 1; 5489 } 5490 else { 5491 pbuf = formatbuf; 5492 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), 5493 flags, prec, c, v); 5494 if (len < 0) 5495 goto onError; 5496 /* only d conversion is signed */ 5497 sign = c == 'd'; 5498 } 5499 if (flags & F_ZERO) 5500 fill = '0'; 5501 break; 5502 5503 case 'e': 5504 case 'E': 5505 case 'f': 5506 case 'g': 5507 case 'G': 5508 pbuf = formatbuf; 5509 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), 5510 flags, prec, c, v); 5511 if (len < 0) 5512 goto onError; 5513 sign = 1; 5514 if (flags & F_ZERO) 5515 fill = '0'; 5516 break; 5517 5518 case 'c': 5519 pbuf = formatbuf; 5520 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v); 5521 if (len < 0) 5522 goto onError; 5523 break; 5524 5525 default: 5526 PyErr_Format(PyExc_ValueError, 5527 "unsupported format character '%c' (0x%x) " 5528 "at index %i", 5529 (31<=c && c<=126) ? c : '?', 5530 c, fmt -1 - PyUnicode_AS_UNICODE(uformat)); 5531 goto onError; 5532 } 5533 if (sign) { 5534 if (*pbuf == '-' || *pbuf == '+') { 5535 sign = *pbuf++; 5536 len--; 5537 } 5538 else if (flags & F_SIGN) 5539 sign = '+'; 5540 else if (flags & F_BLANK) 5541 sign = ' '; 5542 else 5543 sign = 0; 5544 } 5545 if (width < len) 5546 width = len; 5547 if (rescnt < width + (sign != 0)) { 5548 reslen -= rescnt; 5549 rescnt = width + fmtcnt + 100; 5550 reslen += rescnt; 5551 if (_PyUnicode_Resize(&result, reslen) < 0) 5552 return NULL; 5553 res = PyUnicode_AS_UNICODE(result) 5554 + reslen - rescnt; 5555 } 5556 if (sign) { 5557 if (fill != ' ') 5558 *res++ = sign; 5559 rescnt--; 5560 if (width > len) 5561 width--; 5562 } 5563 if ((flags & F_ALT) && (c == 'x' || c == 'X')) { 5564 assert(pbuf[0] == '0'); 5565 assert(pbuf[1] == c); 5566 if (fill != ' ') { 5567 *res++ = *pbuf++; 5568 *res++ = *pbuf++; 5569 } 5570 rescnt -= 2; 5571 width -= 2; 5572 if (width < 0) 5573 width = 0; 5574 len -= 2; 5575 } 5576 if (width > len && !(flags & F_LJUST)) { 5577 do { 5578 --rescnt; 5579 *res++ = fill; 5580 } while (--width > len); 5581 } 5582 if (fill == ' ') { 5583 if (sign) 5584 *res++ = sign; 5585 if ((flags & F_ALT) && (c == 'x' || c == 'X')) { 5586 assert(pbuf[0] == '0'); 5587 assert(pbuf[1] == c); 5588 *res++ = *pbuf++; 5589 *res++ = *pbuf++; 5590 } 5591 } 5592 Py_UNICODE_COPY(res, pbuf, len); 5593 res += len; 5594 rescnt -= len; 5595 while (--width >= len) { 5596 --rescnt; 5597 *res++ = ' '; 5598 } 5599 if (dict && (argidx < arglen) && c != '%') { 5600 PyErr_SetString(PyExc_TypeError, 5601 "not all arguments converted"); 5602 goto onError; 5603 } 5604 Py_XDECREF(temp); 5605 } /* '%' */ 5606 } /* until end */ 5607 if (argidx < arglen && !dict) { 5608 PyErr_SetString(PyExc_TypeError, 5609 "not all arguments converted"); 5610 goto onError; 5611 } 5612 5613 if (args_owned) { 5614 Py_DECREF(args); 5615 } 5616 Py_DECREF(uformat); 5617 if (_PyUnicode_Resize(&result, reslen - rescnt)) 5618 goto onError; 5619 return (PyObject *)result; 5620 5621 onError: 5622 Py_XDECREF(result); 5623 Py_DECREF(uformat); 5624 if (args_owned) { 5625 Py_DECREF(args); 5626 } 5627 return NULL; 5628} 5629 5630static PyBufferProcs unicode_as_buffer = { 5631 (getreadbufferproc) unicode_buffer_getreadbuf, 5632 (getwritebufferproc) unicode_buffer_getwritebuf, 5633 (getsegcountproc) unicode_buffer_getsegcount, 5634 (getcharbufferproc) unicode_buffer_getcharbuf, 5635}; 5636 5637staticforward PyObject * 5638unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds); 5639 5640static PyObject * 5641unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds) 5642{ 5643 PyObject *x = NULL; 5644 static char *kwlist[] = {"string", "encoding", "errors", 0}; 5645 char *encoding = NULL; 5646 char *errors = NULL; 5647 5648 if (type != &PyUnicode_Type) 5649 return unicode_subtype_new(type, args, kwds); 5650 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode", 5651 kwlist, &x, &encoding, &errors)) 5652 return NULL; 5653 if (x == NULL) 5654 return (PyObject *)_PyUnicode_New(0); 5655 if (encoding == NULL && errors == NULL) 5656 return PyObject_Unicode(x); 5657 else 5658 return PyUnicode_FromEncodedObject(x, encoding, errors); 5659} 5660 5661static PyObject * 5662unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds) 5663{ 5664 PyUnicodeObject *tmp, *pnew; 5665 int n; 5666 5667 assert(PyType_IsSubtype(type, &PyUnicode_Type)); 5668 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds); 5669 if (tmp == NULL) 5670 return NULL; 5671 assert(PyUnicode_Check(tmp)); 5672 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length); 5673 if (pnew == NULL) 5674 return NULL; 5675 pnew->str = PyMem_NEW(Py_UNICODE, n+1); 5676 if (pnew->str == NULL) { 5677 _Py_ForgetReference((PyObject *)pnew); 5678 PyObject_DEL(pnew); 5679 return NULL; 5680 } 5681 Py_UNICODE_COPY(pnew->str, tmp->str, n+1); 5682 pnew->length = n; 5683 pnew->hash = tmp->hash; 5684 Py_DECREF(tmp); 5685 return (PyObject *)pnew; 5686} 5687 5688static char unicode_doc[] = 5689"unicode(string [, encoding[, errors]]) -> object\n\ 5690\n\ 5691Create a new Unicode object from the given encoded string.\n\ 5692encoding defaults to the current default string encoding and \n\ 5693errors, defining the error handling, to 'strict'."; 5694 5695PyTypeObject PyUnicode_Type = { 5696 PyObject_HEAD_INIT(&PyType_Type) 5697 0, /* ob_size */ 5698 "unicode", /* tp_name */ 5699 sizeof(PyUnicodeObject), /* tp_size */ 5700 0, /* tp_itemsize */ 5701 /* Slots */ 5702 (destructor)unicode_dealloc, /* tp_dealloc */ 5703 0, /* tp_print */ 5704 0, /* tp_getattr */ 5705 0, /* tp_setattr */ 5706 (cmpfunc) unicode_compare, /* tp_compare */ 5707 (reprfunc) unicode_repr, /* tp_repr */ 5708 0, /* tp_as_number */ 5709 &unicode_as_sequence, /* tp_as_sequence */ 5710 0, /* tp_as_mapping */ 5711 (hashfunc) unicode_hash, /* tp_hash*/ 5712 0, /* tp_call*/ 5713 (reprfunc) unicode_str, /* tp_str */ 5714 PyObject_GenericGetAttr, /* tp_getattro */ 5715 0, /* tp_setattro */ 5716 &unicode_as_buffer, /* tp_as_buffer */ 5717 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /* tp_flags */ 5718 unicode_doc, /* tp_doc */ 5719 0, /* tp_traverse */ 5720 0, /* tp_clear */ 5721 0, /* tp_richcompare */ 5722 0, /* tp_weaklistoffset */ 5723 0, /* tp_iter */ 5724 0, /* tp_iternext */ 5725 unicode_methods, /* tp_methods */ 5726 0, /* tp_members */ 5727 0, /* tp_getset */ 5728 0, /* tp_base */ 5729 0, /* tp_dict */ 5730 0, /* tp_descr_get */ 5731 0, /* tp_descr_set */ 5732 0, /* tp_dictoffset */ 5733 0, /* tp_init */ 5734 0, /* tp_alloc */ 5735 unicode_new, /* tp_new */ 5736 _PyObject_Del, /* tp_free */ 5737}; 5738 5739/* Initialize the Unicode implementation */ 5740 5741void _PyUnicode_Init(void) 5742{ 5743 int i; 5744 5745 /* Init the implementation */ 5746 unicode_freelist = NULL; 5747 unicode_freelist_size = 0; 5748 unicode_empty = _PyUnicode_New(0); 5749 strcpy(unicode_default_encoding, "ascii"); 5750 for (i = 0; i < 256; i++) 5751 unicode_latin1[i] = NULL; 5752} 5753 5754/* Finalize the Unicode implementation */ 5755 5756void 5757_PyUnicode_Fini(void) 5758{ 5759 PyUnicodeObject *u; 5760 int i; 5761 5762 Py_XDECREF(unicode_empty); 5763 unicode_empty = NULL; 5764 5765 for (i = 0; i < 256; i++) { 5766 if (unicode_latin1[i]) { 5767 Py_DECREF(unicode_latin1[i]); 5768 unicode_latin1[i] = NULL; 5769 } 5770 } 5771 5772 for (u = unicode_freelist; u != NULL;) { 5773 PyUnicodeObject *v = u; 5774 u = *(PyUnicodeObject **)u; 5775 if (v->str) 5776 PyMem_DEL(v->str); 5777 Py_XDECREF(v->defenc); 5778 PyObject_DEL(v); 5779 } 5780 unicode_freelist = NULL; 5781 unicode_freelist_size = 0; 5782} 5783