unicodeobject.c revision b5507ecd3cfce17bab26311298f527572611af0b
1/* 2 3Unicode implementation based on original code by Fredrik Lundh, 4modified by Marc-Andre Lemburg <mal@lemburg.com> according to the 5Unicode Integration Proposal (see file Misc/unicode.txt). 6 7Copyright (c) Corporation for National Research Initiatives. 8 9-------------------------------------------------------------------- 10The original string type implementation is: 11 12 Copyright (c) 1999 by Secret Labs AB 13 Copyright (c) 1999 by Fredrik Lundh 14 15By obtaining, using, and/or copying this software and/or its 16associated documentation, you agree that you have read, understood, 17and will comply with the following terms and conditions: 18 19Permission to use, copy, modify, and distribute this software and its 20associated documentation for any purpose and without fee is hereby 21granted, provided that the above copyright notice appears in all 22copies, and that both that copyright notice and this permission notice 23appear in supporting documentation, and that the name of Secret Labs 24AB or the author not be used in advertising or publicity pertaining to 25distribution of the software without specific, written prior 26permission. 27 28SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO 29THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND 30FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR 31ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 32WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 33ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT 34OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 35-------------------------------------------------------------------- 36 37*/ 38 39#include "Python.h" 40 41#include "unicodeobject.h" 42#include "ucnhash.h" 43 44#ifdef MS_WIN32 45#include <windows.h> 46#endif 47 48/* Limit for the Unicode object free list */ 49 50#define MAX_UNICODE_FREELIST_SIZE 1024 51 52/* Limit for the Unicode object free list stay alive optimization. 53 54 The implementation will keep allocated Unicode memory intact for 55 all objects on the free list having a size less than this 56 limit. This reduces malloc() overhead for small Unicode objects. 57 58 At worst this will result in MAX_UNICODE_FREELIST_SIZE * 59 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT + 60 malloc()-overhead) bytes of unused garbage. 61 62 Setting the limit to 0 effectively turns the feature off. 63 64 Note: This is an experimental feature ! If you get core dumps when 65 using Unicode objects, turn this feature off. 66 67*/ 68 69#define KEEPALIVE_SIZE_LIMIT 9 70 71/* Endianness switches; defaults to little endian */ 72 73#ifdef WORDS_BIGENDIAN 74# define BYTEORDER_IS_BIG_ENDIAN 75#else 76# define BYTEORDER_IS_LITTLE_ENDIAN 77#endif 78 79/* --- Globals ------------------------------------------------------------ 80 81 The globals are initialized by the _PyUnicode_Init() API and should 82 not be used before calling that API. 83 84*/ 85 86/* Free list for Unicode objects */ 87static PyUnicodeObject *unicode_freelist; 88static int unicode_freelist_size; 89 90/* The empty Unicode object is shared to improve performance. */ 91static PyUnicodeObject *unicode_empty; 92 93/* Single character Unicode strings in the Latin-1 range are being 94 shared as well. */ 95static PyUnicodeObject *unicode_latin1[256]; 96 97/* Default encoding to use and assume when NULL is passed as encoding 98 parameter; it is initialized by _PyUnicode_Init(). 99 100 Always use the PyUnicode_SetDefaultEncoding() and 101 PyUnicode_GetDefaultEncoding() APIs to access this global. 102 103*/ 104static char unicode_default_encoding[100]; 105 106Py_UNICODE 107PyUnicode_GetMax(void) 108{ 109#ifdef Py_UNICODE_WIDE 110 return 0x10FFFF; 111#else 112 /* This is actually an illegal character, so it should 113 not be passed to unichr. */ 114 return 0xFFFF; 115#endif 116} 117 118/* --- Unicode Object ----------------------------------------------------- */ 119 120static 121int unicode_resize(register PyUnicodeObject *unicode, 122 int length) 123{ 124 void *oldstr; 125 126 /* Shortcut if there's nothing much to do. */ 127 if (unicode->length == length) 128 goto reset; 129 130 /* Resizing shared object (unicode_empty or single character 131 objects) in-place is not allowed. Use PyUnicode_Resize() 132 instead ! */ 133 if (unicode == unicode_empty || 134 (unicode->length == 1 && 135 unicode->str[0] < 256 && 136 unicode_latin1[unicode->str[0]] == unicode)) { 137 PyErr_SetString(PyExc_SystemError, 138 "can't resize shared unicode objects"); 139 return -1; 140 } 141 142 /* We allocate one more byte to make sure the string is 143 Ux0000 terminated -- XXX is this needed ? */ 144 oldstr = unicode->str; 145 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1); 146 if (!unicode->str) { 147 unicode->str = oldstr; 148 PyErr_NoMemory(); 149 return -1; 150 } 151 unicode->str[length] = 0; 152 unicode->length = length; 153 154 reset: 155 /* Reset the object caches */ 156 if (unicode->defenc) { 157 Py_DECREF(unicode->defenc); 158 unicode->defenc = NULL; 159 } 160 unicode->hash = -1; 161 162 return 0; 163} 164 165/* We allocate one more byte to make sure the string is 166 Ux0000 terminated -- XXX is this needed ? 167 168 XXX This allocator could further be enhanced by assuring that the 169 free list never reduces its size below 1. 170 171*/ 172 173static 174PyUnicodeObject *_PyUnicode_New(int length) 175{ 176 register PyUnicodeObject *unicode; 177 178 /* Optimization for empty strings */ 179 if (length == 0 && unicode_empty != NULL) { 180 Py_INCREF(unicode_empty); 181 return unicode_empty; 182 } 183 184 /* Unicode freelist & memory allocation */ 185 if (unicode_freelist) { 186 unicode = unicode_freelist; 187 unicode_freelist = *(PyUnicodeObject **)unicode; 188 unicode_freelist_size--; 189 if (unicode->str) { 190 /* Keep-Alive optimization: we only upsize the buffer, 191 never downsize it. */ 192 if ((unicode->length < length) && 193 unicode_resize(unicode, length)) { 194 PyMem_DEL(unicode->str); 195 goto onError; 196 } 197 } 198 else { 199 unicode->str = PyMem_NEW(Py_UNICODE, length + 1); 200 } 201 PyObject_INIT(unicode, &PyUnicode_Type); 202 } 203 else { 204 unicode = PyObject_NEW(PyUnicodeObject, &PyUnicode_Type); 205 if (unicode == NULL) 206 return NULL; 207 unicode->str = PyMem_NEW(Py_UNICODE, length + 1); 208 } 209 210 if (!unicode->str) { 211 PyErr_NoMemory(); 212 goto onError; 213 } 214 unicode->str[length] = 0; 215 unicode->length = length; 216 unicode->hash = -1; 217 unicode->defenc = NULL; 218 return unicode; 219 220 onError: 221 _Py_ForgetReference((PyObject *)unicode); 222 PyObject_DEL(unicode); 223 return NULL; 224} 225 226static 227void unicode_dealloc(register PyUnicodeObject *unicode) 228{ 229 if (!PyUnicode_CheckExact(unicode)) { 230 unicode->ob_type->tp_free((PyObject *)unicode); 231 return; 232 } 233 if (unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) { 234 /* Keep-Alive optimization */ 235 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) { 236 PyMem_DEL(unicode->str); 237 unicode->str = NULL; 238 unicode->length = 0; 239 } 240 if (unicode->defenc) { 241 Py_DECREF(unicode->defenc); 242 unicode->defenc = NULL; 243 } 244 /* Add to free list */ 245 *(PyUnicodeObject **)unicode = unicode_freelist; 246 unicode_freelist = unicode; 247 unicode_freelist_size++; 248 } 249 else { 250 PyMem_DEL(unicode->str); 251 Py_XDECREF(unicode->defenc); 252 PyObject_DEL(unicode); 253 } 254} 255 256int PyUnicode_Resize(PyObject **unicode, 257 int length) 258{ 259 register PyUnicodeObject *v; 260 261 /* Argument checks */ 262 if (unicode == NULL) { 263 PyErr_BadInternalCall(); 264 return -1; 265 } 266 v = (PyUnicodeObject *)*unicode; 267 if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1) { 268 PyErr_BadInternalCall(); 269 return -1; 270 } 271 272 /* Resizing unicode_empty and single character objects is not 273 possible since these are being shared. We simply return a fresh 274 copy with the same Unicode content. */ 275 if (v->length != length && 276 (v == unicode_empty || v->length == 1)) { 277 PyUnicodeObject *w = _PyUnicode_New(length); 278 if (w == NULL) 279 return -1; 280 Py_UNICODE_COPY(w->str, v->str, 281 length < v->length ? length : v->length); 282 *unicode = (PyObject *)w; 283 return 0; 284 } 285 286 /* Note that we don't have to modify *unicode for unshared Unicode 287 objects, since we can modify them in-place. */ 288 return unicode_resize(v, length); 289} 290 291/* Internal API for use in unicodeobject.c only ! */ 292#define _PyUnicode_Resize(unicodevar, length) \ 293 PyUnicode_Resize(((PyObject **)(unicodevar)), length) 294 295PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u, 296 int size) 297{ 298 PyUnicodeObject *unicode; 299 300 /* If the Unicode data is known at construction time, we can apply 301 some optimizations which share commonly used objects. */ 302 if (u != NULL) { 303 304 /* Optimization for empty strings */ 305 if (size == 0 && unicode_empty != NULL) { 306 Py_INCREF(unicode_empty); 307 return (PyObject *)unicode_empty; 308 } 309 310 /* Single character Unicode objects in the Latin-1 range are 311 shared when using this constructor */ 312 if (size == 1 && *u < 256) { 313 unicode = unicode_latin1[*u]; 314 if (!unicode) { 315 unicode = _PyUnicode_New(1); 316 if (!unicode) 317 return NULL; 318 unicode->str[0] = *u; 319 unicode_latin1[*u] = unicode; 320 } 321 Py_INCREF(unicode); 322 return (PyObject *)unicode; 323 } 324 } 325 326 unicode = _PyUnicode_New(size); 327 if (!unicode) 328 return NULL; 329 330 /* Copy the Unicode data into the new object */ 331 if (u != NULL) 332 Py_UNICODE_COPY(unicode->str, u, size); 333 334 return (PyObject *)unicode; 335} 336 337#ifdef HAVE_WCHAR_H 338 339PyObject *PyUnicode_FromWideChar(register const wchar_t *w, 340 int size) 341{ 342 PyUnicodeObject *unicode; 343 344 if (w == NULL) { 345 PyErr_BadInternalCall(); 346 return NULL; 347 } 348 349 unicode = _PyUnicode_New(size); 350 if (!unicode) 351 return NULL; 352 353 /* Copy the wchar_t data into the new object */ 354#ifdef HAVE_USABLE_WCHAR_T 355 memcpy(unicode->str, w, size * sizeof(wchar_t)); 356#else 357 { 358 register Py_UNICODE *u; 359 register int i; 360 u = PyUnicode_AS_UNICODE(unicode); 361 for (i = size; i >= 0; i--) 362 *u++ = *w++; 363 } 364#endif 365 366 return (PyObject *)unicode; 367} 368 369int PyUnicode_AsWideChar(PyUnicodeObject *unicode, 370 register wchar_t *w, 371 int size) 372{ 373 if (unicode == NULL) { 374 PyErr_BadInternalCall(); 375 return -1; 376 } 377 if (size > PyUnicode_GET_SIZE(unicode)) 378 size = PyUnicode_GET_SIZE(unicode); 379#ifdef HAVE_USABLE_WCHAR_T 380 memcpy(w, unicode->str, size * sizeof(wchar_t)); 381#else 382 { 383 register Py_UNICODE *u; 384 register int i; 385 u = PyUnicode_AS_UNICODE(unicode); 386 for (i = size; i >= 0; i--) 387 *w++ = *u++; 388 } 389#endif 390 391 return size; 392} 393 394#endif 395 396PyObject *PyUnicode_FromObject(register PyObject *obj) 397{ 398 /* XXX Perhaps we should make this API an alias of 399 PyObject_Unicode() instead ?! */ 400 if (PyUnicode_CheckExact(obj)) { 401 Py_INCREF(obj); 402 return obj; 403 } 404 if (PyUnicode_Check(obj)) { 405 /* For a Unicode subtype that's not a Unicode object, 406 return a true Unicode object with the same data. */ 407 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj), 408 PyUnicode_GET_SIZE(obj)); 409 } 410 return PyUnicode_FromEncodedObject(obj, NULL, "strict"); 411} 412 413PyObject *PyUnicode_FromEncodedObject(register PyObject *obj, 414 const char *encoding, 415 const char *errors) 416{ 417 const char *s = NULL; 418 int len; 419 int owned = 0; 420 PyObject *v; 421 422 if (obj == NULL) { 423 PyErr_BadInternalCall(); 424 return NULL; 425 } 426 427#if 0 428 /* For b/w compatibility we also accept Unicode objects provided 429 that no encodings is given and then redirect to 430 PyObject_Unicode() which then applies the additional logic for 431 Unicode subclasses. 432 433 NOTE: This API should really only be used for object which 434 represent *encoded* Unicode ! 435 436 */ 437 if (PyUnicode_Check(obj)) { 438 if (encoding) { 439 PyErr_SetString(PyExc_TypeError, 440 "decoding Unicode is not supported"); 441 return NULL; 442 } 443 return PyObject_Unicode(obj); 444 } 445#else 446 if (PyUnicode_Check(obj)) { 447 PyErr_SetString(PyExc_TypeError, 448 "decoding Unicode is not supported"); 449 return NULL; 450 } 451#endif 452 453 /* Coerce object */ 454 if (PyString_Check(obj)) { 455 s = PyString_AS_STRING(obj); 456 len = PyString_GET_SIZE(obj); 457 } 458 else if (PyObject_AsCharBuffer(obj, &s, &len)) { 459 /* Overwrite the error message with something more useful in 460 case of a TypeError. */ 461 if (PyErr_ExceptionMatches(PyExc_TypeError)) 462 PyErr_Format(PyExc_TypeError, 463 "coercing to Unicode: need string or buffer, " 464 "%.80s found", 465 obj->ob_type->tp_name); 466 goto onError; 467 } 468 469 /* Convert to Unicode */ 470 if (len == 0) { 471 Py_INCREF(unicode_empty); 472 v = (PyObject *)unicode_empty; 473 } 474 else 475 v = PyUnicode_Decode(s, len, encoding, errors); 476 477 if (owned) { 478 Py_DECREF(obj); 479 } 480 return v; 481 482 onError: 483 if (owned) { 484 Py_DECREF(obj); 485 } 486 return NULL; 487} 488 489PyObject *PyUnicode_Decode(const char *s, 490 int size, 491 const char *encoding, 492 const char *errors) 493{ 494 PyObject *buffer = NULL, *unicode; 495 496 if (encoding == NULL) 497 encoding = PyUnicode_GetDefaultEncoding(); 498 499 /* Shortcuts for common default encodings */ 500 if (strcmp(encoding, "utf-8") == 0) 501 return PyUnicode_DecodeUTF8(s, size, errors); 502 else if (strcmp(encoding, "latin-1") == 0) 503 return PyUnicode_DecodeLatin1(s, size, errors); 504 else if (strcmp(encoding, "ascii") == 0) 505 return PyUnicode_DecodeASCII(s, size, errors); 506 507 /* Decode via the codec registry */ 508 buffer = PyBuffer_FromMemory((void *)s, size); 509 if (buffer == NULL) 510 goto onError; 511 unicode = PyCodec_Decode(buffer, encoding, errors); 512 if (unicode == NULL) 513 goto onError; 514 if (!PyUnicode_Check(unicode)) { 515 PyErr_Format(PyExc_TypeError, 516 "decoder did not return an unicode object (type=%.400s)", 517 unicode->ob_type->tp_name); 518 Py_DECREF(unicode); 519 goto onError; 520 } 521 Py_DECREF(buffer); 522 return unicode; 523 524 onError: 525 Py_XDECREF(buffer); 526 return NULL; 527} 528 529PyObject *PyUnicode_Encode(const Py_UNICODE *s, 530 int size, 531 const char *encoding, 532 const char *errors) 533{ 534 PyObject *v, *unicode; 535 536 unicode = PyUnicode_FromUnicode(s, size); 537 if (unicode == NULL) 538 return NULL; 539 v = PyUnicode_AsEncodedString(unicode, encoding, errors); 540 Py_DECREF(unicode); 541 return v; 542} 543 544PyObject *PyUnicode_AsEncodedString(PyObject *unicode, 545 const char *encoding, 546 const char *errors) 547{ 548 PyObject *v; 549 550 if (!PyUnicode_Check(unicode)) { 551 PyErr_BadArgument(); 552 goto onError; 553 } 554 555 if (encoding == NULL) 556 encoding = PyUnicode_GetDefaultEncoding(); 557 558 /* Shortcuts for common default encodings */ 559 if (errors == NULL) { 560 if (strcmp(encoding, "utf-8") == 0) 561 return PyUnicode_AsUTF8String(unicode); 562 else if (strcmp(encoding, "latin-1") == 0) 563 return PyUnicode_AsLatin1String(unicode); 564 else if (strcmp(encoding, "ascii") == 0) 565 return PyUnicode_AsASCIIString(unicode); 566 } 567 568 /* Encode via the codec registry */ 569 v = PyCodec_Encode(unicode, encoding, errors); 570 if (v == NULL) 571 goto onError; 572 /* XXX Should we really enforce this ? */ 573 if (!PyString_Check(v)) { 574 PyErr_Format(PyExc_TypeError, 575 "encoder did not return a string object (type=%.400s)", 576 v->ob_type->tp_name); 577 Py_DECREF(v); 578 goto onError; 579 } 580 return v; 581 582 onError: 583 return NULL; 584} 585 586PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode, 587 const char *errors) 588{ 589 PyObject *v = ((PyUnicodeObject *)unicode)->defenc; 590 591 if (v) 592 return v; 593 v = PyUnicode_AsEncodedString(unicode, NULL, errors); 594 if (v && errors == NULL) 595 ((PyUnicodeObject *)unicode)->defenc = v; 596 return v; 597} 598 599Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode) 600{ 601 if (!PyUnicode_Check(unicode)) { 602 PyErr_BadArgument(); 603 goto onError; 604 } 605 return PyUnicode_AS_UNICODE(unicode); 606 607 onError: 608 return NULL; 609} 610 611int PyUnicode_GetSize(PyObject *unicode) 612{ 613 if (!PyUnicode_Check(unicode)) { 614 PyErr_BadArgument(); 615 goto onError; 616 } 617 return PyUnicode_GET_SIZE(unicode); 618 619 onError: 620 return -1; 621} 622 623const char *PyUnicode_GetDefaultEncoding(void) 624{ 625 return unicode_default_encoding; 626} 627 628int PyUnicode_SetDefaultEncoding(const char *encoding) 629{ 630 PyObject *v; 631 632 /* Make sure the encoding is valid. As side effect, this also 633 loads the encoding into the codec registry cache. */ 634 v = _PyCodec_Lookup(encoding); 635 if (v == NULL) 636 goto onError; 637 Py_DECREF(v); 638 strncpy(unicode_default_encoding, 639 encoding, 640 sizeof(unicode_default_encoding)); 641 return 0; 642 643 onError: 644 return -1; 645} 646 647/* --- UTF-7 Codec -------------------------------------------------------- */ 648 649/* see RFC2152 for details */ 650 651static 652char utf7_special[128] = { 653 /* indicate whether a UTF-7 character is special i.e. cannot be directly 654 encoded: 655 0 - not special 656 1 - special 657 2 - whitespace (optional) 658 3 - RFC2152 Set O (optional) */ 659 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1, 660 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 661 2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1, 662 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0, 663 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 664 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3, 665 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 666 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1, 667 668}; 669 670#define SPECIAL(c, encodeO, encodeWS) \ 671 (((c)>127 || utf7_special[(c)] == 1) || \ 672 (encodeWS && (utf7_special[(c)] == 2)) || \ 673 (encodeO && (utf7_special[(c)] == 3))) 674 675#define B64(n) ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f]) 676#define B64CHAR(c) (isalnum(c) || (c) == '+' || (c) == '/') 677#define UB64(c) ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ? \ 678 (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4) 679 680#define ENCODE(out, ch, bits) \ 681 while (bits >= 6) { \ 682 *out++ = B64(ch >> (bits-6)); \ 683 bits -= 6; \ 684 } 685 686#define DECODE(out, ch, bits, surrogate) \ 687 while (bits >= 16) { \ 688 Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff); \ 689 bits -= 16; \ 690 if (surrogate) { \ 691 /* We have already generated an error for the high surrogate 692 so let's not bother seeing if the low surrogate is correct or not */\ 693 surrogate = 0; \ 694 } else if (0xDC00 <= outCh && outCh <= 0xDFFF) { \ 695 /* This is a surrogate pair. Unfortunately we can't represent \ 696 it in a 16-bit character */ \ 697 surrogate = 1; \ 698 errmsg = "code pairs are not supported"; \ 699 goto utf7Error; \ 700 } else { \ 701 *out++ = outCh; \ 702 } \ 703 } \ 704 705static 706int utf7_decoding_error(Py_UNICODE **dest, 707 const char *errors, 708 const char *details) 709{ 710 if ((errors == NULL) || 711 (strcmp(errors,"strict") == 0)) { 712 PyErr_Format(PyExc_UnicodeError, 713 "UTF-7 decoding error: %.400s", 714 details); 715 return -1; 716 } 717 else if (strcmp(errors,"ignore") == 0) { 718 return 0; 719 } 720 else if (strcmp(errors,"replace") == 0) { 721 if (dest != NULL) { 722 **dest = Py_UNICODE_REPLACEMENT_CHARACTER; 723 (*dest)++; 724 } 725 return 0; 726 } 727 else { 728 PyErr_Format(PyExc_ValueError, 729 "UTF-7 decoding error; unknown error handling code: %.400s", 730 errors); 731 return -1; 732 } 733} 734 735PyObject *PyUnicode_DecodeUTF7(const char *s, 736 int size, 737 const char *errors) 738{ 739 const char *e; 740 PyUnicodeObject *unicode; 741 Py_UNICODE *p; 742 const char *errmsg = ""; 743 int inShift = 0; 744 unsigned int bitsleft = 0; 745 unsigned long charsleft = 0; 746 int surrogate = 0; 747 748 unicode = _PyUnicode_New(size); 749 if (!unicode) 750 return NULL; 751 if (size == 0) 752 return (PyObject *)unicode; 753 754 p = unicode->str; 755 e = s + size; 756 757 while (s < e) { 758 Py_UNICODE ch = *s; 759 760 if (inShift) { 761 if ((ch == '-') || !B64CHAR(ch)) { 762 inShift = 0; 763 s++; 764 765 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate); 766 if (bitsleft >= 6) { 767 /* The shift sequence has a partial character in it. If 768 bitsleft < 6 then we could just classify it as padding 769 but that is not the case here */ 770 771 errmsg = "partial character in shift sequence"; 772 goto utf7Error; 773 } 774 /* According to RFC2152 the remaining bits should be zero. We 775 choose to signal an error/insert a replacement character 776 here so indicate the potential of a misencoded character. */ 777 778 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */ 779 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) { 780 errmsg = "non-zero padding bits in shift sequence"; 781 goto utf7Error; 782 } 783 784 if (ch == '-') { 785 if ((s < e) && (*(s) == '-')) { 786 *p++ = '-'; 787 inShift = 1; 788 } 789 } else if (SPECIAL(ch,0,0)) { 790 errmsg = "unexpected special character"; 791 goto utf7Error; 792 } else { 793 *p++ = ch; 794 } 795 } else { 796 charsleft = (charsleft << 6) | UB64(ch); 797 bitsleft += 6; 798 s++; 799 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate); 800 } 801 } 802 else if ( ch == '+' ) { 803 s++; 804 if (s < e && *s == '-') { 805 s++; 806 *p++ = '+'; 807 } else 808 { 809 inShift = 1; 810 bitsleft = 0; 811 } 812 } 813 else if (SPECIAL(ch,0,0)) { 814 errmsg = "unexpected special character"; 815 s++; 816 goto utf7Error; 817 } 818 else { 819 *p++ = ch; 820 s++; 821 } 822 continue; 823 utf7Error: 824 if (utf7_decoding_error(&p, errors, errmsg)) 825 goto onError; 826 } 827 828 if (inShift) { 829 if (utf7_decoding_error(&p, errors, "unterminated shift sequence")) 830 goto onError; 831 } 832 833 if (_PyUnicode_Resize(&unicode, p - unicode->str)) 834 goto onError; 835 836 return (PyObject *)unicode; 837 838onError: 839 Py_DECREF(unicode); 840 return NULL; 841} 842 843 844PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s, 845 int size, 846 int encodeSetO, 847 int encodeWhiteSpace, 848 const char *errors) 849{ 850 PyObject *v; 851 /* It might be possible to tighten this worst case */ 852 unsigned int cbAllocated = 5 * size; 853 int inShift = 0; 854 int i = 0; 855 unsigned int bitsleft = 0; 856 unsigned long charsleft = 0; 857 char * out; 858 char * start; 859 860 if (size == 0) 861 return PyString_FromStringAndSize(NULL, 0); 862 863 v = PyString_FromStringAndSize(NULL, cbAllocated); 864 if (v == NULL) 865 return NULL; 866 867 start = out = PyString_AS_STRING(v); 868 for (;i < size; ++i) { 869 Py_UNICODE ch = s[i]; 870 871 if (!inShift) { 872 if (ch == '+') { 873 *out++ = '+'; 874 *out++ = '-'; 875 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) { 876 charsleft = ch; 877 bitsleft = 16; 878 *out++ = '+'; 879 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft); 880 inShift = bitsleft > 0; 881 } else { 882 *out++ = (char) ch; 883 } 884 } else { 885 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) { 886 *out++ = B64(charsleft << (6-bitsleft)); 887 charsleft = 0; 888 bitsleft = 0; 889 /* Characters not in the BASE64 set implicitly unshift the sequence 890 so no '-' is required, except if the character is itself a '-' */ 891 if (B64CHAR(ch) || ch == '-') { 892 *out++ = '-'; 893 } 894 inShift = 0; 895 *out++ = (char) ch; 896 } else { 897 bitsleft += 16; 898 charsleft = (charsleft << 16) | ch; 899 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft); 900 901 /* If the next character is special then we dont' need to terminate 902 the shift sequence. If the next character is not a BASE64 character 903 or '-' then the shift sequence will be terminated implicitly and we 904 don't have to insert a '-'. */ 905 906 if (bitsleft == 0) { 907 if (i + 1 < size) { 908 Py_UNICODE ch2 = s[i+1]; 909 910 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) { 911 912 } else if (B64CHAR(ch2) || ch2 == '-') { 913 *out++ = '-'; 914 inShift = 0; 915 } else { 916 inShift = 0; 917 } 918 919 } 920 else { 921 *out++ = '-'; 922 inShift = 0; 923 } 924 } 925 } 926 } 927 } 928 if (bitsleft) { 929 *out++= B64(charsleft << (6-bitsleft) ); 930 *out++ = '-'; 931 } 932 933 if (_PyString_Resize(&v, out - start)) { 934 Py_DECREF(v); 935 return NULL; 936 } 937 return v; 938} 939 940#undef SPECIAL 941#undef B64 942#undef B64CHAR 943#undef UB64 944#undef ENCODE 945#undef DECODE 946 947/* --- UTF-8 Codec -------------------------------------------------------- */ 948 949static 950char utf8_code_length[256] = { 951 /* Map UTF-8 encoded prefix byte to sequence length. zero means 952 illegal prefix. see RFC 2279 for details */ 953 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 954 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 955 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 956 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 957 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 958 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 959 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 960 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 961 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 962 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 963 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 964 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 965 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 966 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 967 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 968 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0 969}; 970 971static 972int utf8_decoding_error(const char **source, 973 Py_UNICODE **dest, 974 const char *errors, 975 const char *details) 976{ 977 if ((errors == NULL) || 978 (strcmp(errors,"strict") == 0)) { 979 PyErr_Format(PyExc_UnicodeError, 980 "UTF-8 decoding error: %.400s", 981 details); 982 return -1; 983 } 984 else if (strcmp(errors,"ignore") == 0) { 985 (*source)++; 986 return 0; 987 } 988 else if (strcmp(errors,"replace") == 0) { 989 (*source)++; 990 **dest = Py_UNICODE_REPLACEMENT_CHARACTER; 991 (*dest)++; 992 return 0; 993 } 994 else { 995 PyErr_Format(PyExc_ValueError, 996 "UTF-8 decoding error; unknown error handling code: %.400s", 997 errors); 998 return -1; 999 } 1000} 1001 1002PyObject *PyUnicode_DecodeUTF8(const char *s, 1003 int size, 1004 const char *errors) 1005{ 1006 int n; 1007 const char *e; 1008 PyUnicodeObject *unicode; 1009 Py_UNICODE *p; 1010 const char *errmsg = ""; 1011 1012 /* Note: size will always be longer than the resulting Unicode 1013 character count */ 1014 unicode = _PyUnicode_New(size); 1015 if (!unicode) 1016 return NULL; 1017 if (size == 0) 1018 return (PyObject *)unicode; 1019 1020 /* Unpack UTF-8 encoded data */ 1021 p = unicode->str; 1022 e = s + size; 1023 1024 while (s < e) { 1025 Py_UCS4 ch = (unsigned char)*s; 1026 1027 if (ch < 0x80) { 1028 *p++ = (Py_UNICODE)ch; 1029 s++; 1030 continue; 1031 } 1032 1033 n = utf8_code_length[ch]; 1034 1035 if (s + n > e) { 1036 errmsg = "unexpected end of data"; 1037 goto utf8Error; 1038 } 1039 1040 switch (n) { 1041 1042 case 0: 1043 errmsg = "unexpected code byte"; 1044 goto utf8Error; 1045 1046 case 1: 1047 errmsg = "internal error"; 1048 goto utf8Error; 1049 1050 case 2: 1051 if ((s[1] & 0xc0) != 0x80) { 1052 errmsg = "invalid data"; 1053 goto utf8Error; 1054 } 1055 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f); 1056 if (ch < 0x80) { 1057 errmsg = "illegal encoding"; 1058 goto utf8Error; 1059 } 1060 else 1061 *p++ = (Py_UNICODE)ch; 1062 break; 1063 1064 case 3: 1065 if ((s[1] & 0xc0) != 0x80 || 1066 (s[2] & 0xc0) != 0x80) { 1067 errmsg = "invalid data"; 1068 goto utf8Error; 1069 } 1070 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f); 1071 if (ch < 0x800 || (ch >= 0xd800 && ch < 0xe000)) { 1072 errmsg = "illegal encoding"; 1073 goto utf8Error; 1074 } 1075 else 1076 *p++ = (Py_UNICODE)ch; 1077 break; 1078 1079 case 4: 1080 if ((s[1] & 0xc0) != 0x80 || 1081 (s[2] & 0xc0) != 0x80 || 1082 (s[3] & 0xc0) != 0x80) { 1083 errmsg = "invalid data"; 1084 goto utf8Error; 1085 } 1086 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) + 1087 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f); 1088 /* validate and convert to UTF-16 */ 1089 if ((ch < 0x10000) /* minimum value allowed for 4 1090 byte encoding */ 1091 || (ch > 0x10ffff)) /* maximum value allowed for 1092 UTF-16 */ 1093 { 1094 errmsg = "illegal encoding"; 1095 goto utf8Error; 1096 } 1097#ifdef Py_UNICODE_WIDE 1098 *p++ = (Py_UNICODE)ch; 1099#else 1100 /* compute and append the two surrogates: */ 1101 1102 /* translate from 10000..10FFFF to 0..FFFF */ 1103 ch -= 0x10000; 1104 1105 /* high surrogate = top 10 bits added to D800 */ 1106 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10)); 1107 1108 /* low surrogate = bottom 10 bits added to DC00 */ 1109 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF)); 1110#endif 1111 break; 1112 1113 default: 1114 /* Other sizes are only needed for UCS-4 */ 1115 errmsg = "unsupported Unicode code range"; 1116 goto utf8Error; 1117 } 1118 s += n; 1119 continue; 1120 1121 utf8Error: 1122 if (utf8_decoding_error(&s, &p, errors, errmsg)) 1123 goto onError; 1124 } 1125 1126 /* Adjust length */ 1127 if (_PyUnicode_Resize(&unicode, p - unicode->str)) 1128 goto onError; 1129 1130 return (PyObject *)unicode; 1131 1132onError: 1133 Py_DECREF(unicode); 1134 return NULL; 1135} 1136 1137/* Not used anymore, now that the encoder supports UTF-16 1138 surrogates. */ 1139#if 0 1140static 1141int utf8_encoding_error(const Py_UNICODE **source, 1142 char **dest, 1143 const char *errors, 1144 const char *details) 1145{ 1146 if ((errors == NULL) || 1147 (strcmp(errors,"strict") == 0)) { 1148 PyErr_Format(PyExc_UnicodeError, 1149 "UTF-8 encoding error: %.400s", 1150 details); 1151 return -1; 1152 } 1153 else if (strcmp(errors,"ignore") == 0) { 1154 return 0; 1155 } 1156 else if (strcmp(errors,"replace") == 0) { 1157 **dest = '?'; 1158 (*dest)++; 1159 return 0; 1160 } 1161 else { 1162 PyErr_Format(PyExc_ValueError, 1163 "UTF-8 encoding error; " 1164 "unknown error handling code: %.400s", 1165 errors); 1166 return -1; 1167 } 1168} 1169#endif 1170 1171PyObject *PyUnicode_EncodeUTF8(const Py_UNICODE *s, 1172 int size, 1173 const char *errors) 1174{ 1175 PyObject *v; 1176 char *p; 1177 char *q; 1178 Py_UCS4 ch2; 1179 unsigned int cbAllocated = 3 * size; 1180 unsigned int cbWritten = 0; 1181 int i = 0; 1182 1183 v = PyString_FromStringAndSize(NULL, cbAllocated); 1184 if (v == NULL) 1185 return NULL; 1186 if (size == 0) 1187 return v; 1188 1189 p = q = PyString_AS_STRING(v); 1190 while (i < size) { 1191 Py_UCS4 ch = s[i++]; 1192 if (ch < 0x80) { 1193 *p++ = (char) ch; 1194 cbWritten++; 1195 } 1196 else if (ch < 0x0800) { 1197 *p++ = 0xc0 | (ch >> 6); 1198 *p++ = 0x80 | (ch & 0x3f); 1199 cbWritten += 2; 1200 } 1201 else if (ch < 0x10000) { 1202 /* Check for high surrogate */ 1203 if (0xD800 <= ch && ch <= 0xDBFF) { 1204 if (i != size) { 1205 ch2 = s[i]; 1206 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) { 1207 1208 if (cbWritten >= (cbAllocated - 4)) { 1209 /* Provide enough room for some more 1210 surrogates */ 1211 cbAllocated += 4*10; 1212 if (_PyString_Resize(&v, cbAllocated)) 1213 goto onError; 1214 } 1215 1216 /* combine the two values */ 1217 ch = ((ch - 0xD800)<<10 | (ch2-0xDC00))+0x10000; 1218 1219 *p++ = (char)((ch >> 18) | 0xf0); 1220 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f)); 1221 i++; 1222 cbWritten += 4; 1223 } 1224 } 1225 } 1226 else { 1227 *p++ = (char)(0xe0 | (ch >> 12)); 1228 cbWritten += 3; 1229 } 1230 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); 1231 *p++ = (char)(0x80 | (ch & 0x3f)); 1232 } else { 1233 *p++ = 0xf0 | (ch>>18); 1234 *p++ = 0x80 | ((ch>>12) & 0x3f); 1235 *p++ = 0x80 | ((ch>>6) & 0x3f); 1236 *p++ = 0x80 | (ch & 0x3f); 1237 cbWritten += 4; 1238 } 1239 } 1240 *p = '\0'; 1241 if (_PyString_Resize(&v, p - q)) 1242 goto onError; 1243 return v; 1244 1245 onError: 1246 Py_DECREF(v); 1247 return NULL; 1248} 1249 1250PyObject *PyUnicode_AsUTF8String(PyObject *unicode) 1251{ 1252 if (!PyUnicode_Check(unicode)) { 1253 PyErr_BadArgument(); 1254 return NULL; 1255 } 1256 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode), 1257 PyUnicode_GET_SIZE(unicode), 1258 NULL); 1259} 1260 1261/* --- UTF-16 Codec ------------------------------------------------------- */ 1262 1263static 1264int utf16_decoding_error(Py_UNICODE **dest, 1265 const char *errors, 1266 const char *details) 1267{ 1268 if ((errors == NULL) || 1269 (strcmp(errors,"strict") == 0)) { 1270 PyErr_Format(PyExc_UnicodeError, 1271 "UTF-16 decoding error: %.400s", 1272 details); 1273 return -1; 1274 } 1275 else if (strcmp(errors,"ignore") == 0) { 1276 return 0; 1277 } 1278 else if (strcmp(errors,"replace") == 0) { 1279 if (dest) { 1280 **dest = Py_UNICODE_REPLACEMENT_CHARACTER; 1281 (*dest)++; 1282 } 1283 return 0; 1284 } 1285 else { 1286 PyErr_Format(PyExc_ValueError, 1287 "UTF-16 decoding error; " 1288 "unknown error handling code: %.400s", 1289 errors); 1290 return -1; 1291 } 1292} 1293 1294PyObject * 1295PyUnicode_DecodeUTF16(const char *s, 1296 int size, 1297 const char *errors, 1298 int *byteorder) 1299{ 1300 PyUnicodeObject *unicode; 1301 Py_UNICODE *p; 1302 const unsigned char *q, *e; 1303 int bo = 0; /* assume native ordering by default */ 1304 const char *errmsg = ""; 1305 /* Offsets from q for retrieving byte pairs in the right order. */ 1306#ifdef BYTEORDER_IS_LITTLE_ENDIAN 1307 int ihi = 1, ilo = 0; 1308#else 1309 int ihi = 0, ilo = 1; 1310#endif 1311 1312 /* size should be an even number */ 1313 if (size & 1) { 1314 if (utf16_decoding_error(NULL, errors, "truncated data")) 1315 return NULL; 1316 --size; /* else ignore the oddball byte */ 1317 } 1318 1319 /* Note: size will always be longer than the resulting Unicode 1320 character count */ 1321 unicode = _PyUnicode_New(size); 1322 if (!unicode) 1323 return NULL; 1324 if (size == 0) 1325 return (PyObject *)unicode; 1326 1327 /* Unpack UTF-16 encoded data */ 1328 p = unicode->str; 1329 q = (unsigned char *)s; 1330 e = q + size; 1331 1332 if (byteorder) 1333 bo = *byteorder; 1334 1335 /* Check for BOM marks (U+FEFF) in the input and adjust current 1336 byte order setting accordingly. In native mode, the leading BOM 1337 mark is skipped, in all other modes, it is copied to the output 1338 stream as-is (giving a ZWNBSP character). */ 1339 if (bo == 0) { 1340 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo]; 1341#ifdef BYTEORDER_IS_LITTLE_ENDIAN 1342 if (bom == 0xFEFF) { 1343 q += 2; 1344 bo = -1; 1345 } 1346 else if (bom == 0xFFFE) { 1347 q += 2; 1348 bo = 1; 1349 } 1350#else 1351 if (bom == 0xFEFF) { 1352 q += 2; 1353 bo = 1; 1354 } 1355 else if (bom == 0xFFFE) { 1356 q += 2; 1357 bo = -1; 1358 } 1359#endif 1360 } 1361 1362 if (bo == -1) { 1363 /* force LE */ 1364 ihi = 1; 1365 ilo = 0; 1366 } 1367 else if (bo == 1) { 1368 /* force BE */ 1369 ihi = 0; 1370 ilo = 1; 1371 } 1372 1373 while (q < e) { 1374 Py_UNICODE ch = (q[ihi] << 8) | q[ilo]; 1375 q += 2; 1376 1377 if (ch < 0xD800 || ch > 0xDFFF) { 1378 *p++ = ch; 1379 continue; 1380 } 1381 1382 /* UTF-16 code pair: */ 1383 if (q >= e) { 1384 errmsg = "unexpected end of data"; 1385 goto utf16Error; 1386 } 1387 if (0xD800 <= ch && ch <= 0xDBFF) { 1388 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo]; 1389 q += 2; 1390 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) { 1391#ifndef Py_UNICODE_WIDE 1392 *p++ = ch; 1393 *p++ = ch2; 1394#else 1395 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000; 1396#endif 1397 continue; 1398 } 1399 else { 1400 errmsg = "illegal UTF-16 surrogate"; 1401 goto utf16Error; 1402 } 1403 1404 } 1405 errmsg = "illegal encoding"; 1406 /* Fall through to report the error */ 1407 1408 utf16Error: 1409 if (utf16_decoding_error(&p, errors, errmsg)) 1410 goto onError; 1411 } 1412 1413 if (byteorder) 1414 *byteorder = bo; 1415 1416 /* Adjust length */ 1417 if (_PyUnicode_Resize(&unicode, p - unicode->str)) 1418 goto onError; 1419 1420 return (PyObject *)unicode; 1421 1422onError: 1423 Py_DECREF(unicode); 1424 return NULL; 1425} 1426 1427PyObject * 1428PyUnicode_EncodeUTF16(const Py_UNICODE *s, 1429 int size, 1430 const char *errors, 1431 int byteorder) 1432{ 1433 PyObject *v; 1434 unsigned char *p; 1435 int i, pairs; 1436 /* Offsets from p for storing byte pairs in the right order. */ 1437#ifdef BYTEORDER_IS_LITTLE_ENDIAN 1438 int ihi = 1, ilo = 0; 1439#else 1440 int ihi = 0, ilo = 1; 1441#endif 1442 1443#define STORECHAR(CH) \ 1444 do { \ 1445 p[ihi] = ((CH) >> 8) & 0xff; \ 1446 p[ilo] = (CH) & 0xff; \ 1447 p += 2; \ 1448 } while(0) 1449 1450 for (i = pairs = 0; i < size; i++) 1451 if (s[i] >= 0x10000) 1452 pairs++; 1453 v = PyString_FromStringAndSize(NULL, 1454 2 * (size + pairs + (byteorder == 0))); 1455 if (v == NULL) 1456 return NULL; 1457 1458 p = (unsigned char *)PyString_AS_STRING(v); 1459 if (byteorder == 0) 1460 STORECHAR(0xFEFF); 1461 if (size == 0) 1462 return v; 1463 1464 if (byteorder == -1) { 1465 /* force LE */ 1466 ihi = 1; 1467 ilo = 0; 1468 } 1469 else if (byteorder == 1) { 1470 /* force BE */ 1471 ihi = 0; 1472 ilo = 1; 1473 } 1474 1475 while (size-- > 0) { 1476 Py_UNICODE ch = *s++; 1477 Py_UNICODE ch2 = 0; 1478 if (ch >= 0x10000) { 1479 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF); 1480 ch = 0xD800 | ((ch-0x10000) >> 10); 1481 } 1482 STORECHAR(ch); 1483 if (ch2) 1484 STORECHAR(ch2); 1485 } 1486 return v; 1487#undef STORECHAR 1488} 1489 1490PyObject *PyUnicode_AsUTF16String(PyObject *unicode) 1491{ 1492 if (!PyUnicode_Check(unicode)) { 1493 PyErr_BadArgument(); 1494 return NULL; 1495 } 1496 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode), 1497 PyUnicode_GET_SIZE(unicode), 1498 NULL, 1499 0); 1500} 1501 1502/* --- Unicode Escape Codec ----------------------------------------------- */ 1503 1504static 1505int unicodeescape_decoding_error(const char **source, 1506 Py_UNICODE *x, 1507 const char *errors, 1508 const char *details) 1509{ 1510 if ((errors == NULL) || 1511 (strcmp(errors,"strict") == 0)) { 1512 PyErr_Format(PyExc_UnicodeError, 1513 "Unicode-Escape decoding error: %.400s", 1514 details); 1515 return -1; 1516 } 1517 else if (strcmp(errors,"ignore") == 0) { 1518 return 0; 1519 } 1520 else if (strcmp(errors,"replace") == 0) { 1521 *x = Py_UNICODE_REPLACEMENT_CHARACTER; 1522 return 0; 1523 } 1524 else { 1525 PyErr_Format(PyExc_ValueError, 1526 "Unicode-Escape decoding error; " 1527 "unknown error handling code: %.400s", 1528 errors); 1529 return -1; 1530 } 1531} 1532 1533static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL; 1534 1535PyObject *PyUnicode_DecodeUnicodeEscape(const char *s, 1536 int size, 1537 const char *errors) 1538{ 1539 PyUnicodeObject *v; 1540 Py_UNICODE *p, *buf; 1541 const char *end; 1542 char* message; 1543 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */ 1544 1545 /* Escaped strings will always be longer than the resulting 1546 Unicode string, so we start with size here and then reduce the 1547 length after conversion to the true value. */ 1548 v = _PyUnicode_New(size); 1549 if (v == NULL) 1550 goto onError; 1551 if (size == 0) 1552 return (PyObject *)v; 1553 1554 p = buf = PyUnicode_AS_UNICODE(v); 1555 end = s + size; 1556 1557 while (s < end) { 1558 unsigned char c; 1559 Py_UNICODE x; 1560 int i, digits; 1561 1562 /* Non-escape characters are interpreted as Unicode ordinals */ 1563 if (*s != '\\') { 1564 *p++ = (unsigned char) *s++; 1565 continue; 1566 } 1567 1568 /* \ - Escapes */ 1569 s++; 1570 switch (*s++) { 1571 1572 /* \x escapes */ 1573 case '\n': break; 1574 case '\\': *p++ = '\\'; break; 1575 case '\'': *p++ = '\''; break; 1576 case '\"': *p++ = '\"'; break; 1577 case 'b': *p++ = '\b'; break; 1578 case 'f': *p++ = '\014'; break; /* FF */ 1579 case 't': *p++ = '\t'; break; 1580 case 'n': *p++ = '\n'; break; 1581 case 'r': *p++ = '\r'; break; 1582 case 'v': *p++ = '\013'; break; /* VT */ 1583 case 'a': *p++ = '\007'; break; /* BEL, not classic C */ 1584 1585 /* \OOO (octal) escapes */ 1586 case '0': case '1': case '2': case '3': 1587 case '4': case '5': case '6': case '7': 1588 x = s[-1] - '0'; 1589 if ('0' <= *s && *s <= '7') { 1590 x = (x<<3) + *s++ - '0'; 1591 if ('0' <= *s && *s <= '7') 1592 x = (x<<3) + *s++ - '0'; 1593 } 1594 *p++ = x; 1595 break; 1596 1597 /* hex escapes */ 1598 /* \xXX */ 1599 case 'x': 1600 digits = 2; 1601 message = "truncated \\xXX escape"; 1602 goto hexescape; 1603 1604 /* \uXXXX */ 1605 case 'u': 1606 digits = 4; 1607 message = "truncated \\uXXXX escape"; 1608 goto hexescape; 1609 1610 /* \UXXXXXXXX */ 1611 case 'U': 1612 digits = 8; 1613 message = "truncated \\UXXXXXXXX escape"; 1614 hexescape: 1615 chr = 0; 1616 for (i = 0; i < digits; i++) { 1617 c = (unsigned char) s[i]; 1618 if (!isxdigit(c)) { 1619 if (unicodeescape_decoding_error(&s, &x, errors, message)) 1620 goto onError; 1621 chr = x; 1622 i++; 1623 break; 1624 } 1625 chr = (chr<<4) & ~0xF; 1626 if (c >= '0' && c <= '9') 1627 chr += c - '0'; 1628 else if (c >= 'a' && c <= 'f') 1629 chr += 10 + c - 'a'; 1630 else 1631 chr += 10 + c - 'A'; 1632 } 1633 s += i; 1634 store: 1635 /* when we get here, chr is a 32-bit unicode character */ 1636 if (chr <= 0xffff) 1637 /* UCS-2 character */ 1638 *p++ = (Py_UNICODE) chr; 1639 else if (chr <= 0x10ffff) { 1640 /* UCS-4 character. Either store directly, or as 1641 surrogate pair. */ 1642#ifdef Py_UNICODE_WIDE 1643 *p++ = chr; 1644#else 1645 chr -= 0x10000L; 1646 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10); 1647 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF); 1648#endif 1649 } else { 1650 if (unicodeescape_decoding_error( 1651 &s, &x, errors, 1652 "illegal Unicode character") 1653 ) 1654 goto onError; 1655 *p++ = x; /* store replacement character */ 1656 } 1657 break; 1658 1659 /* \N{name} */ 1660 case 'N': 1661 message = "malformed \\N character escape"; 1662 if (ucnhash_CAPI == NULL) { 1663 /* load the unicode data module */ 1664 PyObject *m, *v; 1665 m = PyImport_ImportModule("unicodedata"); 1666 if (m == NULL) 1667 goto ucnhashError; 1668 v = PyObject_GetAttrString(m, "ucnhash_CAPI"); 1669 Py_DECREF(m); 1670 if (v == NULL) 1671 goto ucnhashError; 1672 ucnhash_CAPI = PyCObject_AsVoidPtr(v); 1673 Py_DECREF(v); 1674 if (ucnhash_CAPI == NULL) 1675 goto ucnhashError; 1676 } 1677 if (*s == '{') { 1678 const char *start = s+1; 1679 /* look for the closing brace */ 1680 while (*s != '}' && s < end) 1681 s++; 1682 if (s > start && s < end && *s == '}') { 1683 /* found a name. look it up in the unicode database */ 1684 message = "unknown Unicode character name"; 1685 s++; 1686 if (ucnhash_CAPI->getcode(start, s-start-1, &chr)) 1687 goto store; 1688 } 1689 } 1690 if (unicodeescape_decoding_error(&s, &x, errors, message)) 1691 goto onError; 1692 *p++ = x; 1693 break; 1694 1695 default: 1696 *p++ = '\\'; 1697 *p++ = (unsigned char)s[-1]; 1698 break; 1699 } 1700 } 1701 if (_PyUnicode_Resize(&v, (int)(p - buf))) 1702 goto onError; 1703 return (PyObject *)v; 1704 1705ucnhashError: 1706 PyErr_SetString( 1707 PyExc_UnicodeError, 1708 "\\N escapes not supported (can't load unicodedata module)" 1709 ); 1710 return NULL; 1711 1712onError: 1713 Py_XDECREF(v); 1714 return NULL; 1715} 1716 1717/* Return a Unicode-Escape string version of the Unicode object. 1718 1719 If quotes is true, the string is enclosed in u"" or u'' quotes as 1720 appropriate. 1721 1722*/ 1723 1724static const Py_UNICODE *findchar(const Py_UNICODE *s, 1725 int size, 1726 Py_UNICODE ch); 1727 1728static 1729PyObject *unicodeescape_string(const Py_UNICODE *s, 1730 int size, 1731 int quotes) 1732{ 1733 PyObject *repr; 1734 char *p; 1735 1736 static const char *hexdigit = "0123456789abcdef"; 1737 1738 repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1); 1739 if (repr == NULL) 1740 return NULL; 1741 1742 p = PyString_AS_STRING(repr); 1743 1744 if (quotes) { 1745 *p++ = 'u'; 1746 *p++ = (findchar(s, size, '\'') && 1747 !findchar(s, size, '"')) ? '"' : '\''; 1748 } 1749 while (size-- > 0) { 1750 Py_UNICODE ch = *s++; 1751 1752 /* Escape quotes */ 1753 if (quotes && 1754 (ch == (Py_UNICODE) PyString_AS_STRING(repr)[1] || ch == '\\')) { 1755 *p++ = '\\'; 1756 *p++ = (char) ch; 1757 continue; 1758 } 1759 1760#ifdef Py_UNICODE_WIDE 1761 /* Map 21-bit characters to '\U00xxxxxx' */ 1762 else if (ch >= 0x10000) { 1763 int offset = p - PyString_AS_STRING(repr); 1764 1765 /* Resize the string if necessary */ 1766 if (offset + 12 > PyString_GET_SIZE(repr)) { 1767 if (_PyString_Resize(&repr, PyString_GET_SIZE(repr) + 100)) 1768 goto onError; 1769 p = PyString_AS_STRING(repr) + offset; 1770 } 1771 1772 *p++ = '\\'; 1773 *p++ = 'U'; 1774 *p++ = hexdigit[(ch >> 28) & 0x0000000F]; 1775 *p++ = hexdigit[(ch >> 24) & 0x0000000F]; 1776 *p++ = hexdigit[(ch >> 20) & 0x0000000F]; 1777 *p++ = hexdigit[(ch >> 16) & 0x0000000F]; 1778 *p++ = hexdigit[(ch >> 12) & 0x0000000F]; 1779 *p++ = hexdigit[(ch >> 8) & 0x0000000F]; 1780 *p++ = hexdigit[(ch >> 4) & 0x0000000F]; 1781 *p++ = hexdigit[ch & 0x0000000F]; 1782 continue; 1783 } 1784#endif 1785 /* Map UTF-16 surrogate pairs to Unicode \UXXXXXXXX escapes */ 1786 else if (ch >= 0xD800 && ch < 0xDC00) { 1787 Py_UNICODE ch2; 1788 Py_UCS4 ucs; 1789 1790 ch2 = *s++; 1791 size--; 1792 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) { 1793 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000; 1794 *p++ = '\\'; 1795 *p++ = 'U'; 1796 *p++ = hexdigit[(ucs >> 28) & 0x0000000F]; 1797 *p++ = hexdigit[(ucs >> 24) & 0x0000000F]; 1798 *p++ = hexdigit[(ucs >> 20) & 0x0000000F]; 1799 *p++ = hexdigit[(ucs >> 16) & 0x0000000F]; 1800 *p++ = hexdigit[(ucs >> 12) & 0x0000000F]; 1801 *p++ = hexdigit[(ucs >> 8) & 0x0000000F]; 1802 *p++ = hexdigit[(ucs >> 4) & 0x0000000F]; 1803 *p++ = hexdigit[ucs & 0x0000000F]; 1804 continue; 1805 } 1806 /* Fall through: isolated surrogates are copied as-is */ 1807 s--; 1808 size++; 1809 } 1810 1811 /* Map 16-bit characters to '\uxxxx' */ 1812 if (ch >= 256) { 1813 *p++ = '\\'; 1814 *p++ = 'u'; 1815 *p++ = hexdigit[(ch >> 12) & 0x000F]; 1816 *p++ = hexdigit[(ch >> 8) & 0x000F]; 1817 *p++ = hexdigit[(ch >> 4) & 0x000F]; 1818 *p++ = hexdigit[ch & 0x000F]; 1819 } 1820 1821 /* Map special whitespace to '\t', \n', '\r' */ 1822 else if (ch == '\t') { 1823 *p++ = '\\'; 1824 *p++ = 't'; 1825 } 1826 else if (ch == '\n') { 1827 *p++ = '\\'; 1828 *p++ = 'n'; 1829 } 1830 else if (ch == '\r') { 1831 *p++ = '\\'; 1832 *p++ = 'r'; 1833 } 1834 1835 /* Map non-printable US ASCII to '\xhh' */ 1836 else if (ch < ' ' || ch >= 128) { 1837 *p++ = '\\'; 1838 *p++ = 'x'; 1839 *p++ = hexdigit[(ch >> 4) & 0x000F]; 1840 *p++ = hexdigit[ch & 0x000F]; 1841 } 1842 1843 /* Copy everything else as-is */ 1844 else 1845 *p++ = (char) ch; 1846 } 1847 if (quotes) 1848 *p++ = PyString_AS_STRING(repr)[1]; 1849 1850 *p = '\0'; 1851 if (_PyString_Resize(&repr, p - PyString_AS_STRING(repr))) 1852 goto onError; 1853 1854 return repr; 1855 1856 onError: 1857 Py_DECREF(repr); 1858 return NULL; 1859} 1860 1861PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s, 1862 int size) 1863{ 1864 return unicodeescape_string(s, size, 0); 1865} 1866 1867PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode) 1868{ 1869 if (!PyUnicode_Check(unicode)) { 1870 PyErr_BadArgument(); 1871 return NULL; 1872 } 1873 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode), 1874 PyUnicode_GET_SIZE(unicode)); 1875} 1876 1877/* --- Raw Unicode Escape Codec ------------------------------------------- */ 1878 1879PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s, 1880 int size, 1881 const char *errors) 1882{ 1883 PyUnicodeObject *v; 1884 Py_UNICODE *p, *buf; 1885 const char *end; 1886 const char *bs; 1887 1888 /* Escaped strings will always be longer than the resulting 1889 Unicode string, so we start with size here and then reduce the 1890 length after conversion to the true value. */ 1891 v = _PyUnicode_New(size); 1892 if (v == NULL) 1893 goto onError; 1894 if (size == 0) 1895 return (PyObject *)v; 1896 p = buf = PyUnicode_AS_UNICODE(v); 1897 end = s + size; 1898 while (s < end) { 1899 unsigned char c; 1900 Py_UNICODE x; 1901 int i; 1902 1903 /* Non-escape characters are interpreted as Unicode ordinals */ 1904 if (*s != '\\') { 1905 *p++ = (unsigned char)*s++; 1906 continue; 1907 } 1908 1909 /* \u-escapes are only interpreted iff the number of leading 1910 backslashes if odd */ 1911 bs = s; 1912 for (;s < end;) { 1913 if (*s != '\\') 1914 break; 1915 *p++ = (unsigned char)*s++; 1916 } 1917 if (((s - bs) & 1) == 0 || 1918 s >= end || 1919 *s != 'u') { 1920 continue; 1921 } 1922 p--; 1923 s++; 1924 1925 /* \uXXXX with 4 hex digits */ 1926 for (x = 0, i = 0; i < 4; i++) { 1927 c = (unsigned char)s[i]; 1928 if (!isxdigit(c)) { 1929 if (unicodeescape_decoding_error(&s, &x, errors, 1930 "truncated \\uXXXX")) 1931 goto onError; 1932 i++; 1933 break; 1934 } 1935 x = (x<<4) & ~0xF; 1936 if (c >= '0' && c <= '9') 1937 x += c - '0'; 1938 else if (c >= 'a' && c <= 'f') 1939 x += 10 + c - 'a'; 1940 else 1941 x += 10 + c - 'A'; 1942 } 1943 s += i; 1944 *p++ = x; 1945 } 1946 if (_PyUnicode_Resize(&v, (int)(p - buf))) 1947 goto onError; 1948 return (PyObject *)v; 1949 1950 onError: 1951 Py_XDECREF(v); 1952 return NULL; 1953} 1954 1955PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s, 1956 int size) 1957{ 1958 PyObject *repr; 1959 char *p; 1960 char *q; 1961 1962 static const char *hexdigit = "0123456789abcdef"; 1963 1964 repr = PyString_FromStringAndSize(NULL, 6 * size); 1965 if (repr == NULL) 1966 return NULL; 1967 if (size == 0) 1968 return repr; 1969 1970 p = q = PyString_AS_STRING(repr); 1971 while (size-- > 0) { 1972 Py_UNICODE ch = *s++; 1973 /* Map 16-bit characters to '\uxxxx' */ 1974 if (ch >= 256) { 1975 *p++ = '\\'; 1976 *p++ = 'u'; 1977 *p++ = hexdigit[(ch >> 12) & 0xf]; 1978 *p++ = hexdigit[(ch >> 8) & 0xf]; 1979 *p++ = hexdigit[(ch >> 4) & 0xf]; 1980 *p++ = hexdigit[ch & 15]; 1981 } 1982 /* Copy everything else as-is */ 1983 else 1984 *p++ = (char) ch; 1985 } 1986 *p = '\0'; 1987 if (_PyString_Resize(&repr, p - q)) 1988 goto onError; 1989 1990 return repr; 1991 1992 onError: 1993 Py_DECREF(repr); 1994 return NULL; 1995} 1996 1997PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode) 1998{ 1999 if (!PyUnicode_Check(unicode)) { 2000 PyErr_BadArgument(); 2001 return NULL; 2002 } 2003 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode), 2004 PyUnicode_GET_SIZE(unicode)); 2005} 2006 2007/* --- Latin-1 Codec ------------------------------------------------------ */ 2008 2009PyObject *PyUnicode_DecodeLatin1(const char *s, 2010 int size, 2011 const char *errors) 2012{ 2013 PyUnicodeObject *v; 2014 Py_UNICODE *p; 2015 2016 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */ 2017 if (size == 1 && *(unsigned char*)s < 256) { 2018 Py_UNICODE r = *(unsigned char*)s; 2019 return PyUnicode_FromUnicode(&r, 1); 2020 } 2021 2022 v = _PyUnicode_New(size); 2023 if (v == NULL) 2024 goto onError; 2025 if (size == 0) 2026 return (PyObject *)v; 2027 p = PyUnicode_AS_UNICODE(v); 2028 while (size-- > 0) 2029 *p++ = (unsigned char)*s++; 2030 return (PyObject *)v; 2031 2032 onError: 2033 Py_XDECREF(v); 2034 return NULL; 2035} 2036 2037static 2038int latin1_encoding_error(const Py_UNICODE **source, 2039 char **dest, 2040 const char *errors, 2041 const char *details) 2042{ 2043 if ((errors == NULL) || 2044 (strcmp(errors,"strict") == 0)) { 2045 PyErr_Format(PyExc_UnicodeError, 2046 "Latin-1 encoding error: %.400s", 2047 details); 2048 return -1; 2049 } 2050 else if (strcmp(errors,"ignore") == 0) { 2051 return 0; 2052 } 2053 else if (strcmp(errors,"replace") == 0) { 2054 **dest = '?'; 2055 (*dest)++; 2056 return 0; 2057 } 2058 else { 2059 PyErr_Format(PyExc_ValueError, 2060 "Latin-1 encoding error; " 2061 "unknown error handling code: %.400s", 2062 errors); 2063 return -1; 2064 } 2065} 2066 2067PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p, 2068 int size, 2069 const char *errors) 2070{ 2071 PyObject *repr; 2072 char *s, *start; 2073 2074 repr = PyString_FromStringAndSize(NULL, size); 2075 if (repr == NULL) 2076 return NULL; 2077 if (size == 0) 2078 return repr; 2079 2080 s = PyString_AS_STRING(repr); 2081 start = s; 2082 while (size-- > 0) { 2083 Py_UNICODE ch = *p++; 2084 if (ch >= 256) { 2085 if (latin1_encoding_error(&p, &s, errors, 2086 "ordinal not in range(256)")) 2087 goto onError; 2088 } 2089 else 2090 *s++ = (char)ch; 2091 } 2092 /* Resize if error handling skipped some characters */ 2093 if (s - start < PyString_GET_SIZE(repr)) 2094 if (_PyString_Resize(&repr, s - start)) 2095 goto onError; 2096 return repr; 2097 2098 onError: 2099 Py_DECREF(repr); 2100 return NULL; 2101} 2102 2103PyObject *PyUnicode_AsLatin1String(PyObject *unicode) 2104{ 2105 if (!PyUnicode_Check(unicode)) { 2106 PyErr_BadArgument(); 2107 return NULL; 2108 } 2109 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode), 2110 PyUnicode_GET_SIZE(unicode), 2111 NULL); 2112} 2113 2114/* --- 7-bit ASCII Codec -------------------------------------------------- */ 2115 2116static 2117int ascii_decoding_error(const char **source, 2118 Py_UNICODE **dest, 2119 const char *errors, 2120 const char *details) 2121{ 2122 if ((errors == NULL) || 2123 (strcmp(errors,"strict") == 0)) { 2124 PyErr_Format(PyExc_UnicodeError, 2125 "ASCII decoding error: %.400s", 2126 details); 2127 return -1; 2128 } 2129 else if (strcmp(errors,"ignore") == 0) { 2130 return 0; 2131 } 2132 else if (strcmp(errors,"replace") == 0) { 2133 **dest = Py_UNICODE_REPLACEMENT_CHARACTER; 2134 (*dest)++; 2135 return 0; 2136 } 2137 else { 2138 PyErr_Format(PyExc_ValueError, 2139 "ASCII decoding error; " 2140 "unknown error handling code: %.400s", 2141 errors); 2142 return -1; 2143 } 2144} 2145 2146PyObject *PyUnicode_DecodeASCII(const char *s, 2147 int size, 2148 const char *errors) 2149{ 2150 PyUnicodeObject *v; 2151 Py_UNICODE *p; 2152 2153 /* ASCII is equivalent to the first 128 ordinals in Unicode. */ 2154 if (size == 1 && *(unsigned char*)s < 128) { 2155 Py_UNICODE r = *(unsigned char*)s; 2156 return PyUnicode_FromUnicode(&r, 1); 2157 } 2158 2159 v = _PyUnicode_New(size); 2160 if (v == NULL) 2161 goto onError; 2162 if (size == 0) 2163 return (PyObject *)v; 2164 p = PyUnicode_AS_UNICODE(v); 2165 while (size-- > 0) { 2166 register unsigned char c; 2167 2168 c = (unsigned char)*s++; 2169 if (c < 128) 2170 *p++ = c; 2171 else if (ascii_decoding_error(&s, &p, errors, 2172 "ordinal not in range(128)")) 2173 goto onError; 2174 } 2175 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v)) 2176 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v)))) 2177 goto onError; 2178 return (PyObject *)v; 2179 2180 onError: 2181 Py_XDECREF(v); 2182 return NULL; 2183} 2184 2185static 2186int ascii_encoding_error(const Py_UNICODE **source, 2187 char **dest, 2188 const char *errors, 2189 const char *details) 2190{ 2191 if ((errors == NULL) || 2192 (strcmp(errors,"strict") == 0)) { 2193 PyErr_Format(PyExc_UnicodeError, 2194 "ASCII encoding error: %.400s", 2195 details); 2196 return -1; 2197 } 2198 else if (strcmp(errors,"ignore") == 0) { 2199 return 0; 2200 } 2201 else if (strcmp(errors,"replace") == 0) { 2202 **dest = '?'; 2203 (*dest)++; 2204 return 0; 2205 } 2206 else { 2207 PyErr_Format(PyExc_ValueError, 2208 "ASCII encoding error; " 2209 "unknown error handling code: %.400s", 2210 errors); 2211 return -1; 2212 } 2213} 2214 2215PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p, 2216 int size, 2217 const char *errors) 2218{ 2219 PyObject *repr; 2220 char *s, *start; 2221 2222 repr = PyString_FromStringAndSize(NULL, size); 2223 if (repr == NULL) 2224 return NULL; 2225 if (size == 0) 2226 return repr; 2227 2228 s = PyString_AS_STRING(repr); 2229 start = s; 2230 while (size-- > 0) { 2231 Py_UNICODE ch = *p++; 2232 if (ch >= 128) { 2233 if (ascii_encoding_error(&p, &s, errors, 2234 "ordinal not in range(128)")) 2235 goto onError; 2236 } 2237 else 2238 *s++ = (char)ch; 2239 } 2240 /* Resize if error handling skipped some characters */ 2241 if (s - start < PyString_GET_SIZE(repr)) 2242 if (_PyString_Resize(&repr, s - start)) 2243 goto onError; 2244 return repr; 2245 2246 onError: 2247 Py_DECREF(repr); 2248 return NULL; 2249} 2250 2251PyObject *PyUnicode_AsASCIIString(PyObject *unicode) 2252{ 2253 if (!PyUnicode_Check(unicode)) { 2254 PyErr_BadArgument(); 2255 return NULL; 2256 } 2257 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode), 2258 PyUnicode_GET_SIZE(unicode), 2259 NULL); 2260} 2261 2262#if defined(MS_WIN32) && defined(HAVE_USABLE_WCHAR_T) 2263 2264/* --- MBCS codecs for Windows -------------------------------------------- */ 2265 2266PyObject *PyUnicode_DecodeMBCS(const char *s, 2267 int size, 2268 const char *errors) 2269{ 2270 PyUnicodeObject *v; 2271 Py_UNICODE *p; 2272 2273 /* First get the size of the result */ 2274 DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0); 2275 if (size > 0 && usize==0) 2276 return PyErr_SetFromWindowsErrWithFilename(0, NULL); 2277 2278 v = _PyUnicode_New(usize); 2279 if (v == NULL) 2280 return NULL; 2281 if (usize == 0) 2282 return (PyObject *)v; 2283 p = PyUnicode_AS_UNICODE(v); 2284 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) { 2285 Py_DECREF(v); 2286 return PyErr_SetFromWindowsErrWithFilename(0, NULL); 2287 } 2288 2289 return (PyObject *)v; 2290} 2291 2292PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p, 2293 int size, 2294 const char *errors) 2295{ 2296 PyObject *repr; 2297 char *s; 2298 DWORD mbcssize; 2299 2300 /* If there are no characters, bail now! */ 2301 if (size==0) 2302 return PyString_FromString(""); 2303 2304 /* First get the size of the result */ 2305 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL); 2306 if (mbcssize==0) 2307 return PyErr_SetFromWindowsErrWithFilename(0, NULL); 2308 2309 repr = PyString_FromStringAndSize(NULL, mbcssize); 2310 if (repr == NULL) 2311 return NULL; 2312 if (mbcssize == 0) 2313 return repr; 2314 2315 /* Do the conversion */ 2316 s = PyString_AS_STRING(repr); 2317 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) { 2318 Py_DECREF(repr); 2319 return PyErr_SetFromWindowsErrWithFilename(0, NULL); 2320 } 2321 return repr; 2322} 2323 2324#endif /* MS_WIN32 */ 2325 2326/* --- Character Mapping Codec -------------------------------------------- */ 2327 2328static 2329int charmap_decoding_error(const char **source, 2330 Py_UNICODE **dest, 2331 const char *errors, 2332 const char *details) 2333{ 2334 if ((errors == NULL) || 2335 (strcmp(errors,"strict") == 0)) { 2336 PyErr_Format(PyExc_UnicodeError, 2337 "charmap decoding error: %.400s", 2338 details); 2339 return -1; 2340 } 2341 else if (strcmp(errors,"ignore") == 0) { 2342 return 0; 2343 } 2344 else if (strcmp(errors,"replace") == 0) { 2345 **dest = Py_UNICODE_REPLACEMENT_CHARACTER; 2346 (*dest)++; 2347 return 0; 2348 } 2349 else { 2350 PyErr_Format(PyExc_ValueError, 2351 "charmap decoding error; " 2352 "unknown error handling code: %.400s", 2353 errors); 2354 return -1; 2355 } 2356} 2357 2358PyObject *PyUnicode_DecodeCharmap(const char *s, 2359 int size, 2360 PyObject *mapping, 2361 const char *errors) 2362{ 2363 PyUnicodeObject *v; 2364 Py_UNICODE *p; 2365 int extrachars = 0; 2366 2367 /* Default to Latin-1 */ 2368 if (mapping == NULL) 2369 return PyUnicode_DecodeLatin1(s, size, errors); 2370 2371 v = _PyUnicode_New(size); 2372 if (v == NULL) 2373 goto onError; 2374 if (size == 0) 2375 return (PyObject *)v; 2376 p = PyUnicode_AS_UNICODE(v); 2377 while (size-- > 0) { 2378 unsigned char ch = *s++; 2379 PyObject *w, *x; 2380 2381 /* Get mapping (char ordinal -> integer, Unicode char or None) */ 2382 w = PyInt_FromLong((long)ch); 2383 if (w == NULL) 2384 goto onError; 2385 x = PyObject_GetItem(mapping, w); 2386 Py_DECREF(w); 2387 if (x == NULL) { 2388 if (PyErr_ExceptionMatches(PyExc_LookupError)) { 2389 /* No mapping found means: mapping is undefined. */ 2390 PyErr_Clear(); 2391 x = Py_None; 2392 Py_INCREF(x); 2393 } else 2394 goto onError; 2395 } 2396 2397 /* Apply mapping */ 2398 if (PyInt_Check(x)) { 2399 long value = PyInt_AS_LONG(x); 2400 if (value < 0 || value > 65535) { 2401 PyErr_SetString(PyExc_TypeError, 2402 "character mapping must be in range(65536)"); 2403 Py_DECREF(x); 2404 goto onError; 2405 } 2406 *p++ = (Py_UNICODE)value; 2407 } 2408 else if (x == Py_None) { 2409 /* undefined mapping */ 2410 if (charmap_decoding_error(&s, &p, errors, 2411 "character maps to <undefined>")) { 2412 Py_DECREF(x); 2413 goto onError; 2414 } 2415 } 2416 else if (PyUnicode_Check(x)) { 2417 int targetsize = PyUnicode_GET_SIZE(x); 2418 2419 if (targetsize == 1) 2420 /* 1-1 mapping */ 2421 *p++ = *PyUnicode_AS_UNICODE(x); 2422 2423 else if (targetsize > 1) { 2424 /* 1-n mapping */ 2425 if (targetsize > extrachars) { 2426 /* resize first */ 2427 int oldpos = (int)(p - PyUnicode_AS_UNICODE(v)); 2428 int needed = (targetsize - extrachars) + \ 2429 (targetsize << 2); 2430 extrachars += needed; 2431 if (_PyUnicode_Resize(&v, 2432 PyUnicode_GET_SIZE(v) + needed)) { 2433 Py_DECREF(x); 2434 goto onError; 2435 } 2436 p = PyUnicode_AS_UNICODE(v) + oldpos; 2437 } 2438 Py_UNICODE_COPY(p, 2439 PyUnicode_AS_UNICODE(x), 2440 targetsize); 2441 p += targetsize; 2442 extrachars -= targetsize; 2443 } 2444 /* 1-0 mapping: skip the character */ 2445 } 2446 else { 2447 /* wrong return value */ 2448 PyErr_SetString(PyExc_TypeError, 2449 "character mapping must return integer, None or unicode"); 2450 Py_DECREF(x); 2451 goto onError; 2452 } 2453 Py_DECREF(x); 2454 } 2455 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v)) 2456 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v)))) 2457 goto onError; 2458 return (PyObject *)v; 2459 2460 onError: 2461 Py_XDECREF(v); 2462 return NULL; 2463} 2464 2465static 2466int charmap_encoding_error(const Py_UNICODE **source, 2467 char **dest, 2468 const char *errors, 2469 const char *details) 2470{ 2471 if ((errors == NULL) || 2472 (strcmp(errors,"strict") == 0)) { 2473 PyErr_Format(PyExc_UnicodeError, 2474 "charmap encoding error: %.400s", 2475 details); 2476 return -1; 2477 } 2478 else if (strcmp(errors,"ignore") == 0) { 2479 return 0; 2480 } 2481 else if (strcmp(errors,"replace") == 0) { 2482 **dest = '?'; 2483 (*dest)++; 2484 return 0; 2485 } 2486 else { 2487 PyErr_Format(PyExc_ValueError, 2488 "charmap encoding error; " 2489 "unknown error handling code: %.400s", 2490 errors); 2491 return -1; 2492 } 2493} 2494 2495PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p, 2496 int size, 2497 PyObject *mapping, 2498 const char *errors) 2499{ 2500 PyObject *v; 2501 char *s; 2502 int extrachars = 0; 2503 2504 /* Default to Latin-1 */ 2505 if (mapping == NULL) 2506 return PyUnicode_EncodeLatin1(p, size, errors); 2507 2508 v = PyString_FromStringAndSize(NULL, size); 2509 if (v == NULL) 2510 return NULL; 2511 if (size == 0) 2512 return v; 2513 s = PyString_AS_STRING(v); 2514 while (size-- > 0) { 2515 Py_UNICODE ch = *p++; 2516 PyObject *w, *x; 2517 2518 /* Get mapping (Unicode ordinal -> string char, integer or None) */ 2519 w = PyInt_FromLong((long)ch); 2520 if (w == NULL) 2521 goto onError; 2522 x = PyObject_GetItem(mapping, w); 2523 Py_DECREF(w); 2524 if (x == NULL) { 2525 if (PyErr_ExceptionMatches(PyExc_LookupError)) { 2526 /* No mapping found means: mapping is undefined. */ 2527 PyErr_Clear(); 2528 x = Py_None; 2529 Py_INCREF(x); 2530 } else 2531 goto onError; 2532 } 2533 2534 /* Apply mapping */ 2535 if (PyInt_Check(x)) { 2536 long value = PyInt_AS_LONG(x); 2537 if (value < 0 || value > 255) { 2538 PyErr_SetString(PyExc_TypeError, 2539 "character mapping must be in range(256)"); 2540 Py_DECREF(x); 2541 goto onError; 2542 } 2543 *s++ = (char)value; 2544 } 2545 else if (x == Py_None) { 2546 /* undefined mapping */ 2547 if (charmap_encoding_error(&p, &s, errors, 2548 "character maps to <undefined>")) { 2549 Py_DECREF(x); 2550 goto onError; 2551 } 2552 } 2553 else if (PyString_Check(x)) { 2554 int targetsize = PyString_GET_SIZE(x); 2555 2556 if (targetsize == 1) 2557 /* 1-1 mapping */ 2558 *s++ = *PyString_AS_STRING(x); 2559 2560 else if (targetsize > 1) { 2561 /* 1-n mapping */ 2562 if (targetsize > extrachars) { 2563 /* resize first */ 2564 int oldpos = (int)(s - PyString_AS_STRING(v)); 2565 int needed = (targetsize - extrachars) + \ 2566 (targetsize << 2); 2567 extrachars += needed; 2568 if (_PyString_Resize(&v, PyString_GET_SIZE(v) + needed)) { 2569 Py_DECREF(x); 2570 goto onError; 2571 } 2572 s = PyString_AS_STRING(v) + oldpos; 2573 } 2574 memcpy(s, PyString_AS_STRING(x), targetsize); 2575 s += targetsize; 2576 extrachars -= targetsize; 2577 } 2578 /* 1-0 mapping: skip the character */ 2579 } 2580 else { 2581 /* wrong return value */ 2582 PyErr_SetString(PyExc_TypeError, 2583 "character mapping must return integer, None or unicode"); 2584 Py_DECREF(x); 2585 goto onError; 2586 } 2587 Py_DECREF(x); 2588 } 2589 if (s - PyString_AS_STRING(v) < PyString_GET_SIZE(v)) 2590 if (_PyString_Resize(&v, (int)(s - PyString_AS_STRING(v)))) 2591 goto onError; 2592 return v; 2593 2594 onError: 2595 Py_DECREF(v); 2596 return NULL; 2597} 2598 2599PyObject *PyUnicode_AsCharmapString(PyObject *unicode, 2600 PyObject *mapping) 2601{ 2602 if (!PyUnicode_Check(unicode) || mapping == NULL) { 2603 PyErr_BadArgument(); 2604 return NULL; 2605 } 2606 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode), 2607 PyUnicode_GET_SIZE(unicode), 2608 mapping, 2609 NULL); 2610} 2611 2612static 2613int translate_error(const Py_UNICODE **source, 2614 Py_UNICODE **dest, 2615 const char *errors, 2616 const char *details) 2617{ 2618 if ((errors == NULL) || 2619 (strcmp(errors,"strict") == 0)) { 2620 PyErr_Format(PyExc_UnicodeError, 2621 "translate error: %.400s", 2622 details); 2623 return -1; 2624 } 2625 else if (strcmp(errors,"ignore") == 0) { 2626 return 0; 2627 } 2628 else if (strcmp(errors,"replace") == 0) { 2629 **dest = '?'; 2630 (*dest)++; 2631 return 0; 2632 } 2633 else { 2634 PyErr_Format(PyExc_ValueError, 2635 "translate error; " 2636 "unknown error handling code: %.400s", 2637 errors); 2638 return -1; 2639 } 2640} 2641 2642PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *s, 2643 int size, 2644 PyObject *mapping, 2645 const char *errors) 2646{ 2647 PyUnicodeObject *v; 2648 Py_UNICODE *p; 2649 2650 if (mapping == NULL) { 2651 PyErr_BadArgument(); 2652 return NULL; 2653 } 2654 2655 /* Output will never be longer than input */ 2656 v = _PyUnicode_New(size); 2657 if (v == NULL) 2658 goto onError; 2659 if (size == 0) 2660 goto done; 2661 p = PyUnicode_AS_UNICODE(v); 2662 while (size-- > 0) { 2663 Py_UNICODE ch = *s++; 2664 PyObject *w, *x; 2665 2666 /* Get mapping */ 2667 w = PyInt_FromLong(ch); 2668 if (w == NULL) 2669 goto onError; 2670 x = PyObject_GetItem(mapping, w); 2671 Py_DECREF(w); 2672 if (x == NULL) { 2673 if (PyErr_ExceptionMatches(PyExc_LookupError)) { 2674 /* No mapping found: default to 1-1 mapping */ 2675 PyErr_Clear(); 2676 *p++ = ch; 2677 continue; 2678 } 2679 goto onError; 2680 } 2681 2682 /* Apply mapping */ 2683 if (PyInt_Check(x)) 2684 *p++ = (Py_UNICODE)PyInt_AS_LONG(x); 2685 else if (x == Py_None) { 2686 /* undefined mapping */ 2687 if (translate_error(&s, &p, errors, 2688 "character maps to <undefined>")) { 2689 Py_DECREF(x); 2690 goto onError; 2691 } 2692 } 2693 else if (PyUnicode_Check(x)) { 2694 if (PyUnicode_GET_SIZE(x) != 1) { 2695 /* 1-n mapping */ 2696 PyErr_SetString(PyExc_NotImplementedError, 2697 "1-n mappings are currently not implemented"); 2698 Py_DECREF(x); 2699 goto onError; 2700 } 2701 *p++ = *PyUnicode_AS_UNICODE(x); 2702 } 2703 else { 2704 /* wrong return value */ 2705 PyErr_SetString(PyExc_TypeError, 2706 "translate mapping must return integer, None or unicode"); 2707 Py_DECREF(x); 2708 goto onError; 2709 } 2710 Py_DECREF(x); 2711 } 2712 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v)) 2713 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v)))) 2714 goto onError; 2715 2716 done: 2717 return (PyObject *)v; 2718 2719 onError: 2720 Py_XDECREF(v); 2721 return NULL; 2722} 2723 2724PyObject *PyUnicode_Translate(PyObject *str, 2725 PyObject *mapping, 2726 const char *errors) 2727{ 2728 PyObject *result; 2729 2730 str = PyUnicode_FromObject(str); 2731 if (str == NULL) 2732 goto onError; 2733 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str), 2734 PyUnicode_GET_SIZE(str), 2735 mapping, 2736 errors); 2737 Py_DECREF(str); 2738 return result; 2739 2740 onError: 2741 Py_XDECREF(str); 2742 return NULL; 2743} 2744 2745/* --- Decimal Encoder ---------------------------------------------------- */ 2746 2747int PyUnicode_EncodeDecimal(Py_UNICODE *s, 2748 int length, 2749 char *output, 2750 const char *errors) 2751{ 2752 Py_UNICODE *p, *end; 2753 2754 if (output == NULL) { 2755 PyErr_BadArgument(); 2756 return -1; 2757 } 2758 2759 p = s; 2760 end = s + length; 2761 while (p < end) { 2762 register Py_UNICODE ch = *p++; 2763 int decimal; 2764 2765 if (Py_UNICODE_ISSPACE(ch)) { 2766 *output++ = ' '; 2767 continue; 2768 } 2769 decimal = Py_UNICODE_TODECIMAL(ch); 2770 if (decimal >= 0) { 2771 *output++ = '0' + decimal; 2772 continue; 2773 } 2774 if (0 < ch && ch < 256) { 2775 *output++ = (char)ch; 2776 continue; 2777 } 2778 /* All other characters are considered invalid */ 2779 if (errors == NULL || strcmp(errors, "strict") == 0) { 2780 PyErr_SetString(PyExc_ValueError, 2781 "invalid decimal Unicode string"); 2782 goto onError; 2783 } 2784 else if (strcmp(errors, "ignore") == 0) 2785 continue; 2786 else if (strcmp(errors, "replace") == 0) { 2787 *output++ = '?'; 2788 continue; 2789 } 2790 } 2791 /* 0-terminate the output string */ 2792 *output++ = '\0'; 2793 return 0; 2794 2795 onError: 2796 return -1; 2797} 2798 2799/* --- Helpers ------------------------------------------------------------ */ 2800 2801static 2802int count(PyUnicodeObject *self, 2803 int start, 2804 int end, 2805 PyUnicodeObject *substring) 2806{ 2807 int count = 0; 2808 2809 if (start < 0) 2810 start += self->length; 2811 if (start < 0) 2812 start = 0; 2813 if (end > self->length) 2814 end = self->length; 2815 if (end < 0) 2816 end += self->length; 2817 if (end < 0) 2818 end = 0; 2819 2820 if (substring->length == 0) 2821 return (end - start + 1); 2822 2823 end -= substring->length; 2824 2825 while (start <= end) 2826 if (Py_UNICODE_MATCH(self, start, substring)) { 2827 count++; 2828 start += substring->length; 2829 } else 2830 start++; 2831 2832 return count; 2833} 2834 2835int PyUnicode_Count(PyObject *str, 2836 PyObject *substr, 2837 int start, 2838 int end) 2839{ 2840 int result; 2841 2842 str = PyUnicode_FromObject(str); 2843 if (str == NULL) 2844 return -1; 2845 substr = PyUnicode_FromObject(substr); 2846 if (substr == NULL) { 2847 Py_DECREF(str); 2848 return -1; 2849 } 2850 2851 result = count((PyUnicodeObject *)str, 2852 start, end, 2853 (PyUnicodeObject *)substr); 2854 2855 Py_DECREF(str); 2856 Py_DECREF(substr); 2857 return result; 2858} 2859 2860static 2861int findstring(PyUnicodeObject *self, 2862 PyUnicodeObject *substring, 2863 int start, 2864 int end, 2865 int direction) 2866{ 2867 if (start < 0) 2868 start += self->length; 2869 if (start < 0) 2870 start = 0; 2871 2872 if (substring->length == 0) 2873 return start; 2874 2875 if (end > self->length) 2876 end = self->length; 2877 if (end < 0) 2878 end += self->length; 2879 if (end < 0) 2880 end = 0; 2881 2882 end -= substring->length; 2883 2884 if (direction < 0) { 2885 for (; end >= start; end--) 2886 if (Py_UNICODE_MATCH(self, end, substring)) 2887 return end; 2888 } else { 2889 for (; start <= end; start++) 2890 if (Py_UNICODE_MATCH(self, start, substring)) 2891 return start; 2892 } 2893 2894 return -1; 2895} 2896 2897int PyUnicode_Find(PyObject *str, 2898 PyObject *substr, 2899 int start, 2900 int end, 2901 int direction) 2902{ 2903 int result; 2904 2905 str = PyUnicode_FromObject(str); 2906 if (str == NULL) 2907 return -1; 2908 substr = PyUnicode_FromObject(substr); 2909 if (substr == NULL) { 2910 Py_DECREF(substr); 2911 return -1; 2912 } 2913 2914 result = findstring((PyUnicodeObject *)str, 2915 (PyUnicodeObject *)substr, 2916 start, end, direction); 2917 Py_DECREF(str); 2918 Py_DECREF(substr); 2919 return result; 2920} 2921 2922static 2923int tailmatch(PyUnicodeObject *self, 2924 PyUnicodeObject *substring, 2925 int start, 2926 int end, 2927 int direction) 2928{ 2929 if (start < 0) 2930 start += self->length; 2931 if (start < 0) 2932 start = 0; 2933 2934 if (substring->length == 0) 2935 return 1; 2936 2937 if (end > self->length) 2938 end = self->length; 2939 if (end < 0) 2940 end += self->length; 2941 if (end < 0) 2942 end = 0; 2943 2944 end -= substring->length; 2945 if (end < start) 2946 return 0; 2947 2948 if (direction > 0) { 2949 if (Py_UNICODE_MATCH(self, end, substring)) 2950 return 1; 2951 } else { 2952 if (Py_UNICODE_MATCH(self, start, substring)) 2953 return 1; 2954 } 2955 2956 return 0; 2957} 2958 2959int PyUnicode_Tailmatch(PyObject *str, 2960 PyObject *substr, 2961 int start, 2962 int end, 2963 int direction) 2964{ 2965 int result; 2966 2967 str = PyUnicode_FromObject(str); 2968 if (str == NULL) 2969 return -1; 2970 substr = PyUnicode_FromObject(substr); 2971 if (substr == NULL) { 2972 Py_DECREF(substr); 2973 return -1; 2974 } 2975 2976 result = tailmatch((PyUnicodeObject *)str, 2977 (PyUnicodeObject *)substr, 2978 start, end, direction); 2979 Py_DECREF(str); 2980 Py_DECREF(substr); 2981 return result; 2982} 2983 2984static 2985const Py_UNICODE *findchar(const Py_UNICODE *s, 2986 int size, 2987 Py_UNICODE ch) 2988{ 2989 /* like wcschr, but doesn't stop at NULL characters */ 2990 2991 while (size-- > 0) { 2992 if (*s == ch) 2993 return s; 2994 s++; 2995 } 2996 2997 return NULL; 2998} 2999 3000/* Apply fixfct filter to the Unicode object self and return a 3001 reference to the modified object */ 3002 3003static 3004PyObject *fixup(PyUnicodeObject *self, 3005 int (*fixfct)(PyUnicodeObject *s)) 3006{ 3007 3008 PyUnicodeObject *u; 3009 3010 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length); 3011 if (u == NULL) 3012 return NULL; 3013 3014 Py_UNICODE_COPY(u->str, self->str, self->length); 3015 3016 if (!fixfct(u) && PyUnicode_CheckExact(self)) { 3017 /* fixfct should return TRUE if it modified the buffer. If 3018 FALSE, return a reference to the original buffer instead 3019 (to save space, not time) */ 3020 Py_INCREF(self); 3021 Py_DECREF(u); 3022 return (PyObject*) self; 3023 } 3024 return (PyObject*) u; 3025} 3026 3027static 3028int fixupper(PyUnicodeObject *self) 3029{ 3030 int len = self->length; 3031 Py_UNICODE *s = self->str; 3032 int status = 0; 3033 3034 while (len-- > 0) { 3035 register Py_UNICODE ch; 3036 3037 ch = Py_UNICODE_TOUPPER(*s); 3038 if (ch != *s) { 3039 status = 1; 3040 *s = ch; 3041 } 3042 s++; 3043 } 3044 3045 return status; 3046} 3047 3048static 3049int fixlower(PyUnicodeObject *self) 3050{ 3051 int len = self->length; 3052 Py_UNICODE *s = self->str; 3053 int status = 0; 3054 3055 while (len-- > 0) { 3056 register Py_UNICODE ch; 3057 3058 ch = Py_UNICODE_TOLOWER(*s); 3059 if (ch != *s) { 3060 status = 1; 3061 *s = ch; 3062 } 3063 s++; 3064 } 3065 3066 return status; 3067} 3068 3069static 3070int fixswapcase(PyUnicodeObject *self) 3071{ 3072 int len = self->length; 3073 Py_UNICODE *s = self->str; 3074 int status = 0; 3075 3076 while (len-- > 0) { 3077 if (Py_UNICODE_ISUPPER(*s)) { 3078 *s = Py_UNICODE_TOLOWER(*s); 3079 status = 1; 3080 } else if (Py_UNICODE_ISLOWER(*s)) { 3081 *s = Py_UNICODE_TOUPPER(*s); 3082 status = 1; 3083 } 3084 s++; 3085 } 3086 3087 return status; 3088} 3089 3090static 3091int fixcapitalize(PyUnicodeObject *self) 3092{ 3093 int len = self->length; 3094 Py_UNICODE *s = self->str; 3095 int status = 0; 3096 3097 if (len == 0) 3098 return 0; 3099 if (Py_UNICODE_ISLOWER(*s)) { 3100 *s = Py_UNICODE_TOUPPER(*s); 3101 status = 1; 3102 } 3103 s++; 3104 while (--len > 0) { 3105 if (Py_UNICODE_ISUPPER(*s)) { 3106 *s = Py_UNICODE_TOLOWER(*s); 3107 status = 1; 3108 } 3109 s++; 3110 } 3111 return status; 3112} 3113 3114static 3115int fixtitle(PyUnicodeObject *self) 3116{ 3117 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 3118 register Py_UNICODE *e; 3119 int previous_is_cased; 3120 3121 /* Shortcut for single character strings */ 3122 if (PyUnicode_GET_SIZE(self) == 1) { 3123 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p); 3124 if (*p != ch) { 3125 *p = ch; 3126 return 1; 3127 } 3128 else 3129 return 0; 3130 } 3131 3132 e = p + PyUnicode_GET_SIZE(self); 3133 previous_is_cased = 0; 3134 for (; p < e; p++) { 3135 register const Py_UNICODE ch = *p; 3136 3137 if (previous_is_cased) 3138 *p = Py_UNICODE_TOLOWER(ch); 3139 else 3140 *p = Py_UNICODE_TOTITLE(ch); 3141 3142 if (Py_UNICODE_ISLOWER(ch) || 3143 Py_UNICODE_ISUPPER(ch) || 3144 Py_UNICODE_ISTITLE(ch)) 3145 previous_is_cased = 1; 3146 else 3147 previous_is_cased = 0; 3148 } 3149 return 1; 3150} 3151 3152PyObject *PyUnicode_Join(PyObject *separator, 3153 PyObject *seq) 3154{ 3155 Py_UNICODE *sep; 3156 int seplen; 3157 PyUnicodeObject *res = NULL; 3158 int reslen = 0; 3159 Py_UNICODE *p; 3160 int sz = 100; 3161 int i; 3162 PyObject *it; 3163 3164 it = PyObject_GetIter(seq); 3165 if (it == NULL) 3166 return NULL; 3167 3168 if (separator == NULL) { 3169 Py_UNICODE blank = ' '; 3170 sep = ␣ 3171 seplen = 1; 3172 } 3173 else { 3174 separator = PyUnicode_FromObject(separator); 3175 if (separator == NULL) 3176 goto onError; 3177 sep = PyUnicode_AS_UNICODE(separator); 3178 seplen = PyUnicode_GET_SIZE(separator); 3179 } 3180 3181 res = _PyUnicode_New(sz); 3182 if (res == NULL) 3183 goto onError; 3184 p = PyUnicode_AS_UNICODE(res); 3185 reslen = 0; 3186 3187 for (i = 0; ; ++i) { 3188 int itemlen; 3189 PyObject *item = PyIter_Next(it); 3190 if (item == NULL) { 3191 if (PyErr_Occurred()) 3192 goto onError; 3193 break; 3194 } 3195 if (!PyUnicode_Check(item)) { 3196 PyObject *v; 3197 if (!PyString_Check(item)) { 3198 PyErr_Format(PyExc_TypeError, 3199 "sequence item %i: expected string or Unicode," 3200 " %.80s found", 3201 i, item->ob_type->tp_name); 3202 Py_DECREF(item); 3203 goto onError; 3204 } 3205 v = PyUnicode_FromObject(item); 3206 Py_DECREF(item); 3207 item = v; 3208 if (item == NULL) 3209 goto onError; 3210 } 3211 itemlen = PyUnicode_GET_SIZE(item); 3212 while (reslen + itemlen + seplen >= sz) { 3213 if (_PyUnicode_Resize(&res, sz*2)) { 3214 Py_DECREF(item); 3215 goto onError; 3216 } 3217 sz *= 2; 3218 p = PyUnicode_AS_UNICODE(res) + reslen; 3219 } 3220 if (i > 0) { 3221 Py_UNICODE_COPY(p, sep, seplen); 3222 p += seplen; 3223 reslen += seplen; 3224 } 3225 Py_UNICODE_COPY(p, PyUnicode_AS_UNICODE(item), itemlen); 3226 p += itemlen; 3227 reslen += itemlen; 3228 Py_DECREF(item); 3229 } 3230 if (_PyUnicode_Resize(&res, reslen)) 3231 goto onError; 3232 3233 Py_XDECREF(separator); 3234 Py_DECREF(it); 3235 return (PyObject *)res; 3236 3237 onError: 3238 Py_XDECREF(separator); 3239 Py_XDECREF(res); 3240 Py_DECREF(it); 3241 return NULL; 3242} 3243 3244static 3245PyUnicodeObject *pad(PyUnicodeObject *self, 3246 int left, 3247 int right, 3248 Py_UNICODE fill) 3249{ 3250 PyUnicodeObject *u; 3251 3252 if (left < 0) 3253 left = 0; 3254 if (right < 0) 3255 right = 0; 3256 3257 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) { 3258 Py_INCREF(self); 3259 return self; 3260 } 3261 3262 u = _PyUnicode_New(left + self->length + right); 3263 if (u) { 3264 if (left) 3265 Py_UNICODE_FILL(u->str, fill, left); 3266 Py_UNICODE_COPY(u->str + left, self->str, self->length); 3267 if (right) 3268 Py_UNICODE_FILL(u->str + left + self->length, fill, right); 3269 } 3270 3271 return u; 3272} 3273 3274#define SPLIT_APPEND(data, left, right) \ 3275 str = PyUnicode_FromUnicode(data + left, right - left); \ 3276 if (!str) \ 3277 goto onError; \ 3278 if (PyList_Append(list, str)) { \ 3279 Py_DECREF(str); \ 3280 goto onError; \ 3281 } \ 3282 else \ 3283 Py_DECREF(str); 3284 3285static 3286PyObject *split_whitespace(PyUnicodeObject *self, 3287 PyObject *list, 3288 int maxcount) 3289{ 3290 register int i; 3291 register int j; 3292 int len = self->length; 3293 PyObject *str; 3294 3295 for (i = j = 0; i < len; ) { 3296 /* find a token */ 3297 while (i < len && Py_UNICODE_ISSPACE(self->str[i])) 3298 i++; 3299 j = i; 3300 while (i < len && !Py_UNICODE_ISSPACE(self->str[i])) 3301 i++; 3302 if (j < i) { 3303 if (maxcount-- <= 0) 3304 break; 3305 SPLIT_APPEND(self->str, j, i); 3306 while (i < len && Py_UNICODE_ISSPACE(self->str[i])) 3307 i++; 3308 j = i; 3309 } 3310 } 3311 if (j < len) { 3312 SPLIT_APPEND(self->str, j, len); 3313 } 3314 return list; 3315 3316 onError: 3317 Py_DECREF(list); 3318 return NULL; 3319} 3320 3321PyObject *PyUnicode_Splitlines(PyObject *string, 3322 int keepends) 3323{ 3324 register int i; 3325 register int j; 3326 int len; 3327 PyObject *list; 3328 PyObject *str; 3329 Py_UNICODE *data; 3330 3331 string = PyUnicode_FromObject(string); 3332 if (string == NULL) 3333 return NULL; 3334 data = PyUnicode_AS_UNICODE(string); 3335 len = PyUnicode_GET_SIZE(string); 3336 3337 list = PyList_New(0); 3338 if (!list) 3339 goto onError; 3340 3341 for (i = j = 0; i < len; ) { 3342 int eol; 3343 3344 /* Find a line and append it */ 3345 while (i < len && !Py_UNICODE_ISLINEBREAK(data[i])) 3346 i++; 3347 3348 /* Skip the line break reading CRLF as one line break */ 3349 eol = i; 3350 if (i < len) { 3351 if (data[i] == '\r' && i + 1 < len && 3352 data[i+1] == '\n') 3353 i += 2; 3354 else 3355 i++; 3356 if (keepends) 3357 eol = i; 3358 } 3359 SPLIT_APPEND(data, j, eol); 3360 j = i; 3361 } 3362 if (j < len) { 3363 SPLIT_APPEND(data, j, len); 3364 } 3365 3366 Py_DECREF(string); 3367 return list; 3368 3369 onError: 3370 Py_DECREF(list); 3371 Py_DECREF(string); 3372 return NULL; 3373} 3374 3375static 3376PyObject *split_char(PyUnicodeObject *self, 3377 PyObject *list, 3378 Py_UNICODE ch, 3379 int maxcount) 3380{ 3381 register int i; 3382 register int j; 3383 int len = self->length; 3384 PyObject *str; 3385 3386 for (i = j = 0; i < len; ) { 3387 if (self->str[i] == ch) { 3388 if (maxcount-- <= 0) 3389 break; 3390 SPLIT_APPEND(self->str, j, i); 3391 i = j = i + 1; 3392 } else 3393 i++; 3394 } 3395 if (j <= len) { 3396 SPLIT_APPEND(self->str, j, len); 3397 } 3398 return list; 3399 3400 onError: 3401 Py_DECREF(list); 3402 return NULL; 3403} 3404 3405static 3406PyObject *split_substring(PyUnicodeObject *self, 3407 PyObject *list, 3408 PyUnicodeObject *substring, 3409 int maxcount) 3410{ 3411 register int i; 3412 register int j; 3413 int len = self->length; 3414 int sublen = substring->length; 3415 PyObject *str; 3416 3417 for (i = j = 0; i <= len - sublen; ) { 3418 if (Py_UNICODE_MATCH(self, i, substring)) { 3419 if (maxcount-- <= 0) 3420 break; 3421 SPLIT_APPEND(self->str, j, i); 3422 i = j = i + sublen; 3423 } else 3424 i++; 3425 } 3426 if (j <= len) { 3427 SPLIT_APPEND(self->str, j, len); 3428 } 3429 return list; 3430 3431 onError: 3432 Py_DECREF(list); 3433 return NULL; 3434} 3435 3436#undef SPLIT_APPEND 3437 3438static 3439PyObject *split(PyUnicodeObject *self, 3440 PyUnicodeObject *substring, 3441 int maxcount) 3442{ 3443 PyObject *list; 3444 3445 if (maxcount < 0) 3446 maxcount = INT_MAX; 3447 3448 list = PyList_New(0); 3449 if (!list) 3450 return NULL; 3451 3452 if (substring == NULL) 3453 return split_whitespace(self,list,maxcount); 3454 3455 else if (substring->length == 1) 3456 return split_char(self,list,substring->str[0],maxcount); 3457 3458 else if (substring->length == 0) { 3459 Py_DECREF(list); 3460 PyErr_SetString(PyExc_ValueError, "empty separator"); 3461 return NULL; 3462 } 3463 else 3464 return split_substring(self,list,substring,maxcount); 3465} 3466 3467static 3468PyObject *strip(PyUnicodeObject *self, 3469 int left, 3470 int right) 3471{ 3472 Py_UNICODE *p = self->str; 3473 int start = 0; 3474 int end = self->length; 3475 3476 if (left) 3477 while (start < end && Py_UNICODE_ISSPACE(p[start])) 3478 start++; 3479 3480 if (right) 3481 while (end > start && Py_UNICODE_ISSPACE(p[end-1])) 3482 end--; 3483 3484 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) { 3485 /* couldn't strip anything off, return original string */ 3486 Py_INCREF(self); 3487 return (PyObject*) self; 3488 } 3489 3490 return (PyObject*) PyUnicode_FromUnicode( 3491 self->str + start, 3492 end - start 3493 ); 3494} 3495 3496static 3497PyObject *replace(PyUnicodeObject *self, 3498 PyUnicodeObject *str1, 3499 PyUnicodeObject *str2, 3500 int maxcount) 3501{ 3502 PyUnicodeObject *u; 3503 3504 if (maxcount < 0) 3505 maxcount = INT_MAX; 3506 3507 if (str1->length == 1 && str2->length == 1) { 3508 int i; 3509 3510 /* replace characters */ 3511 if (!findchar(self->str, self->length, str1->str[0]) && 3512 PyUnicode_CheckExact(self)) { 3513 /* nothing to replace, return original string */ 3514 Py_INCREF(self); 3515 u = self; 3516 } else { 3517 Py_UNICODE u1 = str1->str[0]; 3518 Py_UNICODE u2 = str2->str[0]; 3519 3520 u = (PyUnicodeObject*) PyUnicode_FromUnicode( 3521 NULL, 3522 self->length 3523 ); 3524 if (u != NULL) { 3525 Py_UNICODE_COPY(u->str, self->str, 3526 self->length); 3527 for (i = 0; i < u->length; i++) 3528 if (u->str[i] == u1) { 3529 if (--maxcount < 0) 3530 break; 3531 u->str[i] = u2; 3532 } 3533 } 3534 } 3535 3536 } else { 3537 int n, i; 3538 Py_UNICODE *p; 3539 3540 /* replace strings */ 3541 n = count(self, 0, self->length, str1); 3542 if (n > maxcount) 3543 n = maxcount; 3544 if (n == 0 && PyUnicode_CheckExact(self)) { 3545 /* nothing to replace, return original string */ 3546 Py_INCREF(self); 3547 u = self; 3548 } else { 3549 u = _PyUnicode_New( 3550 self->length + n * (str2->length - str1->length)); 3551 if (u) { 3552 i = 0; 3553 p = u->str; 3554 while (i <= self->length - str1->length) 3555 if (Py_UNICODE_MATCH(self, i, str1)) { 3556 /* replace string segment */ 3557 Py_UNICODE_COPY(p, str2->str, str2->length); 3558 p += str2->length; 3559 i += str1->length; 3560 if (--n <= 0) { 3561 /* copy remaining part */ 3562 Py_UNICODE_COPY(p, self->str+i, self->length-i); 3563 break; 3564 } 3565 } else 3566 *p++ = self->str[i++]; 3567 } 3568 } 3569 } 3570 3571 return (PyObject *) u; 3572} 3573 3574/* --- Unicode Object Methods --------------------------------------------- */ 3575 3576static char title__doc__[] = 3577"S.title() -> unicode\n\ 3578\n\ 3579Return a titlecased version of S, i.e. words start with title case\n\ 3580characters, all remaining cased characters have lower case."; 3581 3582static PyObject* 3583unicode_title(PyUnicodeObject *self) 3584{ 3585 return fixup(self, fixtitle); 3586} 3587 3588static char capitalize__doc__[] = 3589"S.capitalize() -> unicode\n\ 3590\n\ 3591Return a capitalized version of S, i.e. make the first character\n\ 3592have upper case."; 3593 3594static PyObject* 3595unicode_capitalize(PyUnicodeObject *self) 3596{ 3597 return fixup(self, fixcapitalize); 3598} 3599 3600#if 0 3601static char capwords__doc__[] = 3602"S.capwords() -> unicode\n\ 3603\n\ 3604Apply .capitalize() to all words in S and return the result with\n\ 3605normalized whitespace (all whitespace strings are replaced by ' ')."; 3606 3607static PyObject* 3608unicode_capwords(PyUnicodeObject *self) 3609{ 3610 PyObject *list; 3611 PyObject *item; 3612 int i; 3613 3614 /* Split into words */ 3615 list = split(self, NULL, -1); 3616 if (!list) 3617 return NULL; 3618 3619 /* Capitalize each word */ 3620 for (i = 0; i < PyList_GET_SIZE(list); i++) { 3621 item = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, i), 3622 fixcapitalize); 3623 if (item == NULL) 3624 goto onError; 3625 Py_DECREF(PyList_GET_ITEM(list, i)); 3626 PyList_SET_ITEM(list, i, item); 3627 } 3628 3629 /* Join the words to form a new string */ 3630 item = PyUnicode_Join(NULL, list); 3631 3632onError: 3633 Py_DECREF(list); 3634 return (PyObject *)item; 3635} 3636#endif 3637 3638static char center__doc__[] = 3639"S.center(width) -> unicode\n\ 3640\n\ 3641Return S centered in a Unicode string of length width. Padding is done\n\ 3642using spaces."; 3643 3644static PyObject * 3645unicode_center(PyUnicodeObject *self, PyObject *args) 3646{ 3647 int marg, left; 3648 int width; 3649 3650 if (!PyArg_ParseTuple(args, "i:center", &width)) 3651 return NULL; 3652 3653 if (self->length >= width && PyUnicode_CheckExact(self)) { 3654 Py_INCREF(self); 3655 return (PyObject*) self; 3656 } 3657 3658 marg = width - self->length; 3659 left = marg / 2 + (marg & width & 1); 3660 3661 return (PyObject*) pad(self, left, marg - left, ' '); 3662} 3663 3664#if 0 3665 3666/* This code should go into some future Unicode collation support 3667 module. The basic comparison should compare ordinals on a naive 3668 basis (this is what Java does and thus JPython too). */ 3669 3670/* speedy UTF-16 code point order comparison */ 3671/* gleaned from: */ 3672/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */ 3673 3674static short utf16Fixup[32] = 3675{ 3676 0, 0, 0, 0, 0, 0, 0, 0, 3677 0, 0, 0, 0, 0, 0, 0, 0, 3678 0, 0, 0, 0, 0, 0, 0, 0, 3679 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800 3680}; 3681 3682static int 3683unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2) 3684{ 3685 int len1, len2; 3686 3687 Py_UNICODE *s1 = str1->str; 3688 Py_UNICODE *s2 = str2->str; 3689 3690 len1 = str1->length; 3691 len2 = str2->length; 3692 3693 while (len1 > 0 && len2 > 0) { 3694 Py_UNICODE c1, c2; 3695 3696 c1 = *s1++; 3697 c2 = *s2++; 3698 3699 if (c1 > (1<<11) * 26) 3700 c1 += utf16Fixup[c1>>11]; 3701 if (c2 > (1<<11) * 26) 3702 c2 += utf16Fixup[c2>>11]; 3703 /* now c1 and c2 are in UTF-32-compatible order */ 3704 3705 if (c1 != c2) 3706 return (c1 < c2) ? -1 : 1; 3707 3708 len1--; len2--; 3709 } 3710 3711 return (len1 < len2) ? -1 : (len1 != len2); 3712} 3713 3714#else 3715 3716static int 3717unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2) 3718{ 3719 register int len1, len2; 3720 3721 Py_UNICODE *s1 = str1->str; 3722 Py_UNICODE *s2 = str2->str; 3723 3724 len1 = str1->length; 3725 len2 = str2->length; 3726 3727 while (len1 > 0 && len2 > 0) { 3728 Py_UNICODE c1, c2; 3729 3730 c1 = *s1++; 3731 c2 = *s2++; 3732 3733 if (c1 != c2) 3734 return (c1 < c2) ? -1 : 1; 3735 3736 len1--; len2--; 3737 } 3738 3739 return (len1 < len2) ? -1 : (len1 != len2); 3740} 3741 3742#endif 3743 3744int PyUnicode_Compare(PyObject *left, 3745 PyObject *right) 3746{ 3747 PyUnicodeObject *u = NULL, *v = NULL; 3748 int result; 3749 3750 /* Coerce the two arguments */ 3751 u = (PyUnicodeObject *)PyUnicode_FromObject(left); 3752 if (u == NULL) 3753 goto onError; 3754 v = (PyUnicodeObject *)PyUnicode_FromObject(right); 3755 if (v == NULL) 3756 goto onError; 3757 3758 /* Shortcut for empty or interned objects */ 3759 if (v == u) { 3760 Py_DECREF(u); 3761 Py_DECREF(v); 3762 return 0; 3763 } 3764 3765 result = unicode_compare(u, v); 3766 3767 Py_DECREF(u); 3768 Py_DECREF(v); 3769 return result; 3770 3771onError: 3772 Py_XDECREF(u); 3773 Py_XDECREF(v); 3774 return -1; 3775} 3776 3777int PyUnicode_Contains(PyObject *container, 3778 PyObject *element) 3779{ 3780 PyUnicodeObject *u = NULL, *v = NULL; 3781 int result; 3782 register const Py_UNICODE *p, *e; 3783 register Py_UNICODE ch; 3784 3785 /* Coerce the two arguments */ 3786 v = (PyUnicodeObject *)PyUnicode_FromObject(element); 3787 if (v == NULL) { 3788 PyErr_SetString(PyExc_TypeError, 3789 "'in <string>' requires character as left operand"); 3790 goto onError; 3791 } 3792 u = (PyUnicodeObject *)PyUnicode_FromObject(container); 3793 if (u == NULL) { 3794 Py_DECREF(v); 3795 goto onError; 3796 } 3797 3798 /* Check v in u */ 3799 if (PyUnicode_GET_SIZE(v) != 1) { 3800 PyErr_SetString(PyExc_TypeError, 3801 "'in <string>' requires character as left operand"); 3802 goto onError; 3803 } 3804 ch = *PyUnicode_AS_UNICODE(v); 3805 p = PyUnicode_AS_UNICODE(u); 3806 e = p + PyUnicode_GET_SIZE(u); 3807 result = 0; 3808 while (p < e) { 3809 if (*p++ == ch) { 3810 result = 1; 3811 break; 3812 } 3813 } 3814 3815 Py_DECREF(u); 3816 Py_DECREF(v); 3817 return result; 3818 3819onError: 3820 Py_XDECREF(u); 3821 Py_XDECREF(v); 3822 return -1; 3823} 3824 3825/* Concat to string or Unicode object giving a new Unicode object. */ 3826 3827PyObject *PyUnicode_Concat(PyObject *left, 3828 PyObject *right) 3829{ 3830 PyUnicodeObject *u = NULL, *v = NULL, *w; 3831 3832 /* Coerce the two arguments */ 3833 u = (PyUnicodeObject *)PyUnicode_FromObject(left); 3834 if (u == NULL) 3835 goto onError; 3836 v = (PyUnicodeObject *)PyUnicode_FromObject(right); 3837 if (v == NULL) 3838 goto onError; 3839 3840 /* Shortcuts */ 3841 if (v == unicode_empty) { 3842 Py_DECREF(v); 3843 return (PyObject *)u; 3844 } 3845 if (u == unicode_empty) { 3846 Py_DECREF(u); 3847 return (PyObject *)v; 3848 } 3849 3850 /* Concat the two Unicode strings */ 3851 w = _PyUnicode_New(u->length + v->length); 3852 if (w == NULL) 3853 goto onError; 3854 Py_UNICODE_COPY(w->str, u->str, u->length); 3855 Py_UNICODE_COPY(w->str + u->length, v->str, v->length); 3856 3857 Py_DECREF(u); 3858 Py_DECREF(v); 3859 return (PyObject *)w; 3860 3861onError: 3862 Py_XDECREF(u); 3863 Py_XDECREF(v); 3864 return NULL; 3865} 3866 3867static char count__doc__[] = 3868"S.count(sub[, start[, end]]) -> int\n\ 3869\n\ 3870Return the number of occurrences of substring sub in Unicode string\n\ 3871S[start:end]. Optional arguments start and end are\n\ 3872interpreted as in slice notation."; 3873 3874static PyObject * 3875unicode_count(PyUnicodeObject *self, PyObject *args) 3876{ 3877 PyUnicodeObject *substring; 3878 int start = 0; 3879 int end = INT_MAX; 3880 PyObject *result; 3881 3882 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring, 3883 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end)) 3884 return NULL; 3885 3886 substring = (PyUnicodeObject *)PyUnicode_FromObject( 3887 (PyObject *)substring); 3888 if (substring == NULL) 3889 return NULL; 3890 3891 if (start < 0) 3892 start += self->length; 3893 if (start < 0) 3894 start = 0; 3895 if (end > self->length) 3896 end = self->length; 3897 if (end < 0) 3898 end += self->length; 3899 if (end < 0) 3900 end = 0; 3901 3902 result = PyInt_FromLong((long) count(self, start, end, substring)); 3903 3904 Py_DECREF(substring); 3905 return result; 3906} 3907 3908static char encode__doc__[] = 3909"S.encode([encoding[,errors]]) -> string\n\ 3910\n\ 3911Return an encoded string version of S. Default encoding is the current\n\ 3912default string encoding. errors may be given to set a different error\n\ 3913handling scheme. Default is 'strict' meaning that encoding errors raise\n\ 3914a ValueError. Other possible values are 'ignore' and 'replace'."; 3915 3916static PyObject * 3917unicode_encode(PyUnicodeObject *self, PyObject *args) 3918{ 3919 char *encoding = NULL; 3920 char *errors = NULL; 3921 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors)) 3922 return NULL; 3923 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors); 3924} 3925 3926static char expandtabs__doc__[] = 3927"S.expandtabs([tabsize]) -> unicode\n\ 3928\n\ 3929Return a copy of S where all tab characters are expanded using spaces.\n\ 3930If tabsize is not given, a tab size of 8 characters is assumed."; 3931 3932static PyObject* 3933unicode_expandtabs(PyUnicodeObject *self, PyObject *args) 3934{ 3935 Py_UNICODE *e; 3936 Py_UNICODE *p; 3937 Py_UNICODE *q; 3938 int i, j; 3939 PyUnicodeObject *u; 3940 int tabsize = 8; 3941 3942 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize)) 3943 return NULL; 3944 3945 /* First pass: determine size of output string */ 3946 i = j = 0; 3947 e = self->str + self->length; 3948 for (p = self->str; p < e; p++) 3949 if (*p == '\t') { 3950 if (tabsize > 0) 3951 j += tabsize - (j % tabsize); 3952 } 3953 else { 3954 j++; 3955 if (*p == '\n' || *p == '\r') { 3956 i += j; 3957 j = 0; 3958 } 3959 } 3960 3961 /* Second pass: create output string and fill it */ 3962 u = _PyUnicode_New(i + j); 3963 if (!u) 3964 return NULL; 3965 3966 j = 0; 3967 q = u->str; 3968 3969 for (p = self->str; p < e; p++) 3970 if (*p == '\t') { 3971 if (tabsize > 0) { 3972 i = tabsize - (j % tabsize); 3973 j += i; 3974 while (i--) 3975 *q++ = ' '; 3976 } 3977 } 3978 else { 3979 j++; 3980 *q++ = *p; 3981 if (*p == '\n' || *p == '\r') 3982 j = 0; 3983 } 3984 3985 return (PyObject*) u; 3986} 3987 3988static char find__doc__[] = 3989"S.find(sub [,start [,end]]) -> int\n\ 3990\n\ 3991Return the lowest index in S where substring sub is found,\n\ 3992such that sub is contained within s[start,end]. Optional\n\ 3993arguments start and end are interpreted as in slice notation.\n\ 3994\n\ 3995Return -1 on failure."; 3996 3997static PyObject * 3998unicode_find(PyUnicodeObject *self, PyObject *args) 3999{ 4000 PyUnicodeObject *substring; 4001 int start = 0; 4002 int end = INT_MAX; 4003 PyObject *result; 4004 4005 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring, 4006 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end)) 4007 return NULL; 4008 substring = (PyUnicodeObject *)PyUnicode_FromObject( 4009 (PyObject *)substring); 4010 if (substring == NULL) 4011 return NULL; 4012 4013 result = PyInt_FromLong(findstring(self, substring, start, end, 1)); 4014 4015 Py_DECREF(substring); 4016 return result; 4017} 4018 4019static PyObject * 4020unicode_getitem(PyUnicodeObject *self, int index) 4021{ 4022 if (index < 0 || index >= self->length) { 4023 PyErr_SetString(PyExc_IndexError, "string index out of range"); 4024 return NULL; 4025 } 4026 4027 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1); 4028} 4029 4030static long 4031unicode_hash(PyUnicodeObject *self) 4032{ 4033 /* Since Unicode objects compare equal to their ASCII string 4034 counterparts, they should use the individual character values 4035 as basis for their hash value. This is needed to assure that 4036 strings and Unicode objects behave in the same way as 4037 dictionary keys. */ 4038 4039 register int len; 4040 register Py_UNICODE *p; 4041 register long x; 4042 4043 if (self->hash != -1) 4044 return self->hash; 4045 len = PyUnicode_GET_SIZE(self); 4046 p = PyUnicode_AS_UNICODE(self); 4047 x = *p << 7; 4048 while (--len >= 0) 4049 x = (1000003*x) ^ *p++; 4050 x ^= PyUnicode_GET_SIZE(self); 4051 if (x == -1) 4052 x = -2; 4053 self->hash = x; 4054 return x; 4055} 4056 4057static char index__doc__[] = 4058"S.index(sub [,start [,end]]) -> int\n\ 4059\n\ 4060Like S.find() but raise ValueError when the substring is not found."; 4061 4062static PyObject * 4063unicode_index(PyUnicodeObject *self, PyObject *args) 4064{ 4065 int result; 4066 PyUnicodeObject *substring; 4067 int start = 0; 4068 int end = INT_MAX; 4069 4070 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring, 4071 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end)) 4072 return NULL; 4073 4074 substring = (PyUnicodeObject *)PyUnicode_FromObject( 4075 (PyObject *)substring); 4076 if (substring == NULL) 4077 return NULL; 4078 4079 result = findstring(self, substring, start, end, 1); 4080 4081 Py_DECREF(substring); 4082 if (result < 0) { 4083 PyErr_SetString(PyExc_ValueError, "substring not found"); 4084 return NULL; 4085 } 4086 return PyInt_FromLong(result); 4087} 4088 4089static char islower__doc__[] = 4090"S.islower() -> int\n\ 4091\n\ 4092Return 1 if all cased characters in S are lowercase and there is\n\ 4093at least one cased character in S, 0 otherwise."; 4094 4095static PyObject* 4096unicode_islower(PyUnicodeObject *self) 4097{ 4098 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 4099 register const Py_UNICODE *e; 4100 int cased; 4101 4102 /* Shortcut for single character strings */ 4103 if (PyUnicode_GET_SIZE(self) == 1) 4104 return PyInt_FromLong(Py_UNICODE_ISLOWER(*p) != 0); 4105 4106 /* Special case for empty strings */ 4107 if (PyString_GET_SIZE(self) == 0) 4108 return PyInt_FromLong(0); 4109 4110 e = p + PyUnicode_GET_SIZE(self); 4111 cased = 0; 4112 for (; p < e; p++) { 4113 register const Py_UNICODE ch = *p; 4114 4115 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) 4116 return PyInt_FromLong(0); 4117 else if (!cased && Py_UNICODE_ISLOWER(ch)) 4118 cased = 1; 4119 } 4120 return PyInt_FromLong(cased); 4121} 4122 4123static char isupper__doc__[] = 4124"S.isupper() -> int\n\ 4125\n\ 4126Return 1 if all cased characters in S are uppercase and there is\n\ 4127at least one cased character in S, 0 otherwise."; 4128 4129static PyObject* 4130unicode_isupper(PyUnicodeObject *self) 4131{ 4132 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 4133 register const Py_UNICODE *e; 4134 int cased; 4135 4136 /* Shortcut for single character strings */ 4137 if (PyUnicode_GET_SIZE(self) == 1) 4138 return PyInt_FromLong(Py_UNICODE_ISUPPER(*p) != 0); 4139 4140 /* Special case for empty strings */ 4141 if (PyString_GET_SIZE(self) == 0) 4142 return PyInt_FromLong(0); 4143 4144 e = p + PyUnicode_GET_SIZE(self); 4145 cased = 0; 4146 for (; p < e; p++) { 4147 register const Py_UNICODE ch = *p; 4148 4149 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch)) 4150 return PyInt_FromLong(0); 4151 else if (!cased && Py_UNICODE_ISUPPER(ch)) 4152 cased = 1; 4153 } 4154 return PyInt_FromLong(cased); 4155} 4156 4157static char istitle__doc__[] = 4158"S.istitle() -> int\n\ 4159\n\ 4160Return 1 if S is a titlecased string, i.e. upper- and titlecase characters\n\ 4161may only follow uncased characters and lowercase characters only cased\n\ 4162ones. Return 0 otherwise."; 4163 4164static PyObject* 4165unicode_istitle(PyUnicodeObject *self) 4166{ 4167 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 4168 register const Py_UNICODE *e; 4169 int cased, previous_is_cased; 4170 4171 /* Shortcut for single character strings */ 4172 if (PyUnicode_GET_SIZE(self) == 1) 4173 return PyInt_FromLong((Py_UNICODE_ISTITLE(*p) != 0) || 4174 (Py_UNICODE_ISUPPER(*p) != 0)); 4175 4176 /* Special case for empty strings */ 4177 if (PyString_GET_SIZE(self) == 0) 4178 return PyInt_FromLong(0); 4179 4180 e = p + PyUnicode_GET_SIZE(self); 4181 cased = 0; 4182 previous_is_cased = 0; 4183 for (; p < e; p++) { 4184 register const Py_UNICODE ch = *p; 4185 4186 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) { 4187 if (previous_is_cased) 4188 return PyInt_FromLong(0); 4189 previous_is_cased = 1; 4190 cased = 1; 4191 } 4192 else if (Py_UNICODE_ISLOWER(ch)) { 4193 if (!previous_is_cased) 4194 return PyInt_FromLong(0); 4195 previous_is_cased = 1; 4196 cased = 1; 4197 } 4198 else 4199 previous_is_cased = 0; 4200 } 4201 return PyInt_FromLong(cased); 4202} 4203 4204static char isspace__doc__[] = 4205"S.isspace() -> int\n\ 4206\n\ 4207Return 1 if there are only whitespace characters in S,\n\ 42080 otherwise."; 4209 4210static PyObject* 4211unicode_isspace(PyUnicodeObject *self) 4212{ 4213 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 4214 register const Py_UNICODE *e; 4215 4216 /* Shortcut for single character strings */ 4217 if (PyUnicode_GET_SIZE(self) == 1 && 4218 Py_UNICODE_ISSPACE(*p)) 4219 return PyInt_FromLong(1); 4220 4221 /* Special case for empty strings */ 4222 if (PyString_GET_SIZE(self) == 0) 4223 return PyInt_FromLong(0); 4224 4225 e = p + PyUnicode_GET_SIZE(self); 4226 for (; p < e; p++) { 4227 if (!Py_UNICODE_ISSPACE(*p)) 4228 return PyInt_FromLong(0); 4229 } 4230 return PyInt_FromLong(1); 4231} 4232 4233static char isalpha__doc__[] = 4234"S.isalpha() -> int\n\ 4235\n\ 4236Return 1 if all characters in S are alphabetic\n\ 4237and there is at least one character in S, 0 otherwise."; 4238 4239static PyObject* 4240unicode_isalpha(PyUnicodeObject *self) 4241{ 4242 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 4243 register const Py_UNICODE *e; 4244 4245 /* Shortcut for single character strings */ 4246 if (PyUnicode_GET_SIZE(self) == 1 && 4247 Py_UNICODE_ISALPHA(*p)) 4248 return PyInt_FromLong(1); 4249 4250 /* Special case for empty strings */ 4251 if (PyString_GET_SIZE(self) == 0) 4252 return PyInt_FromLong(0); 4253 4254 e = p + PyUnicode_GET_SIZE(self); 4255 for (; p < e; p++) { 4256 if (!Py_UNICODE_ISALPHA(*p)) 4257 return PyInt_FromLong(0); 4258 } 4259 return PyInt_FromLong(1); 4260} 4261 4262static char isalnum__doc__[] = 4263"S.isalnum() -> int\n\ 4264\n\ 4265Return 1 if all characters in S are alphanumeric\n\ 4266and there is at least one character in S, 0 otherwise."; 4267 4268static PyObject* 4269unicode_isalnum(PyUnicodeObject *self) 4270{ 4271 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 4272 register const Py_UNICODE *e; 4273 4274 /* Shortcut for single character strings */ 4275 if (PyUnicode_GET_SIZE(self) == 1 && 4276 Py_UNICODE_ISALNUM(*p)) 4277 return PyInt_FromLong(1); 4278 4279 /* Special case for empty strings */ 4280 if (PyString_GET_SIZE(self) == 0) 4281 return PyInt_FromLong(0); 4282 4283 e = p + PyUnicode_GET_SIZE(self); 4284 for (; p < e; p++) { 4285 if (!Py_UNICODE_ISALNUM(*p)) 4286 return PyInt_FromLong(0); 4287 } 4288 return PyInt_FromLong(1); 4289} 4290 4291static char isdecimal__doc__[] = 4292"S.isdecimal() -> int\n\ 4293\n\ 4294Return 1 if there are only decimal characters in S,\n\ 42950 otherwise."; 4296 4297static PyObject* 4298unicode_isdecimal(PyUnicodeObject *self) 4299{ 4300 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 4301 register const Py_UNICODE *e; 4302 4303 /* Shortcut for single character strings */ 4304 if (PyUnicode_GET_SIZE(self) == 1 && 4305 Py_UNICODE_ISDECIMAL(*p)) 4306 return PyInt_FromLong(1); 4307 4308 /* Special case for empty strings */ 4309 if (PyString_GET_SIZE(self) == 0) 4310 return PyInt_FromLong(0); 4311 4312 e = p + PyUnicode_GET_SIZE(self); 4313 for (; p < e; p++) { 4314 if (!Py_UNICODE_ISDECIMAL(*p)) 4315 return PyInt_FromLong(0); 4316 } 4317 return PyInt_FromLong(1); 4318} 4319 4320static char isdigit__doc__[] = 4321"S.isdigit() -> int\n\ 4322\n\ 4323Return 1 if there are only digit characters in S,\n\ 43240 otherwise."; 4325 4326static PyObject* 4327unicode_isdigit(PyUnicodeObject *self) 4328{ 4329 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 4330 register const Py_UNICODE *e; 4331 4332 /* Shortcut for single character strings */ 4333 if (PyUnicode_GET_SIZE(self) == 1 && 4334 Py_UNICODE_ISDIGIT(*p)) 4335 return PyInt_FromLong(1); 4336 4337 /* Special case for empty strings */ 4338 if (PyString_GET_SIZE(self) == 0) 4339 return PyInt_FromLong(0); 4340 4341 e = p + PyUnicode_GET_SIZE(self); 4342 for (; p < e; p++) { 4343 if (!Py_UNICODE_ISDIGIT(*p)) 4344 return PyInt_FromLong(0); 4345 } 4346 return PyInt_FromLong(1); 4347} 4348 4349static char isnumeric__doc__[] = 4350"S.isnumeric() -> int\n\ 4351\n\ 4352Return 1 if there are only numeric characters in S,\n\ 43530 otherwise."; 4354 4355static PyObject* 4356unicode_isnumeric(PyUnicodeObject *self) 4357{ 4358 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 4359 register const Py_UNICODE *e; 4360 4361 /* Shortcut for single character strings */ 4362 if (PyUnicode_GET_SIZE(self) == 1 && 4363 Py_UNICODE_ISNUMERIC(*p)) 4364 return PyInt_FromLong(1); 4365 4366 /* Special case for empty strings */ 4367 if (PyString_GET_SIZE(self) == 0) 4368 return PyInt_FromLong(0); 4369 4370 e = p + PyUnicode_GET_SIZE(self); 4371 for (; p < e; p++) { 4372 if (!Py_UNICODE_ISNUMERIC(*p)) 4373 return PyInt_FromLong(0); 4374 } 4375 return PyInt_FromLong(1); 4376} 4377 4378static char join__doc__[] = 4379"S.join(sequence) -> unicode\n\ 4380\n\ 4381Return a string which is the concatenation of the strings in the\n\ 4382sequence. The separator between elements is S."; 4383 4384static PyObject* 4385unicode_join(PyObject *self, PyObject *data) 4386{ 4387 return PyUnicode_Join(self, data); 4388} 4389 4390static int 4391unicode_length(PyUnicodeObject *self) 4392{ 4393 return self->length; 4394} 4395 4396static char ljust__doc__[] = 4397"S.ljust(width) -> unicode\n\ 4398\n\ 4399Return S left justified in a Unicode string of length width. Padding is\n\ 4400done using spaces."; 4401 4402static PyObject * 4403unicode_ljust(PyUnicodeObject *self, PyObject *args) 4404{ 4405 int width; 4406 if (!PyArg_ParseTuple(args, "i:ljust", &width)) 4407 return NULL; 4408 4409 if (self->length >= width && PyUnicode_CheckExact(self)) { 4410 Py_INCREF(self); 4411 return (PyObject*) self; 4412 } 4413 4414 return (PyObject*) pad(self, 0, width - self->length, ' '); 4415} 4416 4417static char lower__doc__[] = 4418"S.lower() -> unicode\n\ 4419\n\ 4420Return a copy of the string S converted to lowercase."; 4421 4422static PyObject* 4423unicode_lower(PyUnicodeObject *self) 4424{ 4425 return fixup(self, fixlower); 4426} 4427 4428static char lstrip__doc__[] = 4429"S.lstrip() -> unicode\n\ 4430\n\ 4431Return a copy of the string S with leading whitespace removed."; 4432 4433static PyObject * 4434unicode_lstrip(PyUnicodeObject *self) 4435{ 4436 return strip(self, 1, 0); 4437} 4438 4439static PyObject* 4440unicode_repeat(PyUnicodeObject *str, int len) 4441{ 4442 PyUnicodeObject *u; 4443 Py_UNICODE *p; 4444 int nchars; 4445 size_t nbytes; 4446 4447 if (len < 0) 4448 len = 0; 4449 4450 if (len == 1 && PyUnicode_CheckExact(str)) { 4451 /* no repeat, return original string */ 4452 Py_INCREF(str); 4453 return (PyObject*) str; 4454 } 4455 4456 /* ensure # of chars needed doesn't overflow int and # of bytes 4457 * needed doesn't overflow size_t 4458 */ 4459 nchars = len * str->length; 4460 if (len && nchars / len != str->length) { 4461 PyErr_SetString(PyExc_OverflowError, 4462 "repeated string is too long"); 4463 return NULL; 4464 } 4465 nbytes = (nchars + 1) * sizeof(Py_UNICODE); 4466 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) { 4467 PyErr_SetString(PyExc_OverflowError, 4468 "repeated string is too long"); 4469 return NULL; 4470 } 4471 u = _PyUnicode_New(nchars); 4472 if (!u) 4473 return NULL; 4474 4475 p = u->str; 4476 4477 while (len-- > 0) { 4478 Py_UNICODE_COPY(p, str->str, str->length); 4479 p += str->length; 4480 } 4481 4482 return (PyObject*) u; 4483} 4484 4485PyObject *PyUnicode_Replace(PyObject *obj, 4486 PyObject *subobj, 4487 PyObject *replobj, 4488 int maxcount) 4489{ 4490 PyObject *self; 4491 PyObject *str1; 4492 PyObject *str2; 4493 PyObject *result; 4494 4495 self = PyUnicode_FromObject(obj); 4496 if (self == NULL) 4497 return NULL; 4498 str1 = PyUnicode_FromObject(subobj); 4499 if (str1 == NULL) { 4500 Py_DECREF(self); 4501 return NULL; 4502 } 4503 str2 = PyUnicode_FromObject(replobj); 4504 if (str2 == NULL) { 4505 Py_DECREF(self); 4506 Py_DECREF(str1); 4507 return NULL; 4508 } 4509 result = replace((PyUnicodeObject *)self, 4510 (PyUnicodeObject *)str1, 4511 (PyUnicodeObject *)str2, 4512 maxcount); 4513 Py_DECREF(self); 4514 Py_DECREF(str1); 4515 Py_DECREF(str2); 4516 return result; 4517} 4518 4519static char replace__doc__[] = 4520"S.replace (old, new[, maxsplit]) -> unicode\n\ 4521\n\ 4522Return a copy of S with all occurrences of substring\n\ 4523old replaced by new. If the optional argument maxsplit is\n\ 4524given, only the first maxsplit occurrences are replaced."; 4525 4526static PyObject* 4527unicode_replace(PyUnicodeObject *self, PyObject *args) 4528{ 4529 PyUnicodeObject *str1; 4530 PyUnicodeObject *str2; 4531 int maxcount = -1; 4532 PyObject *result; 4533 4534 if (!PyArg_ParseTuple(args, "OO|i:replace", &str1, &str2, &maxcount)) 4535 return NULL; 4536 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1); 4537 if (str1 == NULL) 4538 return NULL; 4539 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2); 4540 if (str2 == NULL) 4541 return NULL; 4542 4543 result = replace(self, str1, str2, maxcount); 4544 4545 Py_DECREF(str1); 4546 Py_DECREF(str2); 4547 return result; 4548} 4549 4550static 4551PyObject *unicode_repr(PyObject *unicode) 4552{ 4553 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode), 4554 PyUnicode_GET_SIZE(unicode), 4555 1); 4556} 4557 4558static char rfind__doc__[] = 4559"S.rfind(sub [,start [,end]]) -> int\n\ 4560\n\ 4561Return the highest index in S where substring sub is found,\n\ 4562such that sub is contained within s[start,end]. Optional\n\ 4563arguments start and end are interpreted as in slice notation.\n\ 4564\n\ 4565Return -1 on failure."; 4566 4567static PyObject * 4568unicode_rfind(PyUnicodeObject *self, PyObject *args) 4569{ 4570 PyUnicodeObject *substring; 4571 int start = 0; 4572 int end = INT_MAX; 4573 PyObject *result; 4574 4575 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring, 4576 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end)) 4577 return NULL; 4578 substring = (PyUnicodeObject *)PyUnicode_FromObject( 4579 (PyObject *)substring); 4580 if (substring == NULL) 4581 return NULL; 4582 4583 result = PyInt_FromLong(findstring(self, substring, start, end, -1)); 4584 4585 Py_DECREF(substring); 4586 return result; 4587} 4588 4589static char rindex__doc__[] = 4590"S.rindex(sub [,start [,end]]) -> int\n\ 4591\n\ 4592Like S.rfind() but raise ValueError when the substring is not found."; 4593 4594static PyObject * 4595unicode_rindex(PyUnicodeObject *self, PyObject *args) 4596{ 4597 int result; 4598 PyUnicodeObject *substring; 4599 int start = 0; 4600 int end = INT_MAX; 4601 4602 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring, 4603 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end)) 4604 return NULL; 4605 substring = (PyUnicodeObject *)PyUnicode_FromObject( 4606 (PyObject *)substring); 4607 if (substring == NULL) 4608 return NULL; 4609 4610 result = findstring(self, substring, start, end, -1); 4611 4612 Py_DECREF(substring); 4613 if (result < 0) { 4614 PyErr_SetString(PyExc_ValueError, "substring not found"); 4615 return NULL; 4616 } 4617 return PyInt_FromLong(result); 4618} 4619 4620static char rjust__doc__[] = 4621"S.rjust(width) -> unicode\n\ 4622\n\ 4623Return S right justified in a Unicode string of length width. Padding is\n\ 4624done using spaces."; 4625 4626static PyObject * 4627unicode_rjust(PyUnicodeObject *self, PyObject *args) 4628{ 4629 int width; 4630 if (!PyArg_ParseTuple(args, "i:rjust", &width)) 4631 return NULL; 4632 4633 if (self->length >= width && PyUnicode_CheckExact(self)) { 4634 Py_INCREF(self); 4635 return (PyObject*) self; 4636 } 4637 4638 return (PyObject*) pad(self, width - self->length, 0, ' '); 4639} 4640 4641static char rstrip__doc__[] = 4642"S.rstrip() -> unicode\n\ 4643\n\ 4644Return a copy of the string S with trailing whitespace removed."; 4645 4646static PyObject * 4647unicode_rstrip(PyUnicodeObject *self) 4648{ 4649 return strip(self, 0, 1); 4650} 4651 4652static PyObject* 4653unicode_slice(PyUnicodeObject *self, int start, int end) 4654{ 4655 /* standard clamping */ 4656 if (start < 0) 4657 start = 0; 4658 if (end < 0) 4659 end = 0; 4660 if (end > self->length) 4661 end = self->length; 4662 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) { 4663 /* full slice, return original string */ 4664 Py_INCREF(self); 4665 return (PyObject*) self; 4666 } 4667 if (start > end) 4668 start = end; 4669 /* copy slice */ 4670 return (PyObject*) PyUnicode_FromUnicode(self->str + start, 4671 end - start); 4672} 4673 4674PyObject *PyUnicode_Split(PyObject *s, 4675 PyObject *sep, 4676 int maxsplit) 4677{ 4678 PyObject *result; 4679 4680 s = PyUnicode_FromObject(s); 4681 if (s == NULL) 4682 return NULL; 4683 if (sep != NULL) { 4684 sep = PyUnicode_FromObject(sep); 4685 if (sep == NULL) { 4686 Py_DECREF(s); 4687 return NULL; 4688 } 4689 } 4690 4691 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit); 4692 4693 Py_DECREF(s); 4694 Py_XDECREF(sep); 4695 return result; 4696} 4697 4698static char split__doc__[] = 4699"S.split([sep [,maxsplit]]) -> list of strings\n\ 4700\n\ 4701Return a list of the words in S, using sep as the\n\ 4702delimiter string. If maxsplit is given, at most maxsplit\n\ 4703splits are done. If sep is not specified, any whitespace string\n\ 4704is a separator."; 4705 4706static PyObject* 4707unicode_split(PyUnicodeObject *self, PyObject *args) 4708{ 4709 PyObject *substring = Py_None; 4710 int maxcount = -1; 4711 4712 if (!PyArg_ParseTuple(args, "|Oi:split", &substring, &maxcount)) 4713 return NULL; 4714 4715 if (substring == Py_None) 4716 return split(self, NULL, maxcount); 4717 else if (PyUnicode_Check(substring)) 4718 return split(self, (PyUnicodeObject *)substring, maxcount); 4719 else 4720 return PyUnicode_Split((PyObject *)self, substring, maxcount); 4721} 4722 4723static char splitlines__doc__[] = 4724"S.splitlines([keepends]]) -> list of strings\n\ 4725\n\ 4726Return a list of the lines in S, breaking at line boundaries.\n\ 4727Line breaks are not included in the resulting list unless keepends\n\ 4728is given and true."; 4729 4730static PyObject* 4731unicode_splitlines(PyUnicodeObject *self, PyObject *args) 4732{ 4733 int keepends = 0; 4734 4735 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends)) 4736 return NULL; 4737 4738 return PyUnicode_Splitlines((PyObject *)self, keepends); 4739} 4740 4741static 4742PyObject *unicode_str(PyUnicodeObject *self) 4743{ 4744 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL); 4745} 4746 4747static char strip__doc__[] = 4748"S.strip() -> unicode\n\ 4749\n\ 4750Return a copy of S with leading and trailing whitespace removed."; 4751 4752static PyObject * 4753unicode_strip(PyUnicodeObject *self) 4754{ 4755 return strip(self, 1, 1); 4756} 4757 4758static char swapcase__doc__[] = 4759"S.swapcase() -> unicode\n\ 4760\n\ 4761Return a copy of S with uppercase characters converted to lowercase\n\ 4762and vice versa."; 4763 4764static PyObject* 4765unicode_swapcase(PyUnicodeObject *self) 4766{ 4767 return fixup(self, fixswapcase); 4768} 4769 4770static char translate__doc__[] = 4771"S.translate(table) -> unicode\n\ 4772\n\ 4773Return a copy of the string S, where all characters have been mapped\n\ 4774through the given translation table, which must be a mapping of\n\ 4775Unicode ordinals to Unicode ordinals or None. Unmapped characters\n\ 4776are left untouched. Characters mapped to None are deleted."; 4777 4778static PyObject* 4779unicode_translate(PyUnicodeObject *self, PyObject *table) 4780{ 4781 return PyUnicode_TranslateCharmap(self->str, 4782 self->length, 4783 table, 4784 "ignore"); 4785} 4786 4787static char upper__doc__[] = 4788"S.upper() -> unicode\n\ 4789\n\ 4790Return a copy of S converted to uppercase."; 4791 4792static PyObject* 4793unicode_upper(PyUnicodeObject *self) 4794{ 4795 return fixup(self, fixupper); 4796} 4797 4798#if 0 4799static char zfill__doc__[] = 4800"S.zfill(width) -> unicode\n\ 4801\n\ 4802Pad a numeric string x with zeros on the left, to fill a field\n\ 4803of the specified width. The string x is never truncated."; 4804 4805static PyObject * 4806unicode_zfill(PyUnicodeObject *self, PyObject *args) 4807{ 4808 int fill; 4809 PyUnicodeObject *u; 4810 4811 int width; 4812 if (!PyArg_ParseTuple(args, "i:zfill", &width)) 4813 return NULL; 4814 4815 if (self->length >= width) { 4816 Py_INCREF(self); 4817 return (PyObject*) self; 4818 } 4819 4820 fill = width - self->length; 4821 4822 u = pad(self, fill, 0, '0'); 4823 4824 if (u->str[fill] == '+' || u->str[fill] == '-') { 4825 /* move sign to beginning of string */ 4826 u->str[0] = u->str[fill]; 4827 u->str[fill] = '0'; 4828 } 4829 4830 return (PyObject*) u; 4831} 4832#endif 4833 4834#if 0 4835static PyObject* 4836unicode_freelistsize(PyUnicodeObject *self) 4837{ 4838 return PyInt_FromLong(unicode_freelist_size); 4839} 4840#endif 4841 4842static char startswith__doc__[] = 4843"S.startswith(prefix[, start[, end]]) -> int\n\ 4844\n\ 4845Return 1 if S starts with the specified prefix, otherwise return 0. With\n\ 4846optional start, test S beginning at that position. With optional end, stop\n\ 4847comparing S at that position."; 4848 4849static PyObject * 4850unicode_startswith(PyUnicodeObject *self, 4851 PyObject *args) 4852{ 4853 PyUnicodeObject *substring; 4854 int start = 0; 4855 int end = INT_MAX; 4856 PyObject *result; 4857 4858 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &substring, 4859 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end)) 4860 return NULL; 4861 substring = (PyUnicodeObject *)PyUnicode_FromObject( 4862 (PyObject *)substring); 4863 if (substring == NULL) 4864 return NULL; 4865 4866 result = PyInt_FromLong(tailmatch(self, substring, start, end, -1)); 4867 4868 Py_DECREF(substring); 4869 return result; 4870} 4871 4872 4873static char endswith__doc__[] = 4874"S.endswith(suffix[, start[, end]]) -> int\n\ 4875\n\ 4876Return 1 if S ends with the specified suffix, otherwise return 0. With\n\ 4877optional start, test S beginning at that position. With optional end, stop\n\ 4878comparing S at that position."; 4879 4880static PyObject * 4881unicode_endswith(PyUnicodeObject *self, 4882 PyObject *args) 4883{ 4884 PyUnicodeObject *substring; 4885 int start = 0; 4886 int end = INT_MAX; 4887 PyObject *result; 4888 4889 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &substring, 4890 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end)) 4891 return NULL; 4892 substring = (PyUnicodeObject *)PyUnicode_FromObject( 4893 (PyObject *)substring); 4894 if (substring == NULL) 4895 return NULL; 4896 4897 result = PyInt_FromLong(tailmatch(self, substring, start, end, +1)); 4898 4899 Py_DECREF(substring); 4900 return result; 4901} 4902 4903 4904static PyMethodDef unicode_methods[] = { 4905 4906 /* Order is according to common usage: often used methods should 4907 appear first, since lookup is done sequentially. */ 4908 4909 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__}, 4910 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__}, 4911 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__}, 4912 {"join", (PyCFunction) unicode_join, METH_O, join__doc__}, 4913 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__}, 4914 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__}, 4915 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__}, 4916 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__}, 4917 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__}, 4918 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__}, 4919 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__}, 4920 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__}, 4921 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__}, 4922 {"lstrip", (PyCFunction) unicode_lstrip, METH_NOARGS, lstrip__doc__}, 4923/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */ 4924 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__}, 4925 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__}, 4926 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__}, 4927 {"rstrip", (PyCFunction) unicode_rstrip, METH_NOARGS, rstrip__doc__}, 4928 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__}, 4929 {"strip", (PyCFunction) unicode_strip, METH_NOARGS, strip__doc__}, 4930 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__}, 4931 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__}, 4932 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__}, 4933 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__}, 4934 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__}, 4935 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__}, 4936 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__}, 4937 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__}, 4938 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__}, 4939 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__}, 4940 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__}, 4941 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__}, 4942 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__}, 4943 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__}, 4944#if 0 4945 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__}, 4946 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__}, 4947#endif 4948 4949#if 0 4950 /* This one is just used for debugging the implementation. */ 4951 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS}, 4952#endif 4953 4954 {NULL, NULL} 4955}; 4956 4957static PySequenceMethods unicode_as_sequence = { 4958 (inquiry) unicode_length, /* sq_length */ 4959 (binaryfunc) PyUnicode_Concat, /* sq_concat */ 4960 (intargfunc) unicode_repeat, /* sq_repeat */ 4961 (intargfunc) unicode_getitem, /* sq_item */ 4962 (intintargfunc) unicode_slice, /* sq_slice */ 4963 0, /* sq_ass_item */ 4964 0, /* sq_ass_slice */ 4965 (objobjproc)PyUnicode_Contains, /*sq_contains*/ 4966}; 4967 4968static int 4969unicode_buffer_getreadbuf(PyUnicodeObject *self, 4970 int index, 4971 const void **ptr) 4972{ 4973 if (index != 0) { 4974 PyErr_SetString(PyExc_SystemError, 4975 "accessing non-existent unicode segment"); 4976 return -1; 4977 } 4978 *ptr = (void *) self->str; 4979 return PyUnicode_GET_DATA_SIZE(self); 4980} 4981 4982static int 4983unicode_buffer_getwritebuf(PyUnicodeObject *self, int index, 4984 const void **ptr) 4985{ 4986 PyErr_SetString(PyExc_TypeError, 4987 "cannot use unicode as modifyable buffer"); 4988 return -1; 4989} 4990 4991static int 4992unicode_buffer_getsegcount(PyUnicodeObject *self, 4993 int *lenp) 4994{ 4995 if (lenp) 4996 *lenp = PyUnicode_GET_DATA_SIZE(self); 4997 return 1; 4998} 4999 5000static int 5001unicode_buffer_getcharbuf(PyUnicodeObject *self, 5002 int index, 5003 const void **ptr) 5004{ 5005 PyObject *str; 5006 5007 if (index != 0) { 5008 PyErr_SetString(PyExc_SystemError, 5009 "accessing non-existent unicode segment"); 5010 return -1; 5011 } 5012 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL); 5013 if (str == NULL) 5014 return -1; 5015 *ptr = (void *) PyString_AS_STRING(str); 5016 return PyString_GET_SIZE(str); 5017} 5018 5019/* Helpers for PyUnicode_Format() */ 5020 5021static PyObject * 5022getnextarg(PyObject *args, int arglen, int *p_argidx) 5023{ 5024 int argidx = *p_argidx; 5025 if (argidx < arglen) { 5026 (*p_argidx)++; 5027 if (arglen < 0) 5028 return args; 5029 else 5030 return PyTuple_GetItem(args, argidx); 5031 } 5032 PyErr_SetString(PyExc_TypeError, 5033 "not enough arguments for format string"); 5034 return NULL; 5035} 5036 5037#define F_LJUST (1<<0) 5038#define F_SIGN (1<<1) 5039#define F_BLANK (1<<2) 5040#define F_ALT (1<<3) 5041#define F_ZERO (1<<4) 5042 5043static 5044int usprintf(register Py_UNICODE *buffer, char *format, ...) 5045{ 5046 register int i; 5047 int len; 5048 va_list va; 5049 char *charbuffer; 5050 va_start(va, format); 5051 5052 /* First, format the string as char array, then expand to Py_UNICODE 5053 array. */ 5054 charbuffer = (char *)buffer; 5055 len = vsprintf(charbuffer, format, va); 5056 for (i = len - 1; i >= 0; i--) 5057 buffer[i] = (Py_UNICODE) charbuffer[i]; 5058 5059 va_end(va); 5060 return len; 5061} 5062 5063static int 5064formatfloat(Py_UNICODE *buf, 5065 size_t buflen, 5066 int flags, 5067 int prec, 5068 int type, 5069 PyObject *v) 5070{ 5071 /* fmt = '%#.' + `prec` + `type` 5072 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/ 5073 char fmt[20]; 5074 double x; 5075 5076 x = PyFloat_AsDouble(v); 5077 if (x == -1.0 && PyErr_Occurred()) 5078 return -1; 5079 if (prec < 0) 5080 prec = 6; 5081 if (type == 'f' && (fabs(x) / 1e25) >= 1e25) 5082 type = 'g'; 5083 sprintf(fmt, "%%%s.%d%c", (flags & F_ALT) ? "#" : "", prec, type); 5084 /* worst case length calc to ensure no buffer overrun: 5085 fmt = %#.<prec>g 5086 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp 5087 for any double rep.) 5088 len = 1 + prec + 1 + 2 + 5 = 9 + prec 5089 If prec=0 the effective precision is 1 (the leading digit is 5090 always given), therefore increase by one to 10+prec. */ 5091 if (buflen <= (size_t)10 + (size_t)prec) { 5092 PyErr_SetString(PyExc_OverflowError, 5093 "formatted float is too long (precision too long?)"); 5094 return -1; 5095 } 5096 return usprintf(buf, fmt, x); 5097} 5098 5099static PyObject* 5100formatlong(PyObject *val, int flags, int prec, int type) 5101{ 5102 char *buf; 5103 int i, len; 5104 PyObject *str; /* temporary string object. */ 5105 PyUnicodeObject *result; 5106 5107 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len); 5108 if (!str) 5109 return NULL; 5110 result = _PyUnicode_New(len); 5111 for (i = 0; i < len; i++) 5112 result->str[i] = buf[i]; 5113 result->str[len] = 0; 5114 Py_DECREF(str); 5115 return (PyObject*)result; 5116} 5117 5118static int 5119formatint(Py_UNICODE *buf, 5120 size_t buflen, 5121 int flags, 5122 int prec, 5123 int type, 5124 PyObject *v) 5125{ 5126 /* fmt = '%#.' + `prec` + 'l' + `type` 5127 worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine) 5128 + 1 + 1 = 24*/ 5129 char fmt[64]; /* plenty big enough! */ 5130 long x; 5131 int use_native_c_format = 1; 5132 5133 x = PyInt_AsLong(v); 5134 if (x == -1 && PyErr_Occurred()) 5135 return -1; 5136 if (prec < 0) 5137 prec = 1; 5138 /* buf = '+'/'-'/'0'/'0x' + '[0-9]'*max(prec,len(x in octal)) 5139 worst case buf = '0x' + [0-9]*prec, where prec >= 11 */ 5140 if (buflen <= 13 || buflen <= (size_t)2+(size_t)prec) { 5141 PyErr_SetString(PyExc_OverflowError, 5142 "formatted integer is too long (precision too long?)"); 5143 return -1; 5144 } 5145 /* When converting 0 under %#x or %#X, C leaves off the base marker, 5146 * but we want it (for consistency with other %#x conversions, and 5147 * for consistency with Python's hex() function). 5148 * BUG 28-Apr-2001 tim: At least two platform Cs (Metrowerks & 5149 * Compaq Tru64) violate the std by converting 0 w/ leading 0x anyway. 5150 * So add it only if the platform doesn't already. 5151 */ 5152 if (x == 0 && (flags & F_ALT) && (type == 'x' || type == 'X')) { 5153 /* Only way to know what the platform does is to try it. */ 5154 sprintf(fmt, type == 'x' ? "%#x" : "%#X", 0); 5155 if (fmt[1] != (char)type) { 5156 /* Supply our own leading 0x/0X -- needed under std C */ 5157 use_native_c_format = 0; 5158 sprintf(fmt, "0%c%%#.%dl%c", type, prec, type); 5159 } 5160 } 5161 if (use_native_c_format) 5162 sprintf(fmt, "%%%s.%dl%c", (flags & F_ALT) ? "#" : "", prec, type); 5163 return usprintf(buf, fmt, x); 5164} 5165 5166static int 5167formatchar(Py_UNICODE *buf, 5168 size_t buflen, 5169 PyObject *v) 5170{ 5171 /* presume that the buffer is at least 2 characters long */ 5172 if (PyUnicode_Check(v)) { 5173 if (PyUnicode_GET_SIZE(v) != 1) 5174 goto onError; 5175 buf[0] = PyUnicode_AS_UNICODE(v)[0]; 5176 } 5177 5178 else if (PyString_Check(v)) { 5179 if (PyString_GET_SIZE(v) != 1) 5180 goto onError; 5181 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0]; 5182 } 5183 5184 else { 5185 /* Integer input truncated to a character */ 5186 long x; 5187 x = PyInt_AsLong(v); 5188 if (x == -1 && PyErr_Occurred()) 5189 goto onError; 5190 buf[0] = (char) x; 5191 } 5192 buf[1] = '\0'; 5193 return 1; 5194 5195 onError: 5196 PyErr_SetString(PyExc_TypeError, 5197 "%c requires int or char"); 5198 return -1; 5199} 5200 5201/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...) 5202 5203 FORMATBUFLEN is the length of the buffer in which the floats, ints, & 5204 chars are formatted. XXX This is a magic number. Each formatting 5205 routine does bounds checking to ensure no overflow, but a better 5206 solution may be to malloc a buffer of appropriate size for each 5207 format. For now, the current solution is sufficient. 5208*/ 5209#define FORMATBUFLEN (size_t)120 5210 5211PyObject *PyUnicode_Format(PyObject *format, 5212 PyObject *args) 5213{ 5214 Py_UNICODE *fmt, *res; 5215 int fmtcnt, rescnt, reslen, arglen, argidx; 5216 int args_owned = 0; 5217 PyUnicodeObject *result = NULL; 5218 PyObject *dict = NULL; 5219 PyObject *uformat; 5220 5221 if (format == NULL || args == NULL) { 5222 PyErr_BadInternalCall(); 5223 return NULL; 5224 } 5225 uformat = PyUnicode_FromObject(format); 5226 if (uformat == NULL) 5227 return NULL; 5228 fmt = PyUnicode_AS_UNICODE(uformat); 5229 fmtcnt = PyUnicode_GET_SIZE(uformat); 5230 5231 reslen = rescnt = fmtcnt + 100; 5232 result = _PyUnicode_New(reslen); 5233 if (result == NULL) 5234 goto onError; 5235 res = PyUnicode_AS_UNICODE(result); 5236 5237 if (PyTuple_Check(args)) { 5238 arglen = PyTuple_Size(args); 5239 argidx = 0; 5240 } 5241 else { 5242 arglen = -1; 5243 argidx = -2; 5244 } 5245 if (args->ob_type->tp_as_mapping) 5246 dict = args; 5247 5248 while (--fmtcnt >= 0) { 5249 if (*fmt != '%') { 5250 if (--rescnt < 0) { 5251 rescnt = fmtcnt + 100; 5252 reslen += rescnt; 5253 if (_PyUnicode_Resize(&result, reslen) < 0) 5254 return NULL; 5255 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt; 5256 --rescnt; 5257 } 5258 *res++ = *fmt++; 5259 } 5260 else { 5261 /* Got a format specifier */ 5262 int flags = 0; 5263 int width = -1; 5264 int prec = -1; 5265 Py_UNICODE c = '\0'; 5266 Py_UNICODE fill; 5267 PyObject *v = NULL; 5268 PyObject *temp = NULL; 5269 Py_UNICODE *pbuf; 5270 Py_UNICODE sign; 5271 int len; 5272 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */ 5273 5274 fmt++; 5275 if (*fmt == '(') { 5276 Py_UNICODE *keystart; 5277 int keylen; 5278 PyObject *key; 5279 int pcount = 1; 5280 5281 if (dict == NULL) { 5282 PyErr_SetString(PyExc_TypeError, 5283 "format requires a mapping"); 5284 goto onError; 5285 } 5286 ++fmt; 5287 --fmtcnt; 5288 keystart = fmt; 5289 /* Skip over balanced parentheses */ 5290 while (pcount > 0 && --fmtcnt >= 0) { 5291 if (*fmt == ')') 5292 --pcount; 5293 else if (*fmt == '(') 5294 ++pcount; 5295 fmt++; 5296 } 5297 keylen = fmt - keystart - 1; 5298 if (fmtcnt < 0 || pcount > 0) { 5299 PyErr_SetString(PyExc_ValueError, 5300 "incomplete format key"); 5301 goto onError; 5302 } 5303 /* keys are converted to strings using UTF-8 and 5304 then looked up since Python uses strings to hold 5305 variables names etc. in its namespaces and we 5306 wouldn't want to break common idioms. */ 5307 key = PyUnicode_EncodeUTF8(keystart, 5308 keylen, 5309 NULL); 5310 if (key == NULL) 5311 goto onError; 5312 if (args_owned) { 5313 Py_DECREF(args); 5314 args_owned = 0; 5315 } 5316 args = PyObject_GetItem(dict, key); 5317 Py_DECREF(key); 5318 if (args == NULL) { 5319 goto onError; 5320 } 5321 args_owned = 1; 5322 arglen = -1; 5323 argidx = -2; 5324 } 5325 while (--fmtcnt >= 0) { 5326 switch (c = *fmt++) { 5327 case '-': flags |= F_LJUST; continue; 5328 case '+': flags |= F_SIGN; continue; 5329 case ' ': flags |= F_BLANK; continue; 5330 case '#': flags |= F_ALT; continue; 5331 case '0': flags |= F_ZERO; continue; 5332 } 5333 break; 5334 } 5335 if (c == '*') { 5336 v = getnextarg(args, arglen, &argidx); 5337 if (v == NULL) 5338 goto onError; 5339 if (!PyInt_Check(v)) { 5340 PyErr_SetString(PyExc_TypeError, 5341 "* wants int"); 5342 goto onError; 5343 } 5344 width = PyInt_AsLong(v); 5345 if (width < 0) { 5346 flags |= F_LJUST; 5347 width = -width; 5348 } 5349 if (--fmtcnt >= 0) 5350 c = *fmt++; 5351 } 5352 else if (c >= '0' && c <= '9') { 5353 width = c - '0'; 5354 while (--fmtcnt >= 0) { 5355 c = *fmt++; 5356 if (c < '0' || c > '9') 5357 break; 5358 if ((width*10) / 10 != width) { 5359 PyErr_SetString(PyExc_ValueError, 5360 "width too big"); 5361 goto onError; 5362 } 5363 width = width*10 + (c - '0'); 5364 } 5365 } 5366 if (c == '.') { 5367 prec = 0; 5368 if (--fmtcnt >= 0) 5369 c = *fmt++; 5370 if (c == '*') { 5371 v = getnextarg(args, arglen, &argidx); 5372 if (v == NULL) 5373 goto onError; 5374 if (!PyInt_Check(v)) { 5375 PyErr_SetString(PyExc_TypeError, 5376 "* wants int"); 5377 goto onError; 5378 } 5379 prec = PyInt_AsLong(v); 5380 if (prec < 0) 5381 prec = 0; 5382 if (--fmtcnt >= 0) 5383 c = *fmt++; 5384 } 5385 else if (c >= '0' && c <= '9') { 5386 prec = c - '0'; 5387 while (--fmtcnt >= 0) { 5388 c = Py_CHARMASK(*fmt++); 5389 if (c < '0' || c > '9') 5390 break; 5391 if ((prec*10) / 10 != prec) { 5392 PyErr_SetString(PyExc_ValueError, 5393 "prec too big"); 5394 goto onError; 5395 } 5396 prec = prec*10 + (c - '0'); 5397 } 5398 } 5399 } /* prec */ 5400 if (fmtcnt >= 0) { 5401 if (c == 'h' || c == 'l' || c == 'L') { 5402 if (--fmtcnt >= 0) 5403 c = *fmt++; 5404 } 5405 } 5406 if (fmtcnt < 0) { 5407 PyErr_SetString(PyExc_ValueError, 5408 "incomplete format"); 5409 goto onError; 5410 } 5411 if (c != '%') { 5412 v = getnextarg(args, arglen, &argidx); 5413 if (v == NULL) 5414 goto onError; 5415 } 5416 sign = 0; 5417 fill = ' '; 5418 switch (c) { 5419 5420 case '%': 5421 pbuf = formatbuf; 5422 /* presume that buffer length is at least 1 */ 5423 pbuf[0] = '%'; 5424 len = 1; 5425 break; 5426 5427 case 's': 5428 case 'r': 5429 if (PyUnicode_Check(v) && c == 's') { 5430 temp = v; 5431 Py_INCREF(temp); 5432 } 5433 else { 5434 PyObject *unicode; 5435 if (c == 's') 5436 temp = PyObject_Str(v); 5437 else 5438 temp = PyObject_Repr(v); 5439 if (temp == NULL) 5440 goto onError; 5441 if (!PyString_Check(temp)) { 5442 /* XXX Note: this should never happen, since 5443 PyObject_Repr() and PyObject_Str() assure 5444 this */ 5445 Py_DECREF(temp); 5446 PyErr_SetString(PyExc_TypeError, 5447 "%s argument has non-string str()"); 5448 goto onError; 5449 } 5450 unicode = PyUnicode_Decode(PyString_AS_STRING(temp), 5451 PyString_GET_SIZE(temp), 5452 NULL, 5453 "strict"); 5454 Py_DECREF(temp); 5455 temp = unicode; 5456 if (temp == NULL) 5457 goto onError; 5458 } 5459 pbuf = PyUnicode_AS_UNICODE(temp); 5460 len = PyUnicode_GET_SIZE(temp); 5461 if (prec >= 0 && len > prec) 5462 len = prec; 5463 break; 5464 5465 case 'i': 5466 case 'd': 5467 case 'u': 5468 case 'o': 5469 case 'x': 5470 case 'X': 5471 if (c == 'i') 5472 c = 'd'; 5473 if (PyLong_Check(v)) { 5474 temp = formatlong(v, flags, prec, c); 5475 if (!temp) 5476 goto onError; 5477 pbuf = PyUnicode_AS_UNICODE(temp); 5478 len = PyUnicode_GET_SIZE(temp); 5479 /* unbounded ints can always produce 5480 a sign character! */ 5481 sign = 1; 5482 } 5483 else { 5484 pbuf = formatbuf; 5485 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), 5486 flags, prec, c, v); 5487 if (len < 0) 5488 goto onError; 5489 /* only d conversion is signed */ 5490 sign = c == 'd'; 5491 } 5492 if (flags & F_ZERO) 5493 fill = '0'; 5494 break; 5495 5496 case 'e': 5497 case 'E': 5498 case 'f': 5499 case 'g': 5500 case 'G': 5501 pbuf = formatbuf; 5502 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), 5503 flags, prec, c, v); 5504 if (len < 0) 5505 goto onError; 5506 sign = 1; 5507 if (flags & F_ZERO) 5508 fill = '0'; 5509 break; 5510 5511 case 'c': 5512 pbuf = formatbuf; 5513 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v); 5514 if (len < 0) 5515 goto onError; 5516 break; 5517 5518 default: 5519 PyErr_Format(PyExc_ValueError, 5520 "unsupported format character '%c' (0x%x) " 5521 "at index %i", 5522 (31<=c && c<=126) ? c : '?', 5523 c, fmt -1 - PyUnicode_AS_UNICODE(uformat)); 5524 goto onError; 5525 } 5526 if (sign) { 5527 if (*pbuf == '-' || *pbuf == '+') { 5528 sign = *pbuf++; 5529 len--; 5530 } 5531 else if (flags & F_SIGN) 5532 sign = '+'; 5533 else if (flags & F_BLANK) 5534 sign = ' '; 5535 else 5536 sign = 0; 5537 } 5538 if (width < len) 5539 width = len; 5540 if (rescnt < width + (sign != 0)) { 5541 reslen -= rescnt; 5542 rescnt = width + fmtcnt + 100; 5543 reslen += rescnt; 5544 if (_PyUnicode_Resize(&result, reslen) < 0) 5545 return NULL; 5546 res = PyUnicode_AS_UNICODE(result) 5547 + reslen - rescnt; 5548 } 5549 if (sign) { 5550 if (fill != ' ') 5551 *res++ = sign; 5552 rescnt--; 5553 if (width > len) 5554 width--; 5555 } 5556 if ((flags & F_ALT) && (c == 'x' || c == 'X')) { 5557 assert(pbuf[0] == '0'); 5558 assert(pbuf[1] == c); 5559 if (fill != ' ') { 5560 *res++ = *pbuf++; 5561 *res++ = *pbuf++; 5562 } 5563 rescnt -= 2; 5564 width -= 2; 5565 if (width < 0) 5566 width = 0; 5567 len -= 2; 5568 } 5569 if (width > len && !(flags & F_LJUST)) { 5570 do { 5571 --rescnt; 5572 *res++ = fill; 5573 } while (--width > len); 5574 } 5575 if (fill == ' ') { 5576 if (sign) 5577 *res++ = sign; 5578 if ((flags & F_ALT) && (c == 'x' || c == 'X')) { 5579 assert(pbuf[0] == '0'); 5580 assert(pbuf[1] == c); 5581 *res++ = *pbuf++; 5582 *res++ = *pbuf++; 5583 } 5584 } 5585 Py_UNICODE_COPY(res, pbuf, len); 5586 res += len; 5587 rescnt -= len; 5588 while (--width >= len) { 5589 --rescnt; 5590 *res++ = ' '; 5591 } 5592 if (dict && (argidx < arglen) && c != '%') { 5593 PyErr_SetString(PyExc_TypeError, 5594 "not all arguments converted"); 5595 goto onError; 5596 } 5597 Py_XDECREF(temp); 5598 } /* '%' */ 5599 } /* until end */ 5600 if (argidx < arglen && !dict) { 5601 PyErr_SetString(PyExc_TypeError, 5602 "not all arguments converted"); 5603 goto onError; 5604 } 5605 5606 if (args_owned) { 5607 Py_DECREF(args); 5608 } 5609 Py_DECREF(uformat); 5610 if (_PyUnicode_Resize(&result, reslen - rescnt)) 5611 goto onError; 5612 return (PyObject *)result; 5613 5614 onError: 5615 Py_XDECREF(result); 5616 Py_DECREF(uformat); 5617 if (args_owned) { 5618 Py_DECREF(args); 5619 } 5620 return NULL; 5621} 5622 5623static PyBufferProcs unicode_as_buffer = { 5624 (getreadbufferproc) unicode_buffer_getreadbuf, 5625 (getwritebufferproc) unicode_buffer_getwritebuf, 5626 (getsegcountproc) unicode_buffer_getsegcount, 5627 (getcharbufferproc) unicode_buffer_getcharbuf, 5628}; 5629 5630staticforward PyObject * 5631unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds); 5632 5633static PyObject * 5634unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds) 5635{ 5636 PyObject *x = NULL; 5637 static char *kwlist[] = {"string", "encoding", "errors", 0}; 5638 char *encoding = NULL; 5639 char *errors = NULL; 5640 5641 if (type != &PyUnicode_Type) 5642 return unicode_subtype_new(type, args, kwds); 5643 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode", 5644 kwlist, &x, &encoding, &errors)) 5645 return NULL; 5646 if (x == NULL) 5647 return (PyObject *)_PyUnicode_New(0); 5648 if (encoding == NULL && errors == NULL) 5649 return PyObject_Unicode(x); 5650 else 5651 return PyUnicode_FromEncodedObject(x, encoding, errors); 5652} 5653 5654static PyObject * 5655unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds) 5656{ 5657 PyUnicodeObject *tmp, *pnew; 5658 int n; 5659 5660 assert(PyType_IsSubtype(type, &PyUnicode_Type)); 5661 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds); 5662 if (tmp == NULL) 5663 return NULL; 5664 assert(PyUnicode_Check(tmp)); 5665 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length); 5666 if (pnew == NULL) 5667 return NULL; 5668 pnew->str = PyMem_NEW(Py_UNICODE, n+1); 5669 if (pnew->str == NULL) { 5670 _Py_ForgetReference((PyObject *)pnew); 5671 PyObject_DEL(pnew); 5672 return NULL; 5673 } 5674 Py_UNICODE_COPY(pnew->str, tmp->str, n+1); 5675 pnew->length = n; 5676 pnew->hash = tmp->hash; 5677 Py_DECREF(tmp); 5678 return (PyObject *)pnew; 5679} 5680 5681static char unicode_doc[] = 5682"unicode(string [, encoding[, errors]]) -> object\n\ 5683\n\ 5684Create a new Unicode object from the given encoded string.\n\ 5685encoding defaults to the current default string encoding and \n\ 5686errors, defining the error handling, to 'strict'."; 5687 5688PyTypeObject PyUnicode_Type = { 5689 PyObject_HEAD_INIT(&PyType_Type) 5690 0, /* ob_size */ 5691 "unicode", /* tp_name */ 5692 sizeof(PyUnicodeObject), /* tp_size */ 5693 0, /* tp_itemsize */ 5694 /* Slots */ 5695 (destructor)unicode_dealloc, /* tp_dealloc */ 5696 0, /* tp_print */ 5697 0, /* tp_getattr */ 5698 0, /* tp_setattr */ 5699 (cmpfunc) unicode_compare, /* tp_compare */ 5700 (reprfunc) unicode_repr, /* tp_repr */ 5701 0, /* tp_as_number */ 5702 &unicode_as_sequence, /* tp_as_sequence */ 5703 0, /* tp_as_mapping */ 5704 (hashfunc) unicode_hash, /* tp_hash*/ 5705 0, /* tp_call*/ 5706 (reprfunc) unicode_str, /* tp_str */ 5707 PyObject_GenericGetAttr, /* tp_getattro */ 5708 0, /* tp_setattro */ 5709 &unicode_as_buffer, /* tp_as_buffer */ 5710 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /* tp_flags */ 5711 unicode_doc, /* tp_doc */ 5712 0, /* tp_traverse */ 5713 0, /* tp_clear */ 5714 0, /* tp_richcompare */ 5715 0, /* tp_weaklistoffset */ 5716 0, /* tp_iter */ 5717 0, /* tp_iternext */ 5718 unicode_methods, /* tp_methods */ 5719 0, /* tp_members */ 5720 0, /* tp_getset */ 5721 0, /* tp_base */ 5722 0, /* tp_dict */ 5723 0, /* tp_descr_get */ 5724 0, /* tp_descr_set */ 5725 0, /* tp_dictoffset */ 5726 0, /* tp_init */ 5727 0, /* tp_alloc */ 5728 unicode_new, /* tp_new */ 5729 _PyObject_Del, /* tp_free */ 5730}; 5731 5732/* Initialize the Unicode implementation */ 5733 5734void _PyUnicode_Init(void) 5735{ 5736 int i; 5737 5738 /* Init the implementation */ 5739 unicode_freelist = NULL; 5740 unicode_freelist_size = 0; 5741 unicode_empty = _PyUnicode_New(0); 5742 strcpy(unicode_default_encoding, "ascii"); 5743 for (i = 0; i < 256; i++) 5744 unicode_latin1[i] = NULL; 5745} 5746 5747/* Finalize the Unicode implementation */ 5748 5749void 5750_PyUnicode_Fini(void) 5751{ 5752 PyUnicodeObject *u; 5753 int i; 5754 5755 Py_XDECREF(unicode_empty); 5756 unicode_empty = NULL; 5757 5758 for (i = 0; i < 256; i++) { 5759 if (unicode_latin1[i]) { 5760 Py_DECREF(unicode_latin1[i]); 5761 unicode_latin1[i] = NULL; 5762 } 5763 } 5764 5765 for (u = unicode_freelist; u != NULL;) { 5766 PyUnicodeObject *v = u; 5767 u = *(PyUnicodeObject **)u; 5768 if (v->str) 5769 PyMem_DEL(v->str); 5770 Py_XDECREF(v->defenc); 5771 PyObject_DEL(v); 5772 } 5773 unicode_freelist = NULL; 5774 unicode_freelist_size = 0; 5775} 5776