unicodeobject.c revision dcc819a5c9e3cb60eba05a3c0b2547bc1fb28b80
1/* 2 3Unicode implementation based on original code by Fredrik Lundh, 4modified by Marc-Andre Lemburg <mal@lemburg.com> according to the 5Unicode Integration Proposal (see file Misc/unicode.txt). 6 7Copyright (c) Corporation for National Research Initiatives. 8 9-------------------------------------------------------------------- 10The original string type implementation is: 11 12 Copyright (c) 1999 by Secret Labs AB 13 Copyright (c) 1999 by Fredrik Lundh 14 15By obtaining, using, and/or copying this software and/or its 16associated documentation, you agree that you have read, understood, 17and will comply with the following terms and conditions: 18 19Permission to use, copy, modify, and distribute this software and its 20associated documentation for any purpose and without fee is hereby 21granted, provided that the above copyright notice appears in all 22copies, and that both that copyright notice and this permission notice 23appear in supporting documentation, and that the name of Secret Labs 24AB or the author not be used in advertising or publicity pertaining to 25distribution of the software without specific, written prior 26permission. 27 28SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO 29THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND 30FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR 31ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 32WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 33ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT 34OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
35-------------------------------------------------------------------- 36 37*/ 38 39#include "Python.h" 40 41#include "unicodeobject.h" 42#include "ucnhash.h" 43 44#ifdef MS_WIN32 45#include <windows.h> 46#endif 47 48/* Limit for the Unicode object free list */ 49 50#define MAX_UNICODE_FREELIST_SIZE 1024 51 52/* Limit for the Unicode object free list stay alive optimization. 53 54 The implementation will keep allocated Unicode memory intact for 55 all objects on the free list having a size less than this 56 limit. This reduces malloc() overhead for small Unicode objects. 57 58 At worst this will result in MAX_UNICODE_FREELIST_SIZE * 59 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT + 60 malloc()-overhead) bytes of unused garbage. 61 62 Setting the limit to 0 effectively turns the feature off. 63 64 Note: This is an experimental feature ! If you get core dumps when 65 using Unicode objects, turn this feature off. 66 67*/ 68 69#define KEEPALIVE_SIZE_LIMIT 9 70 71/* Endianness switches; defaults to little endian */ 72 73#ifdef WORDS_BIGENDIAN 74# define BYTEORDER_IS_BIG_ENDIAN 75#else 76# define BYTEORDER_IS_LITTLE_ENDIAN 77#endif 78 79/* --- Globals ------------------------------------------------------------ 80 81 The globals are initialized by the _PyUnicode_Init() API and should 82 not be used before calling that API. 83 84*/ 85 86/* Free list for Unicode objects */ 87static PyUnicodeObject *unicode_freelist; 88static int unicode_freelist_size; 89 90/* The empty Unicode object is shared to improve performance. */ 91static PyUnicodeObject *unicode_empty; 92 93/* Single character Unicode strings in the Latin-1 range are being 94 shared as well. */ 95static PyUnicodeObject *unicode_latin1[256]; 96 97/* Default encoding to use and assume when NULL is passed as encoding 98 parameter; it is initialized by _PyUnicode_Init(). 99 100 Always use the PyUnicode_SetDefaultEncoding() and 101 PyUnicode_GetDefaultEncoding() APIs to access this global. 
102 103*/ 104static char unicode_default_encoding[100]; 105 106Py_UNICODE 107PyUnicode_GetMax(void) 108{ 109#ifdef Py_UNICODE_WIDE 110 return 0x10FFFF; 111#else 112 /* This is actually an illegal character, so it should 113 not be passed to unichr. */ 114 return 0xFFFF; 115#endif 116} 117 118/* --- Unicode Object ----------------------------------------------------- */ 119 120static 121int unicode_resize(register PyUnicodeObject *unicode, 122 int length) 123{ 124 void *oldstr; 125 126 /* Shortcut if there's nothing much to do. */ 127 if (unicode->length == length) 128 goto reset; 129 130 /* Resizing shared object (unicode_empty or single character 131 objects) in-place is not allowed. Use PyUnicode_Resize() 132 instead ! */ 133 if (unicode == unicode_empty || 134 (unicode->length == 1 && 135 unicode->str[0] < 256 && 136 unicode_latin1[unicode->str[0]] == unicode)) { 137 PyErr_SetString(PyExc_SystemError, 138 "can't resize shared unicode objects"); 139 return -1; 140 } 141 142 /* We allocate one more byte to make sure the string is 143 Ux0000 terminated -- XXX is this needed ? */ 144 oldstr = unicode->str; 145 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1); 146 if (!unicode->str) { 147 unicode->str = oldstr; 148 PyErr_NoMemory(); 149 return -1; 150 } 151 unicode->str[length] = 0; 152 unicode->length = length; 153 154 reset: 155 /* Reset the object caches */ 156 if (unicode->defenc) { 157 Py_DECREF(unicode->defenc); 158 unicode->defenc = NULL; 159 } 160 unicode->hash = -1; 161 162 return 0; 163} 164 165/* We allocate one more byte to make sure the string is 166 Ux0000 terminated -- XXX is this needed ? 167 168 XXX This allocator could further be enhanced by assuring that the 169 free list never reduces its size below 1. 
170 171*/ 172 173static 174PyUnicodeObject *_PyUnicode_New(int length) 175{ 176 register PyUnicodeObject *unicode; 177 178 /* Optimization for empty strings */ 179 if (length == 0 && unicode_empty != NULL) { 180 Py_INCREF(unicode_empty); 181 return unicode_empty; 182 } 183 184 /* Unicode freelist & memory allocation */ 185 if (unicode_freelist) { 186 unicode = unicode_freelist; 187 unicode_freelist = *(PyUnicodeObject **)unicode; 188 unicode_freelist_size--; 189 if (unicode->str) { 190 /* Keep-Alive optimization: we only upsize the buffer, 191 never downsize it. */ 192 if ((unicode->length < length) && 193 unicode_resize(unicode, length)) { 194 PyMem_DEL(unicode->str); 195 goto onError; 196 } 197 } 198 else { 199 unicode->str = PyMem_NEW(Py_UNICODE, length + 1); 200 } 201 PyObject_INIT(unicode, &PyUnicode_Type); 202 } 203 else { 204 unicode = PyMalloc_New(PyUnicodeObject, &PyUnicode_Type); 205 if (unicode == NULL) 206 return NULL; 207 unicode->str = PyMem_NEW(Py_UNICODE, length + 1); 208 } 209 210 if (!unicode->str) { 211 PyErr_NoMemory(); 212 goto onError; 213 } 214 unicode->str[length] = 0; 215 unicode->length = length; 216 unicode->hash = -1; 217 unicode->defenc = NULL; 218 return unicode; 219 220 onError: 221 _Py_ForgetReference((PyObject *)unicode); 222 PyMalloc_Del(unicode); 223 return NULL; 224} 225 226static 227void unicode_dealloc(register PyUnicodeObject *unicode) 228{ 229 if (PyUnicode_CheckExact(unicode) && 230 unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) { 231 /* Keep-Alive optimization */ 232 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) { 233 PyMem_DEL(unicode->str); 234 unicode->str = NULL; 235 unicode->length = 0; 236 } 237 if (unicode->defenc) { 238 Py_DECREF(unicode->defenc); 239 unicode->defenc = NULL; 240 } 241 /* Add to free list */ 242 *(PyUnicodeObject **)unicode = unicode_freelist; 243 unicode_freelist = unicode; 244 unicode_freelist_size++; 245 } 246 else { 247 PyMem_DEL(unicode->str); 248 Py_XDECREF(unicode->defenc); 249 
unicode->ob_type->tp_free((PyObject *)unicode); 250 } 251} 252 253int PyUnicode_Resize(PyObject **unicode, 254 int length) 255{ 256 register PyUnicodeObject *v; 257 258 /* Argument checks */ 259 if (unicode == NULL) { 260 PyErr_BadInternalCall(); 261 return -1; 262 } 263 v = (PyUnicodeObject *)*unicode; 264 if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1) { 265 PyErr_BadInternalCall(); 266 return -1; 267 } 268 269 /* Resizing unicode_empty and single character objects is not 270 possible since these are being shared. We simply return a fresh 271 copy with the same Unicode content. */ 272 if (v->length != length && 273 (v == unicode_empty || v->length == 1)) { 274 PyUnicodeObject *w = _PyUnicode_New(length); 275 if (w == NULL) 276 return -1; 277 Py_UNICODE_COPY(w->str, v->str, 278 length < v->length ? length : v->length); 279 *unicode = (PyObject *)w; 280 return 0; 281 } 282 283 /* Note that we don't have to modify *unicode for unshared Unicode 284 objects, since we can modify them in-place. */ 285 return unicode_resize(v, length); 286} 287 288/* Internal API for use in unicodeobject.c only ! */ 289#define _PyUnicode_Resize(unicodevar, length) \ 290 PyUnicode_Resize(((PyObject **)(unicodevar)), length) 291 292PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u, 293 int size) 294{ 295 PyUnicodeObject *unicode; 296 297 /* If the Unicode data is known at construction time, we can apply 298 some optimizations which share commonly used objects. 
*/ 299 if (u != NULL) { 300 301 /* Optimization for empty strings */ 302 if (size == 0 && unicode_empty != NULL) { 303 Py_INCREF(unicode_empty); 304 return (PyObject *)unicode_empty; 305 } 306 307 /* Single character Unicode objects in the Latin-1 range are 308 shared when using this constructor */ 309 if (size == 1 && *u < 256) { 310 unicode = unicode_latin1[*u]; 311 if (!unicode) { 312 unicode = _PyUnicode_New(1); 313 if (!unicode) 314 return NULL; 315 unicode->str[0] = *u; 316 unicode_latin1[*u] = unicode; 317 } 318 Py_INCREF(unicode); 319 return (PyObject *)unicode; 320 } 321 } 322 323 unicode = _PyUnicode_New(size); 324 if (!unicode) 325 return NULL; 326 327 /* Copy the Unicode data into the new object */ 328 if (u != NULL) 329 Py_UNICODE_COPY(unicode->str, u, size); 330 331 return (PyObject *)unicode; 332} 333 334#ifdef HAVE_WCHAR_H 335 336PyObject *PyUnicode_FromWideChar(register const wchar_t *w, 337 int size) 338{ 339 PyUnicodeObject *unicode; 340 341 if (w == NULL) { 342 PyErr_BadInternalCall(); 343 return NULL; 344 } 345 346 unicode = _PyUnicode_New(size); 347 if (!unicode) 348 return NULL; 349 350 /* Copy the wchar_t data into the new object */ 351#ifdef HAVE_USABLE_WCHAR_T 352 memcpy(unicode->str, w, size * sizeof(wchar_t)); 353#else 354 { 355 register Py_UNICODE *u; 356 register int i; 357 u = PyUnicode_AS_UNICODE(unicode); 358 for (i = size; i >= 0; i--) 359 *u++ = *w++; 360 } 361#endif 362 363 return (PyObject *)unicode; 364} 365 366int PyUnicode_AsWideChar(PyUnicodeObject *unicode, 367 register wchar_t *w, 368 int size) 369{ 370 if (unicode == NULL) { 371 PyErr_BadInternalCall(); 372 return -1; 373 } 374 if (size > PyUnicode_GET_SIZE(unicode)) 375 size = PyUnicode_GET_SIZE(unicode); 376#ifdef HAVE_USABLE_WCHAR_T 377 memcpy(w, unicode->str, size * sizeof(wchar_t)); 378#else 379 { 380 register Py_UNICODE *u; 381 register int i; 382 u = PyUnicode_AS_UNICODE(unicode); 383 for (i = size; i >= 0; i--) 384 *w++ = *u++; 385 } 386#endif 387 388 return size; 
389} 390 391#endif 392 393PyObject *PyUnicode_FromObject(register PyObject *obj) 394{ 395 /* XXX Perhaps we should make this API an alias of 396 PyObject_Unicode() instead ?! */ 397 if (PyUnicode_CheckExact(obj)) { 398 Py_INCREF(obj); 399 return obj; 400 } 401 if (PyUnicode_Check(obj)) { 402 /* For a Unicode subtype that's not a Unicode object, 403 return a true Unicode object with the same data. */ 404 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj), 405 PyUnicode_GET_SIZE(obj)); 406 } 407 return PyUnicode_FromEncodedObject(obj, NULL, "strict"); 408} 409 410PyObject *PyUnicode_FromEncodedObject(register PyObject *obj, 411 const char *encoding, 412 const char *errors) 413{ 414 const char *s = NULL; 415 int len; 416 int owned = 0; 417 PyObject *v; 418 419 if (obj == NULL) { 420 PyErr_BadInternalCall(); 421 return NULL; 422 } 423 424#if 0 425 /* For b/w compatibility we also accept Unicode objects provided 426 that no encodings is given and then redirect to 427 PyObject_Unicode() which then applies the additional logic for 428 Unicode subclasses. 429 430 NOTE: This API should really only be used for object which 431 represent *encoded* Unicode ! 432 433 */ 434 if (PyUnicode_Check(obj)) { 435 if (encoding) { 436 PyErr_SetString(PyExc_TypeError, 437 "decoding Unicode is not supported"); 438 return NULL; 439 } 440 return PyObject_Unicode(obj); 441 } 442#else 443 if (PyUnicode_Check(obj)) { 444 PyErr_SetString(PyExc_TypeError, 445 "decoding Unicode is not supported"); 446 return NULL; 447 } 448#endif 449 450 /* Coerce object */ 451 if (PyString_Check(obj)) { 452 s = PyString_AS_STRING(obj); 453 len = PyString_GET_SIZE(obj); 454 } 455 else if (PyObject_AsCharBuffer(obj, &s, &len)) { 456 /* Overwrite the error message with something more useful in 457 case of a TypeError. 
*/ 458 if (PyErr_ExceptionMatches(PyExc_TypeError)) 459 PyErr_Format(PyExc_TypeError, 460 "coercing to Unicode: need string or buffer, " 461 "%.80s found", 462 obj->ob_type->tp_name); 463 goto onError; 464 } 465 466 /* Convert to Unicode */ 467 if (len == 0) { 468 Py_INCREF(unicode_empty); 469 v = (PyObject *)unicode_empty; 470 } 471 else 472 v = PyUnicode_Decode(s, len, encoding, errors); 473 474 if (owned) { 475 Py_DECREF(obj); 476 } 477 return v; 478 479 onError: 480 if (owned) { 481 Py_DECREF(obj); 482 } 483 return NULL; 484} 485 486PyObject *PyUnicode_Decode(const char *s, 487 int size, 488 const char *encoding, 489 const char *errors) 490{ 491 PyObject *buffer = NULL, *unicode; 492 493 if (encoding == NULL) 494 encoding = PyUnicode_GetDefaultEncoding(); 495 496 /* Shortcuts for common default encodings */ 497 if (strcmp(encoding, "utf-8") == 0) 498 return PyUnicode_DecodeUTF8(s, size, errors); 499 else if (strcmp(encoding, "latin-1") == 0) 500 return PyUnicode_DecodeLatin1(s, size, errors); 501 else if (strcmp(encoding, "ascii") == 0) 502 return PyUnicode_DecodeASCII(s, size, errors); 503 504 /* Decode via the codec registry */ 505 buffer = PyBuffer_FromMemory((void *)s, size); 506 if (buffer == NULL) 507 goto onError; 508 unicode = PyCodec_Decode(buffer, encoding, errors); 509 if (unicode == NULL) 510 goto onError; 511 if (!PyUnicode_Check(unicode)) { 512 PyErr_Format(PyExc_TypeError, 513 "decoder did not return an unicode object (type=%.400s)", 514 unicode->ob_type->tp_name); 515 Py_DECREF(unicode); 516 goto onError; 517 } 518 Py_DECREF(buffer); 519 return unicode; 520 521 onError: 522 Py_XDECREF(buffer); 523 return NULL; 524} 525 526PyObject *PyUnicode_Encode(const Py_UNICODE *s, 527 int size, 528 const char *encoding, 529 const char *errors) 530{ 531 PyObject *v, *unicode; 532 533 unicode = PyUnicode_FromUnicode(s, size); 534 if (unicode == NULL) 535 return NULL; 536 v = PyUnicode_AsEncodedString(unicode, encoding, errors); 537 Py_DECREF(unicode); 538 
return v; 539} 540 541PyObject *PyUnicode_AsEncodedString(PyObject *unicode, 542 const char *encoding, 543 const char *errors) 544{ 545 PyObject *v; 546 547 if (!PyUnicode_Check(unicode)) { 548 PyErr_BadArgument(); 549 goto onError; 550 } 551 552 if (encoding == NULL) 553 encoding = PyUnicode_GetDefaultEncoding(); 554 555 /* Shortcuts for common default encodings */ 556 if (errors == NULL) { 557 if (strcmp(encoding, "utf-8") == 0) 558 return PyUnicode_AsUTF8String(unicode); 559 else if (strcmp(encoding, "latin-1") == 0) 560 return PyUnicode_AsLatin1String(unicode); 561 else if (strcmp(encoding, "ascii") == 0) 562 return PyUnicode_AsASCIIString(unicode); 563 } 564 565 /* Encode via the codec registry */ 566 v = PyCodec_Encode(unicode, encoding, errors); 567 if (v == NULL) 568 goto onError; 569 /* XXX Should we really enforce this ? */ 570 if (!PyString_Check(v)) { 571 PyErr_Format(PyExc_TypeError, 572 "encoder did not return a string object (type=%.400s)", 573 v->ob_type->tp_name); 574 Py_DECREF(v); 575 goto onError; 576 } 577 return v; 578 579 onError: 580 return NULL; 581} 582 583PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode, 584 const char *errors) 585{ 586 PyObject *v = ((PyUnicodeObject *)unicode)->defenc; 587 588 if (v) 589 return v; 590 v = PyUnicode_AsEncodedString(unicode, NULL, errors); 591 if (v && errors == NULL) 592 ((PyUnicodeObject *)unicode)->defenc = v; 593 return v; 594} 595 596Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode) 597{ 598 if (!PyUnicode_Check(unicode)) { 599 PyErr_BadArgument(); 600 goto onError; 601 } 602 return PyUnicode_AS_UNICODE(unicode); 603 604 onError: 605 return NULL; 606} 607 608int PyUnicode_GetSize(PyObject *unicode) 609{ 610 if (!PyUnicode_Check(unicode)) { 611 PyErr_BadArgument(); 612 goto onError; 613 } 614 return PyUnicode_GET_SIZE(unicode); 615 616 onError: 617 return -1; 618} 619 620const char *PyUnicode_GetDefaultEncoding(void) 621{ 622 return unicode_default_encoding; 623} 624 625int 
PyUnicode_SetDefaultEncoding(const char *encoding) 626{ 627 PyObject *v; 628 629 /* Make sure the encoding is valid. As side effect, this also 630 loads the encoding into the codec registry cache. */ 631 v = _PyCodec_Lookup(encoding); 632 if (v == NULL) 633 goto onError; 634 Py_DECREF(v); 635 strncpy(unicode_default_encoding, 636 encoding, 637 sizeof(unicode_default_encoding)); 638 return 0; 639 640 onError: 641 return -1; 642} 643 644/* --- UTF-7 Codec -------------------------------------------------------- */ 645 646/* see RFC2152 for details */ 647 648static 649char utf7_special[128] = { 650 /* indicate whether a UTF-7 character is special i.e. cannot be directly 651 encoded: 652 0 - not special 653 1 - special 654 2 - whitespace (optional) 655 3 - RFC2152 Set O (optional) */ 656 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1, 657 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 658 2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1, 659 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0, 660 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 661 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3, 662 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 663 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1, 664 665}; 666 667#define SPECIAL(c, encodeO, encodeWS) \ 668 (((c)>127 || utf7_special[(c)] == 1) || \ 669 (encodeWS && (utf7_special[(c)] == 2)) || \ 670 (encodeO && (utf7_special[(c)] == 3))) 671 672#define B64(n) ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f]) 673#define B64CHAR(c) (isalnum(c) || (c) == '+' || (c) == '/') 674#define UB64(c) ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ? \ 675 (c) - 71 : (c) >= 'A' ? 
(c) - 65 : (c) + 4) 676 677#define ENCODE(out, ch, bits) \ 678 while (bits >= 6) { \ 679 *out++ = B64(ch >> (bits-6)); \ 680 bits -= 6; \ 681 } 682 683#define DECODE(out, ch, bits, surrogate) \ 684 while (bits >= 16) { \ 685 Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff); \ 686 bits -= 16; \ 687 if (surrogate) { \ 688 /* We have already generated an error for the high surrogate 689 so let's not bother seeing if the low surrogate is correct or not */\ 690 surrogate = 0; \ 691 } else if (0xDC00 <= outCh && outCh <= 0xDFFF) { \ 692 /* This is a surrogate pair. Unfortunately we can't represent \ 693 it in a 16-bit character */ \ 694 surrogate = 1; \ 695 errmsg = "code pairs are not supported"; \ 696 goto utf7Error; \ 697 } else { \ 698 *out++ = outCh; \ 699 } \ 700 } \ 701 702static 703int utf7_decoding_error(Py_UNICODE **dest, 704 const char *errors, 705 const char *details) 706{ 707 if ((errors == NULL) || 708 (strcmp(errors,"strict") == 0)) { 709 PyErr_Format(PyExc_UnicodeError, 710 "UTF-7 decoding error: %.400s", 711 details); 712 return -1; 713 } 714 else if (strcmp(errors,"ignore") == 0) { 715 return 0; 716 } 717 else if (strcmp(errors,"replace") == 0) { 718 if (dest != NULL) { 719 **dest = Py_UNICODE_REPLACEMENT_CHARACTER; 720 (*dest)++; 721 } 722 return 0; 723 } 724 else { 725 PyErr_Format(PyExc_ValueError, 726 "UTF-7 decoding error; unknown error handling code: %.400s", 727 errors); 728 return -1; 729 } 730} 731 732PyObject *PyUnicode_DecodeUTF7(const char *s, 733 int size, 734 const char *errors) 735{ 736 const char *e; 737 PyUnicodeObject *unicode; 738 Py_UNICODE *p; 739 const char *errmsg = ""; 740 int inShift = 0; 741 unsigned int bitsleft = 0; 742 unsigned long charsleft = 0; 743 int surrogate = 0; 744 745 unicode = _PyUnicode_New(size); 746 if (!unicode) 747 return NULL; 748 if (size == 0) 749 return (PyObject *)unicode; 750 751 p = unicode->str; 752 e = s + size; 753 754 while (s < e) { 755 Py_UNICODE ch = *s; 756 757 if (inShift) { 758 if 
((ch == '-') || !B64CHAR(ch)) { 759 inShift = 0; 760 s++; 761 762 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate); 763 if (bitsleft >= 6) { 764 /* The shift sequence has a partial character in it. If 765 bitsleft < 6 then we could just classify it as padding 766 but that is not the case here */ 767 768 errmsg = "partial character in shift sequence"; 769 goto utf7Error; 770 } 771 /* According to RFC2152 the remaining bits should be zero. We 772 choose to signal an error/insert a replacement character 773 here so indicate the potential of a misencoded character. */ 774 775 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */ 776 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) { 777 errmsg = "non-zero padding bits in shift sequence"; 778 goto utf7Error; 779 } 780 781 if (ch == '-') { 782 if ((s < e) && (*(s) == '-')) { 783 *p++ = '-'; 784 inShift = 1; 785 } 786 } else if (SPECIAL(ch,0,0)) { 787 errmsg = "unexpected special character"; 788 goto utf7Error; 789 } else { 790 *p++ = ch; 791 } 792 } else { 793 charsleft = (charsleft << 6) | UB64(ch); 794 bitsleft += 6; 795 s++; 796 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate); 797 } 798 } 799 else if ( ch == '+' ) { 800 s++; 801 if (s < e && *s == '-') { 802 s++; 803 *p++ = '+'; 804 } else 805 { 806 inShift = 1; 807 bitsleft = 0; 808 } 809 } 810 else if (SPECIAL(ch,0,0)) { 811 errmsg = "unexpected special character"; 812 s++; 813 goto utf7Error; 814 } 815 else { 816 *p++ = ch; 817 s++; 818 } 819 continue; 820 utf7Error: 821 if (utf7_decoding_error(&p, errors, errmsg)) 822 goto onError; 823 } 824 825 if (inShift) { 826 if (utf7_decoding_error(&p, errors, "unterminated shift sequence")) 827 goto onError; 828 } 829 830 if (_PyUnicode_Resize(&unicode, p - unicode->str)) 831 goto onError; 832 833 return (PyObject *)unicode; 834 835onError: 836 Py_DECREF(unicode); 837 return NULL; 838} 839 840 841PyObject 
*PyUnicode_EncodeUTF7(const Py_UNICODE *s, 842 int size, 843 int encodeSetO, 844 int encodeWhiteSpace, 845 const char *errors) 846{ 847 PyObject *v; 848 /* It might be possible to tighten this worst case */ 849 unsigned int cbAllocated = 5 * size; 850 int inShift = 0; 851 int i = 0; 852 unsigned int bitsleft = 0; 853 unsigned long charsleft = 0; 854 char * out; 855 char * start; 856 857 if (size == 0) 858 return PyString_FromStringAndSize(NULL, 0); 859 860 v = PyString_FromStringAndSize(NULL, cbAllocated); 861 if (v == NULL) 862 return NULL; 863 864 start = out = PyString_AS_STRING(v); 865 for (;i < size; ++i) { 866 Py_UNICODE ch = s[i]; 867 868 if (!inShift) { 869 if (ch == '+') { 870 *out++ = '+'; 871 *out++ = '-'; 872 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) { 873 charsleft = ch; 874 bitsleft = 16; 875 *out++ = '+'; 876 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft); 877 inShift = bitsleft > 0; 878 } else { 879 *out++ = (char) ch; 880 } 881 } else { 882 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) { 883 *out++ = B64(charsleft << (6-bitsleft)); 884 charsleft = 0; 885 bitsleft = 0; 886 /* Characters not in the BASE64 set implicitly unshift the sequence 887 so no '-' is required, except if the character is itself a '-' */ 888 if (B64CHAR(ch) || ch == '-') { 889 *out++ = '-'; 890 } 891 inShift = 0; 892 *out++ = (char) ch; 893 } else { 894 bitsleft += 16; 895 charsleft = (charsleft << 16) | ch; 896 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft); 897 898 /* If the next character is special then we dont' need to terminate 899 the shift sequence. If the next character is not a BASE64 character 900 or '-' then the shift sequence will be terminated implicitly and we 901 don't have to insert a '-'. 
*/ 902 903 if (bitsleft == 0) { 904 if (i + 1 < size) { 905 Py_UNICODE ch2 = s[i+1]; 906 907 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) { 908 909 } else if (B64CHAR(ch2) || ch2 == '-') { 910 *out++ = '-'; 911 inShift = 0; 912 } else { 913 inShift = 0; 914 } 915 916 } 917 else { 918 *out++ = '-'; 919 inShift = 0; 920 } 921 } 922 } 923 } 924 } 925 if (bitsleft) { 926 *out++= B64(charsleft << (6-bitsleft) ); 927 *out++ = '-'; 928 } 929 930 if (_PyString_Resize(&v, out - start)) { 931 Py_DECREF(v); 932 return NULL; 933 } 934 return v; 935} 936 937#undef SPECIAL 938#undef B64 939#undef B64CHAR 940#undef UB64 941#undef ENCODE 942#undef DECODE 943 944/* --- UTF-8 Codec -------------------------------------------------------- */ 945 946static 947char utf8_code_length[256] = { 948 /* Map UTF-8 encoded prefix byte to sequence length. zero means 949 illegal prefix. see RFC 2279 for details */ 950 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 951 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 952 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 953 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 954 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 955 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 956 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 957 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 958 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 959 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 960 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 961 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 962 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 963 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 964 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 965 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0 966}; 967 968static 969int utf8_decoding_error(const char **source, 970 Py_UNICODE **dest, 971 const char *errors, 972 const char *details) 973{ 974 if ((errors == NULL) || 975 (strcmp(errors,"strict") == 0)) { 976 PyErr_Format(PyExc_UnicodeError, 977 "UTF-8 decoding 
error: %.400s", 978 details); 979 return -1; 980 } 981 else if (strcmp(errors,"ignore") == 0) { 982 (*source)++; 983 return 0; 984 } 985 else if (strcmp(errors,"replace") == 0) { 986 (*source)++; 987 **dest = Py_UNICODE_REPLACEMENT_CHARACTER; 988 (*dest)++; 989 return 0; 990 } 991 else { 992 PyErr_Format(PyExc_ValueError, 993 "UTF-8 decoding error; unknown error handling code: %.400s", 994 errors); 995 return -1; 996 } 997} 998 999PyObject *PyUnicode_DecodeUTF8(const char *s, 1000 int size, 1001 const char *errors) 1002{ 1003 int n; 1004 const char *e; 1005 PyUnicodeObject *unicode; 1006 Py_UNICODE *p; 1007 const char *errmsg = ""; 1008 1009 /* Note: size will always be longer than the resulting Unicode 1010 character count */ 1011 unicode = _PyUnicode_New(size); 1012 if (!unicode) 1013 return NULL; 1014 if (size == 0) 1015 return (PyObject *)unicode; 1016 1017 /* Unpack UTF-8 encoded data */ 1018 p = unicode->str; 1019 e = s + size; 1020 1021 while (s < e) { 1022 Py_UCS4 ch = (unsigned char)*s; 1023 1024 if (ch < 0x80) { 1025 *p++ = (Py_UNICODE)ch; 1026 s++; 1027 continue; 1028 } 1029 1030 n = utf8_code_length[ch]; 1031 1032 if (s + n > e) { 1033 errmsg = "unexpected end of data"; 1034 goto utf8Error; 1035 } 1036 1037 switch (n) { 1038 1039 case 0: 1040 errmsg = "unexpected code byte"; 1041 goto utf8Error; 1042 1043 case 1: 1044 errmsg = "internal error"; 1045 goto utf8Error; 1046 1047 case 2: 1048 if ((s[1] & 0xc0) != 0x80) { 1049 errmsg = "invalid data"; 1050 goto utf8Error; 1051 } 1052 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f); 1053 if (ch < 0x80) { 1054 errmsg = "illegal encoding"; 1055 goto utf8Error; 1056 } 1057 else 1058 *p++ = (Py_UNICODE)ch; 1059 break; 1060 1061 case 3: 1062 if ((s[1] & 0xc0) != 0x80 || 1063 (s[2] & 0xc0) != 0x80) { 1064 errmsg = "invalid data"; 1065 goto utf8Error; 1066 } 1067 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f); 1068 if (ch < 0x0800) { 1069 /* Note: UTF-8 encodings of surrogates are considered 1070 legal 
UTF-8 sequences; 1071 1072 XXX For wide builds (UCS-4) we should probably try 1073 to recombine the surrogates into a single code 1074 unit. 1075 */ 1076 errmsg = "illegal encoding"; 1077 goto utf8Error; 1078 } 1079 else 1080 *p++ = (Py_UNICODE)ch; 1081 break; 1082 1083 case 4: 1084 if ((s[1] & 0xc0) != 0x80 || 1085 (s[2] & 0xc0) != 0x80 || 1086 (s[3] & 0xc0) != 0x80) { 1087 errmsg = "invalid data"; 1088 goto utf8Error; 1089 } 1090 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) + 1091 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f); 1092 /* validate and convert to UTF-16 */ 1093 if ((ch < 0x10000) /* minimum value allowed for 4 1094 byte encoding */ 1095 || (ch > 0x10ffff)) /* maximum value allowed for 1096 UTF-16 */ 1097 { 1098 errmsg = "illegal encoding"; 1099 goto utf8Error; 1100 } 1101#ifdef Py_UNICODE_WIDE 1102 *p++ = (Py_UNICODE)ch; 1103#else 1104 /* compute and append the two surrogates: */ 1105 1106 /* translate from 10000..10FFFF to 0..FFFF */ 1107 ch -= 0x10000; 1108 1109 /* high surrogate = top 10 bits added to D800 */ 1110 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10)); 1111 1112 /* low surrogate = bottom 10 bits added to DC00 */ 1113 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF)); 1114#endif 1115 break; 1116 1117 default: 1118 /* Other sizes are only needed for UCS-4 */ 1119 errmsg = "unsupported Unicode code range"; 1120 goto utf8Error; 1121 } 1122 s += n; 1123 continue; 1124 1125 utf8Error: 1126 if (utf8_decoding_error(&s, &p, errors, errmsg)) 1127 goto onError; 1128 } 1129 1130 /* Adjust length */ 1131 if (_PyUnicode_Resize(&unicode, p - unicode->str)) 1132 goto onError; 1133 1134 return (PyObject *)unicode; 1135 1136onError: 1137 Py_DECREF(unicode); 1138 return NULL; 1139} 1140 1141/* Not used anymore, now that the encoder supports UTF-16 1142 surrogates. 
*/ 1143#if 0 1144static 1145int utf8_encoding_error(const Py_UNICODE **source, 1146 char **dest, 1147 const char *errors, 1148 const char *details) 1149{ 1150 if ((errors == NULL) || 1151 (strcmp(errors,"strict") == 0)) { 1152 PyErr_Format(PyExc_UnicodeError, 1153 "UTF-8 encoding error: %.400s", 1154 details); 1155 return -1; 1156 } 1157 else if (strcmp(errors,"ignore") == 0) { 1158 return 0; 1159 } 1160 else if (strcmp(errors,"replace") == 0) { 1161 **dest = '?'; 1162 (*dest)++; 1163 return 0; 1164 } 1165 else { 1166 PyErr_Format(PyExc_ValueError, 1167 "UTF-8 encoding error; " 1168 "unknown error handling code: %.400s", 1169 errors); 1170 return -1; 1171 } 1172} 1173#endif 1174 1175PyObject *PyUnicode_EncodeUTF8(const Py_UNICODE *s, 1176 int size, 1177 const char *errors) 1178{ 1179 PyObject *v; 1180 char *p; 1181 unsigned int cbAllocated = 2 * size; 1182 unsigned int cbWritten = 0; 1183 int i = 0; 1184 1185 /* Short-cut for emtpy strings */ 1186 if (size == 0) 1187 return PyString_FromStringAndSize(NULL, 0); 1188 1189 /* We allocate 4 more bytes to have room for at least one full 1190 UTF-8 sequence; saves a few cycles in the loop below */ 1191 v = PyString_FromStringAndSize(NULL, cbAllocated + 4); 1192 if (v == NULL) 1193 return NULL; 1194 1195 p = PyString_AS_STRING(v); 1196 while (i < size) { 1197 Py_UCS4 ch = s[i++]; 1198 1199 if (ch < 0x80) { 1200 *p++ = (char) ch; 1201 cbWritten++; 1202 } 1203 1204 else if (ch < 0x0800) { 1205 *p++ = (char)(0xc0 | (ch >> 6)); 1206 *p++ = (char)(0x80 | (ch & 0x3f)); 1207 cbWritten += 2; 1208 } 1209 1210 else { 1211 1212 /* Assure that we have enough room for high order Unicode 1213 ordinals */ 1214 if (cbWritten >= cbAllocated) { 1215 cbAllocated += 4 * 10; 1216 if (_PyString_Resize(&v, cbAllocated + 4)) 1217 goto onError; 1218 p = PyString_AS_STRING(v) + cbWritten; 1219 } 1220 1221 if (ch < 0x10000) { 1222 /* Check for high surrogate */ 1223 if (0xD800 <= ch && ch <= 0xDBFF && i != size) { 1224 Py_UCS4 ch2 = s[i]; 1225 /* 
Check for low surrogate */ 1226 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) { 1227 ch = ((ch - 0xD800)<<10 | (ch2-0xDC00))+0x10000; 1228 *p++ = (char)((ch >> 18) | 0xf0); 1229 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f)); 1230 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); 1231 *p++ = (char)(0x80 | (ch & 0x3f)); 1232 i++; 1233 cbWritten += 4; 1234 continue; 1235 } 1236 /* Fall through: handles isolated high surrogates */ 1237 } 1238 *p++ = (char)(0xe0 | (ch >> 12)); 1239 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); 1240 *p++ = (char)(0x80 | (ch & 0x3f)); 1241 cbWritten += 3; 1242 1243 } else { 1244 *p++ = (char)(0xf0 | (ch>>18)); 1245 *p++ = (char)(0x80 | ((ch>>12) & 0x3f)); 1246 *p++ = (char)(0x80 | ((ch>>6) & 0x3f)); 1247 *p++ = (char)(0x80 | (ch & 0x3f)); 1248 cbWritten += 4; 1249 } 1250 } 1251 } 1252 *p = '\0'; 1253 if (_PyString_Resize(&v, cbWritten)) 1254 goto onError; 1255 return v; 1256 1257 onError: 1258 Py_DECREF(v); 1259 return NULL; 1260} 1261 1262PyObject *PyUnicode_AsUTF8String(PyObject *unicode) 1263{ 1264 if (!PyUnicode_Check(unicode)) { 1265 PyErr_BadArgument(); 1266 return NULL; 1267 } 1268 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode), 1269 PyUnicode_GET_SIZE(unicode), 1270 NULL); 1271} 1272 1273/* --- UTF-16 Codec ------------------------------------------------------- */ 1274 1275static 1276int utf16_decoding_error(Py_UNICODE **dest, 1277 const char *errors, 1278 const char *details) 1279{ 1280 if ((errors == NULL) || 1281 (strcmp(errors,"strict") == 0)) { 1282 PyErr_Format(PyExc_UnicodeError, 1283 "UTF-16 decoding error: %.400s", 1284 details); 1285 return -1; 1286 } 1287 else if (strcmp(errors,"ignore") == 0) { 1288 return 0; 1289 } 1290 else if (strcmp(errors,"replace") == 0) { 1291 if (dest) { 1292 **dest = Py_UNICODE_REPLACEMENT_CHARACTER; 1293 (*dest)++; 1294 } 1295 return 0; 1296 } 1297 else { 1298 PyErr_Format(PyExc_ValueError, 1299 "UTF-16 decoding error; " 1300 "unknown error handling code: %.400s", 1301 errors); 1302 return -1; 1303 } 1304} 
1305 1306PyObject * 1307PyUnicode_DecodeUTF16(const char *s, 1308 int size, 1309 const char *errors, 1310 int *byteorder) 1311{ 1312 PyUnicodeObject *unicode; 1313 Py_UNICODE *p; 1314 const unsigned char *q, *e; 1315 int bo = 0; /* assume native ordering by default */ 1316 const char *errmsg = ""; 1317 /* Offsets from q for retrieving byte pairs in the right order. */ 1318#ifdef BYTEORDER_IS_LITTLE_ENDIAN 1319 int ihi = 1, ilo = 0; 1320#else 1321 int ihi = 0, ilo = 1; 1322#endif 1323 1324 /* size should be an even number */ 1325 if (size & 1) { 1326 if (utf16_decoding_error(NULL, errors, "truncated data")) 1327 return NULL; 1328 --size; /* else ignore the oddball byte */ 1329 } 1330 1331 /* Note: size will always be longer than the resulting Unicode 1332 character count */ 1333 unicode = _PyUnicode_New(size); 1334 if (!unicode) 1335 return NULL; 1336 if (size == 0) 1337 return (PyObject *)unicode; 1338 1339 /* Unpack UTF-16 encoded data */ 1340 p = unicode->str; 1341 q = (unsigned char *)s; 1342 e = q + size; 1343 1344 if (byteorder) 1345 bo = *byteorder; 1346 1347 /* Check for BOM marks (U+FEFF) in the input and adjust current 1348 byte order setting accordingly. In native mode, the leading BOM 1349 mark is skipped, in all other modes, it is copied to the output 1350 stream as-is (giving a ZWNBSP character). 
*/ 1351 if (bo == 0) { 1352 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo]; 1353#ifdef BYTEORDER_IS_LITTLE_ENDIAN 1354 if (bom == 0xFEFF) { 1355 q += 2; 1356 bo = -1; 1357 } 1358 else if (bom == 0xFFFE) { 1359 q += 2; 1360 bo = 1; 1361 } 1362#else 1363 if (bom == 0xFEFF) { 1364 q += 2; 1365 bo = 1; 1366 } 1367 else if (bom == 0xFFFE) { 1368 q += 2; 1369 bo = -1; 1370 } 1371#endif 1372 } 1373 1374 if (bo == -1) { 1375 /* force LE */ 1376 ihi = 1; 1377 ilo = 0; 1378 } 1379 else if (bo == 1) { 1380 /* force BE */ 1381 ihi = 0; 1382 ilo = 1; 1383 } 1384 1385 while (q < e) { 1386 Py_UNICODE ch = (q[ihi] << 8) | q[ilo]; 1387 q += 2; 1388 1389 if (ch < 0xD800 || ch > 0xDFFF) { 1390 *p++ = ch; 1391 continue; 1392 } 1393 1394 /* UTF-16 code pair: */ 1395 if (q >= e) { 1396 errmsg = "unexpected end of data"; 1397 goto utf16Error; 1398 } 1399 if (0xD800 <= ch && ch <= 0xDBFF) { 1400 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo]; 1401 q += 2; 1402 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) { 1403#ifndef Py_UNICODE_WIDE 1404 *p++ = ch; 1405 *p++ = ch2; 1406#else 1407 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000; 1408#endif 1409 continue; 1410 } 1411 else { 1412 errmsg = "illegal UTF-16 surrogate"; 1413 goto utf16Error; 1414 } 1415 1416 } 1417 errmsg = "illegal encoding"; 1418 /* Fall through to report the error */ 1419 1420 utf16Error: 1421 if (utf16_decoding_error(&p, errors, errmsg)) 1422 goto onError; 1423 } 1424 1425 if (byteorder) 1426 *byteorder = bo; 1427 1428 /* Adjust length */ 1429 if (_PyUnicode_Resize(&unicode, p - unicode->str)) 1430 goto onError; 1431 1432 return (PyObject *)unicode; 1433 1434onError: 1435 Py_DECREF(unicode); 1436 return NULL; 1437} 1438 1439PyObject * 1440PyUnicode_EncodeUTF16(const Py_UNICODE *s, 1441 int size, 1442 const char *errors, 1443 int byteorder) 1444{ 1445 PyObject *v; 1446 unsigned char *p; 1447 int i, pairs; 1448 /* Offsets from p for storing byte pairs in the right order. 
*/ 1449#ifdef BYTEORDER_IS_LITTLE_ENDIAN 1450 int ihi = 1, ilo = 0; 1451#else 1452 int ihi = 0, ilo = 1; 1453#endif 1454 1455#define STORECHAR(CH) \ 1456 do { \ 1457 p[ihi] = ((CH) >> 8) & 0xff; \ 1458 p[ilo] = (CH) & 0xff; \ 1459 p += 2; \ 1460 } while(0) 1461 1462 for (i = pairs = 0; i < size; i++) 1463 if (s[i] >= 0x10000) 1464 pairs++; 1465 v = PyString_FromStringAndSize(NULL, 1466 2 * (size + pairs + (byteorder == 0))); 1467 if (v == NULL) 1468 return NULL; 1469 1470 p = (unsigned char *)PyString_AS_STRING(v); 1471 if (byteorder == 0) 1472 STORECHAR(0xFEFF); 1473 if (size == 0) 1474 return v; 1475 1476 if (byteorder == -1) { 1477 /* force LE */ 1478 ihi = 1; 1479 ilo = 0; 1480 } 1481 else if (byteorder == 1) { 1482 /* force BE */ 1483 ihi = 0; 1484 ilo = 1; 1485 } 1486 1487 while (size-- > 0) { 1488 Py_UNICODE ch = *s++; 1489 Py_UNICODE ch2 = 0; 1490 if (ch >= 0x10000) { 1491 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF); 1492 ch = 0xD800 | ((ch-0x10000) >> 10); 1493 } 1494 STORECHAR(ch); 1495 if (ch2) 1496 STORECHAR(ch2); 1497 } 1498 return v; 1499#undef STORECHAR 1500} 1501 1502PyObject *PyUnicode_AsUTF16String(PyObject *unicode) 1503{ 1504 if (!PyUnicode_Check(unicode)) { 1505 PyErr_BadArgument(); 1506 return NULL; 1507 } 1508 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode), 1509 PyUnicode_GET_SIZE(unicode), 1510 NULL, 1511 0); 1512} 1513 1514/* --- Unicode Escape Codec ----------------------------------------------- */ 1515 1516static 1517int unicodeescape_decoding_error(Py_UNICODE **x, 1518 const char *errors, 1519 const char *details) 1520{ 1521 if ((errors == NULL) || 1522 (strcmp(errors,"strict") == 0)) { 1523 PyErr_Format(PyExc_UnicodeError, 1524 "Unicode-Escape decoding error: %.400s", 1525 details); 1526 return -1; 1527 } 1528 else if (strcmp(errors,"ignore") == 0) { 1529 return 0; 1530 } 1531 else if (strcmp(errors,"replace") == 0) { 1532 **x = Py_UNICODE_REPLACEMENT_CHARACTER; 1533 (*x)++; 1534 return 0; 1535 } 1536 else { 1537 
PyErr_Format(PyExc_ValueError, 1538 "Unicode-Escape decoding error; " 1539 "unknown error handling code: %.400s", 1540 errors); 1541 return -1; 1542 } 1543} 1544 1545static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL; 1546 1547PyObject *PyUnicode_DecodeUnicodeEscape(const char *s, 1548 int size, 1549 const char *errors) 1550{ 1551 PyUnicodeObject *v; 1552 Py_UNICODE *p, *buf; 1553 const char *end; 1554 char* message; 1555 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */ 1556 1557 /* Escaped strings will always be longer than the resulting 1558 Unicode string, so we start with size here and then reduce the 1559 length after conversion to the true value. */ 1560 v = _PyUnicode_New(size); 1561 if (v == NULL) 1562 goto onError; 1563 if (size == 0) 1564 return (PyObject *)v; 1565 1566 p = buf = PyUnicode_AS_UNICODE(v); 1567 end = s + size; 1568 1569 while (s < end) { 1570 unsigned char c; 1571 Py_UNICODE x; 1572 int i, digits; 1573 1574 /* Non-escape characters are interpreted as Unicode ordinals */ 1575 if (*s != '\\') { 1576 *p++ = (unsigned char) *s++; 1577 continue; 1578 } 1579 1580 /* \ - Escapes */ 1581 s++; 1582 switch (*s++) { 1583 1584 /* \x escapes */ 1585 case '\n': break; 1586 case '\\': *p++ = '\\'; break; 1587 case '\'': *p++ = '\''; break; 1588 case '\"': *p++ = '\"'; break; 1589 case 'b': *p++ = '\b'; break; 1590 case 'f': *p++ = '\014'; break; /* FF */ 1591 case 't': *p++ = '\t'; break; 1592 case 'n': *p++ = '\n'; break; 1593 case 'r': *p++ = '\r'; break; 1594 case 'v': *p++ = '\013'; break; /* VT */ 1595 case 'a': *p++ = '\007'; break; /* BEL, not classic C */ 1596 1597 /* \OOO (octal) escapes */ 1598 case '0': case '1': case '2': case '3': 1599 case '4': case '5': case '6': case '7': 1600 x = s[-1] - '0'; 1601 if ('0' <= *s && *s <= '7') { 1602 x = (x<<3) + *s++ - '0'; 1603 if ('0' <= *s && *s <= '7') 1604 x = (x<<3) + *s++ - '0'; 1605 } 1606 *p++ = x; 1607 break; 1608 1609 /* hex escapes */ 1610 /* \xXX */ 1611 case 'x': 1612 digits = 2; 
1613 message = "truncated \\xXX escape"; 1614 goto hexescape; 1615 1616 /* \uXXXX */ 1617 case 'u': 1618 digits = 4; 1619 message = "truncated \\uXXXX escape"; 1620 goto hexescape; 1621 1622 /* \UXXXXXXXX */ 1623 case 'U': 1624 digits = 8; 1625 message = "truncated \\UXXXXXXXX escape"; 1626 hexescape: 1627 chr = 0; 1628 for (i = 0; i < digits; i++) { 1629 c = (unsigned char) s[i]; 1630 if (!isxdigit(c)) { 1631 if (unicodeescape_decoding_error(&p, errors, message)) 1632 goto onError; 1633 chr = 0xffffffff; 1634 i++; 1635 break; 1636 } 1637 chr = (chr<<4) & ~0xF; 1638 if (c >= '0' && c <= '9') 1639 chr += c - '0'; 1640 else if (c >= 'a' && c <= 'f') 1641 chr += 10 + c - 'a'; 1642 else 1643 chr += 10 + c - 'A'; 1644 } 1645 s += i; 1646 if (chr == 0xffffffff) 1647 /* _decoding_error will have already written into the 1648 target buffer. */ 1649 break; 1650 store: 1651 /* when we get here, chr is a 32-bit unicode character */ 1652 if (chr <= 0xffff) 1653 /* UCS-2 character */ 1654 *p++ = (Py_UNICODE) chr; 1655 else if (chr <= 0x10ffff) { 1656 /* UCS-4 character. Either store directly, or as 1657 surrogate pair. 
*/ 1658#ifdef Py_UNICODE_WIDE 1659 *p++ = chr; 1660#else 1661 chr -= 0x10000L; 1662 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10); 1663 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF); 1664#endif 1665 } else { 1666 if (unicodeescape_decoding_error( 1667 &p, errors, 1668 "illegal Unicode character") 1669 ) 1670 goto onError; 1671 } 1672 break; 1673 1674 /* \N{name} */ 1675 case 'N': 1676 message = "malformed \\N character escape"; 1677 if (ucnhash_CAPI == NULL) { 1678 /* load the unicode data module */ 1679 PyObject *m, *v; 1680 m = PyImport_ImportModule("unicodedata"); 1681 if (m == NULL) 1682 goto ucnhashError; 1683 v = PyObject_GetAttrString(m, "ucnhash_CAPI"); 1684 Py_DECREF(m); 1685 if (v == NULL) 1686 goto ucnhashError; 1687 ucnhash_CAPI = PyCObject_AsVoidPtr(v); 1688 Py_DECREF(v); 1689 if (ucnhash_CAPI == NULL) 1690 goto ucnhashError; 1691 } 1692 if (*s == '{') { 1693 const char *start = s+1; 1694 /* look for the closing brace */ 1695 while (*s != '}' && s < end) 1696 s++; 1697 if (s > start && s < end && *s == '}') { 1698 /* found a name. look it up in the unicode database */ 1699 message = "unknown Unicode character name"; 1700 s++; 1701 if (ucnhash_CAPI->getcode(start, s-start-1, &chr)) 1702 goto store; 1703 } 1704 } 1705 if (unicodeescape_decoding_error(&p, errors, message)) 1706 goto onError; 1707 break; 1708 1709 default: 1710 if (s > end) { 1711 if (unicodeescape_decoding_error(&p, errors, "\\ at end of string")) 1712 goto onError; 1713 } 1714 else { 1715 *p++ = '\\'; 1716 *p++ = (unsigned char)s[-1]; 1717 } 1718 break; 1719 } 1720 } 1721 if (_PyUnicode_Resize(&v, (int)(p - buf))) 1722 goto onError; 1723 return (PyObject *)v; 1724 1725ucnhashError: 1726 PyErr_SetString( 1727 PyExc_UnicodeError, 1728 "\\N escapes not supported (can't load unicodedata module)" 1729 ); 1730 return NULL; 1731 1732onError: 1733 Py_XDECREF(v); 1734 return NULL; 1735} 1736 1737/* Return a Unicode-Escape string version of the Unicode object. 
1738 1739 If quotes is true, the string is enclosed in u"" or u'' quotes as 1740 appropriate. 1741 1742*/ 1743 1744static const Py_UNICODE *findchar(const Py_UNICODE *s, 1745 int size, 1746 Py_UNICODE ch); 1747 1748static 1749PyObject *unicodeescape_string(const Py_UNICODE *s, 1750 int size, 1751 int quotes) 1752{ 1753 PyObject *repr; 1754 char *p; 1755 1756 static const char *hexdigit = "0123456789abcdef"; 1757 1758 repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1); 1759 if (repr == NULL) 1760 return NULL; 1761 1762 p = PyString_AS_STRING(repr); 1763 1764 if (quotes) { 1765 *p++ = 'u'; 1766 *p++ = (findchar(s, size, '\'') && 1767 !findchar(s, size, '"')) ? '"' : '\''; 1768 } 1769 while (size-- > 0) { 1770 Py_UNICODE ch = *s++; 1771 1772 /* Escape quotes */ 1773 if (quotes && 1774 (ch == (Py_UNICODE) PyString_AS_STRING(repr)[1] || ch == '\\')) { 1775 *p++ = '\\'; 1776 *p++ = (char) ch; 1777 continue; 1778 } 1779 1780#ifdef Py_UNICODE_WIDE 1781 /* Map 21-bit characters to '\U00xxxxxx' */ 1782 else if (ch >= 0x10000) { 1783 int offset = p - PyString_AS_STRING(repr); 1784 1785 /* Resize the string if necessary */ 1786 if (offset + 12 > PyString_GET_SIZE(repr)) { 1787 if (_PyString_Resize(&repr, PyString_GET_SIZE(repr) + 100)) 1788 goto onError; 1789 p = PyString_AS_STRING(repr) + offset; 1790 } 1791 1792 *p++ = '\\'; 1793 *p++ = 'U'; 1794 *p++ = hexdigit[(ch >> 28) & 0x0000000F]; 1795 *p++ = hexdigit[(ch >> 24) & 0x0000000F]; 1796 *p++ = hexdigit[(ch >> 20) & 0x0000000F]; 1797 *p++ = hexdigit[(ch >> 16) & 0x0000000F]; 1798 *p++ = hexdigit[(ch >> 12) & 0x0000000F]; 1799 *p++ = hexdigit[(ch >> 8) & 0x0000000F]; 1800 *p++ = hexdigit[(ch >> 4) & 0x0000000F]; 1801 *p++ = hexdigit[ch & 0x0000000F]; 1802 continue; 1803 } 1804#endif 1805 /* Map UTF-16 surrogate pairs to Unicode \UXXXXXXXX escapes */ 1806 else if (ch >= 0xD800 && ch < 0xDC00) { 1807 Py_UNICODE ch2; 1808 Py_UCS4 ucs; 1809 1810 ch2 = *s++; 1811 size--; 1812 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) { 1813 ucs = 
(((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000; 1814 *p++ = '\\'; 1815 *p++ = 'U'; 1816 *p++ = hexdigit[(ucs >> 28) & 0x0000000F]; 1817 *p++ = hexdigit[(ucs >> 24) & 0x0000000F]; 1818 *p++ = hexdigit[(ucs >> 20) & 0x0000000F]; 1819 *p++ = hexdigit[(ucs >> 16) & 0x0000000F]; 1820 *p++ = hexdigit[(ucs >> 12) & 0x0000000F]; 1821 *p++ = hexdigit[(ucs >> 8) & 0x0000000F]; 1822 *p++ = hexdigit[(ucs >> 4) & 0x0000000F]; 1823 *p++ = hexdigit[ucs & 0x0000000F]; 1824 continue; 1825 } 1826 /* Fall through: isolated surrogates are copied as-is */ 1827 s--; 1828 size++; 1829 } 1830 1831 /* Map 16-bit characters to '\uxxxx' */ 1832 if (ch >= 256) { 1833 *p++ = '\\'; 1834 *p++ = 'u'; 1835 *p++ = hexdigit[(ch >> 12) & 0x000F]; 1836 *p++ = hexdigit[(ch >> 8) & 0x000F]; 1837 *p++ = hexdigit[(ch >> 4) & 0x000F]; 1838 *p++ = hexdigit[ch & 0x000F]; 1839 } 1840 1841 /* Map special whitespace to '\t', \n', '\r' */ 1842 else if (ch == '\t') { 1843 *p++ = '\\'; 1844 *p++ = 't'; 1845 } 1846 else if (ch == '\n') { 1847 *p++ = '\\'; 1848 *p++ = 'n'; 1849 } 1850 else if (ch == '\r') { 1851 *p++ = '\\'; 1852 *p++ = 'r'; 1853 } 1854 1855 /* Map non-printable US ASCII to '\xhh' */ 1856 else if (ch < ' ' || ch >= 0x7F) { 1857 *p++ = '\\'; 1858 *p++ = 'x'; 1859 *p++ = hexdigit[(ch >> 4) & 0x000F]; 1860 *p++ = hexdigit[ch & 0x000F]; 1861 } 1862 1863 /* Copy everything else as-is */ 1864 else 1865 *p++ = (char) ch; 1866 } 1867 if (quotes) 1868 *p++ = PyString_AS_STRING(repr)[1]; 1869 1870 *p = '\0'; 1871 if (_PyString_Resize(&repr, p - PyString_AS_STRING(repr))) 1872 goto onError; 1873 1874 return repr; 1875 1876 onError: 1877 Py_DECREF(repr); 1878 return NULL; 1879} 1880 1881PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s, 1882 int size) 1883{ 1884 return unicodeescape_string(s, size, 0); 1885} 1886 1887PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode) 1888{ 1889 if (!PyUnicode_Check(unicode)) { 1890 PyErr_BadArgument(); 1891 return NULL; 1892 } 1893 return 
PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode), 1894 PyUnicode_GET_SIZE(unicode)); 1895} 1896 1897/* --- Raw Unicode Escape Codec ------------------------------------------- */ 1898 1899PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s, 1900 int size, 1901 const char *errors) 1902{ 1903 PyUnicodeObject *v; 1904 Py_UNICODE *p, *buf; 1905 const char *end; 1906 const char *bs; 1907 1908 /* Escaped strings will always be longer than the resulting 1909 Unicode string, so we start with size here and then reduce the 1910 length after conversion to the true value. */ 1911 v = _PyUnicode_New(size); 1912 if (v == NULL) 1913 goto onError; 1914 if (size == 0) 1915 return (PyObject *)v; 1916 p = buf = PyUnicode_AS_UNICODE(v); 1917 end = s + size; 1918 while (s < end) { 1919 unsigned char c; 1920 Py_UCS4 x; 1921 int i; 1922 1923 /* Non-escape characters are interpreted as Unicode ordinals */ 1924 if (*s != '\\') { 1925 *p++ = (unsigned char)*s++; 1926 continue; 1927 } 1928 1929 /* \u-escapes are only interpreted iff the number of leading 1930 backslashes if odd */ 1931 bs = s; 1932 for (;s < end;) { 1933 if (*s != '\\') 1934 break; 1935 *p++ = (unsigned char)*s++; 1936 } 1937 if (((s - bs) & 1) == 0 || 1938 s >= end || 1939 *s != 'u') { 1940 continue; 1941 } 1942 p--; 1943 s++; 1944 1945 /* \uXXXX with 4 hex digits */ 1946 for (x = 0, i = 0; i < 4; i++) { 1947 c = (unsigned char)s[i]; 1948 if (!isxdigit(c)) { 1949 if (unicodeescape_decoding_error(&p, errors, 1950 "truncated \\uXXXX")) 1951 goto onError; 1952 x = 0xffffffff; 1953 i++; 1954 break; 1955 } 1956 x = (x<<4) & ~0xF; 1957 if (c >= '0' && c <= '9') 1958 x += c - '0'; 1959 else if (c >= 'a' && c <= 'f') 1960 x += 10 + c - 'a'; 1961 else 1962 x += 10 + c - 'A'; 1963 } 1964 s += i; 1965 if (x != 0xffffffff) 1966 *p++ = x; 1967 } 1968 if (_PyUnicode_Resize(&v, (int)(p - buf))) 1969 goto onError; 1970 return (PyObject *)v; 1971 1972 onError: 1973 Py_XDECREF(v); 1974 return NULL; 1975} 1976 1977PyObject 
*PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s, 1978 int size) 1979{ 1980 PyObject *repr; 1981 char *p; 1982 char *q; 1983 1984 static const char *hexdigit = "0123456789abcdef"; 1985 1986 repr = PyString_FromStringAndSize(NULL, 6 * size); 1987 if (repr == NULL) 1988 return NULL; 1989 if (size == 0) 1990 return repr; 1991 1992 p = q = PyString_AS_STRING(repr); 1993 while (size-- > 0) { 1994 Py_UNICODE ch = *s++; 1995 /* Map 16-bit characters to '\uxxxx' */ 1996 if (ch >= 256) { 1997 *p++ = '\\'; 1998 *p++ = 'u'; 1999 *p++ = hexdigit[(ch >> 12) & 0xf]; 2000 *p++ = hexdigit[(ch >> 8) & 0xf]; 2001 *p++ = hexdigit[(ch >> 4) & 0xf]; 2002 *p++ = hexdigit[ch & 15]; 2003 } 2004 /* Copy everything else as-is */ 2005 else 2006 *p++ = (char) ch; 2007 } 2008 *p = '\0'; 2009 if (_PyString_Resize(&repr, p - q)) 2010 goto onError; 2011 2012 return repr; 2013 2014 onError: 2015 Py_DECREF(repr); 2016 return NULL; 2017} 2018 2019PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode) 2020{ 2021 if (!PyUnicode_Check(unicode)) { 2022 PyErr_BadArgument(); 2023 return NULL; 2024 } 2025 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode), 2026 PyUnicode_GET_SIZE(unicode)); 2027} 2028 2029/* --- Latin-1 Codec ------------------------------------------------------ */ 2030 2031PyObject *PyUnicode_DecodeLatin1(const char *s, 2032 int size, 2033 const char *errors) 2034{ 2035 PyUnicodeObject *v; 2036 Py_UNICODE *p; 2037 2038 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. 
*/ 2039 if (size == 1 && *(unsigned char*)s < 256) { 2040 Py_UNICODE r = *(unsigned char*)s; 2041 return PyUnicode_FromUnicode(&r, 1); 2042 } 2043 2044 v = _PyUnicode_New(size); 2045 if (v == NULL) 2046 goto onError; 2047 if (size == 0) 2048 return (PyObject *)v; 2049 p = PyUnicode_AS_UNICODE(v); 2050 while (size-- > 0) 2051 *p++ = (unsigned char)*s++; 2052 return (PyObject *)v; 2053 2054 onError: 2055 Py_XDECREF(v); 2056 return NULL; 2057} 2058 2059static 2060int latin1_encoding_error(const Py_UNICODE **source, 2061 char **dest, 2062 const char *errors, 2063 const char *details) 2064{ 2065 if ((errors == NULL) || 2066 (strcmp(errors,"strict") == 0)) { 2067 PyErr_Format(PyExc_UnicodeError, 2068 "Latin-1 encoding error: %.400s", 2069 details); 2070 return -1; 2071 } 2072 else if (strcmp(errors,"ignore") == 0) { 2073 return 0; 2074 } 2075 else if (strcmp(errors,"replace") == 0) { 2076 **dest = '?'; 2077 (*dest)++; 2078 return 0; 2079 } 2080 else { 2081 PyErr_Format(PyExc_ValueError, 2082 "Latin-1 encoding error; " 2083 "unknown error handling code: %.400s", 2084 errors); 2085 return -1; 2086 } 2087} 2088 2089PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p, 2090 int size, 2091 const char *errors) 2092{ 2093 PyObject *repr; 2094 char *s, *start; 2095 2096 repr = PyString_FromStringAndSize(NULL, size); 2097 if (repr == NULL) 2098 return NULL; 2099 if (size == 0) 2100 return repr; 2101 2102 s = PyString_AS_STRING(repr); 2103 start = s; 2104 while (size-- > 0) { 2105 Py_UNICODE ch = *p++; 2106 if (ch >= 256) { 2107 if (latin1_encoding_error(&p, &s, errors, 2108 "ordinal not in range(256)")) 2109 goto onError; 2110 } 2111 else 2112 *s++ = (char)ch; 2113 } 2114 /* Resize if error handling skipped some characters */ 2115 if (s - start < PyString_GET_SIZE(repr)) 2116 if (_PyString_Resize(&repr, s - start)) 2117 goto onError; 2118 return repr; 2119 2120 onError: 2121 Py_DECREF(repr); 2122 return NULL; 2123} 2124 2125PyObject *PyUnicode_AsLatin1String(PyObject *unicode) 
2126{ 2127 if (!PyUnicode_Check(unicode)) { 2128 PyErr_BadArgument(); 2129 return NULL; 2130 } 2131 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode), 2132 PyUnicode_GET_SIZE(unicode), 2133 NULL); 2134} 2135 2136/* --- 7-bit ASCII Codec -------------------------------------------------- */ 2137 2138static 2139int ascii_decoding_error(const char **source, 2140 Py_UNICODE **dest, 2141 const char *errors, 2142 const char *details) 2143{ 2144 if ((errors == NULL) || 2145 (strcmp(errors,"strict") == 0)) { 2146 PyErr_Format(PyExc_UnicodeError, 2147 "ASCII decoding error: %.400s", 2148 details); 2149 return -1; 2150 } 2151 else if (strcmp(errors,"ignore") == 0) { 2152 return 0; 2153 } 2154 else if (strcmp(errors,"replace") == 0) { 2155 **dest = Py_UNICODE_REPLACEMENT_CHARACTER; 2156 (*dest)++; 2157 return 0; 2158 } 2159 else { 2160 PyErr_Format(PyExc_ValueError, 2161 "ASCII decoding error; " 2162 "unknown error handling code: %.400s", 2163 errors); 2164 return -1; 2165 } 2166} 2167 2168PyObject *PyUnicode_DecodeASCII(const char *s, 2169 int size, 2170 const char *errors) 2171{ 2172 PyUnicodeObject *v; 2173 Py_UNICODE *p; 2174 2175 /* ASCII is equivalent to the first 128 ordinals in Unicode. 
*/ 2176 if (size == 1 && *(unsigned char*)s < 128) { 2177 Py_UNICODE r = *(unsigned char*)s; 2178 return PyUnicode_FromUnicode(&r, 1); 2179 } 2180 2181 v = _PyUnicode_New(size); 2182 if (v == NULL) 2183 goto onError; 2184 if (size == 0) 2185 return (PyObject *)v; 2186 p = PyUnicode_AS_UNICODE(v); 2187 while (size-- > 0) { 2188 register unsigned char c; 2189 2190 c = (unsigned char)*s++; 2191 if (c < 128) 2192 *p++ = c; 2193 else if (ascii_decoding_error(&s, &p, errors, 2194 "ordinal not in range(128)")) 2195 goto onError; 2196 } 2197 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v)) 2198 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v)))) 2199 goto onError; 2200 return (PyObject *)v; 2201 2202 onError: 2203 Py_XDECREF(v); 2204 return NULL; 2205} 2206 2207static 2208int ascii_encoding_error(const Py_UNICODE **source, 2209 char **dest, 2210 const char *errors, 2211 const char *details) 2212{ 2213 if ((errors == NULL) || 2214 (strcmp(errors,"strict") == 0)) { 2215 PyErr_Format(PyExc_UnicodeError, 2216 "ASCII encoding error: %.400s", 2217 details); 2218 return -1; 2219 } 2220 else if (strcmp(errors,"ignore") == 0) { 2221 return 0; 2222 } 2223 else if (strcmp(errors,"replace") == 0) { 2224 **dest = '?'; 2225 (*dest)++; 2226 return 0; 2227 } 2228 else { 2229 PyErr_Format(PyExc_ValueError, 2230 "ASCII encoding error; " 2231 "unknown error handling code: %.400s", 2232 errors); 2233 return -1; 2234 } 2235} 2236 2237PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p, 2238 int size, 2239 const char *errors) 2240{ 2241 PyObject *repr; 2242 char *s, *start; 2243 2244 repr = PyString_FromStringAndSize(NULL, size); 2245 if (repr == NULL) 2246 return NULL; 2247 if (size == 0) 2248 return repr; 2249 2250 s = PyString_AS_STRING(repr); 2251 start = s; 2252 while (size-- > 0) { 2253 Py_UNICODE ch = *p++; 2254 if (ch >= 128) { 2255 if (ascii_encoding_error(&p, &s, errors, 2256 "ordinal not in range(128)")) 2257 goto onError; 2258 } 2259 else 2260 *s++ = (char)ch; 2261 
} 2262 /* Resize if error handling skipped some characters */ 2263 if (s - start < PyString_GET_SIZE(repr)) 2264 if (_PyString_Resize(&repr, s - start)) 2265 goto onError; 2266 return repr; 2267 2268 onError: 2269 Py_DECREF(repr); 2270 return NULL; 2271} 2272 2273PyObject *PyUnicode_AsASCIIString(PyObject *unicode) 2274{ 2275 if (!PyUnicode_Check(unicode)) { 2276 PyErr_BadArgument(); 2277 return NULL; 2278 } 2279 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode), 2280 PyUnicode_GET_SIZE(unicode), 2281 NULL); 2282} 2283 2284#if defined(MS_WIN32) && defined(HAVE_USABLE_WCHAR_T) 2285 2286/* --- MBCS codecs for Windows -------------------------------------------- */ 2287 2288PyObject *PyUnicode_DecodeMBCS(const char *s, 2289 int size, 2290 const char *errors) 2291{ 2292 PyUnicodeObject *v; 2293 Py_UNICODE *p; 2294 2295 /* First get the size of the result */ 2296 DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0); 2297 if (size > 0 && usize==0) 2298 return PyErr_SetFromWindowsErrWithFilename(0, NULL); 2299 2300 v = _PyUnicode_New(usize); 2301 if (v == NULL) 2302 return NULL; 2303 if (usize == 0) 2304 return (PyObject *)v; 2305 p = PyUnicode_AS_UNICODE(v); 2306 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) { 2307 Py_DECREF(v); 2308 return PyErr_SetFromWindowsErrWithFilename(0, NULL); 2309 } 2310 2311 return (PyObject *)v; 2312} 2313 2314PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p, 2315 int size, 2316 const char *errors) 2317{ 2318 PyObject *repr; 2319 char *s; 2320 DWORD mbcssize; 2321 2322 /* If there are no characters, bail now! 
*/ 2323 if (size==0) 2324 return PyString_FromString(""); 2325 2326 /* First get the size of the result */ 2327 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL); 2328 if (mbcssize==0) 2329 return PyErr_SetFromWindowsErrWithFilename(0, NULL); 2330 2331 repr = PyString_FromStringAndSize(NULL, mbcssize); 2332 if (repr == NULL) 2333 return NULL; 2334 if (mbcssize == 0) 2335 return repr; 2336 2337 /* Do the conversion */ 2338 s = PyString_AS_STRING(repr); 2339 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) { 2340 Py_DECREF(repr); 2341 return PyErr_SetFromWindowsErrWithFilename(0, NULL); 2342 } 2343 return repr; 2344} 2345 2346#endif /* MS_WIN32 */ 2347 2348/* --- Character Mapping Codec -------------------------------------------- */ 2349 2350static 2351int charmap_decoding_error(const char **source, 2352 Py_UNICODE **dest, 2353 const char *errors, 2354 const char *details) 2355{ 2356 if ((errors == NULL) || 2357 (strcmp(errors,"strict") == 0)) { 2358 PyErr_Format(PyExc_UnicodeError, 2359 "charmap decoding error: %.400s", 2360 details); 2361 return -1; 2362 } 2363 else if (strcmp(errors,"ignore") == 0) { 2364 return 0; 2365 } 2366 else if (strcmp(errors,"replace") == 0) { 2367 **dest = Py_UNICODE_REPLACEMENT_CHARACTER; 2368 (*dest)++; 2369 return 0; 2370 } 2371 else { 2372 PyErr_Format(PyExc_ValueError, 2373 "charmap decoding error; " 2374 "unknown error handling code: %.400s", 2375 errors); 2376 return -1; 2377 } 2378} 2379 2380PyObject *PyUnicode_DecodeCharmap(const char *s, 2381 int size, 2382 PyObject *mapping, 2383 const char *errors) 2384{ 2385 PyUnicodeObject *v; 2386 Py_UNICODE *p; 2387 int extrachars = 0; 2388 2389 /* Default to Latin-1 */ 2390 if (mapping == NULL) 2391 return PyUnicode_DecodeLatin1(s, size, errors); 2392 2393 v = _PyUnicode_New(size); 2394 if (v == NULL) 2395 goto onError; 2396 if (size == 0) 2397 return (PyObject *)v; 2398 p = PyUnicode_AS_UNICODE(v); 2399 while (size-- > 0) { 2400 unsigned 
char ch = *s++; 2401 PyObject *w, *x; 2402 2403 /* Get mapping (char ordinal -> integer, Unicode char or None) */ 2404 w = PyInt_FromLong((long)ch); 2405 if (w == NULL) 2406 goto onError; 2407 x = PyObject_GetItem(mapping, w); 2408 Py_DECREF(w); 2409 if (x == NULL) { 2410 if (PyErr_ExceptionMatches(PyExc_LookupError)) { 2411 /* No mapping found means: mapping is undefined. */ 2412 PyErr_Clear(); 2413 x = Py_None; 2414 Py_INCREF(x); 2415 } else 2416 goto onError; 2417 } 2418 2419 /* Apply mapping */ 2420 if (PyInt_Check(x)) { 2421 long value = PyInt_AS_LONG(x); 2422 if (value < 0 || value > 65535) { 2423 PyErr_SetString(PyExc_TypeError, 2424 "character mapping must be in range(65536)"); 2425 Py_DECREF(x); 2426 goto onError; 2427 } 2428 *p++ = (Py_UNICODE)value; 2429 } 2430 else if (x == Py_None) { 2431 /* undefined mapping */ 2432 if (charmap_decoding_error(&s, &p, errors, 2433 "character maps to <undefined>")) { 2434 Py_DECREF(x); 2435 goto onError; 2436 } 2437 } 2438 else if (PyUnicode_Check(x)) { 2439 int targetsize = PyUnicode_GET_SIZE(x); 2440 2441 if (targetsize == 1) 2442 /* 1-1 mapping */ 2443 *p++ = *PyUnicode_AS_UNICODE(x); 2444 2445 else if (targetsize > 1) { 2446 /* 1-n mapping */ 2447 if (targetsize > extrachars) { 2448 /* resize first */ 2449 int oldpos = (int)(p - PyUnicode_AS_UNICODE(v)); 2450 int needed = (targetsize - extrachars) + \ 2451 (targetsize << 2); 2452 extrachars += needed; 2453 if (_PyUnicode_Resize(&v, 2454 PyUnicode_GET_SIZE(v) + needed)) { 2455 Py_DECREF(x); 2456 goto onError; 2457 } 2458 p = PyUnicode_AS_UNICODE(v) + oldpos; 2459 } 2460 Py_UNICODE_COPY(p, 2461 PyUnicode_AS_UNICODE(x), 2462 targetsize); 2463 p += targetsize; 2464 extrachars -= targetsize; 2465 } 2466 /* 1-0 mapping: skip the character */ 2467 } 2468 else { 2469 /* wrong return value */ 2470 PyErr_SetString(PyExc_TypeError, 2471 "character mapping must return integer, None or unicode"); 2472 Py_DECREF(x); 2473 goto onError; 2474 } 2475 Py_DECREF(x); 2476 } 2477 if (p - 
PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v)) 2478 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v)))) 2479 goto onError; 2480 return (PyObject *)v; 2481 2482 onError: 2483 Py_XDECREF(v); 2484 return NULL; 2485} 2486 2487static 2488int charmap_encoding_error(const Py_UNICODE **source, 2489 char **dest, 2490 const char *errors, 2491 const char *details) 2492{ 2493 if ((errors == NULL) || 2494 (strcmp(errors,"strict") == 0)) { 2495 PyErr_Format(PyExc_UnicodeError, 2496 "charmap encoding error: %.400s", 2497 details); 2498 return -1; 2499 } 2500 else if (strcmp(errors,"ignore") == 0) { 2501 return 0; 2502 } 2503 else if (strcmp(errors,"replace") == 0) { 2504 **dest = '?'; 2505 (*dest)++; 2506 return 0; 2507 } 2508 else { 2509 PyErr_Format(PyExc_ValueError, 2510 "charmap encoding error; " 2511 "unknown error handling code: %.400s", 2512 errors); 2513 return -1; 2514 } 2515} 2516 2517PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p, 2518 int size, 2519 PyObject *mapping, 2520 const char *errors) 2521{ 2522 PyObject *v; 2523 char *s; 2524 int extrachars = 0; 2525 2526 /* Default to Latin-1 */ 2527 if (mapping == NULL) 2528 return PyUnicode_EncodeLatin1(p, size, errors); 2529 2530 v = PyString_FromStringAndSize(NULL, size); 2531 if (v == NULL) 2532 return NULL; 2533 if (size == 0) 2534 return v; 2535 s = PyString_AS_STRING(v); 2536 while (size-- > 0) { 2537 Py_UNICODE ch = *p++; 2538 PyObject *w, *x; 2539 2540 /* Get mapping (Unicode ordinal -> string char, integer or None) */ 2541 w = PyInt_FromLong((long)ch); 2542 if (w == NULL) 2543 goto onError; 2544 x = PyObject_GetItem(mapping, w); 2545 Py_DECREF(w); 2546 if (x == NULL) { 2547 if (PyErr_ExceptionMatches(PyExc_LookupError)) { 2548 /* No mapping found means: mapping is undefined. 
*/ 2549 PyErr_Clear(); 2550 x = Py_None; 2551 Py_INCREF(x); 2552 } else 2553 goto onError; 2554 } 2555 2556 /* Apply mapping */ 2557 if (PyInt_Check(x)) { 2558 long value = PyInt_AS_LONG(x); 2559 if (value < 0 || value > 255) { 2560 PyErr_SetString(PyExc_TypeError, 2561 "character mapping must be in range(256)"); 2562 Py_DECREF(x); 2563 goto onError; 2564 } 2565 *s++ = (char)value; 2566 } 2567 else if (x == Py_None) { 2568 /* undefined mapping */ 2569 if (charmap_encoding_error(&p, &s, errors, 2570 "character maps to <undefined>")) { 2571 Py_DECREF(x); 2572 goto onError; 2573 } 2574 } 2575 else if (PyString_Check(x)) { 2576 int targetsize = PyString_GET_SIZE(x); 2577 2578 if (targetsize == 1) 2579 /* 1-1 mapping */ 2580 *s++ = *PyString_AS_STRING(x); 2581 2582 else if (targetsize > 1) { 2583 /* 1-n mapping */ 2584 if (targetsize > extrachars) { 2585 /* resize first */ 2586 int oldpos = (int)(s - PyString_AS_STRING(v)); 2587 int needed = (targetsize - extrachars) + \ 2588 (targetsize << 2); 2589 extrachars += needed; 2590 if (_PyString_Resize(&v, PyString_GET_SIZE(v) + needed)) { 2591 Py_DECREF(x); 2592 goto onError; 2593 } 2594 s = PyString_AS_STRING(v) + oldpos; 2595 } 2596 memcpy(s, PyString_AS_STRING(x), targetsize); 2597 s += targetsize; 2598 extrachars -= targetsize; 2599 } 2600 /* 1-0 mapping: skip the character */ 2601 } 2602 else { 2603 /* wrong return value */ 2604 PyErr_SetString(PyExc_TypeError, 2605 "character mapping must return integer, None or unicode"); 2606 Py_DECREF(x); 2607 goto onError; 2608 } 2609 Py_DECREF(x); 2610 } 2611 if (s - PyString_AS_STRING(v) < PyString_GET_SIZE(v)) 2612 if (_PyString_Resize(&v, (int)(s - PyString_AS_STRING(v)))) 2613 goto onError; 2614 return v; 2615 2616 onError: 2617 Py_DECREF(v); 2618 return NULL; 2619} 2620 2621PyObject *PyUnicode_AsCharmapString(PyObject *unicode, 2622 PyObject *mapping) 2623{ 2624 if (!PyUnicode_Check(unicode) || mapping == NULL) { 2625 PyErr_BadArgument(); 2626 return NULL; 2627 } 2628 return 
PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode), 2629 PyUnicode_GET_SIZE(unicode), 2630 mapping, 2631 NULL); 2632} 2633 2634static 2635int translate_error(const Py_UNICODE **source, 2636 Py_UNICODE **dest, 2637 const char *errors, 2638 const char *details) 2639{ 2640 if ((errors == NULL) || 2641 (strcmp(errors,"strict") == 0)) { 2642 PyErr_Format(PyExc_UnicodeError, 2643 "translate error: %.400s", 2644 details); 2645 return -1; 2646 } 2647 else if (strcmp(errors,"ignore") == 0) { 2648 return 0; 2649 } 2650 else if (strcmp(errors,"replace") == 0) { 2651 **dest = '?'; 2652 (*dest)++; 2653 return 0; 2654 } 2655 else { 2656 PyErr_Format(PyExc_ValueError, 2657 "translate error; " 2658 "unknown error handling code: %.400s", 2659 errors); 2660 return -1; 2661 } 2662} 2663 2664PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *s, 2665 int size, 2666 PyObject *mapping, 2667 const char *errors) 2668{ 2669 PyUnicodeObject *v; 2670 Py_UNICODE *p; 2671 2672 if (mapping == NULL) { 2673 PyErr_BadArgument(); 2674 return NULL; 2675 } 2676 2677 /* Output will never be longer than input */ 2678 v = _PyUnicode_New(size); 2679 if (v == NULL) 2680 goto onError; 2681 if (size == 0) 2682 goto done; 2683 p = PyUnicode_AS_UNICODE(v); 2684 while (size-- > 0) { 2685 Py_UNICODE ch = *s++; 2686 PyObject *w, *x; 2687 2688 /* Get mapping */ 2689 w = PyInt_FromLong(ch); 2690 if (w == NULL) 2691 goto onError; 2692 x = PyObject_GetItem(mapping, w); 2693 Py_DECREF(w); 2694 if (x == NULL) { 2695 if (PyErr_ExceptionMatches(PyExc_LookupError)) { 2696 /* No mapping found: default to 1-1 mapping */ 2697 PyErr_Clear(); 2698 *p++ = ch; 2699 continue; 2700 } 2701 goto onError; 2702 } 2703 2704 /* Apply mapping */ 2705 if (PyInt_Check(x)) 2706 *p++ = (Py_UNICODE)PyInt_AS_LONG(x); 2707 else if (x == Py_None) { 2708 /* undefined mapping */ 2709 if (translate_error(&s, &p, errors, 2710 "character maps to <undefined>")) { 2711 Py_DECREF(x); 2712 goto onError; 2713 } 2714 } 2715 else if 
(PyUnicode_Check(x)) { 2716 if (PyUnicode_GET_SIZE(x) != 1) { 2717 /* 1-n mapping */ 2718 PyErr_SetString(PyExc_NotImplementedError, 2719 "1-n mappings are currently not implemented"); 2720 Py_DECREF(x); 2721 goto onError; 2722 } 2723 *p++ = *PyUnicode_AS_UNICODE(x); 2724 } 2725 else { 2726 /* wrong return value */ 2727 PyErr_SetString(PyExc_TypeError, 2728 "translate mapping must return integer, None or unicode"); 2729 Py_DECREF(x); 2730 goto onError; 2731 } 2732 Py_DECREF(x); 2733 } 2734 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v)) 2735 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v)))) 2736 goto onError; 2737 2738 done: 2739 return (PyObject *)v; 2740 2741 onError: 2742 Py_XDECREF(v); 2743 return NULL; 2744} 2745 2746PyObject *PyUnicode_Translate(PyObject *str, 2747 PyObject *mapping, 2748 const char *errors) 2749{ 2750 PyObject *result; 2751 2752 str = PyUnicode_FromObject(str); 2753 if (str == NULL) 2754 goto onError; 2755 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str), 2756 PyUnicode_GET_SIZE(str), 2757 mapping, 2758 errors); 2759 Py_DECREF(str); 2760 return result; 2761 2762 onError: 2763 Py_XDECREF(str); 2764 return NULL; 2765} 2766 2767/* --- Decimal Encoder ---------------------------------------------------- */ 2768 2769int PyUnicode_EncodeDecimal(Py_UNICODE *s, 2770 int length, 2771 char *output, 2772 const char *errors) 2773{ 2774 Py_UNICODE *p, *end; 2775 2776 if (output == NULL) { 2777 PyErr_BadArgument(); 2778 return -1; 2779 } 2780 2781 p = s; 2782 end = s + length; 2783 while (p < end) { 2784 register Py_UNICODE ch = *p++; 2785 int decimal; 2786 2787 if (Py_UNICODE_ISSPACE(ch)) { 2788 *output++ = ' '; 2789 continue; 2790 } 2791 decimal = Py_UNICODE_TODECIMAL(ch); 2792 if (decimal >= 0) { 2793 *output++ = '0' + decimal; 2794 continue; 2795 } 2796 if (0 < ch && ch < 256) { 2797 *output++ = (char)ch; 2798 continue; 2799 } 2800 /* All other characters are considered invalid */ 2801 if (errors == NULL || 
strcmp(errors, "strict") == 0) { 2802 PyErr_SetString(PyExc_ValueError, 2803 "invalid decimal Unicode string"); 2804 goto onError; 2805 } 2806 else if (strcmp(errors, "ignore") == 0) 2807 continue; 2808 else if (strcmp(errors, "replace") == 0) { 2809 *output++ = '?'; 2810 continue; 2811 } 2812 } 2813 /* 0-terminate the output string */ 2814 *output++ = '\0'; 2815 return 0; 2816 2817 onError: 2818 return -1; 2819} 2820 2821/* --- Helpers ------------------------------------------------------------ */ 2822 2823static 2824int count(PyUnicodeObject *self, 2825 int start, 2826 int end, 2827 PyUnicodeObject *substring) 2828{ 2829 int count = 0; 2830 2831 if (start < 0) 2832 start += self->length; 2833 if (start < 0) 2834 start = 0; 2835 if (end > self->length) 2836 end = self->length; 2837 if (end < 0) 2838 end += self->length; 2839 if (end < 0) 2840 end = 0; 2841 2842 if (substring->length == 0) 2843 return (end - start + 1); 2844 2845 end -= substring->length; 2846 2847 while (start <= end) 2848 if (Py_UNICODE_MATCH(self, start, substring)) { 2849 count++; 2850 start += substring->length; 2851 } else 2852 start++; 2853 2854 return count; 2855} 2856 2857int PyUnicode_Count(PyObject *str, 2858 PyObject *substr, 2859 int start, 2860 int end) 2861{ 2862 int result; 2863 2864 str = PyUnicode_FromObject(str); 2865 if (str == NULL) 2866 return -1; 2867 substr = PyUnicode_FromObject(substr); 2868 if (substr == NULL) { 2869 Py_DECREF(str); 2870 return -1; 2871 } 2872 2873 result = count((PyUnicodeObject *)str, 2874 start, end, 2875 (PyUnicodeObject *)substr); 2876 2877 Py_DECREF(str); 2878 Py_DECREF(substr); 2879 return result; 2880} 2881 2882static 2883int findstring(PyUnicodeObject *self, 2884 PyUnicodeObject *substring, 2885 int start, 2886 int end, 2887 int direction) 2888{ 2889 if (start < 0) 2890 start += self->length; 2891 if (start < 0) 2892 start = 0; 2893 2894 if (substring->length == 0) 2895 return start; 2896 2897 if (end > self->length) 2898 end = self->length; 
2899 if (end < 0) 2900 end += self->length; 2901 if (end < 0) 2902 end = 0; 2903 2904 end -= substring->length; 2905 2906 if (direction < 0) { 2907 for (; end >= start; end--) 2908 if (Py_UNICODE_MATCH(self, end, substring)) 2909 return end; 2910 } else { 2911 for (; start <= end; start++) 2912 if (Py_UNICODE_MATCH(self, start, substring)) 2913 return start; 2914 } 2915 2916 return -1; 2917} 2918 2919int PyUnicode_Find(PyObject *str, 2920 PyObject *substr, 2921 int start, 2922 int end, 2923 int direction) 2924{ 2925 int result; 2926 2927 str = PyUnicode_FromObject(str); 2928 if (str == NULL) 2929 return -1; 2930 substr = PyUnicode_FromObject(substr); 2931 if (substr == NULL) { 2932 Py_DECREF(substr); 2933 return -1; 2934 } 2935 2936 result = findstring((PyUnicodeObject *)str, 2937 (PyUnicodeObject *)substr, 2938 start, end, direction); 2939 Py_DECREF(str); 2940 Py_DECREF(substr); 2941 return result; 2942} 2943 2944static 2945int tailmatch(PyUnicodeObject *self, 2946 PyUnicodeObject *substring, 2947 int start, 2948 int end, 2949 int direction) 2950{ 2951 if (start < 0) 2952 start += self->length; 2953 if (start < 0) 2954 start = 0; 2955 2956 if (substring->length == 0) 2957 return 1; 2958 2959 if (end > self->length) 2960 end = self->length; 2961 if (end < 0) 2962 end += self->length; 2963 if (end < 0) 2964 end = 0; 2965 2966 end -= substring->length; 2967 if (end < start) 2968 return 0; 2969 2970 if (direction > 0) { 2971 if (Py_UNICODE_MATCH(self, end, substring)) 2972 return 1; 2973 } else { 2974 if (Py_UNICODE_MATCH(self, start, substring)) 2975 return 1; 2976 } 2977 2978 return 0; 2979} 2980 2981int PyUnicode_Tailmatch(PyObject *str, 2982 PyObject *substr, 2983 int start, 2984 int end, 2985 int direction) 2986{ 2987 int result; 2988 2989 str = PyUnicode_FromObject(str); 2990 if (str == NULL) 2991 return -1; 2992 substr = PyUnicode_FromObject(substr); 2993 if (substr == NULL) { 2994 Py_DECREF(substr); 2995 return -1; 2996 } 2997 2998 result = 
tailmatch((PyUnicodeObject *)str, 2999 (PyUnicodeObject *)substr, 3000 start, end, direction); 3001 Py_DECREF(str); 3002 Py_DECREF(substr); 3003 return result; 3004} 3005 3006static 3007const Py_UNICODE *findchar(const Py_UNICODE *s, 3008 int size, 3009 Py_UNICODE ch) 3010{ 3011 /* like wcschr, but doesn't stop at NULL characters */ 3012 3013 while (size-- > 0) { 3014 if (*s == ch) 3015 return s; 3016 s++; 3017 } 3018 3019 return NULL; 3020} 3021 3022/* Apply fixfct filter to the Unicode object self and return a 3023 reference to the modified object */ 3024 3025static 3026PyObject *fixup(PyUnicodeObject *self, 3027 int (*fixfct)(PyUnicodeObject *s)) 3028{ 3029 3030 PyUnicodeObject *u; 3031 3032 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length); 3033 if (u == NULL) 3034 return NULL; 3035 3036 Py_UNICODE_COPY(u->str, self->str, self->length); 3037 3038 if (!fixfct(u) && PyUnicode_CheckExact(self)) { 3039 /* fixfct should return TRUE if it modified the buffer. If 3040 FALSE, return a reference to the original buffer instead 3041 (to save space, not time) */ 3042 Py_INCREF(self); 3043 Py_DECREF(u); 3044 return (PyObject*) self; 3045 } 3046 return (PyObject*) u; 3047} 3048 3049static 3050int fixupper(PyUnicodeObject *self) 3051{ 3052 int len = self->length; 3053 Py_UNICODE *s = self->str; 3054 int status = 0; 3055 3056 while (len-- > 0) { 3057 register Py_UNICODE ch; 3058 3059 ch = Py_UNICODE_TOUPPER(*s); 3060 if (ch != *s) { 3061 status = 1; 3062 *s = ch; 3063 } 3064 s++; 3065 } 3066 3067 return status; 3068} 3069 3070static 3071int fixlower(PyUnicodeObject *self) 3072{ 3073 int len = self->length; 3074 Py_UNICODE *s = self->str; 3075 int status = 0; 3076 3077 while (len-- > 0) { 3078 register Py_UNICODE ch; 3079 3080 ch = Py_UNICODE_TOLOWER(*s); 3081 if (ch != *s) { 3082 status = 1; 3083 *s = ch; 3084 } 3085 s++; 3086 } 3087 3088 return status; 3089} 3090 3091static 3092int fixswapcase(PyUnicodeObject *self) 3093{ 3094 int len = self->length; 3095 
Py_UNICODE *s = self->str; 3096 int status = 0; 3097 3098 while (len-- > 0) { 3099 if (Py_UNICODE_ISUPPER(*s)) { 3100 *s = Py_UNICODE_TOLOWER(*s); 3101 status = 1; 3102 } else if (Py_UNICODE_ISLOWER(*s)) { 3103 *s = Py_UNICODE_TOUPPER(*s); 3104 status = 1; 3105 } 3106 s++; 3107 } 3108 3109 return status; 3110} 3111 3112static 3113int fixcapitalize(PyUnicodeObject *self) 3114{ 3115 int len = self->length; 3116 Py_UNICODE *s = self->str; 3117 int status = 0; 3118 3119 if (len == 0) 3120 return 0; 3121 if (Py_UNICODE_ISLOWER(*s)) { 3122 *s = Py_UNICODE_TOUPPER(*s); 3123 status = 1; 3124 } 3125 s++; 3126 while (--len > 0) { 3127 if (Py_UNICODE_ISUPPER(*s)) { 3128 *s = Py_UNICODE_TOLOWER(*s); 3129 status = 1; 3130 } 3131 s++; 3132 } 3133 return status; 3134} 3135 3136static 3137int fixtitle(PyUnicodeObject *self) 3138{ 3139 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 3140 register Py_UNICODE *e; 3141 int previous_is_cased; 3142 3143 /* Shortcut for single character strings */ 3144 if (PyUnicode_GET_SIZE(self) == 1) { 3145 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p); 3146 if (*p != ch) { 3147 *p = ch; 3148 return 1; 3149 } 3150 else 3151 return 0; 3152 } 3153 3154 e = p + PyUnicode_GET_SIZE(self); 3155 previous_is_cased = 0; 3156 for (; p < e; p++) { 3157 register const Py_UNICODE ch = *p; 3158 3159 if (previous_is_cased) 3160 *p = Py_UNICODE_TOLOWER(ch); 3161 else 3162 *p = Py_UNICODE_TOTITLE(ch); 3163 3164 if (Py_UNICODE_ISLOWER(ch) || 3165 Py_UNICODE_ISUPPER(ch) || 3166 Py_UNICODE_ISTITLE(ch)) 3167 previous_is_cased = 1; 3168 else 3169 previous_is_cased = 0; 3170 } 3171 return 1; 3172} 3173 3174PyObject *PyUnicode_Join(PyObject *separator, 3175 PyObject *seq) 3176{ 3177 Py_UNICODE *sep; 3178 int seplen; 3179 PyUnicodeObject *res = NULL; 3180 int reslen = 0; 3181 Py_UNICODE *p; 3182 int sz = 100; 3183 int i; 3184 PyObject *it; 3185 3186 it = PyObject_GetIter(seq); 3187 if (it == NULL) 3188 return NULL; 3189 3190 if (separator == NULL) { 3191 Py_UNICODE blank = ' '; 
3192 sep = ␣ 3193 seplen = 1; 3194 } 3195 else { 3196 separator = PyUnicode_FromObject(separator); 3197 if (separator == NULL) 3198 goto onError; 3199 sep = PyUnicode_AS_UNICODE(separator); 3200 seplen = PyUnicode_GET_SIZE(separator); 3201 } 3202 3203 res = _PyUnicode_New(sz); 3204 if (res == NULL) 3205 goto onError; 3206 p = PyUnicode_AS_UNICODE(res); 3207 reslen = 0; 3208 3209 for (i = 0; ; ++i) { 3210 int itemlen; 3211 PyObject *item = PyIter_Next(it); 3212 if (item == NULL) { 3213 if (PyErr_Occurred()) 3214 goto onError; 3215 break; 3216 } 3217 if (!PyUnicode_Check(item)) { 3218 PyObject *v; 3219 if (!PyString_Check(item)) { 3220 PyErr_Format(PyExc_TypeError, 3221 "sequence item %i: expected string or Unicode," 3222 " %.80s found", 3223 i, item->ob_type->tp_name); 3224 Py_DECREF(item); 3225 goto onError; 3226 } 3227 v = PyUnicode_FromObject(item); 3228 Py_DECREF(item); 3229 item = v; 3230 if (item == NULL) 3231 goto onError; 3232 } 3233 itemlen = PyUnicode_GET_SIZE(item); 3234 while (reslen + itemlen + seplen >= sz) { 3235 if (_PyUnicode_Resize(&res, sz*2)) { 3236 Py_DECREF(item); 3237 goto onError; 3238 } 3239 sz *= 2; 3240 p = PyUnicode_AS_UNICODE(res) + reslen; 3241 } 3242 if (i > 0) { 3243 Py_UNICODE_COPY(p, sep, seplen); 3244 p += seplen; 3245 reslen += seplen; 3246 } 3247 Py_UNICODE_COPY(p, PyUnicode_AS_UNICODE(item), itemlen); 3248 p += itemlen; 3249 reslen += itemlen; 3250 Py_DECREF(item); 3251 } 3252 if (_PyUnicode_Resize(&res, reslen)) 3253 goto onError; 3254 3255 Py_XDECREF(separator); 3256 Py_DECREF(it); 3257 return (PyObject *)res; 3258 3259 onError: 3260 Py_XDECREF(separator); 3261 Py_XDECREF(res); 3262 Py_DECREF(it); 3263 return NULL; 3264} 3265 3266static 3267PyUnicodeObject *pad(PyUnicodeObject *self, 3268 int left, 3269 int right, 3270 Py_UNICODE fill) 3271{ 3272 PyUnicodeObject *u; 3273 3274 if (left < 0) 3275 left = 0; 3276 if (right < 0) 3277 right = 0; 3278 3279 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) { 3280 
Py_INCREF(self); 3281 return self; 3282 } 3283 3284 u = _PyUnicode_New(left + self->length + right); 3285 if (u) { 3286 if (left) 3287 Py_UNICODE_FILL(u->str, fill, left); 3288 Py_UNICODE_COPY(u->str + left, self->str, self->length); 3289 if (right) 3290 Py_UNICODE_FILL(u->str + left + self->length, fill, right); 3291 } 3292 3293 return u; 3294} 3295 3296#define SPLIT_APPEND(data, left, right) \ 3297 str = PyUnicode_FromUnicode(data + left, right - left); \ 3298 if (!str) \ 3299 goto onError; \ 3300 if (PyList_Append(list, str)) { \ 3301 Py_DECREF(str); \ 3302 goto onError; \ 3303 } \ 3304 else \ 3305 Py_DECREF(str); 3306 3307static 3308PyObject *split_whitespace(PyUnicodeObject *self, 3309 PyObject *list, 3310 int maxcount) 3311{ 3312 register int i; 3313 register int j; 3314 int len = self->length; 3315 PyObject *str; 3316 3317 for (i = j = 0; i < len; ) { 3318 /* find a token */ 3319 while (i < len && Py_UNICODE_ISSPACE(self->str[i])) 3320 i++; 3321 j = i; 3322 while (i < len && !Py_UNICODE_ISSPACE(self->str[i])) 3323 i++; 3324 if (j < i) { 3325 if (maxcount-- <= 0) 3326 break; 3327 SPLIT_APPEND(self->str, j, i); 3328 while (i < len && Py_UNICODE_ISSPACE(self->str[i])) 3329 i++; 3330 j = i; 3331 } 3332 } 3333 if (j < len) { 3334 SPLIT_APPEND(self->str, j, len); 3335 } 3336 return list; 3337 3338 onError: 3339 Py_DECREF(list); 3340 return NULL; 3341} 3342 3343PyObject *PyUnicode_Splitlines(PyObject *string, 3344 int keepends) 3345{ 3346 register int i; 3347 register int j; 3348 int len; 3349 PyObject *list; 3350 PyObject *str; 3351 Py_UNICODE *data; 3352 3353 string = PyUnicode_FromObject(string); 3354 if (string == NULL) 3355 return NULL; 3356 data = PyUnicode_AS_UNICODE(string); 3357 len = PyUnicode_GET_SIZE(string); 3358 3359 list = PyList_New(0); 3360 if (!list) 3361 goto onError; 3362 3363 for (i = j = 0; i < len; ) { 3364 int eol; 3365 3366 /* Find a line and append it */ 3367 while (i < len && !Py_UNICODE_ISLINEBREAK(data[i])) 3368 i++; 3369 3370 /* Skip 
the line break reading CRLF as one line break */ 3371 eol = i; 3372 if (i < len) { 3373 if (data[i] == '\r' && i + 1 < len && 3374 data[i+1] == '\n') 3375 i += 2; 3376 else 3377 i++; 3378 if (keepends) 3379 eol = i; 3380 } 3381 SPLIT_APPEND(data, j, eol); 3382 j = i; 3383 } 3384 if (j < len) { 3385 SPLIT_APPEND(data, j, len); 3386 } 3387 3388 Py_DECREF(string); 3389 return list; 3390 3391 onError: 3392 Py_DECREF(list); 3393 Py_DECREF(string); 3394 return NULL; 3395} 3396 3397static 3398PyObject *split_char(PyUnicodeObject *self, 3399 PyObject *list, 3400 Py_UNICODE ch, 3401 int maxcount) 3402{ 3403 register int i; 3404 register int j; 3405 int len = self->length; 3406 PyObject *str; 3407 3408 for (i = j = 0; i < len; ) { 3409 if (self->str[i] == ch) { 3410 if (maxcount-- <= 0) 3411 break; 3412 SPLIT_APPEND(self->str, j, i); 3413 i = j = i + 1; 3414 } else 3415 i++; 3416 } 3417 if (j <= len) { 3418 SPLIT_APPEND(self->str, j, len); 3419 } 3420 return list; 3421 3422 onError: 3423 Py_DECREF(list); 3424 return NULL; 3425} 3426 3427static 3428PyObject *split_substring(PyUnicodeObject *self, 3429 PyObject *list, 3430 PyUnicodeObject *substring, 3431 int maxcount) 3432{ 3433 register int i; 3434 register int j; 3435 int len = self->length; 3436 int sublen = substring->length; 3437 PyObject *str; 3438 3439 for (i = j = 0; i <= len - sublen; ) { 3440 if (Py_UNICODE_MATCH(self, i, substring)) { 3441 if (maxcount-- <= 0) 3442 break; 3443 SPLIT_APPEND(self->str, j, i); 3444 i = j = i + sublen; 3445 } else 3446 i++; 3447 } 3448 if (j <= len) { 3449 SPLIT_APPEND(self->str, j, len); 3450 } 3451 return list; 3452 3453 onError: 3454 Py_DECREF(list); 3455 return NULL; 3456} 3457 3458#undef SPLIT_APPEND 3459 3460static 3461PyObject *split(PyUnicodeObject *self, 3462 PyUnicodeObject *substring, 3463 int maxcount) 3464{ 3465 PyObject *list; 3466 3467 if (maxcount < 0) 3468 maxcount = INT_MAX; 3469 3470 list = PyList_New(0); 3471 if (!list) 3472 return NULL; 3473 3474 if (substring == 
NULL) 3475 return split_whitespace(self,list,maxcount); 3476 3477 else if (substring->length == 1) 3478 return split_char(self,list,substring->str[0],maxcount); 3479 3480 else if (substring->length == 0) { 3481 Py_DECREF(list); 3482 PyErr_SetString(PyExc_ValueError, "empty separator"); 3483 return NULL; 3484 } 3485 else 3486 return split_substring(self,list,substring,maxcount); 3487} 3488 3489static 3490PyObject *strip(PyUnicodeObject *self, 3491 int left, 3492 int right) 3493{ 3494 Py_UNICODE *p = self->str; 3495 int start = 0; 3496 int end = self->length; 3497 3498 if (left) 3499 while (start < end && Py_UNICODE_ISSPACE(p[start])) 3500 start++; 3501 3502 if (right) 3503 while (end > start && Py_UNICODE_ISSPACE(p[end-1])) 3504 end--; 3505 3506 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) { 3507 /* couldn't strip anything off, return original string */ 3508 Py_INCREF(self); 3509 return (PyObject*) self; 3510 } 3511 3512 return (PyObject*) PyUnicode_FromUnicode( 3513 self->str + start, 3514 end - start 3515 ); 3516} 3517 3518static 3519PyObject *replace(PyUnicodeObject *self, 3520 PyUnicodeObject *str1, 3521 PyUnicodeObject *str2, 3522 int maxcount) 3523{ 3524 PyUnicodeObject *u; 3525 3526 if (maxcount < 0) 3527 maxcount = INT_MAX; 3528 3529 if (str1->length == 1 && str2->length == 1) { 3530 int i; 3531 3532 /* replace characters */ 3533 if (!findchar(self->str, self->length, str1->str[0]) && 3534 PyUnicode_CheckExact(self)) { 3535 /* nothing to replace, return original string */ 3536 Py_INCREF(self); 3537 u = self; 3538 } else { 3539 Py_UNICODE u1 = str1->str[0]; 3540 Py_UNICODE u2 = str2->str[0]; 3541 3542 u = (PyUnicodeObject*) PyUnicode_FromUnicode( 3543 NULL, 3544 self->length 3545 ); 3546 if (u != NULL) { 3547 Py_UNICODE_COPY(u->str, self->str, 3548 self->length); 3549 for (i = 0; i < u->length; i++) 3550 if (u->str[i] == u1) { 3551 if (--maxcount < 0) 3552 break; 3553 u->str[i] = u2; 3554 } 3555 } 3556 } 3557 3558 } else { 3559 int n, 
i; 3560 Py_UNICODE *p; 3561 3562 /* replace strings */ 3563 n = count(self, 0, self->length, str1); 3564 if (n > maxcount) 3565 n = maxcount; 3566 if (n == 0 && PyUnicode_CheckExact(self)) { 3567 /* nothing to replace, return original string */ 3568 Py_INCREF(self); 3569 u = self; 3570 } else { 3571 u = _PyUnicode_New( 3572 self->length + n * (str2->length - str1->length)); 3573 if (u) { 3574 i = 0; 3575 p = u->str; 3576 while (i <= self->length - str1->length) 3577 if (Py_UNICODE_MATCH(self, i, str1)) { 3578 /* replace string segment */ 3579 Py_UNICODE_COPY(p, str2->str, str2->length); 3580 p += str2->length; 3581 i += str1->length; 3582 if (--n <= 0) { 3583 /* copy remaining part */ 3584 Py_UNICODE_COPY(p, self->str+i, self->length-i); 3585 break; 3586 } 3587 } else 3588 *p++ = self->str[i++]; 3589 } 3590 } 3591 } 3592 3593 return (PyObject *) u; 3594} 3595 3596/* --- Unicode Object Methods --------------------------------------------- */ 3597 3598static char title__doc__[] = 3599"S.title() -> unicode\n\ 3600\n\ 3601Return a titlecased version of S, i.e. words start with title case\n\ 3602characters, all remaining cased characters have lower case."; 3603 3604static PyObject* 3605unicode_title(PyUnicodeObject *self) 3606{ 3607 return fixup(self, fixtitle); 3608} 3609 3610static char capitalize__doc__[] = 3611"S.capitalize() -> unicode\n\ 3612\n\ 3613Return a capitalized version of S, i.e. 
make the first character\n\ 3614have upper case."; 3615 3616static PyObject* 3617unicode_capitalize(PyUnicodeObject *self) 3618{ 3619 return fixup(self, fixcapitalize); 3620} 3621 3622#if 0 3623static char capwords__doc__[] = 3624"S.capwords() -> unicode\n\ 3625\n\ 3626Apply .capitalize() to all words in S and return the result with\n\ 3627normalized whitespace (all whitespace strings are replaced by ' ')."; 3628 3629static PyObject* 3630unicode_capwords(PyUnicodeObject *self) 3631{ 3632 PyObject *list; 3633 PyObject *item; 3634 int i; 3635 3636 /* Split into words */ 3637 list = split(self, NULL, -1); 3638 if (!list) 3639 return NULL; 3640 3641 /* Capitalize each word */ 3642 for (i = 0; i < PyList_GET_SIZE(list); i++) { 3643 item = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, i), 3644 fixcapitalize); 3645 if (item == NULL) 3646 goto onError; 3647 Py_DECREF(PyList_GET_ITEM(list, i)); 3648 PyList_SET_ITEM(list, i, item); 3649 } 3650 3651 /* Join the words to form a new string */ 3652 item = PyUnicode_Join(NULL, list); 3653 3654onError: 3655 Py_DECREF(list); 3656 return (PyObject *)item; 3657} 3658#endif 3659 3660static char center__doc__[] = 3661"S.center(width) -> unicode\n\ 3662\n\ 3663Return S centered in a Unicode string of length width. Padding is done\n\ 3664using spaces."; 3665 3666static PyObject * 3667unicode_center(PyUnicodeObject *self, PyObject *args) 3668{ 3669 int marg, left; 3670 int width; 3671 3672 if (!PyArg_ParseTuple(args, "i:center", &width)) 3673 return NULL; 3674 3675 if (self->length >= width && PyUnicode_CheckExact(self)) { 3676 Py_INCREF(self); 3677 return (PyObject*) self; 3678 } 3679 3680 marg = width - self->length; 3681 left = marg / 2 + (marg & width & 1); 3682 3683 return (PyObject*) pad(self, left, marg - left, ' '); 3684} 3685 3686#if 0 3687 3688/* This code should go into some future Unicode collation support 3689 module. 
The basic comparison should compare ordinals on a naive 3690 basis (this is what Java does and thus JPython too). */ 3691 3692/* speedy UTF-16 code point order comparison */ 3693/* gleaned from: */ 3694/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */ 3695 3696static short utf16Fixup[32] = 3697{ 3698 0, 0, 0, 0, 0, 0, 0, 0, 3699 0, 0, 0, 0, 0, 0, 0, 0, 3700 0, 0, 0, 0, 0, 0, 0, 0, 3701 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800 3702}; 3703 3704static int 3705unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2) 3706{ 3707 int len1, len2; 3708 3709 Py_UNICODE *s1 = str1->str; 3710 Py_UNICODE *s2 = str2->str; 3711 3712 len1 = str1->length; 3713 len2 = str2->length; 3714 3715 while (len1 > 0 && len2 > 0) { 3716 Py_UNICODE c1, c2; 3717 3718 c1 = *s1++; 3719 c2 = *s2++; 3720 3721 if (c1 > (1<<11) * 26) 3722 c1 += utf16Fixup[c1>>11]; 3723 if (c2 > (1<<11) * 26) 3724 c2 += utf16Fixup[c2>>11]; 3725 /* now c1 and c2 are in UTF-32-compatible order */ 3726 3727 if (c1 != c2) 3728 return (c1 < c2) ? -1 : 1; 3729 3730 len1--; len2--; 3731 } 3732 3733 return (len1 < len2) ? -1 : (len1 != len2); 3734} 3735 3736#else 3737 3738static int 3739unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2) 3740{ 3741 register int len1, len2; 3742 3743 Py_UNICODE *s1 = str1->str; 3744 Py_UNICODE *s2 = str2->str; 3745 3746 len1 = str1->length; 3747 len2 = str2->length; 3748 3749 while (len1 > 0 && len2 > 0) { 3750 Py_UNICODE c1, c2; 3751 3752 c1 = *s1++; 3753 c2 = *s2++; 3754 3755 if (c1 != c2) 3756 return (c1 < c2) ? -1 : 1; 3757 3758 len1--; len2--; 3759 } 3760 3761 return (len1 < len2) ? 
-1 : (len1 != len2); 3762} 3763 3764#endif 3765 3766int PyUnicode_Compare(PyObject *left, 3767 PyObject *right) 3768{ 3769 PyUnicodeObject *u = NULL, *v = NULL; 3770 int result; 3771 3772 /* Coerce the two arguments */ 3773 u = (PyUnicodeObject *)PyUnicode_FromObject(left); 3774 if (u == NULL) 3775 goto onError; 3776 v = (PyUnicodeObject *)PyUnicode_FromObject(right); 3777 if (v == NULL) 3778 goto onError; 3779 3780 /* Shortcut for empty or interned objects */ 3781 if (v == u) { 3782 Py_DECREF(u); 3783 Py_DECREF(v); 3784 return 0; 3785 } 3786 3787 result = unicode_compare(u, v); 3788 3789 Py_DECREF(u); 3790 Py_DECREF(v); 3791 return result; 3792 3793onError: 3794 Py_XDECREF(u); 3795 Py_XDECREF(v); 3796 return -1; 3797} 3798 3799int PyUnicode_Contains(PyObject *container, 3800 PyObject *element) 3801{ 3802 PyUnicodeObject *u = NULL, *v = NULL; 3803 int result; 3804 register const Py_UNICODE *p, *e; 3805 register Py_UNICODE ch; 3806 3807 /* Coerce the two arguments */ 3808 v = (PyUnicodeObject *)PyUnicode_FromObject(element); 3809 if (v == NULL) { 3810 PyErr_SetString(PyExc_TypeError, 3811 "'in <string>' requires character as left operand"); 3812 goto onError; 3813 } 3814 u = (PyUnicodeObject *)PyUnicode_FromObject(container); 3815 if (u == NULL) { 3816 Py_DECREF(v); 3817 goto onError; 3818 } 3819 3820 /* Check v in u */ 3821 if (PyUnicode_GET_SIZE(v) != 1) { 3822 PyErr_SetString(PyExc_TypeError, 3823 "'in <string>' requires character as left operand"); 3824 goto onError; 3825 } 3826 ch = *PyUnicode_AS_UNICODE(v); 3827 p = PyUnicode_AS_UNICODE(u); 3828 e = p + PyUnicode_GET_SIZE(u); 3829 result = 0; 3830 while (p < e) { 3831 if (*p++ == ch) { 3832 result = 1; 3833 break; 3834 } 3835 } 3836 3837 Py_DECREF(u); 3838 Py_DECREF(v); 3839 return result; 3840 3841onError: 3842 Py_XDECREF(u); 3843 Py_XDECREF(v); 3844 return -1; 3845} 3846 3847/* Concat to string or Unicode object giving a new Unicode object. 
*/ 3848 3849PyObject *PyUnicode_Concat(PyObject *left, 3850 PyObject *right) 3851{ 3852 PyUnicodeObject *u = NULL, *v = NULL, *w; 3853 3854 /* Coerce the two arguments */ 3855 u = (PyUnicodeObject *)PyUnicode_FromObject(left); 3856 if (u == NULL) 3857 goto onError; 3858 v = (PyUnicodeObject *)PyUnicode_FromObject(right); 3859 if (v == NULL) 3860 goto onError; 3861 3862 /* Shortcuts */ 3863 if (v == unicode_empty) { 3864 Py_DECREF(v); 3865 return (PyObject *)u; 3866 } 3867 if (u == unicode_empty) { 3868 Py_DECREF(u); 3869 return (PyObject *)v; 3870 } 3871 3872 /* Concat the two Unicode strings */ 3873 w = _PyUnicode_New(u->length + v->length); 3874 if (w == NULL) 3875 goto onError; 3876 Py_UNICODE_COPY(w->str, u->str, u->length); 3877 Py_UNICODE_COPY(w->str + u->length, v->str, v->length); 3878 3879 Py_DECREF(u); 3880 Py_DECREF(v); 3881 return (PyObject *)w; 3882 3883onError: 3884 Py_XDECREF(u); 3885 Py_XDECREF(v); 3886 return NULL; 3887} 3888 3889static char count__doc__[] = 3890"S.count(sub[, start[, end]]) -> int\n\ 3891\n\ 3892Return the number of occurrences of substring sub in Unicode string\n\ 3893S[start:end]. 
Optional arguments start and end are\n\ 3894interpreted as in slice notation."; 3895 3896static PyObject * 3897unicode_count(PyUnicodeObject *self, PyObject *args) 3898{ 3899 PyUnicodeObject *substring; 3900 int start = 0; 3901 int end = INT_MAX; 3902 PyObject *result; 3903 3904 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring, 3905 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end)) 3906 return NULL; 3907 3908 substring = (PyUnicodeObject *)PyUnicode_FromObject( 3909 (PyObject *)substring); 3910 if (substring == NULL) 3911 return NULL; 3912 3913 if (start < 0) 3914 start += self->length; 3915 if (start < 0) 3916 start = 0; 3917 if (end > self->length) 3918 end = self->length; 3919 if (end < 0) 3920 end += self->length; 3921 if (end < 0) 3922 end = 0; 3923 3924 result = PyInt_FromLong((long) count(self, start, end, substring)); 3925 3926 Py_DECREF(substring); 3927 return result; 3928} 3929 3930static char encode__doc__[] = 3931"S.encode([encoding[,errors]]) -> string\n\ 3932\n\ 3933Return an encoded string version of S. Default encoding is the current\n\ 3934default string encoding. errors may be given to set a different error\n\ 3935handling scheme. Default is 'strict' meaning that encoding errors raise\n\ 3936a ValueError. 
Other possible values are 'ignore' and 'replace'."; 3937 3938static PyObject * 3939unicode_encode(PyUnicodeObject *self, PyObject *args) 3940{ 3941 char *encoding = NULL; 3942 char *errors = NULL; 3943 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors)) 3944 return NULL; 3945 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors); 3946} 3947 3948static char expandtabs__doc__[] = 3949"S.expandtabs([tabsize]) -> unicode\n\ 3950\n\ 3951Return a copy of S where all tab characters are expanded using spaces.\n\ 3952If tabsize is not given, a tab size of 8 characters is assumed."; 3953 3954static PyObject* 3955unicode_expandtabs(PyUnicodeObject *self, PyObject *args) 3956{ 3957 Py_UNICODE *e; 3958 Py_UNICODE *p; 3959 Py_UNICODE *q; 3960 int i, j; 3961 PyUnicodeObject *u; 3962 int tabsize = 8; 3963 3964 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize)) 3965 return NULL; 3966 3967 /* First pass: determine size of output string */ 3968 i = j = 0; 3969 e = self->str + self->length; 3970 for (p = self->str; p < e; p++) 3971 if (*p == '\t') { 3972 if (tabsize > 0) 3973 j += tabsize - (j % tabsize); 3974 } 3975 else { 3976 j++; 3977 if (*p == '\n' || *p == '\r') { 3978 i += j; 3979 j = 0; 3980 } 3981 } 3982 3983 /* Second pass: create output string and fill it */ 3984 u = _PyUnicode_New(i + j); 3985 if (!u) 3986 return NULL; 3987 3988 j = 0; 3989 q = u->str; 3990 3991 for (p = self->str; p < e; p++) 3992 if (*p == '\t') { 3993 if (tabsize > 0) { 3994 i = tabsize - (j % tabsize); 3995 j += i; 3996 while (i--) 3997 *q++ = ' '; 3998 } 3999 } 4000 else { 4001 j++; 4002 *q++ = *p; 4003 if (*p == '\n' || *p == '\r') 4004 j = 0; 4005 } 4006 4007 return (PyObject*) u; 4008} 4009 4010static char find__doc__[] = 4011"S.find(sub [,start [,end]]) -> int\n\ 4012\n\ 4013Return the lowest index in S where substring sub is found,\n\ 4014such that sub is contained within s[start,end]. 
Optional\n\ 4015arguments start and end are interpreted as in slice notation.\n\ 4016\n\ 4017Return -1 on failure."; 4018 4019static PyObject * 4020unicode_find(PyUnicodeObject *self, PyObject *args) 4021{ 4022 PyUnicodeObject *substring; 4023 int start = 0; 4024 int end = INT_MAX; 4025 PyObject *result; 4026 4027 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring, 4028 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end)) 4029 return NULL; 4030 substring = (PyUnicodeObject *)PyUnicode_FromObject( 4031 (PyObject *)substring); 4032 if (substring == NULL) 4033 return NULL; 4034 4035 result = PyInt_FromLong(findstring(self, substring, start, end, 1)); 4036 4037 Py_DECREF(substring); 4038 return result; 4039} 4040 4041static PyObject * 4042unicode_getitem(PyUnicodeObject *self, int index) 4043{ 4044 if (index < 0 || index >= self->length) { 4045 PyErr_SetString(PyExc_IndexError, "string index out of range"); 4046 return NULL; 4047 } 4048 4049 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1); 4050} 4051 4052static long 4053unicode_hash(PyUnicodeObject *self) 4054{ 4055 /* Since Unicode objects compare equal to their ASCII string 4056 counterparts, they should use the individual character values 4057 as basis for their hash value. This is needed to assure that 4058 strings and Unicode objects behave in the same way as 4059 dictionary keys. 
*/ 4060 4061 register int len; 4062 register Py_UNICODE *p; 4063 register long x; 4064 4065 if (self->hash != -1) 4066 return self->hash; 4067 len = PyUnicode_GET_SIZE(self); 4068 p = PyUnicode_AS_UNICODE(self); 4069 x = *p << 7; 4070 while (--len >= 0) 4071 x = (1000003*x) ^ *p++; 4072 x ^= PyUnicode_GET_SIZE(self); 4073 if (x == -1) 4074 x = -2; 4075 self->hash = x; 4076 return x; 4077} 4078 4079static char index__doc__[] = 4080"S.index(sub [,start [,end]]) -> int\n\ 4081\n\ 4082Like S.find() but raise ValueError when the substring is not found."; 4083 4084static PyObject * 4085unicode_index(PyUnicodeObject *self, PyObject *args) 4086{ 4087 int result; 4088 PyUnicodeObject *substring; 4089 int start = 0; 4090 int end = INT_MAX; 4091 4092 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring, 4093 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end)) 4094 return NULL; 4095 4096 substring = (PyUnicodeObject *)PyUnicode_FromObject( 4097 (PyObject *)substring); 4098 if (substring == NULL) 4099 return NULL; 4100 4101 result = findstring(self, substring, start, end, 1); 4102 4103 Py_DECREF(substring); 4104 if (result < 0) { 4105 PyErr_SetString(PyExc_ValueError, "substring not found"); 4106 return NULL; 4107 } 4108 return PyInt_FromLong(result); 4109} 4110 4111static char islower__doc__[] = 4112"S.islower() -> int\n\ 4113\n\ 4114Return 1 if all cased characters in S are lowercase and there is\n\ 4115at least one cased character in S, 0 otherwise."; 4116 4117static PyObject* 4118unicode_islower(PyUnicodeObject *self) 4119{ 4120 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 4121 register const Py_UNICODE *e; 4122 int cased; 4123 4124 /* Shortcut for single character strings */ 4125 if (PyUnicode_GET_SIZE(self) == 1) 4126 return PyInt_FromLong(Py_UNICODE_ISLOWER(*p) != 0); 4127 4128 /* Special case for empty strings */ 4129 if (PyString_GET_SIZE(self) == 0) 4130 return PyInt_FromLong(0); 4131 4132 e = p + PyUnicode_GET_SIZE(self); 4133 cased = 0; 4134 for 
(; p < e; p++) { 4135 register const Py_UNICODE ch = *p; 4136 4137 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) 4138 return PyInt_FromLong(0); 4139 else if (!cased && Py_UNICODE_ISLOWER(ch)) 4140 cased = 1; 4141 } 4142 return PyInt_FromLong(cased); 4143} 4144 4145static char isupper__doc__[] = 4146"S.isupper() -> int\n\ 4147\n\ 4148Return 1 if all cased characters in S are uppercase and there is\n\ 4149at least one cased character in S, 0 otherwise."; 4150 4151static PyObject* 4152unicode_isupper(PyUnicodeObject *self) 4153{ 4154 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 4155 register const Py_UNICODE *e; 4156 int cased; 4157 4158 /* Shortcut for single character strings */ 4159 if (PyUnicode_GET_SIZE(self) == 1) 4160 return PyInt_FromLong(Py_UNICODE_ISUPPER(*p) != 0); 4161 4162 /* Special case for empty strings */ 4163 if (PyString_GET_SIZE(self) == 0) 4164 return PyInt_FromLong(0); 4165 4166 e = p + PyUnicode_GET_SIZE(self); 4167 cased = 0; 4168 for (; p < e; p++) { 4169 register const Py_UNICODE ch = *p; 4170 4171 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch)) 4172 return PyInt_FromLong(0); 4173 else if (!cased && Py_UNICODE_ISUPPER(ch)) 4174 cased = 1; 4175 } 4176 return PyInt_FromLong(cased); 4177} 4178 4179static char istitle__doc__[] = 4180"S.istitle() -> int\n\ 4181\n\ 4182Return 1 if S is a titlecased string, i.e. upper- and titlecase characters\n\ 4183may only follow uncased characters and lowercase characters only cased\n\ 4184ones. 
Return 0 otherwise."; 4185 4186static PyObject* 4187unicode_istitle(PyUnicodeObject *self) 4188{ 4189 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 4190 register const Py_UNICODE *e; 4191 int cased, previous_is_cased; 4192 4193 /* Shortcut for single character strings */ 4194 if (PyUnicode_GET_SIZE(self) == 1) 4195 return PyInt_FromLong((Py_UNICODE_ISTITLE(*p) != 0) || 4196 (Py_UNICODE_ISUPPER(*p) != 0)); 4197 4198 /* Special case for empty strings */ 4199 if (PyString_GET_SIZE(self) == 0) 4200 return PyInt_FromLong(0); 4201 4202 e = p + PyUnicode_GET_SIZE(self); 4203 cased = 0; 4204 previous_is_cased = 0; 4205 for (; p < e; p++) { 4206 register const Py_UNICODE ch = *p; 4207 4208 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) { 4209 if (previous_is_cased) 4210 return PyInt_FromLong(0); 4211 previous_is_cased = 1; 4212 cased = 1; 4213 } 4214 else if (Py_UNICODE_ISLOWER(ch)) { 4215 if (!previous_is_cased) 4216 return PyInt_FromLong(0); 4217 previous_is_cased = 1; 4218 cased = 1; 4219 } 4220 else 4221 previous_is_cased = 0; 4222 } 4223 return PyInt_FromLong(cased); 4224} 4225 4226static char isspace__doc__[] = 4227"S.isspace() -> int\n\ 4228\n\ 4229Return 1 if there are only whitespace characters in S,\n\ 42300 otherwise."; 4231 4232static PyObject* 4233unicode_isspace(PyUnicodeObject *self) 4234{ 4235 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 4236 register const Py_UNICODE *e; 4237 4238 /* Shortcut for single character strings */ 4239 if (PyUnicode_GET_SIZE(self) == 1 && 4240 Py_UNICODE_ISSPACE(*p)) 4241 return PyInt_FromLong(1); 4242 4243 /* Special case for empty strings */ 4244 if (PyString_GET_SIZE(self) == 0) 4245 return PyInt_FromLong(0); 4246 4247 e = p + PyUnicode_GET_SIZE(self); 4248 for (; p < e; p++) { 4249 if (!Py_UNICODE_ISSPACE(*p)) 4250 return PyInt_FromLong(0); 4251 } 4252 return PyInt_FromLong(1); 4253} 4254 4255static char isalpha__doc__[] = 4256"S.isalpha() -> int\n\ 4257\n\ 4258Return 1 if all characters in S 
are alphabetic\n\ 4259and there is at least one character in S, 0 otherwise."; 4260 4261static PyObject* 4262unicode_isalpha(PyUnicodeObject *self) 4263{ 4264 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 4265 register const Py_UNICODE *e; 4266 4267 /* Shortcut for single character strings */ 4268 if (PyUnicode_GET_SIZE(self) == 1 && 4269 Py_UNICODE_ISALPHA(*p)) 4270 return PyInt_FromLong(1); 4271 4272 /* Special case for empty strings */ 4273 if (PyString_GET_SIZE(self) == 0) 4274 return PyInt_FromLong(0); 4275 4276 e = p + PyUnicode_GET_SIZE(self); 4277 for (; p < e; p++) { 4278 if (!Py_UNICODE_ISALPHA(*p)) 4279 return PyInt_FromLong(0); 4280 } 4281 return PyInt_FromLong(1); 4282} 4283 4284static char isalnum__doc__[] = 4285"S.isalnum() -> int\n\ 4286\n\ 4287Return 1 if all characters in S are alphanumeric\n\ 4288and there is at least one character in S, 0 otherwise."; 4289 4290static PyObject* 4291unicode_isalnum(PyUnicodeObject *self) 4292{ 4293 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 4294 register const Py_UNICODE *e; 4295 4296 /* Shortcut for single character strings */ 4297 if (PyUnicode_GET_SIZE(self) == 1 && 4298 Py_UNICODE_ISALNUM(*p)) 4299 return PyInt_FromLong(1); 4300 4301 /* Special case for empty strings */ 4302 if (PyString_GET_SIZE(self) == 0) 4303 return PyInt_FromLong(0); 4304 4305 e = p + PyUnicode_GET_SIZE(self); 4306 for (; p < e; p++) { 4307 if (!Py_UNICODE_ISALNUM(*p)) 4308 return PyInt_FromLong(0); 4309 } 4310 return PyInt_FromLong(1); 4311} 4312 4313static char isdecimal__doc__[] = 4314"S.isdecimal() -> int\n\ 4315\n\ 4316Return 1 if there are only decimal characters in S,\n\ 43170 otherwise."; 4318 4319static PyObject* 4320unicode_isdecimal(PyUnicodeObject *self) 4321{ 4322 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 4323 register const Py_UNICODE *e; 4324 4325 /* Shortcut for single character strings */ 4326 if (PyUnicode_GET_SIZE(self) == 1 && 4327 Py_UNICODE_ISDECIMAL(*p)) 4328 return 
PyInt_FromLong(1); 4329 4330 /* Special case for empty strings */ 4331 if (PyString_GET_SIZE(self) == 0) 4332 return PyInt_FromLong(0); 4333 4334 e = p + PyUnicode_GET_SIZE(self); 4335 for (; p < e; p++) { 4336 if (!Py_UNICODE_ISDECIMAL(*p)) 4337 return PyInt_FromLong(0); 4338 } 4339 return PyInt_FromLong(1); 4340} 4341 4342static char isdigit__doc__[] = 4343"S.isdigit() -> int\n\ 4344\n\ 4345Return 1 if there are only digit characters in S,\n\ 43460 otherwise."; 4347 4348static PyObject* 4349unicode_isdigit(PyUnicodeObject *self) 4350{ 4351 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 4352 register const Py_UNICODE *e; 4353 4354 /* Shortcut for single character strings */ 4355 if (PyUnicode_GET_SIZE(self) == 1 && 4356 Py_UNICODE_ISDIGIT(*p)) 4357 return PyInt_FromLong(1); 4358 4359 /* Special case for empty strings */ 4360 if (PyString_GET_SIZE(self) == 0) 4361 return PyInt_FromLong(0); 4362 4363 e = p + PyUnicode_GET_SIZE(self); 4364 for (; p < e; p++) { 4365 if (!Py_UNICODE_ISDIGIT(*p)) 4366 return PyInt_FromLong(0); 4367 } 4368 return PyInt_FromLong(1); 4369} 4370 4371static char isnumeric__doc__[] = 4372"S.isnumeric() -> int\n\ 4373\n\ 4374Return 1 if there are only numeric characters in S,\n\ 43750 otherwise."; 4376 4377static PyObject* 4378unicode_isnumeric(PyUnicodeObject *self) 4379{ 4380 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 4381 register const Py_UNICODE *e; 4382 4383 /* Shortcut for single character strings */ 4384 if (PyUnicode_GET_SIZE(self) == 1 && 4385 Py_UNICODE_ISNUMERIC(*p)) 4386 return PyInt_FromLong(1); 4387 4388 /* Special case for empty strings */ 4389 if (PyString_GET_SIZE(self) == 0) 4390 return PyInt_FromLong(0); 4391 4392 e = p + PyUnicode_GET_SIZE(self); 4393 for (; p < e; p++) { 4394 if (!Py_UNICODE_ISNUMERIC(*p)) 4395 return PyInt_FromLong(0); 4396 } 4397 return PyInt_FromLong(1); 4398} 4399 4400static char join__doc__[] = 4401"S.join(sequence) -> unicode\n\ 4402\n\ 4403Return a string which is the 
concatenation of the strings in the\n\ 4404sequence. The separator between elements is S."; 4405 4406static PyObject* 4407unicode_join(PyObject *self, PyObject *data) 4408{ 4409 return PyUnicode_Join(self, data); 4410} 4411 4412static int 4413unicode_length(PyUnicodeObject *self) 4414{ 4415 return self->length; 4416} 4417 4418static char ljust__doc__[] = 4419"S.ljust(width) -> unicode\n\ 4420\n\ 4421Return S left justified in a Unicode string of length width. Padding is\n\ 4422done using spaces."; 4423 4424static PyObject * 4425unicode_ljust(PyUnicodeObject *self, PyObject *args) 4426{ 4427 int width; 4428 if (!PyArg_ParseTuple(args, "i:ljust", &width)) 4429 return NULL; 4430 4431 if (self->length >= width && PyUnicode_CheckExact(self)) { 4432 Py_INCREF(self); 4433 return (PyObject*) self; 4434 } 4435 4436 return (PyObject*) pad(self, 0, width - self->length, ' '); 4437} 4438 4439static char lower__doc__[] = 4440"S.lower() -> unicode\n\ 4441\n\ 4442Return a copy of the string S converted to lowercase."; 4443 4444static PyObject* 4445unicode_lower(PyUnicodeObject *self) 4446{ 4447 return fixup(self, fixlower); 4448} 4449 4450static char lstrip__doc__[] = 4451"S.lstrip() -> unicode\n\ 4452\n\ 4453Return a copy of the string S with leading whitespace removed."; 4454 4455static PyObject * 4456unicode_lstrip(PyUnicodeObject *self) 4457{ 4458 return strip(self, 1, 0); 4459} 4460 4461static PyObject* 4462unicode_repeat(PyUnicodeObject *str, int len) 4463{ 4464 PyUnicodeObject *u; 4465 Py_UNICODE *p; 4466 int nchars; 4467 size_t nbytes; 4468 4469 if (len < 0) 4470 len = 0; 4471 4472 if (len == 1 && PyUnicode_CheckExact(str)) { 4473 /* no repeat, return original string */ 4474 Py_INCREF(str); 4475 return (PyObject*) str; 4476 } 4477 4478 /* ensure # of chars needed doesn't overflow int and # of bytes 4479 * needed doesn't overflow size_t 4480 */ 4481 nchars = len * str->length; 4482 if (len && nchars / len != str->length) { 4483 PyErr_SetString(PyExc_OverflowError, 4484 
"repeated string is too long"); 4485 return NULL; 4486 } 4487 nbytes = (nchars + 1) * sizeof(Py_UNICODE); 4488 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) { 4489 PyErr_SetString(PyExc_OverflowError, 4490 "repeated string is too long"); 4491 return NULL; 4492 } 4493 u = _PyUnicode_New(nchars); 4494 if (!u) 4495 return NULL; 4496 4497 p = u->str; 4498 4499 while (len-- > 0) { 4500 Py_UNICODE_COPY(p, str->str, str->length); 4501 p += str->length; 4502 } 4503 4504 return (PyObject*) u; 4505} 4506 4507PyObject *PyUnicode_Replace(PyObject *obj, 4508 PyObject *subobj, 4509 PyObject *replobj, 4510 int maxcount) 4511{ 4512 PyObject *self; 4513 PyObject *str1; 4514 PyObject *str2; 4515 PyObject *result; 4516 4517 self = PyUnicode_FromObject(obj); 4518 if (self == NULL) 4519 return NULL; 4520 str1 = PyUnicode_FromObject(subobj); 4521 if (str1 == NULL) { 4522 Py_DECREF(self); 4523 return NULL; 4524 } 4525 str2 = PyUnicode_FromObject(replobj); 4526 if (str2 == NULL) { 4527 Py_DECREF(self); 4528 Py_DECREF(str1); 4529 return NULL; 4530 } 4531 result = replace((PyUnicodeObject *)self, 4532 (PyUnicodeObject *)str1, 4533 (PyUnicodeObject *)str2, 4534 maxcount); 4535 Py_DECREF(self); 4536 Py_DECREF(str1); 4537 Py_DECREF(str2); 4538 return result; 4539} 4540 4541static char replace__doc__[] = 4542"S.replace (old, new[, maxsplit]) -> unicode\n\ 4543\n\ 4544Return a copy of S with all occurrences of substring\n\ 4545old replaced by new. 
If the optional argument maxsplit is\n\ 4546given, only the first maxsplit occurrences are replaced."; 4547 4548static PyObject* 4549unicode_replace(PyUnicodeObject *self, PyObject *args) 4550{ 4551 PyUnicodeObject *str1; 4552 PyUnicodeObject *str2; 4553 int maxcount = -1; 4554 PyObject *result; 4555 4556 if (!PyArg_ParseTuple(args, "OO|i:replace", &str1, &str2, &maxcount)) 4557 return NULL; 4558 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1); 4559 if (str1 == NULL) 4560 return NULL; 4561 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2); 4562 if (str2 == NULL) 4563 return NULL; 4564 4565 result = replace(self, str1, str2, maxcount); 4566 4567 Py_DECREF(str1); 4568 Py_DECREF(str2); 4569 return result; 4570} 4571 4572static 4573PyObject *unicode_repr(PyObject *unicode) 4574{ 4575 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode), 4576 PyUnicode_GET_SIZE(unicode), 4577 1); 4578} 4579 4580static char rfind__doc__[] = 4581"S.rfind(sub [,start [,end]]) -> int\n\ 4582\n\ 4583Return the highest index in S where substring sub is found,\n\ 4584such that sub is contained within s[start,end]. 
Optional\n\ 4585arguments start and end are interpreted as in slice notation.\n\ 4586\n\ 4587Return -1 on failure."; 4588 4589static PyObject * 4590unicode_rfind(PyUnicodeObject *self, PyObject *args) 4591{ 4592 PyUnicodeObject *substring; 4593 int start = 0; 4594 int end = INT_MAX; 4595 PyObject *result; 4596 4597 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring, 4598 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end)) 4599 return NULL; 4600 substring = (PyUnicodeObject *)PyUnicode_FromObject( 4601 (PyObject *)substring); 4602 if (substring == NULL) 4603 return NULL; 4604 4605 result = PyInt_FromLong(findstring(self, substring, start, end, -1)); 4606 4607 Py_DECREF(substring); 4608 return result; 4609} 4610 4611static char rindex__doc__[] = 4612"S.rindex(sub [,start [,end]]) -> int\n\ 4613\n\ 4614Like S.rfind() but raise ValueError when the substring is not found."; 4615 4616static PyObject * 4617unicode_rindex(PyUnicodeObject *self, PyObject *args) 4618{ 4619 int result; 4620 PyUnicodeObject *substring; 4621 int start = 0; 4622 int end = INT_MAX; 4623 4624 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring, 4625 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end)) 4626 return NULL; 4627 substring = (PyUnicodeObject *)PyUnicode_FromObject( 4628 (PyObject *)substring); 4629 if (substring == NULL) 4630 return NULL; 4631 4632 result = findstring(self, substring, start, end, -1); 4633 4634 Py_DECREF(substring); 4635 if (result < 0) { 4636 PyErr_SetString(PyExc_ValueError, "substring not found"); 4637 return NULL; 4638 } 4639 return PyInt_FromLong(result); 4640} 4641 4642static char rjust__doc__[] = 4643"S.rjust(width) -> unicode\n\ 4644\n\ 4645Return S right justified in a Unicode string of length width. 
Padding is\n\ 4646done using spaces."; 4647 4648static PyObject * 4649unicode_rjust(PyUnicodeObject *self, PyObject *args) 4650{ 4651 int width; 4652 if (!PyArg_ParseTuple(args, "i:rjust", &width)) 4653 return NULL; 4654 4655 if (self->length >= width && PyUnicode_CheckExact(self)) { 4656 Py_INCREF(self); 4657 return (PyObject*) self; 4658 } 4659 4660 return (PyObject*) pad(self, width - self->length, 0, ' '); 4661} 4662 4663static char rstrip__doc__[] = 4664"S.rstrip() -> unicode\n\ 4665\n\ 4666Return a copy of the string S with trailing whitespace removed."; 4667 4668static PyObject * 4669unicode_rstrip(PyUnicodeObject *self) 4670{ 4671 return strip(self, 0, 1); 4672} 4673 4674static PyObject* 4675unicode_slice(PyUnicodeObject *self, int start, int end) 4676{ 4677 /* standard clamping */ 4678 if (start < 0) 4679 start = 0; 4680 if (end < 0) 4681 end = 0; 4682 if (end > self->length) 4683 end = self->length; 4684 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) { 4685 /* full slice, return original string */ 4686 Py_INCREF(self); 4687 return (PyObject*) self; 4688 } 4689 if (start > end) 4690 start = end; 4691 /* copy slice */ 4692 return (PyObject*) PyUnicode_FromUnicode(self->str + start, 4693 end - start); 4694} 4695 4696PyObject *PyUnicode_Split(PyObject *s, 4697 PyObject *sep, 4698 int maxsplit) 4699{ 4700 PyObject *result; 4701 4702 s = PyUnicode_FromObject(s); 4703 if (s == NULL) 4704 return NULL; 4705 if (sep != NULL) { 4706 sep = PyUnicode_FromObject(sep); 4707 if (sep == NULL) { 4708 Py_DECREF(s); 4709 return NULL; 4710 } 4711 } 4712 4713 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit); 4714 4715 Py_DECREF(s); 4716 Py_XDECREF(sep); 4717 return result; 4718} 4719 4720static char split__doc__[] = 4721"S.split([sep [,maxsplit]]) -> list of strings\n\ 4722\n\ 4723Return a list of the words in S, using sep as the\n\ 4724delimiter string. If maxsplit is given, at most maxsplit\n\ 4725splits are done. 
If sep is not specified, any whitespace string\n\ 4726is a separator."; 4727 4728static PyObject* 4729unicode_split(PyUnicodeObject *self, PyObject *args) 4730{ 4731 PyObject *substring = Py_None; 4732 int maxcount = -1; 4733 4734 if (!PyArg_ParseTuple(args, "|Oi:split", &substring, &maxcount)) 4735 return NULL; 4736 4737 if (substring == Py_None) 4738 return split(self, NULL, maxcount); 4739 else if (PyUnicode_Check(substring)) 4740 return split(self, (PyUnicodeObject *)substring, maxcount); 4741 else 4742 return PyUnicode_Split((PyObject *)self, substring, maxcount); 4743} 4744 4745static char splitlines__doc__[] = 4746"S.splitlines([keepends]]) -> list of strings\n\ 4747\n\ 4748Return a list of the lines in S, breaking at line boundaries.\n\ 4749Line breaks are not included in the resulting list unless keepends\n\ 4750is given and true."; 4751 4752static PyObject* 4753unicode_splitlines(PyUnicodeObject *self, PyObject *args) 4754{ 4755 int keepends = 0; 4756 4757 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends)) 4758 return NULL; 4759 4760 return PyUnicode_Splitlines((PyObject *)self, keepends); 4761} 4762 4763static 4764PyObject *unicode_str(PyUnicodeObject *self) 4765{ 4766 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL); 4767} 4768 4769static char strip__doc__[] = 4770"S.strip() -> unicode\n\ 4771\n\ 4772Return a copy of S with leading and trailing whitespace removed."; 4773 4774static PyObject * 4775unicode_strip(PyUnicodeObject *self) 4776{ 4777 return strip(self, 1, 1); 4778} 4779 4780static char swapcase__doc__[] = 4781"S.swapcase() -> unicode\n\ 4782\n\ 4783Return a copy of S with uppercase characters converted to lowercase\n\ 4784and vice versa."; 4785 4786static PyObject* 4787unicode_swapcase(PyUnicodeObject *self) 4788{ 4789 return fixup(self, fixswapcase); 4790} 4791 4792static char translate__doc__[] = 4793"S.translate(table) -> unicode\n\ 4794\n\ 4795Return a copy of the string S, where all characters have been mapped\n\ 
4796through the given translation table, which must be a mapping of\n\ 4797Unicode ordinals to Unicode ordinals or None. Unmapped characters\n\ 4798are left untouched. Characters mapped to None are deleted."; 4799 4800static PyObject* 4801unicode_translate(PyUnicodeObject *self, PyObject *table) 4802{ 4803 return PyUnicode_TranslateCharmap(self->str, 4804 self->length, 4805 table, 4806 "ignore"); 4807} 4808 4809static char upper__doc__[] = 4810"S.upper() -> unicode\n\ 4811\n\ 4812Return a copy of S converted to uppercase."; 4813 4814static PyObject* 4815unicode_upper(PyUnicodeObject *self) 4816{ 4817 return fixup(self, fixupper); 4818} 4819 4820#if 0 4821static char zfill__doc__[] = 4822"S.zfill(width) -> unicode\n\ 4823\n\ 4824Pad a numeric string x with zeros on the left, to fill a field\n\ 4825of the specified width. The string x is never truncated."; 4826 4827static PyObject * 4828unicode_zfill(PyUnicodeObject *self, PyObject *args) 4829{ 4830 int fill; 4831 PyUnicodeObject *u; 4832 4833 int width; 4834 if (!PyArg_ParseTuple(args, "i:zfill", &width)) 4835 return NULL; 4836 4837 if (self->length >= width) { 4838 Py_INCREF(self); 4839 return (PyObject*) self; 4840 } 4841 4842 fill = width - self->length; 4843 4844 u = pad(self, fill, 0, '0'); 4845 4846 if (u->str[fill] == '+' || u->str[fill] == '-') { 4847 /* move sign to beginning of string */ 4848 u->str[0] = u->str[fill]; 4849 u->str[fill] = '0'; 4850 } 4851 4852 return (PyObject*) u; 4853} 4854#endif 4855 4856#if 0 4857static PyObject* 4858unicode_freelistsize(PyUnicodeObject *self) 4859{ 4860 return PyInt_FromLong(unicode_freelist_size); 4861} 4862#endif 4863 4864static char startswith__doc__[] = 4865"S.startswith(prefix[, start[, end]]) -> int\n\ 4866\n\ 4867Return 1 if S starts with the specified prefix, otherwise return 0. With\n\ 4868optional start, test S beginning at that position. 
With optional end, stop\n\ 4869comparing S at that position."; 4870 4871static PyObject * 4872unicode_startswith(PyUnicodeObject *self, 4873 PyObject *args) 4874{ 4875 PyUnicodeObject *substring; 4876 int start = 0; 4877 int end = INT_MAX; 4878 PyObject *result; 4879 4880 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &substring, 4881 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end)) 4882 return NULL; 4883 substring = (PyUnicodeObject *)PyUnicode_FromObject( 4884 (PyObject *)substring); 4885 if (substring == NULL) 4886 return NULL; 4887 4888 result = PyInt_FromLong(tailmatch(self, substring, start, end, -1)); 4889 4890 Py_DECREF(substring); 4891 return result; 4892} 4893 4894 4895static char endswith__doc__[] = 4896"S.endswith(suffix[, start[, end]]) -> int\n\ 4897\n\ 4898Return 1 if S ends with the specified suffix, otherwise return 0. With\n\ 4899optional start, test S beginning at that position. With optional end, stop\n\ 4900comparing S at that position."; 4901 4902static PyObject * 4903unicode_endswith(PyUnicodeObject *self, 4904 PyObject *args) 4905{ 4906 PyUnicodeObject *substring; 4907 int start = 0; 4908 int end = INT_MAX; 4909 PyObject *result; 4910 4911 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &substring, 4912 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end)) 4913 return NULL; 4914 substring = (PyUnicodeObject *)PyUnicode_FromObject( 4915 (PyObject *)substring); 4916 if (substring == NULL) 4917 return NULL; 4918 4919 result = PyInt_FromLong(tailmatch(self, substring, start, end, +1)); 4920 4921 Py_DECREF(substring); 4922 return result; 4923} 4924 4925 4926static PyMethodDef unicode_methods[] = { 4927 4928 /* Order is according to common usage: often used methods should 4929 appear first, since lookup is done sequentially. 
*/

    /* Each row: Python-level name, C implementation, calling
       convention flag, docstring. */
    {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
    {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
    {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
    {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
    {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
    {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
    {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
    {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
    {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
    {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
    {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
    {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
    {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
    {"lstrip", (PyCFunction) unicode_lstrip, METH_NOARGS, lstrip__doc__},
/*  {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
    {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
    {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
    {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
    {"rstrip", (PyCFunction) unicode_rstrip, METH_NOARGS, rstrip__doc__},
    {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
    {"strip", (PyCFunction) unicode_strip, METH_NOARGS, strip__doc__},
    {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
    {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
    {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
    {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
    {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS,
endswith__doc__}, 4957 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__}, 4958 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__}, 4959 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__}, 4960 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__}, 4961 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__}, 4962 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__}, 4963 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__}, 4964 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__}, 4965 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__}, 4966#if 0 4967 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__}, 4968 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__}, 4969#endif 4970 4971#if 0 4972 /* This one is just used for debugging the implementation. */ 4973 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS}, 4974#endif 4975 4976 {NULL, NULL} 4977}; 4978 4979static PySequenceMethods unicode_as_sequence = { 4980 (inquiry) unicode_length, /* sq_length */ 4981 (binaryfunc) PyUnicode_Concat, /* sq_concat */ 4982 (intargfunc) unicode_repeat, /* sq_repeat */ 4983 (intargfunc) unicode_getitem, /* sq_item */ 4984 (intintargfunc) unicode_slice, /* sq_slice */ 4985 0, /* sq_ass_item */ 4986 0, /* sq_ass_slice */ 4987 (objobjproc)PyUnicode_Contains, /*sq_contains*/ 4988}; 4989 4990static int 4991unicode_buffer_getreadbuf(PyUnicodeObject *self, 4992 int index, 4993 const void **ptr) 4994{ 4995 if (index != 0) { 4996 PyErr_SetString(PyExc_SystemError, 4997 "accessing non-existent unicode segment"); 4998 return -1; 4999 } 5000 *ptr = (void *) self->str; 5001 return PyUnicode_GET_DATA_SIZE(self); 5002} 5003 5004static int 5005unicode_buffer_getwritebuf(PyUnicodeObject *self, int index, 5006 const void **ptr) 5007{ 5008 
PyErr_SetString(PyExc_TypeError, 5009 "cannot use unicode as modifyable buffer"); 5010 return -1; 5011} 5012 5013static int 5014unicode_buffer_getsegcount(PyUnicodeObject *self, 5015 int *lenp) 5016{ 5017 if (lenp) 5018 *lenp = PyUnicode_GET_DATA_SIZE(self); 5019 return 1; 5020} 5021 5022static int 5023unicode_buffer_getcharbuf(PyUnicodeObject *self, 5024 int index, 5025 const void **ptr) 5026{ 5027 PyObject *str; 5028 5029 if (index != 0) { 5030 PyErr_SetString(PyExc_SystemError, 5031 "accessing non-existent unicode segment"); 5032 return -1; 5033 } 5034 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL); 5035 if (str == NULL) 5036 return -1; 5037 *ptr = (void *) PyString_AS_STRING(str); 5038 return PyString_GET_SIZE(str); 5039} 5040 5041/* Helpers for PyUnicode_Format() */ 5042 5043static PyObject * 5044getnextarg(PyObject *args, int arglen, int *p_argidx) 5045{ 5046 int argidx = *p_argidx; 5047 if (argidx < arglen) { 5048 (*p_argidx)++; 5049 if (arglen < 0) 5050 return args; 5051 else 5052 return PyTuple_GetItem(args, argidx); 5053 } 5054 PyErr_SetString(PyExc_TypeError, 5055 "not enough arguments for format string"); 5056 return NULL; 5057} 5058 5059#define F_LJUST (1<<0) 5060#define F_SIGN (1<<1) 5061#define F_BLANK (1<<2) 5062#define F_ALT (1<<3) 5063#define F_ZERO (1<<4) 5064 5065static 5066int usprintf(register Py_UNICODE *buffer, char *format, ...) 5067{ 5068 register int i; 5069 int len; 5070 va_list va; 5071 char *charbuffer; 5072 va_start(va, format); 5073 5074 /* First, format the string as char array, then expand to Py_UNICODE 5075 array. */ 5076 charbuffer = (char *)buffer; 5077 len = vsprintf(charbuffer, format, va); 5078 for (i = len - 1; i >= 0; i--) 5079 buffer[i] = (Py_UNICODE) charbuffer[i]; 5080 5081 va_end(va); 5082 return len; 5083} 5084 5085static int 5086formatfloat(Py_UNICODE *buf, 5087 size_t buflen, 5088 int flags, 5089 int prec, 5090 int type, 5091 PyObject *v) 5092{ 5093 /* fmt = '%#.' 
+ `prec` + `type` 5094 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/ 5095 char fmt[20]; 5096 double x; 5097 5098 x = PyFloat_AsDouble(v); 5099 if (x == -1.0 && PyErr_Occurred()) 5100 return -1; 5101 if (prec < 0) 5102 prec = 6; 5103 if (type == 'f' && (fabs(x) / 1e25) >= 1e25) 5104 type = 'g'; 5105 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c", 5106 (flags & F_ALT) ? "#" : "", prec, type); 5107 /* worst case length calc to ensure no buffer overrun: 5108 fmt = %#.<prec>g 5109 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp 5110 for any double rep.) 5111 len = 1 + prec + 1 + 2 + 5 = 9 + prec 5112 If prec=0 the effective precision is 1 (the leading digit is 5113 always given), therefore increase by one to 10+prec. */ 5114 if (buflen <= (size_t)10 + (size_t)prec) { 5115 PyErr_SetString(PyExc_OverflowError, 5116 "formatted float is too long (precision too long?)"); 5117 return -1; 5118 } 5119 return usprintf(buf, fmt, x); 5120} 5121 5122static PyObject* 5123formatlong(PyObject *val, int flags, int prec, int type) 5124{ 5125 char *buf; 5126 int i, len; 5127 PyObject *str; /* temporary string object. */ 5128 PyUnicodeObject *result; 5129 5130 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len); 5131 if (!str) 5132 return NULL; 5133 result = _PyUnicode_New(len); 5134 for (i = 0; i < len; i++) 5135 result->str[i] = buf[i]; 5136 result->str[len] = 0; 5137 Py_DECREF(str); 5138 return (PyObject*)result; 5139} 5140 5141static int 5142formatint(Py_UNICODE *buf, 5143 size_t buflen, 5144 int flags, 5145 int prec, 5146 int type, 5147 PyObject *v) 5148{ 5149 /* fmt = '%#.' + `prec` + 'l' + `type` 5150 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine) 5151 * + 1 + 1 5152 * = 24 5153 */ 5154 char fmt[64]; /* plenty big enough! 
*/ 5155 long x; 5156 5157 x = PyInt_AsLong(v); 5158 if (x == -1 && PyErr_Occurred()) 5159 return -1; 5160 if (prec < 0) 5161 prec = 1; 5162 5163 /* buf = '+'/'-'/'0'/'0x' + '[0-9]'*max(prec,len(x in octal)) 5164 * worst case buf = '0x' + [0-9]*prec, where prec >= 11 5165 */ 5166 if (buflen <= 13 || buflen <= (size_t)2 + (size_t)prec) { 5167 PyErr_SetString(PyExc_OverflowError, 5168 "formatted integer is too long (precision too large?)"); 5169 return -1; 5170 } 5171 5172 if ((flags & F_ALT) && 5173 (type == 'x' || type == 'X')) { 5174 /* When converting under %#x or %#X, there are a number 5175 * of issues that cause pain: 5176 * - when 0 is being converted, the C standard leaves off 5177 * the '0x' or '0X', which is inconsistent with other 5178 * %#x/%#X conversions and inconsistent with Python's 5179 * hex() function 5180 * - there are platforms that violate the standard and 5181 * convert 0 with the '0x' or '0X' 5182 * (Metrowerks, Compaq Tru64) 5183 * - there are platforms that give '0x' when converting 5184 * under %#X, but convert 0 in accordance with the 5185 * standard (OS/2 EMX) 5186 * 5187 * We can achieve the desired consistency by inserting our 5188 * own '0x' or '0X' prefix, and substituting %x/%X in place 5189 * of %#x/%#X. 5190 * 5191 * Note that this is the same approach as used in 5192 * formatint() in stringobject.c 5193 */ 5194 PyOS_snprintf(fmt, sizeof(fmt), "0%c%%.%dl%c", 5195 type, prec, type); 5196 } 5197 else { 5198 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%dl%c", 5199 (flags&F_ALT) ? 
"#" : "", 5200 prec, type); 5201 } 5202 return usprintf(buf, fmt, x); 5203} 5204 5205static int 5206formatchar(Py_UNICODE *buf, 5207 size_t buflen, 5208 PyObject *v) 5209{ 5210 /* presume that the buffer is at least 2 characters long */ 5211 if (PyUnicode_Check(v)) { 5212 if (PyUnicode_GET_SIZE(v) != 1) 5213 goto onError; 5214 buf[0] = PyUnicode_AS_UNICODE(v)[0]; 5215 } 5216 5217 else if (PyString_Check(v)) { 5218 if (PyString_GET_SIZE(v) != 1) 5219 goto onError; 5220 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0]; 5221 } 5222 5223 else { 5224 /* Integer input truncated to a character */ 5225 long x; 5226 x = PyInt_AsLong(v); 5227 if (x == -1 && PyErr_Occurred()) 5228 goto onError; 5229 buf[0] = (char) x; 5230 } 5231 buf[1] = '\0'; 5232 return 1; 5233 5234 onError: 5235 PyErr_SetString(PyExc_TypeError, 5236 "%c requires int or char"); 5237 return -1; 5238} 5239 5240/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...) 5241 5242 FORMATBUFLEN is the length of the buffer in which the floats, ints, & 5243 chars are formatted. XXX This is a magic number. Each formatting 5244 routine does bounds checking to ensure no overflow, but a better 5245 solution may be to malloc a buffer of appropriate size for each 5246 format. For now, the current solution is sufficient. 
5247*/ 5248#define FORMATBUFLEN (size_t)120 5249 5250PyObject *PyUnicode_Format(PyObject *format, 5251 PyObject *args) 5252{ 5253 Py_UNICODE *fmt, *res; 5254 int fmtcnt, rescnt, reslen, arglen, argidx; 5255 int args_owned = 0; 5256 PyUnicodeObject *result = NULL; 5257 PyObject *dict = NULL; 5258 PyObject *uformat; 5259 5260 if (format == NULL || args == NULL) { 5261 PyErr_BadInternalCall(); 5262 return NULL; 5263 } 5264 uformat = PyUnicode_FromObject(format); 5265 if (uformat == NULL) 5266 return NULL; 5267 fmt = PyUnicode_AS_UNICODE(uformat); 5268 fmtcnt = PyUnicode_GET_SIZE(uformat); 5269 5270 reslen = rescnt = fmtcnt + 100; 5271 result = _PyUnicode_New(reslen); 5272 if (result == NULL) 5273 goto onError; 5274 res = PyUnicode_AS_UNICODE(result); 5275 5276 if (PyTuple_Check(args)) { 5277 arglen = PyTuple_Size(args); 5278 argidx = 0; 5279 } 5280 else { 5281 arglen = -1; 5282 argidx = -2; 5283 } 5284 if (args->ob_type->tp_as_mapping) 5285 dict = args; 5286 5287 while (--fmtcnt >= 0) { 5288 if (*fmt != '%') { 5289 if (--rescnt < 0) { 5290 rescnt = fmtcnt + 100; 5291 reslen += rescnt; 5292 if (_PyUnicode_Resize(&result, reslen) < 0) 5293 return NULL; 5294 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt; 5295 --rescnt; 5296 } 5297 *res++ = *fmt++; 5298 } 5299 else { 5300 /* Got a format specifier */ 5301 int flags = 0; 5302 int width = -1; 5303 int prec = -1; 5304 Py_UNICODE c = '\0'; 5305 Py_UNICODE fill; 5306 PyObject *v = NULL; 5307 PyObject *temp = NULL; 5308 Py_UNICODE *pbuf; 5309 Py_UNICODE sign; 5310 int len; 5311 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */ 5312 5313 fmt++; 5314 if (*fmt == '(') { 5315 Py_UNICODE *keystart; 5316 int keylen; 5317 PyObject *key; 5318 int pcount = 1; 5319 5320 if (dict == NULL) { 5321 PyErr_SetString(PyExc_TypeError, 5322 "format requires a mapping"); 5323 goto onError; 5324 } 5325 ++fmt; 5326 --fmtcnt; 5327 keystart = fmt; 5328 /* Skip over balanced parentheses */ 5329 while (pcount > 0 && --fmtcnt 
>= 0) { 5330 if (*fmt == ')') 5331 --pcount; 5332 else if (*fmt == '(') 5333 ++pcount; 5334 fmt++; 5335 } 5336 keylen = fmt - keystart - 1; 5337 if (fmtcnt < 0 || pcount > 0) { 5338 PyErr_SetString(PyExc_ValueError, 5339 "incomplete format key"); 5340 goto onError; 5341 } 5342#if 0 5343 /* keys are converted to strings using UTF-8 and 5344 then looked up since Python uses strings to hold 5345 variables names etc. in its namespaces and we 5346 wouldn't want to break common idioms. */ 5347 key = PyUnicode_EncodeUTF8(keystart, 5348 keylen, 5349 NULL); 5350#else 5351 key = PyUnicode_FromUnicode(keystart, keylen); 5352#endif 5353 if (key == NULL) 5354 goto onError; 5355 if (args_owned) { 5356 Py_DECREF(args); 5357 args_owned = 0; 5358 } 5359 args = PyObject_GetItem(dict, key); 5360 Py_DECREF(key); 5361 if (args == NULL) { 5362 goto onError; 5363 } 5364 args_owned = 1; 5365 arglen = -1; 5366 argidx = -2; 5367 } 5368 while (--fmtcnt >= 0) { 5369 switch (c = *fmt++) { 5370 case '-': flags |= F_LJUST; continue; 5371 case '+': flags |= F_SIGN; continue; 5372 case ' ': flags |= F_BLANK; continue; 5373 case '#': flags |= F_ALT; continue; 5374 case '0': flags |= F_ZERO; continue; 5375 } 5376 break; 5377 } 5378 if (c == '*') { 5379 v = getnextarg(args, arglen, &argidx); 5380 if (v == NULL) 5381 goto onError; 5382 if (!PyInt_Check(v)) { 5383 PyErr_SetString(PyExc_TypeError, 5384 "* wants int"); 5385 goto onError; 5386 } 5387 width = PyInt_AsLong(v); 5388 if (width < 0) { 5389 flags |= F_LJUST; 5390 width = -width; 5391 } 5392 if (--fmtcnt >= 0) 5393 c = *fmt++; 5394 } 5395 else if (c >= '0' && c <= '9') { 5396 width = c - '0'; 5397 while (--fmtcnt >= 0) { 5398 c = *fmt++; 5399 if (c < '0' || c > '9') 5400 break; 5401 if ((width*10) / 10 != width) { 5402 PyErr_SetString(PyExc_ValueError, 5403 "width too big"); 5404 goto onError; 5405 } 5406 width = width*10 + (c - '0'); 5407 } 5408 } 5409 if (c == '.') { 5410 prec = 0; 5411 if (--fmtcnt >= 0) 5412 c = *fmt++; 5413 if (c == '*') { 
5414 v = getnextarg(args, arglen, &argidx); 5415 if (v == NULL) 5416 goto onError; 5417 if (!PyInt_Check(v)) { 5418 PyErr_SetString(PyExc_TypeError, 5419 "* wants int"); 5420 goto onError; 5421 } 5422 prec = PyInt_AsLong(v); 5423 if (prec < 0) 5424 prec = 0; 5425 if (--fmtcnt >= 0) 5426 c = *fmt++; 5427 } 5428 else if (c >= '0' && c <= '9') { 5429 prec = c - '0'; 5430 while (--fmtcnt >= 0) { 5431 c = Py_CHARMASK(*fmt++); 5432 if (c < '0' || c > '9') 5433 break; 5434 if ((prec*10) / 10 != prec) { 5435 PyErr_SetString(PyExc_ValueError, 5436 "prec too big"); 5437 goto onError; 5438 } 5439 prec = prec*10 + (c - '0'); 5440 } 5441 } 5442 } /* prec */ 5443 if (fmtcnt >= 0) { 5444 if (c == 'h' || c == 'l' || c == 'L') { 5445 if (--fmtcnt >= 0) 5446 c = *fmt++; 5447 } 5448 } 5449 if (fmtcnt < 0) { 5450 PyErr_SetString(PyExc_ValueError, 5451 "incomplete format"); 5452 goto onError; 5453 } 5454 if (c != '%') { 5455 v = getnextarg(args, arglen, &argidx); 5456 if (v == NULL) 5457 goto onError; 5458 } 5459 sign = 0; 5460 fill = ' '; 5461 switch (c) { 5462 5463 case '%': 5464 pbuf = formatbuf; 5465 /* presume that buffer length is at least 1 */ 5466 pbuf[0] = '%'; 5467 len = 1; 5468 break; 5469 5470 case 's': 5471 case 'r': 5472 if (PyUnicode_Check(v) && c == 's') { 5473 temp = v; 5474 Py_INCREF(temp); 5475 } 5476 else { 5477 PyObject *unicode; 5478 if (c == 's') 5479 temp = PyObject_Str(v); 5480 else 5481 temp = PyObject_Repr(v); 5482 if (temp == NULL) 5483 goto onError; 5484 if (!PyString_Check(temp)) { 5485 /* XXX Note: this should never happen, since 5486 PyObject_Repr() and PyObject_Str() assure 5487 this */ 5488 Py_DECREF(temp); 5489 PyErr_SetString(PyExc_TypeError, 5490 "%s argument has non-string str()"); 5491 goto onError; 5492 } 5493 unicode = PyUnicode_Decode(PyString_AS_STRING(temp), 5494 PyString_GET_SIZE(temp), 5495 NULL, 5496 "strict"); 5497 Py_DECREF(temp); 5498 temp = unicode; 5499 if (temp == NULL) 5500 goto onError; 5501 } 5502 pbuf = 
PyUnicode_AS_UNICODE(temp); 5503 len = PyUnicode_GET_SIZE(temp); 5504 if (prec >= 0 && len > prec) 5505 len = prec; 5506 break; 5507 5508 case 'i': 5509 case 'd': 5510 case 'u': 5511 case 'o': 5512 case 'x': 5513 case 'X': 5514 if (c == 'i') 5515 c = 'd'; 5516 if (PyLong_Check(v)) { 5517 temp = formatlong(v, flags, prec, c); 5518 if (!temp) 5519 goto onError; 5520 pbuf = PyUnicode_AS_UNICODE(temp); 5521 len = PyUnicode_GET_SIZE(temp); 5522 /* unbounded ints can always produce 5523 a sign character! */ 5524 sign = 1; 5525 } 5526 else { 5527 pbuf = formatbuf; 5528 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), 5529 flags, prec, c, v); 5530 if (len < 0) 5531 goto onError; 5532 /* only d conversion is signed */ 5533 sign = c == 'd'; 5534 } 5535 if (flags & F_ZERO) 5536 fill = '0'; 5537 break; 5538 5539 case 'e': 5540 case 'E': 5541 case 'f': 5542 case 'g': 5543 case 'G': 5544 pbuf = formatbuf; 5545 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), 5546 flags, prec, c, v); 5547 if (len < 0) 5548 goto onError; 5549 sign = 1; 5550 if (flags & F_ZERO) 5551 fill = '0'; 5552 break; 5553 5554 case 'c': 5555 pbuf = formatbuf; 5556 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v); 5557 if (len < 0) 5558 goto onError; 5559 break; 5560 5561 default: 5562 PyErr_Format(PyExc_ValueError, 5563 "unsupported format character '%c' (0x%x) " 5564 "at index %i", 5565 (31<=c && c<=126) ? 
c : '?', 5566 c, fmt -1 - PyUnicode_AS_UNICODE(uformat)); 5567 goto onError; 5568 } 5569 if (sign) { 5570 if (*pbuf == '-' || *pbuf == '+') { 5571 sign = *pbuf++; 5572 len--; 5573 } 5574 else if (flags & F_SIGN) 5575 sign = '+'; 5576 else if (flags & F_BLANK) 5577 sign = ' '; 5578 else 5579 sign = 0; 5580 } 5581 if (width < len) 5582 width = len; 5583 if (rescnt < width + (sign != 0)) { 5584 reslen -= rescnt; 5585 rescnt = width + fmtcnt + 100; 5586 reslen += rescnt; 5587 if (_PyUnicode_Resize(&result, reslen) < 0) 5588 return NULL; 5589 res = PyUnicode_AS_UNICODE(result) 5590 + reslen - rescnt; 5591 } 5592 if (sign) { 5593 if (fill != ' ') 5594 *res++ = sign; 5595 rescnt--; 5596 if (width > len) 5597 width--; 5598 } 5599 if ((flags & F_ALT) && (c == 'x' || c == 'X')) { 5600 assert(pbuf[0] == '0'); 5601 assert(pbuf[1] == c); 5602 if (fill != ' ') { 5603 *res++ = *pbuf++; 5604 *res++ = *pbuf++; 5605 } 5606 rescnt -= 2; 5607 width -= 2; 5608 if (width < 0) 5609 width = 0; 5610 len -= 2; 5611 } 5612 if (width > len && !(flags & F_LJUST)) { 5613 do { 5614 --rescnt; 5615 *res++ = fill; 5616 } while (--width > len); 5617 } 5618 if (fill == ' ') { 5619 if (sign) 5620 *res++ = sign; 5621 if ((flags & F_ALT) && (c == 'x' || c == 'X')) { 5622 assert(pbuf[0] == '0'); 5623 assert(pbuf[1] == c); 5624 *res++ = *pbuf++; 5625 *res++ = *pbuf++; 5626 } 5627 } 5628 Py_UNICODE_COPY(res, pbuf, len); 5629 res += len; 5630 rescnt -= len; 5631 while (--width >= len) { 5632 --rescnt; 5633 *res++ = ' '; 5634 } 5635 if (dict && (argidx < arglen) && c != '%') { 5636 PyErr_SetString(PyExc_TypeError, 5637 "not all arguments converted"); 5638 goto onError; 5639 } 5640 Py_XDECREF(temp); 5641 } /* '%' */ 5642 } /* until end */ 5643 if (argidx < arglen && !dict) { 5644 PyErr_SetString(PyExc_TypeError, 5645 "not all arguments converted"); 5646 goto onError; 5647 } 5648 5649 if (args_owned) { 5650 Py_DECREF(args); 5651 } 5652 Py_DECREF(uformat); 5653 if (_PyUnicode_Resize(&result, reslen - rescnt)) 
5654 goto onError; 5655 return (PyObject *)result; 5656 5657 onError: 5658 Py_XDECREF(result); 5659 Py_DECREF(uformat); 5660 if (args_owned) { 5661 Py_DECREF(args); 5662 } 5663 return NULL; 5664} 5665 5666static PyBufferProcs unicode_as_buffer = { 5667 (getreadbufferproc) unicode_buffer_getreadbuf, 5668 (getwritebufferproc) unicode_buffer_getwritebuf, 5669 (getsegcountproc) unicode_buffer_getsegcount, 5670 (getcharbufferproc) unicode_buffer_getcharbuf, 5671}; 5672 5673staticforward PyObject * 5674unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds); 5675 5676static PyObject * 5677unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds) 5678{ 5679 PyObject *x = NULL; 5680 static char *kwlist[] = {"string", "encoding", "errors", 0}; 5681 char *encoding = NULL; 5682 char *errors = NULL; 5683 5684 if (type != &PyUnicode_Type) 5685 return unicode_subtype_new(type, args, kwds); 5686 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode", 5687 kwlist, &x, &encoding, &errors)) 5688 return NULL; 5689 if (x == NULL) 5690 return (PyObject *)_PyUnicode_New(0); 5691 if (encoding == NULL && errors == NULL) 5692 return PyObject_Unicode(x); 5693 else 5694 return PyUnicode_FromEncodedObject(x, encoding, errors); 5695} 5696 5697static PyObject * 5698unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds) 5699{ 5700 PyUnicodeObject *tmp, *pnew; 5701 int n; 5702 5703 assert(PyType_IsSubtype(type, &PyUnicode_Type)); 5704 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds); 5705 if (tmp == NULL) 5706 return NULL; 5707 assert(PyUnicode_Check(tmp)); 5708 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length); 5709 if (pnew == NULL) 5710 return NULL; 5711 pnew->str = PyMem_NEW(Py_UNICODE, n+1); 5712 if (pnew->str == NULL) { 5713 _Py_ForgetReference((PyObject *)pnew); 5714 PyMalloc_Del(pnew); 5715 return NULL; 5716 } 5717 Py_UNICODE_COPY(pnew->str, tmp->str, n+1); 5718 pnew->length = n; 5719 pnew->hash = tmp->hash; 5720 
Py_DECREF(tmp); 5721 return (PyObject *)pnew; 5722} 5723 5724static char unicode_doc[] = 5725"unicode(string [, encoding[, errors]]) -> object\n\ 5726\n\ 5727Create a new Unicode object from the given encoded string.\n\ 5728encoding defaults to the current default string encoding and \n\ 5729errors, defining the error handling, to 'strict'."; 5730 5731PyTypeObject PyUnicode_Type = { 5732 PyObject_HEAD_INIT(&PyType_Type) 5733 0, /* ob_size */ 5734 "unicode", /* tp_name */ 5735 sizeof(PyUnicodeObject), /* tp_size */ 5736 0, /* tp_itemsize */ 5737 /* Slots */ 5738 (destructor)unicode_dealloc, /* tp_dealloc */ 5739 0, /* tp_print */ 5740 0, /* tp_getattr */ 5741 0, /* tp_setattr */ 5742 (cmpfunc) unicode_compare, /* tp_compare */ 5743 (reprfunc) unicode_repr, /* tp_repr */ 5744 0, /* tp_as_number */ 5745 &unicode_as_sequence, /* tp_as_sequence */ 5746 0, /* tp_as_mapping */ 5747 (hashfunc) unicode_hash, /* tp_hash*/ 5748 0, /* tp_call*/ 5749 (reprfunc) unicode_str, /* tp_str */ 5750 PyObject_GenericGetAttr, /* tp_getattro */ 5751 0, /* tp_setattro */ 5752 &unicode_as_buffer, /* tp_as_buffer */ 5753 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /* tp_flags */ 5754 unicode_doc, /* tp_doc */ 5755 0, /* tp_traverse */ 5756 0, /* tp_clear */ 5757 0, /* tp_richcompare */ 5758 0, /* tp_weaklistoffset */ 5759 0, /* tp_iter */ 5760 0, /* tp_iternext */ 5761 unicode_methods, /* tp_methods */ 5762 0, /* tp_members */ 5763 0, /* tp_getset */ 5764 0, /* tp_base */ 5765 0, /* tp_dict */ 5766 0, /* tp_descr_get */ 5767 0, /* tp_descr_set */ 5768 0, /* tp_dictoffset */ 5769 0, /* tp_init */ 5770 0, /* tp_alloc */ 5771 unicode_new, /* tp_new */ 5772 _PyMalloc_Del, /* tp_free */ 5773}; 5774 5775/* Initialize the Unicode implementation */ 5776 5777void _PyUnicode_Init(void) 5778{ 5779 int i; 5780 5781 /* Init the implementation */ 5782 unicode_freelist = NULL; 5783 unicode_freelist_size = 0; 5784 unicode_empty = _PyUnicode_New(0); 5785 strcpy(unicode_default_encoding, "ascii"); 5786 for (i = 
0; i < 256; i++) 5787 unicode_latin1[i] = NULL; 5788} 5789 5790/* Finalize the Unicode implementation */ 5791 5792void 5793_PyUnicode_Fini(void) 5794{ 5795 PyUnicodeObject *u; 5796 int i; 5797 5798 Py_XDECREF(unicode_empty); 5799 unicode_empty = NULL; 5800 5801 for (i = 0; i < 256; i++) { 5802 if (unicode_latin1[i]) { 5803 Py_DECREF(unicode_latin1[i]); 5804 unicode_latin1[i] = NULL; 5805 } 5806 } 5807 5808 for (u = unicode_freelist; u != NULL;) { 5809 PyUnicodeObject *v = u; 5810 u = *(PyUnicodeObject **)u; 5811 if (v->str) 5812 PyMem_DEL(v->str); 5813 Py_XDECREF(v->defenc); 5814 PyMalloc_Del(v); 5815 } 5816 unicode_freelist = NULL; 5817 unicode_freelist_size = 0; 5818} 5819