unicodeobject.c revision 0ebac97058baad8250adf710f287e8fb8770f7fa
1/* 2 3Unicode implementation based on original code by Fredrik Lundh, 4modified by Marc-Andre Lemburg <mal@lemburg.com> according to the 5Unicode Integration Proposal (see file Misc/unicode.txt). 6 7Copyright (c) Corporation for National Research Initiatives. 8 9-------------------------------------------------------------------- 10The original string type implementation is: 11 12 Copyright (c) 1999 by Secret Labs AB 13 Copyright (c) 1999 by Fredrik Lundh 14 15By obtaining, using, and/or copying this software and/or its 16associated documentation, you agree that you have read, understood, 17and will comply with the following terms and conditions: 18 19Permission to use, copy, modify, and distribute this software and its 20associated documentation for any purpose and without fee is hereby 21granted, provided that the above copyright notice appears in all 22copies, and that both that copyright notice and this permission notice 23appear in supporting documentation, and that the name of Secret Labs 24AB or the author not be used in advertising or publicity pertaining to 25distribution of the software without specific, written prior 26permission. 27 28SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO 29THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND 30FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR 31ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 32WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 33ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT 34OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
35-------------------------------------------------------------------- 36 37*/ 38 39#include "Python.h" 40 41#include "unicodeobject.h" 42#include "ucnhash.h" 43 44#ifdef MS_WIN32 45#include <windows.h> 46#endif 47 48/* Limit for the Unicode object free list */ 49 50#define MAX_UNICODE_FREELIST_SIZE 1024 51 52/* Limit for the Unicode object free list stay alive optimization. 53 54 The implementation will keep allocated Unicode memory intact for 55 all objects on the free list having a size less than this 56 limit. This reduces malloc() overhead for small Unicode objects. 57 58 At worst this will result in MAX_UNICODE_FREELIST_SIZE * 59 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT + 60 malloc()-overhead) bytes of unused garbage. 61 62 Setting the limit to 0 effectively turns the feature off. 63 64 Note: This is an experimental feature ! If you get core dumps when 65 using Unicode objects, turn this feature off. 66 67*/ 68 69#define KEEPALIVE_SIZE_LIMIT 9 70 71/* Endianness switches; defaults to little endian */ 72 73#ifdef WORDS_BIGENDIAN 74# define BYTEORDER_IS_BIG_ENDIAN 75#else 76# define BYTEORDER_IS_LITTLE_ENDIAN 77#endif 78 79/* --- Globals ------------------------------------------------------------ 80 81 The globals are initialized by the _PyUnicode_Init() API and should 82 not be used before calling that API. 83 84*/ 85 86/* Free list for Unicode objects */ 87static PyUnicodeObject *unicode_freelist; 88static int unicode_freelist_size; 89 90/* The empty Unicode object is shared to improve performance. */ 91static PyUnicodeObject *unicode_empty; 92 93/* Single character Unicode strings in the Latin-1 range are being 94 shared as well. */ 95static PyUnicodeObject *unicode_latin1[256]; 96 97/* Default encoding to use and assume when NULL is passed as encoding 98 parameter; it is initialized by _PyUnicode_Init(). 99 100 Always use the PyUnicode_SetDefaultEncoding() and 101 PyUnicode_GetDefaultEncoding() APIs to access this global. 
102 103*/ 104static char unicode_default_encoding[100]; 105 106Py_UNICODE 107PyUnicode_GetMax(void) 108{ 109#ifdef Py_UNICODE_WIDE 110 return 0x10FFFF; 111#else 112 /* This is actually an illegal character, so it should 113 not be passed to unichr. */ 114 return 0xFFFF; 115#endif 116} 117 118/* --- Unicode Object ----------------------------------------------------- */ 119 120static 121int unicode_resize(register PyUnicodeObject *unicode, 122 int length) 123{ 124 void *oldstr; 125 126 /* Shortcut if there's nothing much to do. */ 127 if (unicode->length == length) 128 goto reset; 129 130 /* Resizing shared object (unicode_empty or single character 131 objects) in-place is not allowed. Use PyUnicode_Resize() 132 instead ! */ 133 if (unicode == unicode_empty || 134 (unicode->length == 1 && 135 unicode->str[0] < 256 && 136 unicode_latin1[unicode->str[0]] == unicode)) { 137 PyErr_SetString(PyExc_SystemError, 138 "can't resize shared unicode objects"); 139 return -1; 140 } 141 142 /* We allocate one more byte to make sure the string is 143 Ux0000 terminated -- XXX is this needed ? */ 144 oldstr = unicode->str; 145 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1); 146 if (!unicode->str) { 147 unicode->str = oldstr; 148 PyErr_NoMemory(); 149 return -1; 150 } 151 unicode->str[length] = 0; 152 unicode->length = length; 153 154 reset: 155 /* Reset the object caches */ 156 if (unicode->defenc) { 157 Py_DECREF(unicode->defenc); 158 unicode->defenc = NULL; 159 } 160 unicode->hash = -1; 161 162 return 0; 163} 164 165/* We allocate one more byte to make sure the string is 166 Ux0000 terminated -- XXX is this needed ? 167 168 XXX This allocator could further be enhanced by assuring that the 169 free list never reduces its size below 1. 
170 171*/ 172 173static 174PyUnicodeObject *_PyUnicode_New(int length) 175{ 176 register PyUnicodeObject *unicode; 177 178 /* Optimization for empty strings */ 179 if (length == 0 && unicode_empty != NULL) { 180 Py_INCREF(unicode_empty); 181 return unicode_empty; 182 } 183 184 /* Unicode freelist & memory allocation */ 185 if (unicode_freelist) { 186 unicode = unicode_freelist; 187 unicode_freelist = *(PyUnicodeObject **)unicode; 188 unicode_freelist_size--; 189 if (unicode->str) { 190 /* Keep-Alive optimization: we only upsize the buffer, 191 never downsize it. */ 192 if ((unicode->length < length) && 193 unicode_resize(unicode, length)) { 194 PyMem_DEL(unicode->str); 195 goto onError; 196 } 197 } 198 else { 199 unicode->str = PyMem_NEW(Py_UNICODE, length + 1); 200 } 201 PyObject_INIT(unicode, &PyUnicode_Type); 202 } 203 else { 204 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type); 205 if (unicode == NULL) 206 return NULL; 207 unicode->str = PyMem_NEW(Py_UNICODE, length + 1); 208 } 209 210 if (!unicode->str) { 211 PyErr_NoMemory(); 212 goto onError; 213 } 214 unicode->str[length] = 0; 215 unicode->length = length; 216 unicode->hash = -1; 217 unicode->defenc = NULL; 218 return unicode; 219 220 onError: 221 _Py_ForgetReference((PyObject *)unicode); 222 PyObject_Del(unicode); 223 return NULL; 224} 225 226static 227void unicode_dealloc(register PyUnicodeObject *unicode) 228{ 229 if (PyUnicode_CheckExact(unicode) && 230 unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) { 231 /* Keep-Alive optimization */ 232 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) { 233 PyMem_DEL(unicode->str); 234 unicode->str = NULL; 235 unicode->length = 0; 236 } 237 if (unicode->defenc) { 238 Py_DECREF(unicode->defenc); 239 unicode->defenc = NULL; 240 } 241 /* Add to free list */ 242 *(PyUnicodeObject **)unicode = unicode_freelist; 243 unicode_freelist = unicode; 244 unicode_freelist_size++; 245 } 246 else { 247 PyMem_DEL(unicode->str); 248 Py_XDECREF(unicode->defenc); 249 
unicode->ob_type->tp_free((PyObject *)unicode); 250 } 251} 252 253int PyUnicode_Resize(PyObject **unicode, 254 int length) 255{ 256 register PyUnicodeObject *v; 257 258 /* Argument checks */ 259 if (unicode == NULL) { 260 PyErr_BadInternalCall(); 261 return -1; 262 } 263 v = (PyUnicodeObject *)*unicode; 264 if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1) { 265 PyErr_BadInternalCall(); 266 return -1; 267 } 268 269 /* Resizing unicode_empty and single character objects is not 270 possible since these are being shared. We simply return a fresh 271 copy with the same Unicode content. */ 272 if (v->length != length && 273 (v == unicode_empty || v->length == 1)) { 274 PyUnicodeObject *w = _PyUnicode_New(length); 275 if (w == NULL) 276 return -1; 277 Py_UNICODE_COPY(w->str, v->str, 278 length < v->length ? length : v->length); 279 *unicode = (PyObject *)w; 280 return 0; 281 } 282 283 /* Note that we don't have to modify *unicode for unshared Unicode 284 objects, since we can modify them in-place. */ 285 return unicode_resize(v, length); 286} 287 288/* Internal API for use in unicodeobject.c only ! */ 289#define _PyUnicode_Resize(unicodevar, length) \ 290 PyUnicode_Resize(((PyObject **)(unicodevar)), length) 291 292PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u, 293 int size) 294{ 295 PyUnicodeObject *unicode; 296 297 /* If the Unicode data is known at construction time, we can apply 298 some optimizations which share commonly used objects. 
*/ 299 if (u != NULL) { 300 301 /* Optimization for empty strings */ 302 if (size == 0 && unicode_empty != NULL) { 303 Py_INCREF(unicode_empty); 304 return (PyObject *)unicode_empty; 305 } 306 307 /* Single character Unicode objects in the Latin-1 range are 308 shared when using this constructor */ 309 if (size == 1 && *u < 256) { 310 unicode = unicode_latin1[*u]; 311 if (!unicode) { 312 unicode = _PyUnicode_New(1); 313 if (!unicode) 314 return NULL; 315 unicode->str[0] = *u; 316 unicode_latin1[*u] = unicode; 317 } 318 Py_INCREF(unicode); 319 return (PyObject *)unicode; 320 } 321 } 322 323 unicode = _PyUnicode_New(size); 324 if (!unicode) 325 return NULL; 326 327 /* Copy the Unicode data into the new object */ 328 if (u != NULL) 329 Py_UNICODE_COPY(unicode->str, u, size); 330 331 return (PyObject *)unicode; 332} 333 334#ifdef HAVE_WCHAR_H 335 336PyObject *PyUnicode_FromWideChar(register const wchar_t *w, 337 int size) 338{ 339 PyUnicodeObject *unicode; 340 341 if (w == NULL) { 342 PyErr_BadInternalCall(); 343 return NULL; 344 } 345 346 unicode = _PyUnicode_New(size); 347 if (!unicode) 348 return NULL; 349 350 /* Copy the wchar_t data into the new object */ 351#ifdef HAVE_USABLE_WCHAR_T 352 memcpy(unicode->str, w, size * sizeof(wchar_t)); 353#else 354 { 355 register Py_UNICODE *u; 356 register int i; 357 u = PyUnicode_AS_UNICODE(unicode); 358 for (i = size; i >= 0; i--) 359 *u++ = *w++; 360 } 361#endif 362 363 return (PyObject *)unicode; 364} 365 366int PyUnicode_AsWideChar(PyUnicodeObject *unicode, 367 register wchar_t *w, 368 int size) 369{ 370 if (unicode == NULL) { 371 PyErr_BadInternalCall(); 372 return -1; 373 } 374 if (size > PyUnicode_GET_SIZE(unicode)) 375 size = PyUnicode_GET_SIZE(unicode); 376#ifdef HAVE_USABLE_WCHAR_T 377 memcpy(w, unicode->str, size * sizeof(wchar_t)); 378#else 379 { 380 register Py_UNICODE *u; 381 register int i; 382 u = PyUnicode_AS_UNICODE(unicode); 383 for (i = size; i >= 0; i--) 384 *w++ = *u++; 385 } 386#endif 387 388 return size; 
389} 390 391#endif 392 393PyObject *PyUnicode_FromObject(register PyObject *obj) 394{ 395 /* XXX Perhaps we should make this API an alias of 396 PyObject_Unicode() instead ?! */ 397 if (PyUnicode_CheckExact(obj)) { 398 Py_INCREF(obj); 399 return obj; 400 } 401 if (PyUnicode_Check(obj)) { 402 /* For a Unicode subtype that's not a Unicode object, 403 return a true Unicode object with the same data. */ 404 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj), 405 PyUnicode_GET_SIZE(obj)); 406 } 407 return PyUnicode_FromEncodedObject(obj, NULL, "strict"); 408} 409 410PyObject *PyUnicode_FromEncodedObject(register PyObject *obj, 411 const char *encoding, 412 const char *errors) 413{ 414 const char *s = NULL; 415 int len; 416 int owned = 0; 417 PyObject *v; 418 419 if (obj == NULL) { 420 PyErr_BadInternalCall(); 421 return NULL; 422 } 423 424#if 0 425 /* For b/w compatibility we also accept Unicode objects provided 426 that no encodings is given and then redirect to 427 PyObject_Unicode() which then applies the additional logic for 428 Unicode subclasses. 429 430 NOTE: This API should really only be used for object which 431 represent *encoded* Unicode ! 432 433 */ 434 if (PyUnicode_Check(obj)) { 435 if (encoding) { 436 PyErr_SetString(PyExc_TypeError, 437 "decoding Unicode is not supported"); 438 return NULL; 439 } 440 return PyObject_Unicode(obj); 441 } 442#else 443 if (PyUnicode_Check(obj)) { 444 PyErr_SetString(PyExc_TypeError, 445 "decoding Unicode is not supported"); 446 return NULL; 447 } 448#endif 449 450 /* Coerce object */ 451 if (PyString_Check(obj)) { 452 s = PyString_AS_STRING(obj); 453 len = PyString_GET_SIZE(obj); 454 } 455 else if (PyObject_AsCharBuffer(obj, &s, &len)) { 456 /* Overwrite the error message with something more useful in 457 case of a TypeError. 
*/ 458 if (PyErr_ExceptionMatches(PyExc_TypeError)) 459 PyErr_Format(PyExc_TypeError, 460 "coercing to Unicode: need string or buffer, " 461 "%.80s found", 462 obj->ob_type->tp_name); 463 goto onError; 464 } 465 466 /* Convert to Unicode */ 467 if (len == 0) { 468 Py_INCREF(unicode_empty); 469 v = (PyObject *)unicode_empty; 470 } 471 else 472 v = PyUnicode_Decode(s, len, encoding, errors); 473 474 if (owned) { 475 Py_DECREF(obj); 476 } 477 return v; 478 479 onError: 480 if (owned) { 481 Py_DECREF(obj); 482 } 483 return NULL; 484} 485 486PyObject *PyUnicode_Decode(const char *s, 487 int size, 488 const char *encoding, 489 const char *errors) 490{ 491 PyObject *buffer = NULL, *unicode; 492 493 if (encoding == NULL) 494 encoding = PyUnicode_GetDefaultEncoding(); 495 496 /* Shortcuts for common default encodings */ 497 if (strcmp(encoding, "utf-8") == 0) 498 return PyUnicode_DecodeUTF8(s, size, errors); 499 else if (strcmp(encoding, "latin-1") == 0) 500 return PyUnicode_DecodeLatin1(s, size, errors); 501 else if (strcmp(encoding, "ascii") == 0) 502 return PyUnicode_DecodeASCII(s, size, errors); 503 504 /* Decode via the codec registry */ 505 buffer = PyBuffer_FromMemory((void *)s, size); 506 if (buffer == NULL) 507 goto onError; 508 unicode = PyCodec_Decode(buffer, encoding, errors); 509 if (unicode == NULL) 510 goto onError; 511 if (!PyUnicode_Check(unicode)) { 512 PyErr_Format(PyExc_TypeError, 513 "decoder did not return an unicode object (type=%.400s)", 514 unicode->ob_type->tp_name); 515 Py_DECREF(unicode); 516 goto onError; 517 } 518 Py_DECREF(buffer); 519 return unicode; 520 521 onError: 522 Py_XDECREF(buffer); 523 return NULL; 524} 525 526PyObject *PyUnicode_Encode(const Py_UNICODE *s, 527 int size, 528 const char *encoding, 529 const char *errors) 530{ 531 PyObject *v, *unicode; 532 533 unicode = PyUnicode_FromUnicode(s, size); 534 if (unicode == NULL) 535 return NULL; 536 v = PyUnicode_AsEncodedString(unicode, encoding, errors); 537 Py_DECREF(unicode); 538 
return v; 539} 540 541PyObject *PyUnicode_AsEncodedString(PyObject *unicode, 542 const char *encoding, 543 const char *errors) 544{ 545 PyObject *v; 546 547 if (!PyUnicode_Check(unicode)) { 548 PyErr_BadArgument(); 549 goto onError; 550 } 551 552 if (encoding == NULL) 553 encoding = PyUnicode_GetDefaultEncoding(); 554 555 /* Shortcuts for common default encodings */ 556 if (errors == NULL) { 557 if (strcmp(encoding, "utf-8") == 0) 558 return PyUnicode_AsUTF8String(unicode); 559 else if (strcmp(encoding, "latin-1") == 0) 560 return PyUnicode_AsLatin1String(unicode); 561 else if (strcmp(encoding, "ascii") == 0) 562 return PyUnicode_AsASCIIString(unicode); 563 } 564 565 /* Encode via the codec registry */ 566 v = PyCodec_Encode(unicode, encoding, errors); 567 if (v == NULL) 568 goto onError; 569 /* XXX Should we really enforce this ? */ 570 if (!PyString_Check(v)) { 571 PyErr_Format(PyExc_TypeError, 572 "encoder did not return a string object (type=%.400s)", 573 v->ob_type->tp_name); 574 Py_DECREF(v); 575 goto onError; 576 } 577 return v; 578 579 onError: 580 return NULL; 581} 582 583PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode, 584 const char *errors) 585{ 586 PyObject *v = ((PyUnicodeObject *)unicode)->defenc; 587 588 if (v) 589 return v; 590 v = PyUnicode_AsEncodedString(unicode, NULL, errors); 591 if (v && errors == NULL) 592 ((PyUnicodeObject *)unicode)->defenc = v; 593 return v; 594} 595 596Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode) 597{ 598 if (!PyUnicode_Check(unicode)) { 599 PyErr_BadArgument(); 600 goto onError; 601 } 602 return PyUnicode_AS_UNICODE(unicode); 603 604 onError: 605 return NULL; 606} 607 608int PyUnicode_GetSize(PyObject *unicode) 609{ 610 if (!PyUnicode_Check(unicode)) { 611 PyErr_BadArgument(); 612 goto onError; 613 } 614 return PyUnicode_GET_SIZE(unicode); 615 616 onError: 617 return -1; 618} 619 620const char *PyUnicode_GetDefaultEncoding(void) 621{ 622 return unicode_default_encoding; 623} 624 625int 
PyUnicode_SetDefaultEncoding(const char *encoding) 626{ 627 PyObject *v; 628 629 /* Make sure the encoding is valid. As side effect, this also 630 loads the encoding into the codec registry cache. */ 631 v = _PyCodec_Lookup(encoding); 632 if (v == NULL) 633 goto onError; 634 Py_DECREF(v); 635 strncpy(unicode_default_encoding, 636 encoding, 637 sizeof(unicode_default_encoding)); 638 return 0; 639 640 onError: 641 return -1; 642} 643 644/* --- UTF-7 Codec -------------------------------------------------------- */ 645 646/* see RFC2152 for details */ 647 648static 649char utf7_special[128] = { 650 /* indicate whether a UTF-7 character is special i.e. cannot be directly 651 encoded: 652 0 - not special 653 1 - special 654 2 - whitespace (optional) 655 3 - RFC2152 Set O (optional) */ 656 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1, 657 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 658 2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1, 659 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0, 660 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 661 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3, 662 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 663 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1, 664 665}; 666 667#define SPECIAL(c, encodeO, encodeWS) \ 668 (((c)>127 || utf7_special[(c)] == 1) || \ 669 (encodeWS && (utf7_special[(c)] == 2)) || \ 670 (encodeO && (utf7_special[(c)] == 3))) 671 672#define B64(n) ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f]) 673#define B64CHAR(c) (isalnum(c) || (c) == '+' || (c) == '/') 674#define UB64(c) ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ? \ 675 (c) - 71 : (c) >= 'A' ? 
(c) - 65 : (c) + 4) 676 677#define ENCODE(out, ch, bits) \ 678 while (bits >= 6) { \ 679 *out++ = B64(ch >> (bits-6)); \ 680 bits -= 6; \ 681 } 682 683#define DECODE(out, ch, bits, surrogate) \ 684 while (bits >= 16) { \ 685 Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff); \ 686 bits -= 16; \ 687 if (surrogate) { \ 688 /* We have already generated an error for the high surrogate 689 so let's not bother seeing if the low surrogate is correct or not */\ 690 surrogate = 0; \ 691 } else if (0xDC00 <= outCh && outCh <= 0xDFFF) { \ 692 /* This is a surrogate pair. Unfortunately we can't represent \ 693 it in a 16-bit character */ \ 694 surrogate = 1; \ 695 errmsg = "code pairs are not supported"; \ 696 goto utf7Error; \ 697 } else { \ 698 *out++ = outCh; \ 699 } \ 700 } \ 701 702static 703int utf7_decoding_error(Py_UNICODE **dest, 704 const char *errors, 705 const char *details) 706{ 707 if ((errors == NULL) || 708 (strcmp(errors,"strict") == 0)) { 709 PyErr_Format(PyExc_UnicodeError, 710 "UTF-7 decoding error: %.400s", 711 details); 712 return -1; 713 } 714 else if (strcmp(errors,"ignore") == 0) { 715 return 0; 716 } 717 else if (strcmp(errors,"replace") == 0) { 718 if (dest != NULL) { 719 **dest = Py_UNICODE_REPLACEMENT_CHARACTER; 720 (*dest)++; 721 } 722 return 0; 723 } 724 else { 725 PyErr_Format(PyExc_ValueError, 726 "UTF-7 decoding error; unknown error handling code: %.400s", 727 errors); 728 return -1; 729 } 730} 731 732PyObject *PyUnicode_DecodeUTF7(const char *s, 733 int size, 734 const char *errors) 735{ 736 const char *e; 737 PyUnicodeObject *unicode; 738 Py_UNICODE *p; 739 const char *errmsg = ""; 740 int inShift = 0; 741 unsigned int bitsleft = 0; 742 unsigned long charsleft = 0; 743 int surrogate = 0; 744 745 unicode = _PyUnicode_New(size); 746 if (!unicode) 747 return NULL; 748 if (size == 0) 749 return (PyObject *)unicode; 750 751 p = unicode->str; 752 e = s + size; 753 754 while (s < e) { 755 Py_UNICODE ch = *s; 756 757 if (inShift) { 758 if 
((ch == '-') || !B64CHAR(ch)) { 759 inShift = 0; 760 s++; 761 762 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate); 763 if (bitsleft >= 6) { 764 /* The shift sequence has a partial character in it. If 765 bitsleft < 6 then we could just classify it as padding 766 but that is not the case here */ 767 768 errmsg = "partial character in shift sequence"; 769 goto utf7Error; 770 } 771 /* According to RFC2152 the remaining bits should be zero. We 772 choose to signal an error/insert a replacement character 773 here so indicate the potential of a misencoded character. */ 774 775 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */ 776 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) { 777 errmsg = "non-zero padding bits in shift sequence"; 778 goto utf7Error; 779 } 780 781 if (ch == '-') { 782 if ((s < e) && (*(s) == '-')) { 783 *p++ = '-'; 784 inShift = 1; 785 } 786 } else if (SPECIAL(ch,0,0)) { 787 errmsg = "unexpected special character"; 788 goto utf7Error; 789 } else { 790 *p++ = ch; 791 } 792 } else { 793 charsleft = (charsleft << 6) | UB64(ch); 794 bitsleft += 6; 795 s++; 796 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate); 797 } 798 } 799 else if ( ch == '+' ) { 800 s++; 801 if (s < e && *s == '-') { 802 s++; 803 *p++ = '+'; 804 } else 805 { 806 inShift = 1; 807 bitsleft = 0; 808 } 809 } 810 else if (SPECIAL(ch,0,0)) { 811 errmsg = "unexpected special character"; 812 s++; 813 goto utf7Error; 814 } 815 else { 816 *p++ = ch; 817 s++; 818 } 819 continue; 820 utf7Error: 821 if (utf7_decoding_error(&p, errors, errmsg)) 822 goto onError; 823 } 824 825 if (inShift) { 826 if (utf7_decoding_error(&p, errors, "unterminated shift sequence")) 827 goto onError; 828 } 829 830 if (_PyUnicode_Resize(&unicode, p - unicode->str)) 831 goto onError; 832 833 return (PyObject *)unicode; 834 835onError: 836 Py_DECREF(unicode); 837 return NULL; 838} 839 840 841PyObject 
*PyUnicode_EncodeUTF7(const Py_UNICODE *s, 842 int size, 843 int encodeSetO, 844 int encodeWhiteSpace, 845 const char *errors) 846{ 847 PyObject *v; 848 /* It might be possible to tighten this worst case */ 849 unsigned int cbAllocated = 5 * size; 850 int inShift = 0; 851 int i = 0; 852 unsigned int bitsleft = 0; 853 unsigned long charsleft = 0; 854 char * out; 855 char * start; 856 857 if (size == 0) 858 return PyString_FromStringAndSize(NULL, 0); 859 860 v = PyString_FromStringAndSize(NULL, cbAllocated); 861 if (v == NULL) 862 return NULL; 863 864 start = out = PyString_AS_STRING(v); 865 for (;i < size; ++i) { 866 Py_UNICODE ch = s[i]; 867 868 if (!inShift) { 869 if (ch == '+') { 870 *out++ = '+'; 871 *out++ = '-'; 872 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) { 873 charsleft = ch; 874 bitsleft = 16; 875 *out++ = '+'; 876 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft); 877 inShift = bitsleft > 0; 878 } else { 879 *out++ = (char) ch; 880 } 881 } else { 882 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) { 883 *out++ = B64(charsleft << (6-bitsleft)); 884 charsleft = 0; 885 bitsleft = 0; 886 /* Characters not in the BASE64 set implicitly unshift the sequence 887 so no '-' is required, except if the character is itself a '-' */ 888 if (B64CHAR(ch) || ch == '-') { 889 *out++ = '-'; 890 } 891 inShift = 0; 892 *out++ = (char) ch; 893 } else { 894 bitsleft += 16; 895 charsleft = (charsleft << 16) | ch; 896 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft); 897 898 /* If the next character is special then we dont' need to terminate 899 the shift sequence. If the next character is not a BASE64 character 900 or '-' then the shift sequence will be terminated implicitly and we 901 don't have to insert a '-'. 
*/ 902 903 if (bitsleft == 0) { 904 if (i + 1 < size) { 905 Py_UNICODE ch2 = s[i+1]; 906 907 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) { 908 909 } else if (B64CHAR(ch2) || ch2 == '-') { 910 *out++ = '-'; 911 inShift = 0; 912 } else { 913 inShift = 0; 914 } 915 916 } 917 else { 918 *out++ = '-'; 919 inShift = 0; 920 } 921 } 922 } 923 } 924 } 925 if (bitsleft) { 926 *out++= B64(charsleft << (6-bitsleft) ); 927 *out++ = '-'; 928 } 929 930 _PyString_Resize(&v, out - start); 931 return v; 932} 933 934#undef SPECIAL 935#undef B64 936#undef B64CHAR 937#undef UB64 938#undef ENCODE 939#undef DECODE 940 941/* --- UTF-8 Codec -------------------------------------------------------- */ 942 943static 944char utf8_code_length[256] = { 945 /* Map UTF-8 encoded prefix byte to sequence length. zero means 946 illegal prefix. see RFC 2279 for details */ 947 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 948 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 949 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 950 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 951 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 952 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 953 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 954 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 955 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 956 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 957 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 958 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 959 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 960 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 961 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 962 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0 963}; 964 965static 966int utf8_decoding_error(const char **source, 967 Py_UNICODE **dest, 968 const char *errors, 969 const char *details) 970{ 971 if ((errors == NULL) || 972 (strcmp(errors,"strict") == 0)) { 973 PyErr_Format(PyExc_UnicodeError, 974 "UTF-8 decoding error: %.400s", 975 details); 976 return -1; 
977 } 978 else if (strcmp(errors,"ignore") == 0) { 979 (*source)++; 980 return 0; 981 } 982 else if (strcmp(errors,"replace") == 0) { 983 (*source)++; 984 **dest = Py_UNICODE_REPLACEMENT_CHARACTER; 985 (*dest)++; 986 return 0; 987 } 988 else { 989 PyErr_Format(PyExc_ValueError, 990 "UTF-8 decoding error; unknown error handling code: %.400s", 991 errors); 992 return -1; 993 } 994} 995 996PyObject *PyUnicode_DecodeUTF8(const char *s, 997 int size, 998 const char *errors) 999{ 1000 int n; 1001 const char *e; 1002 PyUnicodeObject *unicode; 1003 Py_UNICODE *p; 1004 const char *errmsg = ""; 1005 1006 /* Note: size will always be longer than the resulting Unicode 1007 character count */ 1008 unicode = _PyUnicode_New(size); 1009 if (!unicode) 1010 return NULL; 1011 if (size == 0) 1012 return (PyObject *)unicode; 1013 1014 /* Unpack UTF-8 encoded data */ 1015 p = unicode->str; 1016 e = s + size; 1017 1018 while (s < e) { 1019 Py_UCS4 ch = (unsigned char)*s; 1020 1021 if (ch < 0x80) { 1022 *p++ = (Py_UNICODE)ch; 1023 s++; 1024 continue; 1025 } 1026 1027 n = utf8_code_length[ch]; 1028 1029 if (s + n > e) { 1030 errmsg = "unexpected end of data"; 1031 goto utf8Error; 1032 } 1033 1034 switch (n) { 1035 1036 case 0: 1037 errmsg = "unexpected code byte"; 1038 goto utf8Error; 1039 1040 case 1: 1041 errmsg = "internal error"; 1042 goto utf8Error; 1043 1044 case 2: 1045 if ((s[1] & 0xc0) != 0x80) { 1046 errmsg = "invalid data"; 1047 goto utf8Error; 1048 } 1049 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f); 1050 if (ch < 0x80) { 1051 errmsg = "illegal encoding"; 1052 goto utf8Error; 1053 } 1054 else 1055 *p++ = (Py_UNICODE)ch; 1056 break; 1057 1058 case 3: 1059 if ((s[1] & 0xc0) != 0x80 || 1060 (s[2] & 0xc0) != 0x80) { 1061 errmsg = "invalid data"; 1062 goto utf8Error; 1063 } 1064 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f); 1065 if (ch < 0x0800) { 1066 /* Note: UTF-8 encodings of surrogates are considered 1067 legal UTF-8 sequences; 1068 1069 XXX For wide builds 
(UCS-4) we should probably try 1070 to recombine the surrogates into a single code 1071 unit. 1072 */ 1073 errmsg = "illegal encoding"; 1074 goto utf8Error; 1075 } 1076 else 1077 *p++ = (Py_UNICODE)ch; 1078 break; 1079 1080 case 4: 1081 if ((s[1] & 0xc0) != 0x80 || 1082 (s[2] & 0xc0) != 0x80 || 1083 (s[3] & 0xc0) != 0x80) { 1084 errmsg = "invalid data"; 1085 goto utf8Error; 1086 } 1087 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) + 1088 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f); 1089 /* validate and convert to UTF-16 */ 1090 if ((ch < 0x10000) /* minimum value allowed for 4 1091 byte encoding */ 1092 || (ch > 0x10ffff)) /* maximum value allowed for 1093 UTF-16 */ 1094 { 1095 errmsg = "illegal encoding"; 1096 goto utf8Error; 1097 } 1098#ifdef Py_UNICODE_WIDE 1099 *p++ = (Py_UNICODE)ch; 1100#else 1101 /* compute and append the two surrogates: */ 1102 1103 /* translate from 10000..10FFFF to 0..FFFF */ 1104 ch -= 0x10000; 1105 1106 /* high surrogate = top 10 bits added to D800 */ 1107 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10)); 1108 1109 /* low surrogate = bottom 10 bits added to DC00 */ 1110 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF)); 1111#endif 1112 break; 1113 1114 default: 1115 /* Other sizes are only needed for UCS-4 */ 1116 errmsg = "unsupported Unicode code range"; 1117 goto utf8Error; 1118 } 1119 s += n; 1120 continue; 1121 1122 utf8Error: 1123 if (utf8_decoding_error(&s, &p, errors, errmsg)) 1124 goto onError; 1125 } 1126 1127 /* Adjust length */ 1128 if (_PyUnicode_Resize(&unicode, p - unicode->str)) 1129 goto onError; 1130 1131 return (PyObject *)unicode; 1132 1133onError: 1134 Py_DECREF(unicode); 1135 return NULL; 1136} 1137 1138/* Allocation strategy: if the string is short, convert into a stack buffer 1139 and allocate exactly as much space needed at the end. Else allocate the 1140 maximum possible needed (4 result bytes per Unicode character), and return 1141 the excess memory at the end. 
*/
PyObject *
PyUnicode_EncodeUTF8(const Py_UNICODE *s,
                     int size,
                     const char *errors)
{
#define MAX_SHORT_UNICHARS 300  /* largest size we'll do on the stack */

    /* Encodes `size` characters at `s` as UTF-8 and returns the result of
       PyString_FromStringAndSize() / _PyString_Resize() as a string
       object.  `errors` is not referenced in this function. */

    int i;                /* index into s of next input character */
    PyObject *v;        /* result string object */
    char *p;            /* next free byte in output buffer */
    int nallocated;      /* number of result bytes allocated */
    int nneeded;        /* number of result bytes needed */
    char stackbuf[MAX_SHORT_UNICHARS * 4];

    assert(s != NULL);
    assert(size >= 0);

    if (size <= MAX_SHORT_UNICHARS) {
        /* Write into the stack buffer; nallocated can't overflow.
         * At the end, we'll allocate exactly as much heap space as it
         * turns out we need.
         */
        nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
        v = NULL;   /* will allocate after we're done */
        p = stackbuf;
    }
    else {
        /* Overallocate on the heap, and give the excess back at the end. */
        nallocated = size * 4;
        if (nallocated / 4 != size)  /* overflow! */
            return PyErr_NoMemory();
        v = PyString_FromStringAndSize(NULL, nallocated);
        if (v == NULL)
            return NULL;
        p = PyString_AS_STRING(v);
    }

    for (i = 0; i < size;) {
        Py_UCS4 ch = s[i++];

        if (ch < 0x80)
            /* One byte: values below 0x80 */
            *p++ = (char) ch;

        else if (ch < 0x0800) {
            /* Two bytes: values 0x80 .. 0x7FF */
            *p++ = (char)(0xc0 | (ch >> 6));
            *p++ = (char)(0x80 | (ch & 0x3f));
        }
        else {
            /* Three bytes for values below 0x10000, four bytes otherwise */
            if (ch < 0x10000) {
                /* Special case: check for high surrogate */
                if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
                    Py_UCS4 ch2 = s[i];
                    /* Check for low surrogate and combine the two to
                       form a UCS4 value */
                    if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
                        ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
                        i++;
                        goto encodeUCS4;
                    }
                    /* Fall through: handles isolated high surrogates */
                }
                *p++ = (char)(0xe0 | (ch >> 12));
                *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
                *p++ = (char)(0x80 | (ch & 0x3f));
                continue;
            }
encodeUCS4:
            /* Encode UCS4 Unicode ordinals; also the target of the
               surrogate-pair jump above */
            *p++ = (char)(0xf0 | (ch >> 18));
            *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
            *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
            *p++ = (char)(0x80 | (ch & 0x3f));
        }
    }

    if (v == NULL) {
        /* This was stack allocated. */
        nneeded = Py_SAFE_DOWNCAST(p - stackbuf, long, int);
        assert(nneeded <= nallocated);
        v = PyString_FromStringAndSize(stackbuf, nneeded);
    }
    else {
        /* Cut back to size actually needed. */
        nneeded = Py_SAFE_DOWNCAST(p - PyString_AS_STRING(v), long, int);
        assert(nneeded <= nallocated);
        _PyString_Resize(&v, nneeded);
    }
    return v;

#undef MAX_SHORT_UNICHARS
}

/* Encode a Unicode object as UTF-8 by passing its buffer and size to
   PyUnicode_EncodeUTF8() with errors set to NULL.  Returns NULL after
   PyErr_BadArgument() when `unicode` is not a Unicode object. */
PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
{
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }
    return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
                                PyUnicode_GET_SIZE(unicode),
                                NULL);
}

/* --- UTF-16 Codec ------------------------------------------------------- */

/* Apply the error policy named by `errors` to a UTF-16 decoding problem.

   NULL or "strict" raises UnicodeError and returns -1.  "ignore" returns
   0.  "replace" stores Py_UNICODE_REPLACEMENT_CHARACTER through *dest
   (when dest is not NULL), advances *dest and returns 0.  Any other name
   raises ValueError and returns -1. */
static
int utf16_decoding_error(Py_UNICODE **dest,
                         const char *errors,
                         const char *details)
{
    if ((errors == NULL) ||
        (strcmp(errors,"strict") == 0)) {
        PyErr_Format(PyExc_UnicodeError,
                     "UTF-16 decoding error: %.400s",
                     details);
        return -1;
    }
    else if (strcmp(errors,"ignore") == 0) {
        return 0;
    }
    else if (strcmp(errors,"replace") == 0) {
        if (dest) {
            **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
            (*dest)++;
        }
        return 0;
    }
    else {
        PyErr_Format(PyExc_ValueError,
                     "UTF-16 decoding error; "
                     "unknown error handling code: %.400s",
                     errors);
        return -1;
    }
}

PyObject *
PyUnicode_DecodeUTF16(const char *s,
                      int size,
                      const char *errors,
                      int *byteorder)
{
    PyUnicodeObject *unicode;
    Py_UNICODE *p;
    const unsigned char *q, *e;
    int bo = 0;       /* assume native ordering by default */
    const char *errmsg = "";
    /* Offsets from q for retrieving byte pairs in the right order.
*/ 1294#ifdef BYTEORDER_IS_LITTLE_ENDIAN 1295 int ihi = 1, ilo = 0; 1296#else 1297 int ihi = 0, ilo = 1; 1298#endif 1299 1300 /* size should be an even number */ 1301 if (size & 1) { 1302 if (utf16_decoding_error(NULL, errors, "truncated data")) 1303 return NULL; 1304 --size; /* else ignore the oddball byte */ 1305 } 1306 1307 /* Note: size will always be longer than the resulting Unicode 1308 character count */ 1309 unicode = _PyUnicode_New(size); 1310 if (!unicode) 1311 return NULL; 1312 if (size == 0) 1313 return (PyObject *)unicode; 1314 1315 /* Unpack UTF-16 encoded data */ 1316 p = unicode->str; 1317 q = (unsigned char *)s; 1318 e = q + size; 1319 1320 if (byteorder) 1321 bo = *byteorder; 1322 1323 /* Check for BOM marks (U+FEFF) in the input and adjust current 1324 byte order setting accordingly. In native mode, the leading BOM 1325 mark is skipped, in all other modes, it is copied to the output 1326 stream as-is (giving a ZWNBSP character). */ 1327 if (bo == 0) { 1328 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo]; 1329#ifdef BYTEORDER_IS_LITTLE_ENDIAN 1330 if (bom == 0xFEFF) { 1331 q += 2; 1332 bo = -1; 1333 } 1334 else if (bom == 0xFFFE) { 1335 q += 2; 1336 bo = 1; 1337 } 1338#else 1339 if (bom == 0xFEFF) { 1340 q += 2; 1341 bo = 1; 1342 } 1343 else if (bom == 0xFFFE) { 1344 q += 2; 1345 bo = -1; 1346 } 1347#endif 1348 } 1349 1350 if (bo == -1) { 1351 /* force LE */ 1352 ihi = 1; 1353 ilo = 0; 1354 } 1355 else if (bo == 1) { 1356 /* force BE */ 1357 ihi = 0; 1358 ilo = 1; 1359 } 1360 1361 while (q < e) { 1362 Py_UNICODE ch = (q[ihi] << 8) | q[ilo]; 1363 q += 2; 1364 1365 if (ch < 0xD800 || ch > 0xDFFF) { 1366 *p++ = ch; 1367 continue; 1368 } 1369 1370 /* UTF-16 code pair: */ 1371 if (q >= e) { 1372 errmsg = "unexpected end of data"; 1373 goto utf16Error; 1374 } 1375 if (0xD800 <= ch && ch <= 0xDBFF) { 1376 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo]; 1377 q += 2; 1378 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) { 1379#ifndef Py_UNICODE_WIDE 1380 *p++ = ch; 1381 *p++ = 
ch2; 1382#else 1383 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000; 1384#endif 1385 continue; 1386 } 1387 else { 1388 errmsg = "illegal UTF-16 surrogate"; 1389 goto utf16Error; 1390 } 1391 1392 } 1393 errmsg = "illegal encoding"; 1394 /* Fall through to report the error */ 1395 1396 utf16Error: 1397 if (utf16_decoding_error(&p, errors, errmsg)) 1398 goto onError; 1399 } 1400 1401 if (byteorder) 1402 *byteorder = bo; 1403 1404 /* Adjust length */ 1405 if (_PyUnicode_Resize(&unicode, p - unicode->str)) 1406 goto onError; 1407 1408 return (PyObject *)unicode; 1409 1410onError: 1411 Py_DECREF(unicode); 1412 return NULL; 1413} 1414 1415PyObject * 1416PyUnicode_EncodeUTF16(const Py_UNICODE *s, 1417 int size, 1418 const char *errors, 1419 int byteorder) 1420{ 1421 PyObject *v; 1422 unsigned char *p; 1423 int i, pairs; 1424 /* Offsets from p for storing byte pairs in the right order. */ 1425#ifdef BYTEORDER_IS_LITTLE_ENDIAN 1426 int ihi = 1, ilo = 0; 1427#else 1428 int ihi = 0, ilo = 1; 1429#endif 1430 1431#define STORECHAR(CH) \ 1432 do { \ 1433 p[ihi] = ((CH) >> 8) & 0xff; \ 1434 p[ilo] = (CH) & 0xff; \ 1435 p += 2; \ 1436 } while(0) 1437 1438 for (i = pairs = 0; i < size; i++) 1439 if (s[i] >= 0x10000) 1440 pairs++; 1441 v = PyString_FromStringAndSize(NULL, 1442 2 * (size + pairs + (byteorder == 0))); 1443 if (v == NULL) 1444 return NULL; 1445 1446 p = (unsigned char *)PyString_AS_STRING(v); 1447 if (byteorder == 0) 1448 STORECHAR(0xFEFF); 1449 if (size == 0) 1450 return v; 1451 1452 if (byteorder == -1) { 1453 /* force LE */ 1454 ihi = 1; 1455 ilo = 0; 1456 } 1457 else if (byteorder == 1) { 1458 /* force BE */ 1459 ihi = 0; 1460 ilo = 1; 1461 } 1462 1463 while (size-- > 0) { 1464 Py_UNICODE ch = *s++; 1465 Py_UNICODE ch2 = 0; 1466 if (ch >= 0x10000) { 1467 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF); 1468 ch = 0xD800 | ((ch-0x10000) >> 10); 1469 } 1470 STORECHAR(ch); 1471 if (ch2) 1472 STORECHAR(ch2); 1473 } 1474 return v; 1475#undef STORECHAR 1476} 1477 1478PyObject 
*PyUnicode_AsUTF16String(PyObject *unicode) 1479{ 1480 if (!PyUnicode_Check(unicode)) { 1481 PyErr_BadArgument(); 1482 return NULL; 1483 } 1484 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode), 1485 PyUnicode_GET_SIZE(unicode), 1486 NULL, 1487 0); 1488} 1489 1490/* --- Unicode Escape Codec ----------------------------------------------- */ 1491 1492static 1493int unicodeescape_decoding_error(Py_UNICODE **x, 1494 const char *errors, 1495 const char *details) 1496{ 1497 if ((errors == NULL) || 1498 (strcmp(errors,"strict") == 0)) { 1499 PyErr_Format(PyExc_UnicodeError, 1500 "Unicode-Escape decoding error: %.400s", 1501 details); 1502 return -1; 1503 } 1504 else if (strcmp(errors,"ignore") == 0) { 1505 return 0; 1506 } 1507 else if (strcmp(errors,"replace") == 0) { 1508 **x = Py_UNICODE_REPLACEMENT_CHARACTER; 1509 (*x)++; 1510 return 0; 1511 } 1512 else { 1513 PyErr_Format(PyExc_ValueError, 1514 "Unicode-Escape decoding error; " 1515 "unknown error handling code: %.400s", 1516 errors); 1517 return -1; 1518 } 1519} 1520 1521static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL; 1522 1523PyObject *PyUnicode_DecodeUnicodeEscape(const char *s, 1524 int size, 1525 const char *errors) 1526{ 1527 PyUnicodeObject *v; 1528 Py_UNICODE *p, *buf; 1529 const char *end; 1530 char* message; 1531 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */ 1532 1533 /* Escaped strings will always be longer than the resulting 1534 Unicode string, so we start with size here and then reduce the 1535 length after conversion to the true value. 
*/ 1536 v = _PyUnicode_New(size); 1537 if (v == NULL) 1538 goto onError; 1539 if (size == 0) 1540 return (PyObject *)v; 1541 1542 p = buf = PyUnicode_AS_UNICODE(v); 1543 end = s + size; 1544 1545 while (s < end) { 1546 unsigned char c; 1547 Py_UNICODE x; 1548 int i, digits; 1549 1550 /* Non-escape characters are interpreted as Unicode ordinals */ 1551 if (*s != '\\') { 1552 *p++ = (unsigned char) *s++; 1553 continue; 1554 } 1555 1556 /* \ - Escapes */ 1557 s++; 1558 switch (*s++) { 1559 1560 /* \x escapes */ 1561 case '\n': break; 1562 case '\\': *p++ = '\\'; break; 1563 case '\'': *p++ = '\''; break; 1564 case '\"': *p++ = '\"'; break; 1565 case 'b': *p++ = '\b'; break; 1566 case 'f': *p++ = '\014'; break; /* FF */ 1567 case 't': *p++ = '\t'; break; 1568 case 'n': *p++ = '\n'; break; 1569 case 'r': *p++ = '\r'; break; 1570 case 'v': *p++ = '\013'; break; /* VT */ 1571 case 'a': *p++ = '\007'; break; /* BEL, not classic C */ 1572 1573 /* \OOO (octal) escapes */ 1574 case '0': case '1': case '2': case '3': 1575 case '4': case '5': case '6': case '7': 1576 x = s[-1] - '0'; 1577 if ('0' <= *s && *s <= '7') { 1578 x = (x<<3) + *s++ - '0'; 1579 if ('0' <= *s && *s <= '7') 1580 x = (x<<3) + *s++ - '0'; 1581 } 1582 *p++ = x; 1583 break; 1584 1585 /* hex escapes */ 1586 /* \xXX */ 1587 case 'x': 1588 digits = 2; 1589 message = "truncated \\xXX escape"; 1590 goto hexescape; 1591 1592 /* \uXXXX */ 1593 case 'u': 1594 digits = 4; 1595 message = "truncated \\uXXXX escape"; 1596 goto hexescape; 1597 1598 /* \UXXXXXXXX */ 1599 case 'U': 1600 digits = 8; 1601 message = "truncated \\UXXXXXXXX escape"; 1602 hexescape: 1603 chr = 0; 1604 for (i = 0; i < digits; i++) { 1605 c = (unsigned char) s[i]; 1606 if (!isxdigit(c)) { 1607 if (unicodeescape_decoding_error(&p, errors, message)) 1608 goto onError; 1609 chr = 0xffffffff; 1610 i++; 1611 break; 1612 } 1613 chr = (chr<<4) & ~0xF; 1614 if (c >= '0' && c <= '9') 1615 chr += c - '0'; 1616 else if (c >= 'a' && c <= 'f') 1617 chr += 10 + 
c - 'a'; 1618 else 1619 chr += 10 + c - 'A'; 1620 } 1621 s += i; 1622 if (chr == 0xffffffff) 1623 /* _decoding_error will have already written into the 1624 target buffer. */ 1625 break; 1626 store: 1627 /* when we get here, chr is a 32-bit unicode character */ 1628 if (chr <= 0xffff) 1629 /* UCS-2 character */ 1630 *p++ = (Py_UNICODE) chr; 1631 else if (chr <= 0x10ffff) { 1632 /* UCS-4 character. Either store directly, or as 1633 surrogate pair. */ 1634#ifdef Py_UNICODE_WIDE 1635 *p++ = chr; 1636#else 1637 chr -= 0x10000L; 1638 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10); 1639 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF); 1640#endif 1641 } else { 1642 if (unicodeescape_decoding_error( 1643 &p, errors, 1644 "illegal Unicode character") 1645 ) 1646 goto onError; 1647 } 1648 break; 1649 1650 /* \N{name} */ 1651 case 'N': 1652 message = "malformed \\N character escape"; 1653 if (ucnhash_CAPI == NULL) { 1654 /* load the unicode data module */ 1655 PyObject *m, *v; 1656 m = PyImport_ImportModule("unicodedata"); 1657 if (m == NULL) 1658 goto ucnhashError; 1659 v = PyObject_GetAttrString(m, "ucnhash_CAPI"); 1660 Py_DECREF(m); 1661 if (v == NULL) 1662 goto ucnhashError; 1663 ucnhash_CAPI = PyCObject_AsVoidPtr(v); 1664 Py_DECREF(v); 1665 if (ucnhash_CAPI == NULL) 1666 goto ucnhashError; 1667 } 1668 if (*s == '{') { 1669 const char *start = s+1; 1670 /* look for the closing brace */ 1671 while (*s != '}' && s < end) 1672 s++; 1673 if (s > start && s < end && *s == '}') { 1674 /* found a name. 
look it up in the unicode database */ 1675 message = "unknown Unicode character name"; 1676 s++; 1677 if (ucnhash_CAPI->getcode(start, s-start-1, &chr)) 1678 goto store; 1679 } 1680 } 1681 if (unicodeescape_decoding_error(&p, errors, message)) 1682 goto onError; 1683 break; 1684 1685 default: 1686 if (s > end) { 1687 if (unicodeescape_decoding_error(&p, errors, "\\ at end of string")) 1688 goto onError; 1689 } 1690 else { 1691 *p++ = '\\'; 1692 *p++ = (unsigned char)s[-1]; 1693 } 1694 break; 1695 } 1696 } 1697 if (_PyUnicode_Resize(&v, (int)(p - buf))) 1698 goto onError; 1699 return (PyObject *)v; 1700 1701ucnhashError: 1702 PyErr_SetString( 1703 PyExc_UnicodeError, 1704 "\\N escapes not supported (can't load unicodedata module)" 1705 ); 1706 return NULL; 1707 1708onError: 1709 Py_XDECREF(v); 1710 return NULL; 1711} 1712 1713/* Return a Unicode-Escape string version of the Unicode object. 1714 1715 If quotes is true, the string is enclosed in u"" or u'' quotes as 1716 appropriate. 1717 1718*/ 1719 1720static const Py_UNICODE *findchar(const Py_UNICODE *s, 1721 int size, 1722 Py_UNICODE ch); 1723 1724static 1725PyObject *unicodeescape_string(const Py_UNICODE *s, 1726 int size, 1727 int quotes) 1728{ 1729 PyObject *repr; 1730 char *p; 1731 1732 static const char *hexdigit = "0123456789abcdef"; 1733 1734 repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1); 1735 if (repr == NULL) 1736 return NULL; 1737 1738 p = PyString_AS_STRING(repr); 1739 1740 if (quotes) { 1741 *p++ = 'u'; 1742 *p++ = (findchar(s, size, '\'') && 1743 !findchar(s, size, '"')) ? 
'"' : '\''; 1744 } 1745 while (size-- > 0) { 1746 Py_UNICODE ch = *s++; 1747 1748 /* Escape quotes */ 1749 if (quotes && 1750 (ch == (Py_UNICODE) PyString_AS_STRING(repr)[1] || ch == '\\')) { 1751 *p++ = '\\'; 1752 *p++ = (char) ch; 1753 continue; 1754 } 1755 1756#ifdef Py_UNICODE_WIDE 1757 /* Map 21-bit characters to '\U00xxxxxx' */ 1758 else if (ch >= 0x10000) { 1759 int offset = p - PyString_AS_STRING(repr); 1760 1761 /* Resize the string if necessary */ 1762 if (offset + 12 > PyString_GET_SIZE(repr)) { 1763 if (_PyString_Resize(&repr, PyString_GET_SIZE(repr) + 100)) 1764 return NULL; 1765 p = PyString_AS_STRING(repr) + offset; 1766 } 1767 1768 *p++ = '\\'; 1769 *p++ = 'U'; 1770 *p++ = hexdigit[(ch >> 28) & 0x0000000F]; 1771 *p++ = hexdigit[(ch >> 24) & 0x0000000F]; 1772 *p++ = hexdigit[(ch >> 20) & 0x0000000F]; 1773 *p++ = hexdigit[(ch >> 16) & 0x0000000F]; 1774 *p++ = hexdigit[(ch >> 12) & 0x0000000F]; 1775 *p++ = hexdigit[(ch >> 8) & 0x0000000F]; 1776 *p++ = hexdigit[(ch >> 4) & 0x0000000F]; 1777 *p++ = hexdigit[ch & 0x0000000F]; 1778 continue; 1779 } 1780#endif 1781 /* Map UTF-16 surrogate pairs to Unicode \UXXXXXXXX escapes */ 1782 else if (ch >= 0xD800 && ch < 0xDC00) { 1783 Py_UNICODE ch2; 1784 Py_UCS4 ucs; 1785 1786 ch2 = *s++; 1787 size--; 1788 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) { 1789 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000; 1790 *p++ = '\\'; 1791 *p++ = 'U'; 1792 *p++ = hexdigit[(ucs >> 28) & 0x0000000F]; 1793 *p++ = hexdigit[(ucs >> 24) & 0x0000000F]; 1794 *p++ = hexdigit[(ucs >> 20) & 0x0000000F]; 1795 *p++ = hexdigit[(ucs >> 16) & 0x0000000F]; 1796 *p++ = hexdigit[(ucs >> 12) & 0x0000000F]; 1797 *p++ = hexdigit[(ucs >> 8) & 0x0000000F]; 1798 *p++ = hexdigit[(ucs >> 4) & 0x0000000F]; 1799 *p++ = hexdigit[ucs & 0x0000000F]; 1800 continue; 1801 } 1802 /* Fall through: isolated surrogates are copied as-is */ 1803 s--; 1804 size++; 1805 } 1806 1807 /* Map 16-bit characters to '\uxxxx' */ 1808 if (ch >= 256) { 1809 *p++ = '\\'; 1810 
*p++ = 'u'; 1811 *p++ = hexdigit[(ch >> 12) & 0x000F]; 1812 *p++ = hexdigit[(ch >> 8) & 0x000F]; 1813 *p++ = hexdigit[(ch >> 4) & 0x000F]; 1814 *p++ = hexdigit[ch & 0x000F]; 1815 } 1816 1817 /* Map special whitespace to '\t', \n', '\r' */ 1818 else if (ch == '\t') { 1819 *p++ = '\\'; 1820 *p++ = 't'; 1821 } 1822 else if (ch == '\n') { 1823 *p++ = '\\'; 1824 *p++ = 'n'; 1825 } 1826 else if (ch == '\r') { 1827 *p++ = '\\'; 1828 *p++ = 'r'; 1829 } 1830 1831 /* Map non-printable US ASCII to '\xhh' */ 1832 else if (ch < ' ' || ch >= 0x7F) { 1833 *p++ = '\\'; 1834 *p++ = 'x'; 1835 *p++ = hexdigit[(ch >> 4) & 0x000F]; 1836 *p++ = hexdigit[ch & 0x000F]; 1837 } 1838 1839 /* Copy everything else as-is */ 1840 else 1841 *p++ = (char) ch; 1842 } 1843 if (quotes) 1844 *p++ = PyString_AS_STRING(repr)[1]; 1845 1846 *p = '\0'; 1847 _PyString_Resize(&repr, p - PyString_AS_STRING(repr)); 1848 return repr; 1849} 1850 1851PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s, 1852 int size) 1853{ 1854 return unicodeescape_string(s, size, 0); 1855} 1856 1857PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode) 1858{ 1859 if (!PyUnicode_Check(unicode)) { 1860 PyErr_BadArgument(); 1861 return NULL; 1862 } 1863 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode), 1864 PyUnicode_GET_SIZE(unicode)); 1865} 1866 1867/* --- Raw Unicode Escape Codec ------------------------------------------- */ 1868 1869PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s, 1870 int size, 1871 const char *errors) 1872{ 1873 PyUnicodeObject *v; 1874 Py_UNICODE *p, *buf; 1875 const char *end; 1876 const char *bs; 1877 1878 /* Escaped strings will always be longer than the resulting 1879 Unicode string, so we start with size here and then reduce the 1880 length after conversion to the true value. 
*/ 1881 v = _PyUnicode_New(size); 1882 if (v == NULL) 1883 goto onError; 1884 if (size == 0) 1885 return (PyObject *)v; 1886 p = buf = PyUnicode_AS_UNICODE(v); 1887 end = s + size; 1888 while (s < end) { 1889 unsigned char c; 1890 Py_UCS4 x; 1891 int i; 1892 1893 /* Non-escape characters are interpreted as Unicode ordinals */ 1894 if (*s != '\\') { 1895 *p++ = (unsigned char)*s++; 1896 continue; 1897 } 1898 1899 /* \u-escapes are only interpreted iff the number of leading 1900 backslashes if odd */ 1901 bs = s; 1902 for (;s < end;) { 1903 if (*s != '\\') 1904 break; 1905 *p++ = (unsigned char)*s++; 1906 } 1907 if (((s - bs) & 1) == 0 || 1908 s >= end || 1909 *s != 'u') { 1910 continue; 1911 } 1912 p--; 1913 s++; 1914 1915 /* \uXXXX with 4 hex digits */ 1916 for (x = 0, i = 0; i < 4; i++) { 1917 c = (unsigned char)s[i]; 1918 if (!isxdigit(c)) { 1919 if (unicodeescape_decoding_error(&p, errors, 1920 "truncated \\uXXXX")) 1921 goto onError; 1922 x = 0xffffffff; 1923 i++; 1924 break; 1925 } 1926 x = (x<<4) & ~0xF; 1927 if (c >= '0' && c <= '9') 1928 x += c - '0'; 1929 else if (c >= 'a' && c <= 'f') 1930 x += 10 + c - 'a'; 1931 else 1932 x += 10 + c - 'A'; 1933 } 1934 s += i; 1935 if (x != 0xffffffff) 1936 *p++ = x; 1937 } 1938 if (_PyUnicode_Resize(&v, (int)(p - buf))) 1939 goto onError; 1940 return (PyObject *)v; 1941 1942 onError: 1943 Py_XDECREF(v); 1944 return NULL; 1945} 1946 1947PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s, 1948 int size) 1949{ 1950 PyObject *repr; 1951 char *p; 1952 char *q; 1953 1954 static const char *hexdigit = "0123456789abcdef"; 1955 1956 repr = PyString_FromStringAndSize(NULL, 6 * size); 1957 if (repr == NULL) 1958 return NULL; 1959 if (size == 0) 1960 return repr; 1961 1962 p = q = PyString_AS_STRING(repr); 1963 while (size-- > 0) { 1964 Py_UNICODE ch = *s++; 1965 /* Map 16-bit characters to '\uxxxx' */ 1966 if (ch >= 256) { 1967 *p++ = '\\'; 1968 *p++ = 'u'; 1969 *p++ = hexdigit[(ch >> 12) & 0xf]; 1970 *p++ = 
hexdigit[(ch >> 8) & 0xf]; 1971 *p++ = hexdigit[(ch >> 4) & 0xf]; 1972 *p++ = hexdigit[ch & 15]; 1973 } 1974 /* Copy everything else as-is */ 1975 else 1976 *p++ = (char) ch; 1977 } 1978 *p = '\0'; 1979 _PyString_Resize(&repr, p - q); 1980 return repr; 1981} 1982 1983PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode) 1984{ 1985 if (!PyUnicode_Check(unicode)) { 1986 PyErr_BadArgument(); 1987 return NULL; 1988 } 1989 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode), 1990 PyUnicode_GET_SIZE(unicode)); 1991} 1992 1993/* --- Latin-1 Codec ------------------------------------------------------ */ 1994 1995PyObject *PyUnicode_DecodeLatin1(const char *s, 1996 int size, 1997 const char *errors) 1998{ 1999 PyUnicodeObject *v; 2000 Py_UNICODE *p; 2001 2002 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */ 2003 if (size == 1 && *(unsigned char*)s < 256) { 2004 Py_UNICODE r = *(unsigned char*)s; 2005 return PyUnicode_FromUnicode(&r, 1); 2006 } 2007 2008 v = _PyUnicode_New(size); 2009 if (v == NULL) 2010 goto onError; 2011 if (size == 0) 2012 return (PyObject *)v; 2013 p = PyUnicode_AS_UNICODE(v); 2014 while (size-- > 0) 2015 *p++ = (unsigned char)*s++; 2016 return (PyObject *)v; 2017 2018 onError: 2019 Py_XDECREF(v); 2020 return NULL; 2021} 2022 2023static 2024int latin1_encoding_error(const Py_UNICODE **source, 2025 char **dest, 2026 const char *errors, 2027 const char *details) 2028{ 2029 if ((errors == NULL) || 2030 (strcmp(errors,"strict") == 0)) { 2031 PyErr_Format(PyExc_UnicodeError, 2032 "Latin-1 encoding error: %.400s", 2033 details); 2034 return -1; 2035 } 2036 else if (strcmp(errors,"ignore") == 0) { 2037 return 0; 2038 } 2039 else if (strcmp(errors,"replace") == 0) { 2040 **dest = '?'; 2041 (*dest)++; 2042 return 0; 2043 } 2044 else { 2045 PyErr_Format(PyExc_ValueError, 2046 "Latin-1 encoding error; " 2047 "unknown error handling code: %.400s", 2048 errors); 2049 return -1; 2050 } 2051} 2052 2053PyObject 
*PyUnicode_EncodeLatin1(const Py_UNICODE *p, 2054 int size, 2055 const char *errors) 2056{ 2057 PyObject *repr; 2058 char *s, *start; 2059 2060 repr = PyString_FromStringAndSize(NULL, size); 2061 if (repr == NULL) 2062 return NULL; 2063 if (size == 0) 2064 return repr; 2065 2066 s = PyString_AS_STRING(repr); 2067 start = s; 2068 while (size-- > 0) { 2069 Py_UNICODE ch = *p++; 2070 if (ch >= 256) { 2071 if (latin1_encoding_error(&p, &s, errors, 2072 "ordinal not in range(256)")) 2073 goto onError; 2074 } 2075 else 2076 *s++ = (char)ch; 2077 } 2078 /* Resize if error handling skipped some characters */ 2079 if (s - start < PyString_GET_SIZE(repr)) 2080 _PyString_Resize(&repr, s - start); 2081 return repr; 2082 2083 onError: 2084 Py_DECREF(repr); 2085 return NULL; 2086} 2087 2088PyObject *PyUnicode_AsLatin1String(PyObject *unicode) 2089{ 2090 if (!PyUnicode_Check(unicode)) { 2091 PyErr_BadArgument(); 2092 return NULL; 2093 } 2094 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode), 2095 PyUnicode_GET_SIZE(unicode), 2096 NULL); 2097} 2098 2099/* --- 7-bit ASCII Codec -------------------------------------------------- */ 2100 2101static 2102int ascii_decoding_error(const char **source, 2103 Py_UNICODE **dest, 2104 const char *errors, 2105 const char *details) 2106{ 2107 if ((errors == NULL) || 2108 (strcmp(errors,"strict") == 0)) { 2109 PyErr_Format(PyExc_UnicodeError, 2110 "ASCII decoding error: %.400s", 2111 details); 2112 return -1; 2113 } 2114 else if (strcmp(errors,"ignore") == 0) { 2115 return 0; 2116 } 2117 else if (strcmp(errors,"replace") == 0) { 2118 **dest = Py_UNICODE_REPLACEMENT_CHARACTER; 2119 (*dest)++; 2120 return 0; 2121 } 2122 else { 2123 PyErr_Format(PyExc_ValueError, 2124 "ASCII decoding error; " 2125 "unknown error handling code: %.400s", 2126 errors); 2127 return -1; 2128 } 2129} 2130 2131PyObject *PyUnicode_DecodeASCII(const char *s, 2132 int size, 2133 const char *errors) 2134{ 2135 PyUnicodeObject *v; 2136 Py_UNICODE *p; 2137 2138 /* 
ASCII is equivalent to the first 128 ordinals in Unicode. */ 2139 if (size == 1 && *(unsigned char*)s < 128) { 2140 Py_UNICODE r = *(unsigned char*)s; 2141 return PyUnicode_FromUnicode(&r, 1); 2142 } 2143 2144 v = _PyUnicode_New(size); 2145 if (v == NULL) 2146 goto onError; 2147 if (size == 0) 2148 return (PyObject *)v; 2149 p = PyUnicode_AS_UNICODE(v); 2150 while (size-- > 0) { 2151 register unsigned char c; 2152 2153 c = (unsigned char)*s++; 2154 if (c < 128) 2155 *p++ = c; 2156 else if (ascii_decoding_error(&s, &p, errors, 2157 "ordinal not in range(128)")) 2158 goto onError; 2159 } 2160 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v)) 2161 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v)))) 2162 goto onError; 2163 return (PyObject *)v; 2164 2165 onError: 2166 Py_XDECREF(v); 2167 return NULL; 2168} 2169 2170static 2171int ascii_encoding_error(const Py_UNICODE **source, 2172 char **dest, 2173 const char *errors, 2174 const char *details) 2175{ 2176 if ((errors == NULL) || 2177 (strcmp(errors,"strict") == 0)) { 2178 PyErr_Format(PyExc_UnicodeError, 2179 "ASCII encoding error: %.400s", 2180 details); 2181 return -1; 2182 } 2183 else if (strcmp(errors,"ignore") == 0) { 2184 return 0; 2185 } 2186 else if (strcmp(errors,"replace") == 0) { 2187 **dest = '?'; 2188 (*dest)++; 2189 return 0; 2190 } 2191 else { 2192 PyErr_Format(PyExc_ValueError, 2193 "ASCII encoding error; " 2194 "unknown error handling code: %.400s", 2195 errors); 2196 return -1; 2197 } 2198} 2199 2200PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p, 2201 int size, 2202 const char *errors) 2203{ 2204 PyObject *repr; 2205 char *s, *start; 2206 2207 repr = PyString_FromStringAndSize(NULL, size); 2208 if (repr == NULL) 2209 return NULL; 2210 if (size == 0) 2211 return repr; 2212 2213 s = PyString_AS_STRING(repr); 2214 start = s; 2215 while (size-- > 0) { 2216 Py_UNICODE ch = *p++; 2217 if (ch >= 128) { 2218 if (ascii_encoding_error(&p, &s, errors, 2219 "ordinal not in range(128)")) 2220 
goto onError; 2221 } 2222 else 2223 *s++ = (char)ch; 2224 } 2225 /* Resize if error handling skipped some characters */ 2226 if (s - start < PyString_GET_SIZE(repr)) 2227 _PyString_Resize(&repr, s - start); 2228 return repr; 2229 2230 onError: 2231 Py_DECREF(repr); 2232 return NULL; 2233} 2234 2235PyObject *PyUnicode_AsASCIIString(PyObject *unicode) 2236{ 2237 if (!PyUnicode_Check(unicode)) { 2238 PyErr_BadArgument(); 2239 return NULL; 2240 } 2241 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode), 2242 PyUnicode_GET_SIZE(unicode), 2243 NULL); 2244} 2245 2246#if defined(MS_WIN32) && defined(HAVE_USABLE_WCHAR_T) 2247 2248/* --- MBCS codecs for Windows -------------------------------------------- */ 2249 2250PyObject *PyUnicode_DecodeMBCS(const char *s, 2251 int size, 2252 const char *errors) 2253{ 2254 PyUnicodeObject *v; 2255 Py_UNICODE *p; 2256 2257 /* First get the size of the result */ 2258 DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0); 2259 if (size > 0 && usize==0) 2260 return PyErr_SetFromWindowsErrWithFilename(0, NULL); 2261 2262 v = _PyUnicode_New(usize); 2263 if (v == NULL) 2264 return NULL; 2265 if (usize == 0) 2266 return (PyObject *)v; 2267 p = PyUnicode_AS_UNICODE(v); 2268 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) { 2269 Py_DECREF(v); 2270 return PyErr_SetFromWindowsErrWithFilename(0, NULL); 2271 } 2272 2273 return (PyObject *)v; 2274} 2275 2276PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p, 2277 int size, 2278 const char *errors) 2279{ 2280 PyObject *repr; 2281 char *s; 2282 DWORD mbcssize; 2283 2284 /* If there are no characters, bail now! 
*/ 2285 if (size==0) 2286 return PyString_FromString(""); 2287 2288 /* First get the size of the result */ 2289 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL); 2290 if (mbcssize==0) 2291 return PyErr_SetFromWindowsErrWithFilename(0, NULL); 2292 2293 repr = PyString_FromStringAndSize(NULL, mbcssize); 2294 if (repr == NULL) 2295 return NULL; 2296 if (mbcssize == 0) 2297 return repr; 2298 2299 /* Do the conversion */ 2300 s = PyString_AS_STRING(repr); 2301 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) { 2302 Py_DECREF(repr); 2303 return PyErr_SetFromWindowsErrWithFilename(0, NULL); 2304 } 2305 return repr; 2306} 2307 2308#endif /* MS_WIN32 */ 2309 2310/* --- Character Mapping Codec -------------------------------------------- */ 2311 2312static 2313int charmap_decoding_error(const char **source, 2314 Py_UNICODE **dest, 2315 const char *errors, 2316 const char *details) 2317{ 2318 if ((errors == NULL) || 2319 (strcmp(errors,"strict") == 0)) { 2320 PyErr_Format(PyExc_UnicodeError, 2321 "charmap decoding error: %.400s", 2322 details); 2323 return -1; 2324 } 2325 else if (strcmp(errors,"ignore") == 0) { 2326 return 0; 2327 } 2328 else if (strcmp(errors,"replace") == 0) { 2329 **dest = Py_UNICODE_REPLACEMENT_CHARACTER; 2330 (*dest)++; 2331 return 0; 2332 } 2333 else { 2334 PyErr_Format(PyExc_ValueError, 2335 "charmap decoding error; " 2336 "unknown error handling code: %.400s", 2337 errors); 2338 return -1; 2339 } 2340} 2341 2342PyObject *PyUnicode_DecodeCharmap(const char *s, 2343 int size, 2344 PyObject *mapping, 2345 const char *errors) 2346{ 2347 PyUnicodeObject *v; 2348 Py_UNICODE *p; 2349 int extrachars = 0; 2350 2351 /* Default to Latin-1 */ 2352 if (mapping == NULL) 2353 return PyUnicode_DecodeLatin1(s, size, errors); 2354 2355 v = _PyUnicode_New(size); 2356 if (v == NULL) 2357 goto onError; 2358 if (size == 0) 2359 return (PyObject *)v; 2360 p = PyUnicode_AS_UNICODE(v); 2361 while (size-- > 0) { 2362 unsigned 
char ch = *s++; 2363 PyObject *w, *x; 2364 2365 /* Get mapping (char ordinal -> integer, Unicode char or None) */ 2366 w = PyInt_FromLong((long)ch); 2367 if (w == NULL) 2368 goto onError; 2369 x = PyObject_GetItem(mapping, w); 2370 Py_DECREF(w); 2371 if (x == NULL) { 2372 if (PyErr_ExceptionMatches(PyExc_LookupError)) { 2373 /* No mapping found means: mapping is undefined. */ 2374 PyErr_Clear(); 2375 x = Py_None; 2376 Py_INCREF(x); 2377 } else 2378 goto onError; 2379 } 2380 2381 /* Apply mapping */ 2382 if (PyInt_Check(x)) { 2383 long value = PyInt_AS_LONG(x); 2384 if (value < 0 || value > 65535) { 2385 PyErr_SetString(PyExc_TypeError, 2386 "character mapping must be in range(65536)"); 2387 Py_DECREF(x); 2388 goto onError; 2389 } 2390 *p++ = (Py_UNICODE)value; 2391 } 2392 else if (x == Py_None) { 2393 /* undefined mapping */ 2394 if (charmap_decoding_error(&s, &p, errors, 2395 "character maps to <undefined>")) { 2396 Py_DECREF(x); 2397 goto onError; 2398 } 2399 } 2400 else if (PyUnicode_Check(x)) { 2401 int targetsize = PyUnicode_GET_SIZE(x); 2402 2403 if (targetsize == 1) 2404 /* 1-1 mapping */ 2405 *p++ = *PyUnicode_AS_UNICODE(x); 2406 2407 else if (targetsize > 1) { 2408 /* 1-n mapping */ 2409 if (targetsize > extrachars) { 2410 /* resize first */ 2411 int oldpos = (int)(p - PyUnicode_AS_UNICODE(v)); 2412 int needed = (targetsize - extrachars) + \ 2413 (targetsize << 2); 2414 extrachars += needed; 2415 if (_PyUnicode_Resize(&v, 2416 PyUnicode_GET_SIZE(v) + needed)) { 2417 Py_DECREF(x); 2418 goto onError; 2419 } 2420 p = PyUnicode_AS_UNICODE(v) + oldpos; 2421 } 2422 Py_UNICODE_COPY(p, 2423 PyUnicode_AS_UNICODE(x), 2424 targetsize); 2425 p += targetsize; 2426 extrachars -= targetsize; 2427 } 2428 /* 1-0 mapping: skip the character */ 2429 } 2430 else { 2431 /* wrong return value */ 2432 PyErr_SetString(PyExc_TypeError, 2433 "character mapping must return integer, None or unicode"); 2434 Py_DECREF(x); 2435 goto onError; 2436 } 2437 Py_DECREF(x); 2438 } 2439 if (p - 
PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v)) 2440 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v)))) 2441 goto onError; 2442 return (PyObject *)v; 2443 2444 onError: 2445 Py_XDECREF(v); 2446 return NULL; 2447} 2448 2449static 2450int charmap_encoding_error(const Py_UNICODE **source, 2451 char **dest, 2452 const char *errors, 2453 const char *details) 2454{ 2455 if ((errors == NULL) || 2456 (strcmp(errors,"strict") == 0)) { 2457 PyErr_Format(PyExc_UnicodeError, 2458 "charmap encoding error: %.400s", 2459 details); 2460 return -1; 2461 } 2462 else if (strcmp(errors,"ignore") == 0) { 2463 return 0; 2464 } 2465 else if (strcmp(errors,"replace") == 0) { 2466 **dest = '?'; 2467 (*dest)++; 2468 return 0; 2469 } 2470 else { 2471 PyErr_Format(PyExc_ValueError, 2472 "charmap encoding error; " 2473 "unknown error handling code: %.400s", 2474 errors); 2475 return -1; 2476 } 2477} 2478 2479PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p, 2480 int size, 2481 PyObject *mapping, 2482 const char *errors) 2483{ 2484 PyObject *v; 2485 char *s; 2486 int extrachars = 0; 2487 2488 /* Default to Latin-1 */ 2489 if (mapping == NULL) 2490 return PyUnicode_EncodeLatin1(p, size, errors); 2491 2492 v = PyString_FromStringAndSize(NULL, size); 2493 if (v == NULL) 2494 return NULL; 2495 if (size == 0) 2496 return v; 2497 s = PyString_AS_STRING(v); 2498 while (size-- > 0) { 2499 Py_UNICODE ch = *p++; 2500 PyObject *w, *x; 2501 2502 /* Get mapping (Unicode ordinal -> string char, integer or None) */ 2503 w = PyInt_FromLong((long)ch); 2504 if (w == NULL) 2505 goto onError; 2506 x = PyObject_GetItem(mapping, w); 2507 Py_DECREF(w); 2508 if (x == NULL) { 2509 if (PyErr_ExceptionMatches(PyExc_LookupError)) { 2510 /* No mapping found means: mapping is undefined. 
*/ 2511 PyErr_Clear(); 2512 x = Py_None; 2513 Py_INCREF(x); 2514 } else 2515 goto onError; 2516 } 2517 2518 /* Apply mapping */ 2519 if (PyInt_Check(x)) { 2520 long value = PyInt_AS_LONG(x); 2521 if (value < 0 || value > 255) { 2522 PyErr_SetString(PyExc_TypeError, 2523 "character mapping must be in range(256)"); 2524 Py_DECREF(x); 2525 goto onError; 2526 } 2527 *s++ = (char)value; 2528 } 2529 else if (x == Py_None) { 2530 /* undefined mapping */ 2531 if (charmap_encoding_error(&p, &s, errors, 2532 "character maps to <undefined>")) { 2533 Py_DECREF(x); 2534 goto onError; 2535 } 2536 } 2537 else if (PyString_Check(x)) { 2538 int targetsize = PyString_GET_SIZE(x); 2539 2540 if (targetsize == 1) 2541 /* 1-1 mapping */ 2542 *s++ = *PyString_AS_STRING(x); 2543 2544 else if (targetsize > 1) { 2545 /* 1-n mapping */ 2546 if (targetsize > extrachars) { 2547 /* resize first */ 2548 int oldpos = (int)(s - PyString_AS_STRING(v)); 2549 int needed = (targetsize - extrachars) + \ 2550 (targetsize << 2); 2551 extrachars += needed; 2552 if (_PyString_Resize(&v, PyString_GET_SIZE(v) + needed)) { 2553 Py_DECREF(x); 2554 goto onError; 2555 } 2556 s = PyString_AS_STRING(v) + oldpos; 2557 } 2558 memcpy(s, PyString_AS_STRING(x), targetsize); 2559 s += targetsize; 2560 extrachars -= targetsize; 2561 } 2562 /* 1-0 mapping: skip the character */ 2563 } 2564 else { 2565 /* wrong return value */ 2566 PyErr_SetString(PyExc_TypeError, 2567 "character mapping must return integer, None or unicode"); 2568 Py_DECREF(x); 2569 goto onError; 2570 } 2571 Py_DECREF(x); 2572 } 2573 if (s - PyString_AS_STRING(v) < PyString_GET_SIZE(v)) 2574 _PyString_Resize(&v, (int)(s - PyString_AS_STRING(v))); 2575 return v; 2576 2577 onError: 2578 Py_XDECREF(v); 2579 return NULL; 2580} 2581 2582PyObject *PyUnicode_AsCharmapString(PyObject *unicode, 2583 PyObject *mapping) 2584{ 2585 if (!PyUnicode_Check(unicode) || mapping == NULL) { 2586 PyErr_BadArgument(); 2587 return NULL; 2588 } 2589 return 
PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode), 2590 PyUnicode_GET_SIZE(unicode), 2591 mapping, 2592 NULL); 2593} 2594 2595static 2596int translate_error(const Py_UNICODE **source, 2597 Py_UNICODE **dest, 2598 const char *errors, 2599 const char *details) 2600{ 2601 if ((errors == NULL) || 2602 (strcmp(errors,"strict") == 0)) { 2603 PyErr_Format(PyExc_UnicodeError, 2604 "translate error: %.400s", 2605 details); 2606 return -1; 2607 } 2608 else if (strcmp(errors,"ignore") == 0) { 2609 return 0; 2610 } 2611 else if (strcmp(errors,"replace") == 0) { 2612 **dest = '?'; 2613 (*dest)++; 2614 return 0; 2615 } 2616 else { 2617 PyErr_Format(PyExc_ValueError, 2618 "translate error; " 2619 "unknown error handling code: %.400s", 2620 errors); 2621 return -1; 2622 } 2623} 2624 2625PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *s, 2626 int size, 2627 PyObject *mapping, 2628 const char *errors) 2629{ 2630 PyUnicodeObject *v; 2631 Py_UNICODE *p; 2632 2633 if (mapping == NULL) { 2634 PyErr_BadArgument(); 2635 return NULL; 2636 } 2637 2638 /* Output will never be longer than input */ 2639 v = _PyUnicode_New(size); 2640 if (v == NULL) 2641 goto onError; 2642 if (size == 0) 2643 goto done; 2644 p = PyUnicode_AS_UNICODE(v); 2645 while (size-- > 0) { 2646 Py_UNICODE ch = *s++; 2647 PyObject *w, *x; 2648 2649 /* Get mapping */ 2650 w = PyInt_FromLong(ch); 2651 if (w == NULL) 2652 goto onError; 2653 x = PyObject_GetItem(mapping, w); 2654 Py_DECREF(w); 2655 if (x == NULL) { 2656 if (PyErr_ExceptionMatches(PyExc_LookupError)) { 2657 /* No mapping found: default to 1-1 mapping */ 2658 PyErr_Clear(); 2659 *p++ = ch; 2660 continue; 2661 } 2662 goto onError; 2663 } 2664 2665 /* Apply mapping */ 2666 if (PyInt_Check(x)) 2667 *p++ = (Py_UNICODE)PyInt_AS_LONG(x); 2668 else if (x == Py_None) { 2669 /* undefined mapping */ 2670 if (translate_error(&s, &p, errors, 2671 "character maps to <undefined>")) { 2672 Py_DECREF(x); 2673 goto onError; 2674 } 2675 } 2676 else if 
(PyUnicode_Check(x)) { 2677 if (PyUnicode_GET_SIZE(x) != 1) { 2678 /* 1-n mapping */ 2679 PyErr_SetString(PyExc_NotImplementedError, 2680 "1-n mappings are currently not implemented"); 2681 Py_DECREF(x); 2682 goto onError; 2683 } 2684 *p++ = *PyUnicode_AS_UNICODE(x); 2685 } 2686 else { 2687 /* wrong return value */ 2688 PyErr_SetString(PyExc_TypeError, 2689 "translate mapping must return integer, None or unicode"); 2690 Py_DECREF(x); 2691 goto onError; 2692 } 2693 Py_DECREF(x); 2694 } 2695 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v)) 2696 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v)))) 2697 goto onError; 2698 2699 done: 2700 return (PyObject *)v; 2701 2702 onError: 2703 Py_XDECREF(v); 2704 return NULL; 2705} 2706 2707PyObject *PyUnicode_Translate(PyObject *str, 2708 PyObject *mapping, 2709 const char *errors) 2710{ 2711 PyObject *result; 2712 2713 str = PyUnicode_FromObject(str); 2714 if (str == NULL) 2715 goto onError; 2716 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str), 2717 PyUnicode_GET_SIZE(str), 2718 mapping, 2719 errors); 2720 Py_DECREF(str); 2721 return result; 2722 2723 onError: 2724 Py_XDECREF(str); 2725 return NULL; 2726} 2727 2728/* --- Decimal Encoder ---------------------------------------------------- */ 2729 2730int PyUnicode_EncodeDecimal(Py_UNICODE *s, 2731 int length, 2732 char *output, 2733 const char *errors) 2734{ 2735 Py_UNICODE *p, *end; 2736 2737 if (output == NULL) { 2738 PyErr_BadArgument(); 2739 return -1; 2740 } 2741 2742 p = s; 2743 end = s + length; 2744 while (p < end) { 2745 register Py_UNICODE ch = *p++; 2746 int decimal; 2747 2748 if (Py_UNICODE_ISSPACE(ch)) { 2749 *output++ = ' '; 2750 continue; 2751 } 2752 decimal = Py_UNICODE_TODECIMAL(ch); 2753 if (decimal >= 0) { 2754 *output++ = '0' + decimal; 2755 continue; 2756 } 2757 if (0 < ch && ch < 256) { 2758 *output++ = (char)ch; 2759 continue; 2760 } 2761 /* All other characters are considered invalid */ 2762 if (errors == NULL || 
strcmp(errors, "strict") == 0) { 2763 PyErr_SetString(PyExc_ValueError, 2764 "invalid decimal Unicode string"); 2765 goto onError; 2766 } 2767 else if (strcmp(errors, "ignore") == 0) 2768 continue; 2769 else if (strcmp(errors, "replace") == 0) { 2770 *output++ = '?'; 2771 continue; 2772 } 2773 } 2774 /* 0-terminate the output string */ 2775 *output++ = '\0'; 2776 return 0; 2777 2778 onError: 2779 return -1; 2780} 2781 2782/* --- Helpers ------------------------------------------------------------ */ 2783 2784static 2785int count(PyUnicodeObject *self, 2786 int start, 2787 int end, 2788 PyUnicodeObject *substring) 2789{ 2790 int count = 0; 2791 2792 if (start < 0) 2793 start += self->length; 2794 if (start < 0) 2795 start = 0; 2796 if (end > self->length) 2797 end = self->length; 2798 if (end < 0) 2799 end += self->length; 2800 if (end < 0) 2801 end = 0; 2802 2803 if (substring->length == 0) 2804 return (end - start + 1); 2805 2806 end -= substring->length; 2807 2808 while (start <= end) 2809 if (Py_UNICODE_MATCH(self, start, substring)) { 2810 count++; 2811 start += substring->length; 2812 } else 2813 start++; 2814 2815 return count; 2816} 2817 2818int PyUnicode_Count(PyObject *str, 2819 PyObject *substr, 2820 int start, 2821 int end) 2822{ 2823 int result; 2824 2825 str = PyUnicode_FromObject(str); 2826 if (str == NULL) 2827 return -1; 2828 substr = PyUnicode_FromObject(substr); 2829 if (substr == NULL) { 2830 Py_DECREF(str); 2831 return -1; 2832 } 2833 2834 result = count((PyUnicodeObject *)str, 2835 start, end, 2836 (PyUnicodeObject *)substr); 2837 2838 Py_DECREF(str); 2839 Py_DECREF(substr); 2840 return result; 2841} 2842 2843static 2844int findstring(PyUnicodeObject *self, 2845 PyUnicodeObject *substring, 2846 int start, 2847 int end, 2848 int direction) 2849{ 2850 if (start < 0) 2851 start += self->length; 2852 if (start < 0) 2853 start = 0; 2854 2855 if (substring->length == 0) 2856 return start; 2857 2858 if (end > self->length) 2859 end = self->length; 
2860 if (end < 0) 2861 end += self->length; 2862 if (end < 0) 2863 end = 0; 2864 2865 end -= substring->length; 2866 2867 if (direction < 0) { 2868 for (; end >= start; end--) 2869 if (Py_UNICODE_MATCH(self, end, substring)) 2870 return end; 2871 } else { 2872 for (; start <= end; start++) 2873 if (Py_UNICODE_MATCH(self, start, substring)) 2874 return start; 2875 } 2876 2877 return -1; 2878} 2879 2880int PyUnicode_Find(PyObject *str, 2881 PyObject *substr, 2882 int start, 2883 int end, 2884 int direction) 2885{ 2886 int result; 2887 2888 str = PyUnicode_FromObject(str); 2889 if (str == NULL) 2890 return -1; 2891 substr = PyUnicode_FromObject(substr); 2892 if (substr == NULL) { 2893 Py_DECREF(substr); 2894 return -1; 2895 } 2896 2897 result = findstring((PyUnicodeObject *)str, 2898 (PyUnicodeObject *)substr, 2899 start, end, direction); 2900 Py_DECREF(str); 2901 Py_DECREF(substr); 2902 return result; 2903} 2904 2905static 2906int tailmatch(PyUnicodeObject *self, 2907 PyUnicodeObject *substring, 2908 int start, 2909 int end, 2910 int direction) 2911{ 2912 if (start < 0) 2913 start += self->length; 2914 if (start < 0) 2915 start = 0; 2916 2917 if (substring->length == 0) 2918 return 1; 2919 2920 if (end > self->length) 2921 end = self->length; 2922 if (end < 0) 2923 end += self->length; 2924 if (end < 0) 2925 end = 0; 2926 2927 end -= substring->length; 2928 if (end < start) 2929 return 0; 2930 2931 if (direction > 0) { 2932 if (Py_UNICODE_MATCH(self, end, substring)) 2933 return 1; 2934 } else { 2935 if (Py_UNICODE_MATCH(self, start, substring)) 2936 return 1; 2937 } 2938 2939 return 0; 2940} 2941 2942int PyUnicode_Tailmatch(PyObject *str, 2943 PyObject *substr, 2944 int start, 2945 int end, 2946 int direction) 2947{ 2948 int result; 2949 2950 str = PyUnicode_FromObject(str); 2951 if (str == NULL) 2952 return -1; 2953 substr = PyUnicode_FromObject(substr); 2954 if (substr == NULL) { 2955 Py_DECREF(substr); 2956 return -1; 2957 } 2958 2959 result = 
tailmatch((PyUnicodeObject *)str, 2960 (PyUnicodeObject *)substr, 2961 start, end, direction); 2962 Py_DECREF(str); 2963 Py_DECREF(substr); 2964 return result; 2965} 2966 2967static 2968const Py_UNICODE *findchar(const Py_UNICODE *s, 2969 int size, 2970 Py_UNICODE ch) 2971{ 2972 /* like wcschr, but doesn't stop at NULL characters */ 2973 2974 while (size-- > 0) { 2975 if (*s == ch) 2976 return s; 2977 s++; 2978 } 2979 2980 return NULL; 2981} 2982 2983/* Apply fixfct filter to the Unicode object self and return a 2984 reference to the modified object */ 2985 2986static 2987PyObject *fixup(PyUnicodeObject *self, 2988 int (*fixfct)(PyUnicodeObject *s)) 2989{ 2990 2991 PyUnicodeObject *u; 2992 2993 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length); 2994 if (u == NULL) 2995 return NULL; 2996 2997 Py_UNICODE_COPY(u->str, self->str, self->length); 2998 2999 if (!fixfct(u) && PyUnicode_CheckExact(self)) { 3000 /* fixfct should return TRUE if it modified the buffer. If 3001 FALSE, return a reference to the original buffer instead 3002 (to save space, not time) */ 3003 Py_INCREF(self); 3004 Py_DECREF(u); 3005 return (PyObject*) self; 3006 } 3007 return (PyObject*) u; 3008} 3009 3010static 3011int fixupper(PyUnicodeObject *self) 3012{ 3013 int len = self->length; 3014 Py_UNICODE *s = self->str; 3015 int status = 0; 3016 3017 while (len-- > 0) { 3018 register Py_UNICODE ch; 3019 3020 ch = Py_UNICODE_TOUPPER(*s); 3021 if (ch != *s) { 3022 status = 1; 3023 *s = ch; 3024 } 3025 s++; 3026 } 3027 3028 return status; 3029} 3030 3031static 3032int fixlower(PyUnicodeObject *self) 3033{ 3034 int len = self->length; 3035 Py_UNICODE *s = self->str; 3036 int status = 0; 3037 3038 while (len-- > 0) { 3039 register Py_UNICODE ch; 3040 3041 ch = Py_UNICODE_TOLOWER(*s); 3042 if (ch != *s) { 3043 status = 1; 3044 *s = ch; 3045 } 3046 s++; 3047 } 3048 3049 return status; 3050} 3051 3052static 3053int fixswapcase(PyUnicodeObject *self) 3054{ 3055 int len = self->length; 3056 
Py_UNICODE *s = self->str; 3057 int status = 0; 3058 3059 while (len-- > 0) { 3060 if (Py_UNICODE_ISUPPER(*s)) { 3061 *s = Py_UNICODE_TOLOWER(*s); 3062 status = 1; 3063 } else if (Py_UNICODE_ISLOWER(*s)) { 3064 *s = Py_UNICODE_TOUPPER(*s); 3065 status = 1; 3066 } 3067 s++; 3068 } 3069 3070 return status; 3071} 3072 3073static 3074int fixcapitalize(PyUnicodeObject *self) 3075{ 3076 int len = self->length; 3077 Py_UNICODE *s = self->str; 3078 int status = 0; 3079 3080 if (len == 0) 3081 return 0; 3082 if (Py_UNICODE_ISLOWER(*s)) { 3083 *s = Py_UNICODE_TOUPPER(*s); 3084 status = 1; 3085 } 3086 s++; 3087 while (--len > 0) { 3088 if (Py_UNICODE_ISUPPER(*s)) { 3089 *s = Py_UNICODE_TOLOWER(*s); 3090 status = 1; 3091 } 3092 s++; 3093 } 3094 return status; 3095} 3096 3097static 3098int fixtitle(PyUnicodeObject *self) 3099{ 3100 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 3101 register Py_UNICODE *e; 3102 int previous_is_cased; 3103 3104 /* Shortcut for single character strings */ 3105 if (PyUnicode_GET_SIZE(self) == 1) { 3106 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p); 3107 if (*p != ch) { 3108 *p = ch; 3109 return 1; 3110 } 3111 else 3112 return 0; 3113 } 3114 3115 e = p + PyUnicode_GET_SIZE(self); 3116 previous_is_cased = 0; 3117 for (; p < e; p++) { 3118 register const Py_UNICODE ch = *p; 3119 3120 if (previous_is_cased) 3121 *p = Py_UNICODE_TOLOWER(ch); 3122 else 3123 *p = Py_UNICODE_TOTITLE(ch); 3124 3125 if (Py_UNICODE_ISLOWER(ch) || 3126 Py_UNICODE_ISUPPER(ch) || 3127 Py_UNICODE_ISTITLE(ch)) 3128 previous_is_cased = 1; 3129 else 3130 previous_is_cased = 0; 3131 } 3132 return 1; 3133} 3134 3135PyObject *PyUnicode_Join(PyObject *separator, 3136 PyObject *seq) 3137{ 3138 Py_UNICODE *sep; 3139 int seplen; 3140 PyUnicodeObject *res = NULL; 3141 int reslen = 0; 3142 Py_UNICODE *p; 3143 int sz = 100; 3144 int i; 3145 PyObject *it; 3146 3147 it = PyObject_GetIter(seq); 3148 if (it == NULL) 3149 return NULL; 3150 3151 if (separator == NULL) { 3152 Py_UNICODE blank = ' '; 
3153 sep = ␣ 3154 seplen = 1; 3155 } 3156 else { 3157 separator = PyUnicode_FromObject(separator); 3158 if (separator == NULL) 3159 goto onError; 3160 sep = PyUnicode_AS_UNICODE(separator); 3161 seplen = PyUnicode_GET_SIZE(separator); 3162 } 3163 3164 res = _PyUnicode_New(sz); 3165 if (res == NULL) 3166 goto onError; 3167 p = PyUnicode_AS_UNICODE(res); 3168 reslen = 0; 3169 3170 for (i = 0; ; ++i) { 3171 int itemlen; 3172 PyObject *item = PyIter_Next(it); 3173 if (item == NULL) { 3174 if (PyErr_Occurred()) 3175 goto onError; 3176 break; 3177 } 3178 if (!PyUnicode_Check(item)) { 3179 PyObject *v; 3180 if (!PyString_Check(item)) { 3181 PyErr_Format(PyExc_TypeError, 3182 "sequence item %i: expected string or Unicode," 3183 " %.80s found", 3184 i, item->ob_type->tp_name); 3185 Py_DECREF(item); 3186 goto onError; 3187 } 3188 v = PyUnicode_FromObject(item); 3189 Py_DECREF(item); 3190 item = v; 3191 if (item == NULL) 3192 goto onError; 3193 } 3194 itemlen = PyUnicode_GET_SIZE(item); 3195 while (reslen + itemlen + seplen >= sz) { 3196 if (_PyUnicode_Resize(&res, sz*2)) { 3197 Py_DECREF(item); 3198 goto onError; 3199 } 3200 sz *= 2; 3201 p = PyUnicode_AS_UNICODE(res) + reslen; 3202 } 3203 if (i > 0) { 3204 Py_UNICODE_COPY(p, sep, seplen); 3205 p += seplen; 3206 reslen += seplen; 3207 } 3208 Py_UNICODE_COPY(p, PyUnicode_AS_UNICODE(item), itemlen); 3209 p += itemlen; 3210 reslen += itemlen; 3211 Py_DECREF(item); 3212 } 3213 if (_PyUnicode_Resize(&res, reslen)) 3214 goto onError; 3215 3216 Py_XDECREF(separator); 3217 Py_DECREF(it); 3218 return (PyObject *)res; 3219 3220 onError: 3221 Py_XDECREF(separator); 3222 Py_XDECREF(res); 3223 Py_DECREF(it); 3224 return NULL; 3225} 3226 3227static 3228PyUnicodeObject *pad(PyUnicodeObject *self, 3229 int left, 3230 int right, 3231 Py_UNICODE fill) 3232{ 3233 PyUnicodeObject *u; 3234 3235 if (left < 0) 3236 left = 0; 3237 if (right < 0) 3238 right = 0; 3239 3240 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) { 3241 
Py_INCREF(self); 3242 return self; 3243 } 3244 3245 u = _PyUnicode_New(left + self->length + right); 3246 if (u) { 3247 if (left) 3248 Py_UNICODE_FILL(u->str, fill, left); 3249 Py_UNICODE_COPY(u->str + left, self->str, self->length); 3250 if (right) 3251 Py_UNICODE_FILL(u->str + left + self->length, fill, right); 3252 } 3253 3254 return u; 3255} 3256 3257#define SPLIT_APPEND(data, left, right) \ 3258 str = PyUnicode_FromUnicode(data + left, right - left); \ 3259 if (!str) \ 3260 goto onError; \ 3261 if (PyList_Append(list, str)) { \ 3262 Py_DECREF(str); \ 3263 goto onError; \ 3264 } \ 3265 else \ 3266 Py_DECREF(str); 3267 3268static 3269PyObject *split_whitespace(PyUnicodeObject *self, 3270 PyObject *list, 3271 int maxcount) 3272{ 3273 register int i; 3274 register int j; 3275 int len = self->length; 3276 PyObject *str; 3277 3278 for (i = j = 0; i < len; ) { 3279 /* find a token */ 3280 while (i < len && Py_UNICODE_ISSPACE(self->str[i])) 3281 i++; 3282 j = i; 3283 while (i < len && !Py_UNICODE_ISSPACE(self->str[i])) 3284 i++; 3285 if (j < i) { 3286 if (maxcount-- <= 0) 3287 break; 3288 SPLIT_APPEND(self->str, j, i); 3289 while (i < len && Py_UNICODE_ISSPACE(self->str[i])) 3290 i++; 3291 j = i; 3292 } 3293 } 3294 if (j < len) { 3295 SPLIT_APPEND(self->str, j, len); 3296 } 3297 return list; 3298 3299 onError: 3300 Py_DECREF(list); 3301 return NULL; 3302} 3303 3304PyObject *PyUnicode_Splitlines(PyObject *string, 3305 int keepends) 3306{ 3307 register int i; 3308 register int j; 3309 int len; 3310 PyObject *list; 3311 PyObject *str; 3312 Py_UNICODE *data; 3313 3314 string = PyUnicode_FromObject(string); 3315 if (string == NULL) 3316 return NULL; 3317 data = PyUnicode_AS_UNICODE(string); 3318 len = PyUnicode_GET_SIZE(string); 3319 3320 list = PyList_New(0); 3321 if (!list) 3322 goto onError; 3323 3324 for (i = j = 0; i < len; ) { 3325 int eol; 3326 3327 /* Find a line and append it */ 3328 while (i < len && !Py_UNICODE_ISLINEBREAK(data[i])) 3329 i++; 3330 3331 /* Skip 
the line break reading CRLF as one line break */ 3332 eol = i; 3333 if (i < len) { 3334 if (data[i] == '\r' && i + 1 < len && 3335 data[i+1] == '\n') 3336 i += 2; 3337 else 3338 i++; 3339 if (keepends) 3340 eol = i; 3341 } 3342 SPLIT_APPEND(data, j, eol); 3343 j = i; 3344 } 3345 if (j < len) { 3346 SPLIT_APPEND(data, j, len); 3347 } 3348 3349 Py_DECREF(string); 3350 return list; 3351 3352 onError: 3353 Py_DECREF(list); 3354 Py_DECREF(string); 3355 return NULL; 3356} 3357 3358static 3359PyObject *split_char(PyUnicodeObject *self, 3360 PyObject *list, 3361 Py_UNICODE ch, 3362 int maxcount) 3363{ 3364 register int i; 3365 register int j; 3366 int len = self->length; 3367 PyObject *str; 3368 3369 for (i = j = 0; i < len; ) { 3370 if (self->str[i] == ch) { 3371 if (maxcount-- <= 0) 3372 break; 3373 SPLIT_APPEND(self->str, j, i); 3374 i = j = i + 1; 3375 } else 3376 i++; 3377 } 3378 if (j <= len) { 3379 SPLIT_APPEND(self->str, j, len); 3380 } 3381 return list; 3382 3383 onError: 3384 Py_DECREF(list); 3385 return NULL; 3386} 3387 3388static 3389PyObject *split_substring(PyUnicodeObject *self, 3390 PyObject *list, 3391 PyUnicodeObject *substring, 3392 int maxcount) 3393{ 3394 register int i; 3395 register int j; 3396 int len = self->length; 3397 int sublen = substring->length; 3398 PyObject *str; 3399 3400 for (i = j = 0; i <= len - sublen; ) { 3401 if (Py_UNICODE_MATCH(self, i, substring)) { 3402 if (maxcount-- <= 0) 3403 break; 3404 SPLIT_APPEND(self->str, j, i); 3405 i = j = i + sublen; 3406 } else 3407 i++; 3408 } 3409 if (j <= len) { 3410 SPLIT_APPEND(self->str, j, len); 3411 } 3412 return list; 3413 3414 onError: 3415 Py_DECREF(list); 3416 return NULL; 3417} 3418 3419#undef SPLIT_APPEND 3420 3421static 3422PyObject *split(PyUnicodeObject *self, 3423 PyUnicodeObject *substring, 3424 int maxcount) 3425{ 3426 PyObject *list; 3427 3428 if (maxcount < 0) 3429 maxcount = INT_MAX; 3430 3431 list = PyList_New(0); 3432 if (!list) 3433 return NULL; 3434 3435 if (substring == 
NULL) 3436 return split_whitespace(self,list,maxcount); 3437 3438 else if (substring->length == 1) 3439 return split_char(self,list,substring->str[0],maxcount); 3440 3441 else if (substring->length == 0) { 3442 Py_DECREF(list); 3443 PyErr_SetString(PyExc_ValueError, "empty separator"); 3444 return NULL; 3445 } 3446 else 3447 return split_substring(self,list,substring,maxcount); 3448} 3449 3450static 3451PyObject *replace(PyUnicodeObject *self, 3452 PyUnicodeObject *str1, 3453 PyUnicodeObject *str2, 3454 int maxcount) 3455{ 3456 PyUnicodeObject *u; 3457 3458 if (maxcount < 0) 3459 maxcount = INT_MAX; 3460 3461 if (str1->length == 1 && str2->length == 1) { 3462 int i; 3463 3464 /* replace characters */ 3465 if (!findchar(self->str, self->length, str1->str[0]) && 3466 PyUnicode_CheckExact(self)) { 3467 /* nothing to replace, return original string */ 3468 Py_INCREF(self); 3469 u = self; 3470 } else { 3471 Py_UNICODE u1 = str1->str[0]; 3472 Py_UNICODE u2 = str2->str[0]; 3473 3474 u = (PyUnicodeObject*) PyUnicode_FromUnicode( 3475 NULL, 3476 self->length 3477 ); 3478 if (u != NULL) { 3479 Py_UNICODE_COPY(u->str, self->str, 3480 self->length); 3481 for (i = 0; i < u->length; i++) 3482 if (u->str[i] == u1) { 3483 if (--maxcount < 0) 3484 break; 3485 u->str[i] = u2; 3486 } 3487 } 3488 } 3489 3490 } else { 3491 int n, i; 3492 Py_UNICODE *p; 3493 3494 /* replace strings */ 3495 n = count(self, 0, self->length, str1); 3496 if (n > maxcount) 3497 n = maxcount; 3498 if (n == 0 && PyUnicode_CheckExact(self)) { 3499 /* nothing to replace, return original string */ 3500 Py_INCREF(self); 3501 u = self; 3502 } else { 3503 u = _PyUnicode_New( 3504 self->length + n * (str2->length - str1->length)); 3505 if (u) { 3506 i = 0; 3507 p = u->str; 3508 while (i <= self->length - str1->length) 3509 if (Py_UNICODE_MATCH(self, i, str1)) { 3510 /* replace string segment */ 3511 Py_UNICODE_COPY(p, str2->str, str2->length); 3512 p += str2->length; 3513 i += str1->length; 3514 if (--n <= 0) { 3515 
/* copy remaining part */ 3516 Py_UNICODE_COPY(p, self->str+i, self->length-i); 3517 break; 3518 } 3519 } else 3520 *p++ = self->str[i++]; 3521 } 3522 } 3523 } 3524 3525 return (PyObject *) u; 3526} 3527 3528/* --- Unicode Object Methods --------------------------------------------- */ 3529 3530static char title__doc__[] = 3531"S.title() -> unicode\n\ 3532\n\ 3533Return a titlecased version of S, i.e. words start with title case\n\ 3534characters, all remaining cased characters have lower case."; 3535 3536static PyObject* 3537unicode_title(PyUnicodeObject *self) 3538{ 3539 return fixup(self, fixtitle); 3540} 3541 3542static char capitalize__doc__[] = 3543"S.capitalize() -> unicode\n\ 3544\n\ 3545Return a capitalized version of S, i.e. make the first character\n\ 3546have upper case."; 3547 3548static PyObject* 3549unicode_capitalize(PyUnicodeObject *self) 3550{ 3551 return fixup(self, fixcapitalize); 3552} 3553 3554#if 0 3555static char capwords__doc__[] = 3556"S.capwords() -> unicode\n\ 3557\n\ 3558Apply .capitalize() to all words in S and return the result with\n\ 3559normalized whitespace (all whitespace strings are replaced by ' ')."; 3560 3561static PyObject* 3562unicode_capwords(PyUnicodeObject *self) 3563{ 3564 PyObject *list; 3565 PyObject *item; 3566 int i; 3567 3568 /* Split into words */ 3569 list = split(self, NULL, -1); 3570 if (!list) 3571 return NULL; 3572 3573 /* Capitalize each word */ 3574 for (i = 0; i < PyList_GET_SIZE(list); i++) { 3575 item = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, i), 3576 fixcapitalize); 3577 if (item == NULL) 3578 goto onError; 3579 Py_DECREF(PyList_GET_ITEM(list, i)); 3580 PyList_SET_ITEM(list, i, item); 3581 } 3582 3583 /* Join the words to form a new string */ 3584 item = PyUnicode_Join(NULL, list); 3585 3586onError: 3587 Py_DECREF(list); 3588 return (PyObject *)item; 3589} 3590#endif 3591 3592static char center__doc__[] = 3593"S.center(width) -> unicode\n\ 3594\n\ 3595Return S centered in a Unicode string of 
length width. Padding is done\n\ 3596using spaces."; 3597 3598static PyObject * 3599unicode_center(PyUnicodeObject *self, PyObject *args) 3600{ 3601 int marg, left; 3602 int width; 3603 3604 if (!PyArg_ParseTuple(args, "i:center", &width)) 3605 return NULL; 3606 3607 if (self->length >= width && PyUnicode_CheckExact(self)) { 3608 Py_INCREF(self); 3609 return (PyObject*) self; 3610 } 3611 3612 marg = width - self->length; 3613 left = marg / 2 + (marg & width & 1); 3614 3615 return (PyObject*) pad(self, left, marg - left, ' '); 3616} 3617 3618#if 0 3619 3620/* This code should go into some future Unicode collation support 3621 module. The basic comparison should compare ordinals on a naive 3622 basis (this is what Java does and thus JPython too). */ 3623 3624/* speedy UTF-16 code point order comparison */ 3625/* gleaned from: */ 3626/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */ 3627 3628static short utf16Fixup[32] = 3629{ 3630 0, 0, 0, 0, 0, 0, 0, 0, 3631 0, 0, 0, 0, 0, 0, 0, 0, 3632 0, 0, 0, 0, 0, 0, 0, 0, 3633 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800 3634}; 3635 3636static int 3637unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2) 3638{ 3639 int len1, len2; 3640 3641 Py_UNICODE *s1 = str1->str; 3642 Py_UNICODE *s2 = str2->str; 3643 3644 len1 = str1->length; 3645 len2 = str2->length; 3646 3647 while (len1 > 0 && len2 > 0) { 3648 Py_UNICODE c1, c2; 3649 3650 c1 = *s1++; 3651 c2 = *s2++; 3652 3653 if (c1 > (1<<11) * 26) 3654 c1 += utf16Fixup[c1>>11]; 3655 if (c2 > (1<<11) * 26) 3656 c2 += utf16Fixup[c2>>11]; 3657 /* now c1 and c2 are in UTF-32-compatible order */ 3658 3659 if (c1 != c2) 3660 return (c1 < c2) ? -1 : 1; 3661 3662 len1--; len2--; 3663 } 3664 3665 return (len1 < len2) ? 
-1 : (len1 != len2); 3666} 3667 3668#else 3669 3670static int 3671unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2) 3672{ 3673 register int len1, len2; 3674 3675 Py_UNICODE *s1 = str1->str; 3676 Py_UNICODE *s2 = str2->str; 3677 3678 len1 = str1->length; 3679 len2 = str2->length; 3680 3681 while (len1 > 0 && len2 > 0) { 3682 Py_UNICODE c1, c2; 3683 3684 c1 = *s1++; 3685 c2 = *s2++; 3686 3687 if (c1 != c2) 3688 return (c1 < c2) ? -1 : 1; 3689 3690 len1--; len2--; 3691 } 3692 3693 return (len1 < len2) ? -1 : (len1 != len2); 3694} 3695 3696#endif 3697 3698int PyUnicode_Compare(PyObject *left, 3699 PyObject *right) 3700{ 3701 PyUnicodeObject *u = NULL, *v = NULL; 3702 int result; 3703 3704 /* Coerce the two arguments */ 3705 u = (PyUnicodeObject *)PyUnicode_FromObject(left); 3706 if (u == NULL) 3707 goto onError; 3708 v = (PyUnicodeObject *)PyUnicode_FromObject(right); 3709 if (v == NULL) 3710 goto onError; 3711 3712 /* Shortcut for empty or interned objects */ 3713 if (v == u) { 3714 Py_DECREF(u); 3715 Py_DECREF(v); 3716 return 0; 3717 } 3718 3719 result = unicode_compare(u, v); 3720 3721 Py_DECREF(u); 3722 Py_DECREF(v); 3723 return result; 3724 3725onError: 3726 Py_XDECREF(u); 3727 Py_XDECREF(v); 3728 return -1; 3729} 3730 3731int PyUnicode_Contains(PyObject *container, 3732 PyObject *element) 3733{ 3734 PyUnicodeObject *u = NULL, *v = NULL; 3735 int result; 3736 register const Py_UNICODE *p, *e; 3737 register Py_UNICODE ch; 3738 3739 /* Coerce the two arguments */ 3740 v = (PyUnicodeObject *)PyUnicode_FromObject(element); 3741 if (v == NULL) { 3742 PyErr_SetString(PyExc_TypeError, 3743 "'in <string>' requires character as left operand"); 3744 goto onError; 3745 } 3746 u = (PyUnicodeObject *)PyUnicode_FromObject(container); 3747 if (u == NULL) { 3748 Py_DECREF(v); 3749 goto onError; 3750 } 3751 3752 /* Check v in u */ 3753 if (PyUnicode_GET_SIZE(v) != 1) { 3754 PyErr_SetString(PyExc_TypeError, 3755 "'in <string>' requires character as left operand"); 3756 
goto onError; 3757 } 3758 ch = *PyUnicode_AS_UNICODE(v); 3759 p = PyUnicode_AS_UNICODE(u); 3760 e = p + PyUnicode_GET_SIZE(u); 3761 result = 0; 3762 while (p < e) { 3763 if (*p++ == ch) { 3764 result = 1; 3765 break; 3766 } 3767 } 3768 3769 Py_DECREF(u); 3770 Py_DECREF(v); 3771 return result; 3772 3773onError: 3774 Py_XDECREF(u); 3775 Py_XDECREF(v); 3776 return -1; 3777} 3778 3779/* Concat to string or Unicode object giving a new Unicode object. */ 3780 3781PyObject *PyUnicode_Concat(PyObject *left, 3782 PyObject *right) 3783{ 3784 PyUnicodeObject *u = NULL, *v = NULL, *w; 3785 3786 /* Coerce the two arguments */ 3787 u = (PyUnicodeObject *)PyUnicode_FromObject(left); 3788 if (u == NULL) 3789 goto onError; 3790 v = (PyUnicodeObject *)PyUnicode_FromObject(right); 3791 if (v == NULL) 3792 goto onError; 3793 3794 /* Shortcuts */ 3795 if (v == unicode_empty) { 3796 Py_DECREF(v); 3797 return (PyObject *)u; 3798 } 3799 if (u == unicode_empty) { 3800 Py_DECREF(u); 3801 return (PyObject *)v; 3802 } 3803 3804 /* Concat the two Unicode strings */ 3805 w = _PyUnicode_New(u->length + v->length); 3806 if (w == NULL) 3807 goto onError; 3808 Py_UNICODE_COPY(w->str, u->str, u->length); 3809 Py_UNICODE_COPY(w->str + u->length, v->str, v->length); 3810 3811 Py_DECREF(u); 3812 Py_DECREF(v); 3813 return (PyObject *)w; 3814 3815onError: 3816 Py_XDECREF(u); 3817 Py_XDECREF(v); 3818 return NULL; 3819} 3820 3821static char count__doc__[] = 3822"S.count(sub[, start[, end]]) -> int\n\ 3823\n\ 3824Return the number of occurrences of substring sub in Unicode string\n\ 3825S[start:end]. 
Optional arguments start and end are\n\ 3826interpreted as in slice notation."; 3827 3828static PyObject * 3829unicode_count(PyUnicodeObject *self, PyObject *args) 3830{ 3831 PyUnicodeObject *substring; 3832 int start = 0; 3833 int end = INT_MAX; 3834 PyObject *result; 3835 3836 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring, 3837 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end)) 3838 return NULL; 3839 3840 substring = (PyUnicodeObject *)PyUnicode_FromObject( 3841 (PyObject *)substring); 3842 if (substring == NULL) 3843 return NULL; 3844 3845 if (start < 0) 3846 start += self->length; 3847 if (start < 0) 3848 start = 0; 3849 if (end > self->length) 3850 end = self->length; 3851 if (end < 0) 3852 end += self->length; 3853 if (end < 0) 3854 end = 0; 3855 3856 result = PyInt_FromLong((long) count(self, start, end, substring)); 3857 3858 Py_DECREF(substring); 3859 return result; 3860} 3861 3862static char encode__doc__[] = 3863"S.encode([encoding[,errors]]) -> string\n\ 3864\n\ 3865Return an encoded string version of S. Default encoding is the current\n\ 3866default string encoding. errors may be given to set a different error\n\ 3867handling scheme. Default is 'strict' meaning that encoding errors raise\n\ 3868a ValueError. 
Other possible values are 'ignore' and 'replace'."; 3869 3870static PyObject * 3871unicode_encode(PyUnicodeObject *self, PyObject *args) 3872{ 3873 char *encoding = NULL; 3874 char *errors = NULL; 3875 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors)) 3876 return NULL; 3877 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors); 3878} 3879 3880static char expandtabs__doc__[] = 3881"S.expandtabs([tabsize]) -> unicode\n\ 3882\n\ 3883Return a copy of S where all tab characters are expanded using spaces.\n\ 3884If tabsize is not given, a tab size of 8 characters is assumed."; 3885 3886static PyObject* 3887unicode_expandtabs(PyUnicodeObject *self, PyObject *args) 3888{ 3889 Py_UNICODE *e; 3890 Py_UNICODE *p; 3891 Py_UNICODE *q; 3892 int i, j; 3893 PyUnicodeObject *u; 3894 int tabsize = 8; 3895 3896 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize)) 3897 return NULL; 3898 3899 /* First pass: determine size of output string */ 3900 i = j = 0; 3901 e = self->str + self->length; 3902 for (p = self->str; p < e; p++) 3903 if (*p == '\t') { 3904 if (tabsize > 0) 3905 j += tabsize - (j % tabsize); 3906 } 3907 else { 3908 j++; 3909 if (*p == '\n' || *p == '\r') { 3910 i += j; 3911 j = 0; 3912 } 3913 } 3914 3915 /* Second pass: create output string and fill it */ 3916 u = _PyUnicode_New(i + j); 3917 if (!u) 3918 return NULL; 3919 3920 j = 0; 3921 q = u->str; 3922 3923 for (p = self->str; p < e; p++) 3924 if (*p == '\t') { 3925 if (tabsize > 0) { 3926 i = tabsize - (j % tabsize); 3927 j += i; 3928 while (i--) 3929 *q++ = ' '; 3930 } 3931 } 3932 else { 3933 j++; 3934 *q++ = *p; 3935 if (*p == '\n' || *p == '\r') 3936 j = 0; 3937 } 3938 3939 return (PyObject*) u; 3940} 3941 3942static char find__doc__[] = 3943"S.find(sub [,start [,end]]) -> int\n\ 3944\n\ 3945Return the lowest index in S where substring sub is found,\n\ 3946such that sub is contained within s[start,end]. 
Optional\n\ 3947arguments start and end are interpreted as in slice notation.\n\ 3948\n\ 3949Return -1 on failure."; 3950 3951static PyObject * 3952unicode_find(PyUnicodeObject *self, PyObject *args) 3953{ 3954 PyUnicodeObject *substring; 3955 int start = 0; 3956 int end = INT_MAX; 3957 PyObject *result; 3958 3959 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring, 3960 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end)) 3961 return NULL; 3962 substring = (PyUnicodeObject *)PyUnicode_FromObject( 3963 (PyObject *)substring); 3964 if (substring == NULL) 3965 return NULL; 3966 3967 result = PyInt_FromLong(findstring(self, substring, start, end, 1)); 3968 3969 Py_DECREF(substring); 3970 return result; 3971} 3972 3973static PyObject * 3974unicode_getitem(PyUnicodeObject *self, int index) 3975{ 3976 if (index < 0 || index >= self->length) { 3977 PyErr_SetString(PyExc_IndexError, "string index out of range"); 3978 return NULL; 3979 } 3980 3981 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1); 3982} 3983 3984static long 3985unicode_hash(PyUnicodeObject *self) 3986{ 3987 /* Since Unicode objects compare equal to their ASCII string 3988 counterparts, they should use the individual character values 3989 as basis for their hash value. This is needed to assure that 3990 strings and Unicode objects behave in the same way as 3991 dictionary keys. 
*/ 3992 3993 register int len; 3994 register Py_UNICODE *p; 3995 register long x; 3996 3997 if (self->hash != -1) 3998 return self->hash; 3999 len = PyUnicode_GET_SIZE(self); 4000 p = PyUnicode_AS_UNICODE(self); 4001 x = *p << 7; 4002 while (--len >= 0) 4003 x = (1000003*x) ^ *p++; 4004 x ^= PyUnicode_GET_SIZE(self); 4005 if (x == -1) 4006 x = -2; 4007 self->hash = x; 4008 return x; 4009} 4010 4011static char index__doc__[] = 4012"S.index(sub [,start [,end]]) -> int\n\ 4013\n\ 4014Like S.find() but raise ValueError when the substring is not found."; 4015 4016static PyObject * 4017unicode_index(PyUnicodeObject *self, PyObject *args) 4018{ 4019 int result; 4020 PyUnicodeObject *substring; 4021 int start = 0; 4022 int end = INT_MAX; 4023 4024 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring, 4025 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end)) 4026 return NULL; 4027 4028 substring = (PyUnicodeObject *)PyUnicode_FromObject( 4029 (PyObject *)substring); 4030 if (substring == NULL) 4031 return NULL; 4032 4033 result = findstring(self, substring, start, end, 1); 4034 4035 Py_DECREF(substring); 4036 if (result < 0) { 4037 PyErr_SetString(PyExc_ValueError, "substring not found"); 4038 return NULL; 4039 } 4040 return PyInt_FromLong(result); 4041} 4042 4043static char islower__doc__[] = 4044"S.islower() -> bool\n\ 4045\n\ 4046Return True if all cased characters in S are lowercase and there is\n\ 4047at least one cased character in S, False otherwise."; 4048 4049static PyObject* 4050unicode_islower(PyUnicodeObject *self) 4051{ 4052 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 4053 register const Py_UNICODE *e; 4054 int cased; 4055 4056 /* Shortcut for single character strings */ 4057 if (PyUnicode_GET_SIZE(self) == 1) 4058 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p)); 4059 4060 /* Special case for empty strings */ 4061 if (PyString_GET_SIZE(self) == 0) 4062 return PyBool_FromLong(0); 4063 4064 e = p + PyUnicode_GET_SIZE(self); 4065 cased = 0; 4066 
for (; p < e; p++) { 4067 register const Py_UNICODE ch = *p; 4068 4069 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) 4070 return PyBool_FromLong(0); 4071 else if (!cased && Py_UNICODE_ISLOWER(ch)) 4072 cased = 1; 4073 } 4074 return PyBool_FromLong(cased); 4075} 4076 4077static char isupper__doc__[] = 4078"S.isupper() -> bool\n\ 4079\n\ 4080Return True if all cased characters in S are uppercase and there is\n\ 4081at least one cased character in S, False otherwise."; 4082 4083static PyObject* 4084unicode_isupper(PyUnicodeObject *self) 4085{ 4086 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 4087 register const Py_UNICODE *e; 4088 int cased; 4089 4090 /* Shortcut for single character strings */ 4091 if (PyUnicode_GET_SIZE(self) == 1) 4092 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0); 4093 4094 /* Special case for empty strings */ 4095 if (PyString_GET_SIZE(self) == 0) 4096 return PyBool_FromLong(0); 4097 4098 e = p + PyUnicode_GET_SIZE(self); 4099 cased = 0; 4100 for (; p < e; p++) { 4101 register const Py_UNICODE ch = *p; 4102 4103 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch)) 4104 return PyBool_FromLong(0); 4105 else if (!cased && Py_UNICODE_ISUPPER(ch)) 4106 cased = 1; 4107 } 4108 return PyBool_FromLong(cased); 4109} 4110 4111static char istitle__doc__[] = 4112"S.istitle() -> bool\n\ 4113\n\ 4114Return True if S is a titlecased string, i.e. upper- and titlecase\n\ 4115characters may only follow uncased characters and lowercase characters\n\ 4116only cased ones. 
Return False otherwise."; 4117 4118static PyObject* 4119unicode_istitle(PyUnicodeObject *self) 4120{ 4121 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 4122 register const Py_UNICODE *e; 4123 int cased, previous_is_cased; 4124 4125 /* Shortcut for single character strings */ 4126 if (PyUnicode_GET_SIZE(self) == 1) 4127 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) || 4128 (Py_UNICODE_ISUPPER(*p) != 0)); 4129 4130 /* Special case for empty strings */ 4131 if (PyString_GET_SIZE(self) == 0) 4132 return PyBool_FromLong(0); 4133 4134 e = p + PyUnicode_GET_SIZE(self); 4135 cased = 0; 4136 previous_is_cased = 0; 4137 for (; p < e; p++) { 4138 register const Py_UNICODE ch = *p; 4139 4140 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) { 4141 if (previous_is_cased) 4142 return PyBool_FromLong(0); 4143 previous_is_cased = 1; 4144 cased = 1; 4145 } 4146 else if (Py_UNICODE_ISLOWER(ch)) { 4147 if (!previous_is_cased) 4148 return PyBool_FromLong(0); 4149 previous_is_cased = 1; 4150 cased = 1; 4151 } 4152 else 4153 previous_is_cased = 0; 4154 } 4155 return PyBool_FromLong(cased); 4156} 4157 4158static char isspace__doc__[] = 4159"S.isspace() -> bool\n\ 4160\n\ 4161Return True if there are only whitespace characters in S,\n\ 4162False otherwise."; 4163 4164static PyObject* 4165unicode_isspace(PyUnicodeObject *self) 4166{ 4167 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 4168 register const Py_UNICODE *e; 4169 4170 /* Shortcut for single character strings */ 4171 if (PyUnicode_GET_SIZE(self) == 1 && 4172 Py_UNICODE_ISSPACE(*p)) 4173 return PyBool_FromLong(1); 4174 4175 /* Special case for empty strings */ 4176 if (PyString_GET_SIZE(self) == 0) 4177 return PyBool_FromLong(0); 4178 4179 e = p + PyUnicode_GET_SIZE(self); 4180 for (; p < e; p++) { 4181 if (!Py_UNICODE_ISSPACE(*p)) 4182 return PyBool_FromLong(0); 4183 } 4184 return PyBool_FromLong(1); 4185} 4186 4187static char isalpha__doc__[] = 4188"S.isalpha() -> bool\n\ 4189\n\ 4190Return 
True if all characters in S are alphabetic\n\ 4191and there is at least one character in S, False otherwise."; 4192 4193static PyObject* 4194unicode_isalpha(PyUnicodeObject *self) 4195{ 4196 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 4197 register const Py_UNICODE *e; 4198 4199 /* Shortcut for single character strings */ 4200 if (PyUnicode_GET_SIZE(self) == 1 && 4201 Py_UNICODE_ISALPHA(*p)) 4202 return PyBool_FromLong(1); 4203 4204 /* Special case for empty strings */ 4205 if (PyString_GET_SIZE(self) == 0) 4206 return PyBool_FromLong(0); 4207 4208 e = p + PyUnicode_GET_SIZE(self); 4209 for (; p < e; p++) { 4210 if (!Py_UNICODE_ISALPHA(*p)) 4211 return PyBool_FromLong(0); 4212 } 4213 return PyBool_FromLong(1); 4214} 4215 4216static char isalnum__doc__[] = 4217"S.isalnum() -> bool\n\ 4218\n\ 4219Return True if all characters in S are alphanumeric\n\ 4220and there is at least one character in S, False otherwise."; 4221 4222static PyObject* 4223unicode_isalnum(PyUnicodeObject *self) 4224{ 4225 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 4226 register const Py_UNICODE *e; 4227 4228 /* Shortcut for single character strings */ 4229 if (PyUnicode_GET_SIZE(self) == 1 && 4230 Py_UNICODE_ISALNUM(*p)) 4231 return PyBool_FromLong(1); 4232 4233 /* Special case for empty strings */ 4234 if (PyString_GET_SIZE(self) == 0) 4235 return PyBool_FromLong(0); 4236 4237 e = p + PyUnicode_GET_SIZE(self); 4238 for (; p < e; p++) { 4239 if (!Py_UNICODE_ISALNUM(*p)) 4240 return PyBool_FromLong(0); 4241 } 4242 return PyBool_FromLong(1); 4243} 4244 4245static char isdecimal__doc__[] = 4246"S.isdecimal() -> bool\n\ 4247\n\ 4248Return True if there are only decimal characters in S,\n\ 4249False otherwise."; 4250 4251static PyObject* 4252unicode_isdecimal(PyUnicodeObject *self) 4253{ 4254 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 4255 register const Py_UNICODE *e; 4256 4257 /* Shortcut for single character strings */ 4258 if (PyUnicode_GET_SIZE(self) 
== 1 && 4259 Py_UNICODE_ISDECIMAL(*p)) 4260 return PyBool_FromLong(1); 4261 4262 /* Special case for empty strings */ 4263 if (PyString_GET_SIZE(self) == 0) 4264 return PyBool_FromLong(0); 4265 4266 e = p + PyUnicode_GET_SIZE(self); 4267 for (; p < e; p++) { 4268 if (!Py_UNICODE_ISDECIMAL(*p)) 4269 return PyBool_FromLong(0); 4270 } 4271 return PyBool_FromLong(1); 4272} 4273 4274static char isdigit__doc__[] = 4275"S.isdigit() -> bool\n\ 4276\n\ 4277Return True if there are only digit characters in S,\n\ 4278False otherwise."; 4279 4280static PyObject* 4281unicode_isdigit(PyUnicodeObject *self) 4282{ 4283 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 4284 register const Py_UNICODE *e; 4285 4286 /* Shortcut for single character strings */ 4287 if (PyUnicode_GET_SIZE(self) == 1 && 4288 Py_UNICODE_ISDIGIT(*p)) 4289 return PyBool_FromLong(1); 4290 4291 /* Special case for empty strings */ 4292 if (PyString_GET_SIZE(self) == 0) 4293 return PyBool_FromLong(0); 4294 4295 e = p + PyUnicode_GET_SIZE(self); 4296 for (; p < e; p++) { 4297 if (!Py_UNICODE_ISDIGIT(*p)) 4298 return PyBool_FromLong(0); 4299 } 4300 return PyBool_FromLong(1); 4301} 4302 4303static char isnumeric__doc__[] = 4304"S.isnumeric() -> bool\n\ 4305\n\ 4306Return True if there are only numeric characters in S,\n\ 4307False otherwise."; 4308 4309static PyObject* 4310unicode_isnumeric(PyUnicodeObject *self) 4311{ 4312 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 4313 register const Py_UNICODE *e; 4314 4315 /* Shortcut for single character strings */ 4316 if (PyUnicode_GET_SIZE(self) == 1 && 4317 Py_UNICODE_ISNUMERIC(*p)) 4318 return PyBool_FromLong(1); 4319 4320 /* Special case for empty strings */ 4321 if (PyString_GET_SIZE(self) == 0) 4322 return PyBool_FromLong(0); 4323 4324 e = p + PyUnicode_GET_SIZE(self); 4325 for (; p < e; p++) { 4326 if (!Py_UNICODE_ISNUMERIC(*p)) 4327 return PyBool_FromLong(0); 4328 } 4329 return PyBool_FromLong(1); 4330} 4331 4332static char join__doc__[] = 
"S.join(sequence) -> unicode\n\
\n\
Return a string which is the concatenation of the strings in the\n\
sequence. The separator between elements is S.";

/* S.join(sequence): method wrapper that forwards to PyUnicode_Join()
   with self as the separator. */
static PyObject*
unicode_join(PyObject *self, PyObject *data)
{
    return PyUnicode_Join(self, data);
}

/* sq_length: return the stored length field of the object. */
static int
unicode_length(PyUnicodeObject *self)
{
    return self->length;
}

static char ljust__doc__[] =
"S.ljust(width) -> unicode\n\
\n\
Return S left justified in a Unicode string of length width. Padding is\n\
done using spaces.";

/* S.ljust(width): pad on the right with spaces up to width.  An exact
   unicode instance that is already wide enough is returned as is (new
   reference); everything else goes through pad(). */
static PyObject *
unicode_ljust(PyUnicodeObject *self, PyObject *args)
{
    int width;
    if (!PyArg_ParseTuple(args, "i:ljust", &width))
        return NULL;

    if (self->length >= width && PyUnicode_CheckExact(self)) {
        Py_INCREF(self);
        return (PyObject*) self;
    }

    /* NOTE(review): for a subclass instance with length >= width the
       right padding passed here is <= 0; presumably pad() clamps it,
       confirm against pad(). */
    return (PyObject*) pad(self, 0, width - self->length, ' ');
}

static char lower__doc__[] =
"S.lower() -> unicode\n\
\n\
Return a copy of the string S converted to lowercase.";

/* S.lower(): apply the fixlower transformation through fixup(). */
static PyObject*
unicode_lower(PyUnicodeObject *self)
{
    return fixup(self, fixlower);
}

/* Strip modes shared by the strip helpers below; the values double as
   indices into stripformat[]. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Arrays indexed by above */
static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};

/* Method name for a strip mode: skips the leading "|O:" (3 characters)
   of the corresponding PyArg_ParseTuple format. */
#define STRIPNAME(i) (stripformat[i]+3)

/* Return a pointer to the first occurrence of c in the n code units
   starting at s, or NULL if c does not occur. */
static const Py_UNICODE *
unicode_memchr(const Py_UNICODE *s, Py_UNICODE c, size_t n)
{
    size_t i;
    for (i = 0; i < n; ++i)
        if (s[i] == c)
            return s+i;
    return NULL;
}

/* externally visible for str.strip(unicode) */
/* Strip from self every leading and/or trailing character (depending on
   striptype) that occurs in sepobj.  sepobj is accessed through the
   Unicode macros without a type check, so the caller must pass a Unicode
   object.  Returns self (new reference) when nothing was stripped and
   self is an exact unicode instance, otherwise a new string. */
PyObject *
_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
{
    Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
    int len = PyUnicode_GET_SIZE(self);
    Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
    int seplen = PyUnicode_GET_SIZE(sepobj);
    int i, j;

    /* i: index of the first character kept */
    i = 0;
    if (striptype != RIGHTSTRIP) {
        while (i < len && unicode_memchr(sep, s[i], seplen)) {
            i++;
        }
    }

    /* j: one past the last character kept; never moves below i */
    j = len;
    if (striptype != LEFTSTRIP) {
        do {
            j--;
        } while (j >= i && unicode_memchr(sep, s[j], seplen));
        j++;
    }

    if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
        Py_INCREF(self);
        return (PyObject*)self;
    }
    else
        return PyUnicode_FromUnicode(s+i, j-i);
}


/* Whitespace variant of _PyUnicode_XStrip(): strips characters for which
   Py_UNICODE_ISSPACE is true. */
static PyObject *
do_strip(PyUnicodeObject *self, int striptype)
{
    Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
    int len = PyUnicode_GET_SIZE(self), i, j;

    i = 0;
    if (striptype != RIGHTSTRIP) {
        while (i < len && Py_UNICODE_ISSPACE(s[i])) {
            i++;
        }
    }

    j = len;
    if (striptype != LEFTSTRIP) {
        do {
            j--;
        } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
        j++;
    }

    if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
        Py_INCREF(self);
        return (PyObject*)self;
    }
    else
        return PyUnicode_FromUnicode(s+i, j-i);
}


/* Strip with an optional explicit character set.  A missing or None
   argument strips whitespace; a unicode argument is used directly; a str
   argument is first converted with PyUnicode_FromObject(); any other
   type raises TypeError. */
static PyObject *
do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
{
    PyObject *sep = NULL;

    if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
        return NULL;

    if (sep != NULL && sep != Py_None) {
        if (PyUnicode_Check(sep))
            return _PyUnicode_XStrip(self, striptype, sep);
        else if (PyString_Check(sep)) {
            PyObject *res;
            /* sep now refers to a new object owned by this function */
            sep = PyUnicode_FromObject(sep);
            if (sep==NULL)
                return NULL;
            res = _PyUnicode_XStrip(self, striptype, sep);
            Py_DECREF(sep);
            return res;
        }
        else {
            PyErr_Format(PyExc_TypeError,
                         "%s arg must be None, unicode or str",
                         STRIPNAME(striptype));
            return NULL;
        }
    }

    return do_strip(self, striptype);
}


static char strip__doc__[] =
"S.strip([sep]) -> unicode\n\
\n\
Return a copy of the string S with leading and trailing\n\
whitespace removed.\n\
If sep is given and not None, remove characters in sep instead.\n\
If sep is a str, it will be converted to unicode before stripping";

/* S.strip([sep]): strip both ends; skips argument parsing when called
   without arguments. */
static PyObject *
unicode_strip(PyUnicodeObject *self, PyObject *args)
{
    if (PyTuple_GET_SIZE(args) == 0)
        return do_strip(self, BOTHSTRIP); /* Common case */
    else
        return do_argstrip(self, BOTHSTRIP, args);
}


static char lstrip__doc__[] =
"S.lstrip([sep]) -> unicode\n\
\n\
Return a copy of the string S with leading whitespace removed.\n\
If sep is given and not None, remove characters in sep instead.\n\
If sep is a str, it will be converted to unicode before stripping";

/* S.lstrip([sep]): strip the left end only. */
static PyObject *
unicode_lstrip(PyUnicodeObject *self, PyObject *args)
{
    if (PyTuple_GET_SIZE(args) == 0)
        return do_strip(self, LEFTSTRIP); /* Common case */
    else
        return do_argstrip(self, LEFTSTRIP, args);
}


static char rstrip__doc__[] =
"S.rstrip([sep]) -> unicode\n\
\n\
Return a copy of the string S with trailing whitespace removed.\n\
If sep is given and not None, remove characters in sep instead.\n\
If sep is a str, it will be converted to unicode before stripping";

/* S.rstrip([sep]): strip the right end only. */
static PyObject *
unicode_rstrip(PyUnicodeObject *self, PyObject *args)
{
    if (PyTuple_GET_SIZE(args) == 0)
        return do_strip(self, RIGHTSTRIP); /* Common case */
    else
        return do_argstrip(self, RIGHTSTRIP, args);
}


/* sq_repeat: return str repeated len times; a negative len is treated
   as 0. */
static PyObject*
unicode_repeat(PyUnicodeObject *str, int len)
{
    PyUnicodeObject *u;
    Py_UNICODE *p;
    int nchars;
    size_t nbytes;

    if (len < 0)
        len = 0;

    if (len == 1 && PyUnicode_CheckExact(str)) {
        /* no repeat, return original string */
        Py_INCREF(str);
        return (PyObject*) str;
    }

/* ensure # of chars needed doesn't overflow int and # of bytes 4567 * needed doesn't overflow size_t 4568 */ 4569 nchars = len * str->length; 4570 if (len && nchars / len != str->length) { 4571 PyErr_SetString(PyExc_OverflowError, 4572 "repeated string is too long"); 4573 return NULL; 4574 } 4575 nbytes = (nchars + 1) * sizeof(Py_UNICODE); 4576 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) { 4577 PyErr_SetString(PyExc_OverflowError, 4578 "repeated string is too long"); 4579 return NULL; 4580 } 4581 u = _PyUnicode_New(nchars); 4582 if (!u) 4583 return NULL; 4584 4585 p = u->str; 4586 4587 while (len-- > 0) { 4588 Py_UNICODE_COPY(p, str->str, str->length); 4589 p += str->length; 4590 } 4591 4592 return (PyObject*) u; 4593} 4594 4595PyObject *PyUnicode_Replace(PyObject *obj, 4596 PyObject *subobj, 4597 PyObject *replobj, 4598 int maxcount) 4599{ 4600 PyObject *self; 4601 PyObject *str1; 4602 PyObject *str2; 4603 PyObject *result; 4604 4605 self = PyUnicode_FromObject(obj); 4606 if (self == NULL) 4607 return NULL; 4608 str1 = PyUnicode_FromObject(subobj); 4609 if (str1 == NULL) { 4610 Py_DECREF(self); 4611 return NULL; 4612 } 4613 str2 = PyUnicode_FromObject(replobj); 4614 if (str2 == NULL) { 4615 Py_DECREF(self); 4616 Py_DECREF(str1); 4617 return NULL; 4618 } 4619 result = replace((PyUnicodeObject *)self, 4620 (PyUnicodeObject *)str1, 4621 (PyUnicodeObject *)str2, 4622 maxcount); 4623 Py_DECREF(self); 4624 Py_DECREF(str1); 4625 Py_DECREF(str2); 4626 return result; 4627} 4628 4629static char replace__doc__[] = 4630"S.replace (old, new[, maxsplit]) -> unicode\n\ 4631\n\ 4632Return a copy of S with all occurrences of substring\n\ 4633old replaced by new. 
If the optional argument maxsplit is\n\ 4634given, only the first maxsplit occurrences are replaced."; 4635 4636static PyObject* 4637unicode_replace(PyUnicodeObject *self, PyObject *args) 4638{ 4639 PyUnicodeObject *str1; 4640 PyUnicodeObject *str2; 4641 int maxcount = -1; 4642 PyObject *result; 4643 4644 if (!PyArg_ParseTuple(args, "OO|i:replace", &str1, &str2, &maxcount)) 4645 return NULL; 4646 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1); 4647 if (str1 == NULL) 4648 return NULL; 4649 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2); 4650 if (str2 == NULL) 4651 return NULL; 4652 4653 result = replace(self, str1, str2, maxcount); 4654 4655 Py_DECREF(str1); 4656 Py_DECREF(str2); 4657 return result; 4658} 4659 4660static 4661PyObject *unicode_repr(PyObject *unicode) 4662{ 4663 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode), 4664 PyUnicode_GET_SIZE(unicode), 4665 1); 4666} 4667 4668static char rfind__doc__[] = 4669"S.rfind(sub [,start [,end]]) -> int\n\ 4670\n\ 4671Return the highest index in S where substring sub is found,\n\ 4672such that sub is contained within s[start,end]. 
Optional\n\ 4673arguments start and end are interpreted as in slice notation.\n\ 4674\n\ 4675Return -1 on failure."; 4676 4677static PyObject * 4678unicode_rfind(PyUnicodeObject *self, PyObject *args) 4679{ 4680 PyUnicodeObject *substring; 4681 int start = 0; 4682 int end = INT_MAX; 4683 PyObject *result; 4684 4685 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring, 4686 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end)) 4687 return NULL; 4688 substring = (PyUnicodeObject *)PyUnicode_FromObject( 4689 (PyObject *)substring); 4690 if (substring == NULL) 4691 return NULL; 4692 4693 result = PyInt_FromLong(findstring(self, substring, start, end, -1)); 4694 4695 Py_DECREF(substring); 4696 return result; 4697} 4698 4699static char rindex__doc__[] = 4700"S.rindex(sub [,start [,end]]) -> int\n\ 4701\n\ 4702Like S.rfind() but raise ValueError when the substring is not found."; 4703 4704static PyObject * 4705unicode_rindex(PyUnicodeObject *self, PyObject *args) 4706{ 4707 int result; 4708 PyUnicodeObject *substring; 4709 int start = 0; 4710 int end = INT_MAX; 4711 4712 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring, 4713 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end)) 4714 return NULL; 4715 substring = (PyUnicodeObject *)PyUnicode_FromObject( 4716 (PyObject *)substring); 4717 if (substring == NULL) 4718 return NULL; 4719 4720 result = findstring(self, substring, start, end, -1); 4721 4722 Py_DECREF(substring); 4723 if (result < 0) { 4724 PyErr_SetString(PyExc_ValueError, "substring not found"); 4725 return NULL; 4726 } 4727 return PyInt_FromLong(result); 4728} 4729 4730static char rjust__doc__[] = 4731"S.rjust(width) -> unicode\n\ 4732\n\ 4733Return S right justified in a Unicode string of length width. 
Padding is\n\ 4734done using spaces."; 4735 4736static PyObject * 4737unicode_rjust(PyUnicodeObject *self, PyObject *args) 4738{ 4739 int width; 4740 if (!PyArg_ParseTuple(args, "i:rjust", &width)) 4741 return NULL; 4742 4743 if (self->length >= width && PyUnicode_CheckExact(self)) { 4744 Py_INCREF(self); 4745 return (PyObject*) self; 4746 } 4747 4748 return (PyObject*) pad(self, width - self->length, 0, ' '); 4749} 4750 4751static PyObject* 4752unicode_slice(PyUnicodeObject *self, int start, int end) 4753{ 4754 /* standard clamping */ 4755 if (start < 0) 4756 start = 0; 4757 if (end < 0) 4758 end = 0; 4759 if (end > self->length) 4760 end = self->length; 4761 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) { 4762 /* full slice, return original string */ 4763 Py_INCREF(self); 4764 return (PyObject*) self; 4765 } 4766 if (start > end) 4767 start = end; 4768 /* copy slice */ 4769 return (PyObject*) PyUnicode_FromUnicode(self->str + start, 4770 end - start); 4771} 4772 4773PyObject *PyUnicode_Split(PyObject *s, 4774 PyObject *sep, 4775 int maxsplit) 4776{ 4777 PyObject *result; 4778 4779 s = PyUnicode_FromObject(s); 4780 if (s == NULL) 4781 return NULL; 4782 if (sep != NULL) { 4783 sep = PyUnicode_FromObject(sep); 4784 if (sep == NULL) { 4785 Py_DECREF(s); 4786 return NULL; 4787 } 4788 } 4789 4790 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit); 4791 4792 Py_DECREF(s); 4793 Py_XDECREF(sep); 4794 return result; 4795} 4796 4797static char split__doc__[] = 4798"S.split([sep [,maxsplit]]) -> list of strings\n\ 4799\n\ 4800Return a list of the words in S, using sep as the\n\ 4801delimiter string. If maxsplit is given, at most maxsplit\n\ 4802splits are done. 
If sep is not specified, any whitespace string\n\ 4803is a separator."; 4804 4805static PyObject* 4806unicode_split(PyUnicodeObject *self, PyObject *args) 4807{ 4808 PyObject *substring = Py_None; 4809 int maxcount = -1; 4810 4811 if (!PyArg_ParseTuple(args, "|Oi:split", &substring, &maxcount)) 4812 return NULL; 4813 4814 if (substring == Py_None) 4815 return split(self, NULL, maxcount); 4816 else if (PyUnicode_Check(substring)) 4817 return split(self, (PyUnicodeObject *)substring, maxcount); 4818 else 4819 return PyUnicode_Split((PyObject *)self, substring, maxcount); 4820} 4821 4822static char splitlines__doc__[] = 4823"S.splitlines([keepends]]) -> list of strings\n\ 4824\n\ 4825Return a list of the lines in S, breaking at line boundaries.\n\ 4826Line breaks are not included in the resulting list unless keepends\n\ 4827is given and true."; 4828 4829static PyObject* 4830unicode_splitlines(PyUnicodeObject *self, PyObject *args) 4831{ 4832 int keepends = 0; 4833 4834 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends)) 4835 return NULL; 4836 4837 return PyUnicode_Splitlines((PyObject *)self, keepends); 4838} 4839 4840static 4841PyObject *unicode_str(PyUnicodeObject *self) 4842{ 4843 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL); 4844} 4845 4846static char swapcase__doc__[] = 4847"S.swapcase() -> unicode\n\ 4848\n\ 4849Return a copy of S with uppercase characters converted to lowercase\n\ 4850and vice versa."; 4851 4852static PyObject* 4853unicode_swapcase(PyUnicodeObject *self) 4854{ 4855 return fixup(self, fixswapcase); 4856} 4857 4858static char translate__doc__[] = 4859"S.translate(table) -> unicode\n\ 4860\n\ 4861Return a copy of the string S, where all characters have been mapped\n\ 4862through the given translation table, which must be a mapping of\n\ 4863Unicode ordinals to Unicode ordinals or None. Unmapped characters\n\ 4864are left untouched. 
Characters mapped to None are deleted."; 4865 4866static PyObject* 4867unicode_translate(PyUnicodeObject *self, PyObject *table) 4868{ 4869 return PyUnicode_TranslateCharmap(self->str, 4870 self->length, 4871 table, 4872 "ignore"); 4873} 4874 4875static char upper__doc__[] = 4876"S.upper() -> unicode\n\ 4877\n\ 4878Return a copy of S converted to uppercase."; 4879 4880static PyObject* 4881unicode_upper(PyUnicodeObject *self) 4882{ 4883 return fixup(self, fixupper); 4884} 4885 4886static char zfill__doc__[] = 4887"S.zfill(width) -> unicode\n\ 4888\n\ 4889Pad a numeric string x with zeros on the left, to fill a field\n\ 4890of the specified width. The string x is never truncated."; 4891 4892static PyObject * 4893unicode_zfill(PyUnicodeObject *self, PyObject *args) 4894{ 4895 int fill; 4896 PyUnicodeObject *u; 4897 4898 int width; 4899 if (!PyArg_ParseTuple(args, "i:zfill", &width)) 4900 return NULL; 4901 4902 if (self->length >= width) { 4903 if (PyUnicode_CheckExact(self)) { 4904 Py_INCREF(self); 4905 return (PyObject*) self; 4906 } 4907 else 4908 return PyUnicode_FromUnicode( 4909 PyUnicode_AS_UNICODE(self), 4910 PyUnicode_GET_SIZE(self) 4911 ); 4912 } 4913 4914 fill = width - self->length; 4915 4916 u = pad(self, fill, 0, '0'); 4917 4918 if (u == NULL) 4919 return NULL; 4920 4921 if (u->str[fill] == '+' || u->str[fill] == '-') { 4922 /* move sign to beginning of string */ 4923 u->str[0] = u->str[fill]; 4924 u->str[fill] = '0'; 4925 } 4926 4927 return (PyObject*) u; 4928} 4929 4930#if 0 4931static PyObject* 4932unicode_freelistsize(PyUnicodeObject *self) 4933{ 4934 return PyInt_FromLong(unicode_freelist_size); 4935} 4936#endif 4937 4938static char startswith__doc__[] = 4939"S.startswith(prefix[, start[, end]]) -> bool\n\ 4940\n\ 4941Return True if S starts with the specified prefix, False otherwise. With\n\ 4942optional start, test S beginning at that position. 
With optional end, stop\n\ 4943comparing S at that position."; 4944 4945static PyObject * 4946unicode_startswith(PyUnicodeObject *self, 4947 PyObject *args) 4948{ 4949 PyUnicodeObject *substring; 4950 int start = 0; 4951 int end = INT_MAX; 4952 PyObject *result; 4953 4954 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &substring, 4955 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end)) 4956 return NULL; 4957 substring = (PyUnicodeObject *)PyUnicode_FromObject( 4958 (PyObject *)substring); 4959 if (substring == NULL) 4960 return NULL; 4961 4962 result = PyBool_FromLong(tailmatch(self, substring, start, end, -1)); 4963 4964 Py_DECREF(substring); 4965 return result; 4966} 4967 4968 4969static char endswith__doc__[] = 4970"S.endswith(suffix[, start[, end]]) -> bool\n\ 4971\n\ 4972Return True if S ends with the specified suffix, False otherwise. With\n\ 4973optional start, test S beginning at that position. With optional end, stop\n\ 4974comparing S at that position."; 4975 4976static PyObject * 4977unicode_endswith(PyUnicodeObject *self, 4978 PyObject *args) 4979{ 4980 PyUnicodeObject *substring; 4981 int start = 0; 4982 int end = INT_MAX; 4983 PyObject *result; 4984 4985 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &substring, 4986 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end)) 4987 return NULL; 4988 substring = (PyUnicodeObject *)PyUnicode_FromObject( 4989 (PyObject *)substring); 4990 if (substring == NULL) 4991 return NULL; 4992 4993 result = PyBool_FromLong(tailmatch(self, substring, start, end, +1)); 4994 4995 Py_DECREF(substring); 4996 return result; 4997} 4998 4999 5000static PyMethodDef unicode_methods[] = { 5001 5002 /* Order is according to common usage: often used methods should 5003 appear first, since lookup is done sequentially. 
*/

    /* Each entry: method name, C implementation, calling convention
       flag, docstring. */
    {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
    {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
    {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
    {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
    {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
    {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
    {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
    {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
    {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
    {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
    {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
    {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
    {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
    {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
/*  {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
    {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
    {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
    {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
    {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
    {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
    {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
    {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
    {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
    {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
    {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
    {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
    {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
    {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
    {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
    {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
    {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
    {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
    {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
    {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
    {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
    {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
#if 0
    {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
#endif

#if 0
    /* This one is just used for debugging the implementation. */
    {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
#endif

    {NULL, NULL}  /* sentinel terminating the table */
};

/* Sequence protocol slots for Unicode objects.  Item and slice
   assignment are left empty. */
static PySequenceMethods unicode_as_sequence = {
    (inquiry) unicode_length,       /* sq_length */
    (binaryfunc) PyUnicode_Concat,  /* sq_concat */
    (intargfunc) unicode_repeat,    /* sq_repeat */
    (intargfunc) unicode_getitem,   /* sq_item */
    (intintargfunc) unicode_slice,  /* sq_slice */
    0,                              /* sq_ass_item */
    0,                              /* sq_ass_slice */
    (objobjproc)PyUnicode_Contains, /*sq_contains*/
};

/* Read-buffer slot: expose the internal Py_UNICODE array.  Only segment
   0 exists; any other index raises SystemError.  Returns the value of
   PyUnicode_GET_DATA_SIZE(self), or -1 on error. */
static int
unicode_buffer_getreadbuf(PyUnicodeObject *self,
                          int index,
                          const void **ptr)
{
    if (index != 0) {
        PyErr_SetString(PyExc_SystemError,
                        "accessing non-existent unicode segment");
        return -1;
    }
    *ptr = (void *) self->str;
    return PyUnicode_GET_DATA_SIZE(self);
}

/* Write-buffer slot: always fails with TypeError; ptr is not touched. */
static int
unicode_buffer_getwritebuf(PyUnicodeObject *self, int index,
                           const void **ptr)
{
    PyErr_SetString(PyExc_TypeError,
                    "cannot use unicode as modifyable buffer");
    return -1;
}

/* Segment-count slot: always reports one segment.  If lenp is non-NULL
   it receives PyUnicode_GET_DATA_SIZE(self). */
static int
unicode_buffer_getsegcount(PyUnicodeObject *self,
                           int *lenp)
{
    if (lenp)
        *lenp = PyUnicode_GET_DATA_SIZE(self);
    return 1;
}

/* Char-buffer slot: expose the bytes of the string object returned by
   _PyUnicode_AsDefaultEncodedString().  Only segment 0 exists.  Returns
   the byte length, or -1 on error. */
static int
unicode_buffer_getcharbuf(PyUnicodeObject *self,
                          int index,
                          const void **ptr)
{
    PyObject *str;

    if (index != 0) {
        PyErr_SetString(PyExc_SystemError,
                        "accessing non-existent unicode segment");
        return -1;
    }
    str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
    if (str == NULL)
        return -1;
    /* NOTE(review): str is not released here, so *ptr stays valid only
       if something else keeps that string alive -- presumably it is
       cached on self; confirm in _PyUnicode_AsDefaultEncodedString(). */
    *ptr = (void *) PyString_AS_STRING(str);
    return PyString_GET_SIZE(str);
}

/* Helpers for PyUnicode_Format() */

/* Fetch the next format argument and advance *p_argidx.

   With arglen >= 0, args is indexed as a tuple.  With a negative arglen,
   args itself is returned; that branch is only reachable while *p_argidx
   is below arglen, i.e. also negative.  Presumably this encodes a single
   non-tuple argument -- confirm against the caller.  Raises TypeError
   when no argument is left. */
static PyObject *
getnextarg(PyObject *args, int arglen, int *p_argidx)
{
    int argidx = *p_argidx;
    if (argidx < arglen) {
        (*p_argidx)++;
        if (arglen < 0)
            return args;
        else
            return PyTuple_GetItem(args, argidx);
    }
    PyErr_SetString(PyExc_TypeError,
                    "not enough arguments for format string");
    return NULL;
}

/* Bit flags, one bit each */
#define F_LJUST (1<<0)
#define F_SIGN  (1<<1)
#define F_BLANK (1<<2)
#define F_ALT   (1<<3)
#define F_ZERO  (1<<4)

/* sprintf() into a Py_UNICODE buffer.

   The output is first written as chars into the start of buffer itself
   and then widened in place.  The widening loop runs from the last
   character to the first so that no char is overwritten before it has
   been read.  The terminating NUL is not widened.  Returns the value
   returned by vsprintf().  No bounds checking is done here; the caller
   must supply a large enough buffer. */
static
int usprintf(register Py_UNICODE *buffer, char *format, ...)
{
    register int i;
    int len;
    va_list va;
    char *charbuffer;
    va_start(va, format);

    /* First, format the string as char array, then expand to Py_UNICODE
       array. */
    charbuffer = (char *)buffer;
    len = vsprintf(charbuffer, format, va);
    for (i = len - 1; i >= 0; i--)
        buffer[i] = (Py_UNICODE) charbuffer[i];

    va_end(va);
    return len;
}

static int
formatfloat(Py_UNICODE *buf,
            size_t buflen,
            int flags,
            int prec,
            int type,
            PyObject *v)
{
    /* fmt = '%#.' 
+ `prec` + `type` 5168 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/ 5169 char fmt[20]; 5170 double x; 5171 5172 x = PyFloat_AsDouble(v); 5173 if (x == -1.0 && PyErr_Occurred()) 5174 return -1; 5175 if (prec < 0) 5176 prec = 6; 5177 if (type == 'f' && (fabs(x) / 1e25) >= 1e25) 5178 type = 'g'; 5179 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c", 5180 (flags & F_ALT) ? "#" : "", prec, type); 5181 /* worst case length calc to ensure no buffer overrun: 5182 fmt = %#.<prec>g 5183 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp 5184 for any double rep.) 5185 len = 1 + prec + 1 + 2 + 5 = 9 + prec 5186 If prec=0 the effective precision is 1 (the leading digit is 5187 always given), therefore increase by one to 10+prec. */ 5188 if (buflen <= (size_t)10 + (size_t)prec) { 5189 PyErr_SetString(PyExc_OverflowError, 5190 "formatted float is too long (precision too long?)"); 5191 return -1; 5192 } 5193 return usprintf(buf, fmt, x); 5194} 5195 5196static PyObject* 5197formatlong(PyObject *val, int flags, int prec, int type) 5198{ 5199 char *buf; 5200 int i, len; 5201 PyObject *str; /* temporary string object. */ 5202 PyUnicodeObject *result; 5203 5204 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len); 5205 if (!str) 5206 return NULL; 5207 result = _PyUnicode_New(len); 5208 for (i = 0; i < len; i++) 5209 result->str[i] = buf[i]; 5210 result->str[len] = 0; 5211 Py_DECREF(str); 5212 return (PyObject*)result; 5213} 5214 5215static int 5216formatint(Py_UNICODE *buf, 5217 size_t buflen, 5218 int flags, 5219 int prec, 5220 int type, 5221 PyObject *v) 5222{ 5223 /* fmt = '%#.' + `prec` + 'l' + `type` 5224 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine) 5225 * + 1 + 1 5226 * = 24 5227 */ 5228 char fmt[64]; /* plenty big enough! 
*/ 5229 long x; 5230 5231 x = PyInt_AsLong(v); 5232 if (x == -1 && PyErr_Occurred()) 5233 return -1; 5234 if (prec < 0) 5235 prec = 1; 5236 5237 /* buf = '+'/'-'/'0'/'0x' + '[0-9]'*max(prec,len(x in octal)) 5238 * worst case buf = '0x' + [0-9]*prec, where prec >= 11 5239 */ 5240 if (buflen <= 13 || buflen <= (size_t)2 + (size_t)prec) { 5241 PyErr_SetString(PyExc_OverflowError, 5242 "formatted integer is too long (precision too large?)"); 5243 return -1; 5244 } 5245 5246 if ((flags & F_ALT) && 5247 (type == 'x' || type == 'X')) { 5248 /* When converting under %#x or %#X, there are a number 5249 * of issues that cause pain: 5250 * - when 0 is being converted, the C standard leaves off 5251 * the '0x' or '0X', which is inconsistent with other 5252 * %#x/%#X conversions and inconsistent with Python's 5253 * hex() function 5254 * - there are platforms that violate the standard and 5255 * convert 0 with the '0x' or '0X' 5256 * (Metrowerks, Compaq Tru64) 5257 * - there are platforms that give '0x' when converting 5258 * under %#X, but convert 0 in accordance with the 5259 * standard (OS/2 EMX) 5260 * 5261 * We can achieve the desired consistency by inserting our 5262 * own '0x' or '0X' prefix, and substituting %x/%X in place 5263 * of %#x/%#X. 5264 * 5265 * Note that this is the same approach as used in 5266 * formatint() in stringobject.c 5267 */ 5268 PyOS_snprintf(fmt, sizeof(fmt), "0%c%%.%dl%c", 5269 type, prec, type); 5270 } 5271 else { 5272 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%dl%c", 5273 (flags&F_ALT) ? 
"#" : "", 5274 prec, type); 5275 } 5276 return usprintf(buf, fmt, x); 5277} 5278 5279static int 5280formatchar(Py_UNICODE *buf, 5281 size_t buflen, 5282 PyObject *v) 5283{ 5284 /* presume that the buffer is at least 2 characters long */ 5285 if (PyUnicode_Check(v)) { 5286 if (PyUnicode_GET_SIZE(v) != 1) 5287 goto onError; 5288 buf[0] = PyUnicode_AS_UNICODE(v)[0]; 5289 } 5290 5291 else if (PyString_Check(v)) { 5292 if (PyString_GET_SIZE(v) != 1) 5293 goto onError; 5294 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0]; 5295 } 5296 5297 else { 5298 /* Integer input truncated to a character */ 5299 long x; 5300 x = PyInt_AsLong(v); 5301 if (x == -1 && PyErr_Occurred()) 5302 goto onError; 5303 buf[0] = (char) x; 5304 } 5305 buf[1] = '\0'; 5306 return 1; 5307 5308 onError: 5309 PyErr_SetString(PyExc_TypeError, 5310 "%c requires int or char"); 5311 return -1; 5312} 5313 5314/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...) 5315 5316 FORMATBUFLEN is the length of the buffer in which the floats, ints, & 5317 chars are formatted. XXX This is a magic number. Each formatting 5318 routine does bounds checking to ensure no overflow, but a better 5319 solution may be to malloc a buffer of appropriate size for each 5320 format. For now, the current solution is sufficient. 
5321*/ 5322#define FORMATBUFLEN (size_t)120 5323 5324PyObject *PyUnicode_Format(PyObject *format, 5325 PyObject *args) 5326{ 5327 Py_UNICODE *fmt, *res; 5328 int fmtcnt, rescnt, reslen, arglen, argidx; 5329 int args_owned = 0; 5330 PyUnicodeObject *result = NULL; 5331 PyObject *dict = NULL; 5332 PyObject *uformat; 5333 5334 if (format == NULL || args == NULL) { 5335 PyErr_BadInternalCall(); 5336 return NULL; 5337 } 5338 uformat = PyUnicode_FromObject(format); 5339 if (uformat == NULL) 5340 return NULL; 5341 fmt = PyUnicode_AS_UNICODE(uformat); 5342 fmtcnt = PyUnicode_GET_SIZE(uformat); 5343 5344 reslen = rescnt = fmtcnt + 100; 5345 result = _PyUnicode_New(reslen); 5346 if (result == NULL) 5347 goto onError; 5348 res = PyUnicode_AS_UNICODE(result); 5349 5350 if (PyTuple_Check(args)) { 5351 arglen = PyTuple_Size(args); 5352 argidx = 0; 5353 } 5354 else { 5355 arglen = -1; 5356 argidx = -2; 5357 } 5358 if (args->ob_type->tp_as_mapping) 5359 dict = args; 5360 5361 while (--fmtcnt >= 0) { 5362 if (*fmt != '%') { 5363 if (--rescnt < 0) { 5364 rescnt = fmtcnt + 100; 5365 reslen += rescnt; 5366 if (_PyUnicode_Resize(&result, reslen) < 0) 5367 return NULL; 5368 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt; 5369 --rescnt; 5370 } 5371 *res++ = *fmt++; 5372 } 5373 else { 5374 /* Got a format specifier */ 5375 int flags = 0; 5376 int width = -1; 5377 int prec = -1; 5378 Py_UNICODE c = '\0'; 5379 Py_UNICODE fill; 5380 PyObject *v = NULL; 5381 PyObject *temp = NULL; 5382 Py_UNICODE *pbuf; 5383 Py_UNICODE sign; 5384 int len; 5385 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */ 5386 5387 fmt++; 5388 if (*fmt == '(') { 5389 Py_UNICODE *keystart; 5390 int keylen; 5391 PyObject *key; 5392 int pcount = 1; 5393 5394 if (dict == NULL) { 5395 PyErr_SetString(PyExc_TypeError, 5396 "format requires a mapping"); 5397 goto onError; 5398 } 5399 ++fmt; 5400 --fmtcnt; 5401 keystart = fmt; 5402 /* Skip over balanced parentheses */ 5403 while (pcount > 0 && --fmtcnt 
>= 0) { 5404 if (*fmt == ')') 5405 --pcount; 5406 else if (*fmt == '(') 5407 ++pcount; 5408 fmt++; 5409 } 5410 keylen = fmt - keystart - 1; 5411 if (fmtcnt < 0 || pcount > 0) { 5412 PyErr_SetString(PyExc_ValueError, 5413 "incomplete format key"); 5414 goto onError; 5415 } 5416#if 0 5417 /* keys are converted to strings using UTF-8 and 5418 then looked up since Python uses strings to hold 5419 variables names etc. in its namespaces and we 5420 wouldn't want to break common idioms. */ 5421 key = PyUnicode_EncodeUTF8(keystart, 5422 keylen, 5423 NULL); 5424#else 5425 key = PyUnicode_FromUnicode(keystart, keylen); 5426#endif 5427 if (key == NULL) 5428 goto onError; 5429 if (args_owned) { 5430 Py_DECREF(args); 5431 args_owned = 0; 5432 } 5433 args = PyObject_GetItem(dict, key); 5434 Py_DECREF(key); 5435 if (args == NULL) { 5436 goto onError; 5437 } 5438 args_owned = 1; 5439 arglen = -1; 5440 argidx = -2; 5441 } 5442 while (--fmtcnt >= 0) { 5443 switch (c = *fmt++) { 5444 case '-': flags |= F_LJUST; continue; 5445 case '+': flags |= F_SIGN; continue; 5446 case ' ': flags |= F_BLANK; continue; 5447 case '#': flags |= F_ALT; continue; 5448 case '0': flags |= F_ZERO; continue; 5449 } 5450 break; 5451 } 5452 if (c == '*') { 5453 v = getnextarg(args, arglen, &argidx); 5454 if (v == NULL) 5455 goto onError; 5456 if (!PyInt_Check(v)) { 5457 PyErr_SetString(PyExc_TypeError, 5458 "* wants int"); 5459 goto onError; 5460 } 5461 width = PyInt_AsLong(v); 5462 if (width < 0) { 5463 flags |= F_LJUST; 5464 width = -width; 5465 } 5466 if (--fmtcnt >= 0) 5467 c = *fmt++; 5468 } 5469 else if (c >= '0' && c <= '9') { 5470 width = c - '0'; 5471 while (--fmtcnt >= 0) { 5472 c = *fmt++; 5473 if (c < '0' || c > '9') 5474 break; 5475 if ((width*10) / 10 != width) { 5476 PyErr_SetString(PyExc_ValueError, 5477 "width too big"); 5478 goto onError; 5479 } 5480 width = width*10 + (c - '0'); 5481 } 5482 } 5483 if (c == '.') { 5484 prec = 0; 5485 if (--fmtcnt >= 0) 5486 c = *fmt++; 5487 if (c == '*') { 
5488 v = getnextarg(args, arglen, &argidx); 5489 if (v == NULL) 5490 goto onError; 5491 if (!PyInt_Check(v)) { 5492 PyErr_SetString(PyExc_TypeError, 5493 "* wants int"); 5494 goto onError; 5495 } 5496 prec = PyInt_AsLong(v); 5497 if (prec < 0) 5498 prec = 0; 5499 if (--fmtcnt >= 0) 5500 c = *fmt++; 5501 } 5502 else if (c >= '0' && c <= '9') { 5503 prec = c - '0'; 5504 while (--fmtcnt >= 0) { 5505 c = Py_CHARMASK(*fmt++); 5506 if (c < '0' || c > '9') 5507 break; 5508 if ((prec*10) / 10 != prec) { 5509 PyErr_SetString(PyExc_ValueError, 5510 "prec too big"); 5511 goto onError; 5512 } 5513 prec = prec*10 + (c - '0'); 5514 } 5515 } 5516 } /* prec */ 5517 if (fmtcnt >= 0) { 5518 if (c == 'h' || c == 'l' || c == 'L') { 5519 if (--fmtcnt >= 0) 5520 c = *fmt++; 5521 } 5522 } 5523 if (fmtcnt < 0) { 5524 PyErr_SetString(PyExc_ValueError, 5525 "incomplete format"); 5526 goto onError; 5527 } 5528 if (c != '%') { 5529 v = getnextarg(args, arglen, &argidx); 5530 if (v == NULL) 5531 goto onError; 5532 } 5533 sign = 0; 5534 fill = ' '; 5535 switch (c) { 5536 5537 case '%': 5538 pbuf = formatbuf; 5539 /* presume that buffer length is at least 1 */ 5540 pbuf[0] = '%'; 5541 len = 1; 5542 break; 5543 5544 case 's': 5545 case 'r': 5546 if (PyUnicode_Check(v) && c == 's') { 5547 temp = v; 5548 Py_INCREF(temp); 5549 } 5550 else { 5551 PyObject *unicode; 5552 if (c == 's') 5553 temp = PyObject_Str(v); 5554 else 5555 temp = PyObject_Repr(v); 5556 if (temp == NULL) 5557 goto onError; 5558 if (!PyString_Check(temp)) { 5559 /* XXX Note: this should never happen, since 5560 PyObject_Repr() and PyObject_Str() assure 5561 this */ 5562 Py_DECREF(temp); 5563 PyErr_SetString(PyExc_TypeError, 5564 "%s argument has non-string str()"); 5565 goto onError; 5566 } 5567 unicode = PyUnicode_Decode(PyString_AS_STRING(temp), 5568 PyString_GET_SIZE(temp), 5569 NULL, 5570 "strict"); 5571 Py_DECREF(temp); 5572 temp = unicode; 5573 if (temp == NULL) 5574 goto onError; 5575 } 5576 pbuf = 
PyUnicode_AS_UNICODE(temp); 5577 len = PyUnicode_GET_SIZE(temp); 5578 if (prec >= 0 && len > prec) 5579 len = prec; 5580 break; 5581 5582 case 'i': 5583 case 'd': 5584 case 'u': 5585 case 'o': 5586 case 'x': 5587 case 'X': 5588 if (c == 'i') 5589 c = 'd'; 5590 if (PyLong_Check(v)) { 5591 temp = formatlong(v, flags, prec, c); 5592 if (!temp) 5593 goto onError; 5594 pbuf = PyUnicode_AS_UNICODE(temp); 5595 len = PyUnicode_GET_SIZE(temp); 5596 /* unbounded ints can always produce 5597 a sign character! */ 5598 sign = 1; 5599 } 5600 else { 5601 pbuf = formatbuf; 5602 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), 5603 flags, prec, c, v); 5604 if (len < 0) 5605 goto onError; 5606 /* only d conversion is signed */ 5607 sign = c == 'd'; 5608 } 5609 if (flags & F_ZERO) 5610 fill = '0'; 5611 break; 5612 5613 case 'e': 5614 case 'E': 5615 case 'f': 5616 case 'g': 5617 case 'G': 5618 pbuf = formatbuf; 5619 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), 5620 flags, prec, c, v); 5621 if (len < 0) 5622 goto onError; 5623 sign = 1; 5624 if (flags & F_ZERO) 5625 fill = '0'; 5626 break; 5627 5628 case 'c': 5629 pbuf = formatbuf; 5630 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v); 5631 if (len < 0) 5632 goto onError; 5633 break; 5634 5635 default: 5636 PyErr_Format(PyExc_ValueError, 5637 "unsupported format character '%c' (0x%x) " 5638 "at index %i", 5639 (31<=c && c<=126) ? 
c : '?', 5640 c, fmt -1 - PyUnicode_AS_UNICODE(uformat)); 5641 goto onError; 5642 } 5643 if (sign) { 5644 if (*pbuf == '-' || *pbuf == '+') { 5645 sign = *pbuf++; 5646 len--; 5647 } 5648 else if (flags & F_SIGN) 5649 sign = '+'; 5650 else if (flags & F_BLANK) 5651 sign = ' '; 5652 else 5653 sign = 0; 5654 } 5655 if (width < len) 5656 width = len; 5657 if (rescnt < width + (sign != 0)) { 5658 reslen -= rescnt; 5659 rescnt = width + fmtcnt + 100; 5660 reslen += rescnt; 5661 if (_PyUnicode_Resize(&result, reslen) < 0) 5662 return NULL; 5663 res = PyUnicode_AS_UNICODE(result) 5664 + reslen - rescnt; 5665 } 5666 if (sign) { 5667 if (fill != ' ') 5668 *res++ = sign; 5669 rescnt--; 5670 if (width > len) 5671 width--; 5672 } 5673 if ((flags & F_ALT) && (c == 'x' || c == 'X')) { 5674 assert(pbuf[0] == '0'); 5675 assert(pbuf[1] == c); 5676 if (fill != ' ') { 5677 *res++ = *pbuf++; 5678 *res++ = *pbuf++; 5679 } 5680 rescnt -= 2; 5681 width -= 2; 5682 if (width < 0) 5683 width = 0; 5684 len -= 2; 5685 } 5686 if (width > len && !(flags & F_LJUST)) { 5687 do { 5688 --rescnt; 5689 *res++ = fill; 5690 } while (--width > len); 5691 } 5692 if (fill == ' ') { 5693 if (sign) 5694 *res++ = sign; 5695 if ((flags & F_ALT) && (c == 'x' || c == 'X')) { 5696 assert(pbuf[0] == '0'); 5697 assert(pbuf[1] == c); 5698 *res++ = *pbuf++; 5699 *res++ = *pbuf++; 5700 } 5701 } 5702 Py_UNICODE_COPY(res, pbuf, len); 5703 res += len; 5704 rescnt -= len; 5705 while (--width >= len) { 5706 --rescnt; 5707 *res++ = ' '; 5708 } 5709 if (dict && (argidx < arglen) && c != '%') { 5710 PyErr_SetString(PyExc_TypeError, 5711 "not all arguments converted during string formatting"); 5712 goto onError; 5713 } 5714 Py_XDECREF(temp); 5715 } /* '%' */ 5716 } /* until end */ 5717 if (argidx < arglen && !dict) { 5718 PyErr_SetString(PyExc_TypeError, 5719 "not all arguments converted during string formatting"); 5720 goto onError; 5721 } 5722 5723 if (args_owned) { 5724 Py_DECREF(args); 5725 } 5726 Py_DECREF(uformat); 5727 
if (_PyUnicode_Resize(&result, reslen - rescnt)) 5728 goto onError; 5729 return (PyObject *)result; 5730 5731 onError: 5732 Py_XDECREF(result); 5733 Py_DECREF(uformat); 5734 if (args_owned) { 5735 Py_DECREF(args); 5736 } 5737 return NULL; 5738} 5739 5740static PyBufferProcs unicode_as_buffer = { 5741 (getreadbufferproc) unicode_buffer_getreadbuf, 5742 (getwritebufferproc) unicode_buffer_getwritebuf, 5743 (getsegcountproc) unicode_buffer_getsegcount, 5744 (getcharbufferproc) unicode_buffer_getcharbuf, 5745}; 5746 5747staticforward PyObject * 5748unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds); 5749 5750static PyObject * 5751unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds) 5752{ 5753 PyObject *x = NULL; 5754 static char *kwlist[] = {"string", "encoding", "errors", 0}; 5755 char *encoding = NULL; 5756 char *errors = NULL; 5757 5758 if (type != &PyUnicode_Type) 5759 return unicode_subtype_new(type, args, kwds); 5760 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode", 5761 kwlist, &x, &encoding, &errors)) 5762 return NULL; 5763 if (x == NULL) 5764 return (PyObject *)_PyUnicode_New(0); 5765 if (encoding == NULL && errors == NULL) 5766 return PyObject_Unicode(x); 5767 else 5768 return PyUnicode_FromEncodedObject(x, encoding, errors); 5769} 5770 5771static PyObject * 5772unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds) 5773{ 5774 PyUnicodeObject *tmp, *pnew; 5775 int n; 5776 5777 assert(PyType_IsSubtype(type, &PyUnicode_Type)); 5778 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds); 5779 if (tmp == NULL) 5780 return NULL; 5781 assert(PyUnicode_Check(tmp)); 5782 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length); 5783 if (pnew == NULL) 5784 return NULL; 5785 pnew->str = PyMem_NEW(Py_UNICODE, n+1); 5786 if (pnew->str == NULL) { 5787 _Py_ForgetReference((PyObject *)pnew); 5788 PyObject_Del(pnew); 5789 return NULL; 5790 } 5791 Py_UNICODE_COPY(pnew->str, tmp->str, n+1); 5792 
pnew->length = n; 5793 pnew->hash = tmp->hash; 5794 Py_DECREF(tmp); 5795 return (PyObject *)pnew; 5796} 5797 5798static char unicode_doc[] = 5799"unicode(string [, encoding[, errors]]) -> object\n\ 5800\n\ 5801Create a new Unicode object from the given encoded string.\n\ 5802encoding defaults to the current default string encoding and \n\ 5803errors, defining the error handling, to 'strict'."; 5804 5805PyTypeObject PyUnicode_Type = { 5806 PyObject_HEAD_INIT(&PyType_Type) 5807 0, /* ob_size */ 5808 "unicode", /* tp_name */ 5809 sizeof(PyUnicodeObject), /* tp_size */ 5810 0, /* tp_itemsize */ 5811 /* Slots */ 5812 (destructor)unicode_dealloc, /* tp_dealloc */ 5813 0, /* tp_print */ 5814 0, /* tp_getattr */ 5815 0, /* tp_setattr */ 5816 (cmpfunc) unicode_compare, /* tp_compare */ 5817 (reprfunc) unicode_repr, /* tp_repr */ 5818 0, /* tp_as_number */ 5819 &unicode_as_sequence, /* tp_as_sequence */ 5820 0, /* tp_as_mapping */ 5821 (hashfunc) unicode_hash, /* tp_hash*/ 5822 0, /* tp_call*/ 5823 (reprfunc) unicode_str, /* tp_str */ 5824 PyObject_GenericGetAttr, /* tp_getattro */ 5825 0, /* tp_setattro */ 5826 &unicode_as_buffer, /* tp_as_buffer */ 5827 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /* tp_flags */ 5828 unicode_doc, /* tp_doc */ 5829 0, /* tp_traverse */ 5830 0, /* tp_clear */ 5831 0, /* tp_richcompare */ 5832 0, /* tp_weaklistoffset */ 5833 0, /* tp_iter */ 5834 0, /* tp_iternext */ 5835 unicode_methods, /* tp_methods */ 5836 0, /* tp_members */ 5837 0, /* tp_getset */ 5838 0, /* tp_base */ 5839 0, /* tp_dict */ 5840 0, /* tp_descr_get */ 5841 0, /* tp_descr_set */ 5842 0, /* tp_dictoffset */ 5843 0, /* tp_init */ 5844 0, /* tp_alloc */ 5845 unicode_new, /* tp_new */ 5846 PyObject_Del, /* tp_free */ 5847}; 5848 5849/* Initialize the Unicode implementation */ 5850 5851void _PyUnicode_Init(void) 5852{ 5853 int i; 5854 5855 /* Init the implementation */ 5856 unicode_freelist = NULL; 5857 unicode_freelist_size = 0; 5858 unicode_empty = _PyUnicode_New(0); 5859 
strcpy(unicode_default_encoding, "ascii"); 5860 for (i = 0; i < 256; i++) 5861 unicode_latin1[i] = NULL; 5862} 5863 5864/* Finalize the Unicode implementation */ 5865 5866void 5867_PyUnicode_Fini(void) 5868{ 5869 PyUnicodeObject *u; 5870 int i; 5871 5872 Py_XDECREF(unicode_empty); 5873 unicode_empty = NULL; 5874 5875 for (i = 0; i < 256; i++) { 5876 if (unicode_latin1[i]) { 5877 Py_DECREF(unicode_latin1[i]); 5878 unicode_latin1[i] = NULL; 5879 } 5880 } 5881 5882 for (u = unicode_freelist; u != NULL;) { 5883 PyUnicodeObject *v = u; 5884 u = *(PyUnicodeObject **)u; 5885 if (v->str) 5886 PyMem_DEL(v->str); 5887 Py_XDECREF(v->defenc); 5888 PyObject_Del(v); 5889 } 5890 unicode_freelist = NULL; 5891 unicode_freelist_size = 0; 5892} 5893