unicodeobject.c revision ce30bc9f49dd77a9e6707eabaa1f3ceb8e6e458e
1/* 2 3Unicode implementation based on original code by Fredrik Lundh, 4modified by Marc-Andre Lemburg <mal@lemburg.com> according to the 5Unicode Integration Proposal (see file Misc/unicode.txt). 6 7Copyright (c) Corporation for National Research Initiatives. 8 9-------------------------------------------------------------------- 10The original string type implementation is: 11 12 Copyright (c) 1999 by Secret Labs AB 13 Copyright (c) 1999 by Fredrik Lundh 14 15By obtaining, using, and/or copying this software and/or its 16associated documentation, you agree that you have read, understood, 17and will comply with the following terms and conditions: 18 19Permission to use, copy, modify, and distribute this software and its 20associated documentation for any purpose and without fee is hereby 21granted, provided that the above copyright notice appears in all 22copies, and that both that copyright notice and this permission notice 23appear in supporting documentation, and that the name of Secret Labs 24AB or the author not be used in advertising or publicity pertaining to 25distribution of the software without specific, written prior 26permission. 27 28SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO 29THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND 30FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR 31ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 32WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 33ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT 34OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 35-------------------------------------------------------------------- 36 37*/ 38 39#include "Python.h" 40 41#include "unicodeobject.h" 42#include "ucnhash.h" 43 44#ifdef MS_WINDOWS 45#include <windows.h> 46#endif 47 48/* Limit for the Unicode object free list */ 49 50#define MAX_UNICODE_FREELIST_SIZE 1024 51 52/* Limit for the Unicode object free list stay alive optimization. 53 54 The implementation will keep allocated Unicode memory intact for 55 all objects on the free list having a size less than this 56 limit. This reduces malloc() overhead for small Unicode objects. 57 58 At worst this will result in MAX_UNICODE_FREELIST_SIZE * 59 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT + 60 malloc()-overhead) bytes of unused garbage. 61 62 Setting the limit to 0 effectively turns the feature off. 63 64 Note: This is an experimental feature ! If you get core dumps when 65 using Unicode objects, turn this feature off. 66 67*/ 68 69#define KEEPALIVE_SIZE_LIMIT 9 70 71/* Endianness switches; defaults to little endian */ 72 73#ifdef WORDS_BIGENDIAN 74# define BYTEORDER_IS_BIG_ENDIAN 75#else 76# define BYTEORDER_IS_LITTLE_ENDIAN 77#endif 78 79/* --- Globals ------------------------------------------------------------ 80 81 The globals are initialized by the _PyUnicode_Init() API and should 82 not be used before calling that API. 83 84*/ 85 86/* Free list for Unicode objects */ 87static PyUnicodeObject *unicode_freelist; 88static int unicode_freelist_size; 89 90/* The empty Unicode object is shared to improve performance. */ 91static PyUnicodeObject *unicode_empty; 92 93/* Single character Unicode strings in the Latin-1 range are being 94 shared as well. */ 95static PyUnicodeObject *unicode_latin1[256]; 96 97/* Default encoding to use and assume when NULL is passed as encoding 98 parameter; it is initialized by _PyUnicode_Init(). 99 100 Always use the PyUnicode_SetDefaultEncoding() and 101 PyUnicode_GetDefaultEncoding() APIs to access this global. 102 103*/ 104static char unicode_default_encoding[100]; 105 106Py_UNICODE 107PyUnicode_GetMax(void) 108{ 109#ifdef Py_UNICODE_WIDE 110 return 0x10FFFF; 111#else 112 /* This is actually an illegal character, so it should 113 not be passed to unichr. */ 114 return 0xFFFF; 115#endif 116} 117 118/* --- Unicode Object ----------------------------------------------------- */ 119 120static 121int unicode_resize(register PyUnicodeObject *unicode, 122 int length) 123{ 124 void *oldstr; 125 126 /* Shortcut if there's nothing much to do. */ 127 if (unicode->length == length) 128 goto reset; 129 130 /* Resizing shared object (unicode_empty or single character 131 objects) in-place is not allowed. Use PyUnicode_Resize() 132 instead ! */ 133 if (unicode == unicode_empty || 134 (unicode->length == 1 && 135 unicode->str[0] < 256 && 136 unicode_latin1[unicode->str[0]] == unicode)) { 137 PyErr_SetString(PyExc_SystemError, 138 "can't resize shared unicode objects"); 139 return -1; 140 } 141 142 /* We allocate one more byte to make sure the string is 143 Ux0000 terminated -- XXX is this needed ? */ 144 oldstr = unicode->str; 145 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1); 146 if (!unicode->str) { 147 unicode->str = oldstr; 148 PyErr_NoMemory(); 149 return -1; 150 } 151 unicode->str[length] = 0; 152 unicode->length = length; 153 154 reset: 155 /* Reset the object caches */ 156 if (unicode->defenc) { 157 Py_DECREF(unicode->defenc); 158 unicode->defenc = NULL; 159 } 160 unicode->hash = -1; 161 162 return 0; 163} 164 165/* We allocate one more byte to make sure the string is 166 Ux0000 terminated -- XXX is this needed ? 167 168 XXX This allocator could further be enhanced by assuring that the 169 free list never reduces its size below 1. 170 171*/ 172 173static 174PyUnicodeObject *_PyUnicode_New(int length) 175{ 176 register PyUnicodeObject *unicode; 177 178 /* Optimization for empty strings */ 179 if (length == 0 && unicode_empty != NULL) { 180 Py_INCREF(unicode_empty); 181 return unicode_empty; 182 } 183 184 /* Unicode freelist & memory allocation */ 185 if (unicode_freelist) { 186 unicode = unicode_freelist; 187 unicode_freelist = *(PyUnicodeObject **)unicode; 188 unicode_freelist_size--; 189 if (unicode->str) { 190 /* Keep-Alive optimization: we only upsize the buffer, 191 never downsize it. */ 192 if ((unicode->length < length) && 193 unicode_resize(unicode, length)) { 194 PyMem_DEL(unicode->str); 195 goto onError; 196 } 197 } 198 else { 199 unicode->str = PyMem_NEW(Py_UNICODE, length + 1); 200 } 201 PyObject_INIT(unicode, &PyUnicode_Type); 202 } 203 else { 204 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type); 205 if (unicode == NULL) 206 return NULL; 207 unicode->str = PyMem_NEW(Py_UNICODE, length + 1); 208 } 209 210 if (!unicode->str) { 211 PyErr_NoMemory(); 212 goto onError; 213 } 214 unicode->str[length] = 0; 215 unicode->length = length; 216 unicode->hash = -1; 217 unicode->defenc = NULL; 218 return unicode; 219 220 onError: 221 _Py_ForgetReference((PyObject *)unicode); 222 PyObject_Del(unicode); 223 return NULL; 224} 225 226static 227void unicode_dealloc(register PyUnicodeObject *unicode) 228{ 229 if (PyUnicode_CheckExact(unicode) && 230 unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) { 231 /* Keep-Alive optimization */ 232 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) { 233 PyMem_DEL(unicode->str); 234 unicode->str = NULL; 235 unicode->length = 0; 236 } 237 if (unicode->defenc) { 238 Py_DECREF(unicode->defenc); 239 unicode->defenc = NULL; 240 } 241 /* Add to free list */ 242 *(PyUnicodeObject **)unicode = unicode_freelist; 243 unicode_freelist = unicode; 244 unicode_freelist_size++; 245 } 246 else { 247 PyMem_DEL(unicode->str); 248 Py_XDECREF(unicode->defenc); 249 unicode->ob_type->tp_free((PyObject *)unicode); 250 } 251} 252 253int PyUnicode_Resize(PyObject **unicode, 254 int length) 255{ 256 register PyUnicodeObject *v; 257 258 /* Argument checks */ 259 if (unicode == NULL) { 260 PyErr_BadInternalCall(); 261 return -1; 262 } 263 v = (PyUnicodeObject *)*unicode; 264 if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1 || length < 0) { 265 PyErr_BadInternalCall(); 266 return -1; 267 } 268 269 /* Resizing unicode_empty and single character objects is not 270 possible since these are being shared. We simply return a fresh 271 copy with the same Unicode content. */ 272 if (v->length != length && 273 (v == unicode_empty || v->length == 1)) { 274 PyUnicodeObject *w = _PyUnicode_New(length); 275 if (w == NULL) 276 return -1; 277 Py_UNICODE_COPY(w->str, v->str, 278 length < v->length ? length : v->length); 279 *unicode = (PyObject *)w; 280 return 0; 281 } 282 283 /* Note that we don't have to modify *unicode for unshared Unicode 284 objects, since we can modify them in-place. */ 285 return unicode_resize(v, length); 286} 287 288/* Internal API for use in unicodeobject.c only ! */ 289#define _PyUnicode_Resize(unicodevar, length) \ 290 PyUnicode_Resize(((PyObject **)(unicodevar)), length) 291 292PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u, 293 int size) 294{ 295 PyUnicodeObject *unicode; 296 297 /* If the Unicode data is known at construction time, we can apply 298 some optimizations which share commonly used objects. */ 299 if (u != NULL) { 300 301 /* Optimization for empty strings */ 302 if (size == 0 && unicode_empty != NULL) { 303 Py_INCREF(unicode_empty); 304 return (PyObject *)unicode_empty; 305 } 306 307 /* Single character Unicode objects in the Latin-1 range are 308 shared when using this constructor */ 309 if (size == 1 && *u < 256) { 310 unicode = unicode_latin1[*u]; 311 if (!unicode) { 312 unicode = _PyUnicode_New(1); 313 if (!unicode) 314 return NULL; 315 unicode->str[0] = *u; 316 unicode_latin1[*u] = unicode; 317 } 318 Py_INCREF(unicode); 319 return (PyObject *)unicode; 320 } 321 } 322 323 unicode = _PyUnicode_New(size); 324 if (!unicode) 325 return NULL; 326 327 /* Copy the Unicode data into the new object */ 328 if (u != NULL) 329 Py_UNICODE_COPY(unicode->str, u, size); 330 331 return (PyObject *)unicode; 332} 333 334#ifdef HAVE_WCHAR_H 335 336PyObject *PyUnicode_FromWideChar(register const wchar_t *w, 337 int size) 338{ 339 PyUnicodeObject *unicode; 340 341 if (w == NULL) { 342 PyErr_BadInternalCall(); 343 return NULL; 344 } 345 346 unicode = _PyUnicode_New(size); 347 if (!unicode) 348 return NULL; 349 350 /* Copy the wchar_t data into the new object */ 351#ifdef HAVE_USABLE_WCHAR_T 352 memcpy(unicode->str, w, size * sizeof(wchar_t)); 353#else 354 { 355 register Py_UNICODE *u; 356 register int i; 357 u = PyUnicode_AS_UNICODE(unicode); 358 for (i = size; i >= 0; i--) 359 *u++ = *w++; 360 } 361#endif 362 363 return (PyObject *)unicode; 364} 365 366int PyUnicode_AsWideChar(PyUnicodeObject *unicode, 367 register wchar_t *w, 368 int size) 369{ 370 if (unicode == NULL) { 371 PyErr_BadInternalCall(); 372 return -1; 373 } 374 if (size > PyUnicode_GET_SIZE(unicode)) 375 size = PyUnicode_GET_SIZE(unicode); 376#ifdef HAVE_USABLE_WCHAR_T 377 memcpy(w, unicode->str, size * sizeof(wchar_t)); 378#else 379 { 380 register Py_UNICODE *u; 381 register int i; 382 u = PyUnicode_AS_UNICODE(unicode); 383 for (i = size; i >= 0; i--) 384 *w++ = *u++; 385 } 386#endif 387 388 return size; 389} 390 391#endif 392 393PyObject *PyUnicode_FromOrdinal(int ordinal) 394{ 395 Py_UNICODE s[2]; 396 397#ifdef Py_UNICODE_WIDE 398 if (ordinal < 0 || ordinal > 0x10ffff) { 399 PyErr_SetString(PyExc_ValueError, 400 "unichr() arg not in range(0x110000) " 401 "(wide Python build)"); 402 return NULL; 403 } 404#else 405 if (ordinal < 0 || ordinal > 0xffff) { 406 PyErr_SetString(PyExc_ValueError, 407 "unichr() arg not in range(0x10000) " 408 "(narrow Python build)"); 409 return NULL; 410 } 411#endif 412 413 if (ordinal <= 0xffff) { 414 /* UCS-2 character */ 415 s[0] = (Py_UNICODE) ordinal; 416 return PyUnicode_FromUnicode(s, 1); 417 } 418 else { 419#ifndef Py_UNICODE_WIDE 420 /* UCS-4 character. store as two surrogate characters */ 421 ordinal -= 0x10000L; 422 s[0] = 0xD800 + (Py_UNICODE) (ordinal >> 10); 423 s[1] = 0xDC00 + (Py_UNICODE) (ordinal & 0x03FF); 424 return PyUnicode_FromUnicode(s, 2); 425#else 426 s[0] = (Py_UNICODE)ordinal; 427 return PyUnicode_FromUnicode(s, 1); 428#endif 429 } 430} 431 432PyObject *PyUnicode_FromObject(register PyObject *obj) 433{ 434 /* XXX Perhaps we should make this API an alias of 435 PyObject_Unicode() instead ?! */ 436 if (PyUnicode_CheckExact(obj)) { 437 Py_INCREF(obj); 438 return obj; 439 } 440 if (PyUnicode_Check(obj)) { 441 /* For a Unicode subtype that's not a Unicode object, 442 return a true Unicode object with the same data. */ 443 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj), 444 PyUnicode_GET_SIZE(obj)); 445 } 446 return PyUnicode_FromEncodedObject(obj, NULL, "strict"); 447} 448 449PyObject *PyUnicode_FromEncodedObject(register PyObject *obj, 450 const char *encoding, 451 const char *errors) 452{ 453 const char *s = NULL; 454 int len; 455 int owned = 0; 456 PyObject *v; 457 458 if (obj == NULL) { 459 PyErr_BadInternalCall(); 460 return NULL; 461 } 462 463#if 0 464 /* For b/w compatibility we also accept Unicode objects provided 465 that no encodings is given and then redirect to 466 PyObject_Unicode() which then applies the additional logic for 467 Unicode subclasses. 468 469 NOTE: This API should really only be used for object which 470 represent *encoded* Unicode ! 471 472 */ 473 if (PyUnicode_Check(obj)) { 474 if (encoding) { 475 PyErr_SetString(PyExc_TypeError, 476 "decoding Unicode is not supported"); 477 return NULL; 478 } 479 return PyObject_Unicode(obj); 480 } 481#else 482 if (PyUnicode_Check(obj)) { 483 PyErr_SetString(PyExc_TypeError, 484 "decoding Unicode is not supported"); 485 return NULL; 486 } 487#endif 488 489 /* Coerce object */ 490 if (PyString_Check(obj)) { 491 s = PyString_AS_STRING(obj); 492 len = PyString_GET_SIZE(obj); 493 } 494 else if (PyObject_AsCharBuffer(obj, &s, &len)) { 495 /* Overwrite the error message with something more useful in 496 case of a TypeError. */ 497 if (PyErr_ExceptionMatches(PyExc_TypeError)) 498 PyErr_Format(PyExc_TypeError, 499 "coercing to Unicode: need string or buffer, " 500 "%.80s found", 501 obj->ob_type->tp_name); 502 goto onError; 503 } 504 505 /* Convert to Unicode */ 506 if (len == 0) { 507 Py_INCREF(unicode_empty); 508 v = (PyObject *)unicode_empty; 509 } 510 else 511 v = PyUnicode_Decode(s, len, encoding, errors); 512 513 if (owned) { 514 Py_DECREF(obj); 515 } 516 return v; 517 518 onError: 519 if (owned) { 520 Py_DECREF(obj); 521 } 522 return NULL; 523} 524 525PyObject *PyUnicode_Decode(const char *s, 526 int size, 527 const char *encoding, 528 const char *errors) 529{ 530 PyObject *buffer = NULL, *unicode; 531 532 if (encoding == NULL) 533 encoding = PyUnicode_GetDefaultEncoding(); 534 535 /* Shortcuts for common default encodings */ 536 if (strcmp(encoding, "utf-8") == 0) 537 return PyUnicode_DecodeUTF8(s, size, errors); 538 else if (strcmp(encoding, "latin-1") == 0) 539 return PyUnicode_DecodeLatin1(s, size, errors); 540 else if (strcmp(encoding, "ascii") == 0) 541 return PyUnicode_DecodeASCII(s, size, errors); 542 543 /* Decode via the codec registry */ 544 buffer = PyBuffer_FromMemory((void *)s, size); 545 if (buffer == NULL) 546 goto onError; 547 unicode = PyCodec_Decode(buffer, encoding, errors); 548 if (unicode == NULL) 549 goto onError; 550 if (!PyUnicode_Check(unicode)) { 551 PyErr_Format(PyExc_TypeError, 552 "decoder did not return an unicode object (type=%.400s)", 553 unicode->ob_type->tp_name); 554 Py_DECREF(unicode); 555 goto onError; 556 } 557 Py_DECREF(buffer); 558 return unicode; 559 560 onError: 561 Py_XDECREF(buffer); 562 return NULL; 563} 564 565PyObject *PyUnicode_Encode(const Py_UNICODE *s, 566 int size, 567 const char *encoding, 568 const char *errors) 569{ 570 PyObject *v, *unicode; 571 572 unicode = PyUnicode_FromUnicode(s, size); 573 if (unicode == NULL) 574 return NULL; 575 v = PyUnicode_AsEncodedString(unicode, encoding, errors); 576 Py_DECREF(unicode); 577 return v; 578} 579 580PyObject *PyUnicode_AsEncodedString(PyObject *unicode, 581 const char *encoding, 582 const char *errors) 583{ 584 PyObject *v; 585 586 if (!PyUnicode_Check(unicode)) { 587 PyErr_BadArgument(); 588 goto onError; 589 } 590 591 if (encoding == NULL) 592 encoding = PyUnicode_GetDefaultEncoding(); 593 594 /* Shortcuts for common default encodings */ 595 if (errors == NULL) { 596 if (strcmp(encoding, "utf-8") == 0) 597 return PyUnicode_AsUTF8String(unicode); 598 else if (strcmp(encoding, "latin-1") == 0) 599 return PyUnicode_AsLatin1String(unicode); 600 else if (strcmp(encoding, "ascii") == 0) 601 return PyUnicode_AsASCIIString(unicode); 602 } 603 604 /* Encode via the codec registry */ 605 v = PyCodec_Encode(unicode, encoding, errors); 606 if (v == NULL) 607 goto onError; 608 /* XXX Should we really enforce this ? */ 609 if (!PyString_Check(v)) { 610 PyErr_Format(PyExc_TypeError, 611 "encoder did not return a string object (type=%.400s)", 612 v->ob_type->tp_name); 613 Py_DECREF(v); 614 goto onError; 615 } 616 return v; 617 618 onError: 619 return NULL; 620} 621 622PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode, 623 const char *errors) 624{ 625 PyObject *v = ((PyUnicodeObject *)unicode)->defenc; 626 627 if (v) 628 return v; 629 v = PyUnicode_AsEncodedString(unicode, NULL, errors); 630 if (v && errors == NULL) 631 ((PyUnicodeObject *)unicode)->defenc = v; 632 return v; 633} 634 635Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode) 636{ 637 if (!PyUnicode_Check(unicode)) { 638 PyErr_BadArgument(); 639 goto onError; 640 } 641 return PyUnicode_AS_UNICODE(unicode); 642 643 onError: 644 return NULL; 645} 646 647int PyUnicode_GetSize(PyObject *unicode) 648{ 649 if (!PyUnicode_Check(unicode)) { 650 PyErr_BadArgument(); 651 goto onError; 652 } 653 return PyUnicode_GET_SIZE(unicode); 654 655 onError: 656 return -1; 657} 658 659const char *PyUnicode_GetDefaultEncoding(void) 660{ 661 return unicode_default_encoding; 662} 663 664int PyUnicode_SetDefaultEncoding(const char *encoding) 665{ 666 PyObject *v; 667 668 /* Make sure the encoding is valid. As side effect, this also 669 loads the encoding into the codec registry cache. */ 670 v = _PyCodec_Lookup(encoding); 671 if (v == NULL) 672 goto onError; 673 Py_DECREF(v); 674 strncpy(unicode_default_encoding, 675 encoding, 676 sizeof(unicode_default_encoding)); 677 return 0; 678 679 onError: 680 return -1; 681} 682 683/* error handling callback helper: 684 build arguments, call the callback and check the arguments, 685 if no exception occured, copy the replacement to the output 686 and adjust various state variables. 687 return 0 on success, -1 on error 688*/ 689 690static 691int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler, 692 const char *encoding, const char *reason, 693 const char *input, int insize, int *startinpos, int *endinpos, PyObject **exceptionObject, const char **inptr, 694 PyObject **output, int *outpos, Py_UNICODE **outptr) 695{ 696 static char *argparse = "O!i;decoding error handler must return (unicode, int) tuple"; 697 698 PyObject *restuple = NULL; 699 PyObject *repunicode = NULL; 700 int outsize = PyUnicode_GET_SIZE(*output); 701 int requiredsize; 702 int newpos; 703 Py_UNICODE *repptr; 704 int repsize; 705 int res = -1; 706 707 if (*errorHandler == NULL) { 708 *errorHandler = PyCodec_LookupError(errors); 709 if (*errorHandler == NULL) 710 goto onError; 711 } 712 713 if (*exceptionObject == NULL) { 714 *exceptionObject = PyUnicodeDecodeError_Create( 715 encoding, input, insize, *startinpos, *endinpos, reason); 716 if (*exceptionObject == NULL) 717 goto onError; 718 } 719 else { 720 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos)) 721 goto onError; 722 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos)) 723 goto onError; 724 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason)) 725 goto onError; 726 } 727 728 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL); 729 if (restuple == NULL) 730 goto onError; 731 if (!PyTuple_Check(restuple)) { 732 PyErr_Format(PyExc_TypeError, &argparse[4]); 733 goto onError; 734 } 735 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos)) 736 goto onError; 737 if (newpos<0) 738 newpos = 0; 739 else if (newpos>insize) 740 newpos = insize; 741 742 /* need more space? (at least enough for what we 743 have+the replacement+the rest of the string (starting 744 at the new input position), so we won't have to check space 745 when there are no errors in the rest of the string) */ 746 repptr = PyUnicode_AS_UNICODE(repunicode); 747 repsize = PyUnicode_GET_SIZE(repunicode); 748 requiredsize = *outpos + repsize + insize-newpos; 749 if (requiredsize > outsize) { 750 if (requiredsize<2*outsize) 751 requiredsize = 2*outsize; 752 if (PyUnicode_Resize(output, requiredsize)) 753 goto onError; 754 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos; 755 } 756 *endinpos = newpos; 757 *inptr = input + newpos; 758 Py_UNICODE_COPY(*outptr, repptr, repsize); 759 *outptr += repsize; 760 *outpos += repsize; 761 /* we made it! */ 762 res = 0; 763 764 onError: 765 Py_XDECREF(restuple); 766 return res; 767} 768 769/* --- UTF-7 Codec -------------------------------------------------------- */ 770 771/* see RFC2152 for details */ 772 773static 774char utf7_special[128] = { 775 /* indicate whether a UTF-7 character is special i.e. cannot be directly 776 encoded: 777 0 - not special 778 1 - special 779 2 - whitespace (optional) 780 3 - RFC2152 Set O (optional) */ 781 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1, 782 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 783 2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1, 784 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0, 785 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 786 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3, 787 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 788 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1, 789 790}; 791 792#define SPECIAL(c, encodeO, encodeWS) \ 793 (((c)>127 || utf7_special[(c)] == 1) || \ 794 (encodeWS && (utf7_special[(c)] == 2)) || \ 795 (encodeO && (utf7_special[(c)] == 3))) 796 797#define B64(n) ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f]) 798#define B64CHAR(c) (isalnum(c) || (c) == '+' || (c) == '/') 799#define UB64(c) ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ? \ 800 (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4) 801 802#define ENCODE(out, ch, bits) \ 803 while (bits >= 6) { \ 804 *out++ = B64(ch >> (bits-6)); \ 805 bits -= 6; \ 806 } 807 808#define DECODE(out, ch, bits, surrogate) \ 809 while (bits >= 16) { \ 810 Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff); \ 811 bits -= 16; \ 812 if (surrogate) { \ 813 /* We have already generated an error for the high surrogate 814 so let's not bother seeing if the low surrogate is correct or not */\ 815 surrogate = 0; \ 816 } else if (0xDC00 <= outCh && outCh <= 0xDFFF) { \ 817 /* This is a surrogate pair. Unfortunately we can't represent \ 818 it in a 16-bit character */ \ 819 surrogate = 1; \ 820 errmsg = "code pairs are not supported"; \ 821 goto utf7Error; \ 822 } else { \ 823 *out++ = outCh; \ 824 } \ 825 } \ 826 827PyObject *PyUnicode_DecodeUTF7(const char *s, 828 int size, 829 const char *errors) 830{ 831 const char *starts = s; 832 int startinpos; 833 int endinpos; 834 int outpos; 835 const char *e; 836 PyUnicodeObject *unicode; 837 Py_UNICODE *p; 838 const char *errmsg = ""; 839 int inShift = 0; 840 unsigned int bitsleft = 0; 841 unsigned long charsleft = 0; 842 int surrogate = 0; 843 PyObject *errorHandler = NULL; 844 PyObject *exc = NULL; 845 846 unicode = _PyUnicode_New(size); 847 if (!unicode) 848 return NULL; 849 if (size == 0) 850 return (PyObject *)unicode; 851 852 p = unicode->str; 853 e = s + size; 854 855 while (s < e) { 856 Py_UNICODE ch; 857 restart: 858 ch = *s; 859 860 if (inShift) { 861 if ((ch == '-') || !B64CHAR(ch)) { 862 inShift = 0; 863 s++; 864 865 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate); 866 if (bitsleft >= 6) { 867 /* The shift sequence has a partial character in it. If 868 bitsleft < 6 then we could just classify it as padding 869 but that is not the case here */ 870 871 errmsg = "partial character in shift sequence"; 872 goto utf7Error; 873 } 874 /* According to RFC2152 the remaining bits should be zero. We 875 choose to signal an error/insert a replacement character 876 here so indicate the potential of a misencoded character. */ 877 878 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */ 879 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) { 880 errmsg = "non-zero padding bits in shift sequence"; 881 goto utf7Error; 882 } 883 884 if (ch == '-') { 885 if ((s < e) && (*(s) == '-')) { 886 *p++ = '-'; 887 inShift = 1; 888 } 889 } else if (SPECIAL(ch,0,0)) { 890 errmsg = "unexpected special character"; 891 goto utf7Error; 892 } else { 893 *p++ = ch; 894 } 895 } else { 896 charsleft = (charsleft << 6) | UB64(ch); 897 bitsleft += 6; 898 s++; 899 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate); 900 } 901 } 902 else if ( ch == '+' ) { 903 startinpos = s-starts; 904 s++; 905 if (s < e && *s == '-') { 906 s++; 907 *p++ = '+'; 908 } else 909 { 910 inShift = 1; 911 bitsleft = 0; 912 } 913 } 914 else if (SPECIAL(ch,0,0)) { 915 errmsg = "unexpected special character"; 916 s++; 917 goto utf7Error; 918 } 919 else { 920 *p++ = ch; 921 s++; 922 } 923 continue; 924 utf7Error: 925 outpos = p-PyUnicode_AS_UNICODE(unicode); 926 endinpos = s-starts; 927 if (unicode_decode_call_errorhandler( 928 errors, &errorHandler, 929 "utf7", errmsg, 930 starts, size, &startinpos, &endinpos, &exc, &s, 931 (PyObject **)&unicode, &outpos, &p)) 932 goto onError; 933 } 934 935 if (inShift) { 936 outpos = p-PyUnicode_AS_UNICODE(unicode); 937 endinpos = size; 938 if (unicode_decode_call_errorhandler( 939 errors, &errorHandler, 940 "utf7", "unterminated shift sequence", 941 starts, size, &startinpos, &endinpos, &exc, &s, 942 (PyObject **)&unicode, &outpos, &p)) 943 goto onError; 944 if (s < e) 945 goto restart; 946 } 947 948 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode))) 949 goto onError; 950 951 Py_XDECREF(errorHandler); 952 Py_XDECREF(exc); 953 return (PyObject *)unicode; 954 955onError: 956 Py_XDECREF(errorHandler); 957 Py_XDECREF(exc); 958 Py_DECREF(unicode); 959 return NULL; 960} 961 962 963PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s, 964 int size, 965 int encodeSetO, 966 int encodeWhiteSpace, 967 const char *errors) 968{ 969 PyObject *v; 970 /* It might be possible to tighten this worst case */ 971 unsigned int cbAllocated = 5 * size; 972 int inShift = 0; 973 int i = 0; 974 unsigned int bitsleft = 0; 975 unsigned long charsleft = 0; 976 char * out; 977 char * start; 978 979 if (size == 0) 980 return PyString_FromStringAndSize(NULL, 0); 981 982 v = PyString_FromStringAndSize(NULL, cbAllocated); 983 if (v == NULL) 984 return NULL; 985 986 start = out = PyString_AS_STRING(v); 987 for (;i < size; ++i) { 988 Py_UNICODE ch = s[i]; 989 990 if (!inShift) { 991 if (ch == '+') { 992 *out++ = '+'; 993 *out++ = '-'; 994 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) { 995 charsleft = ch; 996 bitsleft = 16; 997 *out++ = '+'; 998 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft); 999 inShift = bitsleft > 0; 1000 } else { 1001 *out++ = (char) ch; 1002 } 1003 } else { 1004 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) { 1005 *out++ = B64(charsleft << (6-bitsleft)); 1006 charsleft = 0; 1007 bitsleft = 0; 1008 /* Characters not in the BASE64 set implicitly unshift the sequence 1009 so no '-' is required, except if the character is itself a '-' */ 1010 if (B64CHAR(ch) || ch == '-') { 1011 *out++ = '-'; 1012 } 1013 inShift = 0; 1014 *out++ = (char) ch; 1015 } else { 1016 bitsleft += 16; 1017 charsleft = (charsleft << 16) | ch; 1018 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft); 1019 1020 /* If the next character is special then we dont' need to terminate 1021 the shift sequence. If the next character is not a BASE64 character 1022 or '-' then the shift sequence will be terminated implicitly and we 1023 don't have to insert a '-'. */ 1024 1025 if (bitsleft == 0) { 1026 if (i + 1 < size) { 1027 Py_UNICODE ch2 = s[i+1]; 1028 1029 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) { 1030 1031 } else if (B64CHAR(ch2) || ch2 == '-') { 1032 *out++ = '-'; 1033 inShift = 0; 1034 } else { 1035 inShift = 0; 1036 } 1037 1038 } 1039 else { 1040 *out++ = '-'; 1041 inShift = 0; 1042 } 1043 } 1044 } 1045 } 1046 } 1047 if (bitsleft) { 1048 *out++= B64(charsleft << (6-bitsleft) ); 1049 *out++ = '-'; 1050 } 1051 1052 _PyString_Resize(&v, out - start); 1053 return v; 1054} 1055 1056#undef SPECIAL 1057#undef B64 1058#undef B64CHAR 1059#undef UB64 1060#undef ENCODE 1061#undef DECODE 1062 1063/* --- UTF-8 Codec -------------------------------------------------------- */ 1064 1065static 1066char utf8_code_length[256] = { 1067 /* Map UTF-8 encoded prefix byte to sequence length. zero means 1068 illegal prefix. see RFC 2279 for details */ 1069 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1070 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1071 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1072 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1073 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1074 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1075 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1076 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1077 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1078 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1079 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1080 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1081 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1082 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1083 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1084 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0 1085}; 1086 1087PyObject *PyUnicode_DecodeUTF8(const char *s, 1088 int size, 1089 const char *errors) 1090{ 1091 const char *starts = s; 1092 int n; 1093 int startinpos; 1094 int endinpos; 1095 int outpos; 1096 const char *e; 1097 PyUnicodeObject *unicode; 1098 Py_UNICODE *p; 1099 const char *errmsg = ""; 1100 PyObject *errorHandler = NULL; 1101 PyObject *exc = NULL; 1102 1103 /* Note: size will always be longer than the resulting Unicode 1104 character count */ 1105 unicode = _PyUnicode_New(size); 1106 if (!unicode) 1107 return NULL; 1108 if (size == 0) 1109 return (PyObject *)unicode; 1110 1111 /* Unpack UTF-8 encoded data */ 1112 p = unicode->str; 1113 e = s + size; 1114 1115 while (s < e) { 1116 Py_UCS4 ch = (unsigned char)*s; 1117 1118 if (ch < 0x80) { 1119 *p++ = (Py_UNICODE)ch; 1120 s++; 1121 continue; 1122 } 1123 1124 n = utf8_code_length[ch]; 1125 1126 if (s + n > e) { 1127 errmsg = "unexpected end of data"; 1128 startinpos = s-starts; 1129 endinpos = size; 1130 goto utf8Error; 1131 } 1132 1133 switch (n) { 1134 1135 case 0: 1136 errmsg = "unexpected code byte"; 1137 startinpos = s-starts; 1138 endinpos = startinpos+1; 1139 goto utf8Error; 1140 1141 case 1: 1142 errmsg = "internal error"; 1143 startinpos = s-starts; 1144 endinpos = startinpos+1; 1145 goto utf8Error; 1146 1147 case 2: 1148 if ((s[1] & 0xc0) != 0x80) { 1149 errmsg = "invalid data"; 1150 startinpos = s-starts; 1151 endinpos = startinpos+2; 1152 goto utf8Error; 1153 } 1154 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f); 1155 if (ch < 0x80) { 1156 startinpos = s-starts; 1157 endinpos = startinpos+2; 1158 errmsg = "illegal encoding"; 1159 goto utf8Error; 1160 } 1161 else 1162 *p++ = (Py_UNICODE)ch; 1163 break; 1164 1165 case 3: 1166 if ((s[1] & 0xc0) != 0x80 || 1167 (s[2] & 0xc0) != 0x80) { 1168 errmsg = "invalid data"; 1169 startinpos = s-starts; 1170 endinpos = startinpos+3; 1171 goto utf8Error; 1172 } 1173 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f); 1174 if (ch < 0x0800) { 1175 /* Note: UTF-8 encodings of surrogates are considered 1176 legal UTF-8 sequences; 1177 1178 XXX For wide builds (UCS-4) we should probably try 1179 to recombine the surrogates into a single code 1180 unit. 1181 */ 1182 errmsg = "illegal encoding"; 1183 startinpos = s-starts; 1184 endinpos = startinpos+3; 1185 goto utf8Error; 1186 } 1187 else 1188 *p++ = (Py_UNICODE)ch; 1189 break; 1190 1191 case 4: 1192 if ((s[1] & 0xc0) != 0x80 || 1193 (s[2] & 0xc0) != 0x80 || 1194 (s[3] & 0xc0) != 0x80) { 1195 errmsg = "invalid data"; 1196 startinpos = s-starts; 1197 endinpos = startinpos+4; 1198 goto utf8Error; 1199 } 1200 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) + 1201 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f); 1202 /* validate and convert to UTF-16 */ 1203 if ((ch < 0x10000) /* minimum value allowed for 4 1204 byte encoding */ 1205 || (ch > 0x10ffff)) /* maximum value allowed for 1206 UTF-16 */ 1207 { 1208 errmsg = "illegal encoding"; 1209 startinpos = s-starts; 1210 endinpos = startinpos+4; 1211 goto utf8Error; 1212 } 1213#ifdef Py_UNICODE_WIDE 1214 *p++ = (Py_UNICODE)ch; 1215#else 1216 /* compute and append the two surrogates: */ 1217 1218 /* translate from 10000..10FFFF to 0..FFFF */ 1219 ch -= 0x10000; 1220 1221 /* high surrogate = top 10 bits added to D800 */ 1222 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10)); 1223 1224 /* low surrogate = bottom 10 bits added to DC00 */ 1225 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF)); 1226#endif 1227 break; 1228 1229 default: 1230 /* Other sizes are only needed for UCS-4 */ 1231 errmsg = "unsupported Unicode code range"; 1232 startinpos = s-starts; 1233 endinpos = startinpos+n; 1234 goto utf8Error; 1235 } 1236 s += n; 1237 continue; 1238 1239 utf8Error: 1240 outpos = p-PyUnicode_AS_UNICODE(unicode); 1241 if (unicode_decode_call_errorhandler( 1242 errors, &errorHandler, 1243 "utf8", errmsg, 1244 starts, size, &startinpos, &endinpos, &exc, &s, 1245 (PyObject **)&unicode, &outpos, &p)) 1246 goto onError; 1247 } 1248 1249 /* Adjust length */ 1250 if (_PyUnicode_Resize(&unicode, p - unicode->str)) 1251 goto onError; 1252 1253 Py_XDECREF(errorHandler); 1254 Py_XDECREF(exc); 1255 return (PyObject *)unicode; 1256 1257onError: 1258 Py_XDECREF(errorHandler); 1259 Py_XDECREF(exc); 1260 Py_DECREF(unicode); 1261 return NULL; 1262} 1263 1264/* Allocation strategy: if the string is short, convert into a stack buffer 1265 and allocate exactly as much space needed at the end. Else allocate the 1266 maximum possible needed (4 result bytes per Unicode character), and return 1267 the excess memory at the end. 1268*/ 1269PyObject * 1270PyUnicode_EncodeUTF8(const Py_UNICODE *s, 1271 int size, 1272 const char *errors) 1273{ 1274#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */ 1275 1276 int i; /* index into s of next input byte */ 1277 PyObject *v; /* result string object */ 1278 char *p; /* next free byte in output buffer */ 1279 int nallocated; /* number of result bytes allocated */ 1280 int nneeded; /* number of result bytes needed */ 1281 char stackbuf[MAX_SHORT_UNICHARS * 4]; 1282 1283 assert(s != NULL); 1284 assert(size >= 0); 1285 1286 if (size <= MAX_SHORT_UNICHARS) { 1287 /* Write into the stack buffer; nallocated can't overflow. 1288 * At the end, we'll allocate exactly as much heap space as it 1289 * turns out we need. 1290 */ 1291 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int); 1292 v = NULL; /* will allocate after we're done */ 1293 p = stackbuf; 1294 } 1295 else { 1296 /* Overallocate on the heap, and give the excess back at the end. */ 1297 nallocated = size * 4; 1298 if (nallocated / 4 != size) /* overflow! */ 1299 return PyErr_NoMemory(); 1300 v = PyString_FromStringAndSize(NULL, nallocated); 1301 if (v == NULL) 1302 return NULL; 1303 p = PyString_AS_STRING(v); 1304 } 1305 1306 for (i = 0; i < size;) { 1307 Py_UCS4 ch = s[i++]; 1308 1309 if (ch < 0x80) 1310 /* Encode ASCII */ 1311 *p++ = (char) ch; 1312 1313 else if (ch < 0x0800) { 1314 /* Encode Latin-1 */ 1315 *p++ = (char)(0xc0 | (ch >> 6)); 1316 *p++ = (char)(0x80 | (ch & 0x3f)); 1317 } 1318 else { 1319 /* Encode UCS2 Unicode ordinals */ 1320 if (ch < 0x10000) { 1321 /* Special case: check for high surrogate */ 1322 if (0xD800 <= ch && ch <= 0xDBFF && i != size) { 1323 Py_UCS4 ch2 = s[i]; 1324 /* Check for low surrogate and combine the two to 1325 form a UCS4 value */ 1326 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) { 1327 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000; 1328 i++; 1329 goto encodeUCS4; 1330 } 1331 /* Fall through: handles isolated high surrogates */ 1332 } 1333 *p++ = (char)(0xe0 | (ch >> 12)); 1334 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); 1335 *p++ = (char)(0x80 | (ch & 0x3f)); 1336 continue; 1337 } 1338encodeUCS4: 1339 /* Encode UCS4 Unicode ordinals */ 1340 *p++ = (char)(0xf0 | (ch >> 18)); 1341 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f)); 1342 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); 1343 *p++ = (char)(0x80 | (ch & 0x3f)); 1344 } 1345 } 1346 1347 if (v == NULL) { 1348 /* This was stack allocated. */ 1349 nneeded = Py_SAFE_DOWNCAST(p - stackbuf, long, int); 1350 assert(nneeded <= nallocated); 1351 v = PyString_FromStringAndSize(stackbuf, nneeded); 1352 } 1353 else { 1354 /* Cut back to size actually needed. */ 1355 nneeded = Py_SAFE_DOWNCAST(p - PyString_AS_STRING(v), long, int); 1356 assert(nneeded <= nallocated); 1357 _PyString_Resize(&v, nneeded); 1358 } 1359 return v; 1360 1361#undef MAX_SHORT_UNICHARS 1362} 1363 1364PyObject *PyUnicode_AsUTF8String(PyObject *unicode) 1365{ 1366 if (!PyUnicode_Check(unicode)) { 1367 PyErr_BadArgument(); 1368 return NULL; 1369 } 1370 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode), 1371 PyUnicode_GET_SIZE(unicode), 1372 NULL); 1373} 1374 1375/* --- UTF-16 Codec ------------------------------------------------------- */ 1376 1377PyObject * 1378PyUnicode_DecodeUTF16(const char *s, 1379 int size, 1380 const char *errors, 1381 int *byteorder) 1382{ 1383 const char *starts = s; 1384 int startinpos; 1385 int endinpos; 1386 int outpos; 1387 PyUnicodeObject *unicode; 1388 Py_UNICODE *p; 1389 const unsigned char *q, *e; 1390 int bo = 0; /* assume native ordering by default */ 1391 const char *errmsg = ""; 1392 /* Offsets from q for retrieving byte pairs in the right order. */ 1393#ifdef BYTEORDER_IS_LITTLE_ENDIAN 1394 int ihi = 1, ilo = 0; 1395#else 1396 int ihi = 0, ilo = 1; 1397#endif 1398 PyObject *errorHandler = NULL; 1399 PyObject *exc = NULL; 1400 1401 /* Note: size will always be longer than the resulting Unicode 1402 character count */ 1403 unicode = _PyUnicode_New(size); 1404 if (!unicode) 1405 return NULL; 1406 if (size == 0) 1407 return (PyObject *)unicode; 1408 1409 /* Unpack UTF-16 encoded data */ 1410 p = unicode->str; 1411 q = (unsigned char *)s; 1412 e = q + size; 1413 1414 if (byteorder) 1415 bo = *byteorder; 1416 1417 /* Check for BOM marks (U+FEFF) in the input and adjust current 1418 byte order setting accordingly. In native mode, the leading BOM 1419 mark is skipped, in all other modes, it is copied to the output 1420 stream as-is (giving a ZWNBSP character). */ 1421 if (bo == 0) { 1422 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo]; 1423#ifdef BYTEORDER_IS_LITTLE_ENDIAN 1424 if (bom == 0xFEFF) { 1425 q += 2; 1426 bo = -1; 1427 } 1428 else if (bom == 0xFFFE) { 1429 q += 2; 1430 bo = 1; 1431 } 1432#else 1433 if (bom == 0xFEFF) { 1434 q += 2; 1435 bo = 1; 1436 } 1437 else if (bom == 0xFFFE) { 1438 q += 2; 1439 bo = -1; 1440 } 1441#endif 1442 } 1443 1444 if (bo == -1) { 1445 /* force LE */ 1446 ihi = 1; 1447 ilo = 0; 1448 } 1449 else if (bo == 1) { 1450 /* force BE */ 1451 ihi = 0; 1452 ilo = 1; 1453 } 1454 1455 while (q < e) { 1456 Py_UNICODE ch; 1457 /* remaing bytes at the end? (size should be even) */ 1458 if (e-q<2) { 1459 errmsg = "truncated data"; 1460 startinpos = ((const char *)q)-starts; 1461 endinpos = ((const char *)e)-starts; 1462 goto utf16Error; 1463 /* The remaining input chars are ignored if the callback 1464 chooses to skip the input */ 1465 } 1466 ch = (q[ihi] << 8) | q[ilo]; 1467 1468 q += 2; 1469 1470 if (ch < 0xD800 || ch > 0xDFFF) { 1471 *p++ = ch; 1472 continue; 1473 } 1474 1475 /* UTF-16 code pair: */ 1476 if (q >= e) { 1477 errmsg = "unexpected end of data"; 1478 startinpos = (((const char *)q)-2)-starts; 1479 endinpos = ((const char *)e)-starts; 1480 goto utf16Error; 1481 } 1482 if (0xD800 <= ch && ch <= 0xDBFF) { 1483 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo]; 1484 q += 2; 1485 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) { 1486#ifndef Py_UNICODE_WIDE 1487 *p++ = ch; 1488 *p++ = ch2; 1489#else 1490 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000; 1491#endif 1492 continue; 1493 } 1494 else { 1495 errmsg = "illegal UTF-16 surrogate"; 1496 startinpos = (((const char *)q)-4)-starts; 1497 endinpos = startinpos+2; 1498 goto utf16Error; 1499 } 1500 1501 } 1502 errmsg = "illegal encoding"; 1503 startinpos = (((const char *)q)-2)-starts; 1504 endinpos = startinpos+2; 1505 /* Fall through to report the error */ 1506 1507 utf16Error: 1508 outpos = p-PyUnicode_AS_UNICODE(unicode); 1509 if (unicode_decode_call_errorhandler( 1510 errors, &errorHandler, 1511 "utf16", errmsg, 1512 starts, size, &startinpos, &endinpos, &exc, (const char **)&q, 1513 (PyObject **)&unicode, &outpos, &p)) 1514 goto onError; 1515 } 1516 1517 if (byteorder) 1518 *byteorder = bo; 1519 1520 /* Adjust length */ 1521 if (_PyUnicode_Resize(&unicode, p - unicode->str)) 1522 goto onError; 1523 1524 Py_XDECREF(errorHandler); 1525 Py_XDECREF(exc); 1526 return (PyObject *)unicode; 1527 1528onError: 1529 Py_DECREF(unicode); 1530 Py_XDECREF(errorHandler); 1531 Py_XDECREF(exc); 1532 return NULL; 1533} 1534 1535PyObject * 1536PyUnicode_EncodeUTF16(const Py_UNICODE *s, 1537 int size, 1538 const char *errors, 1539 int byteorder) 1540{ 1541 PyObject *v; 1542 unsigned char *p; 1543 int i, pairs; 1544 /* Offsets from p for storing byte pairs in the right order. */ 1545#ifdef BYTEORDER_IS_LITTLE_ENDIAN 1546 int ihi = 1, ilo = 0; 1547#else 1548 int ihi = 0, ilo = 1; 1549#endif 1550 1551#define STORECHAR(CH) \ 1552 do { \ 1553 p[ihi] = ((CH) >> 8) & 0xff; \ 1554 p[ilo] = (CH) & 0xff; \ 1555 p += 2; \ 1556 } while(0) 1557 1558 for (i = pairs = 0; i < size; i++) 1559 if (s[i] >= 0x10000) 1560 pairs++; 1561 v = PyString_FromStringAndSize(NULL, 1562 2 * (size + pairs + (byteorder == 0))); 1563 if (v == NULL) 1564 return NULL; 1565 1566 p = (unsigned char *)PyString_AS_STRING(v); 1567 if (byteorder == 0) 1568 STORECHAR(0xFEFF); 1569 if (size == 0) 1570 return v; 1571 1572 if (byteorder == -1) { 1573 /* force LE */ 1574 ihi = 1; 1575 ilo = 0; 1576 } 1577 else if (byteorder == 1) { 1578 /* force BE */ 1579 ihi = 0; 1580 ilo = 1; 1581 } 1582 1583 while (size-- > 0) { 1584 Py_UNICODE ch = *s++; 1585 Py_UNICODE ch2 = 0; 1586 if (ch >= 0x10000) { 1587 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF); 1588 ch = 0xD800 | ((ch-0x10000) >> 10); 1589 } 1590 STORECHAR(ch); 1591 if (ch2) 1592 STORECHAR(ch2); 1593 } 1594 return v; 1595#undef STORECHAR 1596} 1597 1598PyObject *PyUnicode_AsUTF16String(PyObject *unicode) 1599{ 1600 if (!PyUnicode_Check(unicode)) { 1601 PyErr_BadArgument(); 1602 return NULL; 1603 } 1604 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode), 1605 PyUnicode_GET_SIZE(unicode), 1606 NULL, 1607 0); 1608} 1609 1610/* --- Unicode Escape Codec ----------------------------------------------- */ 1611 1612static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL; 1613 1614PyObject *PyUnicode_DecodeUnicodeEscape(const char *s, 1615 int size, 1616 const char *errors) 1617{ 1618 const char *starts = s; 1619 int startinpos; 1620 int endinpos; 1621 int outpos; 1622 int i; 1623 PyUnicodeObject *v; 1624 Py_UNICODE *p; 1625 const char *end; 1626 char* message; 1627 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */ 1628 PyObject *errorHandler = NULL; 1629 PyObject *exc = NULL; 1630 1631 /* Escaped strings will always be longer than the resulting 1632 Unicode string, so we start with size here and then reduce the 1633 length after conversion to the true value. 1634 (but if the error callback returns a long replacement string 1635 we'll have to allocate more space) */ 1636 v = _PyUnicode_New(size); 1637 if (v == NULL) 1638 goto onError; 1639 if (size == 0) 1640 return (PyObject *)v; 1641 1642 p = PyUnicode_AS_UNICODE(v); 1643 end = s + size; 1644 1645 while (s < end) { 1646 unsigned char c; 1647 Py_UNICODE x; 1648 int digits; 1649 1650 /* Non-escape characters are interpreted as Unicode ordinals */ 1651 if (*s != '\\') { 1652 *p++ = (unsigned char) *s++; 1653 continue; 1654 } 1655 1656 startinpos = s-starts; 1657 /* \ - Escapes */ 1658 s++; 1659 switch (*s++) { 1660 1661 /* \x escapes */ 1662 case '\n': break; 1663 case '\\': *p++ = '\\'; break; 1664 case '\'': *p++ = '\''; break; 1665 case '\"': *p++ = '\"'; break; 1666 case 'b': *p++ = '\b'; break; 1667 case 'f': *p++ = '\014'; break; /* FF */ 1668 case 't': *p++ = '\t'; break; 1669 case 'n': *p++ = '\n'; break; 1670 case 'r': *p++ = '\r'; break; 1671 case 'v': *p++ = '\013'; break; /* VT */ 1672 case 'a': *p++ = '\007'; break; /* BEL, not classic C */ 1673 1674 /* \OOO (octal) escapes */ 1675 case '0': case '1': case '2': case '3': 1676 case '4': case '5': case '6': case '7': 1677 x = s[-1] - '0'; 1678 if ('0' <= *s && *s <= '7') { 1679 x = (x<<3) + *s++ - '0'; 1680 if ('0' <= *s && *s <= '7') 1681 x = (x<<3) + *s++ - '0'; 1682 } 1683 *p++ = x; 1684 break; 1685 1686 /* hex escapes */ 1687 /* \xXX */ 1688 case 'x': 1689 digits = 2; 1690 message = "truncated \\xXX escape"; 1691 goto hexescape; 1692 1693 /* \uXXXX */ 1694 case 'u': 1695 digits = 4; 1696 message = "truncated \\uXXXX escape"; 1697 goto hexescape; 1698 1699 /* \UXXXXXXXX */ 1700 case 'U': 1701 digits = 8; 1702 message = "truncated \\UXXXXXXXX escape"; 1703 hexescape: 1704 chr = 0; 1705 outpos = p-PyUnicode_AS_UNICODE(v); 1706 if (s+digits>end) { 1707 endinpos = size; 1708 if (unicode_decode_call_errorhandler( 1709 errors, &errorHandler, 1710 "unicodeescape", "end of string in escape sequence", 1711 starts, size, &startinpos, &endinpos, &exc, &s, 1712 (PyObject **)&v, &outpos, &p)) 1713 goto onError; 1714 goto nextByte; 1715 } 1716 for (i = 0; i < digits; ++i) { 1717 c = (unsigned char) s[i]; 1718 if (!isxdigit(c)) { 1719 endinpos = (s+i+1)-starts; 1720 if (unicode_decode_call_errorhandler( 1721 errors, &errorHandler, 1722 "unicodeescape", message, 1723 starts, size, &startinpos, &endinpos, &exc, &s, 1724 (PyObject **)&v, &outpos, &p)) 1725 goto onError; 1726 goto nextByte; 1727 } 1728 chr = (chr<<4) & ~0xF; 1729 if (c >= '0' && c <= '9') 1730 chr += c - '0'; 1731 else if (c >= 'a' && c <= 'f') 1732 chr += 10 + c - 'a'; 1733 else 1734 chr += 10 + c - 'A'; 1735 } 1736 s += i; 1737 if (chr == 0xffffffff) 1738 /* _decoding_error will have already written into the 1739 target buffer. */ 1740 break; 1741 store: 1742 /* when we get here, chr is a 32-bit unicode character */ 1743 if (chr <= 0xffff) 1744 /* UCS-2 character */ 1745 *p++ = (Py_UNICODE) chr; 1746 else if (chr <= 0x10ffff) { 1747 /* UCS-4 character. Either store directly, or as 1748 surrogate pair. */ 1749#ifdef Py_UNICODE_WIDE 1750 *p++ = chr; 1751#else 1752 chr -= 0x10000L; 1753 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10); 1754 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF); 1755#endif 1756 } else { 1757 endinpos = s-starts; 1758 outpos = p-PyUnicode_AS_UNICODE(v); 1759 if (unicode_decode_call_errorhandler( 1760 errors, &errorHandler, 1761 "unicodeescape", "illegal Unicode character", 1762 starts, size, &startinpos, &endinpos, &exc, &s, 1763 (PyObject **)&v, &outpos, &p)) 1764 goto onError; 1765 } 1766 break; 1767 1768 /* \N{name} */ 1769 case 'N': 1770 message = "malformed \\N character escape"; 1771 if (ucnhash_CAPI == NULL) { 1772 /* load the unicode data module */ 1773 PyObject *m, *v; 1774 m = PyImport_ImportModule("unicodedata"); 1775 if (m == NULL) 1776 goto ucnhashError; 1777 v = PyObject_GetAttrString(m, "ucnhash_CAPI"); 1778 Py_DECREF(m); 1779 if (v == NULL) 1780 goto ucnhashError; 1781 ucnhash_CAPI = PyCObject_AsVoidPtr(v); 1782 Py_DECREF(v); 1783 if (ucnhash_CAPI == NULL) 1784 goto ucnhashError; 1785 } 1786 if (*s == '{') { 1787 const char *start = s+1; 1788 /* look for the closing brace */ 1789 while (*s != '}' && s < end) 1790 s++; 1791 if (s > start && s < end && *s == '}') { 1792 /* found a name. look it up in the unicode database */ 1793 message = "unknown Unicode character name"; 1794 s++; 1795 if (ucnhash_CAPI->getcode(start, s-start-1, &chr)) 1796 goto store; 1797 } 1798 } 1799 endinpos = s-starts; 1800 outpos = p-PyUnicode_AS_UNICODE(v); 1801 if (unicode_decode_call_errorhandler( 1802 errors, &errorHandler, 1803 "unicodeescape", message, 1804 starts, size, &startinpos, &endinpos, &exc, &s, 1805 (PyObject **)&v, &outpos, &p)) 1806 goto onError; 1807 break; 1808 1809 default: 1810 if (s > end) { 1811 message = "\\ at end of string"; 1812 s--; 1813 endinpos = s-starts; 1814 outpos = p-PyUnicode_AS_UNICODE(v); 1815 if (unicode_decode_call_errorhandler( 1816 errors, &errorHandler, 1817 "unicodeescape", message, 1818 starts, size, &startinpos, &endinpos, &exc, &s, 1819 (PyObject **)&v, &outpos, &p)) 1820 goto onError; 1821 } 1822 else { 1823 *p++ = '\\'; 1824 *p++ = (unsigned char)s[-1]; 1825 } 1826 break; 1827 } 1828 nextByte: 1829 ; 1830 } 1831 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v)))) 1832 goto onError; 1833 return (PyObject *)v; 1834 1835ucnhashError: 1836 PyErr_SetString( 1837 PyExc_UnicodeError, 1838 "\\N escapes not supported (can't load unicodedata module)" 1839 ); 1840 Py_XDECREF(errorHandler); 1841 Py_XDECREF(exc); 1842 return NULL; 1843 1844onError: 1845 Py_XDECREF(v); 1846 Py_XDECREF(errorHandler); 1847 Py_XDECREF(exc); 1848 return NULL; 1849} 1850 1851/* Return a Unicode-Escape string version of the Unicode object. 1852 1853 If quotes is true, the string is enclosed in u"" or u'' quotes as 1854 appropriate. 1855 1856*/ 1857 1858static const Py_UNICODE *findchar(const Py_UNICODE *s, 1859 int size, 1860 Py_UNICODE ch); 1861 1862static 1863PyObject *unicodeescape_string(const Py_UNICODE *s, 1864 int size, 1865 int quotes) 1866{ 1867 PyObject *repr; 1868 char *p; 1869 1870 static const char *hexdigit = "0123456789abcdef"; 1871 1872 repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1); 1873 if (repr == NULL) 1874 return NULL; 1875 1876 p = PyString_AS_STRING(repr); 1877 1878 if (quotes) { 1879 *p++ = 'u'; 1880 *p++ = (findchar(s, size, '\'') && 1881 !findchar(s, size, '"')) ? '"' : '\''; 1882 } 1883 while (size-- > 0) { 1884 Py_UNICODE ch = *s++; 1885 1886 /* Escape quotes */ 1887 if (quotes && 1888 (ch == (Py_UNICODE) PyString_AS_STRING(repr)[1] || ch == '\\')) { 1889 *p++ = '\\'; 1890 *p++ = (char) ch; 1891 continue; 1892 } 1893 1894#ifdef Py_UNICODE_WIDE 1895 /* Map 21-bit characters to '\U00xxxxxx' */ 1896 else if (ch >= 0x10000) { 1897 int offset = p - PyString_AS_STRING(repr); 1898 1899 /* Resize the string if necessary */ 1900 if (offset + 12 > PyString_GET_SIZE(repr)) { 1901 if (_PyString_Resize(&repr, PyString_GET_SIZE(repr) + 100)) 1902 return NULL; 1903 p = PyString_AS_STRING(repr) + offset; 1904 } 1905 1906 *p++ = '\\'; 1907 *p++ = 'U'; 1908 *p++ = hexdigit[(ch >> 28) & 0x0000000F]; 1909 *p++ = hexdigit[(ch >> 24) & 0x0000000F]; 1910 *p++ = hexdigit[(ch >> 20) & 0x0000000F]; 1911 *p++ = hexdigit[(ch >> 16) & 0x0000000F]; 1912 *p++ = hexdigit[(ch >> 12) & 0x0000000F]; 1913 *p++ = hexdigit[(ch >> 8) & 0x0000000F]; 1914 *p++ = hexdigit[(ch >> 4) & 0x0000000F]; 1915 *p++ = hexdigit[ch & 0x0000000F]; 1916 continue; 1917 } 1918#endif 1919 /* Map UTF-16 surrogate pairs to Unicode \UXXXXXXXX escapes */ 1920 else if (ch >= 0xD800 && ch < 0xDC00) { 1921 Py_UNICODE ch2; 1922 Py_UCS4 ucs; 1923 1924 ch2 = *s++; 1925 size--; 1926 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) { 1927 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000; 1928 *p++ = '\\'; 1929 *p++ = 'U'; 1930 *p++ = hexdigit[(ucs >> 28) & 0x0000000F]; 1931 *p++ = hexdigit[(ucs >> 24) & 0x0000000F]; 1932 *p++ = hexdigit[(ucs >> 20) & 0x0000000F]; 1933 *p++ = hexdigit[(ucs >> 16) & 0x0000000F]; 1934 *p++ = hexdigit[(ucs >> 12) & 0x0000000F]; 1935 *p++ = hexdigit[(ucs >> 8) & 0x0000000F]; 1936 *p++ = hexdigit[(ucs >> 4) & 0x0000000F]; 1937 *p++ = hexdigit[ucs & 0x0000000F]; 1938 continue; 1939 } 1940 /* Fall through: isolated surrogates are copied as-is */ 1941 s--; 1942 size++; 1943 } 1944 1945 /* Map 16-bit characters to '\uxxxx' */ 1946 if (ch >= 256) { 1947 *p++ = '\\'; 1948 *p++ = 'u'; 1949 *p++ = hexdigit[(ch >> 12) & 0x000F]; 1950 *p++ = hexdigit[(ch >> 8) & 0x000F]; 1951 *p++ = hexdigit[(ch >> 4) & 0x000F]; 1952 *p++ = hexdigit[ch & 0x000F]; 1953 } 1954 1955 /* Map special whitespace to '\t', \n', '\r' */ 1956 else if (ch == '\t') { 1957 *p++ = '\\'; 1958 *p++ = 't'; 1959 } 1960 else if (ch == '\n') { 1961 *p++ = '\\'; 1962 *p++ = 'n'; 1963 } 1964 else if (ch == '\r') { 1965 *p++ = '\\'; 1966 *p++ = 'r'; 1967 } 1968 1969 /* Map non-printable US ASCII to '\xhh' */ 1970 else if (ch < ' ' || ch >= 0x7F) { 1971 *p++ = '\\'; 1972 *p++ = 'x'; 1973 *p++ = hexdigit[(ch >> 4) & 0x000F]; 1974 *p++ = hexdigit[ch & 0x000F]; 1975 } 1976 1977 /* Copy everything else as-is */ 1978 else 1979 *p++ = (char) ch; 1980 } 1981 if (quotes) 1982 *p++ = PyString_AS_STRING(repr)[1]; 1983 1984 *p = '\0'; 1985 _PyString_Resize(&repr, p - PyString_AS_STRING(repr)); 1986 return repr; 1987} 1988 1989PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s, 1990 int size) 1991{ 1992 return unicodeescape_string(s, size, 0); 1993} 1994 1995PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode) 1996{ 1997 if (!PyUnicode_Check(unicode)) { 1998 PyErr_BadArgument(); 1999 return NULL; 2000 } 2001 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode), 2002 PyUnicode_GET_SIZE(unicode)); 2003} 2004 2005/* --- Raw Unicode Escape Codec ------------------------------------------- */ 2006 2007PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s, 2008 int size, 2009 const char *errors) 2010{ 2011 const char *starts = s; 2012 int startinpos; 2013 int endinpos; 2014 int outpos; 2015 PyUnicodeObject *v; 2016 Py_UNICODE *p; 2017 const char *end; 2018 const char *bs; 2019 PyObject *errorHandler = NULL; 2020 PyObject *exc = NULL; 2021 2022 /* Escaped strings will always be longer than the resulting 2023 Unicode string, so we start with size here and then reduce the 2024 length after conversion to the true value. (But decoding error 2025 handler might have to resize the string) */ 2026 v = _PyUnicode_New(size); 2027 if (v == NULL) 2028 goto onError; 2029 if (size == 0) 2030 return (PyObject *)v; 2031 p = PyUnicode_AS_UNICODE(v); 2032 end = s + size; 2033 while (s < end) { 2034 unsigned char c; 2035 Py_UCS4 x; 2036 int i; 2037 2038 /* Non-escape characters are interpreted as Unicode ordinals */ 2039 if (*s != '\\') { 2040 *p++ = (unsigned char)*s++; 2041 continue; 2042 } 2043 startinpos = s-starts; 2044 2045 /* \u-escapes are only interpreted iff the number of leading 2046 backslashes if odd */ 2047 bs = s; 2048 for (;s < end;) { 2049 if (*s != '\\') 2050 break; 2051 *p++ = (unsigned char)*s++; 2052 } 2053 if (((s - bs) & 1) == 0 || 2054 s >= end || 2055 *s != 'u') { 2056 continue; 2057 } 2058 p--; 2059 s++; 2060 2061 /* \uXXXX with 4 hex digits */ 2062 outpos = p-PyUnicode_AS_UNICODE(v); 2063 for (x = 0, i = 0; i < 4; ++i, ++s) { 2064 c = (unsigned char)*s; 2065 if (!isxdigit(c)) { 2066 endinpos = s-starts; 2067 if (unicode_decode_call_errorhandler( 2068 errors, &errorHandler, 2069 "rawunicodeescape", "truncated \\uXXXX", 2070 starts, size, &startinpos, &endinpos, &exc, &s, 2071 (PyObject **)&v, &outpos, &p)) 2072 goto onError; 2073 goto nextByte; 2074 } 2075 x = (x<<4) & ~0xF; 2076 if (c >= '0' && c <= '9') 2077 x += c - '0'; 2078 else if (c >= 'a' && c <= 'f') 2079 x += 10 + c - 'a'; 2080 else 2081 x += 10 + c - 'A'; 2082 } 2083 *p++ = x; 2084 nextByte: 2085 ; 2086 } 2087 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v)))) 2088 goto onError; 2089 Py_XDECREF(errorHandler); 2090 Py_XDECREF(exc); 2091 return (PyObject *)v; 2092 2093 onError: 2094 Py_XDECREF(v); 2095 Py_XDECREF(errorHandler); 2096 Py_XDECREF(exc); 2097 return NULL; 2098} 2099 2100PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s, 2101 int size) 2102{ 2103 PyObject *repr; 2104 char *p; 2105 char *q; 2106 2107 static const char *hexdigit = "0123456789abcdef"; 2108 2109 repr = PyString_FromStringAndSize(NULL, 6 * size); 2110 if (repr == NULL) 2111 return NULL; 2112 if (size == 0) 2113 return repr; 2114 2115 p = q = PyString_AS_STRING(repr); 2116 while (size-- > 0) { 2117 Py_UNICODE ch = *s++; 2118 /* Map 16-bit characters to '\uxxxx' */ 2119 if (ch >= 256) { 2120 *p++ = '\\'; 2121 *p++ = 'u'; 2122 *p++ = hexdigit[(ch >> 12) & 0xf]; 2123 *p++ = hexdigit[(ch >> 8) & 0xf]; 2124 *p++ = hexdigit[(ch >> 4) & 0xf]; 2125 *p++ = hexdigit[ch & 15]; 2126 } 2127 /* Copy everything else as-is */ 2128 else 2129 *p++ = (char) ch; 2130 } 2131 *p = '\0'; 2132 _PyString_Resize(&repr, p - q); 2133 return repr; 2134} 2135 2136PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode) 2137{ 2138 if (!PyUnicode_Check(unicode)) { 2139 PyErr_BadArgument(); 2140 return NULL; 2141 } 2142 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode), 2143 PyUnicode_GET_SIZE(unicode)); 2144} 2145 2146/* --- Latin-1 Codec ------------------------------------------------------ */ 2147 2148PyObject *PyUnicode_DecodeLatin1(const char *s, 2149 int size, 2150 const char *errors) 2151{ 2152 PyUnicodeObject *v; 2153 Py_UNICODE *p; 2154 2155 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */ 2156 if (size == 1 && *(unsigned char*)s < 256) { 2157 Py_UNICODE r = *(unsigned char*)s; 2158 return PyUnicode_FromUnicode(&r, 1); 2159 } 2160 2161 v = _PyUnicode_New(size); 2162 if (v == NULL) 2163 goto onError; 2164 if (size == 0) 2165 return (PyObject *)v; 2166 p = PyUnicode_AS_UNICODE(v); 2167 while (size-- > 0) 2168 *p++ = (unsigned char)*s++; 2169 return (PyObject *)v; 2170 2171 onError: 2172 Py_XDECREF(v); 2173 return NULL; 2174} 2175 2176/* create or adjust a UnicodeEncodeError */ 2177static void make_encode_exception(PyObject **exceptionObject, 2178 const char *encoding, 2179 const Py_UNICODE *unicode, int size, 2180 int startpos, int endpos, 2181 const char *reason) 2182{ 2183 if (*exceptionObject == NULL) { 2184 *exceptionObject = PyUnicodeEncodeError_Create( 2185 encoding, unicode, size, startpos, endpos, reason); 2186 } 2187 else { 2188 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos)) 2189 goto onError; 2190 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos)) 2191 goto onError; 2192 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason)) 2193 goto onError; 2194 return; 2195 onError: 2196 Py_DECREF(*exceptionObject); 2197 *exceptionObject = NULL; 2198 } 2199} 2200 2201/* raises a UnicodeEncodeError */ 2202static void raise_encode_exception(PyObject **exceptionObject, 2203 const char *encoding, 2204 const Py_UNICODE *unicode, int size, 2205 int startpos, int endpos, 2206 const char *reason) 2207{ 2208 make_encode_exception(exceptionObject, 2209 encoding, unicode, size, startpos, endpos, reason); 2210 if (*exceptionObject != NULL) 2211 PyCodec_StrictErrors(*exceptionObject); 2212} 2213 2214/* error handling callback helper: 2215 build arguments, call the callback and check the arguments, 2216 put the result into newpos and return the replacement string, which 2217 has to be freed by the caller */ 2218static PyObject *unicode_encode_call_errorhandler(const char *errors, 2219 PyObject **errorHandler, 2220 const char *encoding, const char *reason, 2221 const Py_UNICODE *unicode, int size, PyObject **exceptionObject, 2222 int startpos, int endpos, 2223 int *newpos) 2224{ 2225 static char *argparse = "O!i;encoding error handler must return (unicode, int) tuple"; 2226 2227 PyObject *restuple; 2228 PyObject *resunicode; 2229 2230 if (*errorHandler == NULL) { 2231 *errorHandler = PyCodec_LookupError(errors); 2232 if (*errorHandler == NULL) 2233 return NULL; 2234 } 2235 2236 make_encode_exception(exceptionObject, 2237 encoding, unicode, size, startpos, endpos, reason); 2238 if (*exceptionObject == NULL) 2239 return NULL; 2240 2241 restuple = PyObject_CallFunctionObjArgs( 2242 *errorHandler, *exceptionObject, NULL); 2243 if (restuple == NULL) 2244 return NULL; 2245 if (!PyTuple_Check(restuple)) { 2246 PyErr_Format(PyExc_TypeError, &argparse[4]); 2247 Py_DECREF(restuple); 2248 return NULL; 2249 } 2250 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, 2251 &resunicode, newpos)) { 2252 Py_DECREF(restuple); 2253 return NULL; 2254 } 2255 if (*newpos<0) 2256 *newpos = 0; 2257 else if (*newpos>size) 2258 *newpos = size; 2259 Py_INCREF(resunicode); 2260 Py_DECREF(restuple); 2261 return resunicode; 2262} 2263 2264static PyObject *unicode_encode_ucs1(const Py_UNICODE *p, 2265 int size, 2266 const char *errors, 2267 int limit) 2268{ 2269 /* output object */ 2270 PyObject *res; 2271 /* pointers to the beginning and end+1 of input */ 2272 const Py_UNICODE *startp = p; 2273 const Py_UNICODE *endp = p + size; 2274 /* pointer to the beginning of the unencodable characters */ 2275 /* const Py_UNICODE *badp = NULL; */ 2276 /* pointer into the output */ 2277 char *str; 2278 /* current output position */ 2279 int respos = 0; 2280 int ressize; 2281 char *encoding = (limit == 256) ? "latin-1" : "ascii"; 2282 char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)"; 2283 PyObject *errorHandler = NULL; 2284 PyObject *exc = NULL; 2285 /* the following variable is used for caching string comparisons 2286 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */ 2287 int known_errorHandler = -1; 2288 2289 /* allocate enough for a simple encoding without 2290 replacements, if we need more, we'll resize */ 2291 res = PyString_FromStringAndSize(NULL, size); 2292 if (res == NULL) 2293 goto onError; 2294 if (size == 0) 2295 return res; 2296 str = PyString_AS_STRING(res); 2297 ressize = size; 2298 2299 while (p<endp) { 2300 Py_UNICODE c = *p; 2301 2302 /* can we encode this? */ 2303 if (c<limit) { 2304 /* no overflow check, because we know that the space is enough */ 2305 *str++ = (char)c; 2306 ++p; 2307 } 2308 else { 2309 int unicodepos = p-startp; 2310 int requiredsize; 2311 PyObject *repunicode; 2312 int repsize; 2313 int newpos; 2314 int respos; 2315 Py_UNICODE *uni2; 2316 /* startpos for collecting unencodable chars */ 2317 const Py_UNICODE *collstart = p; 2318 const Py_UNICODE *collend = p; 2319 /* find all unecodable characters */ 2320 while ((collend < endp) && ((*collend)>=limit)) 2321 ++collend; 2322 /* cache callback name lookup (if not done yet, i.e. it's the first error) */ 2323 if (known_errorHandler==-1) { 2324 if ((errors==NULL) || (!strcmp(errors, "strict"))) 2325 known_errorHandler = 1; 2326 else if (!strcmp(errors, "replace")) 2327 known_errorHandler = 2; 2328 else if (!strcmp(errors, "ignore")) 2329 known_errorHandler = 3; 2330 else if (!strcmp(errors, "xmlcharrefreplace")) 2331 known_errorHandler = 4; 2332 else 2333 known_errorHandler = 0; 2334 } 2335 switch (known_errorHandler) { 2336 case 1: /* strict */ 2337 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason); 2338 goto onError; 2339 case 2: /* replace */ 2340 while (collstart++<collend) 2341 *str++ = '?'; /* fall through */ 2342 case 3: /* ignore */ 2343 p = collend; 2344 break; 2345 case 4: /* xmlcharrefreplace */ 2346 respos = str-PyString_AS_STRING(res); 2347 /* determine replacement size (temporarily (mis)uses p) */ 2348 for (p = collstart, repsize = 0; p < collend; ++p) { 2349 if (*p<10) 2350 repsize += 2+1+1; 2351 else if (*p<100) 2352 repsize += 2+2+1; 2353 else if (*p<1000) 2354 repsize += 2+3+1; 2355 else if (*p<10000) 2356 repsize += 2+4+1; 2357 else if (*p<100000) 2358 repsize += 2+5+1; 2359 else if (*p<1000000) 2360 repsize += 2+6+1; 2361 else 2362 repsize += 2+7+1; 2363 } 2364 requiredsize = respos+repsize+(endp-collend); 2365 if (requiredsize > ressize) { 2366 if (requiredsize<2*ressize) 2367 requiredsize = 2*ressize; 2368 if (_PyString_Resize(&res, requiredsize)) 2369 goto onError; 2370 str = PyString_AS_STRING(res) + respos; 2371 ressize = requiredsize; 2372 } 2373 /* generate replacement (temporarily (mis)uses p) */ 2374 for (p = collstart; p < collend; ++p) { 2375 str += sprintf(str, "&#%d;", (int)*p); 2376 } 2377 p = collend; 2378 break; 2379 default: 2380 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler, 2381 encoding, reason, startp, size, &exc, 2382 collstart-startp, collend-startp, &newpos); 2383 if (repunicode == NULL) 2384 goto onError; 2385 /* need more space? (at least enough for what we 2386 have+the replacement+the rest of the string, so 2387 we won't have to check space for encodable characters) */ 2388 respos = str-PyString_AS_STRING(res); 2389 repsize = PyUnicode_GET_SIZE(repunicode); 2390 requiredsize = respos+repsize+(endp-collend); 2391 if (requiredsize > ressize) { 2392 if (requiredsize<2*ressize) 2393 requiredsize = 2*ressize; 2394 if (_PyString_Resize(&res, requiredsize)) { 2395 Py_DECREF(repunicode); 2396 goto onError; 2397 } 2398 str = PyString_AS_STRING(res) + respos; 2399 ressize = requiredsize; 2400 } 2401 /* check if there is anything unencodable in the replacement 2402 and copy it to the output */ 2403 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) { 2404 c = *uni2; 2405 if (c >= limit) { 2406 raise_encode_exception(&exc, encoding, startp, size, 2407 unicodepos, unicodepos+1, reason); 2408 Py_DECREF(repunicode); 2409 goto onError; 2410 } 2411 *str = (char)c; 2412 } 2413 p = startp + newpos; 2414 Py_DECREF(repunicode); 2415 } 2416 } 2417 } 2418 /* Resize if we allocated to much */ 2419 respos = str-PyString_AS_STRING(res); 2420 if (respos<ressize) 2421 /* If this falls res will be NULL */ 2422 _PyString_Resize(&res, respos); 2423 Py_XDECREF(errorHandler); 2424 Py_XDECREF(exc); 2425 return res; 2426 2427 onError: 2428 Py_XDECREF(res); 2429 Py_XDECREF(errorHandler); 2430 Py_XDECREF(exc); 2431 return NULL; 2432} 2433 2434PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p, 2435 int size, 2436 const char *errors) 2437{ 2438 return unicode_encode_ucs1(p, size, errors, 256); 2439} 2440 2441PyObject *PyUnicode_AsLatin1String(PyObject *unicode) 2442{ 2443 if (!PyUnicode_Check(unicode)) { 2444 PyErr_BadArgument(); 2445 return NULL; 2446 } 2447 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode), 2448 PyUnicode_GET_SIZE(unicode), 2449 NULL); 2450} 2451 2452/* --- 7-bit ASCII Codec -------------------------------------------------- */ 2453 2454PyObject *PyUnicode_DecodeASCII(const char *s, 2455 int size, 2456 const char *errors) 2457{ 2458 const char *starts = s; 2459 PyUnicodeObject *v; 2460 Py_UNICODE *p; 2461 int startinpos; 2462 int endinpos; 2463 int outpos; 2464 const char *e; 2465 PyObject *errorHandler = NULL; 2466 PyObject *exc = NULL; 2467 2468 /* ASCII is equivalent to the first 128 ordinals in Unicode. */ 2469 if (size == 1 && *(unsigned char*)s < 128) { 2470 Py_UNICODE r = *(unsigned char*)s; 2471 return PyUnicode_FromUnicode(&r, 1); 2472 } 2473 2474 v = _PyUnicode_New(size); 2475 if (v == NULL) 2476 goto onError; 2477 if (size == 0) 2478 return (PyObject *)v; 2479 p = PyUnicode_AS_UNICODE(v); 2480 e = s + size; 2481 while (s < e) { 2482 register unsigned char c = (unsigned char)*s; 2483 if (c < 128) { 2484 *p++ = c; 2485 ++s; 2486 } 2487 else { 2488 startinpos = s-starts; 2489 endinpos = startinpos + 1; 2490 outpos = p-PyUnicode_AS_UNICODE(v); 2491 if (unicode_decode_call_errorhandler( 2492 errors, &errorHandler, 2493 "ascii", "ordinal not in range(128)", 2494 starts, size, &startinpos, &endinpos, &exc, &s, 2495 (PyObject **)&v, &outpos, &p)) 2496 goto onError; 2497 } 2498 } 2499 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v)) 2500 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v)))) 2501 goto onError; 2502 Py_XDECREF(errorHandler); 2503 Py_XDECREF(exc); 2504 return (PyObject *)v; 2505 2506 onError: 2507 Py_XDECREF(v); 2508 Py_XDECREF(errorHandler); 2509 Py_XDECREF(exc); 2510 return NULL; 2511} 2512 2513PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p, 2514 int size, 2515 const char *errors) 2516{ 2517 return unicode_encode_ucs1(p, size, errors, 128); 2518} 2519 2520PyObject *PyUnicode_AsASCIIString(PyObject *unicode) 2521{ 2522 if (!PyUnicode_Check(unicode)) { 2523 PyErr_BadArgument(); 2524 return NULL; 2525 } 2526 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode), 2527 PyUnicode_GET_SIZE(unicode), 2528 NULL); 2529} 2530 2531#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T) 2532 2533/* --- MBCS codecs for Windows -------------------------------------------- */ 2534 2535PyObject *PyUnicode_DecodeMBCS(const char *s, 2536 int size, 2537 const char *errors) 2538{ 2539 PyUnicodeObject *v; 2540 Py_UNICODE *p; 2541 2542 /* First get the size of the result */ 2543 DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0); 2544 if (size > 0 && usize==0) 2545 return PyErr_SetFromWindowsErrWithFilename(0, NULL); 2546 2547 v = _PyUnicode_New(usize); 2548 if (v == NULL) 2549 return NULL; 2550 if (usize == 0) 2551 return (PyObject *)v; 2552 p = PyUnicode_AS_UNICODE(v); 2553 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) { 2554 Py_DECREF(v); 2555 return PyErr_SetFromWindowsErrWithFilename(0, NULL); 2556 } 2557 2558 return (PyObject *)v; 2559} 2560 2561PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p, 2562 int size, 2563 const char *errors) 2564{ 2565 PyObject *repr; 2566 char *s; 2567 DWORD mbcssize; 2568 2569 /* If there are no characters, bail now! */ 2570 if (size==0) 2571 return PyString_FromString(""); 2572 2573 /* First get the size of the result */ 2574 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL); 2575 if (mbcssize==0) 2576 return PyErr_SetFromWindowsErrWithFilename(0, NULL); 2577 2578 repr = PyString_FromStringAndSize(NULL, mbcssize); 2579 if (repr == NULL) 2580 return NULL; 2581 if (mbcssize == 0) 2582 return repr; 2583 2584 /* Do the conversion */ 2585 s = PyString_AS_STRING(repr); 2586 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) { 2587 Py_DECREF(repr); 2588 return PyErr_SetFromWindowsErrWithFilename(0, NULL); 2589 } 2590 return repr; 2591} 2592 2593#endif /* MS_WINDOWS */ 2594 2595/* --- Character Mapping Codec -------------------------------------------- */ 2596 2597PyObject *PyUnicode_DecodeCharmap(const char *s, 2598 int size, 2599 PyObject *mapping, 2600 const char *errors) 2601{ 2602 const char *starts = s; 2603 int startinpos; 2604 int endinpos; 2605 int outpos; 2606 const char *e; 2607 PyUnicodeObject *v; 2608 Py_UNICODE *p; 2609 int extrachars = 0; 2610 PyObject *errorHandler = NULL; 2611 PyObject *exc = NULL; 2612 2613 /* Default to Latin-1 */ 2614 if (mapping == NULL) 2615 return PyUnicode_DecodeLatin1(s, size, errors); 2616 2617 v = _PyUnicode_New(size); 2618 if (v == NULL) 2619 goto onError; 2620 if (size == 0) 2621 return (PyObject *)v; 2622 p = PyUnicode_AS_UNICODE(v); 2623 e = s + size; 2624 while (s < e) { 2625 unsigned char ch = *s; 2626 PyObject *w, *x; 2627 2628 /* Get mapping (char ordinal -> integer, Unicode char or None) */ 2629 w = PyInt_FromLong((long)ch); 2630 if (w == NULL) 2631 goto onError; 2632 x = PyObject_GetItem(mapping, w); 2633 Py_DECREF(w); 2634 if (x == NULL) { 2635 if (PyErr_ExceptionMatches(PyExc_LookupError)) { 2636 /* No mapping found means: mapping is undefined. */ 2637 PyErr_Clear(); 2638 x = Py_None; 2639 Py_INCREF(x); 2640 } else 2641 goto onError; 2642 } 2643 2644 /* Apply mapping */ 2645 if (PyInt_Check(x)) { 2646 long value = PyInt_AS_LONG(x); 2647 if (value < 0 || value > 65535) { 2648 PyErr_SetString(PyExc_TypeError, 2649 "character mapping must be in range(65536)"); 2650 Py_DECREF(x); 2651 goto onError; 2652 } 2653 *p++ = (Py_UNICODE)value; 2654 } 2655 else if (x == Py_None) { 2656 /* undefined mapping */ 2657 outpos = p-PyUnicode_AS_UNICODE(v); 2658 startinpos = s-starts; 2659 endinpos = startinpos+1; 2660 if (unicode_decode_call_errorhandler( 2661 errors, &errorHandler, 2662 "charmap", "character maps to <undefined>", 2663 starts, size, &startinpos, &endinpos, &exc, &s, 2664 (PyObject **)&v, &outpos, &p)) { 2665 Py_DECREF(x); 2666 goto onError; 2667 } 2668 continue; 2669 } 2670 else if (PyUnicode_Check(x)) { 2671 int targetsize = PyUnicode_GET_SIZE(x); 2672 2673 if (targetsize == 1) 2674 /* 1-1 mapping */ 2675 *p++ = *PyUnicode_AS_UNICODE(x); 2676 2677 else if (targetsize > 1) { 2678 /* 1-n mapping */ 2679 if (targetsize > extrachars) { 2680 /* resize first */ 2681 int oldpos = (int)(p - PyUnicode_AS_UNICODE(v)); 2682 int needed = (targetsize - extrachars) + \ 2683 (targetsize << 2); 2684 extrachars += needed; 2685 if (_PyUnicode_Resize(&v, 2686 PyUnicode_GET_SIZE(v) + needed)) { 2687 Py_DECREF(x); 2688 goto onError; 2689 } 2690 p = PyUnicode_AS_UNICODE(v) + oldpos; 2691 } 2692 Py_UNICODE_COPY(p, 2693 PyUnicode_AS_UNICODE(x), 2694 targetsize); 2695 p += targetsize; 2696 extrachars -= targetsize; 2697 } 2698 /* 1-0 mapping: skip the character */ 2699 } 2700 else { 2701 /* wrong return value */ 2702 PyErr_SetString(PyExc_TypeError, 2703 "character mapping must return integer, None or unicode"); 2704 Py_DECREF(x); 2705 goto onError; 2706 } 2707 Py_DECREF(x); 2708 ++s; 2709 } 2710 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v)) 2711 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v)))) 2712 goto onError; 2713 Py_XDECREF(errorHandler); 2714 Py_XDECREF(exc); 2715 return (PyObject *)v; 2716 2717 onError: 2718 Py_XDECREF(errorHandler); 2719 Py_XDECREF(exc); 2720 Py_XDECREF(v); 2721 return NULL; 2722} 2723 2724/* Lookup the character ch in the mapping. If the character 2725 can't be found, Py_None is returned (or NULL, if another 2726 error occured). */ 2727static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping) 2728{ 2729 PyObject *w = PyInt_FromLong((long)c); 2730 PyObject *x; 2731 2732 if (w == NULL) 2733 return NULL; 2734 x = PyObject_GetItem(mapping, w); 2735 Py_DECREF(w); 2736 if (x == NULL) { 2737 if (PyErr_ExceptionMatches(PyExc_LookupError)) { 2738 /* No mapping found means: mapping is undefined. */ 2739 PyErr_Clear(); 2740 x = Py_None; 2741 Py_INCREF(x); 2742 return x; 2743 } else 2744 return NULL; 2745 } 2746 else if (PyInt_Check(x)) { 2747 long value = PyInt_AS_LONG(x); 2748 if (value < 0 || value > 255) { 2749 PyErr_SetString(PyExc_TypeError, 2750 "character mapping must be in range(256)"); 2751 Py_DECREF(x); 2752 return NULL; 2753 } 2754 return x; 2755 } 2756 else if (PyString_Check(x)) 2757 return x; 2758 else { 2759 /* wrong return value */ 2760 PyErr_SetString(PyExc_TypeError, 2761 "character mapping must return integer, None or str"); 2762 Py_DECREF(x); 2763 return NULL; 2764 } 2765} 2766 2767/* lookup the character, put the result in the output string and adjust 2768 various state variables. Reallocate the output string if not enough 2769 space is available. Return a new reference to the object that 2770 was put in the output buffer, or Py_None, if the mapping was undefined 2771 (in which case no character was written) or NULL, if a 2772 reallocation error ocurred. The called must decref the result */ 2773static 2774PyObject *charmapencode_output(Py_UNICODE c, PyObject *mapping, 2775 PyObject **outobj, int *outpos) 2776{ 2777 PyObject *rep = charmapencode_lookup(c, mapping); 2778 2779 if (rep==NULL) 2780 return NULL; 2781 else if (rep==Py_None) 2782 return rep; 2783 else { 2784 char *outstart = PyString_AS_STRING(*outobj); 2785 int outsize = PyString_GET_SIZE(*outobj); 2786 if (PyInt_Check(rep)) { 2787 int requiredsize = *outpos+1; 2788 if (outsize<requiredsize) { 2789 /* exponentially overallocate to minimize reallocations */ 2790 if (requiredsize < 2*outsize) 2791 requiredsize = 2*outsize; 2792 if (_PyString_Resize(outobj, requiredsize)) { 2793 Py_DECREF(rep); 2794 return NULL; 2795 } 2796 outstart = PyString_AS_STRING(*outobj); 2797 } 2798 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep); 2799 } 2800 else { 2801 const char *repchars = PyString_AS_STRING(rep); 2802 int repsize = PyString_GET_SIZE(rep); 2803 int requiredsize = *outpos+repsize; 2804 if (outsize<requiredsize) { 2805 /* exponentially overallocate to minimize reallocations */ 2806 if (requiredsize < 2*outsize) 2807 requiredsize = 2*outsize; 2808 if (_PyString_Resize(outobj, requiredsize)) { 2809 Py_DECREF(rep); 2810 return NULL; 2811 } 2812 outstart = PyString_AS_STRING(*outobj); 2813 } 2814 memcpy(outstart + *outpos, repchars, repsize); 2815 *outpos += repsize; 2816 } 2817 } 2818 return rep; 2819} 2820 2821/* handle an error in PyUnicode_EncodeCharmap 2822 Return 0 on success, -1 on error */ 2823static 2824int charmap_encoding_error( 2825 const Py_UNICODE *p, int size, int *inpos, PyObject *mapping, 2826 PyObject **exceptionObject, 2827 int *known_errorHandler, PyObject *errorHandler, const char *errors, 2828 PyObject **res, int *respos) 2829{ 2830 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */ 2831 int repsize; 2832 int newpos; 2833 Py_UNICODE *uni2; 2834 /* startpos for collecting unencodable chars */ 2835 int collstartpos = *inpos; 2836 int collendpos = *inpos+1; 2837 int collpos; 2838 char *encoding = "charmap"; 2839 char *reason = "character maps to <undefined>"; 2840 2841 PyObject *x; 2842 /* find all unencodable characters */ 2843 while (collendpos < size) { 2844 x = charmapencode_lookup(p[collendpos], mapping); 2845 if (x==NULL) 2846 return -1; 2847 else if (x!=Py_None) { 2848 Py_DECREF(x); 2849 break; 2850 } 2851 Py_DECREF(x); 2852 ++collendpos; 2853 } 2854 /* cache callback name lookup 2855 * (if not done yet, i.e. it's the first error) */ 2856 if (*known_errorHandler==-1) { 2857 if ((errors==NULL) || (!strcmp(errors, "strict"))) 2858 *known_errorHandler = 1; 2859 else if (!strcmp(errors, "replace")) 2860 *known_errorHandler = 2; 2861 else if (!strcmp(errors, "ignore")) 2862 *known_errorHandler = 3; 2863 else if (!strcmp(errors, "xmlcharrefreplace")) 2864 *known_errorHandler = 4; 2865 else 2866 *known_errorHandler = 0; 2867 } 2868 switch (*known_errorHandler) { 2869 case 1: /* strict */ 2870 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason); 2871 return -1; 2872 case 2: /* replace */ 2873 for (collpos = collstartpos; collpos<collendpos; ++collpos) { 2874 x = charmapencode_output('?', mapping, res, respos); 2875 if (x==NULL) { 2876 return -1; 2877 } 2878 else if (x==Py_None) { 2879 Py_DECREF(x); 2880 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason); 2881 return -1; 2882 } 2883 Py_DECREF(x); 2884 } 2885 /* fall through */ 2886 case 3: /* ignore */ 2887 *inpos = collendpos; 2888 break; 2889 case 4: /* xmlcharrefreplace */ 2890 /* generate replacement (temporarily (mis)uses p) */ 2891 for (collpos = collstartpos; collpos < collendpos; ++collpos) { 2892 char buffer[2+29+1+1]; 2893 char *cp; 2894 sprintf(buffer, "&#%d;", (int)p[collpos]); 2895 for (cp = buffer; *cp; ++cp) { 2896 x = charmapencode_output(*cp, mapping, res, respos); 2897 if (x==NULL) 2898 return -1; 2899 else if (x==Py_None) { 2900 Py_DECREF(x); 2901 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason); 2902 return -1; 2903 } 2904 Py_DECREF(x); 2905 } 2906 } 2907 *inpos = collendpos; 2908 break; 2909 default: 2910 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler, 2911 encoding, reason, p, size, exceptionObject, 2912 collstartpos, collendpos, &newpos); 2913 if (repunicode == NULL) 2914 return -1; 2915 /* generate replacement */ 2916 repsize = PyUnicode_GET_SIZE(repunicode); 2917 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) { 2918 x = charmapencode_output(*uni2, mapping, res, respos); 2919 if (x==NULL) { 2920 Py_DECREF(repunicode); 2921 return -1; 2922 } 2923 else if (x==Py_None) { 2924 Py_DECREF(repunicode); 2925 Py_DECREF(x); 2926 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason); 2927 return -1; 2928 } 2929 Py_DECREF(x); 2930 } 2931 *inpos = newpos; 2932 Py_DECREF(repunicode); 2933 } 2934 return 0; 2935} 2936 2937PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p, 2938 int size, 2939 PyObject *mapping, 2940 const char *errors) 2941{ 2942 /* output object */ 2943 PyObject *res = NULL; 2944 /* current input position */ 2945 int inpos = 0; 2946 /* current output position */ 2947 int respos = 0; 2948 PyObject *errorHandler = NULL; 2949 PyObject *exc = NULL; 2950 /* the following variable is used for caching string comparisons 2951 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 2952 * 3=ignore, 4=xmlcharrefreplace */ 2953 int known_errorHandler = -1; 2954 2955 /* Default to Latin-1 */ 2956 if (mapping == NULL) 2957 return PyUnicode_EncodeLatin1(p, size, errors); 2958 2959 /* allocate enough for a simple encoding without 2960 replacements, if we need more, we'll resize */ 2961 res = PyString_FromStringAndSize(NULL, size); 2962 if (res == NULL) 2963 goto onError; 2964 if (size == 0) 2965 return res; 2966 2967 while (inpos<size) { 2968 /* try to encode it */ 2969 PyObject *x = charmapencode_output(p[inpos], mapping, &res, &respos); 2970 if (x==NULL) /* error */ 2971 goto onError; 2972 if (x==Py_None) { /* unencodable character */ 2973 if (charmap_encoding_error(p, size, &inpos, mapping, 2974 &exc, 2975 &known_errorHandler, errorHandler, errors, 2976 &res, &respos)) 2977 goto onError; 2978 } 2979 else 2980 /* done with this character => adjust input position */ 2981 ++inpos; 2982 Py_DECREF(x); 2983 } 2984 2985 /* Resize if we allocated to much */ 2986 if (respos<PyString_GET_SIZE(res)) { 2987 if (_PyString_Resize(&res, respos)) 2988 goto onError; 2989 } 2990 Py_XDECREF(exc); 2991 Py_XDECREF(errorHandler); 2992 return res; 2993 2994 onError: 2995 Py_XDECREF(res); 2996 Py_XDECREF(exc); 2997 Py_XDECREF(errorHandler); 2998 return NULL; 2999} 3000 3001PyObject *PyUnicode_AsCharmapString(PyObject *unicode, 3002 PyObject *mapping) 3003{ 3004 if (!PyUnicode_Check(unicode) || mapping == NULL) { 3005 PyErr_BadArgument(); 3006 return NULL; 3007 } 3008 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode), 3009 PyUnicode_GET_SIZE(unicode), 3010 mapping, 3011 NULL); 3012} 3013 3014/* create or adjust a UnicodeTranslateError */ 3015static void make_translate_exception(PyObject **exceptionObject, 3016 const Py_UNICODE *unicode, int size, 3017 int startpos, int endpos, 3018 const char *reason) 3019{ 3020 if (*exceptionObject == NULL) { 3021 *exceptionObject = PyUnicodeTranslateError_Create( 3022 unicode, size, startpos, endpos, reason); 3023 } 3024 else { 3025 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos)) 3026 goto onError; 3027 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos)) 3028 goto onError; 3029 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason)) 3030 goto onError; 3031 return; 3032 onError: 3033 Py_DECREF(*exceptionObject); 3034 *exceptionObject = NULL; 3035 } 3036} 3037 3038/* raises a UnicodeTranslateError */ 3039static void raise_translate_exception(PyObject **exceptionObject, 3040 const Py_UNICODE *unicode, int size, 3041 int startpos, int endpos, 3042 const char *reason) 3043{ 3044 make_translate_exception(exceptionObject, 3045 unicode, size, startpos, endpos, reason); 3046 if (*exceptionObject != NULL) 3047 PyCodec_StrictErrors(*exceptionObject); 3048} 3049 3050/* error handling callback helper: 3051 build arguments, call the callback and check the arguments, 3052 put the result into newpos and return the replacement string, which 3053 has to be freed by the caller */ 3054static PyObject *unicode_translate_call_errorhandler(const char *errors, 3055 PyObject **errorHandler, 3056 const char *reason, 3057 const Py_UNICODE *unicode, int size, PyObject **exceptionObject, 3058 int startpos, int endpos, 3059 int *newpos) 3060{ 3061 static char *argparse = "O!i;translating error handler must return (unicode, int) tuple"; 3062 3063 PyObject *restuple; 3064 PyObject *resunicode; 3065 3066 if (*errorHandler == NULL) { 3067 *errorHandler = PyCodec_LookupError(errors); 3068 if (*errorHandler == NULL) 3069 return NULL; 3070 } 3071 3072 make_translate_exception(exceptionObject, 3073 unicode, size, startpos, endpos, reason); 3074 if (*exceptionObject == NULL) 3075 return NULL; 3076 3077 restuple = PyObject_CallFunctionObjArgs( 3078 *errorHandler, *exceptionObject, NULL); 3079 if (restuple == NULL) 3080 return NULL; 3081 if (!PyTuple_Check(restuple)) { 3082 PyErr_Format(PyExc_TypeError, &argparse[4]); 3083 Py_DECREF(restuple); 3084 return NULL; 3085 } 3086 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, 3087 &resunicode, newpos)) { 3088 Py_DECREF(restuple); 3089 return NULL; 3090 } 3091 if (*newpos<0) 3092 *newpos = 0; 3093 else if (*newpos>size) 3094 *newpos = size; 3095 Py_INCREF(resunicode); 3096 Py_DECREF(restuple); 3097 return resunicode; 3098} 3099 3100/* Lookup the character ch in the mapping and put the result in result, 3101 which must be decrefed by the caller. 3102 Return 0 on success, -1 on error */ 3103static 3104int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result) 3105{ 3106 PyObject *w = PyInt_FromLong((long)c); 3107 PyObject *x; 3108 3109 if (w == NULL) 3110 return -1; 3111 x = PyObject_GetItem(mapping, w); 3112 Py_DECREF(w); 3113 if (x == NULL) { 3114 if (PyErr_ExceptionMatches(PyExc_LookupError)) { 3115 /* No mapping found means: use 1:1 mapping. */ 3116 PyErr_Clear(); 3117 *result = NULL; 3118 return 0; 3119 } else 3120 return -1; 3121 } 3122 else if (x == Py_None) { 3123 *result = x; 3124 return 0; 3125 } 3126 else if (PyInt_Check(x)) { 3127 long value = PyInt_AS_LONG(x); 3128 long max = PyUnicode_GetMax(); 3129 if (value < 0 || value > max) { 3130 PyErr_Format(PyExc_TypeError, 3131 "character mapping must be in range(0x%lx)", max+1); 3132 Py_DECREF(x); 3133 return -1; 3134 } 3135 *result = x; 3136 return 0; 3137 } 3138 else if (PyUnicode_Check(x)) { 3139 *result = x; 3140 return 0; 3141 } 3142 else { 3143 /* wrong return value */ 3144 PyErr_SetString(PyExc_TypeError, 3145 "character mapping must return integer, None or unicode"); 3146 return -1; 3147 } 3148} 3149/* ensure that *outobj is at least requiredsize characters long, 3150if not reallocate and adjust various state variables. 3151Return 0 on success, -1 on error */ 3152static 3153int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp, int *outsize, 3154 int requiredsize) 3155{ 3156 if (requiredsize > *outsize) { 3157 /* remember old output position */ 3158 int outpos = *outp-PyUnicode_AS_UNICODE(*outobj); 3159 /* exponentially overallocate to minimize reallocations */ 3160 if (requiredsize < 2 * *outsize) 3161 requiredsize = 2 * *outsize; 3162 if (_PyUnicode_Resize(outobj, requiredsize)) 3163 return -1; 3164 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos; 3165 *outsize = requiredsize; 3166 } 3167 return 0; 3168} 3169/* lookup the character, put the result in the output string and adjust 3170 various state variables. Return a new reference to the object that 3171 was put in the output buffer in *result, or Py_None, if the mapping was 3172 undefined (in which case no character was written). 3173 The called must decref result. 3174 Return 0 on success, -1 on error. */ 3175static 3176int charmaptranslate_output(Py_UNICODE c, PyObject *mapping, 3177 PyObject **outobj, int *outsize, Py_UNICODE **outp, PyObject **res) 3178{ 3179 if (charmaptranslate_lookup(c, mapping, res)) 3180 return -1; 3181 if (*res==NULL) { 3182 /* not found => default to 1:1 mapping */ 3183 *(*outp)++ = (Py_UNICODE)c; 3184 } 3185 else if (*res==Py_None) 3186 ; 3187 else if (PyInt_Check(*res)) { 3188 /* no overflow check, because we know that the space is enough */ 3189 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res); 3190 } 3191 else if (PyUnicode_Check(*res)) { 3192 int repsize = PyUnicode_GET_SIZE(*res); 3193 if (repsize==1) { 3194 /* no overflow check, because we know that the space is enough */ 3195 *(*outp)++ = *PyUnicode_AS_UNICODE(*res); 3196 } 3197 else if (repsize!=0) { 3198 /* more than one character */ 3199 int requiredsize = *outsize + repsize - 1; 3200 if (charmaptranslate_makespace(outobj, outp, outsize, requiredsize)) 3201 return -1; 3202 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize); 3203 *outp += repsize; 3204 } 3205 } 3206 else 3207 return -1; 3208 return 0; 3209} 3210 3211PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p, 3212 int size, 3213 PyObject *mapping, 3214 const char *errors) 3215{ 3216 /* output object */ 3217 PyObject *res = NULL; 3218 /* pointers to the beginning and end+1 of input */ 3219 const Py_UNICODE *startp = p; 3220 const Py_UNICODE *endp = p + size; 3221 /* pointer into the output */ 3222 Py_UNICODE *str; 3223 /* current output position */ 3224 int respos = 0; 3225 int ressize; 3226 char *reason = "character maps to <undefined>"; 3227 PyObject *errorHandler = NULL; 3228 PyObject *exc = NULL; 3229 /* the following variable is used for caching string comparisons 3230 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3231 * 3=ignore, 4=xmlcharrefreplace */ 3232 int known_errorHandler = -1; 3233 3234 if (mapping == NULL) { 3235 PyErr_BadArgument(); 3236 return NULL; 3237 } 3238 3239 /* allocate enough for a simple 1:1 translation without 3240 replacements, if we need more, we'll resize */ 3241 res = PyUnicode_FromUnicode(NULL, size); 3242 if (res == NULL) 3243 goto onError; 3244 if (size == 0) 3245 return res; 3246 str = PyUnicode_AS_UNICODE(res); 3247 ressize = size; 3248 3249 while (p<endp) { 3250 /* try to encode it */ 3251 PyObject *x = NULL; 3252 if (charmaptranslate_output(*p, mapping, &res, &ressize, &str, &x)) { 3253 Py_XDECREF(x); 3254 goto onError; 3255 } 3256 if (x!=Py_None) /* it worked => adjust input pointer */ 3257 ++p; 3258 else { /* untranslatable character */ 3259 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */ 3260 int repsize; 3261 int newpos; 3262 Py_UNICODE *uni2; 3263 /* startpos for collecting untranslatable chars */ 3264 const Py_UNICODE *collstart = p; 3265 const Py_UNICODE *collend = p+1; 3266 const Py_UNICODE *coll; 3267 3268 Py_XDECREF(x); 3269 /* find all untranslatable characters */ 3270 while (collend < endp) { 3271 if (charmaptranslate_lookup(*collend, mapping, &x)) 3272 goto onError; 3273 Py_XDECREF(x); 3274 if (x!=Py_None) 3275 break; 3276 ++collend; 3277 } 3278 /* cache callback name lookup 3279 * (if not done yet, i.e. it's the first error) */ 3280 if (known_errorHandler==-1) { 3281 if ((errors==NULL) || (!strcmp(errors, "strict"))) 3282 known_errorHandler = 1; 3283 else if (!strcmp(errors, "replace")) 3284 known_errorHandler = 2; 3285 else if (!strcmp(errors, "ignore")) 3286 known_errorHandler = 3; 3287 else if (!strcmp(errors, "xmlcharrefreplace")) 3288 known_errorHandler = 4; 3289 else 3290 known_errorHandler = 0; 3291 } 3292 switch (known_errorHandler) { 3293 case 1: /* strict */ 3294 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason); 3295 goto onError; 3296 case 2: /* replace */ 3297 /* No need to check for space, this is a 1:1 replacement */ 3298 for (coll = collstart; coll<collend; ++coll) 3299 *str++ = '?'; 3300 /* fall through */ 3301 case 3: /* ignore */ 3302 p = collend; 3303 break; 3304 case 4: /* xmlcharrefreplace */ 3305 /* generate replacement (temporarily (mis)uses p) */ 3306 for (p = collstart; p < collend; ++p) { 3307 char buffer[2+29+1+1]; 3308 char *cp; 3309 sprintf(buffer, "&#%d;", (int)*p); 3310 if (charmaptranslate_makespace(&res, &str, &ressize, 3311 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend))) 3312 goto onError; 3313 for (cp = buffer; *cp; ++cp) 3314 *str++ = *cp; 3315 } 3316 p = collend; 3317 break; 3318 default: 3319 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler, 3320 reason, startp, size, &exc, 3321 collstart-startp, collend-startp, &newpos); 3322 if (repunicode == NULL) 3323 goto onError; 3324 /* generate replacement */ 3325 repsize = PyUnicode_GET_SIZE(repunicode); 3326 if (charmaptranslate_makespace(&res, &str, &ressize, 3327 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) { 3328 Py_DECREF(repunicode); 3329 goto onError; 3330 } 3331 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) 3332 *str++ = *uni2; 3333 p = startp + newpos; 3334 Py_DECREF(repunicode); 3335 } 3336 } 3337 } 3338 /* Resize if we allocated to much */ 3339 respos = str-PyUnicode_AS_UNICODE(res); 3340 if (respos<ressize) { 3341 if (_PyUnicode_Resize(&res, respos)) 3342 goto onError; 3343 } 3344 Py_XDECREF(exc); 3345 Py_XDECREF(errorHandler); 3346 return res; 3347 3348 onError: 3349 Py_XDECREF(res); 3350 Py_XDECREF(exc); 3351 Py_XDECREF(errorHandler); 3352 return NULL; 3353} 3354 3355PyObject *PyUnicode_Translate(PyObject *str, 3356 PyObject *mapping, 3357 const char *errors) 3358{ 3359 PyObject *result; 3360 3361 str = PyUnicode_FromObject(str); 3362 if (str == NULL) 3363 goto onError; 3364 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str), 3365 PyUnicode_GET_SIZE(str), 3366 mapping, 3367 errors); 3368 Py_DECREF(str); 3369 return result; 3370 3371 onError: 3372 Py_XDECREF(str); 3373 return NULL; 3374} 3375 3376/* --- Decimal Encoder ---------------------------------------------------- */ 3377 3378int PyUnicode_EncodeDecimal(Py_UNICODE *s, 3379 int length, 3380 char *output, 3381 const char *errors) 3382{ 3383 Py_UNICODE *p, *end; 3384 PyObject *errorHandler = NULL; 3385 PyObject *exc = NULL; 3386 const char *encoding = "decimal"; 3387 const char *reason = "invalid decimal Unicode string"; 3388 /* the following variable is used for caching string comparisons 3389 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */ 3390 int known_errorHandler = -1; 3391 3392 if (output == NULL) { 3393 PyErr_BadArgument(); 3394 return -1; 3395 } 3396 3397 p = s; 3398 end = s + length; 3399 while (p < end) { 3400 register Py_UNICODE ch = *p; 3401 int decimal; 3402 PyObject *repunicode; 3403 int repsize; 3404 int newpos; 3405 Py_UNICODE *uni2; 3406 Py_UNICODE *collstart; 3407 Py_UNICODE *collend; 3408 3409 if (Py_UNICODE_ISSPACE(ch)) { 3410 *output++ = ' '; 3411 ++p; 3412 continue; 3413 } 3414 decimal = Py_UNICODE_TODECIMAL(ch); 3415 if (decimal >= 0) { 3416 *output++ = '0' + decimal; 3417 ++p; 3418 continue; 3419 } 3420 if (0 < ch && ch < 256) { 3421 *output++ = (char)ch; 3422 ++p; 3423 continue; 3424 } 3425 /* All other characters are considered unencodable */ 3426 collstart = p; 3427 collend = p+1; 3428 while (collend < end) { 3429 if ((0 < *collend && *collend < 256) || 3430 !Py_UNICODE_ISSPACE(*collend) || 3431 Py_UNICODE_TODECIMAL(*collend)) 3432 break; 3433 } 3434 /* cache callback name lookup 3435 * (if not done yet, i.e. it's the first error) */ 3436 if (known_errorHandler==-1) { 3437 if ((errors==NULL) || (!strcmp(errors, "strict"))) 3438 known_errorHandler = 1; 3439 else if (!strcmp(errors, "replace")) 3440 known_errorHandler = 2; 3441 else if (!strcmp(errors, "ignore")) 3442 known_errorHandler = 3; 3443 else if (!strcmp(errors, "xmlcharrefreplace")) 3444 known_errorHandler = 4; 3445 else 3446 known_errorHandler = 0; 3447 } 3448 switch (known_errorHandler) { 3449 case 1: /* strict */ 3450 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason); 3451 goto onError; 3452 case 2: /* replace */ 3453 for (p = collstart; p < collend; ++p) 3454 *output++ = '?'; 3455 /* fall through */ 3456 case 3: /* ignore */ 3457 p = collend; 3458 break; 3459 case 4: /* xmlcharrefreplace */ 3460 /* generate replacement (temporarily (mis)uses p) */ 3461 for (p = collstart; p < collend; ++p) 3462 output += sprintf(output, "&#%d;", (int)*p); 3463 p = collend; 3464 break; 3465 default: 3466 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler, 3467 encoding, reason, s, length, &exc, 3468 collstart-s, collend-s, &newpos); 3469 if (repunicode == NULL) 3470 goto onError; 3471 /* generate replacement */ 3472 repsize = PyUnicode_GET_SIZE(repunicode); 3473 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) { 3474 Py_UNICODE ch = *uni2; 3475 if (Py_UNICODE_ISSPACE(ch)) 3476 *output++ = ' '; 3477 else { 3478 decimal = Py_UNICODE_TODECIMAL(ch); 3479 if (decimal >= 0) 3480 *output++ = '0' + decimal; 3481 else if (0 < ch && ch < 256) 3482 *output++ = (char)ch; 3483 else { 3484 Py_DECREF(repunicode); 3485 raise_encode_exception(&exc, encoding, 3486 s, length, collstart-s, collend-s, reason); 3487 goto onError; 3488 } 3489 } 3490 } 3491 p = s + newpos; 3492 Py_DECREF(repunicode); 3493 } 3494 } 3495 /* 0-terminate the output string */ 3496 *output++ = '\0'; 3497 Py_XDECREF(exc); 3498 Py_XDECREF(errorHandler); 3499 return 0; 3500 3501 onError: 3502 Py_XDECREF(exc); 3503 Py_XDECREF(errorHandler); 3504 return -1; 3505} 3506 3507/* --- Helpers ------------------------------------------------------------ */ 3508 3509static 3510int count(PyUnicodeObject *self, 3511 int start, 3512 int end, 3513 PyUnicodeObject *substring) 3514{ 3515 int count = 0; 3516 3517 if (start < 0) 3518 start += self->length; 3519 if (start < 0) 3520 start = 0; 3521 if (end > self->length) 3522 end = self->length; 3523 if (end < 0) 3524 end += self->length; 3525 if (end < 0) 3526 end = 0; 3527 3528 if (substring->length == 0) 3529 return (end - start + 1); 3530 3531 end -= substring->length; 3532 3533 while (start <= end) 3534 if (Py_UNICODE_MATCH(self, start, substring)) { 3535 count++; 3536 start += substring->length; 3537 } else 3538 start++; 3539 3540 return count; 3541} 3542 3543int PyUnicode_Count(PyObject *str, 3544 PyObject *substr, 3545 int start, 3546 int end) 3547{ 3548 int result; 3549 3550 str = PyUnicode_FromObject(str); 3551 if (str == NULL) 3552 return -1; 3553 substr = PyUnicode_FromObject(substr); 3554 if (substr == NULL) { 3555 Py_DECREF(str); 3556 return -1; 3557 } 3558 3559 result = count((PyUnicodeObject *)str, 3560 start, end, 3561 (PyUnicodeObject *)substr); 3562 3563 Py_DECREF(str); 3564 Py_DECREF(substr); 3565 return result; 3566} 3567 3568static 3569int findstring(PyUnicodeObject *self, 3570 PyUnicodeObject *substring, 3571 int start, 3572 int end, 3573 int direction) 3574{ 3575 if (start < 0) 3576 start += self->length; 3577 if (start < 0) 3578 start = 0; 3579 3580 if (end > self->length) 3581 end = self->length; 3582 if (end < 0) 3583 end += self->length; 3584 if (end < 0) 3585 end = 0; 3586 3587 if (substring->length == 0) 3588 return (direction > 0) ? start : end; 3589 3590 end -= substring->length; 3591 3592 if (direction < 0) { 3593 for (; end >= start; end--) 3594 if (Py_UNICODE_MATCH(self, end, substring)) 3595 return end; 3596 } else { 3597 for (; start <= end; start++) 3598 if (Py_UNICODE_MATCH(self, start, substring)) 3599 return start; 3600 } 3601 3602 return -1; 3603} 3604 3605int PyUnicode_Find(PyObject *str, 3606 PyObject *substr, 3607 int start, 3608 int end, 3609 int direction) 3610{ 3611 int result; 3612 3613 str = PyUnicode_FromObject(str); 3614 if (str == NULL) 3615 return -2; 3616 substr = PyUnicode_FromObject(substr); 3617 if (substr == NULL) { 3618 Py_DECREF(str); 3619 return -2; 3620 } 3621 3622 result = findstring((PyUnicodeObject *)str, 3623 (PyUnicodeObject *)substr, 3624 start, end, direction); 3625 Py_DECREF(str); 3626 Py_DECREF(substr); 3627 return result; 3628} 3629 3630static 3631int tailmatch(PyUnicodeObject *self, 3632 PyUnicodeObject *substring, 3633 int start, 3634 int end, 3635 int direction) 3636{ 3637 if (start < 0) 3638 start += self->length; 3639 if (start < 0) 3640 start = 0; 3641 3642 if (substring->length == 0) 3643 return 1; 3644 3645 if (end > self->length) 3646 end = self->length; 3647 if (end < 0) 3648 end += self->length; 3649 if (end < 0) 3650 end = 0; 3651 3652 end -= substring->length; 3653 if (end < start) 3654 return 0; 3655 3656 if (direction > 0) { 3657 if (Py_UNICODE_MATCH(self, end, substring)) 3658 return 1; 3659 } else { 3660 if (Py_UNICODE_MATCH(self, start, substring)) 3661 return 1; 3662 } 3663 3664 return 0; 3665} 3666 3667int PyUnicode_Tailmatch(PyObject *str, 3668 PyObject *substr, 3669 int start, 3670 int end, 3671 int direction) 3672{ 3673 int result; 3674 3675 str = PyUnicode_FromObject(str); 3676 if (str == NULL) 3677 return -1; 3678 substr = PyUnicode_FromObject(substr); 3679 if (substr == NULL) { 3680 Py_DECREF(substr); 3681 return -1; 3682 } 3683 3684 result = tailmatch((PyUnicodeObject *)str, 3685 (PyUnicodeObject *)substr, 3686 start, end, direction); 3687 Py_DECREF(str); 3688 Py_DECREF(substr); 3689 return result; 3690} 3691 3692static 3693const Py_UNICODE *findchar(const Py_UNICODE *s, 3694 int size, 3695 Py_UNICODE ch) 3696{ 3697 /* like wcschr, but doesn't stop at NULL characters */ 3698 3699 while (size-- > 0) { 3700 if (*s == ch) 3701 return s; 3702 s++; 3703 } 3704 3705 return NULL; 3706} 3707 3708/* Apply fixfct filter to the Unicode object self and return a 3709 reference to the modified object */ 3710 3711static 3712PyObject *fixup(PyUnicodeObject *self, 3713 int (*fixfct)(PyUnicodeObject *s)) 3714{ 3715 3716 PyUnicodeObject *u; 3717 3718 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length); 3719 if (u == NULL) 3720 return NULL; 3721 3722 Py_UNICODE_COPY(u->str, self->str, self->length); 3723 3724 if (!fixfct(u) && PyUnicode_CheckExact(self)) { 3725 /* fixfct should return TRUE if it modified the buffer. If 3726 FALSE, return a reference to the original buffer instead 3727 (to save space, not time) */ 3728 Py_INCREF(self); 3729 Py_DECREF(u); 3730 return (PyObject*) self; 3731 } 3732 return (PyObject*) u; 3733} 3734 3735static 3736int fixupper(PyUnicodeObject *self) 3737{ 3738 int len = self->length; 3739 Py_UNICODE *s = self->str; 3740 int status = 0; 3741 3742 while (len-- > 0) { 3743 register Py_UNICODE ch; 3744 3745 ch = Py_UNICODE_TOUPPER(*s); 3746 if (ch != *s) { 3747 status = 1; 3748 *s = ch; 3749 } 3750 s++; 3751 } 3752 3753 return status; 3754} 3755 3756static 3757int fixlower(PyUnicodeObject *self) 3758{ 3759 int len = self->length; 3760 Py_UNICODE *s = self->str; 3761 int status = 0; 3762 3763 while (len-- > 0) { 3764 register Py_UNICODE ch; 3765 3766 ch = Py_UNICODE_TOLOWER(*s); 3767 if (ch != *s) { 3768 status = 1; 3769 *s = ch; 3770 } 3771 s++; 3772 } 3773 3774 return status; 3775} 3776 3777static 3778int fixswapcase(PyUnicodeObject *self) 3779{ 3780 int len = self->length; 3781 Py_UNICODE *s = self->str; 3782 int status = 0; 3783 3784 while (len-- > 0) { 3785 if (Py_UNICODE_ISUPPER(*s)) { 3786 *s = Py_UNICODE_TOLOWER(*s); 3787 status = 1; 3788 } else if (Py_UNICODE_ISLOWER(*s)) { 3789 *s = Py_UNICODE_TOUPPER(*s); 3790 status = 1; 3791 } 3792 s++; 3793 } 3794 3795 return status; 3796} 3797 3798static 3799int fixcapitalize(PyUnicodeObject *self) 3800{ 3801 int len = self->length; 3802 Py_UNICODE *s = self->str; 3803 int status = 0; 3804 3805 if (len == 0) 3806 return 0; 3807 if (Py_UNICODE_ISLOWER(*s)) { 3808 *s = Py_UNICODE_TOUPPER(*s); 3809 status = 1; 3810 } 3811 s++; 3812 while (--len > 0) { 3813 if (Py_UNICODE_ISUPPER(*s)) { 3814 *s = Py_UNICODE_TOLOWER(*s); 3815 status = 1; 3816 } 3817 s++; 3818 } 3819 return status; 3820} 3821 3822static 3823int fixtitle(PyUnicodeObject *self) 3824{ 3825 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 3826 register Py_UNICODE *e; 3827 int previous_is_cased; 3828 3829 /* Shortcut for single character strings */ 3830 if (PyUnicode_GET_SIZE(self) == 1) { 3831 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p); 3832 if (*p != ch) { 3833 *p = ch; 3834 return 1; 3835 } 3836 else 3837 return 0; 3838 } 3839 3840 e = p + PyUnicode_GET_SIZE(self); 3841 previous_is_cased = 0; 3842 for (; p < e; p++) { 3843 register const Py_UNICODE ch = *p; 3844 3845 if (previous_is_cased) 3846 *p = Py_UNICODE_TOLOWER(ch); 3847 else 3848 *p = Py_UNICODE_TOTITLE(ch); 3849 3850 if (Py_UNICODE_ISLOWER(ch) || 3851 Py_UNICODE_ISUPPER(ch) || 3852 Py_UNICODE_ISTITLE(ch)) 3853 previous_is_cased = 1; 3854 else 3855 previous_is_cased = 0; 3856 } 3857 return 1; 3858} 3859 3860PyObject *PyUnicode_Join(PyObject *separator, 3861 PyObject *seq) 3862{ 3863 Py_UNICODE *sep; 3864 int seplen; 3865 PyUnicodeObject *res = NULL; 3866 int reslen = 0; 3867 Py_UNICODE *p; 3868 int sz = 100; 3869 int i; 3870 PyObject *it; 3871 3872 it = PyObject_GetIter(seq); 3873 if (it == NULL) 3874 return NULL; 3875 3876 if (separator == NULL) { 3877 Py_UNICODE blank = ' '; 3878 sep = ␣ 3879 seplen = 1; 3880 } 3881 else { 3882 separator = PyUnicode_FromObject(separator); 3883 if (separator == NULL) 3884 goto onError; 3885 sep = PyUnicode_AS_UNICODE(separator); 3886 seplen = PyUnicode_GET_SIZE(separator); 3887 } 3888 3889 res = _PyUnicode_New(sz); 3890 if (res == NULL) 3891 goto onError; 3892 p = PyUnicode_AS_UNICODE(res); 3893 reslen = 0; 3894 3895 for (i = 0; ; ++i) { 3896 int itemlen; 3897 PyObject *item = PyIter_Next(it); 3898 if (item == NULL) { 3899 if (PyErr_Occurred()) 3900 goto onError; 3901 break; 3902 } 3903 if (!PyUnicode_Check(item)) { 3904 PyObject *v; 3905 if (!PyString_Check(item)) { 3906 PyErr_Format(PyExc_TypeError, 3907 "sequence item %i: expected string or Unicode," 3908 " %.80s found", 3909 i, item->ob_type->tp_name); 3910 Py_DECREF(item); 3911 goto onError; 3912 } 3913 v = PyUnicode_FromObject(item); 3914 Py_DECREF(item); 3915 item = v; 3916 if (item == NULL) 3917 goto onError; 3918 } 3919 itemlen = PyUnicode_GET_SIZE(item); 3920 while (reslen + itemlen + seplen >= sz) { 3921 if (_PyUnicode_Resize(&res, sz*2)) { 3922 Py_DECREF(item); 3923 goto onError; 3924 } 3925 sz *= 2; 3926 p = PyUnicode_AS_UNICODE(res) + reslen; 3927 } 3928 if (i > 0) { 3929 Py_UNICODE_COPY(p, sep, seplen); 3930 p += seplen; 3931 reslen += seplen; 3932 } 3933 Py_UNICODE_COPY(p, PyUnicode_AS_UNICODE(item), itemlen); 3934 p += itemlen; 3935 reslen += itemlen; 3936 Py_DECREF(item); 3937 } 3938 if (_PyUnicode_Resize(&res, reslen)) 3939 goto onError; 3940 3941 Py_XDECREF(separator); 3942 Py_DECREF(it); 3943 return (PyObject *)res; 3944 3945 onError: 3946 Py_XDECREF(separator); 3947 Py_XDECREF(res); 3948 Py_DECREF(it); 3949 return NULL; 3950} 3951 3952static 3953PyUnicodeObject *pad(PyUnicodeObject *self, 3954 int left, 3955 int right, 3956 Py_UNICODE fill) 3957{ 3958 PyUnicodeObject *u; 3959 3960 if (left < 0) 3961 left = 0; 3962 if (right < 0) 3963 right = 0; 3964 3965 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) { 3966 Py_INCREF(self); 3967 return self; 3968 } 3969 3970 u = _PyUnicode_New(left + self->length + right); 3971 if (u) { 3972 if (left) 3973 Py_UNICODE_FILL(u->str, fill, left); 3974 Py_UNICODE_COPY(u->str + left, self->str, self->length); 3975 if (right) 3976 Py_UNICODE_FILL(u->str + left + self->length, fill, right); 3977 } 3978 3979 return u; 3980} 3981 3982#define SPLIT_APPEND(data, left, right) \ 3983 str = PyUnicode_FromUnicode(data + left, right - left); \ 3984 if (!str) \ 3985 goto onError; \ 3986 if (PyList_Append(list, str)) { \ 3987 Py_DECREF(str); \ 3988 goto onError; \ 3989 } \ 3990 else \ 3991 Py_DECREF(str); 3992 3993static 3994PyObject *split_whitespace(PyUnicodeObject *self, 3995 PyObject *list, 3996 int maxcount) 3997{ 3998 register int i; 3999 register int j; 4000 int len = self->length; 4001 PyObject *str; 4002 4003 for (i = j = 0; i < len; ) { 4004 /* find a token */ 4005 while (i < len && Py_UNICODE_ISSPACE(self->str[i])) 4006 i++; 4007 j = i; 4008 while (i < len && !Py_UNICODE_ISSPACE(self->str[i])) 4009 i++; 4010 if (j < i) { 4011 if (maxcount-- <= 0) 4012 break; 4013 SPLIT_APPEND(self->str, j, i); 4014 while (i < len && Py_UNICODE_ISSPACE(self->str[i])) 4015 i++; 4016 j = i; 4017 } 4018 } 4019 if (j < len) { 4020 SPLIT_APPEND(self->str, j, len); 4021 } 4022 return list; 4023 4024 onError: 4025 Py_DECREF(list); 4026 return NULL; 4027} 4028 4029PyObject *PyUnicode_Splitlines(PyObject *string, 4030 int keepends) 4031{ 4032 register int i; 4033 register int j; 4034 int len; 4035 PyObject *list; 4036 PyObject *str; 4037 Py_UNICODE *data; 4038 4039 string = PyUnicode_FromObject(string); 4040 if (string == NULL) 4041 return NULL; 4042 data = PyUnicode_AS_UNICODE(string); 4043 len = PyUnicode_GET_SIZE(string); 4044 4045 list = PyList_New(0); 4046 if (!list) 4047 goto onError; 4048 4049 for (i = j = 0; i < len; ) { 4050 int eol; 4051 4052 /* Find a line and append it */ 4053 while (i < len && !Py_UNICODE_ISLINEBREAK(data[i])) 4054 i++; 4055 4056 /* Skip the line break reading CRLF as one line break */ 4057 eol = i; 4058 if (i < len) { 4059 if (data[i] == '\r' && i + 1 < len && 4060 data[i+1] == '\n') 4061 i += 2; 4062 else 4063 i++; 4064 if (keepends) 4065 eol = i; 4066 } 4067 SPLIT_APPEND(data, j, eol); 4068 j = i; 4069 } 4070 if (j < len) { 4071 SPLIT_APPEND(data, j, len); 4072 } 4073 4074 Py_DECREF(string); 4075 return list; 4076 4077 onError: 4078 Py_DECREF(list); 4079 Py_DECREF(string); 4080 return NULL; 4081} 4082 4083static 4084PyObject *split_char(PyUnicodeObject *self, 4085 PyObject *list, 4086 Py_UNICODE ch, 4087 int maxcount) 4088{ 4089 register int i; 4090 register int j; 4091 int len = self->length; 4092 PyObject *str; 4093 4094 for (i = j = 0; i < len; ) { 4095 if (self->str[i] == ch) { 4096 if (maxcount-- <= 0) 4097 break; 4098 SPLIT_APPEND(self->str, j, i); 4099 i = j = i + 1; 4100 } else 4101 i++; 4102 } 4103 if (j <= len) { 4104 SPLIT_APPEND(self->str, j, len); 4105 } 4106 return list; 4107 4108 onError: 4109 Py_DECREF(list); 4110 return NULL; 4111} 4112 4113static 4114PyObject *split_substring(PyUnicodeObject *self, 4115 PyObject *list, 4116 PyUnicodeObject *substring, 4117 int maxcount) 4118{ 4119 register int i; 4120 register int j; 4121 int len = self->length; 4122 int sublen = substring->length; 4123 PyObject *str; 4124 4125 for (i = j = 0; i <= len - sublen; ) { 4126 if (Py_UNICODE_MATCH(self, i, substring)) { 4127 if (maxcount-- <= 0) 4128 break; 4129 SPLIT_APPEND(self->str, j, i); 4130 i = j = i + sublen; 4131 } else 4132 i++; 4133 } 4134 if (j <= len) { 4135 SPLIT_APPEND(self->str, j, len); 4136 } 4137 return list; 4138 4139 onError: 4140 Py_DECREF(list); 4141 return NULL; 4142} 4143 4144#undef SPLIT_APPEND 4145 4146static 4147PyObject *split(PyUnicodeObject *self, 4148 PyUnicodeObject *substring, 4149 int maxcount) 4150{ 4151 PyObject *list; 4152 4153 if (maxcount < 0) 4154 maxcount = INT_MAX; 4155 4156 list = PyList_New(0); 4157 if (!list) 4158 return NULL; 4159 4160 if (substring == NULL) 4161 return split_whitespace(self,list,maxcount); 4162 4163 else if (substring->length == 1) 4164 return split_char(self,list,substring->str[0],maxcount); 4165 4166 else if (substring->length == 0) { 4167 Py_DECREF(list); 4168 PyErr_SetString(PyExc_ValueError, "empty separator"); 4169 return NULL; 4170 } 4171 else 4172 return split_substring(self,list,substring,maxcount); 4173} 4174 4175static 4176PyObject *replace(PyUnicodeObject *self, 4177 PyUnicodeObject *str1, 4178 PyUnicodeObject *str2, 4179 int maxcount) 4180{ 4181 PyUnicodeObject *u; 4182 4183 if (maxcount < 0) 4184 maxcount = INT_MAX; 4185 4186 if (str1->length == 1 && str2->length == 1) { 4187 int i; 4188 4189 /* replace characters */ 4190 if (!findchar(self->str, self->length, str1->str[0]) && 4191 PyUnicode_CheckExact(self)) { 4192 /* nothing to replace, return original string */ 4193 Py_INCREF(self); 4194 u = self; 4195 } else { 4196 Py_UNICODE u1 = str1->str[0]; 4197 Py_UNICODE u2 = str2->str[0]; 4198 4199 u = (PyUnicodeObject*) PyUnicode_FromUnicode( 4200 NULL, 4201 self->length 4202 ); 4203 if (u != NULL) { 4204 Py_UNICODE_COPY(u->str, self->str, 4205 self->length); 4206 for (i = 0; i < u->length; i++) 4207 if (u->str[i] == u1) { 4208 if (--maxcount < 0) 4209 break; 4210 u->str[i] = u2; 4211 } 4212 } 4213 } 4214 4215 } else { 4216 int n, i; 4217 Py_UNICODE *p; 4218 4219 /* replace strings */ 4220 n = count(self, 0, self->length, str1); 4221 if (n > maxcount) 4222 n = maxcount; 4223 if (n == 0) { 4224 /* nothing to replace, return original string */ 4225 if (PyUnicode_CheckExact(self)) { 4226 Py_INCREF(self); 4227 u = self; 4228 } 4229 else { 4230 u = (PyUnicodeObject *) 4231 PyUnicode_FromUnicode(self->str, self->length); 4232 } 4233 } else { 4234 u = _PyUnicode_New( 4235 self->length + n * (str2->length - str1->length)); 4236 if (u) { 4237 i = 0; 4238 p = u->str; 4239 if (str1->length > 0) { 4240 while (i <= self->length - str1->length) 4241 if (Py_UNICODE_MATCH(self, i, str1)) { 4242 /* replace string segment */ 4243 Py_UNICODE_COPY(p, str2->str, str2->length); 4244 p += str2->length; 4245 i += str1->length; 4246 if (--n <= 0) { 4247 /* copy remaining part */ 4248 Py_UNICODE_COPY(p, self->str+i, self->length-i); 4249 break; 4250 } 4251 } else 4252 *p++ = self->str[i++]; 4253 } else { 4254 while (n > 0) { 4255 Py_UNICODE_COPY(p, str2->str, str2->length); 4256 p += str2->length; 4257 if (--n <= 0) 4258 break; 4259 *p++ = self->str[i++]; 4260 } 4261 Py_UNICODE_COPY(p, self->str+i, self->length-i); 4262 } 4263 } 4264 } 4265 } 4266 4267 return (PyObject *) u; 4268} 4269 4270/* --- Unicode Object Methods --------------------------------------------- */ 4271 4272PyDoc_STRVAR(title__doc__, 4273"S.title() -> unicode\n\ 4274\n\ 4275Return a titlecased version of S, i.e. words start with title case\n\ 4276characters, all remaining cased characters have lower case."); 4277 4278static PyObject* 4279unicode_title(PyUnicodeObject *self) 4280{ 4281 return fixup(self, fixtitle); 4282} 4283 4284PyDoc_STRVAR(capitalize__doc__, 4285"S.capitalize() -> unicode\n\ 4286\n\ 4287Return a capitalized version of S, i.e. make the first character\n\ 4288have upper case."); 4289 4290static PyObject* 4291unicode_capitalize(PyUnicodeObject *self) 4292{ 4293 return fixup(self, fixcapitalize); 4294} 4295 4296#if 0 4297PyDoc_STRVAR(capwords__doc__, 4298"S.capwords() -> unicode\n\ 4299\n\ 4300Apply .capitalize() to all words in S and return the result with\n\ 4301normalized whitespace (all whitespace strings are replaced by ' ')."); 4302 4303static PyObject* 4304unicode_capwords(PyUnicodeObject *self) 4305{ 4306 PyObject *list; 4307 PyObject *item; 4308 int i; 4309 4310 /* Split into words */ 4311 list = split(self, NULL, -1); 4312 if (!list) 4313 return NULL; 4314 4315 /* Capitalize each word */ 4316 for (i = 0; i < PyList_GET_SIZE(list); i++) { 4317 item = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, i), 4318 fixcapitalize); 4319 if (item == NULL) 4320 goto onError; 4321 Py_DECREF(PyList_GET_ITEM(list, i)); 4322 PyList_SET_ITEM(list, i, item); 4323 } 4324 4325 /* Join the words to form a new string */ 4326 item = PyUnicode_Join(NULL, list); 4327 4328onError: 4329 Py_DECREF(list); 4330 return (PyObject *)item; 4331} 4332#endif 4333 4334PyDoc_STRVAR(center__doc__, 4335"S.center(width) -> unicode\n\ 4336\n\ 4337Return S centered in a Unicode string of length width. Padding is done\n\ 4338using spaces."); 4339 4340static PyObject * 4341unicode_center(PyUnicodeObject *self, PyObject *args) 4342{ 4343 int marg, left; 4344 int width; 4345 4346 if (!PyArg_ParseTuple(args, "i:center", &width)) 4347 return NULL; 4348 4349 if (self->length >= width && PyUnicode_CheckExact(self)) { 4350 Py_INCREF(self); 4351 return (PyObject*) self; 4352 } 4353 4354 marg = width - self->length; 4355 left = marg / 2 + (marg & width & 1); 4356 4357 return (PyObject*) pad(self, left, marg - left, ' '); 4358} 4359 4360#if 0 4361 4362/* This code should go into some future Unicode collation support 4363 module. The basic comparison should compare ordinals on a naive 4364 basis (this is what Java does and thus JPython too). */ 4365 4366/* speedy UTF-16 code point order comparison */ 4367/* gleaned from: */ 4368/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */ 4369 4370static short utf16Fixup[32] = 4371{ 4372 0, 0, 0, 0, 0, 0, 0, 0, 4373 0, 0, 0, 0, 0, 0, 0, 0, 4374 0, 0, 0, 0, 0, 0, 0, 0, 4375 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800 4376}; 4377 4378static int 4379unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2) 4380{ 4381 int len1, len2; 4382 4383 Py_UNICODE *s1 = str1->str; 4384 Py_UNICODE *s2 = str2->str; 4385 4386 len1 = str1->length; 4387 len2 = str2->length; 4388 4389 while (len1 > 0 && len2 > 0) { 4390 Py_UNICODE c1, c2; 4391 4392 c1 = *s1++; 4393 c2 = *s2++; 4394 4395 if (c1 > (1<<11) * 26) 4396 c1 += utf16Fixup[c1>>11]; 4397 if (c2 > (1<<11) * 26) 4398 c2 += utf16Fixup[c2>>11]; 4399 /* now c1 and c2 are in UTF-32-compatible order */ 4400 4401 if (c1 != c2) 4402 return (c1 < c2) ? -1 : 1; 4403 4404 len1--; len2--; 4405 } 4406 4407 return (len1 < len2) ? -1 : (len1 != len2); 4408} 4409 4410#else 4411 4412static int 4413unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2) 4414{ 4415 register int len1, len2; 4416 4417 Py_UNICODE *s1 = str1->str; 4418 Py_UNICODE *s2 = str2->str; 4419 4420 len1 = str1->length; 4421 len2 = str2->length; 4422 4423 while (len1 > 0 && len2 > 0) { 4424 Py_UNICODE c1, c2; 4425 4426 c1 = *s1++; 4427 c2 = *s2++; 4428 4429 if (c1 != c2) 4430 return (c1 < c2) ? -1 : 1; 4431 4432 len1--; len2--; 4433 } 4434 4435 return (len1 < len2) ? -1 : (len1 != len2); 4436} 4437 4438#endif 4439 4440int PyUnicode_Compare(PyObject *left, 4441 PyObject *right) 4442{ 4443 PyUnicodeObject *u = NULL, *v = NULL; 4444 int result; 4445 4446 /* Coerce the two arguments */ 4447 u = (PyUnicodeObject *)PyUnicode_FromObject(left); 4448 if (u == NULL) 4449 goto onError; 4450 v = (PyUnicodeObject *)PyUnicode_FromObject(right); 4451 if (v == NULL) 4452 goto onError; 4453 4454 /* Shortcut for empty or interned objects */ 4455 if (v == u) { 4456 Py_DECREF(u); 4457 Py_DECREF(v); 4458 return 0; 4459 } 4460 4461 result = unicode_compare(u, v); 4462 4463 Py_DECREF(u); 4464 Py_DECREF(v); 4465 return result; 4466 4467onError: 4468 Py_XDECREF(u); 4469 Py_XDECREF(v); 4470 return -1; 4471} 4472 4473int PyUnicode_Contains(PyObject *container, 4474 PyObject *element) 4475{ 4476 PyUnicodeObject *u = NULL, *v = NULL; 4477 int result, size; 4478 register const Py_UNICODE *lhs, *end, *rhs; 4479 4480 /* Coerce the two arguments */ 4481 v = (PyUnicodeObject *)PyUnicode_FromObject(element); 4482 if (v == NULL) { 4483 PyErr_SetString(PyExc_TypeError, 4484 "'in <string>' requires string as left operand"); 4485 goto onError; 4486 } 4487 u = (PyUnicodeObject *)PyUnicode_FromObject(container); 4488 if (u == NULL) 4489 goto onError; 4490 4491 size = PyUnicode_GET_SIZE(v); 4492 rhs = PyUnicode_AS_UNICODE(v); 4493 lhs = PyUnicode_AS_UNICODE(u); 4494 4495 result = 0; 4496 if (size == 1) { 4497 end = lhs + PyUnicode_GET_SIZE(u); 4498 while (lhs < end) { 4499 if (*lhs++ == *rhs) { 4500 result = 1; 4501 break; 4502 } 4503 } 4504 } 4505 else { 4506 end = lhs + (PyUnicode_GET_SIZE(u) - size); 4507 while (lhs <= end) { 4508 if (memcmp(lhs++, rhs, size * sizeof(Py_UNICODE)) == 0) { 4509 result = 1; 4510 break; 4511 } 4512 } 4513 } 4514 4515 Py_DECREF(u); 4516 Py_DECREF(v); 4517 return result; 4518 4519onError: 4520 Py_XDECREF(u); 4521 Py_XDECREF(v); 4522 return -1; 4523} 4524 4525/* Concat to string or Unicode object giving a new Unicode object. */ 4526 4527PyObject *PyUnicode_Concat(PyObject *left, 4528 PyObject *right) 4529{ 4530 PyUnicodeObject *u = NULL, *v = NULL, *w; 4531 4532 /* Coerce the two arguments */ 4533 u = (PyUnicodeObject *)PyUnicode_FromObject(left); 4534 if (u == NULL) 4535 goto onError; 4536 v = (PyUnicodeObject *)PyUnicode_FromObject(right); 4537 if (v == NULL) 4538 goto onError; 4539 4540 /* Shortcuts */ 4541 if (v == unicode_empty) { 4542 Py_DECREF(v); 4543 return (PyObject *)u; 4544 } 4545 if (u == unicode_empty) { 4546 Py_DECREF(u); 4547 return (PyObject *)v; 4548 } 4549 4550 /* Concat the two Unicode strings */ 4551 w = _PyUnicode_New(u->length + v->length); 4552 if (w == NULL) 4553 goto onError; 4554 Py_UNICODE_COPY(w->str, u->str, u->length); 4555 Py_UNICODE_COPY(w->str + u->length, v->str, v->length); 4556 4557 Py_DECREF(u); 4558 Py_DECREF(v); 4559 return (PyObject *)w; 4560 4561onError: 4562 Py_XDECREF(u); 4563 Py_XDECREF(v); 4564 return NULL; 4565} 4566 4567PyDoc_STRVAR(count__doc__, 4568"S.count(sub[, start[, end]]) -> int\n\ 4569\n\ 4570Return the number of occurrences of substring sub in Unicode string\n\ 4571S[start:end]. Optional arguments start and end are\n\ 4572interpreted as in slice notation."); 4573 4574static PyObject * 4575unicode_count(PyUnicodeObject *self, PyObject *args) 4576{ 4577 PyUnicodeObject *substring; 4578 int start = 0; 4579 int end = INT_MAX; 4580 PyObject *result; 4581 4582 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring, 4583 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end)) 4584 return NULL; 4585 4586 substring = (PyUnicodeObject *)PyUnicode_FromObject( 4587 (PyObject *)substring); 4588 if (substring == NULL) 4589 return NULL; 4590 4591 if (start < 0) 4592 start += self->length; 4593 if (start < 0) 4594 start = 0; 4595 if (end > self->length) 4596 end = self->length; 4597 if (end < 0) 4598 end += self->length; 4599 if (end < 0) 4600 end = 0; 4601 4602 result = PyInt_FromLong((long) count(self, start, end, substring)); 4603 4604 Py_DECREF(substring); 4605 return result; 4606} 4607 4608PyDoc_STRVAR(encode__doc__, 4609"S.encode([encoding[,errors]]) -> string\n\ 4610\n\ 4611Return an encoded string version of S. Default encoding is the current\n\ 4612default string encoding. errors may be given to set a different error\n\ 4613handling scheme. Default is 'strict' meaning that encoding errors raise\n\ 4614a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\ 4615'xmlcharrefreplace' as well as any other name registered with\n\ 4616codecs.register_error that can handle UnicodeEncodeErrors."); 4617 4618static PyObject * 4619unicode_encode(PyUnicodeObject *self, PyObject *args) 4620{ 4621 char *encoding = NULL; 4622 char *errors = NULL; 4623 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors)) 4624 return NULL; 4625 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors); 4626} 4627 4628PyDoc_STRVAR(expandtabs__doc__, 4629"S.expandtabs([tabsize]) -> unicode\n\ 4630\n\ 4631Return a copy of S where all tab characters are expanded using spaces.\n\ 4632If tabsize is not given, a tab size of 8 characters is assumed."); 4633 4634static PyObject* 4635unicode_expandtabs(PyUnicodeObject *self, PyObject *args) 4636{ 4637 Py_UNICODE *e; 4638 Py_UNICODE *p; 4639 Py_UNICODE *q; 4640 int i, j; 4641 PyUnicodeObject *u; 4642 int tabsize = 8; 4643 4644 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize)) 4645 return NULL; 4646 4647 /* First pass: determine size of output string */ 4648 i = j = 0; 4649 e = self->str + self->length; 4650 for (p = self->str; p < e; p++) 4651 if (*p == '\t') { 4652 if (tabsize > 0) 4653 j += tabsize - (j % tabsize); 4654 } 4655 else { 4656 j++; 4657 if (*p == '\n' || *p == '\r') { 4658 i += j; 4659 j = 0; 4660 } 4661 } 4662 4663 /* Second pass: create output string and fill it */ 4664 u = _PyUnicode_New(i + j); 4665 if (!u) 4666 return NULL; 4667 4668 j = 0; 4669 q = u->str; 4670 4671 for (p = self->str; p < e; p++) 4672 if (*p == '\t') { 4673 if (tabsize > 0) { 4674 i = tabsize - (j % tabsize); 4675 j += i; 4676 while (i--) 4677 *q++ = ' '; 4678 } 4679 } 4680 else { 4681 j++; 4682 *q++ = *p; 4683 if (*p == '\n' || *p == '\r') 4684 j = 0; 4685 } 4686 4687 return (PyObject*) u; 4688} 4689 4690PyDoc_STRVAR(find__doc__, 4691"S.find(sub [,start [,end]]) -> int\n\ 4692\n\ 4693Return the lowest index in S where substring sub is found,\n\ 4694such that sub is contained within s[start,end]. Optional\n\ 4695arguments start and end are interpreted as in slice notation.\n\ 4696\n\ 4697Return -1 on failure."); 4698 4699static PyObject * 4700unicode_find(PyUnicodeObject *self, PyObject *args) 4701{ 4702 PyUnicodeObject *substring; 4703 int start = 0; 4704 int end = INT_MAX; 4705 PyObject *result; 4706 4707 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring, 4708 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end)) 4709 return NULL; 4710 substring = (PyUnicodeObject *)PyUnicode_FromObject( 4711 (PyObject *)substring); 4712 if (substring == NULL) 4713 return NULL; 4714 4715 result = PyInt_FromLong(findstring(self, substring, start, end, 1)); 4716 4717 Py_DECREF(substring); 4718 return result; 4719} 4720 4721static PyObject * 4722unicode_getitem(PyUnicodeObject *self, int index) 4723{ 4724 if (index < 0 || index >= self->length) { 4725 PyErr_SetString(PyExc_IndexError, "string index out of range"); 4726 return NULL; 4727 } 4728 4729 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1); 4730} 4731 4732static long 4733unicode_hash(PyUnicodeObject *self) 4734{ 4735 /* Since Unicode objects compare equal to their ASCII string 4736 counterparts, they should use the individual character values 4737 as basis for their hash value. This is needed to assure that 4738 strings and Unicode objects behave in the same way as 4739 dictionary keys. */ 4740 4741 register int len; 4742 register Py_UNICODE *p; 4743 register long x; 4744 4745 if (self->hash != -1) 4746 return self->hash; 4747 len = PyUnicode_GET_SIZE(self); 4748 p = PyUnicode_AS_UNICODE(self); 4749 x = *p << 7; 4750 while (--len >= 0) 4751 x = (1000003*x) ^ *p++; 4752 x ^= PyUnicode_GET_SIZE(self); 4753 if (x == -1) 4754 x = -2; 4755 self->hash = x; 4756 return x; 4757} 4758 4759PyDoc_STRVAR(index__doc__, 4760"S.index(sub [,start [,end]]) -> int\n\ 4761\n\ 4762Like S.find() but raise ValueError when the substring is not found."); 4763 4764static PyObject * 4765unicode_index(PyUnicodeObject *self, PyObject *args) 4766{ 4767 int result; 4768 PyUnicodeObject *substring; 4769 int start = 0; 4770 int end = INT_MAX; 4771 4772 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring, 4773 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end)) 4774 return NULL; 4775 4776 substring = (PyUnicodeObject *)PyUnicode_FromObject( 4777 (PyObject *)substring); 4778 if (substring == NULL) 4779 return NULL; 4780 4781 result = findstring(self, substring, start, end, 1); 4782 4783 Py_DECREF(substring); 4784 if (result < 0) { 4785 PyErr_SetString(PyExc_ValueError, "substring not found"); 4786 return NULL; 4787 } 4788 return PyInt_FromLong(result); 4789} 4790 4791PyDoc_STRVAR(islower__doc__, 4792"S.islower() -> bool\n\ 4793\n\ 4794Return True if all cased characters in S are lowercase and there is\n\ 4795at least one cased character in S, False otherwise."); 4796 4797static PyObject* 4798unicode_islower(PyUnicodeObject *self) 4799{ 4800 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 4801 register const Py_UNICODE *e; 4802 int cased; 4803 4804 /* Shortcut for single character strings */ 4805 if (PyUnicode_GET_SIZE(self) == 1) 4806 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p)); 4807 4808 /* Special case for empty strings */ 4809 if (PyString_GET_SIZE(self) == 0) 4810 return PyBool_FromLong(0); 4811 4812 e = p + PyUnicode_GET_SIZE(self); 4813 cased = 0; 4814 for (; p < e; p++) { 4815 register const Py_UNICODE ch = *p; 4816 4817 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) 4818 return PyBool_FromLong(0); 4819 else if (!cased && Py_UNICODE_ISLOWER(ch)) 4820 cased = 1; 4821 } 4822 return PyBool_FromLong(cased); 4823} 4824 4825PyDoc_STRVAR(isupper__doc__, 4826"S.isupper() -> bool\n\ 4827\n\ 4828Return True if all cased characters in S are uppercase and there is\n\ 4829at least one cased character in S, False otherwise."); 4830 4831static PyObject* 4832unicode_isupper(PyUnicodeObject *self) 4833{ 4834 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 4835 register const Py_UNICODE *e; 4836 int cased; 4837 4838 /* Shortcut for single character strings */ 4839 if (PyUnicode_GET_SIZE(self) == 1) 4840 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0); 4841 4842 /* Special case for empty strings */ 4843 if (PyString_GET_SIZE(self) == 0) 4844 return PyBool_FromLong(0); 4845 4846 e = p + PyUnicode_GET_SIZE(self); 4847 cased = 0; 4848 for (; p < e; p++) { 4849 register const Py_UNICODE ch = *p; 4850 4851 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch)) 4852 return PyBool_FromLong(0); 4853 else if (!cased && Py_UNICODE_ISUPPER(ch)) 4854 cased = 1; 4855 } 4856 return PyBool_FromLong(cased); 4857} 4858 4859PyDoc_STRVAR(istitle__doc__, 4860"S.istitle() -> bool\n\ 4861\n\ 4862Return True if S is a titlecased string, i.e. upper- and titlecase\n\ 4863characters may only follow uncased characters and lowercase characters\n\ 4864only cased ones. Return False otherwise."); 4865 4866static PyObject* 4867unicode_istitle(PyUnicodeObject *self) 4868{ 4869 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 4870 register const Py_UNICODE *e; 4871 int cased, previous_is_cased; 4872 4873 /* Shortcut for single character strings */ 4874 if (PyUnicode_GET_SIZE(self) == 1) 4875 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) || 4876 (Py_UNICODE_ISUPPER(*p) != 0)); 4877 4878 /* Special case for empty strings */ 4879 if (PyString_GET_SIZE(self) == 0) 4880 return PyBool_FromLong(0); 4881 4882 e = p + PyUnicode_GET_SIZE(self); 4883 cased = 0; 4884 previous_is_cased = 0; 4885 for (; p < e; p++) { 4886 register const Py_UNICODE ch = *p; 4887 4888 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) { 4889 if (previous_is_cased) 4890 return PyBool_FromLong(0); 4891 previous_is_cased = 1; 4892 cased = 1; 4893 } 4894 else if (Py_UNICODE_ISLOWER(ch)) { 4895 if (!previous_is_cased) 4896 return PyBool_FromLong(0); 4897 previous_is_cased = 1; 4898 cased = 1; 4899 } 4900 else 4901 previous_is_cased = 0; 4902 } 4903 return PyBool_FromLong(cased); 4904} 4905 4906PyDoc_STRVAR(isspace__doc__, 4907"S.isspace() -> bool\n\ 4908\n\ 4909Return True if there are only whitespace characters in S,\n\ 4910False otherwise."); 4911 4912static PyObject* 4913unicode_isspace(PyUnicodeObject *self) 4914{ 4915 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 4916 register const Py_UNICODE *e; 4917 4918 /* Shortcut for single character strings */ 4919 if (PyUnicode_GET_SIZE(self) == 1 && 4920 Py_UNICODE_ISSPACE(*p)) 4921 return PyBool_FromLong(1); 4922 4923 /* Special case for empty strings */ 4924 if (PyString_GET_SIZE(self) == 0) 4925 return PyBool_FromLong(0); 4926 4927 e = p + PyUnicode_GET_SIZE(self); 4928 for (; p < e; p++) { 4929 if (!Py_UNICODE_ISSPACE(*p)) 4930 return PyBool_FromLong(0); 4931 } 4932 return PyBool_FromLong(1); 4933} 4934 4935PyDoc_STRVAR(isalpha__doc__, 4936"S.isalpha() -> bool\n\ 4937\n\ 4938Return True if all characters in S are alphabetic\n\ 4939and there is at least one character in S, False otherwise."); 4940 4941static PyObject* 4942unicode_isalpha(PyUnicodeObject *self) 4943{ 4944 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 4945 register const Py_UNICODE *e; 4946 4947 /* Shortcut for single character strings */ 4948 if (PyUnicode_GET_SIZE(self) == 1 && 4949 Py_UNICODE_ISALPHA(*p)) 4950 return PyBool_FromLong(1); 4951 4952 /* Special case for empty strings */ 4953 if (PyString_GET_SIZE(self) == 0) 4954 return PyBool_FromLong(0); 4955 4956 e = p + PyUnicode_GET_SIZE(self); 4957 for (; p < e; p++) { 4958 if (!Py_UNICODE_ISALPHA(*p)) 4959 return PyBool_FromLong(0); 4960 } 4961 return PyBool_FromLong(1); 4962} 4963 4964PyDoc_STRVAR(isalnum__doc__, 4965"S.isalnum() -> bool\n\ 4966\n\ 4967Return True if all characters in S are alphanumeric\n\ 4968and there is at least one character in S, False otherwise."); 4969 4970static PyObject* 4971unicode_isalnum(PyUnicodeObject *self) 4972{ 4973 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 4974 register const Py_UNICODE *e; 4975 4976 /* Shortcut for single character strings */ 4977 if (PyUnicode_GET_SIZE(self) == 1 && 4978 Py_UNICODE_ISALNUM(*p)) 4979 return PyBool_FromLong(1); 4980 4981 /* Special case for empty strings */ 4982 if (PyString_GET_SIZE(self) == 0) 4983 return PyBool_FromLong(0); 4984 4985 e = p + PyUnicode_GET_SIZE(self); 4986 for (; p < e; p++) { 4987 if (!Py_UNICODE_ISALNUM(*p)) 4988 return PyBool_FromLong(0); 4989 } 4990 return PyBool_FromLong(1); 4991} 4992 4993PyDoc_STRVAR(isdecimal__doc__, 4994"S.isdecimal() -> bool\n\ 4995\n\ 4996Return True if there are only decimal characters in S,\n\ 4997False otherwise."); 4998 4999static PyObject* 5000unicode_isdecimal(PyUnicodeObject *self) 5001{ 5002 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 5003 register const Py_UNICODE *e; 5004 5005 /* Shortcut for single character strings */ 5006 if (PyUnicode_GET_SIZE(self) == 1 && 5007 Py_UNICODE_ISDECIMAL(*p)) 5008 return PyBool_FromLong(1); 5009 5010 /* Special case for empty strings */ 5011 if (PyString_GET_SIZE(self) == 0) 5012 return PyBool_FromLong(0); 5013 5014 e = p + PyUnicode_GET_SIZE(self); 5015 for (; p < e; p++) { 5016 if (!Py_UNICODE_ISDECIMAL(*p)) 5017 return PyBool_FromLong(0); 5018 } 5019 return PyBool_FromLong(1); 5020} 5021 5022PyDoc_STRVAR(isdigit__doc__, 5023"S.isdigit() -> bool\n\ 5024\n\ 5025Return True if there are only digit characters in S,\n\ 5026False otherwise."); 5027 5028static PyObject* 5029unicode_isdigit(PyUnicodeObject *self) 5030{ 5031 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 5032 register const Py_UNICODE *e; 5033 5034 /* Shortcut for single character strings */ 5035 if (PyUnicode_GET_SIZE(self) == 1 && 5036 Py_UNICODE_ISDIGIT(*p)) 5037 return PyBool_FromLong(1); 5038 5039 /* Special case for empty strings */ 5040 if (PyString_GET_SIZE(self) == 0) 5041 return PyBool_FromLong(0); 5042 5043 e = p + PyUnicode_GET_SIZE(self); 5044 for (; p < e; p++) { 5045 if (!Py_UNICODE_ISDIGIT(*p)) 5046 return PyBool_FromLong(0); 5047 } 5048 return PyBool_FromLong(1); 5049} 5050 5051PyDoc_STRVAR(isnumeric__doc__, 5052"S.isnumeric() -> bool\n\ 5053\n\ 5054Return True if there are only numeric characters in S,\n\ 5055False otherwise."); 5056 5057static PyObject* 5058unicode_isnumeric(PyUnicodeObject *self) 5059{ 5060 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 5061 register const Py_UNICODE *e; 5062 5063 /* Shortcut for single character strings */ 5064 if (PyUnicode_GET_SIZE(self) == 1 && 5065 Py_UNICODE_ISNUMERIC(*p)) 5066 return PyBool_FromLong(1); 5067 5068 /* Special case for empty strings */ 5069 if (PyString_GET_SIZE(self) == 0) 5070 return PyBool_FromLong(0); 5071 5072 e = p + PyUnicode_GET_SIZE(self); 5073 for (; p < e; p++) { 5074 if (!Py_UNICODE_ISNUMERIC(*p)) 5075 return PyBool_FromLong(0); 5076 } 5077 return PyBool_FromLong(1); 5078} 5079 5080PyDoc_STRVAR(join__doc__, 5081"S.join(sequence) -> unicode\n\ 5082\n\ 5083Return a string which is the concatenation of the strings in the\n\ 5084sequence. The separator between elements is S."); 5085 5086static PyObject* 5087unicode_join(PyObject *self, PyObject *data) 5088{ 5089 return PyUnicode_Join(self, data); 5090} 5091 5092static int 5093unicode_length(PyUnicodeObject *self) 5094{ 5095 return self->length; 5096} 5097 5098PyDoc_STRVAR(ljust__doc__, 5099"S.ljust(width) -> unicode\n\ 5100\n\ 5101Return S left justified in a Unicode string of length width. Padding is\n\ 5102done using spaces."); 5103 5104static PyObject * 5105unicode_ljust(PyUnicodeObject *self, PyObject *args) 5106{ 5107 int width; 5108 if (!PyArg_ParseTuple(args, "i:ljust", &width)) 5109 return NULL; 5110 5111 if (self->length >= width && PyUnicode_CheckExact(self)) { 5112 Py_INCREF(self); 5113 return (PyObject*) self; 5114 } 5115 5116 return (PyObject*) pad(self, 0, width - self->length, ' '); 5117} 5118 5119PyDoc_STRVAR(lower__doc__, 5120"S.lower() -> unicode\n\ 5121\n\ 5122Return a copy of the string S converted to lowercase."); 5123 5124static PyObject* 5125unicode_lower(PyUnicodeObject *self) 5126{ 5127 return fixup(self, fixlower); 5128} 5129 5130#define LEFTSTRIP 0 5131#define RIGHTSTRIP 1 5132#define BOTHSTRIP 2 5133 5134/* Arrays indexed by above */ 5135static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"}; 5136 5137#define STRIPNAME(i) (stripformat[i]+3) 5138 5139static const Py_UNICODE * 5140unicode_memchr(const Py_UNICODE *s, Py_UNICODE c, size_t n) 5141{ 5142 size_t i; 5143 for (i = 0; i < n; ++i) 5144 if (s[i] == c) 5145 return s+i; 5146 return NULL; 5147} 5148 5149/* externally visible for str.strip(unicode) */ 5150PyObject * 5151_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj) 5152{ 5153 Py_UNICODE *s = PyUnicode_AS_UNICODE(self); 5154 int len = PyUnicode_GET_SIZE(self); 5155 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj); 5156 int seplen = PyUnicode_GET_SIZE(sepobj); 5157 int i, j; 5158 5159 i = 0; 5160 if (striptype != RIGHTSTRIP) { 5161 while (i < len && unicode_memchr(sep, s[i], seplen)) { 5162 i++; 5163 } 5164 } 5165 5166 j = len; 5167 if (striptype != LEFTSTRIP) { 5168 do { 5169 j--; 5170 } while (j >= i && unicode_memchr(sep, s[j], seplen)); 5171 j++; 5172 } 5173 5174 if (i == 0 && j == len && PyUnicode_CheckExact(self)) { 5175 Py_INCREF(self); 5176 return (PyObject*)self; 5177 } 5178 else 5179 return PyUnicode_FromUnicode(s+i, j-i); 5180} 5181 5182 5183static PyObject * 5184do_strip(PyUnicodeObject *self, int striptype) 5185{ 5186 Py_UNICODE *s = PyUnicode_AS_UNICODE(self); 5187 int len = PyUnicode_GET_SIZE(self), i, j; 5188 5189 i = 0; 5190 if (striptype != RIGHTSTRIP) { 5191 while (i < len && Py_UNICODE_ISSPACE(s[i])) { 5192 i++; 5193 } 5194 } 5195 5196 j = len; 5197 if (striptype != LEFTSTRIP) { 5198 do { 5199 j--; 5200 } while (j >= i && Py_UNICODE_ISSPACE(s[j])); 5201 j++; 5202 } 5203 5204 if (i == 0 && j == len && PyUnicode_CheckExact(self)) { 5205 Py_INCREF(self); 5206 return (PyObject*)self; 5207 } 5208 else 5209 return PyUnicode_FromUnicode(s+i, j-i); 5210} 5211 5212 5213static PyObject * 5214do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args) 5215{ 5216 PyObject *sep = NULL; 5217 5218 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep)) 5219 return NULL; 5220 5221 if (sep != NULL && sep != Py_None) { 5222 if (PyUnicode_Check(sep)) 5223 return _PyUnicode_XStrip(self, striptype, sep); 5224 else if (PyString_Check(sep)) { 5225 PyObject *res; 5226 sep = PyUnicode_FromObject(sep); 5227 if (sep==NULL) 5228 return NULL; 5229 res = _PyUnicode_XStrip(self, striptype, sep); 5230 Py_DECREF(sep); 5231 return res; 5232 } 5233 else { 5234 PyErr_Format(PyExc_TypeError, 5235 "%s arg must be None, unicode or str", 5236 STRIPNAME(striptype)); 5237 return NULL; 5238 } 5239 } 5240 5241 return do_strip(self, striptype); 5242} 5243 5244 5245PyDoc_STRVAR(strip__doc__, 5246"S.strip([sep]) -> unicode\n\ 5247\n\ 5248Return a copy of the string S with leading and trailing\n\ 5249whitespace removed.\n\ 5250If sep is given and not None, remove characters in sep instead.\n\ 5251If sep is a str, it will be converted to unicode before stripping"); 5252 5253static PyObject * 5254unicode_strip(PyUnicodeObject *self, PyObject *args) 5255{ 5256 if (PyTuple_GET_SIZE(args) == 0) 5257 return do_strip(self, BOTHSTRIP); /* Common case */ 5258 else 5259 return do_argstrip(self, BOTHSTRIP, args); 5260} 5261 5262 5263PyDoc_STRVAR(lstrip__doc__, 5264"S.lstrip([sep]) -> unicode\n\ 5265\n\ 5266Return a copy of the string S with leading whitespace removed.\n\ 5267If sep is given and not None, remove characters in sep instead.\n\ 5268If sep is a str, it will be converted to unicode before stripping"); 5269 5270static PyObject * 5271unicode_lstrip(PyUnicodeObject *self, PyObject *args) 5272{ 5273 if (PyTuple_GET_SIZE(args) == 0) 5274 return do_strip(self, LEFTSTRIP); /* Common case */ 5275 else 5276 return do_argstrip(self, LEFTSTRIP, args); 5277} 5278 5279 5280PyDoc_STRVAR(rstrip__doc__, 5281"S.rstrip([sep]) -> unicode\n\ 5282\n\ 5283Return a copy of the string S with trailing whitespace removed.\n\ 5284If sep is given and not None, remove characters in sep instead.\n\ 5285If sep is a str, it will be converted to unicode before stripping"); 5286 5287static PyObject * 5288unicode_rstrip(PyUnicodeObject *self, PyObject *args) 5289{ 5290 if (PyTuple_GET_SIZE(args) == 0) 5291 return do_strip(self, RIGHTSTRIP); /* Common case */ 5292 else 5293 return do_argstrip(self, RIGHTSTRIP, args); 5294} 5295 5296 5297static PyObject* 5298unicode_repeat(PyUnicodeObject *str, int len) 5299{ 5300 PyUnicodeObject *u; 5301 Py_UNICODE *p; 5302 int nchars; 5303 size_t nbytes; 5304 5305 if (len < 0) 5306 len = 0; 5307 5308 if (len == 1 && PyUnicode_CheckExact(str)) { 5309 /* no repeat, return original string */ 5310 Py_INCREF(str); 5311 return (PyObject*) str; 5312 } 5313 5314 /* ensure # of chars needed doesn't overflow int and # of bytes 5315 * needed doesn't overflow size_t 5316 */ 5317 nchars = len * str->length; 5318 if (len && nchars / len != str->length) { 5319 PyErr_SetString(PyExc_OverflowError, 5320 "repeated string is too long"); 5321 return NULL; 5322 } 5323 nbytes = (nchars + 1) * sizeof(Py_UNICODE); 5324 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) { 5325 PyErr_SetString(PyExc_OverflowError, 5326 "repeated string is too long"); 5327 return NULL; 5328 } 5329 u = _PyUnicode_New(nchars); 5330 if (!u) 5331 return NULL; 5332 5333 p = u->str; 5334 5335 while (len-- > 0) { 5336 Py_UNICODE_COPY(p, str->str, str->length); 5337 p += str->length; 5338 } 5339 5340 return (PyObject*) u; 5341} 5342 5343PyObject *PyUnicode_Replace(PyObject *obj, 5344 PyObject *subobj, 5345 PyObject *replobj, 5346 int maxcount) 5347{ 5348 PyObject *self; 5349 PyObject *str1; 5350 PyObject *str2; 5351 PyObject *result; 5352 5353 self = PyUnicode_FromObject(obj); 5354 if (self == NULL) 5355 return NULL; 5356 str1 = PyUnicode_FromObject(subobj); 5357 if (str1 == NULL) { 5358 Py_DECREF(self); 5359 return NULL; 5360 } 5361 str2 = PyUnicode_FromObject(replobj); 5362 if (str2 == NULL) { 5363 Py_DECREF(self); 5364 Py_DECREF(str1); 5365 return NULL; 5366 } 5367 result = replace((PyUnicodeObject *)self, 5368 (PyUnicodeObject *)str1, 5369 (PyUnicodeObject *)str2, 5370 maxcount); 5371 Py_DECREF(self); 5372 Py_DECREF(str1); 5373 Py_DECREF(str2); 5374 return result; 5375} 5376 5377PyDoc_STRVAR(replace__doc__, 5378"S.replace (old, new[, maxsplit]) -> unicode\n\ 5379\n\ 5380Return a copy of S with all occurrences of substring\n\ 5381old replaced by new. If the optional argument maxsplit is\n\ 5382given, only the first maxsplit occurrences are replaced."); 5383 5384static PyObject* 5385unicode_replace(PyUnicodeObject *self, PyObject *args) 5386{ 5387 PyUnicodeObject *str1; 5388 PyUnicodeObject *str2; 5389 int maxcount = -1; 5390 PyObject *result; 5391 5392 if (!PyArg_ParseTuple(args, "OO|i:replace", &str1, &str2, &maxcount)) 5393 return NULL; 5394 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1); 5395 if (str1 == NULL) 5396 return NULL; 5397 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2); 5398 if (str2 == NULL) 5399 return NULL; 5400 5401 result = replace(self, str1, str2, maxcount); 5402 5403 Py_DECREF(str1); 5404 Py_DECREF(str2); 5405 return result; 5406} 5407 5408static 5409PyObject *unicode_repr(PyObject *unicode) 5410{ 5411 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode), 5412 PyUnicode_GET_SIZE(unicode), 5413 1); 5414} 5415 5416PyDoc_STRVAR(rfind__doc__, 5417"S.rfind(sub [,start [,end]]) -> int\n\ 5418\n\ 5419Return the highest index in S where substring sub is found,\n\ 5420such that sub is contained within s[start,end]. Optional\n\ 5421arguments start and end are interpreted as in slice notation.\n\ 5422\n\ 5423Return -1 on failure."); 5424 5425static PyObject * 5426unicode_rfind(PyUnicodeObject *self, PyObject *args) 5427{ 5428 PyUnicodeObject *substring; 5429 int start = 0; 5430 int end = INT_MAX; 5431 PyObject *result; 5432 5433 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring, 5434 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end)) 5435 return NULL; 5436 substring = (PyUnicodeObject *)PyUnicode_FromObject( 5437 (PyObject *)substring); 5438 if (substring == NULL) 5439 return NULL; 5440 5441 result = PyInt_FromLong(findstring(self, substring, start, end, -1)); 5442 5443 Py_DECREF(substring); 5444 return result; 5445} 5446 5447PyDoc_STRVAR(rindex__doc__, 5448"S.rindex(sub [,start [,end]]) -> int\n\ 5449\n\ 5450Like S.rfind() but raise ValueError when the substring is not found."); 5451 5452static PyObject * 5453unicode_rindex(PyUnicodeObject *self, PyObject *args) 5454{ 5455 int result; 5456 PyUnicodeObject *substring; 5457 int start = 0; 5458 int end = INT_MAX; 5459 5460 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring, 5461 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end)) 5462 return NULL; 5463 substring = (PyUnicodeObject *)PyUnicode_FromObject( 5464 (PyObject *)substring); 5465 if (substring == NULL) 5466 return NULL; 5467 5468 result = findstring(self, substring, start, end, -1); 5469 5470 Py_DECREF(substring); 5471 if (result < 0) { 5472 PyErr_SetString(PyExc_ValueError, "substring not found"); 5473 return NULL; 5474 } 5475 return PyInt_FromLong(result); 5476} 5477 5478PyDoc_STRVAR(rjust__doc__, 5479"S.rjust(width) -> unicode\n\ 5480\n\ 5481Return S right justified in a Unicode string of length width. Padding is\n\ 5482done using spaces."); 5483 5484static PyObject * 5485unicode_rjust(PyUnicodeObject *self, PyObject *args) 5486{ 5487 int width; 5488 if (!PyArg_ParseTuple(args, "i:rjust", &width)) 5489 return NULL; 5490 5491 if (self->length >= width && PyUnicode_CheckExact(self)) { 5492 Py_INCREF(self); 5493 return (PyObject*) self; 5494 } 5495 5496 return (PyObject*) pad(self, width - self->length, 0, ' '); 5497} 5498 5499static PyObject* 5500unicode_slice(PyUnicodeObject *self, int start, int end) 5501{ 5502 /* standard clamping */ 5503 if (start < 0) 5504 start = 0; 5505 if (end < 0) 5506 end = 0; 5507 if (end > self->length) 5508 end = self->length; 5509 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) { 5510 /* full slice, return original string */ 5511 Py_INCREF(self); 5512 return (PyObject*) self; 5513 } 5514 if (start > end) 5515 start = end; 5516 /* copy slice */ 5517 return (PyObject*) PyUnicode_FromUnicode(self->str + start, 5518 end - start); 5519} 5520 5521PyObject *PyUnicode_Split(PyObject *s, 5522 PyObject *sep, 5523 int maxsplit) 5524{ 5525 PyObject *result; 5526 5527 s = PyUnicode_FromObject(s); 5528 if (s == NULL) 5529 return NULL; 5530 if (sep != NULL) { 5531 sep = PyUnicode_FromObject(sep); 5532 if (sep == NULL) { 5533 Py_DECREF(s); 5534 return NULL; 5535 } 5536 } 5537 5538 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit); 5539 5540 Py_DECREF(s); 5541 Py_XDECREF(sep); 5542 return result; 5543} 5544 5545PyDoc_STRVAR(split__doc__, 5546"S.split([sep [,maxsplit]]) -> list of strings\n\ 5547\n\ 5548Return a list of the words in S, using sep as the\n\ 5549delimiter string. If maxsplit is given, at most maxsplit\n\ 5550splits are done. If sep is not specified, any whitespace string\n\ 5551is a separator."); 5552 5553static PyObject* 5554unicode_split(PyUnicodeObject *self, PyObject *args) 5555{ 5556 PyObject *substring = Py_None; 5557 int maxcount = -1; 5558 5559 if (!PyArg_ParseTuple(args, "|Oi:split", &substring, &maxcount)) 5560 return NULL; 5561 5562 if (substring == Py_None) 5563 return split(self, NULL, maxcount); 5564 else if (PyUnicode_Check(substring)) 5565 return split(self, (PyUnicodeObject *)substring, maxcount); 5566 else 5567 return PyUnicode_Split((PyObject *)self, substring, maxcount); 5568} 5569 5570PyDoc_STRVAR(splitlines__doc__, 5571"S.splitlines([keepends]]) -> list of strings\n\ 5572\n\ 5573Return a list of the lines in S, breaking at line boundaries.\n\ 5574Line breaks are not included in the resulting list unless keepends\n\ 5575is given and true."); 5576 5577static PyObject* 5578unicode_splitlines(PyUnicodeObject *self, PyObject *args) 5579{ 5580 int keepends = 0; 5581 5582 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends)) 5583 return NULL; 5584 5585 return PyUnicode_Splitlines((PyObject *)self, keepends); 5586} 5587 5588static 5589PyObject *unicode_str(PyUnicodeObject *self) 5590{ 5591 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL); 5592} 5593 5594PyDoc_STRVAR(swapcase__doc__, 5595"S.swapcase() -> unicode\n\ 5596\n\ 5597Return a copy of S with uppercase characters converted to lowercase\n\ 5598and vice versa."); 5599 5600static PyObject* 5601unicode_swapcase(PyUnicodeObject *self) 5602{ 5603 return fixup(self, fixswapcase); 5604} 5605 5606PyDoc_STRVAR(translate__doc__, 5607"S.translate(table) -> unicode\n\ 5608\n\ 5609Return a copy of the string S, where all characters have been mapped\n\ 5610through the given translation table, which must be a mapping of\n\ 5611Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\ 5612Unmapped characters are left untouched. Characters mapped to None\n\ 5613are deleted."); 5614 5615static PyObject* 5616unicode_translate(PyUnicodeObject *self, PyObject *table) 5617{ 5618 return PyUnicode_TranslateCharmap(self->str, 5619 self->length, 5620 table, 5621 "ignore"); 5622} 5623 5624PyDoc_STRVAR(upper__doc__, 5625"S.upper() -> unicode\n\ 5626\n\ 5627Return a copy of S converted to uppercase."); 5628 5629static PyObject* 5630unicode_upper(PyUnicodeObject *self) 5631{ 5632 return fixup(self, fixupper); 5633} 5634 5635PyDoc_STRVAR(zfill__doc__, 5636"S.zfill(width) -> unicode\n\ 5637\n\ 5638Pad a numeric string x with zeros on the left, to fill a field\n\ 5639of the specified width. The string x is never truncated."); 5640 5641static PyObject * 5642unicode_zfill(PyUnicodeObject *self, PyObject *args) 5643{ 5644 int fill; 5645 PyUnicodeObject *u; 5646 5647 int width; 5648 if (!PyArg_ParseTuple(args, "i:zfill", &width)) 5649 return NULL; 5650 5651 if (self->length >= width) { 5652 if (PyUnicode_CheckExact(self)) { 5653 Py_INCREF(self); 5654 return (PyObject*) self; 5655 } 5656 else 5657 return PyUnicode_FromUnicode( 5658 PyUnicode_AS_UNICODE(self), 5659 PyUnicode_GET_SIZE(self) 5660 ); 5661 } 5662 5663 fill = width - self->length; 5664 5665 u = pad(self, fill, 0, '0'); 5666 5667 if (u == NULL) 5668 return NULL; 5669 5670 if (u->str[fill] == '+' || u->str[fill] == '-') { 5671 /* move sign to beginning of string */ 5672 u->str[0] = u->str[fill]; 5673 u->str[fill] = '0'; 5674 } 5675 5676 return (PyObject*) u; 5677} 5678 5679#if 0 5680static PyObject* 5681unicode_freelistsize(PyUnicodeObject *self) 5682{ 5683 return PyInt_FromLong(unicode_freelist_size); 5684} 5685#endif 5686 5687PyDoc_STRVAR(startswith__doc__, 5688"S.startswith(prefix[, start[, end]]) -> bool\n\ 5689\n\ 5690Return True if S starts with the specified prefix, False otherwise. With\n\ 5691optional start, test S beginning at that position. With optional end, stop\n\ 5692comparing S at that position."); 5693 5694static PyObject * 5695unicode_startswith(PyUnicodeObject *self, 5696 PyObject *args) 5697{ 5698 PyUnicodeObject *substring; 5699 int start = 0; 5700 int end = INT_MAX; 5701 PyObject *result; 5702 5703 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &substring, 5704 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end)) 5705 return NULL; 5706 substring = (PyUnicodeObject *)PyUnicode_FromObject( 5707 (PyObject *)substring); 5708 if (substring == NULL) 5709 return NULL; 5710 5711 result = PyBool_FromLong(tailmatch(self, substring, start, end, -1)); 5712 5713 Py_DECREF(substring); 5714 return result; 5715} 5716 5717 5718PyDoc_STRVAR(endswith__doc__, 5719"S.endswith(suffix[, start[, end]]) -> bool\n\ 5720\n\ 5721Return True if S ends with the specified suffix, False otherwise. With\n\ 5722optional start, test S beginning at that position. With optional end, stop\n\ 5723comparing S at that position."); 5724 5725static PyObject * 5726unicode_endswith(PyUnicodeObject *self, 5727 PyObject *args) 5728{ 5729 PyUnicodeObject *substring; 5730 int start = 0; 5731 int end = INT_MAX; 5732 PyObject *result; 5733 5734 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &substring, 5735 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end)) 5736 return NULL; 5737 substring = (PyUnicodeObject *)PyUnicode_FromObject( 5738 (PyObject *)substring); 5739 if (substring == NULL) 5740 return NULL; 5741 5742 result = PyBool_FromLong(tailmatch(self, substring, start, end, +1)); 5743 5744 Py_DECREF(substring); 5745 return result; 5746} 5747 5748 5749static PyMethodDef unicode_methods[] = { 5750 5751 /* Order is according to common usage: often used methods should 5752 appear first, since lookup is done sequentially. */ 5753 5754 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__}, 5755 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__}, 5756 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__}, 5757 {"join", (PyCFunction) unicode_join, METH_O, join__doc__}, 5758 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__}, 5759 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__}, 5760 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__}, 5761 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__}, 5762 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__}, 5763 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__}, 5764 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__}, 5765 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__}, 5766 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__}, 5767 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__}, 5768/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */ 5769 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__}, 5770 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__}, 5771 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__}, 5772 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__}, 5773 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__}, 5774 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__}, 5775 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__}, 5776 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__}, 5777 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__}, 5778 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__}, 5779 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__}, 5780 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__}, 5781 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__}, 5782 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__}, 5783 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__}, 5784 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__}, 5785 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__}, 5786 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__}, 5787 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__}, 5788 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__}, 5789 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__}, 5790#if 0 5791 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__}, 5792#endif 5793 5794#if 0 5795 /* This one is just used for debugging the implementation. */ 5796 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS}, 5797#endif 5798 5799 {NULL, NULL} 5800}; 5801 5802static PyObject * 5803unicode_mod(PyObject *v, PyObject *w) 5804{ 5805 if (!PyUnicode_Check(v)) { 5806 Py_INCREF(Py_NotImplemented); 5807 return Py_NotImplemented; 5808 } 5809 return PyUnicode_Format(v, w); 5810} 5811 5812static PyNumberMethods unicode_as_number = { 5813 0, /*nb_add*/ 5814 0, /*nb_subtract*/ 5815 0, /*nb_multiply*/ 5816 0, /*nb_divide*/ 5817 unicode_mod, /*nb_remainder*/ 5818}; 5819 5820static PySequenceMethods unicode_as_sequence = { 5821 (inquiry) unicode_length, /* sq_length */ 5822 (binaryfunc) PyUnicode_Concat, /* sq_concat */ 5823 (intargfunc) unicode_repeat, /* sq_repeat */ 5824 (intargfunc) unicode_getitem, /* sq_item */ 5825 (intintargfunc) unicode_slice, /* sq_slice */ 5826 0, /* sq_ass_item */ 5827 0, /* sq_ass_slice */ 5828 (objobjproc)PyUnicode_Contains, /*sq_contains*/ 5829}; 5830 5831static PyObject* 5832unicode_subscript(PyUnicodeObject* self, PyObject* item) 5833{ 5834 if (PyInt_Check(item)) { 5835 long i = PyInt_AS_LONG(item); 5836 if (i < 0) 5837 i += PyString_GET_SIZE(self); 5838 return unicode_getitem(self, i); 5839 } else if (PyLong_Check(item)) { 5840 long i = PyLong_AsLong(item); 5841 if (i == -1 && PyErr_Occurred()) 5842 return NULL; 5843 if (i < 0) 5844 i += PyString_GET_SIZE(self); 5845 return unicode_getitem(self, i); 5846 } else if (PySlice_Check(item)) { 5847 int start, stop, step, slicelength, cur, i; 5848 Py_UNICODE* source_buf; 5849 Py_UNICODE* result_buf; 5850 PyObject* result; 5851 5852 if (PySlice_GetIndicesEx((PySliceObject*)item, PyString_GET_SIZE(self), 5853 &start, &stop, &step, &slicelength) < 0) { 5854 return NULL; 5855 } 5856 5857 if (slicelength <= 0) { 5858 return PyUnicode_FromUnicode(NULL, 0); 5859 } else { 5860 source_buf = PyUnicode_AS_UNICODE((PyObject*)self); 5861 result_buf = PyMem_MALLOC(slicelength*sizeof(Py_UNICODE)); 5862 5863 for (cur = start, i = 0; i < slicelength; cur += step, i++) { 5864 result_buf[i] = source_buf[cur]; 5865 } 5866 5867 result = PyUnicode_FromUnicode(result_buf, slicelength); 5868 PyMem_FREE(result_buf); 5869 return result; 5870 } 5871 } else { 5872 PyErr_SetString(PyExc_TypeError, "string indices must be integers"); 5873 return NULL; 5874 } 5875} 5876 5877static PyMappingMethods unicode_as_mapping = { 5878 (inquiry)unicode_length, /* mp_length */ 5879 (binaryfunc)unicode_subscript, /* mp_subscript */ 5880 (objobjargproc)0, /* mp_ass_subscript */ 5881}; 5882 5883static int 5884unicode_buffer_getreadbuf(PyUnicodeObject *self, 5885 int index, 5886 const void **ptr) 5887{ 5888 if (index != 0) { 5889 PyErr_SetString(PyExc_SystemError, 5890 "accessing non-existent unicode segment"); 5891 return -1; 5892 } 5893 *ptr = (void *) self->str; 5894 return PyUnicode_GET_DATA_SIZE(self); 5895} 5896 5897static int 5898unicode_buffer_getwritebuf(PyUnicodeObject *self, int index, 5899 const void **ptr) 5900{ 5901 PyErr_SetString(PyExc_TypeError, 5902 "cannot use unicode as modifiable buffer"); 5903 return -1; 5904} 5905 5906static int 5907unicode_buffer_getsegcount(PyUnicodeObject *self, 5908 int *lenp) 5909{ 5910 if (lenp) 5911 *lenp = PyUnicode_GET_DATA_SIZE(self); 5912 return 1; 5913} 5914 5915static int 5916unicode_buffer_getcharbuf(PyUnicodeObject *self, 5917 int index, 5918 const void **ptr) 5919{ 5920 PyObject *str; 5921 5922 if (index != 0) { 5923 PyErr_SetString(PyExc_SystemError, 5924 "accessing non-existent unicode segment"); 5925 return -1; 5926 } 5927 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL); 5928 if (str == NULL) 5929 return -1; 5930 *ptr = (void *) PyString_AS_STRING(str); 5931 return PyString_GET_SIZE(str); 5932} 5933 5934/* Helpers for PyUnicode_Format() */ 5935 5936static PyObject * 5937getnextarg(PyObject *args, int arglen, int *p_argidx) 5938{ 5939 int argidx = *p_argidx; 5940 if (argidx < arglen) { 5941 (*p_argidx)++; 5942 if (arglen < 0) 5943 return args; 5944 else 5945 return PyTuple_GetItem(args, argidx); 5946 } 5947 PyErr_SetString(PyExc_TypeError, 5948 "not enough arguments for format string"); 5949 return NULL; 5950} 5951 5952#define F_LJUST (1<<0) 5953#define F_SIGN (1<<1) 5954#define F_BLANK (1<<2) 5955#define F_ALT (1<<3) 5956#define F_ZERO (1<<4) 5957 5958static 5959int usprintf(register Py_UNICODE *buffer, char *format, ...) 5960{ 5961 register int i; 5962 int len; 5963 va_list va; 5964 char *charbuffer; 5965 va_start(va, format); 5966 5967 /* First, format the string as char array, then expand to Py_UNICODE 5968 array. */ 5969 charbuffer = (char *)buffer; 5970 len = vsprintf(charbuffer, format, va); 5971 for (i = len - 1; i >= 0; i--) 5972 buffer[i] = (Py_UNICODE) charbuffer[i]; 5973 5974 va_end(va); 5975 return len; 5976} 5977 5978/* XXX To save some code duplication, formatfloat/long/int could have been 5979 shared with stringobject.c, converting from 8-bit to Unicode after the 5980 formatting is done. */ 5981 5982static int 5983formatfloat(Py_UNICODE *buf, 5984 size_t buflen, 5985 int flags, 5986 int prec, 5987 int type, 5988 PyObject *v) 5989{ 5990 /* fmt = '%#.' + `prec` + `type` 5991 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/ 5992 char fmt[20]; 5993 double x; 5994 5995 x = PyFloat_AsDouble(v); 5996 if (x == -1.0 && PyErr_Occurred()) 5997 return -1; 5998 if (prec < 0) 5999 prec = 6; 6000 if (type == 'f' && (fabs(x) / 1e25) >= 1e25) 6001 type = 'g'; 6002 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c", 6003 (flags & F_ALT) ? "#" : "", prec, type); 6004 /* worst case length calc to ensure no buffer overrun: 6005 fmt = %#.<prec>g 6006 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp 6007 for any double rep.) 6008 len = 1 + prec + 1 + 2 + 5 = 9 + prec 6009 If prec=0 the effective precision is 1 (the leading digit is 6010 always given), therefore increase by one to 10+prec. */ 6011 if (buflen <= (size_t)10 + (size_t)prec) { 6012 PyErr_SetString(PyExc_OverflowError, 6013 "formatted float is too long (precision too long?)"); 6014 return -1; 6015 } 6016 return usprintf(buf, fmt, x); 6017} 6018 6019static PyObject* 6020formatlong(PyObject *val, int flags, int prec, int type) 6021{ 6022 char *buf; 6023 int i, len; 6024 PyObject *str; /* temporary string object. */ 6025 PyUnicodeObject *result; 6026 6027 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len); 6028 if (!str) 6029 return NULL; 6030 result = _PyUnicode_New(len); 6031 for (i = 0; i < len; i++) 6032 result->str[i] = buf[i]; 6033 result->str[len] = 0; 6034 Py_DECREF(str); 6035 return (PyObject*)result; 6036} 6037 6038static int 6039formatint(Py_UNICODE *buf, 6040 size_t buflen, 6041 int flags, 6042 int prec, 6043 int type, 6044 PyObject *v) 6045{ 6046 /* fmt = '%#.' + `prec` + 'l' + `type` 6047 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine) 6048 * + 1 + 1 6049 * = 24 6050 */ 6051 char fmt[64]; /* plenty big enough! */ 6052 long x; 6053 6054 x = PyInt_AsLong(v); 6055 if (x == -1 && PyErr_Occurred()) 6056 return -1; 6057 if (x < 0 && type != 'd' && type != 'i') { 6058 if (PyErr_Warn(PyExc_FutureWarning, 6059 "%u/%o/%x/%X of negative int will return " 6060 "a signed string in Python 2.4 and up") < 0) 6061 return -1; 6062 } 6063 if (prec < 0) 6064 prec = 1; 6065 6066 /* buf = '+'/'-'/'0'/'0x' + '[0-9]'*max(prec,len(x in octal)) 6067 * worst case buf = '0x' + [0-9]*prec, where prec >= 11 6068 */ 6069 if (buflen <= 13 || buflen <= (size_t)2 + (size_t)prec) { 6070 PyErr_SetString(PyExc_OverflowError, 6071 "formatted integer is too long (precision too large?)"); 6072 return -1; 6073 } 6074 6075 if ((flags & F_ALT) && 6076 (type == 'x' || type == 'X')) { 6077 /* When converting under %#x or %#X, there are a number 6078 * of issues that cause pain: 6079 * - when 0 is being converted, the C standard leaves off 6080 * the '0x' or '0X', which is inconsistent with other 6081 * %#x/%#X conversions and inconsistent with Python's 6082 * hex() function 6083 * - there are platforms that violate the standard and 6084 * convert 0 with the '0x' or '0X' 6085 * (Metrowerks, Compaq Tru64) 6086 * - there are platforms that give '0x' when converting 6087 * under %#X, but convert 0 in accordance with the 6088 * standard (OS/2 EMX) 6089 * 6090 * We can achieve the desired consistency by inserting our 6091 * own '0x' or '0X' prefix, and substituting %x/%X in place 6092 * of %#x/%#X. 6093 * 6094 * Note that this is the same approach as used in 6095 * formatint() in stringobject.c 6096 */ 6097 PyOS_snprintf(fmt, sizeof(fmt), "0%c%%.%dl%c", 6098 type, prec, type); 6099 } 6100 else { 6101 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%dl%c", 6102 (flags&F_ALT) ? "#" : "", 6103 prec, type); 6104 } 6105 return usprintf(buf, fmt, x); 6106} 6107 6108static int 6109formatchar(Py_UNICODE *buf, 6110 size_t buflen, 6111 PyObject *v) 6112{ 6113 /* presume that the buffer is at least 2 characters long */ 6114 if (PyUnicode_Check(v)) { 6115 if (PyUnicode_GET_SIZE(v) != 1) 6116 goto onError; 6117 buf[0] = PyUnicode_AS_UNICODE(v)[0]; 6118 } 6119 6120 else if (PyString_Check(v)) { 6121 if (PyString_GET_SIZE(v) != 1) 6122 goto onError; 6123 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0]; 6124 } 6125 6126 else { 6127 /* Integer input truncated to a character */ 6128 long x; 6129 x = PyInt_AsLong(v); 6130 if (x == -1 && PyErr_Occurred()) 6131 goto onError; 6132#ifdef Py_UNICODE_WIDE 6133 if (x < 0 || x > 0x10ffff) { 6134 PyErr_SetString(PyExc_ValueError, 6135 "%c arg not in range(0x110000) " 6136 "(wide Python build)"); 6137 return -1; 6138 } 6139#else 6140 if (x < 0 || x > 0xffff) { 6141 PyErr_SetString(PyExc_ValueError, 6142 "%c arg not in range(0x10000) " 6143 "(narrow Python build)"); 6144 return -1; 6145 } 6146#endif 6147 buf[0] = (Py_UNICODE) x; 6148 } 6149 buf[1] = '\0'; 6150 return 1; 6151 6152 onError: 6153 PyErr_SetString(PyExc_TypeError, 6154 "%c requires int or char"); 6155 return -1; 6156} 6157 6158/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...) 6159 6160 FORMATBUFLEN is the length of the buffer in which the floats, ints, & 6161 chars are formatted. XXX This is a magic number. Each formatting 6162 routine does bounds checking to ensure no overflow, but a better 6163 solution may be to malloc a buffer of appropriate size for each 6164 format. For now, the current solution is sufficient. 6165*/ 6166#define FORMATBUFLEN (size_t)120 6167 6168PyObject *PyUnicode_Format(PyObject *format, 6169 PyObject *args) 6170{ 6171 Py_UNICODE *fmt, *res; 6172 int fmtcnt, rescnt, reslen, arglen, argidx; 6173 int args_owned = 0; 6174 PyUnicodeObject *result = NULL; 6175 PyObject *dict = NULL; 6176 PyObject *uformat; 6177 6178 if (format == NULL || args == NULL) { 6179 PyErr_BadInternalCall(); 6180 return NULL; 6181 } 6182 uformat = PyUnicode_FromObject(format); 6183 if (uformat == NULL) 6184 return NULL; 6185 fmt = PyUnicode_AS_UNICODE(uformat); 6186 fmtcnt = PyUnicode_GET_SIZE(uformat); 6187 6188 reslen = rescnt = fmtcnt + 100; 6189 result = _PyUnicode_New(reslen); 6190 if (result == NULL) 6191 goto onError; 6192 res = PyUnicode_AS_UNICODE(result); 6193 6194 if (PyTuple_Check(args)) { 6195 arglen = PyTuple_Size(args); 6196 argidx = 0; 6197 } 6198 else { 6199 arglen = -1; 6200 argidx = -2; 6201 } 6202 if (args->ob_type->tp_as_mapping && !PyTuple_Check(args) && 6203 !PyObject_TypeCheck(args, &PyBaseString_Type)) 6204 dict = args; 6205 6206 while (--fmtcnt >= 0) { 6207 if (*fmt != '%') { 6208 if (--rescnt < 0) { 6209 rescnt = fmtcnt + 100; 6210 reslen += rescnt; 6211 if (_PyUnicode_Resize(&result, reslen) < 0) 6212 return NULL; 6213 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt; 6214 --rescnt; 6215 } 6216 *res++ = *fmt++; 6217 } 6218 else { 6219 /* Got a format specifier */ 6220 int flags = 0; 6221 int width = -1; 6222 int prec = -1; 6223 Py_UNICODE c = '\0'; 6224 Py_UNICODE fill; 6225 PyObject *v = NULL; 6226 PyObject *temp = NULL; 6227 Py_UNICODE *pbuf; 6228 Py_UNICODE sign; 6229 int len; 6230 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */ 6231 6232 fmt++; 6233 if (*fmt == '(') { 6234 Py_UNICODE *keystart; 6235 int keylen; 6236 PyObject *key; 6237 int pcount = 1; 6238 6239 if (dict == NULL) { 6240 PyErr_SetString(PyExc_TypeError, 6241 "format requires a mapping"); 6242 goto onError; 6243 } 6244 ++fmt; 6245 --fmtcnt; 6246 keystart = fmt; 6247 /* Skip over balanced parentheses */ 6248 while (pcount > 0 && --fmtcnt >= 0) { 6249 if (*fmt == ')') 6250 --pcount; 6251 else if (*fmt == '(') 6252 ++pcount; 6253 fmt++; 6254 } 6255 keylen = fmt - keystart - 1; 6256 if (fmtcnt < 0 || pcount > 0) { 6257 PyErr_SetString(PyExc_ValueError, 6258 "incomplete format key"); 6259 goto onError; 6260 } 6261#if 0 6262 /* keys are converted to strings using UTF-8 and 6263 then looked up since Python uses strings to hold 6264 variables names etc. in its namespaces and we 6265 wouldn't want to break common idioms. */ 6266 key = PyUnicode_EncodeUTF8(keystart, 6267 keylen, 6268 NULL); 6269#else 6270 key = PyUnicode_FromUnicode(keystart, keylen); 6271#endif 6272 if (key == NULL) 6273 goto onError; 6274 if (args_owned) { 6275 Py_DECREF(args); 6276 args_owned = 0; 6277 } 6278 args = PyObject_GetItem(dict, key); 6279 Py_DECREF(key); 6280 if (args == NULL) { 6281 goto onError; 6282 } 6283 args_owned = 1; 6284 arglen = -1; 6285 argidx = -2; 6286 } 6287 while (--fmtcnt >= 0) { 6288 switch (c = *fmt++) { 6289 case '-': flags |= F_LJUST; continue; 6290 case '+': flags |= F_SIGN; continue; 6291 case ' ': flags |= F_BLANK; continue; 6292 case '#': flags |= F_ALT; continue; 6293 case '0': flags |= F_ZERO; continue; 6294 } 6295 break; 6296 } 6297 if (c == '*') { 6298 v = getnextarg(args, arglen, &argidx); 6299 if (v == NULL) 6300 goto onError; 6301 if (!PyInt_Check(v)) { 6302 PyErr_SetString(PyExc_TypeError, 6303 "* wants int"); 6304 goto onError; 6305 } 6306 width = PyInt_AsLong(v); 6307 if (width < 0) { 6308 flags |= F_LJUST; 6309 width = -width; 6310 } 6311 if (--fmtcnt >= 0) 6312 c = *fmt++; 6313 } 6314 else if (c >= '0' && c <= '9') { 6315 width = c - '0'; 6316 while (--fmtcnt >= 0) { 6317 c = *fmt++; 6318 if (c < '0' || c > '9') 6319 break; 6320 if ((width*10) / 10 != width) { 6321 PyErr_SetString(PyExc_ValueError, 6322 "width too big"); 6323 goto onError; 6324 } 6325 width = width*10 + (c - '0'); 6326 } 6327 } 6328 if (c == '.') { 6329 prec = 0; 6330 if (--fmtcnt >= 0) 6331 c = *fmt++; 6332 if (c == '*') { 6333 v = getnextarg(args, arglen, &argidx); 6334 if (v == NULL) 6335 goto onError; 6336 if (!PyInt_Check(v)) { 6337 PyErr_SetString(PyExc_TypeError, 6338 "* wants int"); 6339 goto onError; 6340 } 6341 prec = PyInt_AsLong(v); 6342 if (prec < 0) 6343 prec = 0; 6344 if (--fmtcnt >= 0) 6345 c = *fmt++; 6346 } 6347 else if (c >= '0' && c <= '9') { 6348 prec = c - '0'; 6349 while (--fmtcnt >= 0) { 6350 c = Py_CHARMASK(*fmt++); 6351 if (c < '0' || c > '9') 6352 break; 6353 if ((prec*10) / 10 != prec) { 6354 PyErr_SetString(PyExc_ValueError, 6355 "prec too big"); 6356 goto onError; 6357 } 6358 prec = prec*10 + (c - '0'); 6359 } 6360 } 6361 } /* prec */ 6362 if (fmtcnt >= 0) { 6363 if (c == 'h' || c == 'l' || c == 'L') { 6364 if (--fmtcnt >= 0) 6365 c = *fmt++; 6366 } 6367 } 6368 if (fmtcnt < 0) { 6369 PyErr_SetString(PyExc_ValueError, 6370 "incomplete format"); 6371 goto onError; 6372 } 6373 if (c != '%') { 6374 v = getnextarg(args, arglen, &argidx); 6375 if (v == NULL) 6376 goto onError; 6377 } 6378 sign = 0; 6379 fill = ' '; 6380 switch (c) { 6381 6382 case '%': 6383 pbuf = formatbuf; 6384 /* presume that buffer length is at least 1 */ 6385 pbuf[0] = '%'; 6386 len = 1; 6387 break; 6388 6389 case 's': 6390 case 'r': 6391 if (PyUnicode_Check(v) && c == 's') { 6392 temp = v; 6393 Py_INCREF(temp); 6394 } 6395 else { 6396 PyObject *unicode; 6397 if (c == 's') 6398 temp = PyObject_Str(v); 6399 else 6400 temp = PyObject_Repr(v); 6401 if (temp == NULL) 6402 goto onError; 6403 if (!PyString_Check(temp)) { 6404 /* XXX Note: this should never happen, since 6405 PyObject_Repr() and PyObject_Str() assure 6406 this */ 6407 Py_DECREF(temp); 6408 PyErr_SetString(PyExc_TypeError, 6409 "%s argument has non-string str()"); 6410 goto onError; 6411 } 6412 unicode = PyUnicode_Decode(PyString_AS_STRING(temp), 6413 PyString_GET_SIZE(temp), 6414 NULL, 6415 "strict"); 6416 Py_DECREF(temp); 6417 temp = unicode; 6418 if (temp == NULL) 6419 goto onError; 6420 } 6421 pbuf = PyUnicode_AS_UNICODE(temp); 6422 len = PyUnicode_GET_SIZE(temp); 6423 if (prec >= 0 && len > prec) 6424 len = prec; 6425 break; 6426 6427 case 'i': 6428 case 'd': 6429 case 'u': 6430 case 'o': 6431 case 'x': 6432 case 'X': 6433 if (c == 'i') 6434 c = 'd'; 6435 if (PyLong_Check(v)) { 6436 temp = formatlong(v, flags, prec, c); 6437 if (!temp) 6438 goto onError; 6439 pbuf = PyUnicode_AS_UNICODE(temp); 6440 len = PyUnicode_GET_SIZE(temp); 6441 /* unbounded ints can always produce 6442 a sign character! */ 6443 sign = 1; 6444 } 6445 else { 6446 pbuf = formatbuf; 6447 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), 6448 flags, prec, c, v); 6449 if (len < 0) 6450 goto onError; 6451 /* only d conversion is signed */ 6452 sign = c == 'd'; 6453 } 6454 if (flags & F_ZERO) 6455 fill = '0'; 6456 break; 6457 6458 case 'e': 6459 case 'E': 6460 case 'f': 6461 case 'g': 6462 case 'G': 6463 pbuf = formatbuf; 6464 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), 6465 flags, prec, c, v); 6466 if (len < 0) 6467 goto onError; 6468 sign = 1; 6469 if (flags & F_ZERO) 6470 fill = '0'; 6471 break; 6472 6473 case 'c': 6474 pbuf = formatbuf; 6475 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v); 6476 if (len < 0) 6477 goto onError; 6478 break; 6479 6480 default: 6481 PyErr_Format(PyExc_ValueError, 6482 "unsupported format character '%c' (0x%x) " 6483 "at index %i", 6484 (31<=c && c<=126) ? (char)c : '?', 6485 (int)c, 6486 (int)(fmt -1 - PyUnicode_AS_UNICODE(uformat))); 6487 goto onError; 6488 } 6489 if (sign) { 6490 if (*pbuf == '-' || *pbuf == '+') { 6491 sign = *pbuf++; 6492 len--; 6493 } 6494 else if (flags & F_SIGN) 6495 sign = '+'; 6496 else if (flags & F_BLANK) 6497 sign = ' '; 6498 else 6499 sign = 0; 6500 } 6501 if (width < len) 6502 width = len; 6503 if (rescnt - (sign != 0) < width) { 6504 reslen -= rescnt; 6505 rescnt = width + fmtcnt + 100; 6506 reslen += rescnt; 6507 if (reslen < 0) { 6508 Py_DECREF(result); 6509 return PyErr_NoMemory(); 6510 } 6511 if (_PyUnicode_Resize(&result, reslen) < 0) 6512 return NULL; 6513 res = PyUnicode_AS_UNICODE(result) 6514 + reslen - rescnt; 6515 } 6516 if (sign) { 6517 if (fill != ' ') 6518 *res++ = sign; 6519 rescnt--; 6520 if (width > len) 6521 width--; 6522 } 6523 if ((flags & F_ALT) && (c == 'x' || c == 'X')) { 6524 assert(pbuf[0] == '0'); 6525 assert(pbuf[1] == c); 6526 if (fill != ' ') { 6527 *res++ = *pbuf++; 6528 *res++ = *pbuf++; 6529 } 6530 rescnt -= 2; 6531 width -= 2; 6532 if (width < 0) 6533 width = 0; 6534 len -= 2; 6535 } 6536 if (width > len && !(flags & F_LJUST)) { 6537 do { 6538 --rescnt; 6539 *res++ = fill; 6540 } while (--width > len); 6541 } 6542 if (fill == ' ') { 6543 if (sign) 6544 *res++ = sign; 6545 if ((flags & F_ALT) && (c == 'x' || c == 'X')) { 6546 assert(pbuf[0] == '0'); 6547 assert(pbuf[1] == c); 6548 *res++ = *pbuf++; 6549 *res++ = *pbuf++; 6550 } 6551 } 6552 Py_UNICODE_COPY(res, pbuf, len); 6553 res += len; 6554 rescnt -= len; 6555 while (--width >= len) { 6556 --rescnt; 6557 *res++ = ' '; 6558 } 6559 if (dict && (argidx < arglen) && c != '%') { 6560 PyErr_SetString(PyExc_TypeError, 6561 "not all arguments converted during string formatting"); 6562 goto onError; 6563 } 6564 Py_XDECREF(temp); 6565 } /* '%' */ 6566 } /* until end */ 6567 if (argidx < arglen && !dict) { 6568 PyErr_SetString(PyExc_TypeError, 6569 "not all arguments converted during string formatting"); 6570 goto onError; 6571 } 6572 6573 if (args_owned) { 6574 Py_DECREF(args); 6575 } 6576 Py_DECREF(uformat); 6577 if (_PyUnicode_Resize(&result, reslen - rescnt)) 6578 goto onError; 6579 return (PyObject *)result; 6580 6581 onError: 6582 Py_XDECREF(result); 6583 Py_DECREF(uformat); 6584 if (args_owned) { 6585 Py_DECREF(args); 6586 } 6587 return NULL; 6588} 6589 6590static PyBufferProcs unicode_as_buffer = { 6591 (getreadbufferproc) unicode_buffer_getreadbuf, 6592 (getwritebufferproc) unicode_buffer_getwritebuf, 6593 (getsegcountproc) unicode_buffer_getsegcount, 6594 (getcharbufferproc) unicode_buffer_getcharbuf, 6595}; 6596 6597static PyObject * 6598unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds); 6599 6600static PyObject * 6601unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds) 6602{ 6603 PyObject *x = NULL; 6604 static char *kwlist[] = {"string", "encoding", "errors", 0}; 6605 char *encoding = NULL; 6606 char *errors = NULL; 6607 6608 if (type != &PyUnicode_Type) 6609 return unicode_subtype_new(type, args, kwds); 6610 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode", 6611 kwlist, &x, &encoding, &errors)) 6612 return NULL; 6613 if (x == NULL) 6614 return (PyObject *)_PyUnicode_New(0); 6615 if (encoding == NULL && errors == NULL) 6616 return PyObject_Unicode(x); 6617 else 6618 return PyUnicode_FromEncodedObject(x, encoding, errors); 6619} 6620 6621static PyObject * 6622unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds) 6623{ 6624 PyUnicodeObject *tmp, *pnew; 6625 int n; 6626 6627 assert(PyType_IsSubtype(type, &PyUnicode_Type)); 6628 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds); 6629 if (tmp == NULL) 6630 return NULL; 6631 assert(PyUnicode_Check(tmp)); 6632 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length); 6633 if (pnew == NULL) 6634 return NULL; 6635 pnew->str = PyMem_NEW(Py_UNICODE, n+1); 6636 if (pnew->str == NULL) { 6637 _Py_ForgetReference((PyObject *)pnew); 6638 PyObject_Del(pnew); 6639 return NULL; 6640 } 6641 Py_UNICODE_COPY(pnew->str, tmp->str, n+1); 6642 pnew->length = n; 6643 pnew->hash = tmp->hash; 6644 Py_DECREF(tmp); 6645 return (PyObject *)pnew; 6646} 6647 6648PyDoc_STRVAR(unicode_doc, 6649"unicode(string [, encoding[, errors]]) -> object\n\ 6650\n\ 6651Create a new Unicode object from the given encoded string.\n\ 6652encoding defaults to the current default string encoding.\n\ 6653errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'."); 6654 6655PyTypeObject PyUnicode_Type = { 6656 PyObject_HEAD_INIT(&PyType_Type) 6657 0, /* ob_size */ 6658 "unicode", /* tp_name */ 6659 sizeof(PyUnicodeObject), /* tp_size */ 6660 0, /* tp_itemsize */ 6661 /* Slots */ 6662 (destructor)unicode_dealloc, /* tp_dealloc */ 6663 0, /* tp_print */ 6664 0, /* tp_getattr */ 6665 0, /* tp_setattr */ 6666 (cmpfunc) unicode_compare, /* tp_compare */ 6667 (reprfunc) unicode_repr, /* tp_repr */ 6668 &unicode_as_number, /* tp_as_number */ 6669 &unicode_as_sequence, /* tp_as_sequence */ 6670 &unicode_as_mapping, /* tp_as_mapping */ 6671 (hashfunc) unicode_hash, /* tp_hash*/ 6672 0, /* tp_call*/ 6673 (reprfunc) unicode_str, /* tp_str */ 6674 PyObject_GenericGetAttr, /* tp_getattro */ 6675 0, /* tp_setattro */ 6676 &unicode_as_buffer, /* tp_as_buffer */ 6677 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_CHECKTYPES | 6678 Py_TPFLAGS_BASETYPE, /* tp_flags */ 6679 unicode_doc, /* tp_doc */ 6680 0, /* tp_traverse */ 6681 0, /* tp_clear */ 6682 0, /* tp_richcompare */ 6683 0, /* tp_weaklistoffset */ 6684 0, /* tp_iter */ 6685 0, /* tp_iternext */ 6686 unicode_methods, /* tp_methods */ 6687 0, /* tp_members */ 6688 0, /* tp_getset */ 6689 &PyBaseString_Type, /* tp_base */ 6690 0, /* tp_dict */ 6691 0, /* tp_descr_get */ 6692 0, /* tp_descr_set */ 6693 0, /* tp_dictoffset */ 6694 0, /* tp_init */ 6695 0, /* tp_alloc */ 6696 unicode_new, /* tp_new */ 6697 PyObject_Del, /* tp_free */ 6698}; 6699 6700/* Initialize the Unicode implementation */ 6701 6702void _PyUnicode_Init(void) 6703{ 6704 int i; 6705 6706 /* Init the implementation */ 6707 unicode_freelist = NULL; 6708 unicode_freelist_size = 0; 6709 unicode_empty = _PyUnicode_New(0); 6710 strcpy(unicode_default_encoding, "ascii"); 6711 for (i = 0; i < 256; i++) 6712 unicode_latin1[i] = NULL; 6713 if (PyType_Ready(&PyUnicode_Type) < 0) 6714 Py_FatalError("Can't initialize 'unicode'"); 6715} 6716 6717/* Finalize the Unicode implementation */ 6718 6719void 6720_PyUnicode_Fini(void) 6721{ 6722 PyUnicodeObject *u; 6723 int i; 6724 6725 Py_XDECREF(unicode_empty); 6726 unicode_empty = NULL; 6727 6728 for (i = 0; i < 256; i++) { 6729 if (unicode_latin1[i]) { 6730 Py_DECREF(unicode_latin1[i]); 6731 unicode_latin1[i] = NULL; 6732 } 6733 } 6734 6735 for (u = unicode_freelist; u != NULL;) { 6736 PyUnicodeObject *v = u; 6737 u = *(PyUnicodeObject **)u; 6738 if (v->str) 6739 PyMem_DEL(v->str); 6740 Py_XDECREF(v->defenc); 6741 PyObject_Del(v); 6742 } 6743 unicode_freelist = NULL; 6744 unicode_freelist_size = 0; 6745} 6746