unicodeobject.c revision 9a3a9f779142d92655f86eaf9584ce946c61dfea
1/* 2 3Unicode implementation based on original code by Fredrik Lundh, 4modified by Marc-Andre Lemburg <mal@lemburg.com> according to the 5Unicode Integration Proposal (see file Misc/unicode.txt). 6 7Copyright (c) Corporation for National Research Initiatives. 8 9-------------------------------------------------------------------- 10The original string type implementation is: 11 12 Copyright (c) 1999 by Secret Labs AB 13 Copyright (c) 1999 by Fredrik Lundh 14 15By obtaining, using, and/or copying this software and/or its 16associated documentation, you agree that you have read, understood, 17and will comply with the following terms and conditions: 18 19Permission to use, copy, modify, and distribute this software and its 20associated documentation for any purpose and without fee is hereby 21granted, provided that the above copyright notice appears in all 22copies, and that both that copyright notice and this permission notice 23appear in supporting documentation, and that the name of Secret Labs 24AB or the author not be used in advertising or publicity pertaining to 25distribution of the software without specific, written prior 26permission. 27 28SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO 29THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND 30FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR 31ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 32WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 33ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT 34OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 35-------------------------------------------------------------------- 36 37*/ 38 39#include "Python.h" 40 41#include "unicodeobject.h" 42#include "ucnhash.h" 43 44#ifdef MS_WINDOWS 45#include <windows.h> 46#endif 47 48/* Limit for the Unicode object free list */ 49 50#define MAX_UNICODE_FREELIST_SIZE 1024 51 52/* Limit for the Unicode object free list stay alive optimization. 53 54 The implementation will keep allocated Unicode memory intact for 55 all objects on the free list having a size less than this 56 limit. This reduces malloc() overhead for small Unicode objects. 57 58 At worst this will result in MAX_UNICODE_FREELIST_SIZE * 59 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT + 60 malloc()-overhead) bytes of unused garbage. 61 62 Setting the limit to 0 effectively turns the feature off. 63 64 Note: This is an experimental feature ! If you get core dumps when 65 using Unicode objects, turn this feature off. 66 67*/ 68 69#define KEEPALIVE_SIZE_LIMIT 9 70 71/* Endianness switches; defaults to little endian */ 72 73#ifdef WORDS_BIGENDIAN 74# define BYTEORDER_IS_BIG_ENDIAN 75#else 76# define BYTEORDER_IS_LITTLE_ENDIAN 77#endif 78 79/* --- Globals ------------------------------------------------------------ 80 81 The globals are initialized by the _PyUnicode_Init() API and should 82 not be used before calling that API. 83 84*/ 85 86/* Free list for Unicode objects */ 87static PyUnicodeObject *unicode_freelist; 88static int unicode_freelist_size; 89 90/* The empty Unicode object is shared to improve performance. */ 91static PyUnicodeObject *unicode_empty; 92 93/* Single character Unicode strings in the Latin-1 range are being 94 shared as well. */ 95static PyUnicodeObject *unicode_latin1[256]; 96 97/* Default encoding to use and assume when NULL is passed as encoding 98 parameter; it is initialized by _PyUnicode_Init(). 99 100 Always use the PyUnicode_SetDefaultEncoding() and 101 PyUnicode_GetDefaultEncoding() APIs to access this global. 102 103*/ 104static char unicode_default_encoding[100]; 105 106Py_UNICODE 107PyUnicode_GetMax(void) 108{ 109#ifdef Py_UNICODE_WIDE 110 return 0x10FFFF; 111#else 112 /* This is actually an illegal character, so it should 113 not be passed to unichr. */ 114 return 0xFFFF; 115#endif 116} 117 118/* --- Unicode Object ----------------------------------------------------- */ 119 120static 121int unicode_resize(register PyUnicodeObject *unicode, 122 int length) 123{ 124 void *oldstr; 125 126 /* Shortcut if there's nothing much to do. */ 127 if (unicode->length == length) 128 goto reset; 129 130 /* Resizing shared object (unicode_empty or single character 131 objects) in-place is not allowed. Use PyUnicode_Resize() 132 instead ! */ 133 if (unicode == unicode_empty || 134 (unicode->length == 1 && 135 unicode->str[0] < 256 && 136 unicode_latin1[unicode->str[0]] == unicode)) { 137 PyErr_SetString(PyExc_SystemError, 138 "can't resize shared unicode objects"); 139 return -1; 140 } 141 142 /* We allocate one more byte to make sure the string is 143 Ux0000 terminated -- XXX is this needed ? */ 144 oldstr = unicode->str; 145 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1); 146 if (!unicode->str) { 147 unicode->str = oldstr; 148 PyErr_NoMemory(); 149 return -1; 150 } 151 unicode->str[length] = 0; 152 unicode->length = length; 153 154 reset: 155 /* Reset the object caches */ 156 if (unicode->defenc) { 157 Py_DECREF(unicode->defenc); 158 unicode->defenc = NULL; 159 } 160 unicode->hash = -1; 161 162 return 0; 163} 164 165/* We allocate one more byte to make sure the string is 166 Ux0000 terminated -- XXX is this needed ? 167 168 XXX This allocator could further be enhanced by assuring that the 169 free list never reduces its size below 1. 170 171*/ 172 173static 174PyUnicodeObject *_PyUnicode_New(int length) 175{ 176 register PyUnicodeObject *unicode; 177 178 /* Optimization for empty strings */ 179 if (length == 0 && unicode_empty != NULL) { 180 Py_INCREF(unicode_empty); 181 return unicode_empty; 182 } 183 184 /* Unicode freelist & memory allocation */ 185 if (unicode_freelist) { 186 unicode = unicode_freelist; 187 unicode_freelist = *(PyUnicodeObject **)unicode; 188 unicode_freelist_size--; 189 if (unicode->str) { 190 /* Keep-Alive optimization: we only upsize the buffer, 191 never downsize it. */ 192 if ((unicode->length < length) && 193 unicode_resize(unicode, length)) { 194 PyMem_DEL(unicode->str); 195 goto onError; 196 } 197 } 198 else { 199 unicode->str = PyMem_NEW(Py_UNICODE, length + 1); 200 } 201 PyObject_INIT(unicode, &PyUnicode_Type); 202 } 203 else { 204 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type); 205 if (unicode == NULL) 206 return NULL; 207 unicode->str = PyMem_NEW(Py_UNICODE, length + 1); 208 } 209 210 if (!unicode->str) { 211 PyErr_NoMemory(); 212 goto onError; 213 } 214 unicode->str[length] = 0; 215 unicode->length = length; 216 unicode->hash = -1; 217 unicode->defenc = NULL; 218 return unicode; 219 220 onError: 221 _Py_ForgetReference((PyObject *)unicode); 222 PyObject_Del(unicode); 223 return NULL; 224} 225 226static 227void unicode_dealloc(register PyUnicodeObject *unicode) 228{ 229 if (PyUnicode_CheckExact(unicode) && 230 unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) { 231 /* Keep-Alive optimization */ 232 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) { 233 PyMem_DEL(unicode->str); 234 unicode->str = NULL; 235 unicode->length = 0; 236 } 237 if (unicode->defenc) { 238 Py_DECREF(unicode->defenc); 239 unicode->defenc = NULL; 240 } 241 /* Add to free list */ 242 *(PyUnicodeObject **)unicode = unicode_freelist; 243 unicode_freelist = unicode; 244 unicode_freelist_size++; 245 } 246 else { 247 PyMem_DEL(unicode->str); 248 Py_XDECREF(unicode->defenc); 249 unicode->ob_type->tp_free((PyObject *)unicode); 250 } 251} 252 253int PyUnicode_Resize(PyObject **unicode, 254 int length) 255{ 256 register PyUnicodeObject *v; 257 258 /* Argument checks */ 259 if (unicode == NULL) { 260 PyErr_BadInternalCall(); 261 return -1; 262 } 263 v = (PyUnicodeObject *)*unicode; 264 if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1 || length < 0) { 265 PyErr_BadInternalCall(); 266 return -1; 267 } 268 269 /* Resizing unicode_empty and single character objects is not 270 possible since these are being shared. We simply return a fresh 271 copy with the same Unicode content. */ 272 if (v->length != length && 273 (v == unicode_empty || v->length == 1)) { 274 PyUnicodeObject *w = _PyUnicode_New(length); 275 if (w == NULL) 276 return -1; 277 Py_UNICODE_COPY(w->str, v->str, 278 length < v->length ? length : v->length); 279 Py_DECREF(*unicode); 280 *unicode = (PyObject *)w; 281 return 0; 282 } 283 284 /* Note that we don't have to modify *unicode for unshared Unicode 285 objects, since we can modify them in-place. */ 286 return unicode_resize(v, length); 287} 288 289/* Internal API for use in unicodeobject.c only ! */ 290#define _PyUnicode_Resize(unicodevar, length) \ 291 PyUnicode_Resize(((PyObject **)(unicodevar)), length) 292 293PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u, 294 int size) 295{ 296 PyUnicodeObject *unicode; 297 298 /* If the Unicode data is known at construction time, we can apply 299 some optimizations which share commonly used objects. */ 300 if (u != NULL) { 301 302 /* Optimization for empty strings */ 303 if (size == 0 && unicode_empty != NULL) { 304 Py_INCREF(unicode_empty); 305 return (PyObject *)unicode_empty; 306 } 307 308 /* Single character Unicode objects in the Latin-1 range are 309 shared when using this constructor */ 310 if (size == 1 && *u < 256) { 311 unicode = unicode_latin1[*u]; 312 if (!unicode) { 313 unicode = _PyUnicode_New(1); 314 if (!unicode) 315 return NULL; 316 unicode->str[0] = *u; 317 unicode_latin1[*u] = unicode; 318 } 319 Py_INCREF(unicode); 320 return (PyObject *)unicode; 321 } 322 } 323 324 unicode = _PyUnicode_New(size); 325 if (!unicode) 326 return NULL; 327 328 /* Copy the Unicode data into the new object */ 329 if (u != NULL) 330 Py_UNICODE_COPY(unicode->str, u, size); 331 332 return (PyObject *)unicode; 333} 334 335#ifdef HAVE_WCHAR_H 336 337PyObject *PyUnicode_FromWideChar(register const wchar_t *w, 338 int size) 339{ 340 PyUnicodeObject *unicode; 341 342 if (w == NULL) { 343 PyErr_BadInternalCall(); 344 return NULL; 345 } 346 347 unicode = _PyUnicode_New(size); 348 if (!unicode) 349 return NULL; 350 351 /* Copy the wchar_t data into the new object */ 352#ifdef HAVE_USABLE_WCHAR_T 353 memcpy(unicode->str, w, size * sizeof(wchar_t)); 354#else 355 { 356 register Py_UNICODE *u; 357 register int i; 358 u = PyUnicode_AS_UNICODE(unicode); 359 for (i = size; i >= 0; i--) 360 *u++ = *w++; 361 } 362#endif 363 364 return (PyObject *)unicode; 365} 366 367int PyUnicode_AsWideChar(PyUnicodeObject *unicode, 368 register wchar_t *w, 369 int size) 370{ 371 if (unicode == NULL) { 372 PyErr_BadInternalCall(); 373 return -1; 374 } 375 if (size > PyUnicode_GET_SIZE(unicode)) 376 size = PyUnicode_GET_SIZE(unicode); 377#ifdef HAVE_USABLE_WCHAR_T 378 memcpy(w, unicode->str, size * sizeof(wchar_t)); 379#else 380 { 381 register Py_UNICODE *u; 382 register int i; 383 u = PyUnicode_AS_UNICODE(unicode); 384 for (i = size; i >= 0; i--) 385 *w++ = *u++; 386 } 387#endif 388 389 return size; 390} 391 392#endif 393 394PyObject *PyUnicode_FromOrdinal(int ordinal) 395{ 396 Py_UNICODE s[2]; 397 398#ifdef Py_UNICODE_WIDE 399 if (ordinal < 0 || ordinal > 0x10ffff) { 400 PyErr_SetString(PyExc_ValueError, 401 "unichr() arg not in range(0x110000) " 402 "(wide Python build)"); 403 return NULL; 404 } 405#else 406 if (ordinal < 0 || ordinal > 0xffff) { 407 PyErr_SetString(PyExc_ValueError, 408 "unichr() arg not in range(0x10000) " 409 "(narrow Python build)"); 410 return NULL; 411 } 412#endif 413 414 if (ordinal <= 0xffff) { 415 /* UCS-2 character */ 416 s[0] = (Py_UNICODE) ordinal; 417 return PyUnicode_FromUnicode(s, 1); 418 } 419 else { 420#ifndef Py_UNICODE_WIDE 421 /* UCS-4 character. store as two surrogate characters */ 422 ordinal -= 0x10000L; 423 s[0] = 0xD800 + (Py_UNICODE) (ordinal >> 10); 424 s[1] = 0xDC00 + (Py_UNICODE) (ordinal & 0x03FF); 425 return PyUnicode_FromUnicode(s, 2); 426#else 427 s[0] = (Py_UNICODE)ordinal; 428 return PyUnicode_FromUnicode(s, 1); 429#endif 430 } 431} 432 433PyObject *PyUnicode_FromObject(register PyObject *obj) 434{ 435 /* XXX Perhaps we should make this API an alias of 436 PyObject_Unicode() instead ?! */ 437 if (PyUnicode_CheckExact(obj)) { 438 Py_INCREF(obj); 439 return obj; 440 } 441 if (PyUnicode_Check(obj)) { 442 /* For a Unicode subtype that's not a Unicode object, 443 return a true Unicode object with the same data. */ 444 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj), 445 PyUnicode_GET_SIZE(obj)); 446 } 447 return PyUnicode_FromEncodedObject(obj, NULL, "strict"); 448} 449 450PyObject *PyUnicode_FromEncodedObject(register PyObject *obj, 451 const char *encoding, 452 const char *errors) 453{ 454 const char *s = NULL; 455 int len; 456 PyObject *v; 457 458 if (obj == NULL) { 459 PyErr_BadInternalCall(); 460 return NULL; 461 } 462 463#if 0 464 /* For b/w compatibility we also accept Unicode objects provided 465 that no encodings is given and then redirect to 466 PyObject_Unicode() which then applies the additional logic for 467 Unicode subclasses. 468 469 NOTE: This API should really only be used for object which 470 represent *encoded* Unicode ! 471 472 */ 473 if (PyUnicode_Check(obj)) { 474 if (encoding) { 475 PyErr_SetString(PyExc_TypeError, 476 "decoding Unicode is not supported"); 477 return NULL; 478 } 479 return PyObject_Unicode(obj); 480 } 481#else 482 if (PyUnicode_Check(obj)) { 483 PyErr_SetString(PyExc_TypeError, 484 "decoding Unicode is not supported"); 485 return NULL; 486 } 487#endif 488 489 /* Coerce object */ 490 if (PyString_Check(obj)) { 491 s = PyString_AS_STRING(obj); 492 len = PyString_GET_SIZE(obj); 493 } 494 else if (PyObject_AsCharBuffer(obj, &s, &len)) { 495 /* Overwrite the error message with something more useful in 496 case of a TypeError. */ 497 if (PyErr_ExceptionMatches(PyExc_TypeError)) 498 PyErr_Format(PyExc_TypeError, 499 "coercing to Unicode: need string or buffer, " 500 "%.80s found", 501 obj->ob_type->tp_name); 502 goto onError; 503 } 504 505 /* Convert to Unicode */ 506 if (len == 0) { 507 Py_INCREF(unicode_empty); 508 v = (PyObject *)unicode_empty; 509 } 510 else 511 v = PyUnicode_Decode(s, len, encoding, errors); 512 513 return v; 514 515 onError: 516 return NULL; 517} 518 519PyObject *PyUnicode_Decode(const char *s, 520 int size, 521 const char *encoding, 522 const char *errors) 523{ 524 PyObject *buffer = NULL, *unicode; 525 526 if (encoding == NULL) 527 encoding = PyUnicode_GetDefaultEncoding(); 528 529 /* Shortcuts for common default encodings */ 530 if (strcmp(encoding, "utf-8") == 0) 531 return PyUnicode_DecodeUTF8(s, size, errors); 532 else if (strcmp(encoding, "latin-1") == 0) 533 return PyUnicode_DecodeLatin1(s, size, errors); 534 else if (strcmp(encoding, "ascii") == 0) 535 return PyUnicode_DecodeASCII(s, size, errors); 536 537 /* Decode via the codec registry */ 538 buffer = PyBuffer_FromMemory((void *)s, size); 539 if (buffer == NULL) 540 goto onError; 541 unicode = PyCodec_Decode(buffer, encoding, errors); 542 if (unicode == NULL) 543 goto onError; 544 if (!PyUnicode_Check(unicode)) { 545 PyErr_Format(PyExc_TypeError, 546 "decoder did not return an unicode object (type=%.400s)", 547 unicode->ob_type->tp_name); 548 Py_DECREF(unicode); 549 goto onError; 550 } 551 Py_DECREF(buffer); 552 return unicode; 553 554 onError: 555 Py_XDECREF(buffer); 556 return NULL; 557} 558 559PyObject *PyUnicode_Encode(const Py_UNICODE *s, 560 int size, 561 const char *encoding, 562 const char *errors) 563{ 564 PyObject *v, *unicode; 565 566 unicode = PyUnicode_FromUnicode(s, size); 567 if (unicode == NULL) 568 return NULL; 569 v = PyUnicode_AsEncodedString(unicode, encoding, errors); 570 Py_DECREF(unicode); 571 return v; 572} 573 574PyObject *PyUnicode_AsEncodedString(PyObject *unicode, 575 const char *encoding, 576 const char *errors) 577{ 578 PyObject *v; 579 580 if (!PyUnicode_Check(unicode)) { 581 PyErr_BadArgument(); 582 goto onError; 583 } 584 585 if (encoding == NULL) 586 encoding = PyUnicode_GetDefaultEncoding(); 587 588 /* Shortcuts for common default encodings */ 589 if (errors == NULL) { 590 if (strcmp(encoding, "utf-8") == 0) 591 return PyUnicode_AsUTF8String(unicode); 592 else if (strcmp(encoding, "latin-1") == 0) 593 return PyUnicode_AsLatin1String(unicode); 594 else if (strcmp(encoding, "ascii") == 0) 595 return PyUnicode_AsASCIIString(unicode); 596 } 597 598 /* Encode via the codec registry */ 599 v = PyCodec_Encode(unicode, encoding, errors); 600 if (v == NULL) 601 goto onError; 602 /* XXX Should we really enforce this ? */ 603 if (!PyString_Check(v)) { 604 PyErr_Format(PyExc_TypeError, 605 "encoder did not return a string object (type=%.400s)", 606 v->ob_type->tp_name); 607 Py_DECREF(v); 608 goto onError; 609 } 610 return v; 611 612 onError: 613 return NULL; 614} 615 616PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode, 617 const char *errors) 618{ 619 PyObject *v = ((PyUnicodeObject *)unicode)->defenc; 620 621 if (v) 622 return v; 623 v = PyUnicode_AsEncodedString(unicode, NULL, errors); 624 if (v && errors == NULL) 625 ((PyUnicodeObject *)unicode)->defenc = v; 626 return v; 627} 628 629Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode) 630{ 631 if (!PyUnicode_Check(unicode)) { 632 PyErr_BadArgument(); 633 goto onError; 634 } 635 return PyUnicode_AS_UNICODE(unicode); 636 637 onError: 638 return NULL; 639} 640 641int PyUnicode_GetSize(PyObject *unicode) 642{ 643 if (!PyUnicode_Check(unicode)) { 644 PyErr_BadArgument(); 645 goto onError; 646 } 647 return PyUnicode_GET_SIZE(unicode); 648 649 onError: 650 return -1; 651} 652 653const char *PyUnicode_GetDefaultEncoding(void) 654{ 655 return unicode_default_encoding; 656} 657 658int PyUnicode_SetDefaultEncoding(const char *encoding) 659{ 660 PyObject *v; 661 662 /* Make sure the encoding is valid. As side effect, this also 663 loads the encoding into the codec registry cache. */ 664 v = _PyCodec_Lookup(encoding); 665 if (v == NULL) 666 goto onError; 667 Py_DECREF(v); 668 strncpy(unicode_default_encoding, 669 encoding, 670 sizeof(unicode_default_encoding)); 671 return 0; 672 673 onError: 674 return -1; 675} 676 677/* error handling callback helper: 678 build arguments, call the callback and check the arguments, 679 if no exception occured, copy the replacement to the output 680 and adjust various state variables. 681 return 0 on success, -1 on error 682*/ 683 684static 685int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler, 686 const char *encoding, const char *reason, 687 const char *input, int insize, int *startinpos, int *endinpos, PyObject **exceptionObject, const char **inptr, 688 PyObject **output, int *outpos, Py_UNICODE **outptr) 689{ 690 static char *argparse = "O!i;decoding error handler must return (unicode, int) tuple"; 691 692 PyObject *restuple = NULL; 693 PyObject *repunicode = NULL; 694 int outsize = PyUnicode_GET_SIZE(*output); 695 int requiredsize; 696 int newpos; 697 Py_UNICODE *repptr; 698 int repsize; 699 int res = -1; 700 701 if (*errorHandler == NULL) { 702 *errorHandler = PyCodec_LookupError(errors); 703 if (*errorHandler == NULL) 704 goto onError; 705 } 706 707 if (*exceptionObject == NULL) { 708 *exceptionObject = PyUnicodeDecodeError_Create( 709 encoding, input, insize, *startinpos, *endinpos, reason); 710 if (*exceptionObject == NULL) 711 goto onError; 712 } 713 else { 714 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos)) 715 goto onError; 716 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos)) 717 goto onError; 718 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason)) 719 goto onError; 720 } 721 722 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL); 723 if (restuple == NULL) 724 goto onError; 725 if (!PyTuple_Check(restuple)) { 726 PyErr_Format(PyExc_TypeError, &argparse[4]); 727 goto onError; 728 } 729 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos)) 730 goto onError; 731 if (newpos<0) 732 newpos = insize+newpos; 733 if (newpos<0 || newpos>insize) { 734 PyErr_Format(PyExc_IndexError, "position %d from error handler out of bounds", newpos); 735 goto onError; 736 } 737 738 /* need more space? (at least enough for what we 739 have+the replacement+the rest of the string (starting 740 at the new input position), so we won't have to check space 741 when there are no errors in the rest of the string) */ 742 repptr = PyUnicode_AS_UNICODE(repunicode); 743 repsize = PyUnicode_GET_SIZE(repunicode); 744 requiredsize = *outpos + repsize + insize-newpos; 745 if (requiredsize > outsize) { 746 if (requiredsize<2*outsize) 747 requiredsize = 2*outsize; 748 if (PyUnicode_Resize(output, requiredsize)) 749 goto onError; 750 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos; 751 } 752 *endinpos = newpos; 753 *inptr = input + newpos; 754 Py_UNICODE_COPY(*outptr, repptr, repsize); 755 *outptr += repsize; 756 *outpos += repsize; 757 /* we made it! */ 758 res = 0; 759 760 onError: 761 Py_XDECREF(restuple); 762 return res; 763} 764 765/* --- UTF-7 Codec -------------------------------------------------------- */ 766 767/* see RFC2152 for details */ 768 769static 770char utf7_special[128] = { 771 /* indicate whether a UTF-7 character is special i.e. cannot be directly 772 encoded: 773 0 - not special 774 1 - special 775 2 - whitespace (optional) 776 3 - RFC2152 Set O (optional) */ 777 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1, 778 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 779 2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1, 780 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0, 781 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 782 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3, 783 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 784 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1, 785 786}; 787 788#define SPECIAL(c, encodeO, encodeWS) \ 789 (((c)>127 || utf7_special[(c)] == 1) || \ 790 (encodeWS && (utf7_special[(c)] == 2)) || \ 791 (encodeO && (utf7_special[(c)] == 3))) 792 793#define B64(n) ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f]) 794#define B64CHAR(c) (isalnum(c) || (c) == '+' || (c) == '/') 795#define UB64(c) ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ? \ 796 (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4) 797 798#define ENCODE(out, ch, bits) \ 799 while (bits >= 6) { \ 800 *out++ = B64(ch >> (bits-6)); \ 801 bits -= 6; \ 802 } 803 804#define DECODE(out, ch, bits, surrogate) \ 805 while (bits >= 16) { \ 806 Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff); \ 807 bits -= 16; \ 808 if (surrogate) { \ 809 /* We have already generated an error for the high surrogate 810 so let's not bother seeing if the low surrogate is correct or not */\ 811 surrogate = 0; \ 812 } else if (0xDC00 <= outCh && outCh <= 0xDFFF) { \ 813 /* This is a surrogate pair. Unfortunately we can't represent \ 814 it in a 16-bit character */ \ 815 surrogate = 1; \ 816 errmsg = "code pairs are not supported"; \ 817 goto utf7Error; \ 818 } else { \ 819 *out++ = outCh; \ 820 } \ 821 } \ 822 823PyObject *PyUnicode_DecodeUTF7(const char *s, 824 int size, 825 const char *errors) 826{ 827 const char *starts = s; 828 int startinpos; 829 int endinpos; 830 int outpos; 831 const char *e; 832 PyUnicodeObject *unicode; 833 Py_UNICODE *p; 834 const char *errmsg = ""; 835 int inShift = 0; 836 unsigned int bitsleft = 0; 837 unsigned long charsleft = 0; 838 int surrogate = 0; 839 PyObject *errorHandler = NULL; 840 PyObject *exc = NULL; 841 842 unicode = _PyUnicode_New(size); 843 if (!unicode) 844 return NULL; 845 if (size == 0) 846 return (PyObject *)unicode; 847 848 p = unicode->str; 849 e = s + size; 850 851 while (s < e) { 852 Py_UNICODE ch; 853 restart: 854 ch = *s; 855 856 if (inShift) { 857 if ((ch == '-') || !B64CHAR(ch)) { 858 inShift = 0; 859 s++; 860 861 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate); 862 if (bitsleft >= 6) { 863 /* The shift sequence has a partial character in it. If 864 bitsleft < 6 then we could just classify it as padding 865 but that is not the case here */ 866 867 errmsg = "partial character in shift sequence"; 868 goto utf7Error; 869 } 870 /* According to RFC2152 the remaining bits should be zero. We 871 choose to signal an error/insert a replacement character 872 here so indicate the potential of a misencoded character. */ 873 874 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */ 875 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) { 876 errmsg = "non-zero padding bits in shift sequence"; 877 goto utf7Error; 878 } 879 880 if (ch == '-') { 881 if ((s < e) && (*(s) == '-')) { 882 *p++ = '-'; 883 inShift = 1; 884 } 885 } else if (SPECIAL(ch,0,0)) { 886 errmsg = "unexpected special character"; 887 goto utf7Error; 888 } else { 889 *p++ = ch; 890 } 891 } else { 892 charsleft = (charsleft << 6) | UB64(ch); 893 bitsleft += 6; 894 s++; 895 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate); 896 } 897 } 898 else if ( ch == '+' ) { 899 startinpos = s-starts; 900 s++; 901 if (s < e && *s == '-') { 902 s++; 903 *p++ = '+'; 904 } else 905 { 906 inShift = 1; 907 bitsleft = 0; 908 } 909 } 910 else if (SPECIAL(ch,0,0)) { 911 errmsg = "unexpected special character"; 912 s++; 913 goto utf7Error; 914 } 915 else { 916 *p++ = ch; 917 s++; 918 } 919 continue; 920 utf7Error: 921 outpos = p-PyUnicode_AS_UNICODE(unicode); 922 endinpos = s-starts; 923 if (unicode_decode_call_errorhandler( 924 errors, &errorHandler, 925 "utf7", errmsg, 926 starts, size, &startinpos, &endinpos, &exc, &s, 927 (PyObject **)&unicode, &outpos, &p)) 928 goto onError; 929 } 930 931 if (inShift) { 932 outpos = p-PyUnicode_AS_UNICODE(unicode); 933 endinpos = size; 934 if (unicode_decode_call_errorhandler( 935 errors, &errorHandler, 936 "utf7", "unterminated shift sequence", 937 starts, size, &startinpos, &endinpos, &exc, &s, 938 (PyObject **)&unicode, &outpos, &p)) 939 goto onError; 940 if (s < e) 941 goto restart; 942 } 943 944 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode))) 945 goto onError; 946 947 Py_XDECREF(errorHandler); 948 Py_XDECREF(exc); 949 return (PyObject *)unicode; 950 951onError: 952 Py_XDECREF(errorHandler); 953 Py_XDECREF(exc); 954 Py_DECREF(unicode); 955 return NULL; 956} 957 958 959PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s, 960 int size, 961 int encodeSetO, 962 int encodeWhiteSpace, 963 const char *errors) 964{ 965 PyObject *v; 966 /* It might be possible to tighten this worst case */ 967 unsigned int cbAllocated = 5 * size; 968 int inShift = 0; 969 int i = 0; 970 unsigned int bitsleft = 0; 971 unsigned long charsleft = 0; 972 char * out; 973 char * start; 974 975 if (size == 0) 976 return PyString_FromStringAndSize(NULL, 0); 977 978 v = PyString_FromStringAndSize(NULL, cbAllocated); 979 if (v == NULL) 980 return NULL; 981 982 start = out = PyString_AS_STRING(v); 983 for (;i < size; ++i) { 984 Py_UNICODE ch = s[i]; 985 986 if (!inShift) { 987 if (ch == '+') { 988 *out++ = '+'; 989 *out++ = '-'; 990 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) { 991 charsleft = ch; 992 bitsleft = 16; 993 *out++ = '+'; 994 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft); 995 inShift = bitsleft > 0; 996 } else { 997 *out++ = (char) ch; 998 } 999 } else { 1000 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) { 1001 *out++ = B64(charsleft << (6-bitsleft)); 1002 charsleft = 0; 1003 bitsleft = 0; 1004 /* Characters not in the BASE64 set implicitly unshift the sequence 1005 so no '-' is required, except if the character is itself a '-' */ 1006 if (B64CHAR(ch) || ch == '-') { 1007 *out++ = '-'; 1008 } 1009 inShift = 0; 1010 *out++ = (char) ch; 1011 } else { 1012 bitsleft += 16; 1013 charsleft = (charsleft << 16) | ch; 1014 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft); 1015 1016 /* If the next character is special then we dont' need to terminate 1017 the shift sequence. If the next character is not a BASE64 character 1018 or '-' then the shift sequence will be terminated implicitly and we 1019 don't have to insert a '-'. */ 1020 1021 if (bitsleft == 0) { 1022 if (i + 1 < size) { 1023 Py_UNICODE ch2 = s[i+1]; 1024 1025 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) { 1026 1027 } else if (B64CHAR(ch2) || ch2 == '-') { 1028 *out++ = '-'; 1029 inShift = 0; 1030 } else { 1031 inShift = 0; 1032 } 1033 1034 } 1035 else { 1036 *out++ = '-'; 1037 inShift = 0; 1038 } 1039 } 1040 } 1041 } 1042 } 1043 if (bitsleft) { 1044 *out++= B64(charsleft << (6-bitsleft) ); 1045 *out++ = '-'; 1046 } 1047 1048 _PyString_Resize(&v, out - start); 1049 return v; 1050} 1051 1052#undef SPECIAL 1053#undef B64 1054#undef B64CHAR 1055#undef UB64 1056#undef ENCODE 1057#undef DECODE 1058 1059/* --- UTF-8 Codec -------------------------------------------------------- */ 1060 1061static 1062char utf8_code_length[256] = { 1063 /* Map UTF-8 encoded prefix byte to sequence length. zero means 1064 illegal prefix. see RFC 2279 for details */ 1065 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1066 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1067 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1068 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1069 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1070 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1071 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1072 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1073 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1074 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1075 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1076 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1077 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1078 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1079 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1080 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0 1081}; 1082 1083PyObject *PyUnicode_DecodeUTF8(const char *s, 1084 int size, 1085 const char *errors) 1086{ 1087 const char *starts = s; 1088 int n; 1089 int startinpos; 1090 int endinpos; 1091 int outpos; 1092 const char *e; 1093 PyUnicodeObject *unicode; 1094 Py_UNICODE *p; 1095 const char *errmsg = ""; 1096 PyObject *errorHandler = NULL; 1097 PyObject *exc = NULL; 1098 1099 /* Note: size will always be longer than the resulting Unicode 1100 character count */ 1101 unicode = _PyUnicode_New(size); 1102 if (!unicode) 1103 return NULL; 1104 if (size == 0) 1105 return (PyObject *)unicode; 1106 1107 /* Unpack UTF-8 encoded data */ 1108 p = unicode->str; 1109 e = s + size; 1110 1111 while (s < e) { 1112 Py_UCS4 ch = (unsigned char)*s; 1113 1114 if (ch < 0x80) { 1115 *p++ = (Py_UNICODE)ch; 1116 s++; 1117 continue; 1118 } 1119 1120 n = utf8_code_length[ch]; 1121 1122 if (s + n > e) { 1123 errmsg = "unexpected end of data"; 1124 startinpos = s-starts; 1125 endinpos = size; 1126 goto utf8Error; 1127 } 1128 1129 switch (n) { 1130 1131 case 0: 1132 errmsg = "unexpected code byte"; 1133 startinpos = s-starts; 1134 endinpos = startinpos+1; 1135 goto utf8Error; 1136 1137 case 1: 1138 errmsg = "internal error"; 1139 startinpos = s-starts; 1140 endinpos = startinpos+1; 1141 goto utf8Error; 1142 1143 case 2: 1144 if ((s[1] & 0xc0) != 0x80) { 1145 errmsg = "invalid data"; 1146 startinpos = s-starts; 1147 endinpos = startinpos+2; 1148 goto utf8Error; 1149 } 1150 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f); 1151 if (ch < 0x80) { 1152 startinpos = s-starts; 1153 endinpos = startinpos+2; 1154 errmsg = "illegal encoding"; 1155 goto utf8Error; 1156 } 1157 else 1158 *p++ = (Py_UNICODE)ch; 1159 break; 1160 1161 case 3: 1162 if ((s[1] & 0xc0) != 0x80 || 1163 (s[2] & 0xc0) != 0x80) { 1164 errmsg = "invalid data"; 1165 startinpos = s-starts; 1166 endinpos = startinpos+3; 1167 goto utf8Error; 1168 } 1169 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f); 1170 if (ch < 0x0800) { 1171 /* Note: UTF-8 encodings of surrogates are considered 1172 legal UTF-8 sequences; 1173 1174 XXX For wide builds (UCS-4) we should probably try 1175 to recombine the surrogates into a single code 1176 unit. 1177 */ 1178 errmsg = "illegal encoding"; 1179 startinpos = s-starts; 1180 endinpos = startinpos+3; 1181 goto utf8Error; 1182 } 1183 else 1184 *p++ = (Py_UNICODE)ch; 1185 break; 1186 1187 case 4: 1188 if ((s[1] & 0xc0) != 0x80 || 1189 (s[2] & 0xc0) != 0x80 || 1190 (s[3] & 0xc0) != 0x80) { 1191 errmsg = "invalid data"; 1192 startinpos = s-starts; 1193 endinpos = startinpos+4; 1194 goto utf8Error; 1195 } 1196 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) + 1197 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f); 1198 /* validate and convert to UTF-16 */ 1199 if ((ch < 0x10000) /* minimum value allowed for 4 1200 byte encoding */ 1201 || (ch > 0x10ffff)) /* maximum value allowed for 1202 UTF-16 */ 1203 { 1204 errmsg = "illegal encoding"; 1205 startinpos = s-starts; 1206 endinpos = startinpos+4; 1207 goto utf8Error; 1208 } 1209#ifdef Py_UNICODE_WIDE 1210 *p++ = (Py_UNICODE)ch; 1211#else 1212 /* compute and append the two surrogates: */ 1213 1214 /* translate from 10000..10FFFF to 0..FFFF */ 1215 ch -= 0x10000; 1216 1217 /* high surrogate = top 10 bits added to D800 */ 1218 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10)); 1219 1220 /* low surrogate = bottom 10 bits added to DC00 */ 1221 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF)); 1222#endif 1223 break; 1224 1225 default: 1226 /* Other sizes are only needed for UCS-4 */ 1227 errmsg = "unsupported Unicode code range"; 1228 startinpos = s-starts; 1229 endinpos = startinpos+n; 1230 goto utf8Error; 1231 } 1232 s += n; 1233 continue; 1234 1235 utf8Error: 1236 outpos = p-PyUnicode_AS_UNICODE(unicode); 1237 if (unicode_decode_call_errorhandler( 1238 errors, &errorHandler, 1239 "utf8", errmsg, 1240 starts, size, &startinpos, &endinpos, &exc, &s, 1241 (PyObject **)&unicode, &outpos, &p)) 1242 goto onError; 1243 } 1244 1245 /* Adjust length */ 1246 if (_PyUnicode_Resize(&unicode, p - unicode->str)) 1247 goto onError; 1248 1249 Py_XDECREF(errorHandler); 1250 Py_XDECREF(exc); 1251 return (PyObject *)unicode; 1252 1253onError: 1254 Py_XDECREF(errorHandler); 1255 Py_XDECREF(exc); 1256 Py_DECREF(unicode); 1257 return NULL; 1258} 1259 1260/* Allocation strategy: if the string is short, convert into a stack buffer 1261 and allocate exactly as much space needed at the end. Else allocate the 1262 maximum possible needed (4 result bytes per Unicode character), and return 1263 the excess memory at the end. 1264*/ 1265PyObject * 1266PyUnicode_EncodeUTF8(const Py_UNICODE *s, 1267 int size, 1268 const char *errors) 1269{ 1270#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */ 1271 1272 int i; /* index into s of next input byte */ 1273 PyObject *v; /* result string object */ 1274 char *p; /* next free byte in output buffer */ 1275 int nallocated; /* number of result bytes allocated */ 1276 int nneeded; /* number of result bytes needed */ 1277 char stackbuf[MAX_SHORT_UNICHARS * 4]; 1278 1279 assert(s != NULL); 1280 assert(size >= 0); 1281 1282 if (size <= MAX_SHORT_UNICHARS) { 1283 /* Write into the stack buffer; nallocated can't overflow. 1284 * At the end, we'll allocate exactly as much heap space as it 1285 * turns out we need. 1286 */ 1287 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int); 1288 v = NULL; /* will allocate after we're done */ 1289 p = stackbuf; 1290 } 1291 else { 1292 /* Overallocate on the heap, and give the excess back at the end. */ 1293 nallocated = size * 4; 1294 if (nallocated / 4 != size) /* overflow! */ 1295 return PyErr_NoMemory(); 1296 v = PyString_FromStringAndSize(NULL, nallocated); 1297 if (v == NULL) 1298 return NULL; 1299 p = PyString_AS_STRING(v); 1300 } 1301 1302 for (i = 0; i < size;) { 1303 Py_UCS4 ch = s[i++]; 1304 1305 if (ch < 0x80) 1306 /* Encode ASCII */ 1307 *p++ = (char) ch; 1308 1309 else if (ch < 0x0800) { 1310 /* Encode Latin-1 */ 1311 *p++ = (char)(0xc0 | (ch >> 6)); 1312 *p++ = (char)(0x80 | (ch & 0x3f)); 1313 } 1314 else { 1315 /* Encode UCS2 Unicode ordinals */ 1316 if (ch < 0x10000) { 1317 /* Special case: check for high surrogate */ 1318 if (0xD800 <= ch && ch <= 0xDBFF && i != size) { 1319 Py_UCS4 ch2 = s[i]; 1320 /* Check for low surrogate and combine the two to 1321 form a UCS4 value */ 1322 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) { 1323 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000; 1324 i++; 1325 goto encodeUCS4; 1326 } 1327 /* Fall through: handles isolated high surrogates */ 1328 } 1329 *p++ = (char)(0xe0 | (ch >> 12)); 1330 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); 1331 *p++ = (char)(0x80 | (ch & 0x3f)); 1332 continue; 1333 } 1334encodeUCS4: 1335 /* Encode UCS4 Unicode ordinals */ 1336 *p++ = (char)(0xf0 | (ch >> 18)); 1337 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f)); 1338 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); 1339 *p++ = (char)(0x80 | (ch & 0x3f)); 1340 } 1341 } 1342 1343 if (v == NULL) { 1344 /* This was stack allocated. */ 1345 nneeded = Py_SAFE_DOWNCAST(p - stackbuf, long, int); 1346 assert(nneeded <= nallocated); 1347 v = PyString_FromStringAndSize(stackbuf, nneeded); 1348 } 1349 else { 1350 /* Cut back to size actually needed. */ 1351 nneeded = Py_SAFE_DOWNCAST(p - PyString_AS_STRING(v), long, int); 1352 assert(nneeded <= nallocated); 1353 _PyString_Resize(&v, nneeded); 1354 } 1355 return v; 1356 1357#undef MAX_SHORT_UNICHARS 1358} 1359 1360PyObject *PyUnicode_AsUTF8String(PyObject *unicode) 1361{ 1362 if (!PyUnicode_Check(unicode)) { 1363 PyErr_BadArgument(); 1364 return NULL; 1365 } 1366 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode), 1367 PyUnicode_GET_SIZE(unicode), 1368 NULL); 1369} 1370 1371/* --- UTF-16 Codec ------------------------------------------------------- */ 1372 1373PyObject * 1374PyUnicode_DecodeUTF16(const char *s, 1375 int size, 1376 const char *errors, 1377 int *byteorder) 1378{ 1379 const char *starts = s; 1380 int startinpos; 1381 int endinpos; 1382 int outpos; 1383 PyUnicodeObject *unicode; 1384 Py_UNICODE *p; 1385 const unsigned char *q, *e; 1386 int bo = 0; /* assume native ordering by default */ 1387 const char *errmsg = ""; 1388 /* Offsets from q for retrieving byte pairs in the right order. */ 1389#ifdef BYTEORDER_IS_LITTLE_ENDIAN 1390 int ihi = 1, ilo = 0; 1391#else 1392 int ihi = 0, ilo = 1; 1393#endif 1394 PyObject *errorHandler = NULL; 1395 PyObject *exc = NULL; 1396 1397 /* Note: size will always be longer than the resulting Unicode 1398 character count */ 1399 unicode = _PyUnicode_New(size); 1400 if (!unicode) 1401 return NULL; 1402 if (size == 0) 1403 return (PyObject *)unicode; 1404 1405 /* Unpack UTF-16 encoded data */ 1406 p = unicode->str; 1407 q = (unsigned char *)s; 1408 e = q + size; 1409 1410 if (byteorder) 1411 bo = *byteorder; 1412 1413 /* Check for BOM marks (U+FEFF) in the input and adjust current 1414 byte order setting accordingly. In native mode, the leading BOM 1415 mark is skipped, in all other modes, it is copied to the output 1416 stream as-is (giving a ZWNBSP character). */ 1417 if (bo == 0) { 1418 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo]; 1419#ifdef BYTEORDER_IS_LITTLE_ENDIAN 1420 if (bom == 0xFEFF) { 1421 q += 2; 1422 bo = -1; 1423 } 1424 else if (bom == 0xFFFE) { 1425 q += 2; 1426 bo = 1; 1427 } 1428#else 1429 if (bom == 0xFEFF) { 1430 q += 2; 1431 bo = 1; 1432 } 1433 else if (bom == 0xFFFE) { 1434 q += 2; 1435 bo = -1; 1436 } 1437#endif 1438 } 1439 1440 if (bo == -1) { 1441 /* force LE */ 1442 ihi = 1; 1443 ilo = 0; 1444 } 1445 else if (bo == 1) { 1446 /* force BE */ 1447 ihi = 0; 1448 ilo = 1; 1449 } 1450 1451 while (q < e) { 1452 Py_UNICODE ch; 1453 /* remaing bytes at the end? (size should be even) */ 1454 if (e-q<2) { 1455 errmsg = "truncated data"; 1456 startinpos = ((const char *)q)-starts; 1457 endinpos = ((const char *)e)-starts; 1458 goto utf16Error; 1459 /* The remaining input chars are ignored if the callback 1460 chooses to skip the input */ 1461 } 1462 ch = (q[ihi] << 8) | q[ilo]; 1463 1464 q += 2; 1465 1466 if (ch < 0xD800 || ch > 0xDFFF) { 1467 *p++ = ch; 1468 continue; 1469 } 1470 1471 /* UTF-16 code pair: */ 1472 if (q >= e) { 1473 errmsg = "unexpected end of data"; 1474 startinpos = (((const char *)q)-2)-starts; 1475 endinpos = ((const char *)e)-starts; 1476 goto utf16Error; 1477 } 1478 if (0xD800 <= ch && ch <= 0xDBFF) { 1479 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo]; 1480 q += 2; 1481 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) { 1482#ifndef Py_UNICODE_WIDE 1483 *p++ = ch; 1484 *p++ = ch2; 1485#else 1486 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000; 1487#endif 1488 continue; 1489 } 1490 else { 1491 errmsg = "illegal UTF-16 surrogate"; 1492 startinpos = (((const char *)q)-4)-starts; 1493 endinpos = startinpos+2; 1494 goto utf16Error; 1495 } 1496 1497 } 1498 errmsg = "illegal encoding"; 1499 startinpos = (((const char *)q)-2)-starts; 1500 endinpos = startinpos+2; 1501 /* Fall through to report the error */ 1502 1503 utf16Error: 1504 outpos = p-PyUnicode_AS_UNICODE(unicode); 1505 if (unicode_decode_call_errorhandler( 1506 errors, &errorHandler, 1507 "utf16", errmsg, 1508 starts, size, &startinpos, &endinpos, &exc, (const char **)&q, 1509 (PyObject **)&unicode, &outpos, &p)) 1510 goto onError; 1511 } 1512 1513 if (byteorder) 1514 *byteorder = bo; 1515 1516 /* Adjust length */ 1517 if (_PyUnicode_Resize(&unicode, p - unicode->str)) 1518 goto onError; 1519 1520 Py_XDECREF(errorHandler); 1521 Py_XDECREF(exc); 1522 return (PyObject *)unicode; 1523 1524onError: 1525 Py_DECREF(unicode); 1526 Py_XDECREF(errorHandler); 1527 Py_XDECREF(exc); 1528 return NULL; 1529} 1530 1531PyObject * 1532PyUnicode_EncodeUTF16(const Py_UNICODE *s, 1533 int size, 1534 const char *errors, 1535 int byteorder) 1536{ 1537 PyObject *v; 1538 unsigned char *p; 1539 int i, pairs; 1540 /* Offsets from p for storing byte pairs in the right order. */ 1541#ifdef BYTEORDER_IS_LITTLE_ENDIAN 1542 int ihi = 1, ilo = 0; 1543#else 1544 int ihi = 0, ilo = 1; 1545#endif 1546 1547#define STORECHAR(CH) \ 1548 do { \ 1549 p[ihi] = ((CH) >> 8) & 0xff; \ 1550 p[ilo] = (CH) & 0xff; \ 1551 p += 2; \ 1552 } while(0) 1553 1554 for (i = pairs = 0; i < size; i++) 1555 if (s[i] >= 0x10000) 1556 pairs++; 1557 v = PyString_FromStringAndSize(NULL, 1558 2 * (size + pairs + (byteorder == 0))); 1559 if (v == NULL) 1560 return NULL; 1561 1562 p = (unsigned char *)PyString_AS_STRING(v); 1563 if (byteorder == 0) 1564 STORECHAR(0xFEFF); 1565 if (size == 0) 1566 return v; 1567 1568 if (byteorder == -1) { 1569 /* force LE */ 1570 ihi = 1; 1571 ilo = 0; 1572 } 1573 else if (byteorder == 1) { 1574 /* force BE */ 1575 ihi = 0; 1576 ilo = 1; 1577 } 1578 1579 while (size-- > 0) { 1580 Py_UNICODE ch = *s++; 1581 Py_UNICODE ch2 = 0; 1582 if (ch >= 0x10000) { 1583 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF); 1584 ch = 0xD800 | ((ch-0x10000) >> 10); 1585 } 1586 STORECHAR(ch); 1587 if (ch2) 1588 STORECHAR(ch2); 1589 } 1590 return v; 1591#undef STORECHAR 1592} 1593 1594PyObject *PyUnicode_AsUTF16String(PyObject *unicode) 1595{ 1596 if (!PyUnicode_Check(unicode)) { 1597 PyErr_BadArgument(); 1598 return NULL; 1599 } 1600 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode), 1601 PyUnicode_GET_SIZE(unicode), 1602 NULL, 1603 0); 1604} 1605 1606/* --- Unicode Escape Codec ----------------------------------------------- */ 1607 1608static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL; 1609 1610PyObject *PyUnicode_DecodeUnicodeEscape(const char *s, 1611 int size, 1612 const char *errors) 1613{ 1614 const char *starts = s; 1615 int startinpos; 1616 int endinpos; 1617 int outpos; 1618 int i; 1619 PyUnicodeObject *v; 1620 Py_UNICODE *p; 1621 const char *end; 1622 char* message; 1623 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */ 1624 PyObject *errorHandler = NULL; 1625 PyObject *exc = NULL; 1626 1627 /* Escaped strings will always be longer than the resulting 1628 Unicode string, so we start with size here and then reduce the 1629 length after conversion to the true value. 1630 (but if the error callback returns a long replacement string 1631 we'll have to allocate more space) */ 1632 v = _PyUnicode_New(size); 1633 if (v == NULL) 1634 goto onError; 1635 if (size == 0) 1636 return (PyObject *)v; 1637 1638 p = PyUnicode_AS_UNICODE(v); 1639 end = s + size; 1640 1641 while (s < end) { 1642 unsigned char c; 1643 Py_UNICODE x; 1644 int digits; 1645 1646 /* Non-escape characters are interpreted as Unicode ordinals */ 1647 if (*s != '\\') { 1648 *p++ = (unsigned char) *s++; 1649 continue; 1650 } 1651 1652 startinpos = s-starts; 1653 /* \ - Escapes */ 1654 s++; 1655 switch (*s++) { 1656 1657 /* \x escapes */ 1658 case '\n': break; 1659 case '\\': *p++ = '\\'; break; 1660 case '\'': *p++ = '\''; break; 1661 case '\"': *p++ = '\"'; break; 1662 case 'b': *p++ = '\b'; break; 1663 case 'f': *p++ = '\014'; break; /* FF */ 1664 case 't': *p++ = '\t'; break; 1665 case 'n': *p++ = '\n'; break; 1666 case 'r': *p++ = '\r'; break; 1667 case 'v': *p++ = '\013'; break; /* VT */ 1668 case 'a': *p++ = '\007'; break; /* BEL, not classic C */ 1669 1670 /* \OOO (octal) escapes */ 1671 case '0': case '1': case '2': case '3': 1672 case '4': case '5': case '6': case '7': 1673 x = s[-1] - '0'; 1674 if ('0' <= *s && *s <= '7') { 1675 x = (x<<3) + *s++ - '0'; 1676 if ('0' <= *s && *s <= '7') 1677 x = (x<<3) + *s++ - '0'; 1678 } 1679 *p++ = x; 1680 break; 1681 1682 /* hex escapes */ 1683 /* \xXX */ 1684 case 'x': 1685 digits = 2; 1686 message = "truncated \\xXX escape"; 1687 goto hexescape; 1688 1689 /* \uXXXX */ 1690 case 'u': 1691 digits = 4; 1692 message = "truncated \\uXXXX escape"; 1693 goto hexescape; 1694 1695 /* \UXXXXXXXX */ 1696 case 'U': 1697 digits = 8; 1698 message = "truncated \\UXXXXXXXX escape"; 1699 hexescape: 1700 chr = 0; 1701 outpos = p-PyUnicode_AS_UNICODE(v); 1702 if (s+digits>end) { 1703 endinpos = size; 1704 if (unicode_decode_call_errorhandler( 1705 errors, &errorHandler, 1706 "unicodeescape", "end of string in escape sequence", 1707 starts, size, &startinpos, &endinpos, &exc, &s, 1708 (PyObject **)&v, &outpos, &p)) 1709 goto onError; 1710 goto nextByte; 1711 } 1712 for (i = 0; i < digits; ++i) { 1713 c = (unsigned char) s[i]; 1714 if (!isxdigit(c)) { 1715 endinpos = (s+i+1)-starts; 1716 if (unicode_decode_call_errorhandler( 1717 errors, &errorHandler, 1718 "unicodeescape", message, 1719 starts, size, &startinpos, &endinpos, &exc, &s, 1720 (PyObject **)&v, &outpos, &p)) 1721 goto onError; 1722 goto nextByte; 1723 } 1724 chr = (chr<<4) & ~0xF; 1725 if (c >= '0' && c <= '9') 1726 chr += c - '0'; 1727 else if (c >= 'a' && c <= 'f') 1728 chr += 10 + c - 'a'; 1729 else 1730 chr += 10 + c - 'A'; 1731 } 1732 s += i; 1733 if (chr == 0xffffffff) 1734 /* _decoding_error will have already written into the 1735 target buffer. */ 1736 break; 1737 store: 1738 /* when we get here, chr is a 32-bit unicode character */ 1739 if (chr <= 0xffff) 1740 /* UCS-2 character */ 1741 *p++ = (Py_UNICODE) chr; 1742 else if (chr <= 0x10ffff) { 1743 /* UCS-4 character. Either store directly, or as 1744 surrogate pair. */ 1745#ifdef Py_UNICODE_WIDE 1746 *p++ = chr; 1747#else 1748 chr -= 0x10000L; 1749 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10); 1750 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF); 1751#endif 1752 } else { 1753 endinpos = s-starts; 1754 outpos = p-PyUnicode_AS_UNICODE(v); 1755 if (unicode_decode_call_errorhandler( 1756 errors, &errorHandler, 1757 "unicodeescape", "illegal Unicode character", 1758 starts, size, &startinpos, &endinpos, &exc, &s, 1759 (PyObject **)&v, &outpos, &p)) 1760 goto onError; 1761 } 1762 break; 1763 1764 /* \N{name} */ 1765 case 'N': 1766 message = "malformed \\N character escape"; 1767 if (ucnhash_CAPI == NULL) { 1768 /* load the unicode data module */ 1769 PyObject *m, *v; 1770 m = PyImport_ImportModule("unicodedata"); 1771 if (m == NULL) 1772 goto ucnhashError; 1773 v = PyObject_GetAttrString(m, "ucnhash_CAPI"); 1774 Py_DECREF(m); 1775 if (v == NULL) 1776 goto ucnhashError; 1777 ucnhash_CAPI = PyCObject_AsVoidPtr(v); 1778 Py_DECREF(v); 1779 if (ucnhash_CAPI == NULL) 1780 goto ucnhashError; 1781 } 1782 if (*s == '{') { 1783 const char *start = s+1; 1784 /* look for the closing brace */ 1785 while (*s != '}' && s < end) 1786 s++; 1787 if (s > start && s < end && *s == '}') { 1788 /* found a name. look it up in the unicode database */ 1789 message = "unknown Unicode character name"; 1790 s++; 1791 if (ucnhash_CAPI->getcode(start, s-start-1, &chr)) 1792 goto store; 1793 } 1794 } 1795 endinpos = s-starts; 1796 outpos = p-PyUnicode_AS_UNICODE(v); 1797 if (unicode_decode_call_errorhandler( 1798 errors, &errorHandler, 1799 "unicodeescape", message, 1800 starts, size, &startinpos, &endinpos, &exc, &s, 1801 (PyObject **)&v, &outpos, &p)) 1802 goto onError; 1803 break; 1804 1805 default: 1806 if (s > end) { 1807 message = "\\ at end of string"; 1808 s--; 1809 endinpos = s-starts; 1810 outpos = p-PyUnicode_AS_UNICODE(v); 1811 if (unicode_decode_call_errorhandler( 1812 errors, &errorHandler, 1813 "unicodeescape", message, 1814 starts, size, &startinpos, &endinpos, &exc, &s, 1815 (PyObject **)&v, &outpos, &p)) 1816 goto onError; 1817 } 1818 else { 1819 *p++ = '\\'; 1820 *p++ = (unsigned char)s[-1]; 1821 } 1822 break; 1823 } 1824 nextByte: 1825 ; 1826 } 1827 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v)))) 1828 goto onError; 1829 return (PyObject *)v; 1830 1831ucnhashError: 1832 PyErr_SetString( 1833 PyExc_UnicodeError, 1834 "\\N escapes not supported (can't load unicodedata module)" 1835 ); 1836 Py_XDECREF(errorHandler); 1837 Py_XDECREF(exc); 1838 return NULL; 1839 1840onError: 1841 Py_XDECREF(v); 1842 Py_XDECREF(errorHandler); 1843 Py_XDECREF(exc); 1844 return NULL; 1845} 1846 1847/* Return a Unicode-Escape string version of the Unicode object. 1848 1849 If quotes is true, the string is enclosed in u"" or u'' quotes as 1850 appropriate. 1851 1852*/ 1853 1854static const Py_UNICODE *findchar(const Py_UNICODE *s, 1855 int size, 1856 Py_UNICODE ch); 1857 1858static 1859PyObject *unicodeescape_string(const Py_UNICODE *s, 1860 int size, 1861 int quotes) 1862{ 1863 PyObject *repr; 1864 char *p; 1865 1866 static const char *hexdigit = "0123456789abcdef"; 1867 1868 repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1); 1869 if (repr == NULL) 1870 return NULL; 1871 1872 p = PyString_AS_STRING(repr); 1873 1874 if (quotes) { 1875 *p++ = 'u'; 1876 *p++ = (findchar(s, size, '\'') && 1877 !findchar(s, size, '"')) ? '"' : '\''; 1878 } 1879 while (size-- > 0) { 1880 Py_UNICODE ch = *s++; 1881 1882 /* Escape quotes */ 1883 if (quotes && 1884 (ch == (Py_UNICODE) PyString_AS_STRING(repr)[1] || ch == '\\')) { 1885 *p++ = '\\'; 1886 *p++ = (char) ch; 1887 continue; 1888 } 1889 1890#ifdef Py_UNICODE_WIDE 1891 /* Map 21-bit characters to '\U00xxxxxx' */ 1892 else if (ch >= 0x10000) { 1893 int offset = p - PyString_AS_STRING(repr); 1894 1895 /* Resize the string if necessary */ 1896 if (offset + 12 > PyString_GET_SIZE(repr)) { 1897 if (_PyString_Resize(&repr, PyString_GET_SIZE(repr) + 100)) 1898 return NULL; 1899 p = PyString_AS_STRING(repr) + offset; 1900 } 1901 1902 *p++ = '\\'; 1903 *p++ = 'U'; 1904 *p++ = hexdigit[(ch >> 28) & 0x0000000F]; 1905 *p++ = hexdigit[(ch >> 24) & 0x0000000F]; 1906 *p++ = hexdigit[(ch >> 20) & 0x0000000F]; 1907 *p++ = hexdigit[(ch >> 16) & 0x0000000F]; 1908 *p++ = hexdigit[(ch >> 12) & 0x0000000F]; 1909 *p++ = hexdigit[(ch >> 8) & 0x0000000F]; 1910 *p++ = hexdigit[(ch >> 4) & 0x0000000F]; 1911 *p++ = hexdigit[ch & 0x0000000F]; 1912 continue; 1913 } 1914#endif 1915 /* Map UTF-16 surrogate pairs to Unicode \UXXXXXXXX escapes */ 1916 else if (ch >= 0xD800 && ch < 0xDC00) { 1917 Py_UNICODE ch2; 1918 Py_UCS4 ucs; 1919 1920 ch2 = *s++; 1921 size--; 1922 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) { 1923 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000; 1924 *p++ = '\\'; 1925 *p++ = 'U'; 1926 *p++ = hexdigit[(ucs >> 28) & 0x0000000F]; 1927 *p++ = hexdigit[(ucs >> 24) & 0x0000000F]; 1928 *p++ = hexdigit[(ucs >> 20) & 0x0000000F]; 1929 *p++ = hexdigit[(ucs >> 16) & 0x0000000F]; 1930 *p++ = hexdigit[(ucs >> 12) & 0x0000000F]; 1931 *p++ = hexdigit[(ucs >> 8) & 0x0000000F]; 1932 *p++ = hexdigit[(ucs >> 4) & 0x0000000F]; 1933 *p++ = hexdigit[ucs & 0x0000000F]; 1934 continue; 1935 } 1936 /* Fall through: isolated surrogates are copied as-is */ 1937 s--; 1938 size++; 1939 } 1940 1941 /* Map 16-bit characters to '\uxxxx' */ 1942 if (ch >= 256) { 1943 *p++ = '\\'; 1944 *p++ = 'u'; 1945 *p++ = hexdigit[(ch >> 12) & 0x000F]; 1946 *p++ = hexdigit[(ch >> 8) & 0x000F]; 1947 *p++ = hexdigit[(ch >> 4) & 0x000F]; 1948 *p++ = hexdigit[ch & 0x000F]; 1949 } 1950 1951 /* Map special whitespace to '\t', \n', '\r' */ 1952 else if (ch == '\t') { 1953 *p++ = '\\'; 1954 *p++ = 't'; 1955 } 1956 else if (ch == '\n') { 1957 *p++ = '\\'; 1958 *p++ = 'n'; 1959 } 1960 else if (ch == '\r') { 1961 *p++ = '\\'; 1962 *p++ = 'r'; 1963 } 1964 1965 /* Map non-printable US ASCII to '\xhh' */ 1966 else if (ch < ' ' || ch >= 0x7F) { 1967 *p++ = '\\'; 1968 *p++ = 'x'; 1969 *p++ = hexdigit[(ch >> 4) & 0x000F]; 1970 *p++ = hexdigit[ch & 0x000F]; 1971 } 1972 1973 /* Copy everything else as-is */ 1974 else 1975 *p++ = (char) ch; 1976 } 1977 if (quotes) 1978 *p++ = PyString_AS_STRING(repr)[1]; 1979 1980 *p = '\0'; 1981 _PyString_Resize(&repr, p - PyString_AS_STRING(repr)); 1982 return repr; 1983} 1984 1985PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s, 1986 int size) 1987{ 1988 return unicodeescape_string(s, size, 0); 1989} 1990 1991PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode) 1992{ 1993 if (!PyUnicode_Check(unicode)) { 1994 PyErr_BadArgument(); 1995 return NULL; 1996 } 1997 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode), 1998 PyUnicode_GET_SIZE(unicode)); 1999} 2000 2001/* --- Raw Unicode Escape Codec ------------------------------------------- */ 2002 2003PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s, 2004 int size, 2005 const char *errors) 2006{ 2007 const char *starts = s; 2008 int startinpos; 2009 int endinpos; 2010 int outpos; 2011 PyUnicodeObject *v; 2012 Py_UNICODE *p; 2013 const char *end; 2014 const char *bs; 2015 PyObject *errorHandler = NULL; 2016 PyObject *exc = NULL; 2017 2018 /* Escaped strings will always be longer than the resulting 2019 Unicode string, so we start with size here and then reduce the 2020 length after conversion to the true value. (But decoding error 2021 handler might have to resize the string) */ 2022 v = _PyUnicode_New(size); 2023 if (v == NULL) 2024 goto onError; 2025 if (size == 0) 2026 return (PyObject *)v; 2027 p = PyUnicode_AS_UNICODE(v); 2028 end = s + size; 2029 while (s < end) { 2030 unsigned char c; 2031 Py_UCS4 x; 2032 int i; 2033 int count; 2034 2035 /* Non-escape characters are interpreted as Unicode ordinals */ 2036 if (*s != '\\') { 2037 *p++ = (unsigned char)*s++; 2038 continue; 2039 } 2040 startinpos = s-starts; 2041 2042 /* \u-escapes are only interpreted iff the number of leading 2043 backslashes if odd */ 2044 bs = s; 2045 for (;s < end;) { 2046 if (*s != '\\') 2047 break; 2048 *p++ = (unsigned char)*s++; 2049 } 2050 if (((s - bs) & 1) == 0 || 2051 s >= end || 2052 (*s != 'u' && *s != 'U')) { 2053 continue; 2054 } 2055 p--; 2056 count = *s=='u' ? 4 : 8; 2057 s++; 2058 2059 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */ 2060 outpos = p-PyUnicode_AS_UNICODE(v); 2061 for (x = 0, i = 0; i < count; ++i, ++s) { 2062 c = (unsigned char)*s; 2063 if (!isxdigit(c)) { 2064 endinpos = s-starts; 2065 if (unicode_decode_call_errorhandler( 2066 errors, &errorHandler, 2067 "rawunicodeescape", "truncated \\uXXXX", 2068 starts, size, &startinpos, &endinpos, &exc, &s, 2069 (PyObject **)&v, &outpos, &p)) 2070 goto onError; 2071 goto nextByte; 2072 } 2073 x = (x<<4) & ~0xF; 2074 if (c >= '0' && c <= '9') 2075 x += c - '0'; 2076 else if (c >= 'a' && c <= 'f') 2077 x += 10 + c - 'a'; 2078 else 2079 x += 10 + c - 'A'; 2080 } 2081#ifndef Py_UNICODE_WIDE 2082 if (x > 0x10000) { 2083 if (unicode_decode_call_errorhandler( 2084 errors, &errorHandler, 2085 "rawunicodeescape", "\\Uxxxxxxxx out of range", 2086 starts, size, &startinpos, &endinpos, &exc, &s, 2087 (PyObject **)&v, &outpos, &p)) 2088 goto onError; 2089 } 2090#endif 2091 *p++ = x; 2092 nextByte: 2093 ; 2094 } 2095 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v)))) 2096 goto onError; 2097 Py_XDECREF(errorHandler); 2098 Py_XDECREF(exc); 2099 return (PyObject *)v; 2100 2101 onError: 2102 Py_XDECREF(v); 2103 Py_XDECREF(errorHandler); 2104 Py_XDECREF(exc); 2105 return NULL; 2106} 2107 2108PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s, 2109 int size) 2110{ 2111 PyObject *repr; 2112 char *p; 2113 char *q; 2114 2115 static const char *hexdigit = "0123456789abcdef"; 2116 2117#ifdef Py_UNICODE_WIDE 2118 repr = PyString_FromStringAndSize(NULL, 10 * size); 2119#else 2120 repr = PyString_FromStringAndSize(NULL, 6 * size); 2121#endif 2122 if (repr == NULL) 2123 return NULL; 2124 if (size == 0) 2125 return repr; 2126 2127 p = q = PyString_AS_STRING(repr); 2128 while (size-- > 0) { 2129 Py_UNICODE ch = *s++; 2130#ifdef Py_UNICODE_WIDE 2131 /* Map 32-bit characters to '\Uxxxxxxxx' */ 2132 if (ch >= 0x10000) { 2133 *p++ = '\\'; 2134 *p++ = 'U'; 2135 *p++ = hexdigit[(ch >> 28) & 0xf]; 2136 *p++ = hexdigit[(ch >> 24) & 0xf]; 2137 *p++ = hexdigit[(ch >> 20) & 0xf]; 2138 *p++ = hexdigit[(ch >> 16) & 0xf]; 2139 *p++ = hexdigit[(ch >> 12) & 0xf]; 2140 *p++ = hexdigit[(ch >> 8) & 0xf]; 2141 *p++ = hexdigit[(ch >> 4) & 0xf]; 2142 *p++ = hexdigit[ch & 15]; 2143 } 2144 else 2145#endif 2146 /* Map 16-bit characters to '\uxxxx' */ 2147 if (ch >= 256) { 2148 *p++ = '\\'; 2149 *p++ = 'u'; 2150 *p++ = hexdigit[(ch >> 12) & 0xf]; 2151 *p++ = hexdigit[(ch >> 8) & 0xf]; 2152 *p++ = hexdigit[(ch >> 4) & 0xf]; 2153 *p++ = hexdigit[ch & 15]; 2154 } 2155 /* Copy everything else as-is */ 2156 else 2157 *p++ = (char) ch; 2158 } 2159 *p = '\0'; 2160 _PyString_Resize(&repr, p - q); 2161 return repr; 2162} 2163 2164PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode) 2165{ 2166 if (!PyUnicode_Check(unicode)) { 2167 PyErr_BadArgument(); 2168 return NULL; 2169 } 2170 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode), 2171 PyUnicode_GET_SIZE(unicode)); 2172} 2173 2174/* --- Latin-1 Codec ------------------------------------------------------ */ 2175 2176PyObject *PyUnicode_DecodeLatin1(const char *s, 2177 int size, 2178 const char *errors) 2179{ 2180 PyUnicodeObject *v; 2181 Py_UNICODE *p; 2182 2183 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */ 2184 if (size == 1 && *(unsigned char*)s < 256) { 2185 Py_UNICODE r = *(unsigned char*)s; 2186 return PyUnicode_FromUnicode(&r, 1); 2187 } 2188 2189 v = _PyUnicode_New(size); 2190 if (v == NULL) 2191 goto onError; 2192 if (size == 0) 2193 return (PyObject *)v; 2194 p = PyUnicode_AS_UNICODE(v); 2195 while (size-- > 0) 2196 *p++ = (unsigned char)*s++; 2197 return (PyObject *)v; 2198 2199 onError: 2200 Py_XDECREF(v); 2201 return NULL; 2202} 2203 2204/* create or adjust a UnicodeEncodeError */ 2205static void make_encode_exception(PyObject **exceptionObject, 2206 const char *encoding, 2207 const Py_UNICODE *unicode, int size, 2208 int startpos, int endpos, 2209 const char *reason) 2210{ 2211 if (*exceptionObject == NULL) { 2212 *exceptionObject = PyUnicodeEncodeError_Create( 2213 encoding, unicode, size, startpos, endpos, reason); 2214 } 2215 else { 2216 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos)) 2217 goto onError; 2218 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos)) 2219 goto onError; 2220 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason)) 2221 goto onError; 2222 return; 2223 onError: 2224 Py_DECREF(*exceptionObject); 2225 *exceptionObject = NULL; 2226 } 2227} 2228 2229/* raises a UnicodeEncodeError */ 2230static void raise_encode_exception(PyObject **exceptionObject, 2231 const char *encoding, 2232 const Py_UNICODE *unicode, int size, 2233 int startpos, int endpos, 2234 const char *reason) 2235{ 2236 make_encode_exception(exceptionObject, 2237 encoding, unicode, size, startpos, endpos, reason); 2238 if (*exceptionObject != NULL) 2239 PyCodec_StrictErrors(*exceptionObject); 2240} 2241 2242/* error handling callback helper: 2243 build arguments, call the callback and check the arguments, 2244 put the result into newpos and return the replacement string, which 2245 has to be freed by the caller */ 2246static PyObject *unicode_encode_call_errorhandler(const char *errors, 2247 PyObject **errorHandler, 2248 const char *encoding, const char *reason, 2249 const Py_UNICODE *unicode, int size, PyObject **exceptionObject, 2250 int startpos, int endpos, 2251 int *newpos) 2252{ 2253 static char *argparse = "O!i;encoding error handler must return (unicode, int) tuple"; 2254 2255 PyObject *restuple; 2256 PyObject *resunicode; 2257 2258 if (*errorHandler == NULL) { 2259 *errorHandler = PyCodec_LookupError(errors); 2260 if (*errorHandler == NULL) 2261 return NULL; 2262 } 2263 2264 make_encode_exception(exceptionObject, 2265 encoding, unicode, size, startpos, endpos, reason); 2266 if (*exceptionObject == NULL) 2267 return NULL; 2268 2269 restuple = PyObject_CallFunctionObjArgs( 2270 *errorHandler, *exceptionObject, NULL); 2271 if (restuple == NULL) 2272 return NULL; 2273 if (!PyTuple_Check(restuple)) { 2274 PyErr_Format(PyExc_TypeError, &argparse[4]); 2275 Py_DECREF(restuple); 2276 return NULL; 2277 } 2278 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, 2279 &resunicode, newpos)) { 2280 Py_DECREF(restuple); 2281 return NULL; 2282 } 2283 if (*newpos<0) 2284 *newpos = size+*newpos; 2285 if (*newpos<0 || *newpos>size) { 2286 PyErr_Format(PyExc_IndexError, "position %d from error handler out of bounds", *newpos); 2287 Py_DECREF(restuple); 2288 return NULL; 2289 } 2290 Py_INCREF(resunicode); 2291 Py_DECREF(restuple); 2292 return resunicode; 2293} 2294 2295static PyObject *unicode_encode_ucs1(const Py_UNICODE *p, 2296 int size, 2297 const char *errors, 2298 int limit) 2299{ 2300 /* output object */ 2301 PyObject *res; 2302 /* pointers to the beginning and end+1 of input */ 2303 const Py_UNICODE *startp = p; 2304 const Py_UNICODE *endp = p + size; 2305 /* pointer to the beginning of the unencodable characters */ 2306 /* const Py_UNICODE *badp = NULL; */ 2307 /* pointer into the output */ 2308 char *str; 2309 /* current output position */ 2310 int respos = 0; 2311 int ressize; 2312 char *encoding = (limit == 256) ? "latin-1" : "ascii"; 2313 char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)"; 2314 PyObject *errorHandler = NULL; 2315 PyObject *exc = NULL; 2316 /* the following variable is used for caching string comparisons 2317 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */ 2318 int known_errorHandler = -1; 2319 2320 /* allocate enough for a simple encoding without 2321 replacements, if we need more, we'll resize */ 2322 res = PyString_FromStringAndSize(NULL, size); 2323 if (res == NULL) 2324 goto onError; 2325 if (size == 0) 2326 return res; 2327 str = PyString_AS_STRING(res); 2328 ressize = size; 2329 2330 while (p<endp) { 2331 Py_UNICODE c = *p; 2332 2333 /* can we encode this? */ 2334 if (c<limit) { 2335 /* no overflow check, because we know that the space is enough */ 2336 *str++ = (char)c; 2337 ++p; 2338 } 2339 else { 2340 int unicodepos = p-startp; 2341 int requiredsize; 2342 PyObject *repunicode; 2343 int repsize; 2344 int newpos; 2345 int respos; 2346 Py_UNICODE *uni2; 2347 /* startpos for collecting unencodable chars */ 2348 const Py_UNICODE *collstart = p; 2349 const Py_UNICODE *collend = p; 2350 /* find all unecodable characters */ 2351 while ((collend < endp) && ((*collend)>=limit)) 2352 ++collend; 2353 /* cache callback name lookup (if not done yet, i.e. it's the first error) */ 2354 if (known_errorHandler==-1) { 2355 if ((errors==NULL) || (!strcmp(errors, "strict"))) 2356 known_errorHandler = 1; 2357 else if (!strcmp(errors, "replace")) 2358 known_errorHandler = 2; 2359 else if (!strcmp(errors, "ignore")) 2360 known_errorHandler = 3; 2361 else if (!strcmp(errors, "xmlcharrefreplace")) 2362 known_errorHandler = 4; 2363 else 2364 known_errorHandler = 0; 2365 } 2366 switch (known_errorHandler) { 2367 case 1: /* strict */ 2368 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason); 2369 goto onError; 2370 case 2: /* replace */ 2371 while (collstart++<collend) 2372 *str++ = '?'; /* fall through */ 2373 case 3: /* ignore */ 2374 p = collend; 2375 break; 2376 case 4: /* xmlcharrefreplace */ 2377 respos = str-PyString_AS_STRING(res); 2378 /* determine replacement size (temporarily (mis)uses p) */ 2379 for (p = collstart, repsize = 0; p < collend; ++p) { 2380 if (*p<10) 2381 repsize += 2+1+1; 2382 else if (*p<100) 2383 repsize += 2+2+1; 2384 else if (*p<1000) 2385 repsize += 2+3+1; 2386 else if (*p<10000) 2387 repsize += 2+4+1; 2388 else if (*p<100000) 2389 repsize += 2+5+1; 2390 else if (*p<1000000) 2391 repsize += 2+6+1; 2392 else 2393 repsize += 2+7+1; 2394 } 2395 requiredsize = respos+repsize+(endp-collend); 2396 if (requiredsize > ressize) { 2397 if (requiredsize<2*ressize) 2398 requiredsize = 2*ressize; 2399 if (_PyString_Resize(&res, requiredsize)) 2400 goto onError; 2401 str = PyString_AS_STRING(res) + respos; 2402 ressize = requiredsize; 2403 } 2404 /* generate replacement (temporarily (mis)uses p) */ 2405 for (p = collstart; p < collend; ++p) { 2406 str += sprintf(str, "&#%d;", (int)*p); 2407 } 2408 p = collend; 2409 break; 2410 default: 2411 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler, 2412 encoding, reason, startp, size, &exc, 2413 collstart-startp, collend-startp, &newpos); 2414 if (repunicode == NULL) 2415 goto onError; 2416 /* need more space? (at least enough for what we 2417 have+the replacement+the rest of the string, so 2418 we won't have to check space for encodable characters) */ 2419 respos = str-PyString_AS_STRING(res); 2420 repsize = PyUnicode_GET_SIZE(repunicode); 2421 requiredsize = respos+repsize+(endp-collend); 2422 if (requiredsize > ressize) { 2423 if (requiredsize<2*ressize) 2424 requiredsize = 2*ressize; 2425 if (_PyString_Resize(&res, requiredsize)) { 2426 Py_DECREF(repunicode); 2427 goto onError; 2428 } 2429 str = PyString_AS_STRING(res) + respos; 2430 ressize = requiredsize; 2431 } 2432 /* check if there is anything unencodable in the replacement 2433 and copy it to the output */ 2434 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) { 2435 c = *uni2; 2436 if (c >= limit) { 2437 raise_encode_exception(&exc, encoding, startp, size, 2438 unicodepos, unicodepos+1, reason); 2439 Py_DECREF(repunicode); 2440 goto onError; 2441 } 2442 *str = (char)c; 2443 } 2444 p = startp + newpos; 2445 Py_DECREF(repunicode); 2446 } 2447 } 2448 } 2449 /* Resize if we allocated to much */ 2450 respos = str-PyString_AS_STRING(res); 2451 if (respos<ressize) 2452 /* If this falls res will be NULL */ 2453 _PyString_Resize(&res, respos); 2454 Py_XDECREF(errorHandler); 2455 Py_XDECREF(exc); 2456 return res; 2457 2458 onError: 2459 Py_XDECREF(res); 2460 Py_XDECREF(errorHandler); 2461 Py_XDECREF(exc); 2462 return NULL; 2463} 2464 2465PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p, 2466 int size, 2467 const char *errors) 2468{ 2469 return unicode_encode_ucs1(p, size, errors, 256); 2470} 2471 2472PyObject *PyUnicode_AsLatin1String(PyObject *unicode) 2473{ 2474 if (!PyUnicode_Check(unicode)) { 2475 PyErr_BadArgument(); 2476 return NULL; 2477 } 2478 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode), 2479 PyUnicode_GET_SIZE(unicode), 2480 NULL); 2481} 2482 2483/* --- 7-bit ASCII Codec -------------------------------------------------- */ 2484 2485PyObject *PyUnicode_DecodeASCII(const char *s, 2486 int size, 2487 const char *errors) 2488{ 2489 const char *starts = s; 2490 PyUnicodeObject *v; 2491 Py_UNICODE *p; 2492 int startinpos; 2493 int endinpos; 2494 int outpos; 2495 const char *e; 2496 PyObject *errorHandler = NULL; 2497 PyObject *exc = NULL; 2498 2499 /* ASCII is equivalent to the first 128 ordinals in Unicode. */ 2500 if (size == 1 && *(unsigned char*)s < 128) { 2501 Py_UNICODE r = *(unsigned char*)s; 2502 return PyUnicode_FromUnicode(&r, 1); 2503 } 2504 2505 v = _PyUnicode_New(size); 2506 if (v == NULL) 2507 goto onError; 2508 if (size == 0) 2509 return (PyObject *)v; 2510 p = PyUnicode_AS_UNICODE(v); 2511 e = s + size; 2512 while (s < e) { 2513 register unsigned char c = (unsigned char)*s; 2514 if (c < 128) { 2515 *p++ = c; 2516 ++s; 2517 } 2518 else { 2519 startinpos = s-starts; 2520 endinpos = startinpos + 1; 2521 outpos = p-PyUnicode_AS_UNICODE(v); 2522 if (unicode_decode_call_errorhandler( 2523 errors, &errorHandler, 2524 "ascii", "ordinal not in range(128)", 2525 starts, size, &startinpos, &endinpos, &exc, &s, 2526 (PyObject **)&v, &outpos, &p)) 2527 goto onError; 2528 } 2529 } 2530 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v)) 2531 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v)))) 2532 goto onError; 2533 Py_XDECREF(errorHandler); 2534 Py_XDECREF(exc); 2535 return (PyObject *)v; 2536 2537 onError: 2538 Py_XDECREF(v); 2539 Py_XDECREF(errorHandler); 2540 Py_XDECREF(exc); 2541 return NULL; 2542} 2543 2544PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p, 2545 int size, 2546 const char *errors) 2547{ 2548 return unicode_encode_ucs1(p, size, errors, 128); 2549} 2550 2551PyObject *PyUnicode_AsASCIIString(PyObject *unicode) 2552{ 2553 if (!PyUnicode_Check(unicode)) { 2554 PyErr_BadArgument(); 2555 return NULL; 2556 } 2557 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode), 2558 PyUnicode_GET_SIZE(unicode), 2559 NULL); 2560} 2561 2562#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T) 2563 2564/* --- MBCS codecs for Windows -------------------------------------------- */ 2565 2566PyObject *PyUnicode_DecodeMBCS(const char *s, 2567 int size, 2568 const char *errors) 2569{ 2570 PyUnicodeObject *v; 2571 Py_UNICODE *p; 2572 2573 /* First get the size of the result */ 2574 DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0); 2575 if (size > 0 && usize==0) 2576 return PyErr_SetFromWindowsErrWithFilename(0, NULL); 2577 2578 v = _PyUnicode_New(usize); 2579 if (v == NULL) 2580 return NULL; 2581 if (usize == 0) 2582 return (PyObject *)v; 2583 p = PyUnicode_AS_UNICODE(v); 2584 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) { 2585 Py_DECREF(v); 2586 return PyErr_SetFromWindowsErrWithFilename(0, NULL); 2587 } 2588 2589 return (PyObject *)v; 2590} 2591 2592PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p, 2593 int size, 2594 const char *errors) 2595{ 2596 PyObject *repr; 2597 char *s; 2598 DWORD mbcssize; 2599 2600 /* If there are no characters, bail now! */ 2601 if (size==0) 2602 return PyString_FromString(""); 2603 2604 /* First get the size of the result */ 2605 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL); 2606 if (mbcssize==0) 2607 return PyErr_SetFromWindowsErrWithFilename(0, NULL); 2608 2609 repr = PyString_FromStringAndSize(NULL, mbcssize); 2610 if (repr == NULL) 2611 return NULL; 2612 if (mbcssize == 0) 2613 return repr; 2614 2615 /* Do the conversion */ 2616 s = PyString_AS_STRING(repr); 2617 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) { 2618 Py_DECREF(repr); 2619 return PyErr_SetFromWindowsErrWithFilename(0, NULL); 2620 } 2621 return repr; 2622} 2623 2624#endif /* MS_WINDOWS */ 2625 2626/* --- Character Mapping Codec -------------------------------------------- */ 2627 2628PyObject *PyUnicode_DecodeCharmap(const char *s, 2629 int size, 2630 PyObject *mapping, 2631 const char *errors) 2632{ 2633 const char *starts = s; 2634 int startinpos; 2635 int endinpos; 2636 int outpos; 2637 const char *e; 2638 PyUnicodeObject *v; 2639 Py_UNICODE *p; 2640 int extrachars = 0; 2641 PyObject *errorHandler = NULL; 2642 PyObject *exc = NULL; 2643 2644 /* Default to Latin-1 */ 2645 if (mapping == NULL) 2646 return PyUnicode_DecodeLatin1(s, size, errors); 2647 2648 v = _PyUnicode_New(size); 2649 if (v == NULL) 2650 goto onError; 2651 if (size == 0) 2652 return (PyObject *)v; 2653 p = PyUnicode_AS_UNICODE(v); 2654 e = s + size; 2655 while (s < e) { 2656 unsigned char ch = *s; 2657 PyObject *w, *x; 2658 2659 /* Get mapping (char ordinal -> integer, Unicode char or None) */ 2660 w = PyInt_FromLong((long)ch); 2661 if (w == NULL) 2662 goto onError; 2663 x = PyObject_GetItem(mapping, w); 2664 Py_DECREF(w); 2665 if (x == NULL) { 2666 if (PyErr_ExceptionMatches(PyExc_LookupError)) { 2667 /* No mapping found means: mapping is undefined. */ 2668 PyErr_Clear(); 2669 x = Py_None; 2670 Py_INCREF(x); 2671 } else 2672 goto onError; 2673 } 2674 2675 /* Apply mapping */ 2676 if (PyInt_Check(x)) { 2677 long value = PyInt_AS_LONG(x); 2678 if (value < 0 || value > 65535) { 2679 PyErr_SetString(PyExc_TypeError, 2680 "character mapping must be in range(65536)"); 2681 Py_DECREF(x); 2682 goto onError; 2683 } 2684 *p++ = (Py_UNICODE)value; 2685 } 2686 else if (x == Py_None) { 2687 /* undefined mapping */ 2688 outpos = p-PyUnicode_AS_UNICODE(v); 2689 startinpos = s-starts; 2690 endinpos = startinpos+1; 2691 if (unicode_decode_call_errorhandler( 2692 errors, &errorHandler, 2693 "charmap", "character maps to <undefined>", 2694 starts, size, &startinpos, &endinpos, &exc, &s, 2695 (PyObject **)&v, &outpos, &p)) { 2696 Py_DECREF(x); 2697 goto onError; 2698 } 2699 continue; 2700 } 2701 else if (PyUnicode_Check(x)) { 2702 int targetsize = PyUnicode_GET_SIZE(x); 2703 2704 if (targetsize == 1) 2705 /* 1-1 mapping */ 2706 *p++ = *PyUnicode_AS_UNICODE(x); 2707 2708 else if (targetsize > 1) { 2709 /* 1-n mapping */ 2710 if (targetsize > extrachars) { 2711 /* resize first */ 2712 int oldpos = (int)(p - PyUnicode_AS_UNICODE(v)); 2713 int needed = (targetsize - extrachars) + \ 2714 (targetsize << 2); 2715 extrachars += needed; 2716 if (_PyUnicode_Resize(&v, 2717 PyUnicode_GET_SIZE(v) + needed)) { 2718 Py_DECREF(x); 2719 goto onError; 2720 } 2721 p = PyUnicode_AS_UNICODE(v) + oldpos; 2722 } 2723 Py_UNICODE_COPY(p, 2724 PyUnicode_AS_UNICODE(x), 2725 targetsize); 2726 p += targetsize; 2727 extrachars -= targetsize; 2728 } 2729 /* 1-0 mapping: skip the character */ 2730 } 2731 else { 2732 /* wrong return value */ 2733 PyErr_SetString(PyExc_TypeError, 2734 "character mapping must return integer, None or unicode"); 2735 Py_DECREF(x); 2736 goto onError; 2737 } 2738 Py_DECREF(x); 2739 ++s; 2740 } 2741 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v)) 2742 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v)))) 2743 goto onError; 2744 Py_XDECREF(errorHandler); 2745 Py_XDECREF(exc); 2746 return (PyObject *)v; 2747 2748 onError: 2749 Py_XDECREF(errorHandler); 2750 Py_XDECREF(exc); 2751 Py_XDECREF(v); 2752 return NULL; 2753} 2754 2755/* Lookup the character ch in the mapping. If the character 2756 can't be found, Py_None is returned (or NULL, if another 2757 error occured). */ 2758static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping) 2759{ 2760 PyObject *w = PyInt_FromLong((long)c); 2761 PyObject *x; 2762 2763 if (w == NULL) 2764 return NULL; 2765 x = PyObject_GetItem(mapping, w); 2766 Py_DECREF(w); 2767 if (x == NULL) { 2768 if (PyErr_ExceptionMatches(PyExc_LookupError)) { 2769 /* No mapping found means: mapping is undefined. */ 2770 PyErr_Clear(); 2771 x = Py_None; 2772 Py_INCREF(x); 2773 return x; 2774 } else 2775 return NULL; 2776 } 2777 else if (x == Py_None) 2778 return x; 2779 else if (PyInt_Check(x)) { 2780 long value = PyInt_AS_LONG(x); 2781 if (value < 0 || value > 255) { 2782 PyErr_SetString(PyExc_TypeError, 2783 "character mapping must be in range(256)"); 2784 Py_DECREF(x); 2785 return NULL; 2786 } 2787 return x; 2788 } 2789 else if (PyString_Check(x)) 2790 return x; 2791 else { 2792 /* wrong return value */ 2793 PyErr_SetString(PyExc_TypeError, 2794 "character mapping must return integer, None or str"); 2795 Py_DECREF(x); 2796 return NULL; 2797 } 2798} 2799 2800/* lookup the character, put the result in the output string and adjust 2801 various state variables. Reallocate the output string if not enough 2802 space is available. Return a new reference to the object that 2803 was put in the output buffer, or Py_None, if the mapping was undefined 2804 (in which case no character was written) or NULL, if a 2805 reallocation error ocurred. The called must decref the result */ 2806static 2807PyObject *charmapencode_output(Py_UNICODE c, PyObject *mapping, 2808 PyObject **outobj, int *outpos) 2809{ 2810 PyObject *rep = charmapencode_lookup(c, mapping); 2811 2812 if (rep==NULL) 2813 return NULL; 2814 else if (rep==Py_None) 2815 return rep; 2816 else { 2817 char *outstart = PyString_AS_STRING(*outobj); 2818 int outsize = PyString_GET_SIZE(*outobj); 2819 if (PyInt_Check(rep)) { 2820 int requiredsize = *outpos+1; 2821 if (outsize<requiredsize) { 2822 /* exponentially overallocate to minimize reallocations */ 2823 if (requiredsize < 2*outsize) 2824 requiredsize = 2*outsize; 2825 if (_PyString_Resize(outobj, requiredsize)) { 2826 Py_DECREF(rep); 2827 return NULL; 2828 } 2829 outstart = PyString_AS_STRING(*outobj); 2830 } 2831 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep); 2832 } 2833 else { 2834 const char *repchars = PyString_AS_STRING(rep); 2835 int repsize = PyString_GET_SIZE(rep); 2836 int requiredsize = *outpos+repsize; 2837 if (outsize<requiredsize) { 2838 /* exponentially overallocate to minimize reallocations */ 2839 if (requiredsize < 2*outsize) 2840 requiredsize = 2*outsize; 2841 if (_PyString_Resize(outobj, requiredsize)) { 2842 Py_DECREF(rep); 2843 return NULL; 2844 } 2845 outstart = PyString_AS_STRING(*outobj); 2846 } 2847 memcpy(outstart + *outpos, repchars, repsize); 2848 *outpos += repsize; 2849 } 2850 } 2851 return rep; 2852} 2853 2854/* handle an error in PyUnicode_EncodeCharmap 2855 Return 0 on success, -1 on error */ 2856static 2857int charmap_encoding_error( 2858 const Py_UNICODE *p, int size, int *inpos, PyObject *mapping, 2859 PyObject **exceptionObject, 2860 int *known_errorHandler, PyObject *errorHandler, const char *errors, 2861 PyObject **res, int *respos) 2862{ 2863 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */ 2864 int repsize; 2865 int newpos; 2866 Py_UNICODE *uni2; 2867 /* startpos for collecting unencodable chars */ 2868 int collstartpos = *inpos; 2869 int collendpos = *inpos+1; 2870 int collpos; 2871 char *encoding = "charmap"; 2872 char *reason = "character maps to <undefined>"; 2873 2874 PyObject *x; 2875 /* find all unencodable characters */ 2876 while (collendpos < size) { 2877 x = charmapencode_lookup(p[collendpos], mapping); 2878 if (x==NULL) 2879 return -1; 2880 else if (x!=Py_None) { 2881 Py_DECREF(x); 2882 break; 2883 } 2884 Py_DECREF(x); 2885 ++collendpos; 2886 } 2887 /* cache callback name lookup 2888 * (if not done yet, i.e. it's the first error) */ 2889 if (*known_errorHandler==-1) { 2890 if ((errors==NULL) || (!strcmp(errors, "strict"))) 2891 *known_errorHandler = 1; 2892 else if (!strcmp(errors, "replace")) 2893 *known_errorHandler = 2; 2894 else if (!strcmp(errors, "ignore")) 2895 *known_errorHandler = 3; 2896 else if (!strcmp(errors, "xmlcharrefreplace")) 2897 *known_errorHandler = 4; 2898 else 2899 *known_errorHandler = 0; 2900 } 2901 switch (*known_errorHandler) { 2902 case 1: /* strict */ 2903 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason); 2904 return -1; 2905 case 2: /* replace */ 2906 for (collpos = collstartpos; collpos<collendpos; ++collpos) { 2907 x = charmapencode_output('?', mapping, res, respos); 2908 if (x==NULL) { 2909 return -1; 2910 } 2911 else if (x==Py_None) { 2912 Py_DECREF(x); 2913 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason); 2914 return -1; 2915 } 2916 Py_DECREF(x); 2917 } 2918 /* fall through */ 2919 case 3: /* ignore */ 2920 *inpos = collendpos; 2921 break; 2922 case 4: /* xmlcharrefreplace */ 2923 /* generate replacement (temporarily (mis)uses p) */ 2924 for (collpos = collstartpos; collpos < collendpos; ++collpos) { 2925 char buffer[2+29+1+1]; 2926 char *cp; 2927 sprintf(buffer, "&#%d;", (int)p[collpos]); 2928 for (cp = buffer; *cp; ++cp) { 2929 x = charmapencode_output(*cp, mapping, res, respos); 2930 if (x==NULL) 2931 return -1; 2932 else if (x==Py_None) { 2933 Py_DECREF(x); 2934 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason); 2935 return -1; 2936 } 2937 Py_DECREF(x); 2938 } 2939 } 2940 *inpos = collendpos; 2941 break; 2942 default: 2943 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler, 2944 encoding, reason, p, size, exceptionObject, 2945 collstartpos, collendpos, &newpos); 2946 if (repunicode == NULL) 2947 return -1; 2948 /* generate replacement */ 2949 repsize = PyUnicode_GET_SIZE(repunicode); 2950 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) { 2951 x = charmapencode_output(*uni2, mapping, res, respos); 2952 if (x==NULL) { 2953 Py_DECREF(repunicode); 2954 return -1; 2955 } 2956 else if (x==Py_None) { 2957 Py_DECREF(repunicode); 2958 Py_DECREF(x); 2959 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason); 2960 return -1; 2961 } 2962 Py_DECREF(x); 2963 } 2964 *inpos = newpos; 2965 Py_DECREF(repunicode); 2966 } 2967 return 0; 2968} 2969 2970PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p, 2971 int size, 2972 PyObject *mapping, 2973 const char *errors) 2974{ 2975 /* output object */ 2976 PyObject *res = NULL; 2977 /* current input position */ 2978 int inpos = 0; 2979 /* current output position */ 2980 int respos = 0; 2981 PyObject *errorHandler = NULL; 2982 PyObject *exc = NULL; 2983 /* the following variable is used for caching string comparisons 2984 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 2985 * 3=ignore, 4=xmlcharrefreplace */ 2986 int known_errorHandler = -1; 2987 2988 /* Default to Latin-1 */ 2989 if (mapping == NULL) 2990 return PyUnicode_EncodeLatin1(p, size, errors); 2991 2992 /* allocate enough for a simple encoding without 2993 replacements, if we need more, we'll resize */ 2994 res = PyString_FromStringAndSize(NULL, size); 2995 if (res == NULL) 2996 goto onError; 2997 if (size == 0) 2998 return res; 2999 3000 while (inpos<size) { 3001 /* try to encode it */ 3002 PyObject *x = charmapencode_output(p[inpos], mapping, &res, &respos); 3003 if (x==NULL) /* error */ 3004 goto onError; 3005 if (x==Py_None) { /* unencodable character */ 3006 if (charmap_encoding_error(p, size, &inpos, mapping, 3007 &exc, 3008 &known_errorHandler, errorHandler, errors, 3009 &res, &respos)) 3010 goto onError; 3011 } 3012 else 3013 /* done with this character => adjust input position */ 3014 ++inpos; 3015 Py_DECREF(x); 3016 } 3017 3018 /* Resize if we allocated to much */ 3019 if (respos<PyString_GET_SIZE(res)) { 3020 if (_PyString_Resize(&res, respos)) 3021 goto onError; 3022 } 3023 Py_XDECREF(exc); 3024 Py_XDECREF(errorHandler); 3025 return res; 3026 3027 onError: 3028 Py_XDECREF(res); 3029 Py_XDECREF(exc); 3030 Py_XDECREF(errorHandler); 3031 return NULL; 3032} 3033 3034PyObject *PyUnicode_AsCharmapString(PyObject *unicode, 3035 PyObject *mapping) 3036{ 3037 if (!PyUnicode_Check(unicode) || mapping == NULL) { 3038 PyErr_BadArgument(); 3039 return NULL; 3040 } 3041 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode), 3042 PyUnicode_GET_SIZE(unicode), 3043 mapping, 3044 NULL); 3045} 3046 3047/* create or adjust a UnicodeTranslateError */ 3048static void make_translate_exception(PyObject **exceptionObject, 3049 const Py_UNICODE *unicode, int size, 3050 int startpos, int endpos, 3051 const char *reason) 3052{ 3053 if (*exceptionObject == NULL) { 3054 *exceptionObject = PyUnicodeTranslateError_Create( 3055 unicode, size, startpos, endpos, reason); 3056 } 3057 else { 3058 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos)) 3059 goto onError; 3060 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos)) 3061 goto onError; 3062 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason)) 3063 goto onError; 3064 return; 3065 onError: 3066 Py_DECREF(*exceptionObject); 3067 *exceptionObject = NULL; 3068 } 3069} 3070 3071/* raises a UnicodeTranslateError */ 3072static void raise_translate_exception(PyObject **exceptionObject, 3073 const Py_UNICODE *unicode, int size, 3074 int startpos, int endpos, 3075 const char *reason) 3076{ 3077 make_translate_exception(exceptionObject, 3078 unicode, size, startpos, endpos, reason); 3079 if (*exceptionObject != NULL) 3080 PyCodec_StrictErrors(*exceptionObject); 3081} 3082 3083/* error handling callback helper: 3084 build arguments, call the callback and check the arguments, 3085 put the result into newpos and return the replacement string, which 3086 has to be freed by the caller */ 3087static PyObject *unicode_translate_call_errorhandler(const char *errors, 3088 PyObject **errorHandler, 3089 const char *reason, 3090 const Py_UNICODE *unicode, int size, PyObject **exceptionObject, 3091 int startpos, int endpos, 3092 int *newpos) 3093{ 3094 static char *argparse = "O!i;translating error handler must return (unicode, int) tuple"; 3095 3096 PyObject *restuple; 3097 PyObject *resunicode; 3098 3099 if (*errorHandler == NULL) { 3100 *errorHandler = PyCodec_LookupError(errors); 3101 if (*errorHandler == NULL) 3102 return NULL; 3103 } 3104 3105 make_translate_exception(exceptionObject, 3106 unicode, size, startpos, endpos, reason); 3107 if (*exceptionObject == NULL) 3108 return NULL; 3109 3110 restuple = PyObject_CallFunctionObjArgs( 3111 *errorHandler, *exceptionObject, NULL); 3112 if (restuple == NULL) 3113 return NULL; 3114 if (!PyTuple_Check(restuple)) { 3115 PyErr_Format(PyExc_TypeError, &argparse[4]); 3116 Py_DECREF(restuple); 3117 return NULL; 3118 } 3119 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, 3120 &resunicode, newpos)) { 3121 Py_DECREF(restuple); 3122 return NULL; 3123 } 3124 if (*newpos<0) 3125 *newpos = size+*newpos; 3126 if (*newpos<0 || *newpos>size) { 3127 PyErr_Format(PyExc_IndexError, "position %d from error handler out of bounds", *newpos); 3128 Py_DECREF(restuple); 3129 return NULL; 3130 } 3131 Py_INCREF(resunicode); 3132 Py_DECREF(restuple); 3133 return resunicode; 3134} 3135 3136/* Lookup the character ch in the mapping and put the result in result, 3137 which must be decrefed by the caller. 3138 Return 0 on success, -1 on error */ 3139static 3140int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result) 3141{ 3142 PyObject *w = PyInt_FromLong((long)c); 3143 PyObject *x; 3144 3145 if (w == NULL) 3146 return -1; 3147 x = PyObject_GetItem(mapping, w); 3148 Py_DECREF(w); 3149 if (x == NULL) { 3150 if (PyErr_ExceptionMatches(PyExc_LookupError)) { 3151 /* No mapping found means: use 1:1 mapping. */ 3152 PyErr_Clear(); 3153 *result = NULL; 3154 return 0; 3155 } else 3156 return -1; 3157 } 3158 else if (x == Py_None) { 3159 *result = x; 3160 return 0; 3161 } 3162 else if (PyInt_Check(x)) { 3163 long value = PyInt_AS_LONG(x); 3164 long max = PyUnicode_GetMax(); 3165 if (value < 0 || value > max) { 3166 PyErr_Format(PyExc_TypeError, 3167 "character mapping must be in range(0x%lx)", max+1); 3168 Py_DECREF(x); 3169 return -1; 3170 } 3171 *result = x; 3172 return 0; 3173 } 3174 else if (PyUnicode_Check(x)) { 3175 *result = x; 3176 return 0; 3177 } 3178 else { 3179 /* wrong return value */ 3180 PyErr_SetString(PyExc_TypeError, 3181 "character mapping must return integer, None or unicode"); 3182 return -1; 3183 } 3184} 3185/* ensure that *outobj is at least requiredsize characters long, 3186if not reallocate and adjust various state variables. 3187Return 0 on success, -1 on error */ 3188static 3189int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp, int *outsize, 3190 int requiredsize) 3191{ 3192 if (requiredsize > *outsize) { 3193 /* remember old output position */ 3194 int outpos = *outp-PyUnicode_AS_UNICODE(*outobj); 3195 /* exponentially overallocate to minimize reallocations */ 3196 if (requiredsize < 2 * *outsize) 3197 requiredsize = 2 * *outsize; 3198 if (_PyUnicode_Resize(outobj, requiredsize)) 3199 return -1; 3200 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos; 3201 *outsize = requiredsize; 3202 } 3203 return 0; 3204} 3205/* lookup the character, put the result in the output string and adjust 3206 various state variables. Return a new reference to the object that 3207 was put in the output buffer in *result, or Py_None, if the mapping was 3208 undefined (in which case no character was written). 3209 The called must decref result. 3210 Return 0 on success, -1 on error. */ 3211static 3212int charmaptranslate_output(Py_UNICODE c, PyObject *mapping, 3213 PyObject **outobj, int *outsize, Py_UNICODE **outp, PyObject **res) 3214{ 3215 if (charmaptranslate_lookup(c, mapping, res)) 3216 return -1; 3217 if (*res==NULL) { 3218 /* not found => default to 1:1 mapping */ 3219 *(*outp)++ = (Py_UNICODE)c; 3220 } 3221 else if (*res==Py_None) 3222 ; 3223 else if (PyInt_Check(*res)) { 3224 /* no overflow check, because we know that the space is enough */ 3225 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res); 3226 } 3227 else if (PyUnicode_Check(*res)) { 3228 int repsize = PyUnicode_GET_SIZE(*res); 3229 if (repsize==1) { 3230 /* no overflow check, because we know that the space is enough */ 3231 *(*outp)++ = *PyUnicode_AS_UNICODE(*res); 3232 } 3233 else if (repsize!=0) { 3234 /* more than one character */ 3235 int requiredsize = *outsize + repsize - 1; 3236 if (charmaptranslate_makespace(outobj, outp, outsize, requiredsize)) 3237 return -1; 3238 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize); 3239 *outp += repsize; 3240 } 3241 } 3242 else 3243 return -1; 3244 return 0; 3245} 3246 3247PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p, 3248 int size, 3249 PyObject *mapping, 3250 const char *errors) 3251{ 3252 /* output object */ 3253 PyObject *res = NULL; 3254 /* pointers to the beginning and end+1 of input */ 3255 const Py_UNICODE *startp = p; 3256 const Py_UNICODE *endp = p + size; 3257 /* pointer into the output */ 3258 Py_UNICODE *str; 3259 /* current output position */ 3260 int respos = 0; 3261 int ressize; 3262 char *reason = "character maps to <undefined>"; 3263 PyObject *errorHandler = NULL; 3264 PyObject *exc = NULL; 3265 /* the following variable is used for caching string comparisons 3266 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3267 * 3=ignore, 4=xmlcharrefreplace */ 3268 int known_errorHandler = -1; 3269 3270 if (mapping == NULL) { 3271 PyErr_BadArgument(); 3272 return NULL; 3273 } 3274 3275 /* allocate enough for a simple 1:1 translation without 3276 replacements, if we need more, we'll resize */ 3277 res = PyUnicode_FromUnicode(NULL, size); 3278 if (res == NULL) 3279 goto onError; 3280 if (size == 0) 3281 return res; 3282 str = PyUnicode_AS_UNICODE(res); 3283 ressize = size; 3284 3285 while (p<endp) { 3286 /* try to encode it */ 3287 PyObject *x = NULL; 3288 if (charmaptranslate_output(*p, mapping, &res, &ressize, &str, &x)) { 3289 Py_XDECREF(x); 3290 goto onError; 3291 } 3292 Py_XDECREF(x); 3293 if (x!=Py_None) /* it worked => adjust input pointer */ 3294 ++p; 3295 else { /* untranslatable character */ 3296 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */ 3297 int repsize; 3298 int newpos; 3299 Py_UNICODE *uni2; 3300 /* startpos for collecting untranslatable chars */ 3301 const Py_UNICODE *collstart = p; 3302 const Py_UNICODE *collend = p+1; 3303 const Py_UNICODE *coll; 3304 3305 /* find all untranslatable characters */ 3306 while (collend < endp) { 3307 if (charmaptranslate_lookup(*collend, mapping, &x)) 3308 goto onError; 3309 Py_XDECREF(x); 3310 if (x!=Py_None) 3311 break; 3312 ++collend; 3313 } 3314 /* cache callback name lookup 3315 * (if not done yet, i.e. it's the first error) */ 3316 if (known_errorHandler==-1) { 3317 if ((errors==NULL) || (!strcmp(errors, "strict"))) 3318 known_errorHandler = 1; 3319 else if (!strcmp(errors, "replace")) 3320 known_errorHandler = 2; 3321 else if (!strcmp(errors, "ignore")) 3322 known_errorHandler = 3; 3323 else if (!strcmp(errors, "xmlcharrefreplace")) 3324 known_errorHandler = 4; 3325 else 3326 known_errorHandler = 0; 3327 } 3328 switch (known_errorHandler) { 3329 case 1: /* strict */ 3330 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason); 3331 goto onError; 3332 case 2: /* replace */ 3333 /* No need to check for space, this is a 1:1 replacement */ 3334 for (coll = collstart; coll<collend; ++coll) 3335 *str++ = '?'; 3336 /* fall through */ 3337 case 3: /* ignore */ 3338 p = collend; 3339 break; 3340 case 4: /* xmlcharrefreplace */ 3341 /* generate replacement (temporarily (mis)uses p) */ 3342 for (p = collstart; p < collend; ++p) { 3343 char buffer[2+29+1+1]; 3344 char *cp; 3345 sprintf(buffer, "&#%d;", (int)*p); 3346 if (charmaptranslate_makespace(&res, &str, &ressize, 3347 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend))) 3348 goto onError; 3349 for (cp = buffer; *cp; ++cp) 3350 *str++ = *cp; 3351 } 3352 p = collend; 3353 break; 3354 default: 3355 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler, 3356 reason, startp, size, &exc, 3357 collstart-startp, collend-startp, &newpos); 3358 if (repunicode == NULL) 3359 goto onError; 3360 /* generate replacement */ 3361 repsize = PyUnicode_GET_SIZE(repunicode); 3362 if (charmaptranslate_makespace(&res, &str, &ressize, 3363 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) { 3364 Py_DECREF(repunicode); 3365 goto onError; 3366 } 3367 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) 3368 *str++ = *uni2; 3369 p = startp + newpos; 3370 Py_DECREF(repunicode); 3371 } 3372 } 3373 } 3374 /* Resize if we allocated to much */ 3375 respos = str-PyUnicode_AS_UNICODE(res); 3376 if (respos<ressize) { 3377 if (_PyUnicode_Resize(&res, respos)) 3378 goto onError; 3379 } 3380 Py_XDECREF(exc); 3381 Py_XDECREF(errorHandler); 3382 return res; 3383 3384 onError: 3385 Py_XDECREF(res); 3386 Py_XDECREF(exc); 3387 Py_XDECREF(errorHandler); 3388 return NULL; 3389} 3390 3391PyObject *PyUnicode_Translate(PyObject *str, 3392 PyObject *mapping, 3393 const char *errors) 3394{ 3395 PyObject *result; 3396 3397 str = PyUnicode_FromObject(str); 3398 if (str == NULL) 3399 goto onError; 3400 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str), 3401 PyUnicode_GET_SIZE(str), 3402 mapping, 3403 errors); 3404 Py_DECREF(str); 3405 return result; 3406 3407 onError: 3408 Py_XDECREF(str); 3409 return NULL; 3410} 3411 3412/* --- Decimal Encoder ---------------------------------------------------- */ 3413 3414int PyUnicode_EncodeDecimal(Py_UNICODE *s, 3415 int length, 3416 char *output, 3417 const char *errors) 3418{ 3419 Py_UNICODE *p, *end; 3420 PyObject *errorHandler = NULL; 3421 PyObject *exc = NULL; 3422 const char *encoding = "decimal"; 3423 const char *reason = "invalid decimal Unicode string"; 3424 /* the following variable is used for caching string comparisons 3425 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */ 3426 int known_errorHandler = -1; 3427 3428 if (output == NULL) { 3429 PyErr_BadArgument(); 3430 return -1; 3431 } 3432 3433 p = s; 3434 end = s + length; 3435 while (p < end) { 3436 register Py_UNICODE ch = *p; 3437 int decimal; 3438 PyObject *repunicode; 3439 int repsize; 3440 int newpos; 3441 Py_UNICODE *uni2; 3442 Py_UNICODE *collstart; 3443 Py_UNICODE *collend; 3444 3445 if (Py_UNICODE_ISSPACE(ch)) { 3446 *output++ = ' '; 3447 ++p; 3448 continue; 3449 } 3450 decimal = Py_UNICODE_TODECIMAL(ch); 3451 if (decimal >= 0) { 3452 *output++ = '0' + decimal; 3453 ++p; 3454 continue; 3455 } 3456 if (0 < ch && ch < 256) { 3457 *output++ = (char)ch; 3458 ++p; 3459 continue; 3460 } 3461 /* All other characters are considered unencodable */ 3462 collstart = p; 3463 collend = p+1; 3464 while (collend < end) { 3465 if ((0 < *collend && *collend < 256) || 3466 !Py_UNICODE_ISSPACE(*collend) || 3467 Py_UNICODE_TODECIMAL(*collend)) 3468 break; 3469 } 3470 /* cache callback name lookup 3471 * (if not done yet, i.e. it's the first error) */ 3472 if (known_errorHandler==-1) { 3473 if ((errors==NULL) || (!strcmp(errors, "strict"))) 3474 known_errorHandler = 1; 3475 else if (!strcmp(errors, "replace")) 3476 known_errorHandler = 2; 3477 else if (!strcmp(errors, "ignore")) 3478 known_errorHandler = 3; 3479 else if (!strcmp(errors, "xmlcharrefreplace")) 3480 known_errorHandler = 4; 3481 else 3482 known_errorHandler = 0; 3483 } 3484 switch (known_errorHandler) { 3485 case 1: /* strict */ 3486 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason); 3487 goto onError; 3488 case 2: /* replace */ 3489 for (p = collstart; p < collend; ++p) 3490 *output++ = '?'; 3491 /* fall through */ 3492 case 3: /* ignore */ 3493 p = collend; 3494 break; 3495 case 4: /* xmlcharrefreplace */ 3496 /* generate replacement (temporarily (mis)uses p) */ 3497 for (p = collstart; p < collend; ++p) 3498 output += sprintf(output, "&#%d;", (int)*p); 3499 p = collend; 3500 break; 3501 default: 3502 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler, 3503 encoding, reason, s, length, &exc, 3504 collstart-s, collend-s, &newpos); 3505 if (repunicode == NULL) 3506 goto onError; 3507 /* generate replacement */ 3508 repsize = PyUnicode_GET_SIZE(repunicode); 3509 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) { 3510 Py_UNICODE ch = *uni2; 3511 if (Py_UNICODE_ISSPACE(ch)) 3512 *output++ = ' '; 3513 else { 3514 decimal = Py_UNICODE_TODECIMAL(ch); 3515 if (decimal >= 0) 3516 *output++ = '0' + decimal; 3517 else if (0 < ch && ch < 256) 3518 *output++ = (char)ch; 3519 else { 3520 Py_DECREF(repunicode); 3521 raise_encode_exception(&exc, encoding, 3522 s, length, collstart-s, collend-s, reason); 3523 goto onError; 3524 } 3525 } 3526 } 3527 p = s + newpos; 3528 Py_DECREF(repunicode); 3529 } 3530 } 3531 /* 0-terminate the output string */ 3532 *output++ = '\0'; 3533 Py_XDECREF(exc); 3534 Py_XDECREF(errorHandler); 3535 return 0; 3536 3537 onError: 3538 Py_XDECREF(exc); 3539 Py_XDECREF(errorHandler); 3540 return -1; 3541} 3542 3543/* --- Helpers ------------------------------------------------------------ */ 3544 3545static 3546int count(PyUnicodeObject *self, 3547 int start, 3548 int end, 3549 PyUnicodeObject *substring) 3550{ 3551 int count = 0; 3552 3553 if (start < 0) 3554 start += self->length; 3555 if (start < 0) 3556 start = 0; 3557 if (end > self->length) 3558 end = self->length; 3559 if (end < 0) 3560 end += self->length; 3561 if (end < 0) 3562 end = 0; 3563 3564 if (substring->length == 0) 3565 return (end - start + 1); 3566 3567 end -= substring->length; 3568 3569 while (start <= end) 3570 if (Py_UNICODE_MATCH(self, start, substring)) { 3571 count++; 3572 start += substring->length; 3573 } else 3574 start++; 3575 3576 return count; 3577} 3578 3579int PyUnicode_Count(PyObject *str, 3580 PyObject *substr, 3581 int start, 3582 int end) 3583{ 3584 int result; 3585 3586 str = PyUnicode_FromObject(str); 3587 if (str == NULL) 3588 return -1; 3589 substr = PyUnicode_FromObject(substr); 3590 if (substr == NULL) { 3591 Py_DECREF(str); 3592 return -1; 3593 } 3594 3595 result = count((PyUnicodeObject *)str, 3596 start, end, 3597 (PyUnicodeObject *)substr); 3598 3599 Py_DECREF(str); 3600 Py_DECREF(substr); 3601 return result; 3602} 3603 3604static 3605int findstring(PyUnicodeObject *self, 3606 PyUnicodeObject *substring, 3607 int start, 3608 int end, 3609 int direction) 3610{ 3611 if (start < 0) 3612 start += self->length; 3613 if (start < 0) 3614 start = 0; 3615 3616 if (end > self->length) 3617 end = self->length; 3618 if (end < 0) 3619 end += self->length; 3620 if (end < 0) 3621 end = 0; 3622 3623 if (substring->length == 0) 3624 return (direction > 0) ? start : end; 3625 3626 end -= substring->length; 3627 3628 if (direction < 0) { 3629 for (; end >= start; end--) 3630 if (Py_UNICODE_MATCH(self, end, substring)) 3631 return end; 3632 } else { 3633 for (; start <= end; start++) 3634 if (Py_UNICODE_MATCH(self, start, substring)) 3635 return start; 3636 } 3637 3638 return -1; 3639} 3640 3641int PyUnicode_Find(PyObject *str, 3642 PyObject *substr, 3643 int start, 3644 int end, 3645 int direction) 3646{ 3647 int result; 3648 3649 str = PyUnicode_FromObject(str); 3650 if (str == NULL) 3651 return -2; 3652 substr = PyUnicode_FromObject(substr); 3653 if (substr == NULL) { 3654 Py_DECREF(str); 3655 return -2; 3656 } 3657 3658 result = findstring((PyUnicodeObject *)str, 3659 (PyUnicodeObject *)substr, 3660 start, end, direction); 3661 Py_DECREF(str); 3662 Py_DECREF(substr); 3663 return result; 3664} 3665 3666static 3667int tailmatch(PyUnicodeObject *self, 3668 PyUnicodeObject *substring, 3669 int start, 3670 int end, 3671 int direction) 3672{ 3673 if (start < 0) 3674 start += self->length; 3675 if (start < 0) 3676 start = 0; 3677 3678 if (substring->length == 0) 3679 return 1; 3680 3681 if (end > self->length) 3682 end = self->length; 3683 if (end < 0) 3684 end += self->length; 3685 if (end < 0) 3686 end = 0; 3687 3688 end -= substring->length; 3689 if (end < start) 3690 return 0; 3691 3692 if (direction > 0) { 3693 if (Py_UNICODE_MATCH(self, end, substring)) 3694 return 1; 3695 } else { 3696 if (Py_UNICODE_MATCH(self, start, substring)) 3697 return 1; 3698 } 3699 3700 return 0; 3701} 3702 3703int PyUnicode_Tailmatch(PyObject *str, 3704 PyObject *substr, 3705 int start, 3706 int end, 3707 int direction) 3708{ 3709 int result; 3710 3711 str = PyUnicode_FromObject(str); 3712 if (str == NULL) 3713 return -1; 3714 substr = PyUnicode_FromObject(substr); 3715 if (substr == NULL) { 3716 Py_DECREF(substr); 3717 return -1; 3718 } 3719 3720 result = tailmatch((PyUnicodeObject *)str, 3721 (PyUnicodeObject *)substr, 3722 start, end, direction); 3723 Py_DECREF(str); 3724 Py_DECREF(substr); 3725 return result; 3726} 3727 3728static 3729const Py_UNICODE *findchar(const Py_UNICODE *s, 3730 int size, 3731 Py_UNICODE ch) 3732{ 3733 /* like wcschr, but doesn't stop at NULL characters */ 3734 3735 while (size-- > 0) { 3736 if (*s == ch) 3737 return s; 3738 s++; 3739 } 3740 3741 return NULL; 3742} 3743 3744/* Apply fixfct filter to the Unicode object self and return a 3745 reference to the modified object */ 3746 3747static 3748PyObject *fixup(PyUnicodeObject *self, 3749 int (*fixfct)(PyUnicodeObject *s)) 3750{ 3751 3752 PyUnicodeObject *u; 3753 3754 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length); 3755 if (u == NULL) 3756 return NULL; 3757 3758 Py_UNICODE_COPY(u->str, self->str, self->length); 3759 3760 if (!fixfct(u) && PyUnicode_CheckExact(self)) { 3761 /* fixfct should return TRUE if it modified the buffer. If 3762 FALSE, return a reference to the original buffer instead 3763 (to save space, not time) */ 3764 Py_INCREF(self); 3765 Py_DECREF(u); 3766 return (PyObject*) self; 3767 } 3768 return (PyObject*) u; 3769} 3770 3771static 3772int fixupper(PyUnicodeObject *self) 3773{ 3774 int len = self->length; 3775 Py_UNICODE *s = self->str; 3776 int status = 0; 3777 3778 while (len-- > 0) { 3779 register Py_UNICODE ch; 3780 3781 ch = Py_UNICODE_TOUPPER(*s); 3782 if (ch != *s) { 3783 status = 1; 3784 *s = ch; 3785 } 3786 s++; 3787 } 3788 3789 return status; 3790} 3791 3792static 3793int fixlower(PyUnicodeObject *self) 3794{ 3795 int len = self->length; 3796 Py_UNICODE *s = self->str; 3797 int status = 0; 3798 3799 while (len-- > 0) { 3800 register Py_UNICODE ch; 3801 3802 ch = Py_UNICODE_TOLOWER(*s); 3803 if (ch != *s) { 3804 status = 1; 3805 *s = ch; 3806 } 3807 s++; 3808 } 3809 3810 return status; 3811} 3812 3813static 3814int fixswapcase(PyUnicodeObject *self) 3815{ 3816 int len = self->length; 3817 Py_UNICODE *s = self->str; 3818 int status = 0; 3819 3820 while (len-- > 0) { 3821 if (Py_UNICODE_ISUPPER(*s)) { 3822 *s = Py_UNICODE_TOLOWER(*s); 3823 status = 1; 3824 } else if (Py_UNICODE_ISLOWER(*s)) { 3825 *s = Py_UNICODE_TOUPPER(*s); 3826 status = 1; 3827 } 3828 s++; 3829 } 3830 3831 return status; 3832} 3833 3834static 3835int fixcapitalize(PyUnicodeObject *self) 3836{ 3837 int len = self->length; 3838 Py_UNICODE *s = self->str; 3839 int status = 0; 3840 3841 if (len == 0) 3842 return 0; 3843 if (Py_UNICODE_ISLOWER(*s)) { 3844 *s = Py_UNICODE_TOUPPER(*s); 3845 status = 1; 3846 } 3847 s++; 3848 while (--len > 0) { 3849 if (Py_UNICODE_ISUPPER(*s)) { 3850 *s = Py_UNICODE_TOLOWER(*s); 3851 status = 1; 3852 } 3853 s++; 3854 } 3855 return status; 3856} 3857 3858static 3859int fixtitle(PyUnicodeObject *self) 3860{ 3861 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 3862 register Py_UNICODE *e; 3863 int previous_is_cased; 3864 3865 /* Shortcut for single character strings */ 3866 if (PyUnicode_GET_SIZE(self) == 1) { 3867 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p); 3868 if (*p != ch) { 3869 *p = ch; 3870 return 1; 3871 } 3872 else 3873 return 0; 3874 } 3875 3876 e = p + PyUnicode_GET_SIZE(self); 3877 previous_is_cased = 0; 3878 for (; p < e; p++) { 3879 register const Py_UNICODE ch = *p; 3880 3881 if (previous_is_cased) 3882 *p = Py_UNICODE_TOLOWER(ch); 3883 else 3884 *p = Py_UNICODE_TOTITLE(ch); 3885 3886 if (Py_UNICODE_ISLOWER(ch) || 3887 Py_UNICODE_ISUPPER(ch) || 3888 Py_UNICODE_ISTITLE(ch)) 3889 previous_is_cased = 1; 3890 else 3891 previous_is_cased = 0; 3892 } 3893 return 1; 3894} 3895 3896PyObject *PyUnicode_Join(PyObject *separator, 3897 PyObject *seq) 3898{ 3899 Py_UNICODE *sep; 3900 int seplen; 3901 PyUnicodeObject *res = NULL; 3902 int reslen = 0; 3903 Py_UNICODE *p; 3904 int sz = 100; 3905 int i; 3906 PyObject *it; 3907 3908 it = PyObject_GetIter(seq); 3909 if (it == NULL) 3910 return NULL; 3911 3912 if (separator == NULL) { 3913 Py_UNICODE blank = ' '; 3914 sep = ␣ 3915 seplen = 1; 3916 } 3917 else { 3918 separator = PyUnicode_FromObject(separator); 3919 if (separator == NULL) 3920 goto onError; 3921 sep = PyUnicode_AS_UNICODE(separator); 3922 seplen = PyUnicode_GET_SIZE(separator); 3923 } 3924 3925 res = _PyUnicode_New(sz); 3926 if (res == NULL) 3927 goto onError; 3928 p = PyUnicode_AS_UNICODE(res); 3929 reslen = 0; 3930 3931 for (i = 0; ; ++i) { 3932 int itemlen; 3933 PyObject *item = PyIter_Next(it); 3934 if (item == NULL) { 3935 if (PyErr_Occurred()) 3936 goto onError; 3937 break; 3938 } 3939 if (!PyUnicode_Check(item)) { 3940 PyObject *v; 3941 if (!PyString_Check(item)) { 3942 PyErr_Format(PyExc_TypeError, 3943 "sequence item %i: expected string or Unicode," 3944 " %.80s found", 3945 i, item->ob_type->tp_name); 3946 Py_DECREF(item); 3947 goto onError; 3948 } 3949 v = PyUnicode_FromObject(item); 3950 Py_DECREF(item); 3951 item = v; 3952 if (item == NULL) 3953 goto onError; 3954 } 3955 itemlen = PyUnicode_GET_SIZE(item); 3956 while (reslen + itemlen + seplen >= sz) { 3957 if (_PyUnicode_Resize(&res, sz*2)) { 3958 Py_DECREF(item); 3959 goto onError; 3960 } 3961 sz *= 2; 3962 p = PyUnicode_AS_UNICODE(res) + reslen; 3963 } 3964 if (i > 0) { 3965 Py_UNICODE_COPY(p, sep, seplen); 3966 p += seplen; 3967 reslen += seplen; 3968 } 3969 Py_UNICODE_COPY(p, PyUnicode_AS_UNICODE(item), itemlen); 3970 p += itemlen; 3971 reslen += itemlen; 3972 Py_DECREF(item); 3973 } 3974 if (_PyUnicode_Resize(&res, reslen)) 3975 goto onError; 3976 3977 Py_XDECREF(separator); 3978 Py_DECREF(it); 3979 return (PyObject *)res; 3980 3981 onError: 3982 Py_XDECREF(separator); 3983 Py_XDECREF(res); 3984 Py_DECREF(it); 3985 return NULL; 3986} 3987 3988static 3989PyUnicodeObject *pad(PyUnicodeObject *self, 3990 int left, 3991 int right, 3992 Py_UNICODE fill) 3993{ 3994 PyUnicodeObject *u; 3995 3996 if (left < 0) 3997 left = 0; 3998 if (right < 0) 3999 right = 0; 4000 4001 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) { 4002 Py_INCREF(self); 4003 return self; 4004 } 4005 4006 u = _PyUnicode_New(left + self->length + right); 4007 if (u) { 4008 if (left) 4009 Py_UNICODE_FILL(u->str, fill, left); 4010 Py_UNICODE_COPY(u->str + left, self->str, self->length); 4011 if (right) 4012 Py_UNICODE_FILL(u->str + left + self->length, fill, right); 4013 } 4014 4015 return u; 4016} 4017 4018#define SPLIT_APPEND(data, left, right) \ 4019 str = PyUnicode_FromUnicode(data + left, right - left); \ 4020 if (!str) \ 4021 goto onError; \ 4022 if (PyList_Append(list, str)) { \ 4023 Py_DECREF(str); \ 4024 goto onError; \ 4025 } \ 4026 else \ 4027 Py_DECREF(str); 4028 4029static 4030PyObject *split_whitespace(PyUnicodeObject *self, 4031 PyObject *list, 4032 int maxcount) 4033{ 4034 register int i; 4035 register int j; 4036 int len = self->length; 4037 PyObject *str; 4038 4039 for (i = j = 0; i < len; ) { 4040 /* find a token */ 4041 while (i < len && Py_UNICODE_ISSPACE(self->str[i])) 4042 i++; 4043 j = i; 4044 while (i < len && !Py_UNICODE_ISSPACE(self->str[i])) 4045 i++; 4046 if (j < i) { 4047 if (maxcount-- <= 0) 4048 break; 4049 SPLIT_APPEND(self->str, j, i); 4050 while (i < len && Py_UNICODE_ISSPACE(self->str[i])) 4051 i++; 4052 j = i; 4053 } 4054 } 4055 if (j < len) { 4056 SPLIT_APPEND(self->str, j, len); 4057 } 4058 return list; 4059 4060 onError: 4061 Py_DECREF(list); 4062 return NULL; 4063} 4064 4065PyObject *PyUnicode_Splitlines(PyObject *string, 4066 int keepends) 4067{ 4068 register int i; 4069 register int j; 4070 int len; 4071 PyObject *list; 4072 PyObject *str; 4073 Py_UNICODE *data; 4074 4075 string = PyUnicode_FromObject(string); 4076 if (string == NULL) 4077 return NULL; 4078 data = PyUnicode_AS_UNICODE(string); 4079 len = PyUnicode_GET_SIZE(string); 4080 4081 list = PyList_New(0); 4082 if (!list) 4083 goto onError; 4084 4085 for (i = j = 0; i < len; ) { 4086 int eol; 4087 4088 /* Find a line and append it */ 4089 while (i < len && !Py_UNICODE_ISLINEBREAK(data[i])) 4090 i++; 4091 4092 /* Skip the line break reading CRLF as one line break */ 4093 eol = i; 4094 if (i < len) { 4095 if (data[i] == '\r' && i + 1 < len && 4096 data[i+1] == '\n') 4097 i += 2; 4098 else 4099 i++; 4100 if (keepends) 4101 eol = i; 4102 } 4103 SPLIT_APPEND(data, j, eol); 4104 j = i; 4105 } 4106 if (j < len) { 4107 SPLIT_APPEND(data, j, len); 4108 } 4109 4110 Py_DECREF(string); 4111 return list; 4112 4113 onError: 4114 Py_DECREF(list); 4115 Py_DECREF(string); 4116 return NULL; 4117} 4118 4119static 4120PyObject *split_char(PyUnicodeObject *self, 4121 PyObject *list, 4122 Py_UNICODE ch, 4123 int maxcount) 4124{ 4125 register int i; 4126 register int j; 4127 int len = self->length; 4128 PyObject *str; 4129 4130 for (i = j = 0; i < len; ) { 4131 if (self->str[i] == ch) { 4132 if (maxcount-- <= 0) 4133 break; 4134 SPLIT_APPEND(self->str, j, i); 4135 i = j = i + 1; 4136 } else 4137 i++; 4138 } 4139 if (j <= len) { 4140 SPLIT_APPEND(self->str, j, len); 4141 } 4142 return list; 4143 4144 onError: 4145 Py_DECREF(list); 4146 return NULL; 4147} 4148 4149static 4150PyObject *split_substring(PyUnicodeObject *self, 4151 PyObject *list, 4152 PyUnicodeObject *substring, 4153 int maxcount) 4154{ 4155 register int i; 4156 register int j; 4157 int len = self->length; 4158 int sublen = substring->length; 4159 PyObject *str; 4160 4161 for (i = j = 0; i <= len - sublen; ) { 4162 if (Py_UNICODE_MATCH(self, i, substring)) { 4163 if (maxcount-- <= 0) 4164 break; 4165 SPLIT_APPEND(self->str, j, i); 4166 i = j = i + sublen; 4167 } else 4168 i++; 4169 } 4170 if (j <= len) { 4171 SPLIT_APPEND(self->str, j, len); 4172 } 4173 return list; 4174 4175 onError: 4176 Py_DECREF(list); 4177 return NULL; 4178} 4179 4180#undef SPLIT_APPEND 4181 4182static 4183PyObject *split(PyUnicodeObject *self, 4184 PyUnicodeObject *substring, 4185 int maxcount) 4186{ 4187 PyObject *list; 4188 4189 if (maxcount < 0) 4190 maxcount = INT_MAX; 4191 4192 list = PyList_New(0); 4193 if (!list) 4194 return NULL; 4195 4196 if (substring == NULL) 4197 return split_whitespace(self,list,maxcount); 4198 4199 else if (substring->length == 1) 4200 return split_char(self,list,substring->str[0],maxcount); 4201 4202 else if (substring->length == 0) { 4203 Py_DECREF(list); 4204 PyErr_SetString(PyExc_ValueError, "empty separator"); 4205 return NULL; 4206 } 4207 else 4208 return split_substring(self,list,substring,maxcount); 4209} 4210 4211static 4212PyObject *replace(PyUnicodeObject *self, 4213 PyUnicodeObject *str1, 4214 PyUnicodeObject *str2, 4215 int maxcount) 4216{ 4217 PyUnicodeObject *u; 4218 4219 if (maxcount < 0) 4220 maxcount = INT_MAX; 4221 4222 if (str1->length == 1 && str2->length == 1) { 4223 int i; 4224 4225 /* replace characters */ 4226 if (!findchar(self->str, self->length, str1->str[0]) && 4227 PyUnicode_CheckExact(self)) { 4228 /* nothing to replace, return original string */ 4229 Py_INCREF(self); 4230 u = self; 4231 } else { 4232 Py_UNICODE u1 = str1->str[0]; 4233 Py_UNICODE u2 = str2->str[0]; 4234 4235 u = (PyUnicodeObject*) PyUnicode_FromUnicode( 4236 NULL, 4237 self->length 4238 ); 4239 if (u != NULL) { 4240 Py_UNICODE_COPY(u->str, self->str, 4241 self->length); 4242 for (i = 0; i < u->length; i++) 4243 if (u->str[i] == u1) { 4244 if (--maxcount < 0) 4245 break; 4246 u->str[i] = u2; 4247 } 4248 } 4249 } 4250 4251 } else { 4252 int n, i; 4253 Py_UNICODE *p; 4254 4255 /* replace strings */ 4256 n = count(self, 0, self->length, str1); 4257 if (n > maxcount) 4258 n = maxcount; 4259 if (n == 0) { 4260 /* nothing to replace, return original string */ 4261 if (PyUnicode_CheckExact(self)) { 4262 Py_INCREF(self); 4263 u = self; 4264 } 4265 else { 4266 u = (PyUnicodeObject *) 4267 PyUnicode_FromUnicode(self->str, self->length); 4268 } 4269 } else { 4270 u = _PyUnicode_New( 4271 self->length + n * (str2->length - str1->length)); 4272 if (u) { 4273 i = 0; 4274 p = u->str; 4275 if (str1->length > 0) { 4276 while (i <= self->length - str1->length) 4277 if (Py_UNICODE_MATCH(self, i, str1)) { 4278 /* replace string segment */ 4279 Py_UNICODE_COPY(p, str2->str, str2->length); 4280 p += str2->length; 4281 i += str1->length; 4282 if (--n <= 0) { 4283 /* copy remaining part */ 4284 Py_UNICODE_COPY(p, self->str+i, self->length-i); 4285 break; 4286 } 4287 } else 4288 *p++ = self->str[i++]; 4289 } else { 4290 while (n > 0) { 4291 Py_UNICODE_COPY(p, str2->str, str2->length); 4292 p += str2->length; 4293 if (--n <= 0) 4294 break; 4295 *p++ = self->str[i++]; 4296 } 4297 Py_UNICODE_COPY(p, self->str+i, self->length-i); 4298 } 4299 } 4300 } 4301 } 4302 4303 return (PyObject *) u; 4304} 4305 4306/* --- Unicode Object Methods --------------------------------------------- */ 4307 4308PyDoc_STRVAR(title__doc__, 4309"S.title() -> unicode\n\ 4310\n\ 4311Return a titlecased version of S, i.e. words start with title case\n\ 4312characters, all remaining cased characters have lower case."); 4313 4314static PyObject* 4315unicode_title(PyUnicodeObject *self) 4316{ 4317 return fixup(self, fixtitle); 4318} 4319 4320PyDoc_STRVAR(capitalize__doc__, 4321"S.capitalize() -> unicode\n\ 4322\n\ 4323Return a capitalized version of S, i.e. make the first character\n\ 4324have upper case."); 4325 4326static PyObject* 4327unicode_capitalize(PyUnicodeObject *self) 4328{ 4329 return fixup(self, fixcapitalize); 4330} 4331 4332#if 0 4333PyDoc_STRVAR(capwords__doc__, 4334"S.capwords() -> unicode\n\ 4335\n\ 4336Apply .capitalize() to all words in S and return the result with\n\ 4337normalized whitespace (all whitespace strings are replaced by ' ')."); 4338 4339static PyObject* 4340unicode_capwords(PyUnicodeObject *self) 4341{ 4342 PyObject *list; 4343 PyObject *item; 4344 int i; 4345 4346 /* Split into words */ 4347 list = split(self, NULL, -1); 4348 if (!list) 4349 return NULL; 4350 4351 /* Capitalize each word */ 4352 for (i = 0; i < PyList_GET_SIZE(list); i++) { 4353 item = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, i), 4354 fixcapitalize); 4355 if (item == NULL) 4356 goto onError; 4357 Py_DECREF(PyList_GET_ITEM(list, i)); 4358 PyList_SET_ITEM(list, i, item); 4359 } 4360 4361 /* Join the words to form a new string */ 4362 item = PyUnicode_Join(NULL, list); 4363 4364onError: 4365 Py_DECREF(list); 4366 return (PyObject *)item; 4367} 4368#endif 4369 4370PyDoc_STRVAR(center__doc__, 4371"S.center(width) -> unicode\n\ 4372\n\ 4373Return S centered in a Unicode string of length width. Padding is done\n\ 4374using spaces."); 4375 4376static PyObject * 4377unicode_center(PyUnicodeObject *self, PyObject *args) 4378{ 4379 int marg, left; 4380 int width; 4381 4382 if (!PyArg_ParseTuple(args, "i:center", &width)) 4383 return NULL; 4384 4385 if (self->length >= width && PyUnicode_CheckExact(self)) { 4386 Py_INCREF(self); 4387 return (PyObject*) self; 4388 } 4389 4390 marg = width - self->length; 4391 left = marg / 2 + (marg & width & 1); 4392 4393 return (PyObject*) pad(self, left, marg - left, ' '); 4394} 4395 4396#if 0 4397 4398/* This code should go into some future Unicode collation support 4399 module. The basic comparison should compare ordinals on a naive 4400 basis (this is what Java does and thus JPython too). */ 4401 4402/* speedy UTF-16 code point order comparison */ 4403/* gleaned from: */ 4404/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */ 4405 4406static short utf16Fixup[32] = 4407{ 4408 0, 0, 0, 0, 0, 0, 0, 0, 4409 0, 0, 0, 0, 0, 0, 0, 0, 4410 0, 0, 0, 0, 0, 0, 0, 0, 4411 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800 4412}; 4413 4414static int 4415unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2) 4416{ 4417 int len1, len2; 4418 4419 Py_UNICODE *s1 = str1->str; 4420 Py_UNICODE *s2 = str2->str; 4421 4422 len1 = str1->length; 4423 len2 = str2->length; 4424 4425 while (len1 > 0 && len2 > 0) { 4426 Py_UNICODE c1, c2; 4427 4428 c1 = *s1++; 4429 c2 = *s2++; 4430 4431 if (c1 > (1<<11) * 26) 4432 c1 += utf16Fixup[c1>>11]; 4433 if (c2 > (1<<11) * 26) 4434 c2 += utf16Fixup[c2>>11]; 4435 /* now c1 and c2 are in UTF-32-compatible order */ 4436 4437 if (c1 != c2) 4438 return (c1 < c2) ? -1 : 1; 4439 4440 len1--; len2--; 4441 } 4442 4443 return (len1 < len2) ? -1 : (len1 != len2); 4444} 4445 4446#else 4447 4448static int 4449unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2) 4450{ 4451 register int len1, len2; 4452 4453 Py_UNICODE *s1 = str1->str; 4454 Py_UNICODE *s2 = str2->str; 4455 4456 len1 = str1->length; 4457 len2 = str2->length; 4458 4459 while (len1 > 0 && len2 > 0) { 4460 Py_UNICODE c1, c2; 4461 4462 c1 = *s1++; 4463 c2 = *s2++; 4464 4465 if (c1 != c2) 4466 return (c1 < c2) ? -1 : 1; 4467 4468 len1--; len2--; 4469 } 4470 4471 return (len1 < len2) ? -1 : (len1 != len2); 4472} 4473 4474#endif 4475 4476int PyUnicode_Compare(PyObject *left, 4477 PyObject *right) 4478{ 4479 PyUnicodeObject *u = NULL, *v = NULL; 4480 int result; 4481 4482 /* Coerce the two arguments */ 4483 u = (PyUnicodeObject *)PyUnicode_FromObject(left); 4484 if (u == NULL) 4485 goto onError; 4486 v = (PyUnicodeObject *)PyUnicode_FromObject(right); 4487 if (v == NULL) 4488 goto onError; 4489 4490 /* Shortcut for empty or interned objects */ 4491 if (v == u) { 4492 Py_DECREF(u); 4493 Py_DECREF(v); 4494 return 0; 4495 } 4496 4497 result = unicode_compare(u, v); 4498 4499 Py_DECREF(u); 4500 Py_DECREF(v); 4501 return result; 4502 4503onError: 4504 Py_XDECREF(u); 4505 Py_XDECREF(v); 4506 return -1; 4507} 4508 4509int PyUnicode_Contains(PyObject *container, 4510 PyObject *element) 4511{ 4512 PyUnicodeObject *u = NULL, *v = NULL; 4513 int result, size; 4514 register const Py_UNICODE *lhs, *end, *rhs; 4515 4516 /* Coerce the two arguments */ 4517 v = (PyUnicodeObject *)PyUnicode_FromObject(element); 4518 if (v == NULL) { 4519 PyErr_SetString(PyExc_TypeError, 4520 "'in <string>' requires string as left operand"); 4521 goto onError; 4522 } 4523 u = (PyUnicodeObject *)PyUnicode_FromObject(container); 4524 if (u == NULL) 4525 goto onError; 4526 4527 size = PyUnicode_GET_SIZE(v); 4528 rhs = PyUnicode_AS_UNICODE(v); 4529 lhs = PyUnicode_AS_UNICODE(u); 4530 4531 result = 0; 4532 if (size == 1) { 4533 end = lhs + PyUnicode_GET_SIZE(u); 4534 while (lhs < end) { 4535 if (*lhs++ == *rhs) { 4536 result = 1; 4537 break; 4538 } 4539 } 4540 } 4541 else { 4542 end = lhs + (PyUnicode_GET_SIZE(u) - size); 4543 while (lhs <= end) { 4544 if (memcmp(lhs++, rhs, size * sizeof(Py_UNICODE)) == 0) { 4545 result = 1; 4546 break; 4547 } 4548 } 4549 } 4550 4551 Py_DECREF(u); 4552 Py_DECREF(v); 4553 return result; 4554 4555onError: 4556 Py_XDECREF(u); 4557 Py_XDECREF(v); 4558 return -1; 4559} 4560 4561/* Concat to string or Unicode object giving a new Unicode object. */ 4562 4563PyObject *PyUnicode_Concat(PyObject *left, 4564 PyObject *right) 4565{ 4566 PyUnicodeObject *u = NULL, *v = NULL, *w; 4567 4568 /* Coerce the two arguments */ 4569 u = (PyUnicodeObject *)PyUnicode_FromObject(left); 4570 if (u == NULL) 4571 goto onError; 4572 v = (PyUnicodeObject *)PyUnicode_FromObject(right); 4573 if (v == NULL) 4574 goto onError; 4575 4576 /* Shortcuts */ 4577 if (v == unicode_empty) { 4578 Py_DECREF(v); 4579 return (PyObject *)u; 4580 } 4581 if (u == unicode_empty) { 4582 Py_DECREF(u); 4583 return (PyObject *)v; 4584 } 4585 4586 /* Concat the two Unicode strings */ 4587 w = _PyUnicode_New(u->length + v->length); 4588 if (w == NULL) 4589 goto onError; 4590 Py_UNICODE_COPY(w->str, u->str, u->length); 4591 Py_UNICODE_COPY(w->str + u->length, v->str, v->length); 4592 4593 Py_DECREF(u); 4594 Py_DECREF(v); 4595 return (PyObject *)w; 4596 4597onError: 4598 Py_XDECREF(u); 4599 Py_XDECREF(v); 4600 return NULL; 4601} 4602 4603PyDoc_STRVAR(count__doc__, 4604"S.count(sub[, start[, end]]) -> int\n\ 4605\n\ 4606Return the number of occurrences of substring sub in Unicode string\n\ 4607S[start:end]. Optional arguments start and end are\n\ 4608interpreted as in slice notation."); 4609 4610static PyObject * 4611unicode_count(PyUnicodeObject *self, PyObject *args) 4612{ 4613 PyUnicodeObject *substring; 4614 int start = 0; 4615 int end = INT_MAX; 4616 PyObject *result; 4617 4618 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring, 4619 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end)) 4620 return NULL; 4621 4622 substring = (PyUnicodeObject *)PyUnicode_FromObject( 4623 (PyObject *)substring); 4624 if (substring == NULL) 4625 return NULL; 4626 4627 if (start < 0) 4628 start += self->length; 4629 if (start < 0) 4630 start = 0; 4631 if (end > self->length) 4632 end = self->length; 4633 if (end < 0) 4634 end += self->length; 4635 if (end < 0) 4636 end = 0; 4637 4638 result = PyInt_FromLong((long) count(self, start, end, substring)); 4639 4640 Py_DECREF(substring); 4641 return result; 4642} 4643 4644PyDoc_STRVAR(encode__doc__, 4645"S.encode([encoding[,errors]]) -> string\n\ 4646\n\ 4647Return an encoded string version of S. Default encoding is the current\n\ 4648default string encoding. errors may be given to set a different error\n\ 4649handling scheme. Default is 'strict' meaning that encoding errors raise\n\ 4650a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\ 4651'xmlcharrefreplace' as well as any other name registered with\n\ 4652codecs.register_error that can handle UnicodeEncodeErrors."); 4653 4654static PyObject * 4655unicode_encode(PyUnicodeObject *self, PyObject *args) 4656{ 4657 char *encoding = NULL; 4658 char *errors = NULL; 4659 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors)) 4660 return NULL; 4661 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors); 4662} 4663 4664PyDoc_STRVAR(expandtabs__doc__, 4665"S.expandtabs([tabsize]) -> unicode\n\ 4666\n\ 4667Return a copy of S where all tab characters are expanded using spaces.\n\ 4668If tabsize is not given, a tab size of 8 characters is assumed."); 4669 4670static PyObject* 4671unicode_expandtabs(PyUnicodeObject *self, PyObject *args) 4672{ 4673 Py_UNICODE *e; 4674 Py_UNICODE *p; 4675 Py_UNICODE *q; 4676 int i, j; 4677 PyUnicodeObject *u; 4678 int tabsize = 8; 4679 4680 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize)) 4681 return NULL; 4682 4683 /* First pass: determine size of output string */ 4684 i = j = 0; 4685 e = self->str + self->length; 4686 for (p = self->str; p < e; p++) 4687 if (*p == '\t') { 4688 if (tabsize > 0) 4689 j += tabsize - (j % tabsize); 4690 } 4691 else { 4692 j++; 4693 if (*p == '\n' || *p == '\r') { 4694 i += j; 4695 j = 0; 4696 } 4697 } 4698 4699 /* Second pass: create output string and fill it */ 4700 u = _PyUnicode_New(i + j); 4701 if (!u) 4702 return NULL; 4703 4704 j = 0; 4705 q = u->str; 4706 4707 for (p = self->str; p < e; p++) 4708 if (*p == '\t') { 4709 if (tabsize > 0) { 4710 i = tabsize - (j % tabsize); 4711 j += i; 4712 while (i--) 4713 *q++ = ' '; 4714 } 4715 } 4716 else { 4717 j++; 4718 *q++ = *p; 4719 if (*p == '\n' || *p == '\r') 4720 j = 0; 4721 } 4722 4723 return (PyObject*) u; 4724} 4725 4726PyDoc_STRVAR(find__doc__, 4727"S.find(sub [,start [,end]]) -> int\n\ 4728\n\ 4729Return the lowest index in S where substring sub is found,\n\ 4730such that sub is contained within s[start,end]. Optional\n\ 4731arguments start and end are interpreted as in slice notation.\n\ 4732\n\ 4733Return -1 on failure."); 4734 4735static PyObject * 4736unicode_find(PyUnicodeObject *self, PyObject *args) 4737{ 4738 PyUnicodeObject *substring; 4739 int start = 0; 4740 int end = INT_MAX; 4741 PyObject *result; 4742 4743 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring, 4744 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end)) 4745 return NULL; 4746 substring = (PyUnicodeObject *)PyUnicode_FromObject( 4747 (PyObject *)substring); 4748 if (substring == NULL) 4749 return NULL; 4750 4751 result = PyInt_FromLong(findstring(self, substring, start, end, 1)); 4752 4753 Py_DECREF(substring); 4754 return result; 4755} 4756 4757static PyObject * 4758unicode_getitem(PyUnicodeObject *self, int index) 4759{ 4760 if (index < 0 || index >= self->length) { 4761 PyErr_SetString(PyExc_IndexError, "string index out of range"); 4762 return NULL; 4763 } 4764 4765 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1); 4766} 4767 4768static long 4769unicode_hash(PyUnicodeObject *self) 4770{ 4771 /* Since Unicode objects compare equal to their ASCII string 4772 counterparts, they should use the individual character values 4773 as basis for their hash value. This is needed to assure that 4774 strings and Unicode objects behave in the same way as 4775 dictionary keys. */ 4776 4777 register int len; 4778 register Py_UNICODE *p; 4779 register long x; 4780 4781 if (self->hash != -1) 4782 return self->hash; 4783 len = PyUnicode_GET_SIZE(self); 4784 p = PyUnicode_AS_UNICODE(self); 4785 x = *p << 7; 4786 while (--len >= 0) 4787 x = (1000003*x) ^ *p++; 4788 x ^= PyUnicode_GET_SIZE(self); 4789 if (x == -1) 4790 x = -2; 4791 self->hash = x; 4792 return x; 4793} 4794 4795PyDoc_STRVAR(index__doc__, 4796"S.index(sub [,start [,end]]) -> int\n\ 4797\n\ 4798Like S.find() but raise ValueError when the substring is not found."); 4799 4800static PyObject * 4801unicode_index(PyUnicodeObject *self, PyObject *args) 4802{ 4803 int result; 4804 PyUnicodeObject *substring; 4805 int start = 0; 4806 int end = INT_MAX; 4807 4808 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring, 4809 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end)) 4810 return NULL; 4811 4812 substring = (PyUnicodeObject *)PyUnicode_FromObject( 4813 (PyObject *)substring); 4814 if (substring == NULL) 4815 return NULL; 4816 4817 result = findstring(self, substring, start, end, 1); 4818 4819 Py_DECREF(substring); 4820 if (result < 0) { 4821 PyErr_SetString(PyExc_ValueError, "substring not found"); 4822 return NULL; 4823 } 4824 return PyInt_FromLong(result); 4825} 4826 4827PyDoc_STRVAR(islower__doc__, 4828"S.islower() -> bool\n\ 4829\n\ 4830Return True if all cased characters in S are lowercase and there is\n\ 4831at least one cased character in S, False otherwise."); 4832 4833static PyObject* 4834unicode_islower(PyUnicodeObject *self) 4835{ 4836 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 4837 register const Py_UNICODE *e; 4838 int cased; 4839 4840 /* Shortcut for single character strings */ 4841 if (PyUnicode_GET_SIZE(self) == 1) 4842 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p)); 4843 4844 /* Special case for empty strings */ 4845 if (PyString_GET_SIZE(self) == 0) 4846 return PyBool_FromLong(0); 4847 4848 e = p + PyUnicode_GET_SIZE(self); 4849 cased = 0; 4850 for (; p < e; p++) { 4851 register const Py_UNICODE ch = *p; 4852 4853 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) 4854 return PyBool_FromLong(0); 4855 else if (!cased && Py_UNICODE_ISLOWER(ch)) 4856 cased = 1; 4857 } 4858 return PyBool_FromLong(cased); 4859} 4860 4861PyDoc_STRVAR(isupper__doc__, 4862"S.isupper() -> bool\n\ 4863\n\ 4864Return True if all cased characters in S are uppercase and there is\n\ 4865at least one cased character in S, False otherwise."); 4866 4867static PyObject* 4868unicode_isupper(PyUnicodeObject *self) 4869{ 4870 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 4871 register const Py_UNICODE *e; 4872 int cased; 4873 4874 /* Shortcut for single character strings */ 4875 if (PyUnicode_GET_SIZE(self) == 1) 4876 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0); 4877 4878 /* Special case for empty strings */ 4879 if (PyString_GET_SIZE(self) == 0) 4880 return PyBool_FromLong(0); 4881 4882 e = p + PyUnicode_GET_SIZE(self); 4883 cased = 0; 4884 for (; p < e; p++) { 4885 register const Py_UNICODE ch = *p; 4886 4887 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch)) 4888 return PyBool_FromLong(0); 4889 else if (!cased && Py_UNICODE_ISUPPER(ch)) 4890 cased = 1; 4891 } 4892 return PyBool_FromLong(cased); 4893} 4894 4895PyDoc_STRVAR(istitle__doc__, 4896"S.istitle() -> bool\n\ 4897\n\ 4898Return True if S is a titlecased string, i.e. upper- and titlecase\n\ 4899characters may only follow uncased characters and lowercase characters\n\ 4900only cased ones. Return False otherwise."); 4901 4902static PyObject* 4903unicode_istitle(PyUnicodeObject *self) 4904{ 4905 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 4906 register const Py_UNICODE *e; 4907 int cased, previous_is_cased; 4908 4909 /* Shortcut for single character strings */ 4910 if (PyUnicode_GET_SIZE(self) == 1) 4911 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) || 4912 (Py_UNICODE_ISUPPER(*p) != 0)); 4913 4914 /* Special case for empty strings */ 4915 if (PyString_GET_SIZE(self) == 0) 4916 return PyBool_FromLong(0); 4917 4918 e = p + PyUnicode_GET_SIZE(self); 4919 cased = 0; 4920 previous_is_cased = 0; 4921 for (; p < e; p++) { 4922 register const Py_UNICODE ch = *p; 4923 4924 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) { 4925 if (previous_is_cased) 4926 return PyBool_FromLong(0); 4927 previous_is_cased = 1; 4928 cased = 1; 4929 } 4930 else if (Py_UNICODE_ISLOWER(ch)) { 4931 if (!previous_is_cased) 4932 return PyBool_FromLong(0); 4933 previous_is_cased = 1; 4934 cased = 1; 4935 } 4936 else 4937 previous_is_cased = 0; 4938 } 4939 return PyBool_FromLong(cased); 4940} 4941 4942PyDoc_STRVAR(isspace__doc__, 4943"S.isspace() -> bool\n\ 4944\n\ 4945Return True if there are only whitespace characters in S,\n\ 4946False otherwise."); 4947 4948static PyObject* 4949unicode_isspace(PyUnicodeObject *self) 4950{ 4951 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 4952 register const Py_UNICODE *e; 4953 4954 /* Shortcut for single character strings */ 4955 if (PyUnicode_GET_SIZE(self) == 1 && 4956 Py_UNICODE_ISSPACE(*p)) 4957 return PyBool_FromLong(1); 4958 4959 /* Special case for empty strings */ 4960 if (PyString_GET_SIZE(self) == 0) 4961 return PyBool_FromLong(0); 4962 4963 e = p + PyUnicode_GET_SIZE(self); 4964 for (; p < e; p++) { 4965 if (!Py_UNICODE_ISSPACE(*p)) 4966 return PyBool_FromLong(0); 4967 } 4968 return PyBool_FromLong(1); 4969} 4970 4971PyDoc_STRVAR(isalpha__doc__, 4972"S.isalpha() -> bool\n\ 4973\n\ 4974Return True if all characters in S are alphabetic\n\ 4975and there is at least one character in S, False otherwise."); 4976 4977static PyObject* 4978unicode_isalpha(PyUnicodeObject *self) 4979{ 4980 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 4981 register const Py_UNICODE *e; 4982 4983 /* Shortcut for single character strings */ 4984 if (PyUnicode_GET_SIZE(self) == 1 && 4985 Py_UNICODE_ISALPHA(*p)) 4986 return PyBool_FromLong(1); 4987 4988 /* Special case for empty strings */ 4989 if (PyString_GET_SIZE(self) == 0) 4990 return PyBool_FromLong(0); 4991 4992 e = p + PyUnicode_GET_SIZE(self); 4993 for (; p < e; p++) { 4994 if (!Py_UNICODE_ISALPHA(*p)) 4995 return PyBool_FromLong(0); 4996 } 4997 return PyBool_FromLong(1); 4998} 4999 5000PyDoc_STRVAR(isalnum__doc__, 5001"S.isalnum() -> bool\n\ 5002\n\ 5003Return True if all characters in S are alphanumeric\n\ 5004and there is at least one character in S, False otherwise."); 5005 5006static PyObject* 5007unicode_isalnum(PyUnicodeObject *self) 5008{ 5009 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 5010 register const Py_UNICODE *e; 5011 5012 /* Shortcut for single character strings */ 5013 if (PyUnicode_GET_SIZE(self) == 1 && 5014 Py_UNICODE_ISALNUM(*p)) 5015 return PyBool_FromLong(1); 5016 5017 /* Special case for empty strings */ 5018 if (PyString_GET_SIZE(self) == 0) 5019 return PyBool_FromLong(0); 5020 5021 e = p + PyUnicode_GET_SIZE(self); 5022 for (; p < e; p++) { 5023 if (!Py_UNICODE_ISALNUM(*p)) 5024 return PyBool_FromLong(0); 5025 } 5026 return PyBool_FromLong(1); 5027} 5028 5029PyDoc_STRVAR(isdecimal__doc__, 5030"S.isdecimal() -> bool\n\ 5031\n\ 5032Return True if there are only decimal characters in S,\n\ 5033False otherwise."); 5034 5035static PyObject* 5036unicode_isdecimal(PyUnicodeObject *self) 5037{ 5038 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 5039 register const Py_UNICODE *e; 5040 5041 /* Shortcut for single character strings */ 5042 if (PyUnicode_GET_SIZE(self) == 1 && 5043 Py_UNICODE_ISDECIMAL(*p)) 5044 return PyBool_FromLong(1); 5045 5046 /* Special case for empty strings */ 5047 if (PyString_GET_SIZE(self) == 0) 5048 return PyBool_FromLong(0); 5049 5050 e = p + PyUnicode_GET_SIZE(self); 5051 for (; p < e; p++) { 5052 if (!Py_UNICODE_ISDECIMAL(*p)) 5053 return PyBool_FromLong(0); 5054 } 5055 return PyBool_FromLong(1); 5056} 5057 5058PyDoc_STRVAR(isdigit__doc__, 5059"S.isdigit() -> bool\n\ 5060\n\ 5061Return True if there are only digit characters in S,\n\ 5062False otherwise."); 5063 5064static PyObject* 5065unicode_isdigit(PyUnicodeObject *self) 5066{ 5067 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 5068 register const Py_UNICODE *e; 5069 5070 /* Shortcut for single character strings */ 5071 if (PyUnicode_GET_SIZE(self) == 1 && 5072 Py_UNICODE_ISDIGIT(*p)) 5073 return PyBool_FromLong(1); 5074 5075 /* Special case for empty strings */ 5076 if (PyString_GET_SIZE(self) == 0) 5077 return PyBool_FromLong(0); 5078 5079 e = p + PyUnicode_GET_SIZE(self); 5080 for (; p < e; p++) { 5081 if (!Py_UNICODE_ISDIGIT(*p)) 5082 return PyBool_FromLong(0); 5083 } 5084 return PyBool_FromLong(1); 5085} 5086 5087PyDoc_STRVAR(isnumeric__doc__, 5088"S.isnumeric() -> bool\n\ 5089\n\ 5090Return True if there are only numeric characters in S,\n\ 5091False otherwise."); 5092 5093static PyObject* 5094unicode_isnumeric(PyUnicodeObject *self) 5095{ 5096 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 5097 register const Py_UNICODE *e; 5098 5099 /* Shortcut for single character strings */ 5100 if (PyUnicode_GET_SIZE(self) == 1 && 5101 Py_UNICODE_ISNUMERIC(*p)) 5102 return PyBool_FromLong(1); 5103 5104 /* Special case for empty strings */ 5105 if (PyString_GET_SIZE(self) == 0) 5106 return PyBool_FromLong(0); 5107 5108 e = p + PyUnicode_GET_SIZE(self); 5109 for (; p < e; p++) { 5110 if (!Py_UNICODE_ISNUMERIC(*p)) 5111 return PyBool_FromLong(0); 5112 } 5113 return PyBool_FromLong(1); 5114} 5115 5116PyDoc_STRVAR(join__doc__, 5117"S.join(sequence) -> unicode\n\ 5118\n\ 5119Return a string which is the concatenation of the strings in the\n\ 5120sequence. The separator between elements is S."); 5121 5122static PyObject* 5123unicode_join(PyObject *self, PyObject *data) 5124{ 5125 return PyUnicode_Join(self, data); 5126} 5127 5128static int 5129unicode_length(PyUnicodeObject *self) 5130{ 5131 return self->length; 5132} 5133 5134PyDoc_STRVAR(ljust__doc__, 5135"S.ljust(width) -> unicode\n\ 5136\n\ 5137Return S left justified in a Unicode string of length width. Padding is\n\ 5138done using spaces."); 5139 5140static PyObject * 5141unicode_ljust(PyUnicodeObject *self, PyObject *args) 5142{ 5143 int width; 5144 if (!PyArg_ParseTuple(args, "i:ljust", &width)) 5145 return NULL; 5146 5147 if (self->length >= width && PyUnicode_CheckExact(self)) { 5148 Py_INCREF(self); 5149 return (PyObject*) self; 5150 } 5151 5152 return (PyObject*) pad(self, 0, width - self->length, ' '); 5153} 5154 5155PyDoc_STRVAR(lower__doc__, 5156"S.lower() -> unicode\n\ 5157\n\ 5158Return a copy of the string S converted to lowercase."); 5159 5160static PyObject* 5161unicode_lower(PyUnicodeObject *self) 5162{ 5163 return fixup(self, fixlower); 5164} 5165 5166#define LEFTSTRIP 0 5167#define RIGHTSTRIP 1 5168#define BOTHSTRIP 2 5169 5170/* Arrays indexed by above */ 5171static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"}; 5172 5173#define STRIPNAME(i) (stripformat[i]+3) 5174 5175static const Py_UNICODE * 5176unicode_memchr(const Py_UNICODE *s, Py_UNICODE c, size_t n) 5177{ 5178 size_t i; 5179 for (i = 0; i < n; ++i) 5180 if (s[i] == c) 5181 return s+i; 5182 return NULL; 5183} 5184 5185/* externally visible for str.strip(unicode) */ 5186PyObject * 5187_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj) 5188{ 5189 Py_UNICODE *s = PyUnicode_AS_UNICODE(self); 5190 int len = PyUnicode_GET_SIZE(self); 5191 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj); 5192 int seplen = PyUnicode_GET_SIZE(sepobj); 5193 int i, j; 5194 5195 i = 0; 5196 if (striptype != RIGHTSTRIP) { 5197 while (i < len && unicode_memchr(sep, s[i], seplen)) { 5198 i++; 5199 } 5200 } 5201 5202 j = len; 5203 if (striptype != LEFTSTRIP) { 5204 do { 5205 j--; 5206 } while (j >= i && unicode_memchr(sep, s[j], seplen)); 5207 j++; 5208 } 5209 5210 if (i == 0 && j == len && PyUnicode_CheckExact(self)) { 5211 Py_INCREF(self); 5212 return (PyObject*)self; 5213 } 5214 else 5215 return PyUnicode_FromUnicode(s+i, j-i); 5216} 5217 5218 5219static PyObject * 5220do_strip(PyUnicodeObject *self, int striptype) 5221{ 5222 Py_UNICODE *s = PyUnicode_AS_UNICODE(self); 5223 int len = PyUnicode_GET_SIZE(self), i, j; 5224 5225 i = 0; 5226 if (striptype != RIGHTSTRIP) { 5227 while (i < len && Py_UNICODE_ISSPACE(s[i])) { 5228 i++; 5229 } 5230 } 5231 5232 j = len; 5233 if (striptype != LEFTSTRIP) { 5234 do { 5235 j--; 5236 } while (j >= i && Py_UNICODE_ISSPACE(s[j])); 5237 j++; 5238 } 5239 5240 if (i == 0 && j == len && PyUnicode_CheckExact(self)) { 5241 Py_INCREF(self); 5242 return (PyObject*)self; 5243 } 5244 else 5245 return PyUnicode_FromUnicode(s+i, j-i); 5246} 5247 5248 5249static PyObject * 5250do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args) 5251{ 5252 PyObject *sep = NULL; 5253 5254 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep)) 5255 return NULL; 5256 5257 if (sep != NULL && sep != Py_None) { 5258 if (PyUnicode_Check(sep)) 5259 return _PyUnicode_XStrip(self, striptype, sep); 5260 else if (PyString_Check(sep)) { 5261 PyObject *res; 5262 sep = PyUnicode_FromObject(sep); 5263 if (sep==NULL) 5264 return NULL; 5265 res = _PyUnicode_XStrip(self, striptype, sep); 5266 Py_DECREF(sep); 5267 return res; 5268 } 5269 else { 5270 PyErr_Format(PyExc_TypeError, 5271 "%s arg must be None, unicode or str", 5272 STRIPNAME(striptype)); 5273 return NULL; 5274 } 5275 } 5276 5277 return do_strip(self, striptype); 5278} 5279 5280 5281PyDoc_STRVAR(strip__doc__, 5282"S.strip([chars]) -> unicode\n\ 5283\n\ 5284Return a copy of the string S with leading and trailing\n\ 5285whitespace removed.\n\ 5286If chars is given and not None, remove characters in chars instead.\n\ 5287If chars is a str, it will be converted to unicode before stripping"); 5288 5289static PyObject * 5290unicode_strip(PyUnicodeObject *self, PyObject *args) 5291{ 5292 if (PyTuple_GET_SIZE(args) == 0) 5293 return do_strip(self, BOTHSTRIP); /* Common case */ 5294 else 5295 return do_argstrip(self, BOTHSTRIP, args); 5296} 5297 5298 5299PyDoc_STRVAR(lstrip__doc__, 5300"S.lstrip([chars]) -> unicode\n\ 5301\n\ 5302Return a copy of the string S with leading whitespace removed.\n\ 5303If chars is given and not None, remove characters in chars instead.\n\ 5304If chars is a str, it will be converted to unicode before stripping"); 5305 5306static PyObject * 5307unicode_lstrip(PyUnicodeObject *self, PyObject *args) 5308{ 5309 if (PyTuple_GET_SIZE(args) == 0) 5310 return do_strip(self, LEFTSTRIP); /* Common case */ 5311 else 5312 return do_argstrip(self, LEFTSTRIP, args); 5313} 5314 5315 5316PyDoc_STRVAR(rstrip__doc__, 5317"S.rstrip([chars]) -> unicode\n\ 5318\n\ 5319Return a copy of the string S with trailing whitespace removed.\n\ 5320If chars is given and not None, remove characters in chars instead.\n\ 5321If chars is a str, it will be converted to unicode before stripping"); 5322 5323static PyObject * 5324unicode_rstrip(PyUnicodeObject *self, PyObject *args) 5325{ 5326 if (PyTuple_GET_SIZE(args) == 0) 5327 return do_strip(self, RIGHTSTRIP); /* Common case */ 5328 else 5329 return do_argstrip(self, RIGHTSTRIP, args); 5330} 5331 5332 5333static PyObject* 5334unicode_repeat(PyUnicodeObject *str, int len) 5335{ 5336 PyUnicodeObject *u; 5337 Py_UNICODE *p; 5338 int nchars; 5339 size_t nbytes; 5340 5341 if (len < 0) 5342 len = 0; 5343 5344 if (len == 1 && PyUnicode_CheckExact(str)) { 5345 /* no repeat, return original string */ 5346 Py_INCREF(str); 5347 return (PyObject*) str; 5348 } 5349 5350 /* ensure # of chars needed doesn't overflow int and # of bytes 5351 * needed doesn't overflow size_t 5352 */ 5353 nchars = len * str->length; 5354 if (len && nchars / len != str->length) { 5355 PyErr_SetString(PyExc_OverflowError, 5356 "repeated string is too long"); 5357 return NULL; 5358 } 5359 nbytes = (nchars + 1) * sizeof(Py_UNICODE); 5360 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) { 5361 PyErr_SetString(PyExc_OverflowError, 5362 "repeated string is too long"); 5363 return NULL; 5364 } 5365 u = _PyUnicode_New(nchars); 5366 if (!u) 5367 return NULL; 5368 5369 p = u->str; 5370 5371 while (len-- > 0) { 5372 Py_UNICODE_COPY(p, str->str, str->length); 5373 p += str->length; 5374 } 5375 5376 return (PyObject*) u; 5377} 5378 5379PyObject *PyUnicode_Replace(PyObject *obj, 5380 PyObject *subobj, 5381 PyObject *replobj, 5382 int maxcount) 5383{ 5384 PyObject *self; 5385 PyObject *str1; 5386 PyObject *str2; 5387 PyObject *result; 5388 5389 self = PyUnicode_FromObject(obj); 5390 if (self == NULL) 5391 return NULL; 5392 str1 = PyUnicode_FromObject(subobj); 5393 if (str1 == NULL) { 5394 Py_DECREF(self); 5395 return NULL; 5396 } 5397 str2 = PyUnicode_FromObject(replobj); 5398 if (str2 == NULL) { 5399 Py_DECREF(self); 5400 Py_DECREF(str1); 5401 return NULL; 5402 } 5403 result = replace((PyUnicodeObject *)self, 5404 (PyUnicodeObject *)str1, 5405 (PyUnicodeObject *)str2, 5406 maxcount); 5407 Py_DECREF(self); 5408 Py_DECREF(str1); 5409 Py_DECREF(str2); 5410 return result; 5411} 5412 5413PyDoc_STRVAR(replace__doc__, 5414"S.replace (old, new[, maxsplit]) -> unicode\n\ 5415\n\ 5416Return a copy of S with all occurrences of substring\n\ 5417old replaced by new. If the optional argument maxsplit is\n\ 5418given, only the first maxsplit occurrences are replaced."); 5419 5420static PyObject* 5421unicode_replace(PyUnicodeObject *self, PyObject *args) 5422{ 5423 PyUnicodeObject *str1; 5424 PyUnicodeObject *str2; 5425 int maxcount = -1; 5426 PyObject *result; 5427 5428 if (!PyArg_ParseTuple(args, "OO|i:replace", &str1, &str2, &maxcount)) 5429 return NULL; 5430 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1); 5431 if (str1 == NULL) 5432 return NULL; 5433 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2); 5434 if (str2 == NULL) { 5435 Py_DECREF(str1); 5436 return NULL; 5437 } 5438 5439 result = replace(self, str1, str2, maxcount); 5440 5441 Py_DECREF(str1); 5442 Py_DECREF(str2); 5443 return result; 5444} 5445 5446static 5447PyObject *unicode_repr(PyObject *unicode) 5448{ 5449 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode), 5450 PyUnicode_GET_SIZE(unicode), 5451 1); 5452} 5453 5454PyDoc_STRVAR(rfind__doc__, 5455"S.rfind(sub [,start [,end]]) -> int\n\ 5456\n\ 5457Return the highest index in S where substring sub is found,\n\ 5458such that sub is contained within s[start,end]. Optional\n\ 5459arguments start and end are interpreted as in slice notation.\n\ 5460\n\ 5461Return -1 on failure."); 5462 5463static PyObject * 5464unicode_rfind(PyUnicodeObject *self, PyObject *args) 5465{ 5466 PyUnicodeObject *substring; 5467 int start = 0; 5468 int end = INT_MAX; 5469 PyObject *result; 5470 5471 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring, 5472 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end)) 5473 return NULL; 5474 substring = (PyUnicodeObject *)PyUnicode_FromObject( 5475 (PyObject *)substring); 5476 if (substring == NULL) 5477 return NULL; 5478 5479 result = PyInt_FromLong(findstring(self, substring, start, end, -1)); 5480 5481 Py_DECREF(substring); 5482 return result; 5483} 5484 5485PyDoc_STRVAR(rindex__doc__, 5486"S.rindex(sub [,start [,end]]) -> int\n\ 5487\n\ 5488Like S.rfind() but raise ValueError when the substring is not found."); 5489 5490static PyObject * 5491unicode_rindex(PyUnicodeObject *self, PyObject *args) 5492{ 5493 int result; 5494 PyUnicodeObject *substring; 5495 int start = 0; 5496 int end = INT_MAX; 5497 5498 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring, 5499 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end)) 5500 return NULL; 5501 substring = (PyUnicodeObject *)PyUnicode_FromObject( 5502 (PyObject *)substring); 5503 if (substring == NULL) 5504 return NULL; 5505 5506 result = findstring(self, substring, start, end, -1); 5507 5508 Py_DECREF(substring); 5509 if (result < 0) { 5510 PyErr_SetString(PyExc_ValueError, "substring not found"); 5511 return NULL; 5512 } 5513 return PyInt_FromLong(result); 5514} 5515 5516PyDoc_STRVAR(rjust__doc__, 5517"S.rjust(width) -> unicode\n\ 5518\n\ 5519Return S right justified in a Unicode string of length width. Padding is\n\ 5520done using spaces."); 5521 5522static PyObject * 5523unicode_rjust(PyUnicodeObject *self, PyObject *args) 5524{ 5525 int width; 5526 if (!PyArg_ParseTuple(args, "i:rjust", &width)) 5527 return NULL; 5528 5529 if (self->length >= width && PyUnicode_CheckExact(self)) { 5530 Py_INCREF(self); 5531 return (PyObject*) self; 5532 } 5533 5534 return (PyObject*) pad(self, width - self->length, 0, ' '); 5535} 5536 5537static PyObject* 5538unicode_slice(PyUnicodeObject *self, int start, int end) 5539{ 5540 /* standard clamping */ 5541 if (start < 0) 5542 start = 0; 5543 if (end < 0) 5544 end = 0; 5545 if (end > self->length) 5546 end = self->length; 5547 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) { 5548 /* full slice, return original string */ 5549 Py_INCREF(self); 5550 return (PyObject*) self; 5551 } 5552 if (start > end) 5553 start = end; 5554 /* copy slice */ 5555 return (PyObject*) PyUnicode_FromUnicode(self->str + start, 5556 end - start); 5557} 5558 5559PyObject *PyUnicode_Split(PyObject *s, 5560 PyObject *sep, 5561 int maxsplit) 5562{ 5563 PyObject *result; 5564 5565 s = PyUnicode_FromObject(s); 5566 if (s == NULL) 5567 return NULL; 5568 if (sep != NULL) { 5569 sep = PyUnicode_FromObject(sep); 5570 if (sep == NULL) { 5571 Py_DECREF(s); 5572 return NULL; 5573 } 5574 } 5575 5576 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit); 5577 5578 Py_DECREF(s); 5579 Py_XDECREF(sep); 5580 return result; 5581} 5582 5583PyDoc_STRVAR(split__doc__, 5584"S.split([sep [,maxsplit]]) -> list of strings\n\ 5585\n\ 5586Return a list of the words in S, using sep as the\n\ 5587delimiter string. If maxsplit is given, at most maxsplit\n\ 5588splits are done. If sep is not specified, any whitespace string\n\ 5589is a separator."); 5590 5591static PyObject* 5592unicode_split(PyUnicodeObject *self, PyObject *args) 5593{ 5594 PyObject *substring = Py_None; 5595 int maxcount = -1; 5596 5597 if (!PyArg_ParseTuple(args, "|Oi:split", &substring, &maxcount)) 5598 return NULL; 5599 5600 if (substring == Py_None) 5601 return split(self, NULL, maxcount); 5602 else if (PyUnicode_Check(substring)) 5603 return split(self, (PyUnicodeObject *)substring, maxcount); 5604 else 5605 return PyUnicode_Split((PyObject *)self, substring, maxcount); 5606} 5607 5608PyDoc_STRVAR(splitlines__doc__, 5609"S.splitlines([keepends]]) -> list of strings\n\ 5610\n\ 5611Return a list of the lines in S, breaking at line boundaries.\n\ 5612Line breaks are not included in the resulting list unless keepends\n\ 5613is given and true."); 5614 5615static PyObject* 5616unicode_splitlines(PyUnicodeObject *self, PyObject *args) 5617{ 5618 int keepends = 0; 5619 5620 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends)) 5621 return NULL; 5622 5623 return PyUnicode_Splitlines((PyObject *)self, keepends); 5624} 5625 5626static 5627PyObject *unicode_str(PyUnicodeObject *self) 5628{ 5629 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL); 5630} 5631 5632PyDoc_STRVAR(swapcase__doc__, 5633"S.swapcase() -> unicode\n\ 5634\n\ 5635Return a copy of S with uppercase characters converted to lowercase\n\ 5636and vice versa."); 5637 5638static PyObject* 5639unicode_swapcase(PyUnicodeObject *self) 5640{ 5641 return fixup(self, fixswapcase); 5642} 5643 5644PyDoc_STRVAR(translate__doc__, 5645"S.translate(table) -> unicode\n\ 5646\n\ 5647Return a copy of the string S, where all characters have been mapped\n\ 5648through the given translation table, which must be a mapping of\n\ 5649Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\ 5650Unmapped characters are left untouched. Characters mapped to None\n\ 5651are deleted."); 5652 5653static PyObject* 5654unicode_translate(PyUnicodeObject *self, PyObject *table) 5655{ 5656 return PyUnicode_TranslateCharmap(self->str, 5657 self->length, 5658 table, 5659 "ignore"); 5660} 5661 5662PyDoc_STRVAR(upper__doc__, 5663"S.upper() -> unicode\n\ 5664\n\ 5665Return a copy of S converted to uppercase."); 5666 5667static PyObject* 5668unicode_upper(PyUnicodeObject *self) 5669{ 5670 return fixup(self, fixupper); 5671} 5672 5673PyDoc_STRVAR(zfill__doc__, 5674"S.zfill(width) -> unicode\n\ 5675\n\ 5676Pad a numeric string x with zeros on the left, to fill a field\n\ 5677of the specified width. The string x is never truncated."); 5678 5679static PyObject * 5680unicode_zfill(PyUnicodeObject *self, PyObject *args) 5681{ 5682 int fill; 5683 PyUnicodeObject *u; 5684 5685 int width; 5686 if (!PyArg_ParseTuple(args, "i:zfill", &width)) 5687 return NULL; 5688 5689 if (self->length >= width) { 5690 if (PyUnicode_CheckExact(self)) { 5691 Py_INCREF(self); 5692 return (PyObject*) self; 5693 } 5694 else 5695 return PyUnicode_FromUnicode( 5696 PyUnicode_AS_UNICODE(self), 5697 PyUnicode_GET_SIZE(self) 5698 ); 5699 } 5700 5701 fill = width - self->length; 5702 5703 u = pad(self, fill, 0, '0'); 5704 5705 if (u == NULL) 5706 return NULL; 5707 5708 if (u->str[fill] == '+' || u->str[fill] == '-') { 5709 /* move sign to beginning of string */ 5710 u->str[0] = u->str[fill]; 5711 u->str[fill] = '0'; 5712 } 5713 5714 return (PyObject*) u; 5715} 5716 5717#if 0 5718static PyObject* 5719unicode_freelistsize(PyUnicodeObject *self) 5720{ 5721 return PyInt_FromLong(unicode_freelist_size); 5722} 5723#endif 5724 5725PyDoc_STRVAR(startswith__doc__, 5726"S.startswith(prefix[, start[, end]]) -> bool\n\ 5727\n\ 5728Return True if S starts with the specified prefix, False otherwise.\n\ 5729With optional start, test S beginning at that position.\n\ 5730With optional end, stop comparing S at that position."); 5731 5732static PyObject * 5733unicode_startswith(PyUnicodeObject *self, 5734 PyObject *args) 5735{ 5736 PyUnicodeObject *substring; 5737 int start = 0; 5738 int end = INT_MAX; 5739 PyObject *result; 5740 5741 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &substring, 5742 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end)) 5743 return NULL; 5744 substring = (PyUnicodeObject *)PyUnicode_FromObject( 5745 (PyObject *)substring); 5746 if (substring == NULL) 5747 return NULL; 5748 5749 result = PyBool_FromLong(tailmatch(self, substring, start, end, -1)); 5750 5751 Py_DECREF(substring); 5752 return result; 5753} 5754 5755 5756PyDoc_STRVAR(endswith__doc__, 5757"S.endswith(suffix[, start[, end]]) -> bool\n\ 5758\n\ 5759Return True if S ends with the specified suffix, False otherwise.\n\ 5760With optional start, test S beginning at that position.\n\ 5761With optional end, stop comparing S at that position."); 5762 5763static PyObject * 5764unicode_endswith(PyUnicodeObject *self, 5765 PyObject *args) 5766{ 5767 PyUnicodeObject *substring; 5768 int start = 0; 5769 int end = INT_MAX; 5770 PyObject *result; 5771 5772 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &substring, 5773 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end)) 5774 return NULL; 5775 substring = (PyUnicodeObject *)PyUnicode_FromObject( 5776 (PyObject *)substring); 5777 if (substring == NULL) 5778 return NULL; 5779 5780 result = PyBool_FromLong(tailmatch(self, substring, start, end, +1)); 5781 5782 Py_DECREF(substring); 5783 return result; 5784} 5785 5786 5787 5788static PyObject * 5789unicode_getnewargs(PyUnicodeObject *v) 5790{ 5791 return Py_BuildValue("(u#)", v->str, v->length); 5792} 5793 5794 5795static PyMethodDef unicode_methods[] = { 5796 5797 /* Order is according to common usage: often used methods should 5798 appear first, since lookup is done sequentially. */ 5799 5800 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__}, 5801 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__}, 5802 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__}, 5803 {"join", (PyCFunction) unicode_join, METH_O, join__doc__}, 5804 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__}, 5805 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__}, 5806 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__}, 5807 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__}, 5808 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__}, 5809 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__}, 5810 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__}, 5811 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__}, 5812 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__}, 5813 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__}, 5814/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */ 5815 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__}, 5816 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__}, 5817 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__}, 5818 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__}, 5819 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__}, 5820 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__}, 5821 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__}, 5822 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__}, 5823 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__}, 5824 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__}, 5825 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__}, 5826 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__}, 5827 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__}, 5828 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__}, 5829 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__}, 5830 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__}, 5831 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__}, 5832 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__}, 5833 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__}, 5834 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__}, 5835 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__}, 5836#if 0 5837 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__}, 5838#endif 5839 5840#if 0 5841 /* This one is just used for debugging the implementation. */ 5842 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS}, 5843#endif 5844 5845 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS}, 5846 {NULL, NULL} 5847}; 5848 5849static PyObject * 5850unicode_mod(PyObject *v, PyObject *w) 5851{ 5852 if (!PyUnicode_Check(v)) { 5853 Py_INCREF(Py_NotImplemented); 5854 return Py_NotImplemented; 5855 } 5856 return PyUnicode_Format(v, w); 5857} 5858 5859static PyNumberMethods unicode_as_number = { 5860 0, /*nb_add*/ 5861 0, /*nb_subtract*/ 5862 0, /*nb_multiply*/ 5863 0, /*nb_divide*/ 5864 unicode_mod, /*nb_remainder*/ 5865}; 5866 5867static PySequenceMethods unicode_as_sequence = { 5868 (inquiry) unicode_length, /* sq_length */ 5869 (binaryfunc) PyUnicode_Concat, /* sq_concat */ 5870 (intargfunc) unicode_repeat, /* sq_repeat */ 5871 (intargfunc) unicode_getitem, /* sq_item */ 5872 (intintargfunc) unicode_slice, /* sq_slice */ 5873 0, /* sq_ass_item */ 5874 0, /* sq_ass_slice */ 5875 (objobjproc)PyUnicode_Contains, /*sq_contains*/ 5876}; 5877 5878static PyObject* 5879unicode_subscript(PyUnicodeObject* self, PyObject* item) 5880{ 5881 if (PyInt_Check(item)) { 5882 long i = PyInt_AS_LONG(item); 5883 if (i < 0) 5884 i += PyString_GET_SIZE(self); 5885 return unicode_getitem(self, i); 5886 } else if (PyLong_Check(item)) { 5887 long i = PyLong_AsLong(item); 5888 if (i == -1 && PyErr_Occurred()) 5889 return NULL; 5890 if (i < 0) 5891 i += PyString_GET_SIZE(self); 5892 return unicode_getitem(self, i); 5893 } else if (PySlice_Check(item)) { 5894 int start, stop, step, slicelength, cur, i; 5895 Py_UNICODE* source_buf; 5896 Py_UNICODE* result_buf; 5897 PyObject* result; 5898 5899 if (PySlice_GetIndicesEx((PySliceObject*)item, PyString_GET_SIZE(self), 5900 &start, &stop, &step, &slicelength) < 0) { 5901 return NULL; 5902 } 5903 5904 if (slicelength <= 0) { 5905 return PyUnicode_FromUnicode(NULL, 0); 5906 } else { 5907 source_buf = PyUnicode_AS_UNICODE((PyObject*)self); 5908 result_buf = PyMem_MALLOC(slicelength*sizeof(Py_UNICODE)); 5909 5910 for (cur = start, i = 0; i < slicelength; cur += step, i++) { 5911 result_buf[i] = source_buf[cur]; 5912 } 5913 5914 result = PyUnicode_FromUnicode(result_buf, slicelength); 5915 PyMem_FREE(result_buf); 5916 return result; 5917 } 5918 } else { 5919 PyErr_SetString(PyExc_TypeError, "string indices must be integers"); 5920 return NULL; 5921 } 5922} 5923 5924static PyMappingMethods unicode_as_mapping = { 5925 (inquiry)unicode_length, /* mp_length */ 5926 (binaryfunc)unicode_subscript, /* mp_subscript */ 5927 (objobjargproc)0, /* mp_ass_subscript */ 5928}; 5929 5930static int 5931unicode_buffer_getreadbuf(PyUnicodeObject *self, 5932 int index, 5933 const void **ptr) 5934{ 5935 if (index != 0) { 5936 PyErr_SetString(PyExc_SystemError, 5937 "accessing non-existent unicode segment"); 5938 return -1; 5939 } 5940 *ptr = (void *) self->str; 5941 return PyUnicode_GET_DATA_SIZE(self); 5942} 5943 5944static int 5945unicode_buffer_getwritebuf(PyUnicodeObject *self, int index, 5946 const void **ptr) 5947{ 5948 PyErr_SetString(PyExc_TypeError, 5949 "cannot use unicode as modifiable buffer"); 5950 return -1; 5951} 5952 5953static int 5954unicode_buffer_getsegcount(PyUnicodeObject *self, 5955 int *lenp) 5956{ 5957 if (lenp) 5958 *lenp = PyUnicode_GET_DATA_SIZE(self); 5959 return 1; 5960} 5961 5962static int 5963unicode_buffer_getcharbuf(PyUnicodeObject *self, 5964 int index, 5965 const void **ptr) 5966{ 5967 PyObject *str; 5968 5969 if (index != 0) { 5970 PyErr_SetString(PyExc_SystemError, 5971 "accessing non-existent unicode segment"); 5972 return -1; 5973 } 5974 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL); 5975 if (str == NULL) 5976 return -1; 5977 *ptr = (void *) PyString_AS_STRING(str); 5978 return PyString_GET_SIZE(str); 5979} 5980 5981/* Helpers for PyUnicode_Format() */ 5982 5983static PyObject * 5984getnextarg(PyObject *args, int arglen, int *p_argidx) 5985{ 5986 int argidx = *p_argidx; 5987 if (argidx < arglen) { 5988 (*p_argidx)++; 5989 if (arglen < 0) 5990 return args; 5991 else 5992 return PyTuple_GetItem(args, argidx); 5993 } 5994 PyErr_SetString(PyExc_TypeError, 5995 "not enough arguments for format string"); 5996 return NULL; 5997} 5998 5999#define F_LJUST (1<<0) 6000#define F_SIGN (1<<1) 6001#define F_BLANK (1<<2) 6002#define F_ALT (1<<3) 6003#define F_ZERO (1<<4) 6004 6005static 6006int usprintf(register Py_UNICODE *buffer, char *format, ...) 6007{ 6008 register int i; 6009 int len; 6010 va_list va; 6011 char *charbuffer; 6012 va_start(va, format); 6013 6014 /* First, format the string as char array, then expand to Py_UNICODE 6015 array. */ 6016 charbuffer = (char *)buffer; 6017 len = vsprintf(charbuffer, format, va); 6018 for (i = len - 1; i >= 0; i--) 6019 buffer[i] = (Py_UNICODE) charbuffer[i]; 6020 6021 va_end(va); 6022 return len; 6023} 6024 6025/* XXX To save some code duplication, formatfloat/long/int could have been 6026 shared with stringobject.c, converting from 8-bit to Unicode after the 6027 formatting is done. */ 6028 6029static int 6030formatfloat(Py_UNICODE *buf, 6031 size_t buflen, 6032 int flags, 6033 int prec, 6034 int type, 6035 PyObject *v) 6036{ 6037 /* fmt = '%#.' + `prec` + `type` 6038 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/ 6039 char fmt[20]; 6040 double x; 6041 6042 x = PyFloat_AsDouble(v); 6043 if (x == -1.0 && PyErr_Occurred()) 6044 return -1; 6045 if (prec < 0) 6046 prec = 6; 6047 if (type == 'f' && (fabs(x) / 1e25) >= 1e25) 6048 type = 'g'; 6049 /* Worst case length calc to ensure no buffer overrun: 6050 6051 'g' formats: 6052 fmt = %#.<prec>g 6053 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp 6054 for any double rep.) 6055 len = 1 + prec + 1 + 2 + 5 = 9 + prec 6056 6057 'f' formats: 6058 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50) 6059 len = 1 + 50 + 1 + prec = 52 + prec 6060 6061 If prec=0 the effective precision is 1 (the leading digit is 6062 always given), therefore increase the length by one. 6063 6064 */ 6065 if ((type == 'g' && buflen <= (size_t)10 + (size_t)prec) || 6066 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) { 6067 PyErr_SetString(PyExc_OverflowError, 6068 "formatted float is too long (precision too large?)"); 6069 return -1; 6070 } 6071 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c", 6072 (flags&F_ALT) ? "#" : "", 6073 prec, type); 6074 return usprintf(buf, fmt, x); 6075} 6076 6077static PyObject* 6078formatlong(PyObject *val, int flags, int prec, int type) 6079{ 6080 char *buf; 6081 int i, len; 6082 PyObject *str; /* temporary string object. */ 6083 PyUnicodeObject *result; 6084 6085 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len); 6086 if (!str) 6087 return NULL; 6088 result = _PyUnicode_New(len); 6089 for (i = 0; i < len; i++) 6090 result->str[i] = buf[i]; 6091 result->str[len] = 0; 6092 Py_DECREF(str); 6093 return (PyObject*)result; 6094} 6095 6096static int 6097formatint(Py_UNICODE *buf, 6098 size_t buflen, 6099 int flags, 6100 int prec, 6101 int type, 6102 PyObject *v) 6103{ 6104 /* fmt = '%#.' + `prec` + 'l' + `type` 6105 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine) 6106 * + 1 + 1 6107 * = 24 6108 */ 6109 char fmt[64]; /* plenty big enough! */ 6110 long x; 6111 6112 x = PyInt_AsLong(v); 6113 if (x == -1 && PyErr_Occurred()) 6114 return -1; 6115 if (x < 0 && type != 'd' && type != 'i') { 6116 if (PyErr_Warn(PyExc_FutureWarning, 6117 "%u/%o/%x/%X of negative int will return " 6118 "a signed string in Python 2.4 and up") < 0) 6119 return -1; 6120 } 6121 if (prec < 0) 6122 prec = 1; 6123 6124 /* buf = '+'/'-'/'0'/'0x' + '[0-9]'*max(prec,len(x in octal)) 6125 * worst case buf = '0x' + [0-9]*prec, where prec >= 11 6126 */ 6127 if (buflen <= 13 || buflen <= (size_t)2 + (size_t)prec) { 6128 PyErr_SetString(PyExc_OverflowError, 6129 "formatted integer is too long (precision too large?)"); 6130 return -1; 6131 } 6132 6133 if ((flags & F_ALT) && 6134 (type == 'x' || type == 'X')) { 6135 /* When converting under %#x or %#X, there are a number 6136 * of issues that cause pain: 6137 * - when 0 is being converted, the C standard leaves off 6138 * the '0x' or '0X', which is inconsistent with other 6139 * %#x/%#X conversions and inconsistent with Python's 6140 * hex() function 6141 * - there are platforms that violate the standard and 6142 * convert 0 with the '0x' or '0X' 6143 * (Metrowerks, Compaq Tru64) 6144 * - there are platforms that give '0x' when converting 6145 * under %#X, but convert 0 in accordance with the 6146 * standard (OS/2 EMX) 6147 * 6148 * We can achieve the desired consistency by inserting our 6149 * own '0x' or '0X' prefix, and substituting %x/%X in place 6150 * of %#x/%#X. 6151 * 6152 * Note that this is the same approach as used in 6153 * formatint() in stringobject.c 6154 */ 6155 PyOS_snprintf(fmt, sizeof(fmt), "0%c%%.%dl%c", 6156 type, prec, type); 6157 } 6158 else { 6159 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%dl%c", 6160 (flags&F_ALT) ? "#" : "", 6161 prec, type); 6162 } 6163 return usprintf(buf, fmt, x); 6164} 6165 6166static int 6167formatchar(Py_UNICODE *buf, 6168 size_t buflen, 6169 PyObject *v) 6170{ 6171 /* presume that the buffer is at least 2 characters long */ 6172 if (PyUnicode_Check(v)) { 6173 if (PyUnicode_GET_SIZE(v) != 1) 6174 goto onError; 6175 buf[0] = PyUnicode_AS_UNICODE(v)[0]; 6176 } 6177 6178 else if (PyString_Check(v)) { 6179 if (PyString_GET_SIZE(v) != 1) 6180 goto onError; 6181 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0]; 6182 } 6183 6184 else { 6185 /* Integer input truncated to a character */ 6186 long x; 6187 x = PyInt_AsLong(v); 6188 if (x == -1 && PyErr_Occurred()) 6189 goto onError; 6190#ifdef Py_UNICODE_WIDE 6191 if (x < 0 || x > 0x10ffff) { 6192 PyErr_SetString(PyExc_OverflowError, 6193 "%c arg not in range(0x110000) " 6194 "(wide Python build)"); 6195 return -1; 6196 } 6197#else 6198 if (x < 0 || x > 0xffff) { 6199 PyErr_SetString(PyExc_OverflowError, 6200 "%c arg not in range(0x10000) " 6201 "(narrow Python build)"); 6202 return -1; 6203 } 6204#endif 6205 buf[0] = (Py_UNICODE) x; 6206 } 6207 buf[1] = '\0'; 6208 return 1; 6209 6210 onError: 6211 PyErr_SetString(PyExc_TypeError, 6212 "%c requires int or char"); 6213 return -1; 6214} 6215 6216/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...) 6217 6218 FORMATBUFLEN is the length of the buffer in which the floats, ints, & 6219 chars are formatted. XXX This is a magic number. Each formatting 6220 routine does bounds checking to ensure no overflow, but a better 6221 solution may be to malloc a buffer of appropriate size for each 6222 format. For now, the current solution is sufficient. 6223*/ 6224#define FORMATBUFLEN (size_t)120 6225 6226PyObject *PyUnicode_Format(PyObject *format, 6227 PyObject *args) 6228{ 6229 Py_UNICODE *fmt, *res; 6230 int fmtcnt, rescnt, reslen, arglen, argidx; 6231 int args_owned = 0; 6232 PyUnicodeObject *result = NULL; 6233 PyObject *dict = NULL; 6234 PyObject *uformat; 6235 6236 if (format == NULL || args == NULL) { 6237 PyErr_BadInternalCall(); 6238 return NULL; 6239 } 6240 uformat = PyUnicode_FromObject(format); 6241 if (uformat == NULL) 6242 return NULL; 6243 fmt = PyUnicode_AS_UNICODE(uformat); 6244 fmtcnt = PyUnicode_GET_SIZE(uformat); 6245 6246 reslen = rescnt = fmtcnt + 100; 6247 result = _PyUnicode_New(reslen); 6248 if (result == NULL) 6249 goto onError; 6250 res = PyUnicode_AS_UNICODE(result); 6251 6252 if (PyTuple_Check(args)) { 6253 arglen = PyTuple_Size(args); 6254 argidx = 0; 6255 } 6256 else { 6257 arglen = -1; 6258 argidx = -2; 6259 } 6260 if (args->ob_type->tp_as_mapping && !PyTuple_Check(args) && 6261 !PyObject_TypeCheck(args, &PyBaseString_Type)) 6262 dict = args; 6263 6264 while (--fmtcnt >= 0) { 6265 if (*fmt != '%') { 6266 if (--rescnt < 0) { 6267 rescnt = fmtcnt + 100; 6268 reslen += rescnt; 6269 if (_PyUnicode_Resize(&result, reslen) < 0) 6270 return NULL; 6271 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt; 6272 --rescnt; 6273 } 6274 *res++ = *fmt++; 6275 } 6276 else { 6277 /* Got a format specifier */ 6278 int flags = 0; 6279 int width = -1; 6280 int prec = -1; 6281 Py_UNICODE c = '\0'; 6282 Py_UNICODE fill; 6283 PyObject *v = NULL; 6284 PyObject *temp = NULL; 6285 Py_UNICODE *pbuf; 6286 Py_UNICODE sign; 6287 int len; 6288 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */ 6289 6290 fmt++; 6291 if (*fmt == '(') { 6292 Py_UNICODE *keystart; 6293 int keylen; 6294 PyObject *key; 6295 int pcount = 1; 6296 6297 if (dict == NULL) { 6298 PyErr_SetString(PyExc_TypeError, 6299 "format requires a mapping"); 6300 goto onError; 6301 } 6302 ++fmt; 6303 --fmtcnt; 6304 keystart = fmt; 6305 /* Skip over balanced parentheses */ 6306 while (pcount > 0 && --fmtcnt >= 0) { 6307 if (*fmt == ')') 6308 --pcount; 6309 else if (*fmt == '(') 6310 ++pcount; 6311 fmt++; 6312 } 6313 keylen = fmt - keystart - 1; 6314 if (fmtcnt < 0 || pcount > 0) { 6315 PyErr_SetString(PyExc_ValueError, 6316 "incomplete format key"); 6317 goto onError; 6318 } 6319#if 0 6320 /* keys are converted to strings using UTF-8 and 6321 then looked up since Python uses strings to hold 6322 variables names etc. in its namespaces and we 6323 wouldn't want to break common idioms. */ 6324 key = PyUnicode_EncodeUTF8(keystart, 6325 keylen, 6326 NULL); 6327#else 6328 key = PyUnicode_FromUnicode(keystart, keylen); 6329#endif 6330 if (key == NULL) 6331 goto onError; 6332 if (args_owned) { 6333 Py_DECREF(args); 6334 args_owned = 0; 6335 } 6336 args = PyObject_GetItem(dict, key); 6337 Py_DECREF(key); 6338 if (args == NULL) { 6339 goto onError; 6340 } 6341 args_owned = 1; 6342 arglen = -1; 6343 argidx = -2; 6344 } 6345 while (--fmtcnt >= 0) { 6346 switch (c = *fmt++) { 6347 case '-': flags |= F_LJUST; continue; 6348 case '+': flags |= F_SIGN; continue; 6349 case ' ': flags |= F_BLANK; continue; 6350 case '#': flags |= F_ALT; continue; 6351 case '0': flags |= F_ZERO; continue; 6352 } 6353 break; 6354 } 6355 if (c == '*') { 6356 v = getnextarg(args, arglen, &argidx); 6357 if (v == NULL) 6358 goto onError; 6359 if (!PyInt_Check(v)) { 6360 PyErr_SetString(PyExc_TypeError, 6361 "* wants int"); 6362 goto onError; 6363 } 6364 width = PyInt_AsLong(v); 6365 if (width < 0) { 6366 flags |= F_LJUST; 6367 width = -width; 6368 } 6369 if (--fmtcnt >= 0) 6370 c = *fmt++; 6371 } 6372 else if (c >= '0' && c <= '9') { 6373 width = c - '0'; 6374 while (--fmtcnt >= 0) { 6375 c = *fmt++; 6376 if (c < '0' || c > '9') 6377 break; 6378 if ((width*10) / 10 != width) { 6379 PyErr_SetString(PyExc_ValueError, 6380 "width too big"); 6381 goto onError; 6382 } 6383 width = width*10 + (c - '0'); 6384 } 6385 } 6386 if (c == '.') { 6387 prec = 0; 6388 if (--fmtcnt >= 0) 6389 c = *fmt++; 6390 if (c == '*') { 6391 v = getnextarg(args, arglen, &argidx); 6392 if (v == NULL) 6393 goto onError; 6394 if (!PyInt_Check(v)) { 6395 PyErr_SetString(PyExc_TypeError, 6396 "* wants int"); 6397 goto onError; 6398 } 6399 prec = PyInt_AsLong(v); 6400 if (prec < 0) 6401 prec = 0; 6402 if (--fmtcnt >= 0) 6403 c = *fmt++; 6404 } 6405 else if (c >= '0' && c <= '9') { 6406 prec = c - '0'; 6407 while (--fmtcnt >= 0) { 6408 c = Py_CHARMASK(*fmt++); 6409 if (c < '0' || c > '9') 6410 break; 6411 if ((prec*10) / 10 != prec) { 6412 PyErr_SetString(PyExc_ValueError, 6413 "prec too big"); 6414 goto onError; 6415 } 6416 prec = prec*10 + (c - '0'); 6417 } 6418 } 6419 } /* prec */ 6420 if (fmtcnt >= 0) { 6421 if (c == 'h' || c == 'l' || c == 'L') { 6422 if (--fmtcnt >= 0) 6423 c = *fmt++; 6424 } 6425 } 6426 if (fmtcnt < 0) { 6427 PyErr_SetString(PyExc_ValueError, 6428 "incomplete format"); 6429 goto onError; 6430 } 6431 if (c != '%') { 6432 v = getnextarg(args, arglen, &argidx); 6433 if (v == NULL) 6434 goto onError; 6435 } 6436 sign = 0; 6437 fill = ' '; 6438 switch (c) { 6439 6440 case '%': 6441 pbuf = formatbuf; 6442 /* presume that buffer length is at least 1 */ 6443 pbuf[0] = '%'; 6444 len = 1; 6445 break; 6446 6447 case 's': 6448 case 'r': 6449 if (PyUnicode_Check(v) && c == 's') { 6450 temp = v; 6451 Py_INCREF(temp); 6452 } 6453 else { 6454 PyObject *unicode; 6455 if (c == 's') 6456 temp = PyObject_Str(v); 6457 else 6458 temp = PyObject_Repr(v); 6459 if (temp == NULL) 6460 goto onError; 6461 if (!PyString_Check(temp)) { 6462 /* XXX Note: this should never happen, since 6463 PyObject_Repr() and PyObject_Str() assure 6464 this */ 6465 Py_DECREF(temp); 6466 PyErr_SetString(PyExc_TypeError, 6467 "%s argument has non-string str()"); 6468 goto onError; 6469 } 6470 unicode = PyUnicode_Decode(PyString_AS_STRING(temp), 6471 PyString_GET_SIZE(temp), 6472 NULL, 6473 "strict"); 6474 Py_DECREF(temp); 6475 temp = unicode; 6476 if (temp == NULL) 6477 goto onError; 6478 } 6479 pbuf = PyUnicode_AS_UNICODE(temp); 6480 len = PyUnicode_GET_SIZE(temp); 6481 if (prec >= 0 && len > prec) 6482 len = prec; 6483 break; 6484 6485 case 'i': 6486 case 'd': 6487 case 'u': 6488 case 'o': 6489 case 'x': 6490 case 'X': 6491 if (c == 'i') 6492 c = 'd'; 6493 if (PyLong_Check(v)) { 6494 temp = formatlong(v, flags, prec, c); 6495 if (!temp) 6496 goto onError; 6497 pbuf = PyUnicode_AS_UNICODE(temp); 6498 len = PyUnicode_GET_SIZE(temp); 6499 /* unbounded ints can always produce 6500 a sign character! */ 6501 sign = 1; 6502 } 6503 else { 6504 pbuf = formatbuf; 6505 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), 6506 flags, prec, c, v); 6507 if (len < 0) 6508 goto onError; 6509 /* only d conversion is signed */ 6510 sign = c == 'd'; 6511 } 6512 if (flags & F_ZERO) 6513 fill = '0'; 6514 break; 6515 6516 case 'e': 6517 case 'E': 6518 case 'f': 6519 case 'g': 6520 case 'G': 6521 pbuf = formatbuf; 6522 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), 6523 flags, prec, c, v); 6524 if (len < 0) 6525 goto onError; 6526 sign = 1; 6527 if (flags & F_ZERO) 6528 fill = '0'; 6529 break; 6530 6531 case 'c': 6532 pbuf = formatbuf; 6533 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v); 6534 if (len < 0) 6535 goto onError; 6536 break; 6537 6538 default: 6539 PyErr_Format(PyExc_ValueError, 6540 "unsupported format character '%c' (0x%x) " 6541 "at index %i", 6542 (31<=c && c<=126) ? (char)c : '?', 6543 (int)c, 6544 (int)(fmt -1 - PyUnicode_AS_UNICODE(uformat))); 6545 goto onError; 6546 } 6547 if (sign) { 6548 if (*pbuf == '-' || *pbuf == '+') { 6549 sign = *pbuf++; 6550 len--; 6551 } 6552 else if (flags & F_SIGN) 6553 sign = '+'; 6554 else if (flags & F_BLANK) 6555 sign = ' '; 6556 else 6557 sign = 0; 6558 } 6559 if (width < len) 6560 width = len; 6561 if (rescnt - (sign != 0) < width) { 6562 reslen -= rescnt; 6563 rescnt = width + fmtcnt + 100; 6564 reslen += rescnt; 6565 if (reslen < 0) { 6566 Py_DECREF(result); 6567 return PyErr_NoMemory(); 6568 } 6569 if (_PyUnicode_Resize(&result, reslen) < 0) 6570 return NULL; 6571 res = PyUnicode_AS_UNICODE(result) 6572 + reslen - rescnt; 6573 } 6574 if (sign) { 6575 if (fill != ' ') 6576 *res++ = sign; 6577 rescnt--; 6578 if (width > len) 6579 width--; 6580 } 6581 if ((flags & F_ALT) && (c == 'x' || c == 'X')) { 6582 assert(pbuf[0] == '0'); 6583 assert(pbuf[1] == c); 6584 if (fill != ' ') { 6585 *res++ = *pbuf++; 6586 *res++ = *pbuf++; 6587 } 6588 rescnt -= 2; 6589 width -= 2; 6590 if (width < 0) 6591 width = 0; 6592 len -= 2; 6593 } 6594 if (width > len && !(flags & F_LJUST)) { 6595 do { 6596 --rescnt; 6597 *res++ = fill; 6598 } while (--width > len); 6599 } 6600 if (fill == ' ') { 6601 if (sign) 6602 *res++ = sign; 6603 if ((flags & F_ALT) && (c == 'x' || c == 'X')) { 6604 assert(pbuf[0] == '0'); 6605 assert(pbuf[1] == c); 6606 *res++ = *pbuf++; 6607 *res++ = *pbuf++; 6608 } 6609 } 6610 Py_UNICODE_COPY(res, pbuf, len); 6611 res += len; 6612 rescnt -= len; 6613 while (--width >= len) { 6614 --rescnt; 6615 *res++ = ' '; 6616 } 6617 if (dict && (argidx < arglen) && c != '%') { 6618 PyErr_SetString(PyExc_TypeError, 6619 "not all arguments converted during string formatting"); 6620 goto onError; 6621 } 6622 Py_XDECREF(temp); 6623 } /* '%' */ 6624 } /* until end */ 6625 if (argidx < arglen && !dict) { 6626 PyErr_SetString(PyExc_TypeError, 6627 "not all arguments converted during string formatting"); 6628 goto onError; 6629 } 6630 6631 if (args_owned) { 6632 Py_DECREF(args); 6633 } 6634 Py_DECREF(uformat); 6635 if (_PyUnicode_Resize(&result, reslen - rescnt)) 6636 goto onError; 6637 return (PyObject *)result; 6638 6639 onError: 6640 Py_XDECREF(result); 6641 Py_DECREF(uformat); 6642 if (args_owned) { 6643 Py_DECREF(args); 6644 } 6645 return NULL; 6646} 6647 6648static PyBufferProcs unicode_as_buffer = { 6649 (getreadbufferproc) unicode_buffer_getreadbuf, 6650 (getwritebufferproc) unicode_buffer_getwritebuf, 6651 (getsegcountproc) unicode_buffer_getsegcount, 6652 (getcharbufferproc) unicode_buffer_getcharbuf, 6653}; 6654 6655static PyObject * 6656unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds); 6657 6658static PyObject * 6659unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds) 6660{ 6661 PyObject *x = NULL; 6662 static char *kwlist[] = {"string", "encoding", "errors", 0}; 6663 char *encoding = NULL; 6664 char *errors = NULL; 6665 6666 if (type != &PyUnicode_Type) 6667 return unicode_subtype_new(type, args, kwds); 6668 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode", 6669 kwlist, &x, &encoding, &errors)) 6670 return NULL; 6671 if (x == NULL) 6672 return (PyObject *)_PyUnicode_New(0); 6673 if (encoding == NULL && errors == NULL) 6674 return PyObject_Unicode(x); 6675 else 6676 return PyUnicode_FromEncodedObject(x, encoding, errors); 6677} 6678 6679static PyObject * 6680unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds) 6681{ 6682 PyUnicodeObject *tmp, *pnew; 6683 int n; 6684 6685 assert(PyType_IsSubtype(type, &PyUnicode_Type)); 6686 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds); 6687 if (tmp == NULL) 6688 return NULL; 6689 assert(PyUnicode_Check(tmp)); 6690 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length); 6691 if (pnew == NULL) 6692 return NULL; 6693 pnew->str = PyMem_NEW(Py_UNICODE, n+1); 6694 if (pnew->str == NULL) { 6695 _Py_ForgetReference((PyObject *)pnew); 6696 PyObject_Del(pnew); 6697 return PyErr_NoMemory(); 6698 } 6699 Py_UNICODE_COPY(pnew->str, tmp->str, n+1); 6700 pnew->length = n; 6701 pnew->hash = tmp->hash; 6702 Py_DECREF(tmp); 6703 return (PyObject *)pnew; 6704} 6705 6706PyDoc_STRVAR(unicode_doc, 6707"unicode(string [, encoding[, errors]]) -> object\n\ 6708\n\ 6709Create a new Unicode object from the given encoded string.\n\ 6710encoding defaults to the current default string encoding.\n\ 6711errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'."); 6712 6713PyTypeObject PyUnicode_Type = { 6714 PyObject_HEAD_INIT(&PyType_Type) 6715 0, /* ob_size */ 6716 "unicode", /* tp_name */ 6717 sizeof(PyUnicodeObject), /* tp_size */ 6718 0, /* tp_itemsize */ 6719 /* Slots */ 6720 (destructor)unicode_dealloc, /* tp_dealloc */ 6721 0, /* tp_print */ 6722 0, /* tp_getattr */ 6723 0, /* tp_setattr */ 6724 (cmpfunc) unicode_compare, /* tp_compare */ 6725 (reprfunc) unicode_repr, /* tp_repr */ 6726 &unicode_as_number, /* tp_as_number */ 6727 &unicode_as_sequence, /* tp_as_sequence */ 6728 &unicode_as_mapping, /* tp_as_mapping */ 6729 (hashfunc) unicode_hash, /* tp_hash*/ 6730 0, /* tp_call*/ 6731 (reprfunc) unicode_str, /* tp_str */ 6732 PyObject_GenericGetAttr, /* tp_getattro */ 6733 0, /* tp_setattro */ 6734 &unicode_as_buffer, /* tp_as_buffer */ 6735 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_CHECKTYPES | 6736 Py_TPFLAGS_BASETYPE, /* tp_flags */ 6737 unicode_doc, /* tp_doc */ 6738 0, /* tp_traverse */ 6739 0, /* tp_clear */ 6740 0, /* tp_richcompare */ 6741 0, /* tp_weaklistoffset */ 6742 0, /* tp_iter */ 6743 0, /* tp_iternext */ 6744 unicode_methods, /* tp_methods */ 6745 0, /* tp_members */ 6746 0, /* tp_getset */ 6747 &PyBaseString_Type, /* tp_base */ 6748 0, /* tp_dict */ 6749 0, /* tp_descr_get */ 6750 0, /* tp_descr_set */ 6751 0, /* tp_dictoffset */ 6752 0, /* tp_init */ 6753 0, /* tp_alloc */ 6754 unicode_new, /* tp_new */ 6755 PyObject_Del, /* tp_free */ 6756}; 6757 6758/* Initialize the Unicode implementation */ 6759 6760void _PyUnicode_Init(void) 6761{ 6762 int i; 6763 6764 /* Init the implementation */ 6765 unicode_freelist = NULL; 6766 unicode_freelist_size = 0; 6767 unicode_empty = _PyUnicode_New(0); 6768 strcpy(unicode_default_encoding, "ascii"); 6769 for (i = 0; i < 256; i++) 6770 unicode_latin1[i] = NULL; 6771 if (PyType_Ready(&PyUnicode_Type) < 0) 6772 Py_FatalError("Can't initialize 'unicode'"); 6773} 6774 6775/* Finalize the Unicode implementation */ 6776 6777void 6778_PyUnicode_Fini(void) 6779{ 6780 PyUnicodeObject *u; 6781 int i; 6782 6783 Py_XDECREF(unicode_empty); 6784 unicode_empty = NULL; 6785 6786 for (i = 0; i < 256; i++) { 6787 if (unicode_latin1[i]) { 6788 Py_DECREF(unicode_latin1[i]); 6789 unicode_latin1[i] = NULL; 6790 } 6791 } 6792 6793 for (u = unicode_freelist; u != NULL;) { 6794 PyUnicodeObject *v = u; 6795 u = *(PyUnicodeObject **)u; 6796 if (v->str) 6797 PyMem_DEL(v->str); 6798 Py_XDECREF(v->defenc); 6799 PyObject_Del(v); 6800 } 6801 unicode_freelist = NULL; 6802 unicode_freelist_size = 0; 6803} 6804 6805/* 6806Local variables: 6807c-basic-offset: 4 6808indent-tabs-mode: nil 6809End: 6810*/ 6811