unicodeobject.c revision f6b56aecad067f730d7fc6ae76cca94a26c3c896
1/* 2 3Unicode implementation based on original code by Fredrik Lundh, 4modified by Marc-Andre Lemburg <mal@lemburg.com> according to the 5Unicode Integration Proposal (see file Misc/unicode.txt). 6 7Copyright (c) Corporation for National Research Initiatives. 8 9-------------------------------------------------------------------- 10The original string type implementation is: 11 12 Copyright (c) 1999 by Secret Labs AB 13 Copyright (c) 1999 by Fredrik Lundh 14 15By obtaining, using, and/or copying this software and/or its 16associated documentation, you agree that you have read, understood, 17and will comply with the following terms and conditions: 18 19Permission to use, copy, modify, and distribute this software and its 20associated documentation for any purpose and without fee is hereby 21granted, provided that the above copyright notice appears in all 22copies, and that both that copyright notice and this permission notice 23appear in supporting documentation, and that the name of Secret Labs 24AB or the author not be used in advertising or publicity pertaining to 25distribution of the software without specific, written prior 26permission. 27 28SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO 29THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND 30FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR 31ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 32WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 33ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT 34OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 35-------------------------------------------------------------------- 36 37*/ 38 39#include "Python.h" 40 41#include "unicodeobject.h" 42#include "ucnhash.h" 43 44#ifdef MS_WINDOWS 45#include <windows.h> 46#endif 47 48/* Limit for the Unicode object free list */ 49 50#define MAX_UNICODE_FREELIST_SIZE 1024 51 52/* Limit for the Unicode object free list stay alive optimization. 53 54 The implementation will keep allocated Unicode memory intact for 55 all objects on the free list having a size less than this 56 limit. This reduces malloc() overhead for small Unicode objects. 57 58 At worst this will result in MAX_UNICODE_FREELIST_SIZE * 59 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT + 60 malloc()-overhead) bytes of unused garbage. 61 62 Setting the limit to 0 effectively turns the feature off. 63 64 Note: This is an experimental feature ! If you get core dumps when 65 using Unicode objects, turn this feature off. 66 67*/ 68 69#define KEEPALIVE_SIZE_LIMIT 9 70 71/* Endianness switches; defaults to little endian */ 72 73#ifdef WORDS_BIGENDIAN 74# define BYTEORDER_IS_BIG_ENDIAN 75#else 76# define BYTEORDER_IS_LITTLE_ENDIAN 77#endif 78 79/* --- Globals ------------------------------------------------------------ 80 81 The globals are initialized by the _PyUnicode_Init() API and should 82 not be used before calling that API. 83 84*/ 85 86/* Free list for Unicode objects */ 87static PyUnicodeObject *unicode_freelist; 88static int unicode_freelist_size; 89 90/* The empty Unicode object is shared to improve performance. */ 91static PyUnicodeObject *unicode_empty; 92 93/* Single character Unicode strings in the Latin-1 range are being 94 shared as well. */ 95static PyUnicodeObject *unicode_latin1[256]; 96 97/* Default encoding to use and assume when NULL is passed as encoding 98 parameter; it is initialized by _PyUnicode_Init(). 99 100 Always use the PyUnicode_SetDefaultEncoding() and 101 PyUnicode_GetDefaultEncoding() APIs to access this global. 102 103*/ 104static char unicode_default_encoding[100]; 105 106Py_UNICODE 107PyUnicode_GetMax(void) 108{ 109#ifdef Py_UNICODE_WIDE 110 return 0x10FFFF; 111#else 112 /* This is actually an illegal character, so it should 113 not be passed to unichr. */ 114 return 0xFFFF; 115#endif 116} 117 118/* --- Unicode Object ----------------------------------------------------- */ 119 120static 121int unicode_resize(register PyUnicodeObject *unicode, 122 int length) 123{ 124 void *oldstr; 125 126 /* Shortcut if there's nothing much to do. */ 127 if (unicode->length == length) 128 goto reset; 129 130 /* Resizing shared object (unicode_empty or single character 131 objects) in-place is not allowed. Use PyUnicode_Resize() 132 instead ! */ 133 if (unicode == unicode_empty || 134 (unicode->length == 1 && 135 unicode->str[0] < 256 && 136 unicode_latin1[unicode->str[0]] == unicode)) { 137 PyErr_SetString(PyExc_SystemError, 138 "can't resize shared unicode objects"); 139 return -1; 140 } 141 142 /* We allocate one more byte to make sure the string is 143 Ux0000 terminated -- XXX is this needed ? */ 144 oldstr = unicode->str; 145 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1); 146 if (!unicode->str) { 147 unicode->str = oldstr; 148 PyErr_NoMemory(); 149 return -1; 150 } 151 unicode->str[length] = 0; 152 unicode->length = length; 153 154 reset: 155 /* Reset the object caches */ 156 if (unicode->defenc) { 157 Py_DECREF(unicode->defenc); 158 unicode->defenc = NULL; 159 } 160 unicode->hash = -1; 161 162 return 0; 163} 164 165/* We allocate one more byte to make sure the string is 166 Ux0000 terminated -- XXX is this needed ? 167 168 XXX This allocator could further be enhanced by assuring that the 169 free list never reduces its size below 1. 170 171*/ 172 173static 174PyUnicodeObject *_PyUnicode_New(int length) 175{ 176 register PyUnicodeObject *unicode; 177 178 /* Optimization for empty strings */ 179 if (length == 0 && unicode_empty != NULL) { 180 Py_INCREF(unicode_empty); 181 return unicode_empty; 182 } 183 184 /* Unicode freelist & memory allocation */ 185 if (unicode_freelist) { 186 unicode = unicode_freelist; 187 unicode_freelist = *(PyUnicodeObject **)unicode; 188 unicode_freelist_size--; 189 if (unicode->str) { 190 /* Keep-Alive optimization: we only upsize the buffer, 191 never downsize it. */ 192 if ((unicode->length < length) && 193 unicode_resize(unicode, length)) { 194 PyMem_DEL(unicode->str); 195 goto onError; 196 } 197 } 198 else { 199 unicode->str = PyMem_NEW(Py_UNICODE, length + 1); 200 } 201 PyObject_INIT(unicode, &PyUnicode_Type); 202 } 203 else { 204 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type); 205 if (unicode == NULL) 206 return NULL; 207 unicode->str = PyMem_NEW(Py_UNICODE, length + 1); 208 } 209 210 if (!unicode->str) { 211 PyErr_NoMemory(); 212 goto onError; 213 } 214 unicode->str[length] = 0; 215 unicode->length = length; 216 unicode->hash = -1; 217 unicode->defenc = NULL; 218 return unicode; 219 220 onError: 221 _Py_ForgetReference((PyObject *)unicode); 222 PyObject_Del(unicode); 223 return NULL; 224} 225 226static 227void unicode_dealloc(register PyUnicodeObject *unicode) 228{ 229 if (PyUnicode_CheckExact(unicode) && 230 unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) { 231 /* Keep-Alive optimization */ 232 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) { 233 PyMem_DEL(unicode->str); 234 unicode->str = NULL; 235 unicode->length = 0; 236 } 237 if (unicode->defenc) { 238 Py_DECREF(unicode->defenc); 239 unicode->defenc = NULL; 240 } 241 /* Add to free list */ 242 *(PyUnicodeObject **)unicode = unicode_freelist; 243 unicode_freelist = unicode; 244 unicode_freelist_size++; 245 } 246 else { 247 PyMem_DEL(unicode->str); 248 Py_XDECREF(unicode->defenc); 249 unicode->ob_type->tp_free((PyObject *)unicode); 250 } 251} 252 253int PyUnicode_Resize(PyObject **unicode, 254 int length) 255{ 256 register PyUnicodeObject *v; 257 258 /* Argument checks */ 259 if (unicode == NULL) { 260 PyErr_BadInternalCall(); 261 return -1; 262 } 263 v = (PyUnicodeObject *)*unicode; 264 if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1 || length < 0) { 265 PyErr_BadInternalCall(); 266 return -1; 267 } 268 269 /* Resizing unicode_empty and single character objects is not 270 possible since these are being shared. We simply return a fresh 271 copy with the same Unicode content. */ 272 if (v->length != length && 273 (v == unicode_empty || v->length == 1)) { 274 PyUnicodeObject *w = _PyUnicode_New(length); 275 if (w == NULL) 276 return -1; 277 Py_UNICODE_COPY(w->str, v->str, 278 length < v->length ? length : v->length); 279 *unicode = (PyObject *)w; 280 return 0; 281 } 282 283 /* Note that we don't have to modify *unicode for unshared Unicode 284 objects, since we can modify them in-place. */ 285 return unicode_resize(v, length); 286} 287 288/* Internal API for use in unicodeobject.c only ! */ 289#define _PyUnicode_Resize(unicodevar, length) \ 290 PyUnicode_Resize(((PyObject **)(unicodevar)), length) 291 292PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u, 293 int size) 294{ 295 PyUnicodeObject *unicode; 296 297 /* If the Unicode data is known at construction time, we can apply 298 some optimizations which share commonly used objects. */ 299 if (u != NULL) { 300 301 /* Optimization for empty strings */ 302 if (size == 0 && unicode_empty != NULL) { 303 Py_INCREF(unicode_empty); 304 return (PyObject *)unicode_empty; 305 } 306 307 /* Single character Unicode objects in the Latin-1 range are 308 shared when using this constructor */ 309 if (size == 1 && *u < 256) { 310 unicode = unicode_latin1[*u]; 311 if (!unicode) { 312 unicode = _PyUnicode_New(1); 313 if (!unicode) 314 return NULL; 315 unicode->str[0] = *u; 316 unicode_latin1[*u] = unicode; 317 } 318 Py_INCREF(unicode); 319 return (PyObject *)unicode; 320 } 321 } 322 323 unicode = _PyUnicode_New(size); 324 if (!unicode) 325 return NULL; 326 327 /* Copy the Unicode data into the new object */ 328 if (u != NULL) 329 Py_UNICODE_COPY(unicode->str, u, size); 330 331 return (PyObject *)unicode; 332} 333 334#ifdef HAVE_WCHAR_H 335 336PyObject *PyUnicode_FromWideChar(register const wchar_t *w, 337 int size) 338{ 339 PyUnicodeObject *unicode; 340 341 if (w == NULL) { 342 PyErr_BadInternalCall(); 343 return NULL; 344 } 345 346 unicode = _PyUnicode_New(size); 347 if (!unicode) 348 return NULL; 349 350 /* Copy the wchar_t data into the new object */ 351#ifdef HAVE_USABLE_WCHAR_T 352 memcpy(unicode->str, w, size * sizeof(wchar_t)); 353#else 354 { 355 register Py_UNICODE *u; 356 register int i; 357 u = PyUnicode_AS_UNICODE(unicode); 358 for (i = size; i >= 0; i--) 359 *u++ = *w++; 360 } 361#endif 362 363 return (PyObject *)unicode; 364} 365 366int PyUnicode_AsWideChar(PyUnicodeObject *unicode, 367 register wchar_t *w, 368 int size) 369{ 370 if (unicode == NULL) { 371 PyErr_BadInternalCall(); 372 return -1; 373 } 374 if (size > PyUnicode_GET_SIZE(unicode)) 375 size = PyUnicode_GET_SIZE(unicode); 376#ifdef HAVE_USABLE_WCHAR_T 377 memcpy(w, unicode->str, size * sizeof(wchar_t)); 378#else 379 { 380 register Py_UNICODE *u; 381 register int i; 382 u = PyUnicode_AS_UNICODE(unicode); 383 for (i = size; i >= 0; i--) 384 *w++ = *u++; 385 } 386#endif 387 388 return size; 389} 390 391#endif 392 393PyObject *PyUnicode_FromOrdinal(int ordinal) 394{ 395 Py_UNICODE s[2]; 396 397#ifdef Py_UNICODE_WIDE 398 if (ordinal < 0 || ordinal > 0x10ffff) { 399 PyErr_SetString(PyExc_ValueError, 400 "unichr() arg not in range(0x110000) " 401 "(wide Python build)"); 402 return NULL; 403 } 404#else 405 if (ordinal < 0 || ordinal > 0xffff) { 406 PyErr_SetString(PyExc_ValueError, 407 "unichr() arg not in range(0x10000) " 408 "(narrow Python build)"); 409 return NULL; 410 } 411#endif 412 413 if (ordinal <= 0xffff) { 414 /* UCS-2 character */ 415 s[0] = (Py_UNICODE) ordinal; 416 return PyUnicode_FromUnicode(s, 1); 417 } 418 else { 419#ifndef Py_UNICODE_WIDE 420 /* UCS-4 character. store as two surrogate characters */ 421 ordinal -= 0x10000L; 422 s[0] = 0xD800 + (Py_UNICODE) (ordinal >> 10); 423 s[1] = 0xDC00 + (Py_UNICODE) (ordinal & 0x03FF); 424 return PyUnicode_FromUnicode(s, 2); 425#else 426 s[0] = (Py_UNICODE)ordinal; 427 return PyUnicode_FromUnicode(s, 1); 428#endif 429 } 430} 431 432PyObject *PyUnicode_FromObject(register PyObject *obj) 433{ 434 /* XXX Perhaps we should make this API an alias of 435 PyObject_Unicode() instead ?! */ 436 if (PyUnicode_CheckExact(obj)) { 437 Py_INCREF(obj); 438 return obj; 439 } 440 if (PyUnicode_Check(obj)) { 441 /* For a Unicode subtype that's not a Unicode object, 442 return a true Unicode object with the same data. */ 443 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj), 444 PyUnicode_GET_SIZE(obj)); 445 } 446 return PyUnicode_FromEncodedObject(obj, NULL, "strict"); 447} 448 449PyObject *PyUnicode_FromEncodedObject(register PyObject *obj, 450 const char *encoding, 451 const char *errors) 452{ 453 const char *s = NULL; 454 int len; 455 PyObject *v; 456 457 if (obj == NULL) { 458 PyErr_BadInternalCall(); 459 return NULL; 460 } 461 462#if 0 463 /* For b/w compatibility we also accept Unicode objects provided 464 that no encodings is given and then redirect to 465 PyObject_Unicode() which then applies the additional logic for 466 Unicode subclasses. 467 468 NOTE: This API should really only be used for object which 469 represent *encoded* Unicode ! 470 471 */ 472 if (PyUnicode_Check(obj)) { 473 if (encoding) { 474 PyErr_SetString(PyExc_TypeError, 475 "decoding Unicode is not supported"); 476 return NULL; 477 } 478 return PyObject_Unicode(obj); 479 } 480#else 481 if (PyUnicode_Check(obj)) { 482 PyErr_SetString(PyExc_TypeError, 483 "decoding Unicode is not supported"); 484 return NULL; 485 } 486#endif 487 488 /* Coerce object */ 489 if (PyString_Check(obj)) { 490 s = PyString_AS_STRING(obj); 491 len = PyString_GET_SIZE(obj); 492 } 493 else if (PyObject_AsCharBuffer(obj, &s, &len)) { 494 /* Overwrite the error message with something more useful in 495 case of a TypeError. */ 496 if (PyErr_ExceptionMatches(PyExc_TypeError)) 497 PyErr_Format(PyExc_TypeError, 498 "coercing to Unicode: need string or buffer, " 499 "%.80s found", 500 obj->ob_type->tp_name); 501 goto onError; 502 } 503 504 /* Convert to Unicode */ 505 if (len == 0) { 506 Py_INCREF(unicode_empty); 507 v = (PyObject *)unicode_empty; 508 } 509 else 510 v = PyUnicode_Decode(s, len, encoding, errors); 511 512 return v; 513 514 onError: 515 return NULL; 516} 517 518PyObject *PyUnicode_Decode(const char *s, 519 int size, 520 const char *encoding, 521 const char *errors) 522{ 523 PyObject *buffer = NULL, *unicode; 524 525 if (encoding == NULL) 526 encoding = PyUnicode_GetDefaultEncoding(); 527 528 /* Shortcuts for common default encodings */ 529 if (strcmp(encoding, "utf-8") == 0) 530 return PyUnicode_DecodeUTF8(s, size, errors); 531 else if (strcmp(encoding, "latin-1") == 0) 532 return PyUnicode_DecodeLatin1(s, size, errors); 533 else if (strcmp(encoding, "ascii") == 0) 534 return PyUnicode_DecodeASCII(s, size, errors); 535 536 /* Decode via the codec registry */ 537 buffer = PyBuffer_FromMemory((void *)s, size); 538 if (buffer == NULL) 539 goto onError; 540 unicode = PyCodec_Decode(buffer, encoding, errors); 541 if (unicode == NULL) 542 goto onError; 543 if (!PyUnicode_Check(unicode)) { 544 PyErr_Format(PyExc_TypeError, 545 "decoder did not return an unicode object (type=%.400s)", 546 unicode->ob_type->tp_name); 547 Py_DECREF(unicode); 548 goto onError; 549 } 550 Py_DECREF(buffer); 551 return unicode; 552 553 onError: 554 Py_XDECREF(buffer); 555 return NULL; 556} 557 558PyObject *PyUnicode_Encode(const Py_UNICODE *s, 559 int size, 560 const char *encoding, 561 const char *errors) 562{ 563 PyObject *v, *unicode; 564 565 unicode = PyUnicode_FromUnicode(s, size); 566 if (unicode == NULL) 567 return NULL; 568 v = PyUnicode_AsEncodedString(unicode, encoding, errors); 569 Py_DECREF(unicode); 570 return v; 571} 572 573PyObject *PyUnicode_AsEncodedString(PyObject *unicode, 574 const char *encoding, 575 const char *errors) 576{ 577 PyObject *v; 578 579 if (!PyUnicode_Check(unicode)) { 580 PyErr_BadArgument(); 581 goto onError; 582 } 583 584 if (encoding == NULL) 585 encoding = PyUnicode_GetDefaultEncoding(); 586 587 /* Shortcuts for common default encodings */ 588 if (errors == NULL) { 589 if (strcmp(encoding, "utf-8") == 0) 590 return PyUnicode_AsUTF8String(unicode); 591 else if (strcmp(encoding, "latin-1") == 0) 592 return PyUnicode_AsLatin1String(unicode); 593 else if (strcmp(encoding, "ascii") == 0) 594 return PyUnicode_AsASCIIString(unicode); 595 } 596 597 /* Encode via the codec registry */ 598 v = PyCodec_Encode(unicode, encoding, errors); 599 if (v == NULL) 600 goto onError; 601 /* XXX Should we really enforce this ? */ 602 if (!PyString_Check(v)) { 603 PyErr_Format(PyExc_TypeError, 604 "encoder did not return a string object (type=%.400s)", 605 v->ob_type->tp_name); 606 Py_DECREF(v); 607 goto onError; 608 } 609 return v; 610 611 onError: 612 return NULL; 613} 614 615PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode, 616 const char *errors) 617{ 618 PyObject *v = ((PyUnicodeObject *)unicode)->defenc; 619 620 if (v) 621 return v; 622 v = PyUnicode_AsEncodedString(unicode, NULL, errors); 623 if (v && errors == NULL) 624 ((PyUnicodeObject *)unicode)->defenc = v; 625 return v; 626} 627 628Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode) 629{ 630 if (!PyUnicode_Check(unicode)) { 631 PyErr_BadArgument(); 632 goto onError; 633 } 634 return PyUnicode_AS_UNICODE(unicode); 635 636 onError: 637 return NULL; 638} 639 640int PyUnicode_GetSize(PyObject *unicode) 641{ 642 if (!PyUnicode_Check(unicode)) { 643 PyErr_BadArgument(); 644 goto onError; 645 } 646 return PyUnicode_GET_SIZE(unicode); 647 648 onError: 649 return -1; 650} 651 652const char *PyUnicode_GetDefaultEncoding(void) 653{ 654 return unicode_default_encoding; 655} 656 657int PyUnicode_SetDefaultEncoding(const char *encoding) 658{ 659 PyObject *v; 660 661 /* Make sure the encoding is valid. As side effect, this also 662 loads the encoding into the codec registry cache. */ 663 v = _PyCodec_Lookup(encoding); 664 if (v == NULL) 665 goto onError; 666 Py_DECREF(v); 667 strncpy(unicode_default_encoding, 668 encoding, 669 sizeof(unicode_default_encoding)); 670 return 0; 671 672 onError: 673 return -1; 674} 675 676/* error handling callback helper: 677 build arguments, call the callback and check the arguments, 678 if no exception occured, copy the replacement to the output 679 and adjust various state variables. 680 return 0 on success, -1 on error 681*/ 682 683static 684int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler, 685 const char *encoding, const char *reason, 686 const char *input, int insize, int *startinpos, int *endinpos, PyObject **exceptionObject, const char **inptr, 687 PyObject **output, int *outpos, Py_UNICODE **outptr) 688{ 689 static char *argparse = "O!i;decoding error handler must return (unicode, int) tuple"; 690 691 PyObject *restuple = NULL; 692 PyObject *repunicode = NULL; 693 int outsize = PyUnicode_GET_SIZE(*output); 694 int requiredsize; 695 int newpos; 696 Py_UNICODE *repptr; 697 int repsize; 698 int res = -1; 699 700 if (*errorHandler == NULL) { 701 *errorHandler = PyCodec_LookupError(errors); 702 if (*errorHandler == NULL) 703 goto onError; 704 } 705 706 if (*exceptionObject == NULL) { 707 *exceptionObject = PyUnicodeDecodeError_Create( 708 encoding, input, insize, *startinpos, *endinpos, reason); 709 if (*exceptionObject == NULL) 710 goto onError; 711 } 712 else { 713 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos)) 714 goto onError; 715 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos)) 716 goto onError; 717 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason)) 718 goto onError; 719 } 720 721 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL); 722 if (restuple == NULL) 723 goto onError; 724 if (!PyTuple_Check(restuple)) { 725 PyErr_Format(PyExc_TypeError, &argparse[4]); 726 goto onError; 727 } 728 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos)) 729 goto onError; 730 if (newpos<0) 731 newpos = insize+newpos; 732 if (newpos<0 || newpos>insize) { 733 PyErr_Format(PyExc_IndexError, "position %d from error handler out of bounds", newpos); 734 goto onError; 735 } 736 737 /* need more space? (at least enough for what we 738 have+the replacement+the rest of the string (starting 739 at the new input position), so we won't have to check space 740 when there are no errors in the rest of the string) */ 741 repptr = PyUnicode_AS_UNICODE(repunicode); 742 repsize = PyUnicode_GET_SIZE(repunicode); 743 requiredsize = *outpos + repsize + insize-newpos; 744 if (requiredsize > outsize) { 745 if (requiredsize<2*outsize) 746 requiredsize = 2*outsize; 747 if (PyUnicode_Resize(output, requiredsize)) 748 goto onError; 749 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos; 750 } 751 *endinpos = newpos; 752 *inptr = input + newpos; 753 Py_UNICODE_COPY(*outptr, repptr, repsize); 754 *outptr += repsize; 755 *outpos += repsize; 756 /* we made it! */ 757 res = 0; 758 759 onError: 760 Py_XDECREF(restuple); 761 return res; 762} 763 764/* --- UTF-7 Codec -------------------------------------------------------- */ 765 766/* see RFC2152 for details */ 767 768static 769char utf7_special[128] = { 770 /* indicate whether a UTF-7 character is special i.e. cannot be directly 771 encoded: 772 0 - not special 773 1 - special 774 2 - whitespace (optional) 775 3 - RFC2152 Set O (optional) */ 776 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1, 777 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 778 2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1, 779 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0, 780 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 781 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3, 782 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 783 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1, 784 785}; 786 787#define SPECIAL(c, encodeO, encodeWS) \ 788 (((c)>127 || utf7_special[(c)] == 1) || \ 789 (encodeWS && (utf7_special[(c)] == 2)) || \ 790 (encodeO && (utf7_special[(c)] == 3))) 791 792#define B64(n) ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f]) 793#define B64CHAR(c) (isalnum(c) || (c) == '+' || (c) == '/') 794#define UB64(c) ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ? \ 795 (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4) 796 797#define ENCODE(out, ch, bits) \ 798 while (bits >= 6) { \ 799 *out++ = B64(ch >> (bits-6)); \ 800 bits -= 6; \ 801 } 802 803#define DECODE(out, ch, bits, surrogate) \ 804 while (bits >= 16) { \ 805 Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff); \ 806 bits -= 16; \ 807 if (surrogate) { \ 808 /* We have already generated an error for the high surrogate 809 so let's not bother seeing if the low surrogate is correct or not */\ 810 surrogate = 0; \ 811 } else if (0xDC00 <= outCh && outCh <= 0xDFFF) { \ 812 /* This is a surrogate pair. Unfortunately we can't represent \ 813 it in a 16-bit character */ \ 814 surrogate = 1; \ 815 errmsg = "code pairs are not supported"; \ 816 goto utf7Error; \ 817 } else { \ 818 *out++ = outCh; \ 819 } \ 820 } \ 821 822PyObject *PyUnicode_DecodeUTF7(const char *s, 823 int size, 824 const char *errors) 825{ 826 const char *starts = s; 827 int startinpos; 828 int endinpos; 829 int outpos; 830 const char *e; 831 PyUnicodeObject *unicode; 832 Py_UNICODE *p; 833 const char *errmsg = ""; 834 int inShift = 0; 835 unsigned int bitsleft = 0; 836 unsigned long charsleft = 0; 837 int surrogate = 0; 838 PyObject *errorHandler = NULL; 839 PyObject *exc = NULL; 840 841 unicode = _PyUnicode_New(size); 842 if (!unicode) 843 return NULL; 844 if (size == 0) 845 return (PyObject *)unicode; 846 847 p = unicode->str; 848 e = s + size; 849 850 while (s < e) { 851 Py_UNICODE ch; 852 restart: 853 ch = *s; 854 855 if (inShift) { 856 if ((ch == '-') || !B64CHAR(ch)) { 857 inShift = 0; 858 s++; 859 860 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate); 861 if (bitsleft >= 6) { 862 /* The shift sequence has a partial character in it. If 863 bitsleft < 6 then we could just classify it as padding 864 but that is not the case here */ 865 866 errmsg = "partial character in shift sequence"; 867 goto utf7Error; 868 } 869 /* According to RFC2152 the remaining bits should be zero. We 870 choose to signal an error/insert a replacement character 871 here so indicate the potential of a misencoded character. */ 872 873 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */ 874 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) { 875 errmsg = "non-zero padding bits in shift sequence"; 876 goto utf7Error; 877 } 878 879 if (ch == '-') { 880 if ((s < e) && (*(s) == '-')) { 881 *p++ = '-'; 882 inShift = 1; 883 } 884 } else if (SPECIAL(ch,0,0)) { 885 errmsg = "unexpected special character"; 886 goto utf7Error; 887 } else { 888 *p++ = ch; 889 } 890 } else { 891 charsleft = (charsleft << 6) | UB64(ch); 892 bitsleft += 6; 893 s++; 894 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate); 895 } 896 } 897 else if ( ch == '+' ) { 898 startinpos = s-starts; 899 s++; 900 if (s < e && *s == '-') { 901 s++; 902 *p++ = '+'; 903 } else 904 { 905 inShift = 1; 906 bitsleft = 0; 907 } 908 } 909 else if (SPECIAL(ch,0,0)) { 910 errmsg = "unexpected special character"; 911 s++; 912 goto utf7Error; 913 } 914 else { 915 *p++ = ch; 916 s++; 917 } 918 continue; 919 utf7Error: 920 outpos = p-PyUnicode_AS_UNICODE(unicode); 921 endinpos = s-starts; 922 if (unicode_decode_call_errorhandler( 923 errors, &errorHandler, 924 "utf7", errmsg, 925 starts, size, &startinpos, &endinpos, &exc, &s, 926 (PyObject **)&unicode, &outpos, &p)) 927 goto onError; 928 } 929 930 if (inShift) { 931 outpos = p-PyUnicode_AS_UNICODE(unicode); 932 endinpos = size; 933 if (unicode_decode_call_errorhandler( 934 errors, &errorHandler, 935 "utf7", "unterminated shift sequence", 936 starts, size, &startinpos, &endinpos, &exc, &s, 937 (PyObject **)&unicode, &outpos, &p)) 938 goto onError; 939 if (s < e) 940 goto restart; 941 } 942 943 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode))) 944 goto onError; 945 946 Py_XDECREF(errorHandler); 947 Py_XDECREF(exc); 948 return (PyObject *)unicode; 949 950onError: 951 Py_XDECREF(errorHandler); 952 Py_XDECREF(exc); 953 Py_DECREF(unicode); 954 return NULL; 955} 956 957 958PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s, 959 int size, 960 int encodeSetO, 961 int encodeWhiteSpace, 962 const char *errors) 963{ 964 PyObject *v; 965 /* It might be possible to tighten this worst case */ 966 unsigned int cbAllocated = 5 * size; 967 int inShift = 0; 968 int i = 0; 969 unsigned int bitsleft = 0; 970 unsigned long charsleft = 0; 971 char * out; 972 char * start; 973 974 if (size == 0) 975 return PyString_FromStringAndSize(NULL, 0); 976 977 v = PyString_FromStringAndSize(NULL, cbAllocated); 978 if (v == NULL) 979 return NULL; 980 981 start = out = PyString_AS_STRING(v); 982 for (;i < size; ++i) { 983 Py_UNICODE ch = s[i]; 984 985 if (!inShift) { 986 if (ch == '+') { 987 *out++ = '+'; 988 *out++ = '-'; 989 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) { 990 charsleft = ch; 991 bitsleft = 16; 992 *out++ = '+'; 993 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft); 994 inShift = bitsleft > 0; 995 } else { 996 *out++ = (char) ch; 997 } 998 } else { 999 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) { 1000 *out++ = B64(charsleft << (6-bitsleft)); 1001 charsleft = 0; 1002 bitsleft = 0; 1003 /* Characters not in the BASE64 set implicitly unshift the sequence 1004 so no '-' is required, except if the character is itself a '-' */ 1005 if (B64CHAR(ch) || ch == '-') { 1006 *out++ = '-'; 1007 } 1008 inShift = 0; 1009 *out++ = (char) ch; 1010 } else { 1011 bitsleft += 16; 1012 charsleft = (charsleft << 16) | ch; 1013 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft); 1014 1015 /* If the next character is special then we dont' need to terminate 1016 the shift sequence. If the next character is not a BASE64 character 1017 or '-' then the shift sequence will be terminated implicitly and we 1018 don't have to insert a '-'. */ 1019 1020 if (bitsleft == 0) { 1021 if (i + 1 < size) { 1022 Py_UNICODE ch2 = s[i+1]; 1023 1024 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) { 1025 1026 } else if (B64CHAR(ch2) || ch2 == '-') { 1027 *out++ = '-'; 1028 inShift = 0; 1029 } else { 1030 inShift = 0; 1031 } 1032 1033 } 1034 else { 1035 *out++ = '-'; 1036 inShift = 0; 1037 } 1038 } 1039 } 1040 } 1041 } 1042 if (bitsleft) { 1043 *out++= B64(charsleft << (6-bitsleft) ); 1044 *out++ = '-'; 1045 } 1046 1047 _PyString_Resize(&v, out - start); 1048 return v; 1049} 1050 1051#undef SPECIAL 1052#undef B64 1053#undef B64CHAR 1054#undef UB64 1055#undef ENCODE 1056#undef DECODE 1057 1058/* --- UTF-8 Codec -------------------------------------------------------- */ 1059 1060static 1061char utf8_code_length[256] = { 1062 /* Map UTF-8 encoded prefix byte to sequence length. zero means 1063 illegal prefix. see RFC 2279 for details */ 1064 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1065 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1066 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1067 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1068 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1069 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1070 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1071 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1072 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1073 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1074 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1075 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1076 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1077 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1078 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1079 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0 1080}; 1081 1082PyObject *PyUnicode_DecodeUTF8(const char *s, 1083 int size, 1084 const char *errors) 1085{ 1086 const char *starts = s; 1087 int n; 1088 int startinpos; 1089 int endinpos; 1090 int outpos; 1091 const char *e; 1092 PyUnicodeObject *unicode; 1093 Py_UNICODE *p; 1094 const char *errmsg = ""; 1095 PyObject *errorHandler = NULL; 1096 PyObject *exc = NULL; 1097 1098 /* Note: size will always be longer than the resulting Unicode 1099 character count */ 1100 unicode = _PyUnicode_New(size); 1101 if (!unicode) 1102 return NULL; 1103 if (size == 0) 1104 return (PyObject *)unicode; 1105 1106 /* Unpack UTF-8 encoded data */ 1107 p = unicode->str; 1108 e = s + size; 1109 1110 while (s < e) { 1111 Py_UCS4 ch = (unsigned char)*s; 1112 1113 if (ch < 0x80) { 1114 *p++ = (Py_UNICODE)ch; 1115 s++; 1116 continue; 1117 } 1118 1119 n = utf8_code_length[ch]; 1120 1121 if (s + n > e) { 1122 errmsg = "unexpected end of data"; 1123 startinpos = s-starts; 1124 endinpos = size; 1125 goto utf8Error; 1126 } 1127 1128 switch (n) { 1129 1130 case 0: 1131 errmsg = "unexpected code byte"; 1132 startinpos = s-starts; 1133 endinpos = startinpos+1; 1134 goto utf8Error; 1135 1136 case 1: 1137 errmsg = "internal error"; 1138 startinpos = s-starts; 1139 endinpos = startinpos+1; 1140 goto utf8Error; 1141 1142 case 2: 1143 if ((s[1] & 0xc0) != 0x80) { 1144 errmsg = "invalid data"; 1145 startinpos = s-starts; 1146 endinpos = startinpos+2; 1147 goto utf8Error; 1148 } 1149 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f); 1150 if (ch < 0x80) { 1151 startinpos = s-starts; 1152 endinpos = startinpos+2; 1153 errmsg = "illegal encoding"; 1154 goto utf8Error; 1155 } 1156 else 1157 *p++ = (Py_UNICODE)ch; 1158 break; 1159 1160 case 3: 1161 if ((s[1] & 0xc0) != 0x80 || 1162 (s[2] & 0xc0) != 0x80) { 1163 errmsg = "invalid data"; 1164 startinpos = s-starts; 1165 endinpos = startinpos+3; 1166 goto utf8Error; 1167 } 1168 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f); 1169 if (ch < 0x0800) { 1170 /* Note: UTF-8 encodings of surrogates are considered 1171 legal UTF-8 sequences; 1172 1173 XXX For wide builds (UCS-4) we should probably try 1174 to recombine the surrogates into a single code 1175 unit. 1176 */ 1177 errmsg = "illegal encoding"; 1178 startinpos = s-starts; 1179 endinpos = startinpos+3; 1180 goto utf8Error; 1181 } 1182 else 1183 *p++ = (Py_UNICODE)ch; 1184 break; 1185 1186 case 4: 1187 if ((s[1] & 0xc0) != 0x80 || 1188 (s[2] & 0xc0) != 0x80 || 1189 (s[3] & 0xc0) != 0x80) { 1190 errmsg = "invalid data"; 1191 startinpos = s-starts; 1192 endinpos = startinpos+4; 1193 goto utf8Error; 1194 } 1195 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) + 1196 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f); 1197 /* validate and convert to UTF-16 */ 1198 if ((ch < 0x10000) /* minimum value allowed for 4 1199 byte encoding */ 1200 || (ch > 0x10ffff)) /* maximum value allowed for 1201 UTF-16 */ 1202 { 1203 errmsg = "illegal encoding"; 1204 startinpos = s-starts; 1205 endinpos = startinpos+4; 1206 goto utf8Error; 1207 } 1208#ifdef Py_UNICODE_WIDE 1209 *p++ = (Py_UNICODE)ch; 1210#else 1211 /* compute and append the two surrogates: */ 1212 1213 /* translate from 10000..10FFFF to 0..FFFF */ 1214 ch -= 0x10000; 1215 1216 /* high surrogate = top 10 bits added to D800 */ 1217 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10)); 1218 1219 /* low surrogate = bottom 10 bits added to DC00 */ 1220 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF)); 1221#endif 1222 break; 1223 1224 default: 1225 /* Other sizes are only needed for UCS-4 */ 1226 errmsg = "unsupported Unicode code range"; 1227 startinpos = s-starts; 1228 endinpos = startinpos+n; 1229 goto utf8Error; 1230 } 1231 s += n; 1232 continue; 1233 1234 utf8Error: 1235 outpos = p-PyUnicode_AS_UNICODE(unicode); 1236 if (unicode_decode_call_errorhandler( 1237 errors, &errorHandler, 1238 "utf8", errmsg, 1239 starts, size, &startinpos, &endinpos, &exc, &s, 1240 (PyObject **)&unicode, &outpos, &p)) 1241 goto onError; 1242 } 1243 1244 /* Adjust length */ 1245 if (_PyUnicode_Resize(&unicode, p - unicode->str)) 1246 goto onError; 1247 1248 Py_XDECREF(errorHandler); 1249 Py_XDECREF(exc); 1250 return (PyObject *)unicode; 1251 1252onError: 1253 Py_XDECREF(errorHandler); 1254 Py_XDECREF(exc); 1255 Py_DECREF(unicode); 1256 return NULL; 1257} 1258 1259/* Allocation strategy: if the string is short, convert into a stack buffer 1260 and allocate exactly as much space needed at the end. Else allocate the 1261 maximum possible needed (4 result bytes per Unicode character), and return 1262 the excess memory at the end. 1263*/ 1264PyObject * 1265PyUnicode_EncodeUTF8(const Py_UNICODE *s, 1266 int size, 1267 const char *errors) 1268{ 1269#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */ 1270 1271 int i; /* index into s of next input byte */ 1272 PyObject *v; /* result string object */ 1273 char *p; /* next free byte in output buffer */ 1274 int nallocated; /* number of result bytes allocated */ 1275 int nneeded; /* number of result bytes needed */ 1276 char stackbuf[MAX_SHORT_UNICHARS * 4]; 1277 1278 assert(s != NULL); 1279 assert(size >= 0); 1280 1281 if (size <= MAX_SHORT_UNICHARS) { 1282 /* Write into the stack buffer; nallocated can't overflow. 1283 * At the end, we'll allocate exactly as much heap space as it 1284 * turns out we need. 1285 */ 1286 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int); 1287 v = NULL; /* will allocate after we're done */ 1288 p = stackbuf; 1289 } 1290 else { 1291 /* Overallocate on the heap, and give the excess back at the end. */ 1292 nallocated = size * 4; 1293 if (nallocated / 4 != size) /* overflow! */ 1294 return PyErr_NoMemory(); 1295 v = PyString_FromStringAndSize(NULL, nallocated); 1296 if (v == NULL) 1297 return NULL; 1298 p = PyString_AS_STRING(v); 1299 } 1300 1301 for (i = 0; i < size;) { 1302 Py_UCS4 ch = s[i++]; 1303 1304 if (ch < 0x80) 1305 /* Encode ASCII */ 1306 *p++ = (char) ch; 1307 1308 else if (ch < 0x0800) { 1309 /* Encode Latin-1 */ 1310 *p++ = (char)(0xc0 | (ch >> 6)); 1311 *p++ = (char)(0x80 | (ch & 0x3f)); 1312 } 1313 else { 1314 /* Encode UCS2 Unicode ordinals */ 1315 if (ch < 0x10000) { 1316 /* Special case: check for high surrogate */ 1317 if (0xD800 <= ch && ch <= 0xDBFF && i != size) { 1318 Py_UCS4 ch2 = s[i]; 1319 /* Check for low surrogate and combine the two to 1320 form a UCS4 value */ 1321 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) { 1322 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000; 1323 i++; 1324 goto encodeUCS4; 1325 } 1326 /* Fall through: handles isolated high surrogates */ 1327 } 1328 *p++ = (char)(0xe0 | (ch >> 12)); 1329 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); 1330 *p++ = (char)(0x80 | (ch & 0x3f)); 1331 continue; 1332 } 1333encodeUCS4: 1334 /* Encode UCS4 Unicode ordinals */ 1335 *p++ = (char)(0xf0 | (ch >> 18)); 1336 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f)); 1337 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); 1338 *p++ = (char)(0x80 | (ch & 0x3f)); 1339 } 1340 } 1341 1342 if (v == NULL) { 1343 /* This was stack allocated. */ 1344 nneeded = Py_SAFE_DOWNCAST(p - stackbuf, long, int); 1345 assert(nneeded <= nallocated); 1346 v = PyString_FromStringAndSize(stackbuf, nneeded); 1347 } 1348 else { 1349 /* Cut back to size actually needed. */ 1350 nneeded = Py_SAFE_DOWNCAST(p - PyString_AS_STRING(v), long, int); 1351 assert(nneeded <= nallocated); 1352 _PyString_Resize(&v, nneeded); 1353 } 1354 return v; 1355 1356#undef MAX_SHORT_UNICHARS 1357} 1358 1359PyObject *PyUnicode_AsUTF8String(PyObject *unicode) 1360{ 1361 if (!PyUnicode_Check(unicode)) { 1362 PyErr_BadArgument(); 1363 return NULL; 1364 } 1365 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode), 1366 PyUnicode_GET_SIZE(unicode), 1367 NULL); 1368} 1369 1370/* --- UTF-16 Codec ------------------------------------------------------- */ 1371 1372PyObject * 1373PyUnicode_DecodeUTF16(const char *s, 1374 int size, 1375 const char *errors, 1376 int *byteorder) 1377{ 1378 const char *starts = s; 1379 int startinpos; 1380 int endinpos; 1381 int outpos; 1382 PyUnicodeObject *unicode; 1383 Py_UNICODE *p; 1384 const unsigned char *q, *e; 1385 int bo = 0; /* assume native ordering by default */ 1386 const char *errmsg = ""; 1387 /* Offsets from q for retrieving byte pairs in the right order. */ 1388#ifdef BYTEORDER_IS_LITTLE_ENDIAN 1389 int ihi = 1, ilo = 0; 1390#else 1391 int ihi = 0, ilo = 1; 1392#endif 1393 PyObject *errorHandler = NULL; 1394 PyObject *exc = NULL; 1395 1396 /* Note: size will always be longer than the resulting Unicode 1397 character count */ 1398 unicode = _PyUnicode_New(size); 1399 if (!unicode) 1400 return NULL; 1401 if (size == 0) 1402 return (PyObject *)unicode; 1403 1404 /* Unpack UTF-16 encoded data */ 1405 p = unicode->str; 1406 q = (unsigned char *)s; 1407 e = q + size; 1408 1409 if (byteorder) 1410 bo = *byteorder; 1411 1412 /* Check for BOM marks (U+FEFF) in the input and adjust current 1413 byte order setting accordingly. In native mode, the leading BOM 1414 mark is skipped, in all other modes, it is copied to the output 1415 stream as-is (giving a ZWNBSP character). */ 1416 if (bo == 0) { 1417 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo]; 1418#ifdef BYTEORDER_IS_LITTLE_ENDIAN 1419 if (bom == 0xFEFF) { 1420 q += 2; 1421 bo = -1; 1422 } 1423 else if (bom == 0xFFFE) { 1424 q += 2; 1425 bo = 1; 1426 } 1427#else 1428 if (bom == 0xFEFF) { 1429 q += 2; 1430 bo = 1; 1431 } 1432 else if (bom == 0xFFFE) { 1433 q += 2; 1434 bo = -1; 1435 } 1436#endif 1437 } 1438 1439 if (bo == -1) { 1440 /* force LE */ 1441 ihi = 1; 1442 ilo = 0; 1443 } 1444 else if (bo == 1) { 1445 /* force BE */ 1446 ihi = 0; 1447 ilo = 1; 1448 } 1449 1450 while (q < e) { 1451 Py_UNICODE ch; 1452 /* remaing bytes at the end? (size should be even) */ 1453 if (e-q<2) { 1454 errmsg = "truncated data"; 1455 startinpos = ((const char *)q)-starts; 1456 endinpos = ((const char *)e)-starts; 1457 goto utf16Error; 1458 /* The remaining input chars are ignored if the callback 1459 chooses to skip the input */ 1460 } 1461 ch = (q[ihi] << 8) | q[ilo]; 1462 1463 q += 2; 1464 1465 if (ch < 0xD800 || ch > 0xDFFF) { 1466 *p++ = ch; 1467 continue; 1468 } 1469 1470 /* UTF-16 code pair: */ 1471 if (q >= e) { 1472 errmsg = "unexpected end of data"; 1473 startinpos = (((const char *)q)-2)-starts; 1474 endinpos = ((const char *)e)-starts; 1475 goto utf16Error; 1476 } 1477 if (0xD800 <= ch && ch <= 0xDBFF) { 1478 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo]; 1479 q += 2; 1480 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) { 1481#ifndef Py_UNICODE_WIDE 1482 *p++ = ch; 1483 *p++ = ch2; 1484#else 1485 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000; 1486#endif 1487 continue; 1488 } 1489 else { 1490 errmsg = "illegal UTF-16 surrogate"; 1491 startinpos = (((const char *)q)-4)-starts; 1492 endinpos = startinpos+2; 1493 goto utf16Error; 1494 } 1495 1496 } 1497 errmsg = "illegal encoding"; 1498 startinpos = (((const char *)q)-2)-starts; 1499 endinpos = startinpos+2; 1500 /* Fall through to report the error */ 1501 1502 utf16Error: 1503 outpos = p-PyUnicode_AS_UNICODE(unicode); 1504 if (unicode_decode_call_errorhandler( 1505 errors, &errorHandler, 1506 "utf16", errmsg, 1507 starts, size, &startinpos, &endinpos, &exc, (const char **)&q, 1508 (PyObject **)&unicode, &outpos, &p)) 1509 goto onError; 1510 } 1511 1512 if (byteorder) 1513 *byteorder = bo; 1514 1515 /* Adjust length */ 1516 if (_PyUnicode_Resize(&unicode, p - unicode->str)) 1517 goto onError; 1518 1519 Py_XDECREF(errorHandler); 1520 Py_XDECREF(exc); 1521 return (PyObject *)unicode; 1522 1523onError: 1524 Py_DECREF(unicode); 1525 Py_XDECREF(errorHandler); 1526 Py_XDECREF(exc); 1527 return NULL; 1528} 1529 1530PyObject * 1531PyUnicode_EncodeUTF16(const Py_UNICODE *s, 1532 int size, 1533 const char *errors, 1534 int byteorder) 1535{ 1536 PyObject *v; 1537 unsigned char *p; 1538 int i, pairs; 1539 /* Offsets from p for storing byte pairs in the right order. */ 1540#ifdef BYTEORDER_IS_LITTLE_ENDIAN 1541 int ihi = 1, ilo = 0; 1542#else 1543 int ihi = 0, ilo = 1; 1544#endif 1545 1546#define STORECHAR(CH) \ 1547 do { \ 1548 p[ihi] = ((CH) >> 8) & 0xff; \ 1549 p[ilo] = (CH) & 0xff; \ 1550 p += 2; \ 1551 } while(0) 1552 1553 for (i = pairs = 0; i < size; i++) 1554 if (s[i] >= 0x10000) 1555 pairs++; 1556 v = PyString_FromStringAndSize(NULL, 1557 2 * (size + pairs + (byteorder == 0))); 1558 if (v == NULL) 1559 return NULL; 1560 1561 p = (unsigned char *)PyString_AS_STRING(v); 1562 if (byteorder == 0) 1563 STORECHAR(0xFEFF); 1564 if (size == 0) 1565 return v; 1566 1567 if (byteorder == -1) { 1568 /* force LE */ 1569 ihi = 1; 1570 ilo = 0; 1571 } 1572 else if (byteorder == 1) { 1573 /* force BE */ 1574 ihi = 0; 1575 ilo = 1; 1576 } 1577 1578 while (size-- > 0) { 1579 Py_UNICODE ch = *s++; 1580 Py_UNICODE ch2 = 0; 1581 if (ch >= 0x10000) { 1582 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF); 1583 ch = 0xD800 | ((ch-0x10000) >> 10); 1584 } 1585 STORECHAR(ch); 1586 if (ch2) 1587 STORECHAR(ch2); 1588 } 1589 return v; 1590#undef STORECHAR 1591} 1592 1593PyObject *PyUnicode_AsUTF16String(PyObject *unicode) 1594{ 1595 if (!PyUnicode_Check(unicode)) { 1596 PyErr_BadArgument(); 1597 return NULL; 1598 } 1599 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode), 1600 PyUnicode_GET_SIZE(unicode), 1601 NULL, 1602 0); 1603} 1604 1605/* --- Unicode Escape Codec ----------------------------------------------- */ 1606 1607static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL; 1608 1609PyObject *PyUnicode_DecodeUnicodeEscape(const char *s, 1610 int size, 1611 const char *errors) 1612{ 1613 const char *starts = s; 1614 int startinpos; 1615 int endinpos; 1616 int outpos; 1617 int i; 1618 PyUnicodeObject *v; 1619 Py_UNICODE *p; 1620 const char *end; 1621 char* message; 1622 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */ 1623 PyObject *errorHandler = NULL; 1624 PyObject *exc = NULL; 1625 1626 /* Escaped strings will always be longer than the resulting 1627 Unicode string, so we start with size here and then reduce the 1628 length after conversion to the true value. 1629 (but if the error callback returns a long replacement string 1630 we'll have to allocate more space) */ 1631 v = _PyUnicode_New(size); 1632 if (v == NULL) 1633 goto onError; 1634 if (size == 0) 1635 return (PyObject *)v; 1636 1637 p = PyUnicode_AS_UNICODE(v); 1638 end = s + size; 1639 1640 while (s < end) { 1641 unsigned char c; 1642 Py_UNICODE x; 1643 int digits; 1644 1645 /* Non-escape characters are interpreted as Unicode ordinals */ 1646 if (*s != '\\') { 1647 *p++ = (unsigned char) *s++; 1648 continue; 1649 } 1650 1651 startinpos = s-starts; 1652 /* \ - Escapes */ 1653 s++; 1654 switch (*s++) { 1655 1656 /* \x escapes */ 1657 case '\n': break; 1658 case '\\': *p++ = '\\'; break; 1659 case '\'': *p++ = '\''; break; 1660 case '\"': *p++ = '\"'; break; 1661 case 'b': *p++ = '\b'; break; 1662 case 'f': *p++ = '\014'; break; /* FF */ 1663 case 't': *p++ = '\t'; break; 1664 case 'n': *p++ = '\n'; break; 1665 case 'r': *p++ = '\r'; break; 1666 case 'v': *p++ = '\013'; break; /* VT */ 1667 case 'a': *p++ = '\007'; break; /* BEL, not classic C */ 1668 1669 /* \OOO (octal) escapes */ 1670 case '0': case '1': case '2': case '3': 1671 case '4': case '5': case '6': case '7': 1672 x = s[-1] - '0'; 1673 if ('0' <= *s && *s <= '7') { 1674 x = (x<<3) + *s++ - '0'; 1675 if ('0' <= *s && *s <= '7') 1676 x = (x<<3) + *s++ - '0'; 1677 } 1678 *p++ = x; 1679 break; 1680 1681 /* hex escapes */ 1682 /* \xXX */ 1683 case 'x': 1684 digits = 2; 1685 message = "truncated \\xXX escape"; 1686 goto hexescape; 1687 1688 /* \uXXXX */ 1689 case 'u': 1690 digits = 4; 1691 message = "truncated \\uXXXX escape"; 1692 goto hexescape; 1693 1694 /* \UXXXXXXXX */ 1695 case 'U': 1696 digits = 8; 1697 message = "truncated \\UXXXXXXXX escape"; 1698 hexescape: 1699 chr = 0; 1700 outpos = p-PyUnicode_AS_UNICODE(v); 1701 if (s+digits>end) { 1702 endinpos = size; 1703 if (unicode_decode_call_errorhandler( 1704 errors, &errorHandler, 1705 "unicodeescape", "end of string in escape sequence", 1706 starts, size, &startinpos, &endinpos, &exc, &s, 1707 (PyObject **)&v, &outpos, &p)) 1708 goto onError; 1709 goto nextByte; 1710 } 1711 for (i = 0; i < digits; ++i) { 1712 c = (unsigned char) s[i]; 1713 if (!isxdigit(c)) { 1714 endinpos = (s+i+1)-starts; 1715 if (unicode_decode_call_errorhandler( 1716 errors, &errorHandler, 1717 "unicodeescape", message, 1718 starts, size, &startinpos, &endinpos, &exc, &s, 1719 (PyObject **)&v, &outpos, &p)) 1720 goto onError; 1721 goto nextByte; 1722 } 1723 chr = (chr<<4) & ~0xF; 1724 if (c >= '0' && c <= '9') 1725 chr += c - '0'; 1726 else if (c >= 'a' && c <= 'f') 1727 chr += 10 + c - 'a'; 1728 else 1729 chr += 10 + c - 'A'; 1730 } 1731 s += i; 1732 if (chr == 0xffffffff) 1733 /* _decoding_error will have already written into the 1734 target buffer. */ 1735 break; 1736 store: 1737 /* when we get here, chr is a 32-bit unicode character */ 1738 if (chr <= 0xffff) 1739 /* UCS-2 character */ 1740 *p++ = (Py_UNICODE) chr; 1741 else if (chr <= 0x10ffff) { 1742 /* UCS-4 character. Either store directly, or as 1743 surrogate pair. */ 1744#ifdef Py_UNICODE_WIDE 1745 *p++ = chr; 1746#else 1747 chr -= 0x10000L; 1748 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10); 1749 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF); 1750#endif 1751 } else { 1752 endinpos = s-starts; 1753 outpos = p-PyUnicode_AS_UNICODE(v); 1754 if (unicode_decode_call_errorhandler( 1755 errors, &errorHandler, 1756 "unicodeescape", "illegal Unicode character", 1757 starts, size, &startinpos, &endinpos, &exc, &s, 1758 (PyObject **)&v, &outpos, &p)) 1759 goto onError; 1760 } 1761 break; 1762 1763 /* \N{name} */ 1764 case 'N': 1765 message = "malformed \\N character escape"; 1766 if (ucnhash_CAPI == NULL) { 1767 /* load the unicode data module */ 1768 PyObject *m, *v; 1769 m = PyImport_ImportModule("unicodedata"); 1770 if (m == NULL) 1771 goto ucnhashError; 1772 v = PyObject_GetAttrString(m, "ucnhash_CAPI"); 1773 Py_DECREF(m); 1774 if (v == NULL) 1775 goto ucnhashError; 1776 ucnhash_CAPI = PyCObject_AsVoidPtr(v); 1777 Py_DECREF(v); 1778 if (ucnhash_CAPI == NULL) 1779 goto ucnhashError; 1780 } 1781 if (*s == '{') { 1782 const char *start = s+1; 1783 /* look for the closing brace */ 1784 while (*s != '}' && s < end) 1785 s++; 1786 if (s > start && s < end && *s == '}') { 1787 /* found a name. look it up in the unicode database */ 1788 message = "unknown Unicode character name"; 1789 s++; 1790 if (ucnhash_CAPI->getcode(start, s-start-1, &chr)) 1791 goto store; 1792 } 1793 } 1794 endinpos = s-starts; 1795 outpos = p-PyUnicode_AS_UNICODE(v); 1796 if (unicode_decode_call_errorhandler( 1797 errors, &errorHandler, 1798 "unicodeescape", message, 1799 starts, size, &startinpos, &endinpos, &exc, &s, 1800 (PyObject **)&v, &outpos, &p)) 1801 goto onError; 1802 break; 1803 1804 default: 1805 if (s > end) { 1806 message = "\\ at end of string"; 1807 s--; 1808 endinpos = s-starts; 1809 outpos = p-PyUnicode_AS_UNICODE(v); 1810 if (unicode_decode_call_errorhandler( 1811 errors, &errorHandler, 1812 "unicodeescape", message, 1813 starts, size, &startinpos, &endinpos, &exc, &s, 1814 (PyObject **)&v, &outpos, &p)) 1815 goto onError; 1816 } 1817 else { 1818 *p++ = '\\'; 1819 *p++ = (unsigned char)s[-1]; 1820 } 1821 break; 1822 } 1823 nextByte: 1824 ; 1825 } 1826 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v)))) 1827 goto onError; 1828 return (PyObject *)v; 1829 1830ucnhashError: 1831 PyErr_SetString( 1832 PyExc_UnicodeError, 1833 "\\N escapes not supported (can't load unicodedata module)" 1834 ); 1835 Py_XDECREF(errorHandler); 1836 Py_XDECREF(exc); 1837 return NULL; 1838 1839onError: 1840 Py_XDECREF(v); 1841 Py_XDECREF(errorHandler); 1842 Py_XDECREF(exc); 1843 return NULL; 1844} 1845 1846/* Return a Unicode-Escape string version of the Unicode object. 1847 1848 If quotes is true, the string is enclosed in u"" or u'' quotes as 1849 appropriate. 1850 1851*/ 1852 1853static const Py_UNICODE *findchar(const Py_UNICODE *s, 1854 int size, 1855 Py_UNICODE ch); 1856 1857static 1858PyObject *unicodeescape_string(const Py_UNICODE *s, 1859 int size, 1860 int quotes) 1861{ 1862 PyObject *repr; 1863 char *p; 1864 1865 static const char *hexdigit = "0123456789abcdef"; 1866 1867 repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1); 1868 if (repr == NULL) 1869 return NULL; 1870 1871 p = PyString_AS_STRING(repr); 1872 1873 if (quotes) { 1874 *p++ = 'u'; 1875 *p++ = (findchar(s, size, '\'') && 1876 !findchar(s, size, '"')) ? '"' : '\''; 1877 } 1878 while (size-- > 0) { 1879 Py_UNICODE ch = *s++; 1880 1881 /* Escape quotes */ 1882 if (quotes && 1883 (ch == (Py_UNICODE) PyString_AS_STRING(repr)[1] || ch == '\\')) { 1884 *p++ = '\\'; 1885 *p++ = (char) ch; 1886 continue; 1887 } 1888 1889#ifdef Py_UNICODE_WIDE 1890 /* Map 21-bit characters to '\U00xxxxxx' */ 1891 else if (ch >= 0x10000) { 1892 int offset = p - PyString_AS_STRING(repr); 1893 1894 /* Resize the string if necessary */ 1895 if (offset + 12 > PyString_GET_SIZE(repr)) { 1896 if (_PyString_Resize(&repr, PyString_GET_SIZE(repr) + 100)) 1897 return NULL; 1898 p = PyString_AS_STRING(repr) + offset; 1899 } 1900 1901 *p++ = '\\'; 1902 *p++ = 'U'; 1903 *p++ = hexdigit[(ch >> 28) & 0x0000000F]; 1904 *p++ = hexdigit[(ch >> 24) & 0x0000000F]; 1905 *p++ = hexdigit[(ch >> 20) & 0x0000000F]; 1906 *p++ = hexdigit[(ch >> 16) & 0x0000000F]; 1907 *p++ = hexdigit[(ch >> 12) & 0x0000000F]; 1908 *p++ = hexdigit[(ch >> 8) & 0x0000000F]; 1909 *p++ = hexdigit[(ch >> 4) & 0x0000000F]; 1910 *p++ = hexdigit[ch & 0x0000000F]; 1911 continue; 1912 } 1913#endif 1914 /* Map UTF-16 surrogate pairs to Unicode \UXXXXXXXX escapes */ 1915 else if (ch >= 0xD800 && ch < 0xDC00) { 1916 Py_UNICODE ch2; 1917 Py_UCS4 ucs; 1918 1919 ch2 = *s++; 1920 size--; 1921 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) { 1922 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000; 1923 *p++ = '\\'; 1924 *p++ = 'U'; 1925 *p++ = hexdigit[(ucs >> 28) & 0x0000000F]; 1926 *p++ = hexdigit[(ucs >> 24) & 0x0000000F]; 1927 *p++ = hexdigit[(ucs >> 20) & 0x0000000F]; 1928 *p++ = hexdigit[(ucs >> 16) & 0x0000000F]; 1929 *p++ = hexdigit[(ucs >> 12) & 0x0000000F]; 1930 *p++ = hexdigit[(ucs >> 8) & 0x0000000F]; 1931 *p++ = hexdigit[(ucs >> 4) & 0x0000000F]; 1932 *p++ = hexdigit[ucs & 0x0000000F]; 1933 continue; 1934 } 1935 /* Fall through: isolated surrogates are copied as-is */ 1936 s--; 1937 size++; 1938 } 1939 1940 /* Map 16-bit characters to '\uxxxx' */ 1941 if (ch >= 256) { 1942 *p++ = '\\'; 1943 *p++ = 'u'; 1944 *p++ = hexdigit[(ch >> 12) & 0x000F]; 1945 *p++ = hexdigit[(ch >> 8) & 0x000F]; 1946 *p++ = hexdigit[(ch >> 4) & 0x000F]; 1947 *p++ = hexdigit[ch & 0x000F]; 1948 } 1949 1950 /* Map special whitespace to '\t', \n', '\r' */ 1951 else if (ch == '\t') { 1952 *p++ = '\\'; 1953 *p++ = 't'; 1954 } 1955 else if (ch == '\n') { 1956 *p++ = '\\'; 1957 *p++ = 'n'; 1958 } 1959 else if (ch == '\r') { 1960 *p++ = '\\'; 1961 *p++ = 'r'; 1962 } 1963 1964 /* Map non-printable US ASCII to '\xhh' */ 1965 else if (ch < ' ' || ch >= 0x7F) { 1966 *p++ = '\\'; 1967 *p++ = 'x'; 1968 *p++ = hexdigit[(ch >> 4) & 0x000F]; 1969 *p++ = hexdigit[ch & 0x000F]; 1970 } 1971 1972 /* Copy everything else as-is */ 1973 else 1974 *p++ = (char) ch; 1975 } 1976 if (quotes) 1977 *p++ = PyString_AS_STRING(repr)[1]; 1978 1979 *p = '\0'; 1980 _PyString_Resize(&repr, p - PyString_AS_STRING(repr)); 1981 return repr; 1982} 1983 1984PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s, 1985 int size) 1986{ 1987 return unicodeescape_string(s, size, 0); 1988} 1989 1990PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode) 1991{ 1992 if (!PyUnicode_Check(unicode)) { 1993 PyErr_BadArgument(); 1994 return NULL; 1995 } 1996 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode), 1997 PyUnicode_GET_SIZE(unicode)); 1998} 1999 2000/* --- Raw Unicode Escape Codec ------------------------------------------- */ 2001 2002PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s, 2003 int size, 2004 const char *errors) 2005{ 2006 const char *starts = s; 2007 int startinpos; 2008 int endinpos; 2009 int outpos; 2010 PyUnicodeObject *v; 2011 Py_UNICODE *p; 2012 const char *end; 2013 const char *bs; 2014 PyObject *errorHandler = NULL; 2015 PyObject *exc = NULL; 2016 2017 /* Escaped strings will always be longer than the resulting 2018 Unicode string, so we start with size here and then reduce the 2019 length after conversion to the true value. (But decoding error 2020 handler might have to resize the string) */ 2021 v = _PyUnicode_New(size); 2022 if (v == NULL) 2023 goto onError; 2024 if (size == 0) 2025 return (PyObject *)v; 2026 p = PyUnicode_AS_UNICODE(v); 2027 end = s + size; 2028 while (s < end) { 2029 unsigned char c; 2030 Py_UCS4 x; 2031 int i; 2032 2033 /* Non-escape characters are interpreted as Unicode ordinals */ 2034 if (*s != '\\') { 2035 *p++ = (unsigned char)*s++; 2036 continue; 2037 } 2038 startinpos = s-starts; 2039 2040 /* \u-escapes are only interpreted iff the number of leading 2041 backslashes if odd */ 2042 bs = s; 2043 for (;s < end;) { 2044 if (*s != '\\') 2045 break; 2046 *p++ = (unsigned char)*s++; 2047 } 2048 if (((s - bs) & 1) == 0 || 2049 s >= end || 2050 *s != 'u') { 2051 continue; 2052 } 2053 p--; 2054 s++; 2055 2056 /* \uXXXX with 4 hex digits */ 2057 outpos = p-PyUnicode_AS_UNICODE(v); 2058 for (x = 0, i = 0; i < 4; ++i, ++s) { 2059 c = (unsigned char)*s; 2060 if (!isxdigit(c)) { 2061 endinpos = s-starts; 2062 if (unicode_decode_call_errorhandler( 2063 errors, &errorHandler, 2064 "rawunicodeescape", "truncated \\uXXXX", 2065 starts, size, &startinpos, &endinpos, &exc, &s, 2066 (PyObject **)&v, &outpos, &p)) 2067 goto onError; 2068 goto nextByte; 2069 } 2070 x = (x<<4) & ~0xF; 2071 if (c >= '0' && c <= '9') 2072 x += c - '0'; 2073 else if (c >= 'a' && c <= 'f') 2074 x += 10 + c - 'a'; 2075 else 2076 x += 10 + c - 'A'; 2077 } 2078 *p++ = x; 2079 nextByte: 2080 ; 2081 } 2082 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v)))) 2083 goto onError; 2084 Py_XDECREF(errorHandler); 2085 Py_XDECREF(exc); 2086 return (PyObject *)v; 2087 2088 onError: 2089 Py_XDECREF(v); 2090 Py_XDECREF(errorHandler); 2091 Py_XDECREF(exc); 2092 return NULL; 2093} 2094 2095PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s, 2096 int size) 2097{ 2098 PyObject *repr; 2099 char *p; 2100 char *q; 2101 2102 static const char *hexdigit = "0123456789abcdef"; 2103 2104 repr = PyString_FromStringAndSize(NULL, 6 * size); 2105 if (repr == NULL) 2106 return NULL; 2107 if (size == 0) 2108 return repr; 2109 2110 p = q = PyString_AS_STRING(repr); 2111 while (size-- > 0) { 2112 Py_UNICODE ch = *s++; 2113 /* Map 16-bit characters to '\uxxxx' */ 2114 if (ch >= 256) { 2115 *p++ = '\\'; 2116 *p++ = 'u'; 2117 *p++ = hexdigit[(ch >> 12) & 0xf]; 2118 *p++ = hexdigit[(ch >> 8) & 0xf]; 2119 *p++ = hexdigit[(ch >> 4) & 0xf]; 2120 *p++ = hexdigit[ch & 15]; 2121 } 2122 /* Copy everything else as-is */ 2123 else 2124 *p++ = (char) ch; 2125 } 2126 *p = '\0'; 2127 _PyString_Resize(&repr, p - q); 2128 return repr; 2129} 2130 2131PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode) 2132{ 2133 if (!PyUnicode_Check(unicode)) { 2134 PyErr_BadArgument(); 2135 return NULL; 2136 } 2137 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode), 2138 PyUnicode_GET_SIZE(unicode)); 2139} 2140 2141/* --- Latin-1 Codec ------------------------------------------------------ */ 2142 2143PyObject *PyUnicode_DecodeLatin1(const char *s, 2144 int size, 2145 const char *errors) 2146{ 2147 PyUnicodeObject *v; 2148 Py_UNICODE *p; 2149 2150 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */ 2151 if (size == 1 && *(unsigned char*)s < 256) { 2152 Py_UNICODE r = *(unsigned char*)s; 2153 return PyUnicode_FromUnicode(&r, 1); 2154 } 2155 2156 v = _PyUnicode_New(size); 2157 if (v == NULL) 2158 goto onError; 2159 if (size == 0) 2160 return (PyObject *)v; 2161 p = PyUnicode_AS_UNICODE(v); 2162 while (size-- > 0) 2163 *p++ = (unsigned char)*s++; 2164 return (PyObject *)v; 2165 2166 onError: 2167 Py_XDECREF(v); 2168 return NULL; 2169} 2170 2171/* create or adjust a UnicodeEncodeError */ 2172static void make_encode_exception(PyObject **exceptionObject, 2173 const char *encoding, 2174 const Py_UNICODE *unicode, int size, 2175 int startpos, int endpos, 2176 const char *reason) 2177{ 2178 if (*exceptionObject == NULL) { 2179 *exceptionObject = PyUnicodeEncodeError_Create( 2180 encoding, unicode, size, startpos, endpos, reason); 2181 } 2182 else { 2183 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos)) 2184 goto onError; 2185 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos)) 2186 goto onError; 2187 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason)) 2188 goto onError; 2189 return; 2190 onError: 2191 Py_DECREF(*exceptionObject); 2192 *exceptionObject = NULL; 2193 } 2194} 2195 2196/* raises a UnicodeEncodeError */ 2197static void raise_encode_exception(PyObject **exceptionObject, 2198 const char *encoding, 2199 const Py_UNICODE *unicode, int size, 2200 int startpos, int endpos, 2201 const char *reason) 2202{ 2203 make_encode_exception(exceptionObject, 2204 encoding, unicode, size, startpos, endpos, reason); 2205 if (*exceptionObject != NULL) 2206 PyCodec_StrictErrors(*exceptionObject); 2207} 2208 2209/* error handling callback helper: 2210 build arguments, call the callback and check the arguments, 2211 put the result into newpos and return the replacement string, which 2212 has to be freed by the caller */ 2213static PyObject *unicode_encode_call_errorhandler(const char *errors, 2214 PyObject **errorHandler, 2215 const char *encoding, const char *reason, 2216 const Py_UNICODE *unicode, int size, PyObject **exceptionObject, 2217 int startpos, int endpos, 2218 int *newpos) 2219{ 2220 static char *argparse = "O!i;encoding error handler must return (unicode, int) tuple"; 2221 2222 PyObject *restuple; 2223 PyObject *resunicode; 2224 2225 if (*errorHandler == NULL) { 2226 *errorHandler = PyCodec_LookupError(errors); 2227 if (*errorHandler == NULL) 2228 return NULL; 2229 } 2230 2231 make_encode_exception(exceptionObject, 2232 encoding, unicode, size, startpos, endpos, reason); 2233 if (*exceptionObject == NULL) 2234 return NULL; 2235 2236 restuple = PyObject_CallFunctionObjArgs( 2237 *errorHandler, *exceptionObject, NULL); 2238 if (restuple == NULL) 2239 return NULL; 2240 if (!PyTuple_Check(restuple)) { 2241 PyErr_Format(PyExc_TypeError, &argparse[4]); 2242 Py_DECREF(restuple); 2243 return NULL; 2244 } 2245 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, 2246 &resunicode, newpos)) { 2247 Py_DECREF(restuple); 2248 return NULL; 2249 } 2250 if (*newpos<0) 2251 *newpos = size+*newpos; 2252 if (*newpos<0 || *newpos>size) { 2253 PyErr_Format(PyExc_IndexError, "position %d from error handler out of bounds", *newpos); 2254 Py_DECREF(restuple); 2255 return NULL; 2256 } 2257 Py_INCREF(resunicode); 2258 Py_DECREF(restuple); 2259 return resunicode; 2260} 2261 2262static PyObject *unicode_encode_ucs1(const Py_UNICODE *p, 2263 int size, 2264 const char *errors, 2265 int limit) 2266{ 2267 /* output object */ 2268 PyObject *res; 2269 /* pointers to the beginning and end+1 of input */ 2270 const Py_UNICODE *startp = p; 2271 const Py_UNICODE *endp = p + size; 2272 /* pointer to the beginning of the unencodable characters */ 2273 /* const Py_UNICODE *badp = NULL; */ 2274 /* pointer into the output */ 2275 char *str; 2276 /* current output position */ 2277 int respos = 0; 2278 int ressize; 2279 char *encoding = (limit == 256) ? "latin-1" : "ascii"; 2280 char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)"; 2281 PyObject *errorHandler = NULL; 2282 PyObject *exc = NULL; 2283 /* the following variable is used for caching string comparisons 2284 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */ 2285 int known_errorHandler = -1; 2286 2287 /* allocate enough for a simple encoding without 2288 replacements, if we need more, we'll resize */ 2289 res = PyString_FromStringAndSize(NULL, size); 2290 if (res == NULL) 2291 goto onError; 2292 if (size == 0) 2293 return res; 2294 str = PyString_AS_STRING(res); 2295 ressize = size; 2296 2297 while (p<endp) { 2298 Py_UNICODE c = *p; 2299 2300 /* can we encode this? */ 2301 if (c<limit) { 2302 /* no overflow check, because we know that the space is enough */ 2303 *str++ = (char)c; 2304 ++p; 2305 } 2306 else { 2307 int unicodepos = p-startp; 2308 int requiredsize; 2309 PyObject *repunicode; 2310 int repsize; 2311 int newpos; 2312 int respos; 2313 Py_UNICODE *uni2; 2314 /* startpos for collecting unencodable chars */ 2315 const Py_UNICODE *collstart = p; 2316 const Py_UNICODE *collend = p; 2317 /* find all unecodable characters */ 2318 while ((collend < endp) && ((*collend)>=limit)) 2319 ++collend; 2320 /* cache callback name lookup (if not done yet, i.e. it's the first error) */ 2321 if (known_errorHandler==-1) { 2322 if ((errors==NULL) || (!strcmp(errors, "strict"))) 2323 known_errorHandler = 1; 2324 else if (!strcmp(errors, "replace")) 2325 known_errorHandler = 2; 2326 else if (!strcmp(errors, "ignore")) 2327 known_errorHandler = 3; 2328 else if (!strcmp(errors, "xmlcharrefreplace")) 2329 known_errorHandler = 4; 2330 else 2331 known_errorHandler = 0; 2332 } 2333 switch (known_errorHandler) { 2334 case 1: /* strict */ 2335 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason); 2336 goto onError; 2337 case 2: /* replace */ 2338 while (collstart++<collend) 2339 *str++ = '?'; /* fall through */ 2340 case 3: /* ignore */ 2341 p = collend; 2342 break; 2343 case 4: /* xmlcharrefreplace */ 2344 respos = str-PyString_AS_STRING(res); 2345 /* determine replacement size (temporarily (mis)uses p) */ 2346 for (p = collstart, repsize = 0; p < collend; ++p) { 2347 if (*p<10) 2348 repsize += 2+1+1; 2349 else if (*p<100) 2350 repsize += 2+2+1; 2351 else if (*p<1000) 2352 repsize += 2+3+1; 2353 else if (*p<10000) 2354 repsize += 2+4+1; 2355 else if (*p<100000) 2356 repsize += 2+5+1; 2357 else if (*p<1000000) 2358 repsize += 2+6+1; 2359 else 2360 repsize += 2+7+1; 2361 } 2362 requiredsize = respos+repsize+(endp-collend); 2363 if (requiredsize > ressize) { 2364 if (requiredsize<2*ressize) 2365 requiredsize = 2*ressize; 2366 if (_PyString_Resize(&res, requiredsize)) 2367 goto onError; 2368 str = PyString_AS_STRING(res) + respos; 2369 ressize = requiredsize; 2370 } 2371 /* generate replacement (temporarily (mis)uses p) */ 2372 for (p = collstart; p < collend; ++p) { 2373 str += sprintf(str, "&#%d;", (int)*p); 2374 } 2375 p = collend; 2376 break; 2377 default: 2378 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler, 2379 encoding, reason, startp, size, &exc, 2380 collstart-startp, collend-startp, &newpos); 2381 if (repunicode == NULL) 2382 goto onError; 2383 /* need more space? (at least enough for what we 2384 have+the replacement+the rest of the string, so 2385 we won't have to check space for encodable characters) */ 2386 respos = str-PyString_AS_STRING(res); 2387 repsize = PyUnicode_GET_SIZE(repunicode); 2388 requiredsize = respos+repsize+(endp-collend); 2389 if (requiredsize > ressize) { 2390 if (requiredsize<2*ressize) 2391 requiredsize = 2*ressize; 2392 if (_PyString_Resize(&res, requiredsize)) { 2393 Py_DECREF(repunicode); 2394 goto onError; 2395 } 2396 str = PyString_AS_STRING(res) + respos; 2397 ressize = requiredsize; 2398 } 2399 /* check if there is anything unencodable in the replacement 2400 and copy it to the output */ 2401 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) { 2402 c = *uni2; 2403 if (c >= limit) { 2404 raise_encode_exception(&exc, encoding, startp, size, 2405 unicodepos, unicodepos+1, reason); 2406 Py_DECREF(repunicode); 2407 goto onError; 2408 } 2409 *str = (char)c; 2410 } 2411 p = startp + newpos; 2412 Py_DECREF(repunicode); 2413 } 2414 } 2415 } 2416 /* Resize if we allocated to much */ 2417 respos = str-PyString_AS_STRING(res); 2418 if (respos<ressize) 2419 /* If this falls res will be NULL */ 2420 _PyString_Resize(&res, respos); 2421 Py_XDECREF(errorHandler); 2422 Py_XDECREF(exc); 2423 return res; 2424 2425 onError: 2426 Py_XDECREF(res); 2427 Py_XDECREF(errorHandler); 2428 Py_XDECREF(exc); 2429 return NULL; 2430} 2431 2432PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p, 2433 int size, 2434 const char *errors) 2435{ 2436 return unicode_encode_ucs1(p, size, errors, 256); 2437} 2438 2439PyObject *PyUnicode_AsLatin1String(PyObject *unicode) 2440{ 2441 if (!PyUnicode_Check(unicode)) { 2442 PyErr_BadArgument(); 2443 return NULL; 2444 } 2445 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode), 2446 PyUnicode_GET_SIZE(unicode), 2447 NULL); 2448} 2449 2450/* --- 7-bit ASCII Codec -------------------------------------------------- */ 2451 2452PyObject *PyUnicode_DecodeASCII(const char *s, 2453 int size, 2454 const char *errors) 2455{ 2456 const char *starts = s; 2457 PyUnicodeObject *v; 2458 Py_UNICODE *p; 2459 int startinpos; 2460 int endinpos; 2461 int outpos; 2462 const char *e; 2463 PyObject *errorHandler = NULL; 2464 PyObject *exc = NULL; 2465 2466 /* ASCII is equivalent to the first 128 ordinals in Unicode. */ 2467 if (size == 1 && *(unsigned char*)s < 128) { 2468 Py_UNICODE r = *(unsigned char*)s; 2469 return PyUnicode_FromUnicode(&r, 1); 2470 } 2471 2472 v = _PyUnicode_New(size); 2473 if (v == NULL) 2474 goto onError; 2475 if (size == 0) 2476 return (PyObject *)v; 2477 p = PyUnicode_AS_UNICODE(v); 2478 e = s + size; 2479 while (s < e) { 2480 register unsigned char c = (unsigned char)*s; 2481 if (c < 128) { 2482 *p++ = c; 2483 ++s; 2484 } 2485 else { 2486 startinpos = s-starts; 2487 endinpos = startinpos + 1; 2488 outpos = p-PyUnicode_AS_UNICODE(v); 2489 if (unicode_decode_call_errorhandler( 2490 errors, &errorHandler, 2491 "ascii", "ordinal not in range(128)", 2492 starts, size, &startinpos, &endinpos, &exc, &s, 2493 (PyObject **)&v, &outpos, &p)) 2494 goto onError; 2495 } 2496 } 2497 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v)) 2498 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v)))) 2499 goto onError; 2500 Py_XDECREF(errorHandler); 2501 Py_XDECREF(exc); 2502 return (PyObject *)v; 2503 2504 onError: 2505 Py_XDECREF(v); 2506 Py_XDECREF(errorHandler); 2507 Py_XDECREF(exc); 2508 return NULL; 2509} 2510 2511PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p, 2512 int size, 2513 const char *errors) 2514{ 2515 return unicode_encode_ucs1(p, size, errors, 128); 2516} 2517 2518PyObject *PyUnicode_AsASCIIString(PyObject *unicode) 2519{ 2520 if (!PyUnicode_Check(unicode)) { 2521 PyErr_BadArgument(); 2522 return NULL; 2523 } 2524 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode), 2525 PyUnicode_GET_SIZE(unicode), 2526 NULL); 2527} 2528 2529#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T) 2530 2531/* --- MBCS codecs for Windows -------------------------------------------- */ 2532 2533PyObject *PyUnicode_DecodeMBCS(const char *s, 2534 int size, 2535 const char *errors) 2536{ 2537 PyUnicodeObject *v; 2538 Py_UNICODE *p; 2539 2540 /* First get the size of the result */ 2541 DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0); 2542 if (size > 0 && usize==0) 2543 return PyErr_SetFromWindowsErrWithFilename(0, NULL); 2544 2545 v = _PyUnicode_New(usize); 2546 if (v == NULL) 2547 return NULL; 2548 if (usize == 0) 2549 return (PyObject *)v; 2550 p = PyUnicode_AS_UNICODE(v); 2551 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) { 2552 Py_DECREF(v); 2553 return PyErr_SetFromWindowsErrWithFilename(0, NULL); 2554 } 2555 2556 return (PyObject *)v; 2557} 2558 2559PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p, 2560 int size, 2561 const char *errors) 2562{ 2563 PyObject *repr; 2564 char *s; 2565 DWORD mbcssize; 2566 2567 /* If there are no characters, bail now! */ 2568 if (size==0) 2569 return PyString_FromString(""); 2570 2571 /* First get the size of the result */ 2572 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL); 2573 if (mbcssize==0) 2574 return PyErr_SetFromWindowsErrWithFilename(0, NULL); 2575 2576 repr = PyString_FromStringAndSize(NULL, mbcssize); 2577 if (repr == NULL) 2578 return NULL; 2579 if (mbcssize == 0) 2580 return repr; 2581 2582 /* Do the conversion */ 2583 s = PyString_AS_STRING(repr); 2584 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) { 2585 Py_DECREF(repr); 2586 return PyErr_SetFromWindowsErrWithFilename(0, NULL); 2587 } 2588 return repr; 2589} 2590 2591#endif /* MS_WINDOWS */ 2592 2593/* --- Character Mapping Codec -------------------------------------------- */ 2594 2595PyObject *PyUnicode_DecodeCharmap(const char *s, 2596 int size, 2597 PyObject *mapping, 2598 const char *errors) 2599{ 2600 const char *starts = s; 2601 int startinpos; 2602 int endinpos; 2603 int outpos; 2604 const char *e; 2605 PyUnicodeObject *v; 2606 Py_UNICODE *p; 2607 int extrachars = 0; 2608 PyObject *errorHandler = NULL; 2609 PyObject *exc = NULL; 2610 2611 /* Default to Latin-1 */ 2612 if (mapping == NULL) 2613 return PyUnicode_DecodeLatin1(s, size, errors); 2614 2615 v = _PyUnicode_New(size); 2616 if (v == NULL) 2617 goto onError; 2618 if (size == 0) 2619 return (PyObject *)v; 2620 p = PyUnicode_AS_UNICODE(v); 2621 e = s + size; 2622 while (s < e) { 2623 unsigned char ch = *s; 2624 PyObject *w, *x; 2625 2626 /* Get mapping (char ordinal -> integer, Unicode char or None) */ 2627 w = PyInt_FromLong((long)ch); 2628 if (w == NULL) 2629 goto onError; 2630 x = PyObject_GetItem(mapping, w); 2631 Py_DECREF(w); 2632 if (x == NULL) { 2633 if (PyErr_ExceptionMatches(PyExc_LookupError)) { 2634 /* No mapping found means: mapping is undefined. */ 2635 PyErr_Clear(); 2636 x = Py_None; 2637 Py_INCREF(x); 2638 } else 2639 goto onError; 2640 } 2641 2642 /* Apply mapping */ 2643 if (PyInt_Check(x)) { 2644 long value = PyInt_AS_LONG(x); 2645 if (value < 0 || value > 65535) { 2646 PyErr_SetString(PyExc_TypeError, 2647 "character mapping must be in range(65536)"); 2648 Py_DECREF(x); 2649 goto onError; 2650 } 2651 *p++ = (Py_UNICODE)value; 2652 } 2653 else if (x == Py_None) { 2654 /* undefined mapping */ 2655 outpos = p-PyUnicode_AS_UNICODE(v); 2656 startinpos = s-starts; 2657 endinpos = startinpos+1; 2658 if (unicode_decode_call_errorhandler( 2659 errors, &errorHandler, 2660 "charmap", "character maps to <undefined>", 2661 starts, size, &startinpos, &endinpos, &exc, &s, 2662 (PyObject **)&v, &outpos, &p)) { 2663 Py_DECREF(x); 2664 goto onError; 2665 } 2666 continue; 2667 } 2668 else if (PyUnicode_Check(x)) { 2669 int targetsize = PyUnicode_GET_SIZE(x); 2670 2671 if (targetsize == 1) 2672 /* 1-1 mapping */ 2673 *p++ = *PyUnicode_AS_UNICODE(x); 2674 2675 else if (targetsize > 1) { 2676 /* 1-n mapping */ 2677 if (targetsize > extrachars) { 2678 /* resize first */ 2679 int oldpos = (int)(p - PyUnicode_AS_UNICODE(v)); 2680 int needed = (targetsize - extrachars) + \ 2681 (targetsize << 2); 2682 extrachars += needed; 2683 if (_PyUnicode_Resize(&v, 2684 PyUnicode_GET_SIZE(v) + needed)) { 2685 Py_DECREF(x); 2686 goto onError; 2687 } 2688 p = PyUnicode_AS_UNICODE(v) + oldpos; 2689 } 2690 Py_UNICODE_COPY(p, 2691 PyUnicode_AS_UNICODE(x), 2692 targetsize); 2693 p += targetsize; 2694 extrachars -= targetsize; 2695 } 2696 /* 1-0 mapping: skip the character */ 2697 } 2698 else { 2699 /* wrong return value */ 2700 PyErr_SetString(PyExc_TypeError, 2701 "character mapping must return integer, None or unicode"); 2702 Py_DECREF(x); 2703 goto onError; 2704 } 2705 Py_DECREF(x); 2706 ++s; 2707 } 2708 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v)) 2709 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v)))) 2710 goto onError; 2711 Py_XDECREF(errorHandler); 2712 Py_XDECREF(exc); 2713 return (PyObject *)v; 2714 2715 onError: 2716 Py_XDECREF(errorHandler); 2717 Py_XDECREF(exc); 2718 Py_XDECREF(v); 2719 return NULL; 2720} 2721 2722/* Lookup the character ch in the mapping. If the character 2723 can't be found, Py_None is returned (or NULL, if another 2724 error occured). */ 2725static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping) 2726{ 2727 PyObject *w = PyInt_FromLong((long)c); 2728 PyObject *x; 2729 2730 if (w == NULL) 2731 return NULL; 2732 x = PyObject_GetItem(mapping, w); 2733 Py_DECREF(w); 2734 if (x == NULL) { 2735 if (PyErr_ExceptionMatches(PyExc_LookupError)) { 2736 /* No mapping found means: mapping is undefined. */ 2737 PyErr_Clear(); 2738 x = Py_None; 2739 Py_INCREF(x); 2740 return x; 2741 } else 2742 return NULL; 2743 } 2744 else if (x == Py_None) 2745 return x; 2746 else if (PyInt_Check(x)) { 2747 long value = PyInt_AS_LONG(x); 2748 if (value < 0 || value > 255) { 2749 PyErr_SetString(PyExc_TypeError, 2750 "character mapping must be in range(256)"); 2751 Py_DECREF(x); 2752 return NULL; 2753 } 2754 return x; 2755 } 2756 else if (PyString_Check(x)) 2757 return x; 2758 else { 2759 /* wrong return value */ 2760 PyErr_SetString(PyExc_TypeError, 2761 "character mapping must return integer, None or str"); 2762 Py_DECREF(x); 2763 return NULL; 2764 } 2765} 2766 2767/* lookup the character, put the result in the output string and adjust 2768 various state variables. Reallocate the output string if not enough 2769 space is available. Return a new reference to the object that 2770 was put in the output buffer, or Py_None, if the mapping was undefined 2771 (in which case no character was written) or NULL, if a 2772 reallocation error ocurred. The called must decref the result */ 2773static 2774PyObject *charmapencode_output(Py_UNICODE c, PyObject *mapping, 2775 PyObject **outobj, int *outpos) 2776{ 2777 PyObject *rep = charmapencode_lookup(c, mapping); 2778 2779 if (rep==NULL) 2780 return NULL; 2781 else if (rep==Py_None) 2782 return rep; 2783 else { 2784 char *outstart = PyString_AS_STRING(*outobj); 2785 int outsize = PyString_GET_SIZE(*outobj); 2786 if (PyInt_Check(rep)) { 2787 int requiredsize = *outpos+1; 2788 if (outsize<requiredsize) { 2789 /* exponentially overallocate to minimize reallocations */ 2790 if (requiredsize < 2*outsize) 2791 requiredsize = 2*outsize; 2792 if (_PyString_Resize(outobj, requiredsize)) { 2793 Py_DECREF(rep); 2794 return NULL; 2795 } 2796 outstart = PyString_AS_STRING(*outobj); 2797 } 2798 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep); 2799 } 2800 else { 2801 const char *repchars = PyString_AS_STRING(rep); 2802 int repsize = PyString_GET_SIZE(rep); 2803 int requiredsize = *outpos+repsize; 2804 if (outsize<requiredsize) { 2805 /* exponentially overallocate to minimize reallocations */ 2806 if (requiredsize < 2*outsize) 2807 requiredsize = 2*outsize; 2808 if (_PyString_Resize(outobj, requiredsize)) { 2809 Py_DECREF(rep); 2810 return NULL; 2811 } 2812 outstart = PyString_AS_STRING(*outobj); 2813 } 2814 memcpy(outstart + *outpos, repchars, repsize); 2815 *outpos += repsize; 2816 } 2817 } 2818 return rep; 2819} 2820 2821/* handle an error in PyUnicode_EncodeCharmap 2822 Return 0 on success, -1 on error */ 2823static 2824int charmap_encoding_error( 2825 const Py_UNICODE *p, int size, int *inpos, PyObject *mapping, 2826 PyObject **exceptionObject, 2827 int *known_errorHandler, PyObject *errorHandler, const char *errors, 2828 PyObject **res, int *respos) 2829{ 2830 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */ 2831 int repsize; 2832 int newpos; 2833 Py_UNICODE *uni2; 2834 /* startpos for collecting unencodable chars */ 2835 int collstartpos = *inpos; 2836 int collendpos = *inpos+1; 2837 int collpos; 2838 char *encoding = "charmap"; 2839 char *reason = "character maps to <undefined>"; 2840 2841 PyObject *x; 2842 /* find all unencodable characters */ 2843 while (collendpos < size) { 2844 x = charmapencode_lookup(p[collendpos], mapping); 2845 if (x==NULL) 2846 return -1; 2847 else if (x!=Py_None) { 2848 Py_DECREF(x); 2849 break; 2850 } 2851 Py_DECREF(x); 2852 ++collendpos; 2853 } 2854 /* cache callback name lookup 2855 * (if not done yet, i.e. it's the first error) */ 2856 if (*known_errorHandler==-1) { 2857 if ((errors==NULL) || (!strcmp(errors, "strict"))) 2858 *known_errorHandler = 1; 2859 else if (!strcmp(errors, "replace")) 2860 *known_errorHandler = 2; 2861 else if (!strcmp(errors, "ignore")) 2862 *known_errorHandler = 3; 2863 else if (!strcmp(errors, "xmlcharrefreplace")) 2864 *known_errorHandler = 4; 2865 else 2866 *known_errorHandler = 0; 2867 } 2868 switch (*known_errorHandler) { 2869 case 1: /* strict */ 2870 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason); 2871 return -1; 2872 case 2: /* replace */ 2873 for (collpos = collstartpos; collpos<collendpos; ++collpos) { 2874 x = charmapencode_output('?', mapping, res, respos); 2875 if (x==NULL) { 2876 return -1; 2877 } 2878 else if (x==Py_None) { 2879 Py_DECREF(x); 2880 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason); 2881 return -1; 2882 } 2883 Py_DECREF(x); 2884 } 2885 /* fall through */ 2886 case 3: /* ignore */ 2887 *inpos = collendpos; 2888 break; 2889 case 4: /* xmlcharrefreplace */ 2890 /* generate replacement (temporarily (mis)uses p) */ 2891 for (collpos = collstartpos; collpos < collendpos; ++collpos) { 2892 char buffer[2+29+1+1]; 2893 char *cp; 2894 sprintf(buffer, "&#%d;", (int)p[collpos]); 2895 for (cp = buffer; *cp; ++cp) { 2896 x = charmapencode_output(*cp, mapping, res, respos); 2897 if (x==NULL) 2898 return -1; 2899 else if (x==Py_None) { 2900 Py_DECREF(x); 2901 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason); 2902 return -1; 2903 } 2904 Py_DECREF(x); 2905 } 2906 } 2907 *inpos = collendpos; 2908 break; 2909 default: 2910 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler, 2911 encoding, reason, p, size, exceptionObject, 2912 collstartpos, collendpos, &newpos); 2913 if (repunicode == NULL) 2914 return -1; 2915 /* generate replacement */ 2916 repsize = PyUnicode_GET_SIZE(repunicode); 2917 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) { 2918 x = charmapencode_output(*uni2, mapping, res, respos); 2919 if (x==NULL) { 2920 Py_DECREF(repunicode); 2921 return -1; 2922 } 2923 else if (x==Py_None) { 2924 Py_DECREF(repunicode); 2925 Py_DECREF(x); 2926 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason); 2927 return -1; 2928 } 2929 Py_DECREF(x); 2930 } 2931 *inpos = newpos; 2932 Py_DECREF(repunicode); 2933 } 2934 return 0; 2935} 2936 2937PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p, 2938 int size, 2939 PyObject *mapping, 2940 const char *errors) 2941{ 2942 /* output object */ 2943 PyObject *res = NULL; 2944 /* current input position */ 2945 int inpos = 0; 2946 /* current output position */ 2947 int respos = 0; 2948 PyObject *errorHandler = NULL; 2949 PyObject *exc = NULL; 2950 /* the following variable is used for caching string comparisons 2951 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 2952 * 3=ignore, 4=xmlcharrefreplace */ 2953 int known_errorHandler = -1; 2954 2955 /* Default to Latin-1 */ 2956 if (mapping == NULL) 2957 return PyUnicode_EncodeLatin1(p, size, errors); 2958 2959 /* allocate enough for a simple encoding without 2960 replacements, if we need more, we'll resize */ 2961 res = PyString_FromStringAndSize(NULL, size); 2962 if (res == NULL) 2963 goto onError; 2964 if (size == 0) 2965 return res; 2966 2967 while (inpos<size) { 2968 /* try to encode it */ 2969 PyObject *x = charmapencode_output(p[inpos], mapping, &res, &respos); 2970 if (x==NULL) /* error */ 2971 goto onError; 2972 if (x==Py_None) { /* unencodable character */ 2973 if (charmap_encoding_error(p, size, &inpos, mapping, 2974 &exc, 2975 &known_errorHandler, errorHandler, errors, 2976 &res, &respos)) 2977 goto onError; 2978 } 2979 else 2980 /* done with this character => adjust input position */ 2981 ++inpos; 2982 Py_DECREF(x); 2983 } 2984 2985 /* Resize if we allocated to much */ 2986 if (respos<PyString_GET_SIZE(res)) { 2987 if (_PyString_Resize(&res, respos)) 2988 goto onError; 2989 } 2990 Py_XDECREF(exc); 2991 Py_XDECREF(errorHandler); 2992 return res; 2993 2994 onError: 2995 Py_XDECREF(res); 2996 Py_XDECREF(exc); 2997 Py_XDECREF(errorHandler); 2998 return NULL; 2999} 3000 3001PyObject *PyUnicode_AsCharmapString(PyObject *unicode, 3002 PyObject *mapping) 3003{ 3004 if (!PyUnicode_Check(unicode) || mapping == NULL) { 3005 PyErr_BadArgument(); 3006 return NULL; 3007 } 3008 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode), 3009 PyUnicode_GET_SIZE(unicode), 3010 mapping, 3011 NULL); 3012} 3013 3014/* create or adjust a UnicodeTranslateError */ 3015static void make_translate_exception(PyObject **exceptionObject, 3016 const Py_UNICODE *unicode, int size, 3017 int startpos, int endpos, 3018 const char *reason) 3019{ 3020 if (*exceptionObject == NULL) { 3021 *exceptionObject = PyUnicodeTranslateError_Create( 3022 unicode, size, startpos, endpos, reason); 3023 } 3024 else { 3025 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos)) 3026 goto onError; 3027 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos)) 3028 goto onError; 3029 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason)) 3030 goto onError; 3031 return; 3032 onError: 3033 Py_DECREF(*exceptionObject); 3034 *exceptionObject = NULL; 3035 } 3036} 3037 3038/* raises a UnicodeTranslateError */ 3039static void raise_translate_exception(PyObject **exceptionObject, 3040 const Py_UNICODE *unicode, int size, 3041 int startpos, int endpos, 3042 const char *reason) 3043{ 3044 make_translate_exception(exceptionObject, 3045 unicode, size, startpos, endpos, reason); 3046 if (*exceptionObject != NULL) 3047 PyCodec_StrictErrors(*exceptionObject); 3048} 3049 3050/* error handling callback helper: 3051 build arguments, call the callback and check the arguments, 3052 put the result into newpos and return the replacement string, which 3053 has to be freed by the caller */ 3054static PyObject *unicode_translate_call_errorhandler(const char *errors, 3055 PyObject **errorHandler, 3056 const char *reason, 3057 const Py_UNICODE *unicode, int size, PyObject **exceptionObject, 3058 int startpos, int endpos, 3059 int *newpos) 3060{ 3061 static char *argparse = "O!i;translating error handler must return (unicode, int) tuple"; 3062 3063 PyObject *restuple; 3064 PyObject *resunicode; 3065 3066 if (*errorHandler == NULL) { 3067 *errorHandler = PyCodec_LookupError(errors); 3068 if (*errorHandler == NULL) 3069 return NULL; 3070 } 3071 3072 make_translate_exception(exceptionObject, 3073 unicode, size, startpos, endpos, reason); 3074 if (*exceptionObject == NULL) 3075 return NULL; 3076 3077 restuple = PyObject_CallFunctionObjArgs( 3078 *errorHandler, *exceptionObject, NULL); 3079 if (restuple == NULL) 3080 return NULL; 3081 if (!PyTuple_Check(restuple)) { 3082 PyErr_Format(PyExc_TypeError, &argparse[4]); 3083 Py_DECREF(restuple); 3084 return NULL; 3085 } 3086 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, 3087 &resunicode, newpos)) { 3088 Py_DECREF(restuple); 3089 return NULL; 3090 } 3091 if (*newpos<0) 3092 *newpos = size+*newpos; 3093 if (*newpos<0 || *newpos>size) { 3094 PyErr_Format(PyExc_IndexError, "position %d from error handler out of bounds", *newpos); 3095 Py_DECREF(restuple); 3096 return NULL; 3097 } 3098 Py_INCREF(resunicode); 3099 Py_DECREF(restuple); 3100 return resunicode; 3101} 3102 3103/* Lookup the character ch in the mapping and put the result in result, 3104 which must be decrefed by the caller. 3105 Return 0 on success, -1 on error */ 3106static 3107int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result) 3108{ 3109 PyObject *w = PyInt_FromLong((long)c); 3110 PyObject *x; 3111 3112 if (w == NULL) 3113 return -1; 3114 x = PyObject_GetItem(mapping, w); 3115 Py_DECREF(w); 3116 if (x == NULL) { 3117 if (PyErr_ExceptionMatches(PyExc_LookupError)) { 3118 /* No mapping found means: use 1:1 mapping. */ 3119 PyErr_Clear(); 3120 *result = NULL; 3121 return 0; 3122 } else 3123 return -1; 3124 } 3125 else if (x == Py_None) { 3126 *result = x; 3127 return 0; 3128 } 3129 else if (PyInt_Check(x)) { 3130 long value = PyInt_AS_LONG(x); 3131 long max = PyUnicode_GetMax(); 3132 if (value < 0 || value > max) { 3133 PyErr_Format(PyExc_TypeError, 3134 "character mapping must be in range(0x%lx)", max+1); 3135 Py_DECREF(x); 3136 return -1; 3137 } 3138 *result = x; 3139 return 0; 3140 } 3141 else if (PyUnicode_Check(x)) { 3142 *result = x; 3143 return 0; 3144 } 3145 else { 3146 /* wrong return value */ 3147 PyErr_SetString(PyExc_TypeError, 3148 "character mapping must return integer, None or unicode"); 3149 return -1; 3150 } 3151} 3152/* ensure that *outobj is at least requiredsize characters long, 3153if not reallocate and adjust various state variables. 3154Return 0 on success, -1 on error */ 3155static 3156int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp, int *outsize, 3157 int requiredsize) 3158{ 3159 if (requiredsize > *outsize) { 3160 /* remember old output position */ 3161 int outpos = *outp-PyUnicode_AS_UNICODE(*outobj); 3162 /* exponentially overallocate to minimize reallocations */ 3163 if (requiredsize < 2 * *outsize) 3164 requiredsize = 2 * *outsize; 3165 if (_PyUnicode_Resize(outobj, requiredsize)) 3166 return -1; 3167 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos; 3168 *outsize = requiredsize; 3169 } 3170 return 0; 3171} 3172/* lookup the character, put the result in the output string and adjust 3173 various state variables. Return a new reference to the object that 3174 was put in the output buffer in *result, or Py_None, if the mapping was 3175 undefined (in which case no character was written). 3176 The called must decref result. 3177 Return 0 on success, -1 on error. */ 3178static 3179int charmaptranslate_output(Py_UNICODE c, PyObject *mapping, 3180 PyObject **outobj, int *outsize, Py_UNICODE **outp, PyObject **res) 3181{ 3182 if (charmaptranslate_lookup(c, mapping, res)) 3183 return -1; 3184 if (*res==NULL) { 3185 /* not found => default to 1:1 mapping */ 3186 *(*outp)++ = (Py_UNICODE)c; 3187 } 3188 else if (*res==Py_None) 3189 ; 3190 else if (PyInt_Check(*res)) { 3191 /* no overflow check, because we know that the space is enough */ 3192 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res); 3193 } 3194 else if (PyUnicode_Check(*res)) { 3195 int repsize = PyUnicode_GET_SIZE(*res); 3196 if (repsize==1) { 3197 /* no overflow check, because we know that the space is enough */ 3198 *(*outp)++ = *PyUnicode_AS_UNICODE(*res); 3199 } 3200 else if (repsize!=0) { 3201 /* more than one character */ 3202 int requiredsize = *outsize + repsize - 1; 3203 if (charmaptranslate_makespace(outobj, outp, outsize, requiredsize)) 3204 return -1; 3205 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize); 3206 *outp += repsize; 3207 } 3208 } 3209 else 3210 return -1; 3211 return 0; 3212} 3213 3214PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p, 3215 int size, 3216 PyObject *mapping, 3217 const char *errors) 3218{ 3219 /* output object */ 3220 PyObject *res = NULL; 3221 /* pointers to the beginning and end+1 of input */ 3222 const Py_UNICODE *startp = p; 3223 const Py_UNICODE *endp = p + size; 3224 /* pointer into the output */ 3225 Py_UNICODE *str; 3226 /* current output position */ 3227 int respos = 0; 3228 int ressize; 3229 char *reason = "character maps to <undefined>"; 3230 PyObject *errorHandler = NULL; 3231 PyObject *exc = NULL; 3232 /* the following variable is used for caching string comparisons 3233 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3234 * 3=ignore, 4=xmlcharrefreplace */ 3235 int known_errorHandler = -1; 3236 3237 if (mapping == NULL) { 3238 PyErr_BadArgument(); 3239 return NULL; 3240 } 3241 3242 /* allocate enough for a simple 1:1 translation without 3243 replacements, if we need more, we'll resize */ 3244 res = PyUnicode_FromUnicode(NULL, size); 3245 if (res == NULL) 3246 goto onError; 3247 if (size == 0) 3248 return res; 3249 str = PyUnicode_AS_UNICODE(res); 3250 ressize = size; 3251 3252 while (p<endp) { 3253 /* try to encode it */ 3254 PyObject *x = NULL; 3255 if (charmaptranslate_output(*p, mapping, &res, &ressize, &str, &x)) { 3256 Py_XDECREF(x); 3257 goto onError; 3258 } 3259 Py_XDECREF(x); 3260 if (x!=Py_None) /* it worked => adjust input pointer */ 3261 ++p; 3262 else { /* untranslatable character */ 3263 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */ 3264 int repsize; 3265 int newpos; 3266 Py_UNICODE *uni2; 3267 /* startpos for collecting untranslatable chars */ 3268 const Py_UNICODE *collstart = p; 3269 const Py_UNICODE *collend = p+1; 3270 const Py_UNICODE *coll; 3271 3272 /* find all untranslatable characters */ 3273 while (collend < endp) { 3274 if (charmaptranslate_lookup(*collend, mapping, &x)) 3275 goto onError; 3276 Py_XDECREF(x); 3277 if (x!=Py_None) 3278 break; 3279 ++collend; 3280 } 3281 /* cache callback name lookup 3282 * (if not done yet, i.e. it's the first error) */ 3283 if (known_errorHandler==-1) { 3284 if ((errors==NULL) || (!strcmp(errors, "strict"))) 3285 known_errorHandler = 1; 3286 else if (!strcmp(errors, "replace")) 3287 known_errorHandler = 2; 3288 else if (!strcmp(errors, "ignore")) 3289 known_errorHandler = 3; 3290 else if (!strcmp(errors, "xmlcharrefreplace")) 3291 known_errorHandler = 4; 3292 else 3293 known_errorHandler = 0; 3294 } 3295 switch (known_errorHandler) { 3296 case 1: /* strict */ 3297 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason); 3298 goto onError; 3299 case 2: /* replace */ 3300 /* No need to check for space, this is a 1:1 replacement */ 3301 for (coll = collstart; coll<collend; ++coll) 3302 *str++ = '?'; 3303 /* fall through */ 3304 case 3: /* ignore */ 3305 p = collend; 3306 break; 3307 case 4: /* xmlcharrefreplace */ 3308 /* generate replacement (temporarily (mis)uses p) */ 3309 for (p = collstart; p < collend; ++p) { 3310 char buffer[2+29+1+1]; 3311 char *cp; 3312 sprintf(buffer, "&#%d;", (int)*p); 3313 if (charmaptranslate_makespace(&res, &str, &ressize, 3314 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend))) 3315 goto onError; 3316 for (cp = buffer; *cp; ++cp) 3317 *str++ = *cp; 3318 } 3319 p = collend; 3320 break; 3321 default: 3322 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler, 3323 reason, startp, size, &exc, 3324 collstart-startp, collend-startp, &newpos); 3325 if (repunicode == NULL) 3326 goto onError; 3327 /* generate replacement */ 3328 repsize = PyUnicode_GET_SIZE(repunicode); 3329 if (charmaptranslate_makespace(&res, &str, &ressize, 3330 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) { 3331 Py_DECREF(repunicode); 3332 goto onError; 3333 } 3334 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) 3335 *str++ = *uni2; 3336 p = startp + newpos; 3337 Py_DECREF(repunicode); 3338 } 3339 } 3340 } 3341 /* Resize if we allocated to much */ 3342 respos = str-PyUnicode_AS_UNICODE(res); 3343 if (respos<ressize) { 3344 if (_PyUnicode_Resize(&res, respos)) 3345 goto onError; 3346 } 3347 Py_XDECREF(exc); 3348 Py_XDECREF(errorHandler); 3349 return res; 3350 3351 onError: 3352 Py_XDECREF(res); 3353 Py_XDECREF(exc); 3354 Py_XDECREF(errorHandler); 3355 return NULL; 3356} 3357 3358PyObject *PyUnicode_Translate(PyObject *str, 3359 PyObject *mapping, 3360 const char *errors) 3361{ 3362 PyObject *result; 3363 3364 str = PyUnicode_FromObject(str); 3365 if (str == NULL) 3366 goto onError; 3367 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str), 3368 PyUnicode_GET_SIZE(str), 3369 mapping, 3370 errors); 3371 Py_DECREF(str); 3372 return result; 3373 3374 onError: 3375 Py_XDECREF(str); 3376 return NULL; 3377} 3378 3379/* --- Decimal Encoder ---------------------------------------------------- */ 3380 3381int PyUnicode_EncodeDecimal(Py_UNICODE *s, 3382 int length, 3383 char *output, 3384 const char *errors) 3385{ 3386 Py_UNICODE *p, *end; 3387 PyObject *errorHandler = NULL; 3388 PyObject *exc = NULL; 3389 const char *encoding = "decimal"; 3390 const char *reason = "invalid decimal Unicode string"; 3391 /* the following variable is used for caching string comparisons 3392 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */ 3393 int known_errorHandler = -1; 3394 3395 if (output == NULL) { 3396 PyErr_BadArgument(); 3397 return -1; 3398 } 3399 3400 p = s; 3401 end = s + length; 3402 while (p < end) { 3403 register Py_UNICODE ch = *p; 3404 int decimal; 3405 PyObject *repunicode; 3406 int repsize; 3407 int newpos; 3408 Py_UNICODE *uni2; 3409 Py_UNICODE *collstart; 3410 Py_UNICODE *collend; 3411 3412 if (Py_UNICODE_ISSPACE(ch)) { 3413 *output++ = ' '; 3414 ++p; 3415 continue; 3416 } 3417 decimal = Py_UNICODE_TODECIMAL(ch); 3418 if (decimal >= 0) { 3419 *output++ = '0' + decimal; 3420 ++p; 3421 continue; 3422 } 3423 if (0 < ch && ch < 256) { 3424 *output++ = (char)ch; 3425 ++p; 3426 continue; 3427 } 3428 /* All other characters are considered unencodable */ 3429 collstart = p; 3430 collend = p+1; 3431 while (collend < end) { 3432 if ((0 < *collend && *collend < 256) || 3433 !Py_UNICODE_ISSPACE(*collend) || 3434 Py_UNICODE_TODECIMAL(*collend)) 3435 break; 3436 } 3437 /* cache callback name lookup 3438 * (if not done yet, i.e. it's the first error) */ 3439 if (known_errorHandler==-1) { 3440 if ((errors==NULL) || (!strcmp(errors, "strict"))) 3441 known_errorHandler = 1; 3442 else if (!strcmp(errors, "replace")) 3443 known_errorHandler = 2; 3444 else if (!strcmp(errors, "ignore")) 3445 known_errorHandler = 3; 3446 else if (!strcmp(errors, "xmlcharrefreplace")) 3447 known_errorHandler = 4; 3448 else 3449 known_errorHandler = 0; 3450 } 3451 switch (known_errorHandler) { 3452 case 1: /* strict */ 3453 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason); 3454 goto onError; 3455 case 2: /* replace */ 3456 for (p = collstart; p < collend; ++p) 3457 *output++ = '?'; 3458 /* fall through */ 3459 case 3: /* ignore */ 3460 p = collend; 3461 break; 3462 case 4: /* xmlcharrefreplace */ 3463 /* generate replacement (temporarily (mis)uses p) */ 3464 for (p = collstart; p < collend; ++p) 3465 output += sprintf(output, "&#%d;", (int)*p); 3466 p = collend; 3467 break; 3468 default: 3469 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler, 3470 encoding, reason, s, length, &exc, 3471 collstart-s, collend-s, &newpos); 3472 if (repunicode == NULL) 3473 goto onError; 3474 /* generate replacement */ 3475 repsize = PyUnicode_GET_SIZE(repunicode); 3476 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) { 3477 Py_UNICODE ch = *uni2; 3478 if (Py_UNICODE_ISSPACE(ch)) 3479 *output++ = ' '; 3480 else { 3481 decimal = Py_UNICODE_TODECIMAL(ch); 3482 if (decimal >= 0) 3483 *output++ = '0' + decimal; 3484 else if (0 < ch && ch < 256) 3485 *output++ = (char)ch; 3486 else { 3487 Py_DECREF(repunicode); 3488 raise_encode_exception(&exc, encoding, 3489 s, length, collstart-s, collend-s, reason); 3490 goto onError; 3491 } 3492 } 3493 } 3494 p = s + newpos; 3495 Py_DECREF(repunicode); 3496 } 3497 } 3498 /* 0-terminate the output string */ 3499 *output++ = '\0'; 3500 Py_XDECREF(exc); 3501 Py_XDECREF(errorHandler); 3502 return 0; 3503 3504 onError: 3505 Py_XDECREF(exc); 3506 Py_XDECREF(errorHandler); 3507 return -1; 3508} 3509 3510/* --- Helpers ------------------------------------------------------------ */ 3511 3512static 3513int count(PyUnicodeObject *self, 3514 int start, 3515 int end, 3516 PyUnicodeObject *substring) 3517{ 3518 int count = 0; 3519 3520 if (start < 0) 3521 start += self->length; 3522 if (start < 0) 3523 start = 0; 3524 if (end > self->length) 3525 end = self->length; 3526 if (end < 0) 3527 end += self->length; 3528 if (end < 0) 3529 end = 0; 3530 3531 if (substring->length == 0) 3532 return (end - start + 1); 3533 3534 end -= substring->length; 3535 3536 while (start <= end) 3537 if (Py_UNICODE_MATCH(self, start, substring)) { 3538 count++; 3539 start += substring->length; 3540 } else 3541 start++; 3542 3543 return count; 3544} 3545 3546int PyUnicode_Count(PyObject *str, 3547 PyObject *substr, 3548 int start, 3549 int end) 3550{ 3551 int result; 3552 3553 str = PyUnicode_FromObject(str); 3554 if (str == NULL) 3555 return -1; 3556 substr = PyUnicode_FromObject(substr); 3557 if (substr == NULL) { 3558 Py_DECREF(str); 3559 return -1; 3560 } 3561 3562 result = count((PyUnicodeObject *)str, 3563 start, end, 3564 (PyUnicodeObject *)substr); 3565 3566 Py_DECREF(str); 3567 Py_DECREF(substr); 3568 return result; 3569} 3570 3571static 3572int findstring(PyUnicodeObject *self, 3573 PyUnicodeObject *substring, 3574 int start, 3575 int end, 3576 int direction) 3577{ 3578 if (start < 0) 3579 start += self->length; 3580 if (start < 0) 3581 start = 0; 3582 3583 if (end > self->length) 3584 end = self->length; 3585 if (end < 0) 3586 end += self->length; 3587 if (end < 0) 3588 end = 0; 3589 3590 if (substring->length == 0) 3591 return (direction > 0) ? start : end; 3592 3593 end -= substring->length; 3594 3595 if (direction < 0) { 3596 for (; end >= start; end--) 3597 if (Py_UNICODE_MATCH(self, end, substring)) 3598 return end; 3599 } else { 3600 for (; start <= end; start++) 3601 if (Py_UNICODE_MATCH(self, start, substring)) 3602 return start; 3603 } 3604 3605 return -1; 3606} 3607 3608int PyUnicode_Find(PyObject *str, 3609 PyObject *substr, 3610 int start, 3611 int end, 3612 int direction) 3613{ 3614 int result; 3615 3616 str = PyUnicode_FromObject(str); 3617 if (str == NULL) 3618 return -2; 3619 substr = PyUnicode_FromObject(substr); 3620 if (substr == NULL) { 3621 Py_DECREF(str); 3622 return -2; 3623 } 3624 3625 result = findstring((PyUnicodeObject *)str, 3626 (PyUnicodeObject *)substr, 3627 start, end, direction); 3628 Py_DECREF(str); 3629 Py_DECREF(substr); 3630 return result; 3631} 3632 3633static 3634int tailmatch(PyUnicodeObject *self, 3635 PyUnicodeObject *substring, 3636 int start, 3637 int end, 3638 int direction) 3639{ 3640 if (start < 0) 3641 start += self->length; 3642 if (start < 0) 3643 start = 0; 3644 3645 if (substring->length == 0) 3646 return 1; 3647 3648 if (end > self->length) 3649 end = self->length; 3650 if (end < 0) 3651 end += self->length; 3652 if (end < 0) 3653 end = 0; 3654 3655 end -= substring->length; 3656 if (end < start) 3657 return 0; 3658 3659 if (direction > 0) { 3660 if (Py_UNICODE_MATCH(self, end, substring)) 3661 return 1; 3662 } else { 3663 if (Py_UNICODE_MATCH(self, start, substring)) 3664 return 1; 3665 } 3666 3667 return 0; 3668} 3669 3670int PyUnicode_Tailmatch(PyObject *str, 3671 PyObject *substr, 3672 int start, 3673 int end, 3674 int direction) 3675{ 3676 int result; 3677 3678 str = PyUnicode_FromObject(str); 3679 if (str == NULL) 3680 return -1; 3681 substr = PyUnicode_FromObject(substr); 3682 if (substr == NULL) { 3683 Py_DECREF(substr); 3684 return -1; 3685 } 3686 3687 result = tailmatch((PyUnicodeObject *)str, 3688 (PyUnicodeObject *)substr, 3689 start, end, direction); 3690 Py_DECREF(str); 3691 Py_DECREF(substr); 3692 return result; 3693} 3694 3695static 3696const Py_UNICODE *findchar(const Py_UNICODE *s, 3697 int size, 3698 Py_UNICODE ch) 3699{ 3700 /* like wcschr, but doesn't stop at NULL characters */ 3701 3702 while (size-- > 0) { 3703 if (*s == ch) 3704 return s; 3705 s++; 3706 } 3707 3708 return NULL; 3709} 3710 3711/* Apply fixfct filter to the Unicode object self and return a 3712 reference to the modified object */ 3713 3714static 3715PyObject *fixup(PyUnicodeObject *self, 3716 int (*fixfct)(PyUnicodeObject *s)) 3717{ 3718 3719 PyUnicodeObject *u; 3720 3721 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length); 3722 if (u == NULL) 3723 return NULL; 3724 3725 Py_UNICODE_COPY(u->str, self->str, self->length); 3726 3727 if (!fixfct(u) && PyUnicode_CheckExact(self)) { 3728 /* fixfct should return TRUE if it modified the buffer. If 3729 FALSE, return a reference to the original buffer instead 3730 (to save space, not time) */ 3731 Py_INCREF(self); 3732 Py_DECREF(u); 3733 return (PyObject*) self; 3734 } 3735 return (PyObject*) u; 3736} 3737 3738static 3739int fixupper(PyUnicodeObject *self) 3740{ 3741 int len = self->length; 3742 Py_UNICODE *s = self->str; 3743 int status = 0; 3744 3745 while (len-- > 0) { 3746 register Py_UNICODE ch; 3747 3748 ch = Py_UNICODE_TOUPPER(*s); 3749 if (ch != *s) { 3750 status = 1; 3751 *s = ch; 3752 } 3753 s++; 3754 } 3755 3756 return status; 3757} 3758 3759static 3760int fixlower(PyUnicodeObject *self) 3761{ 3762 int len = self->length; 3763 Py_UNICODE *s = self->str; 3764 int status = 0; 3765 3766 while (len-- > 0) { 3767 register Py_UNICODE ch; 3768 3769 ch = Py_UNICODE_TOLOWER(*s); 3770 if (ch != *s) { 3771 status = 1; 3772 *s = ch; 3773 } 3774 s++; 3775 } 3776 3777 return status; 3778} 3779 3780static 3781int fixswapcase(PyUnicodeObject *self) 3782{ 3783 int len = self->length; 3784 Py_UNICODE *s = self->str; 3785 int status = 0; 3786 3787 while (len-- > 0) { 3788 if (Py_UNICODE_ISUPPER(*s)) { 3789 *s = Py_UNICODE_TOLOWER(*s); 3790 status = 1; 3791 } else if (Py_UNICODE_ISLOWER(*s)) { 3792 *s = Py_UNICODE_TOUPPER(*s); 3793 status = 1; 3794 } 3795 s++; 3796 } 3797 3798 return status; 3799} 3800 3801static 3802int fixcapitalize(PyUnicodeObject *self) 3803{ 3804 int len = self->length; 3805 Py_UNICODE *s = self->str; 3806 int status = 0; 3807 3808 if (len == 0) 3809 return 0; 3810 if (Py_UNICODE_ISLOWER(*s)) { 3811 *s = Py_UNICODE_TOUPPER(*s); 3812 status = 1; 3813 } 3814 s++; 3815 while (--len > 0) { 3816 if (Py_UNICODE_ISUPPER(*s)) { 3817 *s = Py_UNICODE_TOLOWER(*s); 3818 status = 1; 3819 } 3820 s++; 3821 } 3822 return status; 3823} 3824 3825static 3826int fixtitle(PyUnicodeObject *self) 3827{ 3828 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 3829 register Py_UNICODE *e; 3830 int previous_is_cased; 3831 3832 /* Shortcut for single character strings */ 3833 if (PyUnicode_GET_SIZE(self) == 1) { 3834 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p); 3835 if (*p != ch) { 3836 *p = ch; 3837 return 1; 3838 } 3839 else 3840 return 0; 3841 } 3842 3843 e = p + PyUnicode_GET_SIZE(self); 3844 previous_is_cased = 0; 3845 for (; p < e; p++) { 3846 register const Py_UNICODE ch = *p; 3847 3848 if (previous_is_cased) 3849 *p = Py_UNICODE_TOLOWER(ch); 3850 else 3851 *p = Py_UNICODE_TOTITLE(ch); 3852 3853 if (Py_UNICODE_ISLOWER(ch) || 3854 Py_UNICODE_ISUPPER(ch) || 3855 Py_UNICODE_ISTITLE(ch)) 3856 previous_is_cased = 1; 3857 else 3858 previous_is_cased = 0; 3859 } 3860 return 1; 3861} 3862 3863PyObject *PyUnicode_Join(PyObject *separator, 3864 PyObject *seq) 3865{ 3866 Py_UNICODE *sep; 3867 int seplen; 3868 PyUnicodeObject *res = NULL; 3869 int reslen = 0; 3870 Py_UNICODE *p; 3871 int sz = 100; 3872 int i; 3873 PyObject *it; 3874 3875 it = PyObject_GetIter(seq); 3876 if (it == NULL) 3877 return NULL; 3878 3879 if (separator == NULL) { 3880 Py_UNICODE blank = ' '; 3881 sep = ␣ 3882 seplen = 1; 3883 } 3884 else { 3885 separator = PyUnicode_FromObject(separator); 3886 if (separator == NULL) 3887 goto onError; 3888 sep = PyUnicode_AS_UNICODE(separator); 3889 seplen = PyUnicode_GET_SIZE(separator); 3890 } 3891 3892 res = _PyUnicode_New(sz); 3893 if (res == NULL) 3894 goto onError; 3895 p = PyUnicode_AS_UNICODE(res); 3896 reslen = 0; 3897 3898 for (i = 0; ; ++i) { 3899 int itemlen; 3900 PyObject *item = PyIter_Next(it); 3901 if (item == NULL) { 3902 if (PyErr_Occurred()) 3903 goto onError; 3904 break; 3905 } 3906 if (!PyUnicode_Check(item)) { 3907 PyObject *v; 3908 if (!PyString_Check(item)) { 3909 PyErr_Format(PyExc_TypeError, 3910 "sequence item %i: expected string or Unicode," 3911 " %.80s found", 3912 i, item->ob_type->tp_name); 3913 Py_DECREF(item); 3914 goto onError; 3915 } 3916 v = PyUnicode_FromObject(item); 3917 Py_DECREF(item); 3918 item = v; 3919 if (item == NULL) 3920 goto onError; 3921 } 3922 itemlen = PyUnicode_GET_SIZE(item); 3923 while (reslen + itemlen + seplen >= sz) { 3924 if (_PyUnicode_Resize(&res, sz*2)) { 3925 Py_DECREF(item); 3926 goto onError; 3927 } 3928 sz *= 2; 3929 p = PyUnicode_AS_UNICODE(res) + reslen; 3930 } 3931 if (i > 0) { 3932 Py_UNICODE_COPY(p, sep, seplen); 3933 p += seplen; 3934 reslen += seplen; 3935 } 3936 Py_UNICODE_COPY(p, PyUnicode_AS_UNICODE(item), itemlen); 3937 p += itemlen; 3938 reslen += itemlen; 3939 Py_DECREF(item); 3940 } 3941 if (_PyUnicode_Resize(&res, reslen)) 3942 goto onError; 3943 3944 Py_XDECREF(separator); 3945 Py_DECREF(it); 3946 return (PyObject *)res; 3947 3948 onError: 3949 Py_XDECREF(separator); 3950 Py_XDECREF(res); 3951 Py_DECREF(it); 3952 return NULL; 3953} 3954 3955static 3956PyUnicodeObject *pad(PyUnicodeObject *self, 3957 int left, 3958 int right, 3959 Py_UNICODE fill) 3960{ 3961 PyUnicodeObject *u; 3962 3963 if (left < 0) 3964 left = 0; 3965 if (right < 0) 3966 right = 0; 3967 3968 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) { 3969 Py_INCREF(self); 3970 return self; 3971 } 3972 3973 u = _PyUnicode_New(left + self->length + right); 3974 if (u) { 3975 if (left) 3976 Py_UNICODE_FILL(u->str, fill, left); 3977 Py_UNICODE_COPY(u->str + left, self->str, self->length); 3978 if (right) 3979 Py_UNICODE_FILL(u->str + left + self->length, fill, right); 3980 } 3981 3982 return u; 3983} 3984 3985#define SPLIT_APPEND(data, left, right) \ 3986 str = PyUnicode_FromUnicode(data + left, right - left); \ 3987 if (!str) \ 3988 goto onError; \ 3989 if (PyList_Append(list, str)) { \ 3990 Py_DECREF(str); \ 3991 goto onError; \ 3992 } \ 3993 else \ 3994 Py_DECREF(str); 3995 3996static 3997PyObject *split_whitespace(PyUnicodeObject *self, 3998 PyObject *list, 3999 int maxcount) 4000{ 4001 register int i; 4002 register int j; 4003 int len = self->length; 4004 PyObject *str; 4005 4006 for (i = j = 0; i < len; ) { 4007 /* find a token */ 4008 while (i < len && Py_UNICODE_ISSPACE(self->str[i])) 4009 i++; 4010 j = i; 4011 while (i < len && !Py_UNICODE_ISSPACE(self->str[i])) 4012 i++; 4013 if (j < i) { 4014 if (maxcount-- <= 0) 4015 break; 4016 SPLIT_APPEND(self->str, j, i); 4017 while (i < len && Py_UNICODE_ISSPACE(self->str[i])) 4018 i++; 4019 j = i; 4020 } 4021 } 4022 if (j < len) { 4023 SPLIT_APPEND(self->str, j, len); 4024 } 4025 return list; 4026 4027 onError: 4028 Py_DECREF(list); 4029 return NULL; 4030} 4031 4032PyObject *PyUnicode_Splitlines(PyObject *string, 4033 int keepends) 4034{ 4035 register int i; 4036 register int j; 4037 int len; 4038 PyObject *list; 4039 PyObject *str; 4040 Py_UNICODE *data; 4041 4042 string = PyUnicode_FromObject(string); 4043 if (string == NULL) 4044 return NULL; 4045 data = PyUnicode_AS_UNICODE(string); 4046 len = PyUnicode_GET_SIZE(string); 4047 4048 list = PyList_New(0); 4049 if (!list) 4050 goto onError; 4051 4052 for (i = j = 0; i < len; ) { 4053 int eol; 4054 4055 /* Find a line and append it */ 4056 while (i < len && !Py_UNICODE_ISLINEBREAK(data[i])) 4057 i++; 4058 4059 /* Skip the line break reading CRLF as one line break */ 4060 eol = i; 4061 if (i < len) { 4062 if (data[i] == '\r' && i + 1 < len && 4063 data[i+1] == '\n') 4064 i += 2; 4065 else 4066 i++; 4067 if (keepends) 4068 eol = i; 4069 } 4070 SPLIT_APPEND(data, j, eol); 4071 j = i; 4072 } 4073 if (j < len) { 4074 SPLIT_APPEND(data, j, len); 4075 } 4076 4077 Py_DECREF(string); 4078 return list; 4079 4080 onError: 4081 Py_DECREF(list); 4082 Py_DECREF(string); 4083 return NULL; 4084} 4085 4086static 4087PyObject *split_char(PyUnicodeObject *self, 4088 PyObject *list, 4089 Py_UNICODE ch, 4090 int maxcount) 4091{ 4092 register int i; 4093 register int j; 4094 int len = self->length; 4095 PyObject *str; 4096 4097 for (i = j = 0; i < len; ) { 4098 if (self->str[i] == ch) { 4099 if (maxcount-- <= 0) 4100 break; 4101 SPLIT_APPEND(self->str, j, i); 4102 i = j = i + 1; 4103 } else 4104 i++; 4105 } 4106 if (j <= len) { 4107 SPLIT_APPEND(self->str, j, len); 4108 } 4109 return list; 4110 4111 onError: 4112 Py_DECREF(list); 4113 return NULL; 4114} 4115 4116static 4117PyObject *split_substring(PyUnicodeObject *self, 4118 PyObject *list, 4119 PyUnicodeObject *substring, 4120 int maxcount) 4121{ 4122 register int i; 4123 register int j; 4124 int len = self->length; 4125 int sublen = substring->length; 4126 PyObject *str; 4127 4128 for (i = j = 0; i <= len - sublen; ) { 4129 if (Py_UNICODE_MATCH(self, i, substring)) { 4130 if (maxcount-- <= 0) 4131 break; 4132 SPLIT_APPEND(self->str, j, i); 4133 i = j = i + sublen; 4134 } else 4135 i++; 4136 } 4137 if (j <= len) { 4138 SPLIT_APPEND(self->str, j, len); 4139 } 4140 return list; 4141 4142 onError: 4143 Py_DECREF(list); 4144 return NULL; 4145} 4146 4147#undef SPLIT_APPEND 4148 4149static 4150PyObject *split(PyUnicodeObject *self, 4151 PyUnicodeObject *substring, 4152 int maxcount) 4153{ 4154 PyObject *list; 4155 4156 if (maxcount < 0) 4157 maxcount = INT_MAX; 4158 4159 list = PyList_New(0); 4160 if (!list) 4161 return NULL; 4162 4163 if (substring == NULL) 4164 return split_whitespace(self,list,maxcount); 4165 4166 else if (substring->length == 1) 4167 return split_char(self,list,substring->str[0],maxcount); 4168 4169 else if (substring->length == 0) { 4170 Py_DECREF(list); 4171 PyErr_SetString(PyExc_ValueError, "empty separator"); 4172 return NULL; 4173 } 4174 else 4175 return split_substring(self,list,substring,maxcount); 4176} 4177 4178static 4179PyObject *replace(PyUnicodeObject *self, 4180 PyUnicodeObject *str1, 4181 PyUnicodeObject *str2, 4182 int maxcount) 4183{ 4184 PyUnicodeObject *u; 4185 4186 if (maxcount < 0) 4187 maxcount = INT_MAX; 4188 4189 if (str1->length == 1 && str2->length == 1) { 4190 int i; 4191 4192 /* replace characters */ 4193 if (!findchar(self->str, self->length, str1->str[0]) && 4194 PyUnicode_CheckExact(self)) { 4195 /* nothing to replace, return original string */ 4196 Py_INCREF(self); 4197 u = self; 4198 } else { 4199 Py_UNICODE u1 = str1->str[0]; 4200 Py_UNICODE u2 = str2->str[0]; 4201 4202 u = (PyUnicodeObject*) PyUnicode_FromUnicode( 4203 NULL, 4204 self->length 4205 ); 4206 if (u != NULL) { 4207 Py_UNICODE_COPY(u->str, self->str, 4208 self->length); 4209 for (i = 0; i < u->length; i++) 4210 if (u->str[i] == u1) { 4211 if (--maxcount < 0) 4212 break; 4213 u->str[i] = u2; 4214 } 4215 } 4216 } 4217 4218 } else { 4219 int n, i; 4220 Py_UNICODE *p; 4221 4222 /* replace strings */ 4223 n = count(self, 0, self->length, str1); 4224 if (n > maxcount) 4225 n = maxcount; 4226 if (n == 0) { 4227 /* nothing to replace, return original string */ 4228 if (PyUnicode_CheckExact(self)) { 4229 Py_INCREF(self); 4230 u = self; 4231 } 4232 else { 4233 u = (PyUnicodeObject *) 4234 PyUnicode_FromUnicode(self->str, self->length); 4235 } 4236 } else { 4237 u = _PyUnicode_New( 4238 self->length + n * (str2->length - str1->length)); 4239 if (u) { 4240 i = 0; 4241 p = u->str; 4242 if (str1->length > 0) { 4243 while (i <= self->length - str1->length) 4244 if (Py_UNICODE_MATCH(self, i, str1)) { 4245 /* replace string segment */ 4246 Py_UNICODE_COPY(p, str2->str, str2->length); 4247 p += str2->length; 4248 i += str1->length; 4249 if (--n <= 0) { 4250 /* copy remaining part */ 4251 Py_UNICODE_COPY(p, self->str+i, self->length-i); 4252 break; 4253 } 4254 } else 4255 *p++ = self->str[i++]; 4256 } else { 4257 while (n > 0) { 4258 Py_UNICODE_COPY(p, str2->str, str2->length); 4259 p += str2->length; 4260 if (--n <= 0) 4261 break; 4262 *p++ = self->str[i++]; 4263 } 4264 Py_UNICODE_COPY(p, self->str+i, self->length-i); 4265 } 4266 } 4267 } 4268 } 4269 4270 return (PyObject *) u; 4271} 4272 4273/* --- Unicode Object Methods --------------------------------------------- */ 4274 4275PyDoc_STRVAR(title__doc__, 4276"S.title() -> unicode\n\ 4277\n\ 4278Return a titlecased version of S, i.e. words start with title case\n\ 4279characters, all remaining cased characters have lower case."); 4280 4281static PyObject* 4282unicode_title(PyUnicodeObject *self) 4283{ 4284 return fixup(self, fixtitle); 4285} 4286 4287PyDoc_STRVAR(capitalize__doc__, 4288"S.capitalize() -> unicode\n\ 4289\n\ 4290Return a capitalized version of S, i.e. make the first character\n\ 4291have upper case."); 4292 4293static PyObject* 4294unicode_capitalize(PyUnicodeObject *self) 4295{ 4296 return fixup(self, fixcapitalize); 4297} 4298 4299#if 0 4300PyDoc_STRVAR(capwords__doc__, 4301"S.capwords() -> unicode\n\ 4302\n\ 4303Apply .capitalize() to all words in S and return the result with\n\ 4304normalized whitespace (all whitespace strings are replaced by ' ')."); 4305 4306static PyObject* 4307unicode_capwords(PyUnicodeObject *self) 4308{ 4309 PyObject *list; 4310 PyObject *item; 4311 int i; 4312 4313 /* Split into words */ 4314 list = split(self, NULL, -1); 4315 if (!list) 4316 return NULL; 4317 4318 /* Capitalize each word */ 4319 for (i = 0; i < PyList_GET_SIZE(list); i++) { 4320 item = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, i), 4321 fixcapitalize); 4322 if (item == NULL) 4323 goto onError; 4324 Py_DECREF(PyList_GET_ITEM(list, i)); 4325 PyList_SET_ITEM(list, i, item); 4326 } 4327 4328 /* Join the words to form a new string */ 4329 item = PyUnicode_Join(NULL, list); 4330 4331onError: 4332 Py_DECREF(list); 4333 return (PyObject *)item; 4334} 4335#endif 4336 4337PyDoc_STRVAR(center__doc__, 4338"S.center(width) -> unicode\n\ 4339\n\ 4340Return S centered in a Unicode string of length width. Padding is done\n\ 4341using spaces."); 4342 4343static PyObject * 4344unicode_center(PyUnicodeObject *self, PyObject *args) 4345{ 4346 int marg, left; 4347 int width; 4348 4349 if (!PyArg_ParseTuple(args, "i:center", &width)) 4350 return NULL; 4351 4352 if (self->length >= width && PyUnicode_CheckExact(self)) { 4353 Py_INCREF(self); 4354 return (PyObject*) self; 4355 } 4356 4357 marg = width - self->length; 4358 left = marg / 2 + (marg & width & 1); 4359 4360 return (PyObject*) pad(self, left, marg - left, ' '); 4361} 4362 4363#if 0 4364 4365/* This code should go into some future Unicode collation support 4366 module. The basic comparison should compare ordinals on a naive 4367 basis (this is what Java does and thus JPython too). */ 4368 4369/* speedy UTF-16 code point order comparison */ 4370/* gleaned from: */ 4371/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */ 4372 4373static short utf16Fixup[32] = 4374{ 4375 0, 0, 0, 0, 0, 0, 0, 0, 4376 0, 0, 0, 0, 0, 0, 0, 0, 4377 0, 0, 0, 0, 0, 0, 0, 0, 4378 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800 4379}; 4380 4381static int 4382unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2) 4383{ 4384 int len1, len2; 4385 4386 Py_UNICODE *s1 = str1->str; 4387 Py_UNICODE *s2 = str2->str; 4388 4389 len1 = str1->length; 4390 len2 = str2->length; 4391 4392 while (len1 > 0 && len2 > 0) { 4393 Py_UNICODE c1, c2; 4394 4395 c1 = *s1++; 4396 c2 = *s2++; 4397 4398 if (c1 > (1<<11) * 26) 4399 c1 += utf16Fixup[c1>>11]; 4400 if (c2 > (1<<11) * 26) 4401 c2 += utf16Fixup[c2>>11]; 4402 /* now c1 and c2 are in UTF-32-compatible order */ 4403 4404 if (c1 != c2) 4405 return (c1 < c2) ? -1 : 1; 4406 4407 len1--; len2--; 4408 } 4409 4410 return (len1 < len2) ? -1 : (len1 != len2); 4411} 4412 4413#else 4414 4415static int 4416unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2) 4417{ 4418 register int len1, len2; 4419 4420 Py_UNICODE *s1 = str1->str; 4421 Py_UNICODE *s2 = str2->str; 4422 4423 len1 = str1->length; 4424 len2 = str2->length; 4425 4426 while (len1 > 0 && len2 > 0) { 4427 Py_UNICODE c1, c2; 4428 4429 c1 = *s1++; 4430 c2 = *s2++; 4431 4432 if (c1 != c2) 4433 return (c1 < c2) ? -1 : 1; 4434 4435 len1--; len2--; 4436 } 4437 4438 return (len1 < len2) ? -1 : (len1 != len2); 4439} 4440 4441#endif 4442 4443int PyUnicode_Compare(PyObject *left, 4444 PyObject *right) 4445{ 4446 PyUnicodeObject *u = NULL, *v = NULL; 4447 int result; 4448 4449 /* Coerce the two arguments */ 4450 u = (PyUnicodeObject *)PyUnicode_FromObject(left); 4451 if (u == NULL) 4452 goto onError; 4453 v = (PyUnicodeObject *)PyUnicode_FromObject(right); 4454 if (v == NULL) 4455 goto onError; 4456 4457 /* Shortcut for empty or interned objects */ 4458 if (v == u) { 4459 Py_DECREF(u); 4460 Py_DECREF(v); 4461 return 0; 4462 } 4463 4464 result = unicode_compare(u, v); 4465 4466 Py_DECREF(u); 4467 Py_DECREF(v); 4468 return result; 4469 4470onError: 4471 Py_XDECREF(u); 4472 Py_XDECREF(v); 4473 return -1; 4474} 4475 4476int PyUnicode_Contains(PyObject *container, 4477 PyObject *element) 4478{ 4479 PyUnicodeObject *u = NULL, *v = NULL; 4480 int result, size; 4481 register const Py_UNICODE *lhs, *end, *rhs; 4482 4483 /* Coerce the two arguments */ 4484 v = (PyUnicodeObject *)PyUnicode_FromObject(element); 4485 if (v == NULL) { 4486 PyErr_SetString(PyExc_TypeError, 4487 "'in <string>' requires string as left operand"); 4488 goto onError; 4489 } 4490 u = (PyUnicodeObject *)PyUnicode_FromObject(container); 4491 if (u == NULL) 4492 goto onError; 4493 4494 size = PyUnicode_GET_SIZE(v); 4495 rhs = PyUnicode_AS_UNICODE(v); 4496 lhs = PyUnicode_AS_UNICODE(u); 4497 4498 result = 0; 4499 if (size == 1) { 4500 end = lhs + PyUnicode_GET_SIZE(u); 4501 while (lhs < end) { 4502 if (*lhs++ == *rhs) { 4503 result = 1; 4504 break; 4505 } 4506 } 4507 } 4508 else { 4509 end = lhs + (PyUnicode_GET_SIZE(u) - size); 4510 while (lhs <= end) { 4511 if (memcmp(lhs++, rhs, size * sizeof(Py_UNICODE)) == 0) { 4512 result = 1; 4513 break; 4514 } 4515 } 4516 } 4517 4518 Py_DECREF(u); 4519 Py_DECREF(v); 4520 return result; 4521 4522onError: 4523 Py_XDECREF(u); 4524 Py_XDECREF(v); 4525 return -1; 4526} 4527 4528/* Concat to string or Unicode object giving a new Unicode object. */ 4529 4530PyObject *PyUnicode_Concat(PyObject *left, 4531 PyObject *right) 4532{ 4533 PyUnicodeObject *u = NULL, *v = NULL, *w; 4534 4535 /* Coerce the two arguments */ 4536 u = (PyUnicodeObject *)PyUnicode_FromObject(left); 4537 if (u == NULL) 4538 goto onError; 4539 v = (PyUnicodeObject *)PyUnicode_FromObject(right); 4540 if (v == NULL) 4541 goto onError; 4542 4543 /* Shortcuts */ 4544 if (v == unicode_empty) { 4545 Py_DECREF(v); 4546 return (PyObject *)u; 4547 } 4548 if (u == unicode_empty) { 4549 Py_DECREF(u); 4550 return (PyObject *)v; 4551 } 4552 4553 /* Concat the two Unicode strings */ 4554 w = _PyUnicode_New(u->length + v->length); 4555 if (w == NULL) 4556 goto onError; 4557 Py_UNICODE_COPY(w->str, u->str, u->length); 4558 Py_UNICODE_COPY(w->str + u->length, v->str, v->length); 4559 4560 Py_DECREF(u); 4561 Py_DECREF(v); 4562 return (PyObject *)w; 4563 4564onError: 4565 Py_XDECREF(u); 4566 Py_XDECREF(v); 4567 return NULL; 4568} 4569 4570PyDoc_STRVAR(count__doc__, 4571"S.count(sub[, start[, end]]) -> int\n\ 4572\n\ 4573Return the number of occurrences of substring sub in Unicode string\n\ 4574S[start:end]. Optional arguments start and end are\n\ 4575interpreted as in slice notation."); 4576 4577static PyObject * 4578unicode_count(PyUnicodeObject *self, PyObject *args) 4579{ 4580 PyUnicodeObject *substring; 4581 int start = 0; 4582 int end = INT_MAX; 4583 PyObject *result; 4584 4585 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring, 4586 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end)) 4587 return NULL; 4588 4589 substring = (PyUnicodeObject *)PyUnicode_FromObject( 4590 (PyObject *)substring); 4591 if (substring == NULL) 4592 return NULL; 4593 4594 if (start < 0) 4595 start += self->length; 4596 if (start < 0) 4597 start = 0; 4598 if (end > self->length) 4599 end = self->length; 4600 if (end < 0) 4601 end += self->length; 4602 if (end < 0) 4603 end = 0; 4604 4605 result = PyInt_FromLong((long) count(self, start, end, substring)); 4606 4607 Py_DECREF(substring); 4608 return result; 4609} 4610 4611PyDoc_STRVAR(encode__doc__, 4612"S.encode([encoding[,errors]]) -> string\n\ 4613\n\ 4614Return an encoded string version of S. Default encoding is the current\n\ 4615default string encoding. errors may be given to set a different error\n\ 4616handling scheme. Default is 'strict' meaning that encoding errors raise\n\ 4617a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\ 4618'xmlcharrefreplace' as well as any other name registered with\n\ 4619codecs.register_error that can handle UnicodeEncodeErrors."); 4620 4621static PyObject * 4622unicode_encode(PyUnicodeObject *self, PyObject *args) 4623{ 4624 char *encoding = NULL; 4625 char *errors = NULL; 4626 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors)) 4627 return NULL; 4628 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors); 4629} 4630 4631PyDoc_STRVAR(expandtabs__doc__, 4632"S.expandtabs([tabsize]) -> unicode\n\ 4633\n\ 4634Return a copy of S where all tab characters are expanded using spaces.\n\ 4635If tabsize is not given, a tab size of 8 characters is assumed."); 4636 4637static PyObject* 4638unicode_expandtabs(PyUnicodeObject *self, PyObject *args) 4639{ 4640 Py_UNICODE *e; 4641 Py_UNICODE *p; 4642 Py_UNICODE *q; 4643 int i, j; 4644 PyUnicodeObject *u; 4645 int tabsize = 8; 4646 4647 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize)) 4648 return NULL; 4649 4650 /* First pass: determine size of output string */ 4651 i = j = 0; 4652 e = self->str + self->length; 4653 for (p = self->str; p < e; p++) 4654 if (*p == '\t') { 4655 if (tabsize > 0) 4656 j += tabsize - (j % tabsize); 4657 } 4658 else { 4659 j++; 4660 if (*p == '\n' || *p == '\r') { 4661 i += j; 4662 j = 0; 4663 } 4664 } 4665 4666 /* Second pass: create output string and fill it */ 4667 u = _PyUnicode_New(i + j); 4668 if (!u) 4669 return NULL; 4670 4671 j = 0; 4672 q = u->str; 4673 4674 for (p = self->str; p < e; p++) 4675 if (*p == '\t') { 4676 if (tabsize > 0) { 4677 i = tabsize - (j % tabsize); 4678 j += i; 4679 while (i--) 4680 *q++ = ' '; 4681 } 4682 } 4683 else { 4684 j++; 4685 *q++ = *p; 4686 if (*p == '\n' || *p == '\r') 4687 j = 0; 4688 } 4689 4690 return (PyObject*) u; 4691} 4692 4693PyDoc_STRVAR(find__doc__, 4694"S.find(sub [,start [,end]]) -> int\n\ 4695\n\ 4696Return the lowest index in S where substring sub is found,\n\ 4697such that sub is contained within s[start,end]. Optional\n\ 4698arguments start and end are interpreted as in slice notation.\n\ 4699\n\ 4700Return -1 on failure."); 4701 4702static PyObject * 4703unicode_find(PyUnicodeObject *self, PyObject *args) 4704{ 4705 PyUnicodeObject *substring; 4706 int start = 0; 4707 int end = INT_MAX; 4708 PyObject *result; 4709 4710 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring, 4711 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end)) 4712 return NULL; 4713 substring = (PyUnicodeObject *)PyUnicode_FromObject( 4714 (PyObject *)substring); 4715 if (substring == NULL) 4716 return NULL; 4717 4718 result = PyInt_FromLong(findstring(self, substring, start, end, 1)); 4719 4720 Py_DECREF(substring); 4721 return result; 4722} 4723 4724static PyObject * 4725unicode_getitem(PyUnicodeObject *self, int index) 4726{ 4727 if (index < 0 || index >= self->length) { 4728 PyErr_SetString(PyExc_IndexError, "string index out of range"); 4729 return NULL; 4730 } 4731 4732 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1); 4733} 4734 4735static long 4736unicode_hash(PyUnicodeObject *self) 4737{ 4738 /* Since Unicode objects compare equal to their ASCII string 4739 counterparts, they should use the individual character values 4740 as basis for their hash value. This is needed to assure that 4741 strings and Unicode objects behave in the same way as 4742 dictionary keys. */ 4743 4744 register int len; 4745 register Py_UNICODE *p; 4746 register long x; 4747 4748 if (self->hash != -1) 4749 return self->hash; 4750 len = PyUnicode_GET_SIZE(self); 4751 p = PyUnicode_AS_UNICODE(self); 4752 x = *p << 7; 4753 while (--len >= 0) 4754 x = (1000003*x) ^ *p++; 4755 x ^= PyUnicode_GET_SIZE(self); 4756 if (x == -1) 4757 x = -2; 4758 self->hash = x; 4759 return x; 4760} 4761 4762PyDoc_STRVAR(index__doc__, 4763"S.index(sub [,start [,end]]) -> int\n\ 4764\n\ 4765Like S.find() but raise ValueError when the substring is not found."); 4766 4767static PyObject * 4768unicode_index(PyUnicodeObject *self, PyObject *args) 4769{ 4770 int result; 4771 PyUnicodeObject *substring; 4772 int start = 0; 4773 int end = INT_MAX; 4774 4775 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring, 4776 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end)) 4777 return NULL; 4778 4779 substring = (PyUnicodeObject *)PyUnicode_FromObject( 4780 (PyObject *)substring); 4781 if (substring == NULL) 4782 return NULL; 4783 4784 result = findstring(self, substring, start, end, 1); 4785 4786 Py_DECREF(substring); 4787 if (result < 0) { 4788 PyErr_SetString(PyExc_ValueError, "substring not found"); 4789 return NULL; 4790 } 4791 return PyInt_FromLong(result); 4792} 4793 4794PyDoc_STRVAR(islower__doc__, 4795"S.islower() -> bool\n\ 4796\n\ 4797Return True if all cased characters in S are lowercase and there is\n\ 4798at least one cased character in S, False otherwise."); 4799 4800static PyObject* 4801unicode_islower(PyUnicodeObject *self) 4802{ 4803 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 4804 register const Py_UNICODE *e; 4805 int cased; 4806 4807 /* Shortcut for single character strings */ 4808 if (PyUnicode_GET_SIZE(self) == 1) 4809 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p)); 4810 4811 /* Special case for empty strings */ 4812 if (PyString_GET_SIZE(self) == 0) 4813 return PyBool_FromLong(0); 4814 4815 e = p + PyUnicode_GET_SIZE(self); 4816 cased = 0; 4817 for (; p < e; p++) { 4818 register const Py_UNICODE ch = *p; 4819 4820 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) 4821 return PyBool_FromLong(0); 4822 else if (!cased && Py_UNICODE_ISLOWER(ch)) 4823 cased = 1; 4824 } 4825 return PyBool_FromLong(cased); 4826} 4827 4828PyDoc_STRVAR(isupper__doc__, 4829"S.isupper() -> bool\n\ 4830\n\ 4831Return True if all cased characters in S are uppercase and there is\n\ 4832at least one cased character in S, False otherwise."); 4833 4834static PyObject* 4835unicode_isupper(PyUnicodeObject *self) 4836{ 4837 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 4838 register const Py_UNICODE *e; 4839 int cased; 4840 4841 /* Shortcut for single character strings */ 4842 if (PyUnicode_GET_SIZE(self) == 1) 4843 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0); 4844 4845 /* Special case for empty strings */ 4846 if (PyString_GET_SIZE(self) == 0) 4847 return PyBool_FromLong(0); 4848 4849 e = p + PyUnicode_GET_SIZE(self); 4850 cased = 0; 4851 for (; p < e; p++) { 4852 register const Py_UNICODE ch = *p; 4853 4854 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch)) 4855 return PyBool_FromLong(0); 4856 else if (!cased && Py_UNICODE_ISUPPER(ch)) 4857 cased = 1; 4858 } 4859 return PyBool_FromLong(cased); 4860} 4861 4862PyDoc_STRVAR(istitle__doc__, 4863"S.istitle() -> bool\n\ 4864\n\ 4865Return True if S is a titlecased string, i.e. upper- and titlecase\n\ 4866characters may only follow uncased characters and lowercase characters\n\ 4867only cased ones. Return False otherwise."); 4868 4869static PyObject* 4870unicode_istitle(PyUnicodeObject *self) 4871{ 4872 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 4873 register const Py_UNICODE *e; 4874 int cased, previous_is_cased; 4875 4876 /* Shortcut for single character strings */ 4877 if (PyUnicode_GET_SIZE(self) == 1) 4878 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) || 4879 (Py_UNICODE_ISUPPER(*p) != 0)); 4880 4881 /* Special case for empty strings */ 4882 if (PyString_GET_SIZE(self) == 0) 4883 return PyBool_FromLong(0); 4884 4885 e = p + PyUnicode_GET_SIZE(self); 4886 cased = 0; 4887 previous_is_cased = 0; 4888 for (; p < e; p++) { 4889 register const Py_UNICODE ch = *p; 4890 4891 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) { 4892 if (previous_is_cased) 4893 return PyBool_FromLong(0); 4894 previous_is_cased = 1; 4895 cased = 1; 4896 } 4897 else if (Py_UNICODE_ISLOWER(ch)) { 4898 if (!previous_is_cased) 4899 return PyBool_FromLong(0); 4900 previous_is_cased = 1; 4901 cased = 1; 4902 } 4903 else 4904 previous_is_cased = 0; 4905 } 4906 return PyBool_FromLong(cased); 4907} 4908 4909PyDoc_STRVAR(isspace__doc__, 4910"S.isspace() -> bool\n\ 4911\n\ 4912Return True if there are only whitespace characters in S,\n\ 4913False otherwise."); 4914 4915static PyObject* 4916unicode_isspace(PyUnicodeObject *self) 4917{ 4918 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 4919 register const Py_UNICODE *e; 4920 4921 /* Shortcut for single character strings */ 4922 if (PyUnicode_GET_SIZE(self) == 1 && 4923 Py_UNICODE_ISSPACE(*p)) 4924 return PyBool_FromLong(1); 4925 4926 /* Special case for empty strings */ 4927 if (PyString_GET_SIZE(self) == 0) 4928 return PyBool_FromLong(0); 4929 4930 e = p + PyUnicode_GET_SIZE(self); 4931 for (; p < e; p++) { 4932 if (!Py_UNICODE_ISSPACE(*p)) 4933 return PyBool_FromLong(0); 4934 } 4935 return PyBool_FromLong(1); 4936} 4937 4938PyDoc_STRVAR(isalpha__doc__, 4939"S.isalpha() -> bool\n\ 4940\n\ 4941Return True if all characters in S are alphabetic\n\ 4942and there is at least one character in S, False otherwise."); 4943 4944static PyObject* 4945unicode_isalpha(PyUnicodeObject *self) 4946{ 4947 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 4948 register const Py_UNICODE *e; 4949 4950 /* Shortcut for single character strings */ 4951 if (PyUnicode_GET_SIZE(self) == 1 && 4952 Py_UNICODE_ISALPHA(*p)) 4953 return PyBool_FromLong(1); 4954 4955 /* Special case for empty strings */ 4956 if (PyString_GET_SIZE(self) == 0) 4957 return PyBool_FromLong(0); 4958 4959 e = p + PyUnicode_GET_SIZE(self); 4960 for (; p < e; p++) { 4961 if (!Py_UNICODE_ISALPHA(*p)) 4962 return PyBool_FromLong(0); 4963 } 4964 return PyBool_FromLong(1); 4965} 4966 4967PyDoc_STRVAR(isalnum__doc__, 4968"S.isalnum() -> bool\n\ 4969\n\ 4970Return True if all characters in S are alphanumeric\n\ 4971and there is at least one character in S, False otherwise."); 4972 4973static PyObject* 4974unicode_isalnum(PyUnicodeObject *self) 4975{ 4976 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 4977 register const Py_UNICODE *e; 4978 4979 /* Shortcut for single character strings */ 4980 if (PyUnicode_GET_SIZE(self) == 1 && 4981 Py_UNICODE_ISALNUM(*p)) 4982 return PyBool_FromLong(1); 4983 4984 /* Special case for empty strings */ 4985 if (PyString_GET_SIZE(self) == 0) 4986 return PyBool_FromLong(0); 4987 4988 e = p + PyUnicode_GET_SIZE(self); 4989 for (; p < e; p++) { 4990 if (!Py_UNICODE_ISALNUM(*p)) 4991 return PyBool_FromLong(0); 4992 } 4993 return PyBool_FromLong(1); 4994} 4995 4996PyDoc_STRVAR(isdecimal__doc__, 4997"S.isdecimal() -> bool\n\ 4998\n\ 4999Return True if there are only decimal characters in S,\n\ 5000False otherwise."); 5001 5002static PyObject* 5003unicode_isdecimal(PyUnicodeObject *self) 5004{ 5005 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 5006 register const Py_UNICODE *e; 5007 5008 /* Shortcut for single character strings */ 5009 if (PyUnicode_GET_SIZE(self) == 1 && 5010 Py_UNICODE_ISDECIMAL(*p)) 5011 return PyBool_FromLong(1); 5012 5013 /* Special case for empty strings */ 5014 if (PyString_GET_SIZE(self) == 0) 5015 return PyBool_FromLong(0); 5016 5017 e = p + PyUnicode_GET_SIZE(self); 5018 for (; p < e; p++) { 5019 if (!Py_UNICODE_ISDECIMAL(*p)) 5020 return PyBool_FromLong(0); 5021 } 5022 return PyBool_FromLong(1); 5023} 5024 5025PyDoc_STRVAR(isdigit__doc__, 5026"S.isdigit() -> bool\n\ 5027\n\ 5028Return True if there are only digit characters in S,\n\ 5029False otherwise."); 5030 5031static PyObject* 5032unicode_isdigit(PyUnicodeObject *self) 5033{ 5034 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 5035 register const Py_UNICODE *e; 5036 5037 /* Shortcut for single character strings */ 5038 if (PyUnicode_GET_SIZE(self) == 1 && 5039 Py_UNICODE_ISDIGIT(*p)) 5040 return PyBool_FromLong(1); 5041 5042 /* Special case for empty strings */ 5043 if (PyString_GET_SIZE(self) == 0) 5044 return PyBool_FromLong(0); 5045 5046 e = p + PyUnicode_GET_SIZE(self); 5047 for (; p < e; p++) { 5048 if (!Py_UNICODE_ISDIGIT(*p)) 5049 return PyBool_FromLong(0); 5050 } 5051 return PyBool_FromLong(1); 5052} 5053 5054PyDoc_STRVAR(isnumeric__doc__, 5055"S.isnumeric() -> bool\n\ 5056\n\ 5057Return True if there are only numeric characters in S,\n\ 5058False otherwise."); 5059 5060static PyObject* 5061unicode_isnumeric(PyUnicodeObject *self) 5062{ 5063 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 5064 register const Py_UNICODE *e; 5065 5066 /* Shortcut for single character strings */ 5067 if (PyUnicode_GET_SIZE(self) == 1 && 5068 Py_UNICODE_ISNUMERIC(*p)) 5069 return PyBool_FromLong(1); 5070 5071 /* Special case for empty strings */ 5072 if (PyString_GET_SIZE(self) == 0) 5073 return PyBool_FromLong(0); 5074 5075 e = p + PyUnicode_GET_SIZE(self); 5076 for (; p < e; p++) { 5077 if (!Py_UNICODE_ISNUMERIC(*p)) 5078 return PyBool_FromLong(0); 5079 } 5080 return PyBool_FromLong(1); 5081} 5082 5083PyDoc_STRVAR(join__doc__, 5084"S.join(sequence) -> unicode\n\ 5085\n\ 5086Return a string which is the concatenation of the strings in the\n\ 5087sequence. The separator between elements is S."); 5088 5089static PyObject* 5090unicode_join(PyObject *self, PyObject *data) 5091{ 5092 return PyUnicode_Join(self, data); 5093} 5094 5095static int 5096unicode_length(PyUnicodeObject *self) 5097{ 5098 return self->length; 5099} 5100 5101PyDoc_STRVAR(ljust__doc__, 5102"S.ljust(width) -> unicode\n\ 5103\n\ 5104Return S left justified in a Unicode string of length width. Padding is\n\ 5105done using spaces."); 5106 5107static PyObject * 5108unicode_ljust(PyUnicodeObject *self, PyObject *args) 5109{ 5110 int width; 5111 if (!PyArg_ParseTuple(args, "i:ljust", &width)) 5112 return NULL; 5113 5114 if (self->length >= width && PyUnicode_CheckExact(self)) { 5115 Py_INCREF(self); 5116 return (PyObject*) self; 5117 } 5118 5119 return (PyObject*) pad(self, 0, width - self->length, ' '); 5120} 5121 5122PyDoc_STRVAR(lower__doc__, 5123"S.lower() -> unicode\n\ 5124\n\ 5125Return a copy of the string S converted to lowercase."); 5126 5127static PyObject* 5128unicode_lower(PyUnicodeObject *self) 5129{ 5130 return fixup(self, fixlower); 5131} 5132 5133#define LEFTSTRIP 0 5134#define RIGHTSTRIP 1 5135#define BOTHSTRIP 2 5136 5137/* Arrays indexed by above */ 5138static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"}; 5139 5140#define STRIPNAME(i) (stripformat[i]+3) 5141 5142static const Py_UNICODE * 5143unicode_memchr(const Py_UNICODE *s, Py_UNICODE c, size_t n) 5144{ 5145 size_t i; 5146 for (i = 0; i < n; ++i) 5147 if (s[i] == c) 5148 return s+i; 5149 return NULL; 5150} 5151 5152/* externally visible for str.strip(unicode) */ 5153PyObject * 5154_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj) 5155{ 5156 Py_UNICODE *s = PyUnicode_AS_UNICODE(self); 5157 int len = PyUnicode_GET_SIZE(self); 5158 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj); 5159 int seplen = PyUnicode_GET_SIZE(sepobj); 5160 int i, j; 5161 5162 i = 0; 5163 if (striptype != RIGHTSTRIP) { 5164 while (i < len && unicode_memchr(sep, s[i], seplen)) { 5165 i++; 5166 } 5167 } 5168 5169 j = len; 5170 if (striptype != LEFTSTRIP) { 5171 do { 5172 j--; 5173 } while (j >= i && unicode_memchr(sep, s[j], seplen)); 5174 j++; 5175 } 5176 5177 if (i == 0 && j == len && PyUnicode_CheckExact(self)) { 5178 Py_INCREF(self); 5179 return (PyObject*)self; 5180 } 5181 else 5182 return PyUnicode_FromUnicode(s+i, j-i); 5183} 5184 5185 5186static PyObject * 5187do_strip(PyUnicodeObject *self, int striptype) 5188{ 5189 Py_UNICODE *s = PyUnicode_AS_UNICODE(self); 5190 int len = PyUnicode_GET_SIZE(self), i, j; 5191 5192 i = 0; 5193 if (striptype != RIGHTSTRIP) { 5194 while (i < len && Py_UNICODE_ISSPACE(s[i])) { 5195 i++; 5196 } 5197 } 5198 5199 j = len; 5200 if (striptype != LEFTSTRIP) { 5201 do { 5202 j--; 5203 } while (j >= i && Py_UNICODE_ISSPACE(s[j])); 5204 j++; 5205 } 5206 5207 if (i == 0 && j == len && PyUnicode_CheckExact(self)) { 5208 Py_INCREF(self); 5209 return (PyObject*)self; 5210 } 5211 else 5212 return PyUnicode_FromUnicode(s+i, j-i); 5213} 5214 5215 5216static PyObject * 5217do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args) 5218{ 5219 PyObject *sep = NULL; 5220 5221 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep)) 5222 return NULL; 5223 5224 if (sep != NULL && sep != Py_None) { 5225 if (PyUnicode_Check(sep)) 5226 return _PyUnicode_XStrip(self, striptype, sep); 5227 else if (PyString_Check(sep)) { 5228 PyObject *res; 5229 sep = PyUnicode_FromObject(sep); 5230 if (sep==NULL) 5231 return NULL; 5232 res = _PyUnicode_XStrip(self, striptype, sep); 5233 Py_DECREF(sep); 5234 return res; 5235 } 5236 else { 5237 PyErr_Format(PyExc_TypeError, 5238 "%s arg must be None, unicode or str", 5239 STRIPNAME(striptype)); 5240 return NULL; 5241 } 5242 } 5243 5244 return do_strip(self, striptype); 5245} 5246 5247 5248PyDoc_STRVAR(strip__doc__, 5249"S.strip([sep]) -> unicode\n\ 5250\n\ 5251Return a copy of the string S with leading and trailing\n\ 5252whitespace removed.\n\ 5253If sep is given and not None, remove characters in sep instead.\n\ 5254If sep is a str, it will be converted to unicode before stripping"); 5255 5256static PyObject * 5257unicode_strip(PyUnicodeObject *self, PyObject *args) 5258{ 5259 if (PyTuple_GET_SIZE(args) == 0) 5260 return do_strip(self, BOTHSTRIP); /* Common case */ 5261 else 5262 return do_argstrip(self, BOTHSTRIP, args); 5263} 5264 5265 5266PyDoc_STRVAR(lstrip__doc__, 5267"S.lstrip([sep]) -> unicode\n\ 5268\n\ 5269Return a copy of the string S with leading whitespace removed.\n\ 5270If sep is given and not None, remove characters in sep instead.\n\ 5271If sep is a str, it will be converted to unicode before stripping"); 5272 5273static PyObject * 5274unicode_lstrip(PyUnicodeObject *self, PyObject *args) 5275{ 5276 if (PyTuple_GET_SIZE(args) == 0) 5277 return do_strip(self, LEFTSTRIP); /* Common case */ 5278 else 5279 return do_argstrip(self, LEFTSTRIP, args); 5280} 5281 5282 5283PyDoc_STRVAR(rstrip__doc__, 5284"S.rstrip([sep]) -> unicode\n\ 5285\n\ 5286Return a copy of the string S with trailing whitespace removed.\n\ 5287If sep is given and not None, remove characters in sep instead.\n\ 5288If sep is a str, it will be converted to unicode before stripping"); 5289 5290static PyObject * 5291unicode_rstrip(PyUnicodeObject *self, PyObject *args) 5292{ 5293 if (PyTuple_GET_SIZE(args) == 0) 5294 return do_strip(self, RIGHTSTRIP); /* Common case */ 5295 else 5296 return do_argstrip(self, RIGHTSTRIP, args); 5297} 5298 5299 5300static PyObject* 5301unicode_repeat(PyUnicodeObject *str, int len) 5302{ 5303 PyUnicodeObject *u; 5304 Py_UNICODE *p; 5305 int nchars; 5306 size_t nbytes; 5307 5308 if (len < 0) 5309 len = 0; 5310 5311 if (len == 1 && PyUnicode_CheckExact(str)) { 5312 /* no repeat, return original string */ 5313 Py_INCREF(str); 5314 return (PyObject*) str; 5315 } 5316 5317 /* ensure # of chars needed doesn't overflow int and # of bytes 5318 * needed doesn't overflow size_t 5319 */ 5320 nchars = len * str->length; 5321 if (len && nchars / len != str->length) { 5322 PyErr_SetString(PyExc_OverflowError, 5323 "repeated string is too long"); 5324 return NULL; 5325 } 5326 nbytes = (nchars + 1) * sizeof(Py_UNICODE); 5327 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) { 5328 PyErr_SetString(PyExc_OverflowError, 5329 "repeated string is too long"); 5330 return NULL; 5331 } 5332 u = _PyUnicode_New(nchars); 5333 if (!u) 5334 return NULL; 5335 5336 p = u->str; 5337 5338 while (len-- > 0) { 5339 Py_UNICODE_COPY(p, str->str, str->length); 5340 p += str->length; 5341 } 5342 5343 return (PyObject*) u; 5344} 5345 5346PyObject *PyUnicode_Replace(PyObject *obj, 5347 PyObject *subobj, 5348 PyObject *replobj, 5349 int maxcount) 5350{ 5351 PyObject *self; 5352 PyObject *str1; 5353 PyObject *str2; 5354 PyObject *result; 5355 5356 self = PyUnicode_FromObject(obj); 5357 if (self == NULL) 5358 return NULL; 5359 str1 = PyUnicode_FromObject(subobj); 5360 if (str1 == NULL) { 5361 Py_DECREF(self); 5362 return NULL; 5363 } 5364 str2 = PyUnicode_FromObject(replobj); 5365 if (str2 == NULL) { 5366 Py_DECREF(self); 5367 Py_DECREF(str1); 5368 return NULL; 5369 } 5370 result = replace((PyUnicodeObject *)self, 5371 (PyUnicodeObject *)str1, 5372 (PyUnicodeObject *)str2, 5373 maxcount); 5374 Py_DECREF(self); 5375 Py_DECREF(str1); 5376 Py_DECREF(str2); 5377 return result; 5378} 5379 5380PyDoc_STRVAR(replace__doc__, 5381"S.replace (old, new[, maxsplit]) -> unicode\n\ 5382\n\ 5383Return a copy of S with all occurrences of substring\n\ 5384old replaced by new. If the optional argument maxsplit is\n\ 5385given, only the first maxsplit occurrences are replaced."); 5386 5387static PyObject* 5388unicode_replace(PyUnicodeObject *self, PyObject *args) 5389{ 5390 PyUnicodeObject *str1; 5391 PyUnicodeObject *str2; 5392 int maxcount = -1; 5393 PyObject *result; 5394 5395 if (!PyArg_ParseTuple(args, "OO|i:replace", &str1, &str2, &maxcount)) 5396 return NULL; 5397 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1); 5398 if (str1 == NULL) 5399 return NULL; 5400 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2); 5401 if (str2 == NULL) { 5402 Py_DECREF(str1); 5403 return NULL; 5404 } 5405 5406 result = replace(self, str1, str2, maxcount); 5407 5408 Py_DECREF(str1); 5409 Py_DECREF(str2); 5410 return result; 5411} 5412 5413static 5414PyObject *unicode_repr(PyObject *unicode) 5415{ 5416 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode), 5417 PyUnicode_GET_SIZE(unicode), 5418 1); 5419} 5420 5421PyDoc_STRVAR(rfind__doc__, 5422"S.rfind(sub [,start [,end]]) -> int\n\ 5423\n\ 5424Return the highest index in S where substring sub is found,\n\ 5425such that sub is contained within s[start,end]. Optional\n\ 5426arguments start and end are interpreted as in slice notation.\n\ 5427\n\ 5428Return -1 on failure."); 5429 5430static PyObject * 5431unicode_rfind(PyUnicodeObject *self, PyObject *args) 5432{ 5433 PyUnicodeObject *substring; 5434 int start = 0; 5435 int end = INT_MAX; 5436 PyObject *result; 5437 5438 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring, 5439 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end)) 5440 return NULL; 5441 substring = (PyUnicodeObject *)PyUnicode_FromObject( 5442 (PyObject *)substring); 5443 if (substring == NULL) 5444 return NULL; 5445 5446 result = PyInt_FromLong(findstring(self, substring, start, end, -1)); 5447 5448 Py_DECREF(substring); 5449 return result; 5450} 5451 5452PyDoc_STRVAR(rindex__doc__, 5453"S.rindex(sub [,start [,end]]) -> int\n\ 5454\n\ 5455Like S.rfind() but raise ValueError when the substring is not found."); 5456 5457static PyObject * 5458unicode_rindex(PyUnicodeObject *self, PyObject *args) 5459{ 5460 int result; 5461 PyUnicodeObject *substring; 5462 int start = 0; 5463 int end = INT_MAX; 5464 5465 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring, 5466 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end)) 5467 return NULL; 5468 substring = (PyUnicodeObject *)PyUnicode_FromObject( 5469 (PyObject *)substring); 5470 if (substring == NULL) 5471 return NULL; 5472 5473 result = findstring(self, substring, start, end, -1); 5474 5475 Py_DECREF(substring); 5476 if (result < 0) { 5477 PyErr_SetString(PyExc_ValueError, "substring not found"); 5478 return NULL; 5479 } 5480 return PyInt_FromLong(result); 5481} 5482 5483PyDoc_STRVAR(rjust__doc__, 5484"S.rjust(width) -> unicode\n\ 5485\n\ 5486Return S right justified in a Unicode string of length width. Padding is\n\ 5487done using spaces."); 5488 5489static PyObject * 5490unicode_rjust(PyUnicodeObject *self, PyObject *args) 5491{ 5492 int width; 5493 if (!PyArg_ParseTuple(args, "i:rjust", &width)) 5494 return NULL; 5495 5496 if (self->length >= width && PyUnicode_CheckExact(self)) { 5497 Py_INCREF(self); 5498 return (PyObject*) self; 5499 } 5500 5501 return (PyObject*) pad(self, width - self->length, 0, ' '); 5502} 5503 5504static PyObject* 5505unicode_slice(PyUnicodeObject *self, int start, int end) 5506{ 5507 /* standard clamping */ 5508 if (start < 0) 5509 start = 0; 5510 if (end < 0) 5511 end = 0; 5512 if (end > self->length) 5513 end = self->length; 5514 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) { 5515 /* full slice, return original string */ 5516 Py_INCREF(self); 5517 return (PyObject*) self; 5518 } 5519 if (start > end) 5520 start = end; 5521 /* copy slice */ 5522 return (PyObject*) PyUnicode_FromUnicode(self->str + start, 5523 end - start); 5524} 5525 5526PyObject *PyUnicode_Split(PyObject *s, 5527 PyObject *sep, 5528 int maxsplit) 5529{ 5530 PyObject *result; 5531 5532 s = PyUnicode_FromObject(s); 5533 if (s == NULL) 5534 return NULL; 5535 if (sep != NULL) { 5536 sep = PyUnicode_FromObject(sep); 5537 if (sep == NULL) { 5538 Py_DECREF(s); 5539 return NULL; 5540 } 5541 } 5542 5543 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit); 5544 5545 Py_DECREF(s); 5546 Py_XDECREF(sep); 5547 return result; 5548} 5549 5550PyDoc_STRVAR(split__doc__, 5551"S.split([sep [,maxsplit]]) -> list of strings\n\ 5552\n\ 5553Return a list of the words in S, using sep as the\n\ 5554delimiter string. If maxsplit is given, at most maxsplit\n\ 5555splits are done. If sep is not specified, any whitespace string\n\ 5556is a separator."); 5557 5558static PyObject* 5559unicode_split(PyUnicodeObject *self, PyObject *args) 5560{ 5561 PyObject *substring = Py_None; 5562 int maxcount = -1; 5563 5564 if (!PyArg_ParseTuple(args, "|Oi:split", &substring, &maxcount)) 5565 return NULL; 5566 5567 if (substring == Py_None) 5568 return split(self, NULL, maxcount); 5569 else if (PyUnicode_Check(substring)) 5570 return split(self, (PyUnicodeObject *)substring, maxcount); 5571 else 5572 return PyUnicode_Split((PyObject *)self, substring, maxcount); 5573} 5574 5575PyDoc_STRVAR(splitlines__doc__, 5576"S.splitlines([keepends]]) -> list of strings\n\ 5577\n\ 5578Return a list of the lines in S, breaking at line boundaries.\n\ 5579Line breaks are not included in the resulting list unless keepends\n\ 5580is given and true."); 5581 5582static PyObject* 5583unicode_splitlines(PyUnicodeObject *self, PyObject *args) 5584{ 5585 int keepends = 0; 5586 5587 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends)) 5588 return NULL; 5589 5590 return PyUnicode_Splitlines((PyObject *)self, keepends); 5591} 5592 5593static 5594PyObject *unicode_str(PyUnicodeObject *self) 5595{ 5596 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL); 5597} 5598 5599PyDoc_STRVAR(swapcase__doc__, 5600"S.swapcase() -> unicode\n\ 5601\n\ 5602Return a copy of S with uppercase characters converted to lowercase\n\ 5603and vice versa."); 5604 5605static PyObject* 5606unicode_swapcase(PyUnicodeObject *self) 5607{ 5608 return fixup(self, fixswapcase); 5609} 5610 5611PyDoc_STRVAR(translate__doc__, 5612"S.translate(table) -> unicode\n\ 5613\n\ 5614Return a copy of the string S, where all characters have been mapped\n\ 5615through the given translation table, which must be a mapping of\n\ 5616Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\ 5617Unmapped characters are left untouched. Characters mapped to None\n\ 5618are deleted."); 5619 5620static PyObject* 5621unicode_translate(PyUnicodeObject *self, PyObject *table) 5622{ 5623 return PyUnicode_TranslateCharmap(self->str, 5624 self->length, 5625 table, 5626 "ignore"); 5627} 5628 5629PyDoc_STRVAR(upper__doc__, 5630"S.upper() -> unicode\n\ 5631\n\ 5632Return a copy of S converted to uppercase."); 5633 5634static PyObject* 5635unicode_upper(PyUnicodeObject *self) 5636{ 5637 return fixup(self, fixupper); 5638} 5639 5640PyDoc_STRVAR(zfill__doc__, 5641"S.zfill(width) -> unicode\n\ 5642\n\ 5643Pad a numeric string x with zeros on the left, to fill a field\n\ 5644of the specified width. The string x is never truncated."); 5645 5646static PyObject * 5647unicode_zfill(PyUnicodeObject *self, PyObject *args) 5648{ 5649 int fill; 5650 PyUnicodeObject *u; 5651 5652 int width; 5653 if (!PyArg_ParseTuple(args, "i:zfill", &width)) 5654 return NULL; 5655 5656 if (self->length >= width) { 5657 if (PyUnicode_CheckExact(self)) { 5658 Py_INCREF(self); 5659 return (PyObject*) self; 5660 } 5661 else 5662 return PyUnicode_FromUnicode( 5663 PyUnicode_AS_UNICODE(self), 5664 PyUnicode_GET_SIZE(self) 5665 ); 5666 } 5667 5668 fill = width - self->length; 5669 5670 u = pad(self, fill, 0, '0'); 5671 5672 if (u == NULL) 5673 return NULL; 5674 5675 if (u->str[fill] == '+' || u->str[fill] == '-') { 5676 /* move sign to beginning of string */ 5677 u->str[0] = u->str[fill]; 5678 u->str[fill] = '0'; 5679 } 5680 5681 return (PyObject*) u; 5682} 5683 5684#if 0 5685static PyObject* 5686unicode_freelistsize(PyUnicodeObject *self) 5687{ 5688 return PyInt_FromLong(unicode_freelist_size); 5689} 5690#endif 5691 5692PyDoc_STRVAR(startswith__doc__, 5693"S.startswith(prefix[, start[, end]]) -> bool\n\ 5694\n\ 5695Return True if S starts with the specified prefix, False otherwise. With\n\ 5696optional start, test S beginning at that position. With optional end, stop\n\ 5697comparing S at that position."); 5698 5699static PyObject * 5700unicode_startswith(PyUnicodeObject *self, 5701 PyObject *args) 5702{ 5703 PyUnicodeObject *substring; 5704 int start = 0; 5705 int end = INT_MAX; 5706 PyObject *result; 5707 5708 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &substring, 5709 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end)) 5710 return NULL; 5711 substring = (PyUnicodeObject *)PyUnicode_FromObject( 5712 (PyObject *)substring); 5713 if (substring == NULL) 5714 return NULL; 5715 5716 result = PyBool_FromLong(tailmatch(self, substring, start, end, -1)); 5717 5718 Py_DECREF(substring); 5719 return result; 5720} 5721 5722 5723PyDoc_STRVAR(endswith__doc__, 5724"S.endswith(suffix[, start[, end]]) -> bool\n\ 5725\n\ 5726Return True if S ends with the specified suffix, False otherwise. With\n\ 5727optional start, test S beginning at that position. With optional end, stop\n\ 5728comparing S at that position."); 5729 5730static PyObject * 5731unicode_endswith(PyUnicodeObject *self, 5732 PyObject *args) 5733{ 5734 PyUnicodeObject *substring; 5735 int start = 0; 5736 int end = INT_MAX; 5737 PyObject *result; 5738 5739 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &substring, 5740 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end)) 5741 return NULL; 5742 substring = (PyUnicodeObject *)PyUnicode_FromObject( 5743 (PyObject *)substring); 5744 if (substring == NULL) 5745 return NULL; 5746 5747 result = PyBool_FromLong(tailmatch(self, substring, start, end, +1)); 5748 5749 Py_DECREF(substring); 5750 return result; 5751} 5752 5753 5754 5755static PyObject * 5756unicode_getnewargs(PyUnicodeObject *v) 5757{ 5758 return Py_BuildValue("(u#)", v->str, v->length); 5759} 5760 5761 5762static PyMethodDef unicode_methods[] = { 5763 5764 /* Order is according to common usage: often used methods should 5765 appear first, since lookup is done sequentially. */ 5766 5767 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__}, 5768 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__}, 5769 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__}, 5770 {"join", (PyCFunction) unicode_join, METH_O, join__doc__}, 5771 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__}, 5772 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__}, 5773 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__}, 5774 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__}, 5775 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__}, 5776 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__}, 5777 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__}, 5778 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__}, 5779 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__}, 5780 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__}, 5781/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */ 5782 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__}, 5783 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__}, 5784 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__}, 5785 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__}, 5786 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__}, 5787 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__}, 5788 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__}, 5789 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__}, 5790 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__}, 5791 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__}, 5792 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__}, 5793 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__}, 5794 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__}, 5795 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__}, 5796 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__}, 5797 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__}, 5798 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__}, 5799 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__}, 5800 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__}, 5801 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__}, 5802 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__}, 5803#if 0 5804 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__}, 5805#endif 5806 5807#if 0 5808 /* This one is just used for debugging the implementation. */ 5809 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS}, 5810#endif 5811 5812 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS}, 5813 {NULL, NULL} 5814}; 5815 5816static PyObject * 5817unicode_mod(PyObject *v, PyObject *w) 5818{ 5819 if (!PyUnicode_Check(v)) { 5820 Py_INCREF(Py_NotImplemented); 5821 return Py_NotImplemented; 5822 } 5823 return PyUnicode_Format(v, w); 5824} 5825 5826static PyNumberMethods unicode_as_number = { 5827 0, /*nb_add*/ 5828 0, /*nb_subtract*/ 5829 0, /*nb_multiply*/ 5830 0, /*nb_divide*/ 5831 unicode_mod, /*nb_remainder*/ 5832}; 5833 5834static PySequenceMethods unicode_as_sequence = { 5835 (inquiry) unicode_length, /* sq_length */ 5836 (binaryfunc) PyUnicode_Concat, /* sq_concat */ 5837 (intargfunc) unicode_repeat, /* sq_repeat */ 5838 (intargfunc) unicode_getitem, /* sq_item */ 5839 (intintargfunc) unicode_slice, /* sq_slice */ 5840 0, /* sq_ass_item */ 5841 0, /* sq_ass_slice */ 5842 (objobjproc)PyUnicode_Contains, /*sq_contains*/ 5843}; 5844 5845static PyObject* 5846unicode_subscript(PyUnicodeObject* self, PyObject* item) 5847{ 5848 if (PyInt_Check(item)) { 5849 long i = PyInt_AS_LONG(item); 5850 if (i < 0) 5851 i += PyString_GET_SIZE(self); 5852 return unicode_getitem(self, i); 5853 } else if (PyLong_Check(item)) { 5854 long i = PyLong_AsLong(item); 5855 if (i == -1 && PyErr_Occurred()) 5856 return NULL; 5857 if (i < 0) 5858 i += PyString_GET_SIZE(self); 5859 return unicode_getitem(self, i); 5860 } else if (PySlice_Check(item)) { 5861 int start, stop, step, slicelength, cur, i; 5862 Py_UNICODE* source_buf; 5863 Py_UNICODE* result_buf; 5864 PyObject* result; 5865 5866 if (PySlice_GetIndicesEx((PySliceObject*)item, PyString_GET_SIZE(self), 5867 &start, &stop, &step, &slicelength) < 0) { 5868 return NULL; 5869 } 5870 5871 if (slicelength <= 0) { 5872 return PyUnicode_FromUnicode(NULL, 0); 5873 } else { 5874 source_buf = PyUnicode_AS_UNICODE((PyObject*)self); 5875 result_buf = PyMem_MALLOC(slicelength*sizeof(Py_UNICODE)); 5876 5877 for (cur = start, i = 0; i < slicelength; cur += step, i++) { 5878 result_buf[i] = source_buf[cur]; 5879 } 5880 5881 result = PyUnicode_FromUnicode(result_buf, slicelength); 5882 PyMem_FREE(result_buf); 5883 return result; 5884 } 5885 } else { 5886 PyErr_SetString(PyExc_TypeError, "string indices must be integers"); 5887 return NULL; 5888 } 5889} 5890 5891static PyMappingMethods unicode_as_mapping = { 5892 (inquiry)unicode_length, /* mp_length */ 5893 (binaryfunc)unicode_subscript, /* mp_subscript */ 5894 (objobjargproc)0, /* mp_ass_subscript */ 5895}; 5896 5897static int 5898unicode_buffer_getreadbuf(PyUnicodeObject *self, 5899 int index, 5900 const void **ptr) 5901{ 5902 if (index != 0) { 5903 PyErr_SetString(PyExc_SystemError, 5904 "accessing non-existent unicode segment"); 5905 return -1; 5906 } 5907 *ptr = (void *) self->str; 5908 return PyUnicode_GET_DATA_SIZE(self); 5909} 5910 5911static int 5912unicode_buffer_getwritebuf(PyUnicodeObject *self, int index, 5913 const void **ptr) 5914{ 5915 PyErr_SetString(PyExc_TypeError, 5916 "cannot use unicode as modifiable buffer"); 5917 return -1; 5918} 5919 5920static int 5921unicode_buffer_getsegcount(PyUnicodeObject *self, 5922 int *lenp) 5923{ 5924 if (lenp) 5925 *lenp = PyUnicode_GET_DATA_SIZE(self); 5926 return 1; 5927} 5928 5929static int 5930unicode_buffer_getcharbuf(PyUnicodeObject *self, 5931 int index, 5932 const void **ptr) 5933{ 5934 PyObject *str; 5935 5936 if (index != 0) { 5937 PyErr_SetString(PyExc_SystemError, 5938 "accessing non-existent unicode segment"); 5939 return -1; 5940 } 5941 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL); 5942 if (str == NULL) 5943 return -1; 5944 *ptr = (void *) PyString_AS_STRING(str); 5945 return PyString_GET_SIZE(str); 5946} 5947 5948/* Helpers for PyUnicode_Format() */ 5949 5950static PyObject * 5951getnextarg(PyObject *args, int arglen, int *p_argidx) 5952{ 5953 int argidx = *p_argidx; 5954 if (argidx < arglen) { 5955 (*p_argidx)++; 5956 if (arglen < 0) 5957 return args; 5958 else 5959 return PyTuple_GetItem(args, argidx); 5960 } 5961 PyErr_SetString(PyExc_TypeError, 5962 "not enough arguments for format string"); 5963 return NULL; 5964} 5965 5966#define F_LJUST (1<<0) 5967#define F_SIGN (1<<1) 5968#define F_BLANK (1<<2) 5969#define F_ALT (1<<3) 5970#define F_ZERO (1<<4) 5971 5972static 5973int usprintf(register Py_UNICODE *buffer, char *format, ...) 5974{ 5975 register int i; 5976 int len; 5977 va_list va; 5978 char *charbuffer; 5979 va_start(va, format); 5980 5981 /* First, format the string as char array, then expand to Py_UNICODE 5982 array. */ 5983 charbuffer = (char *)buffer; 5984 len = vsprintf(charbuffer, format, va); 5985 for (i = len - 1; i >= 0; i--) 5986 buffer[i] = (Py_UNICODE) charbuffer[i]; 5987 5988 va_end(va); 5989 return len; 5990} 5991 5992/* XXX To save some code duplication, formatfloat/long/int could have been 5993 shared with stringobject.c, converting from 8-bit to Unicode after the 5994 formatting is done. */ 5995 5996static int 5997formatfloat(Py_UNICODE *buf, 5998 size_t buflen, 5999 int flags, 6000 int prec, 6001 int type, 6002 PyObject *v) 6003{ 6004 /* fmt = '%#.' + `prec` + `type` 6005 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/ 6006 char fmt[20]; 6007 double x; 6008 6009 x = PyFloat_AsDouble(v); 6010 if (x == -1.0 && PyErr_Occurred()) 6011 return -1; 6012 if (prec < 0) 6013 prec = 6; 6014 if (type == 'f' && (fabs(x) / 1e25) >= 1e25) 6015 type = 'g'; 6016 /* Worst case length calc to ensure no buffer overrun: 6017 6018 'g' formats: 6019 fmt = %#.<prec>g 6020 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp 6021 for any double rep.) 6022 len = 1 + prec + 1 + 2 + 5 = 9 + prec 6023 6024 'f' formats: 6025 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50) 6026 len = 1 + 50 + 1 + prec = 52 + prec 6027 6028 If prec=0 the effective precision is 1 (the leading digit is 6029 always given), therefore increase the length by one. 6030 6031 */ 6032 if ((type == 'g' && buflen <= (size_t)10 + (size_t)prec) || 6033 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) { 6034 PyErr_SetString(PyExc_OverflowError, 6035 "formatted float is too long (precision too large?)"); 6036 return -1; 6037 } 6038 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c", 6039 (flags&F_ALT) ? "#" : "", 6040 prec, type); 6041 return usprintf(buf, fmt, x); 6042} 6043 6044static PyObject* 6045formatlong(PyObject *val, int flags, int prec, int type) 6046{ 6047 char *buf; 6048 int i, len; 6049 PyObject *str; /* temporary string object. */ 6050 PyUnicodeObject *result; 6051 6052 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len); 6053 if (!str) 6054 return NULL; 6055 result = _PyUnicode_New(len); 6056 for (i = 0; i < len; i++) 6057 result->str[i] = buf[i]; 6058 result->str[len] = 0; 6059 Py_DECREF(str); 6060 return (PyObject*)result; 6061} 6062 6063static int 6064formatint(Py_UNICODE *buf, 6065 size_t buflen, 6066 int flags, 6067 int prec, 6068 int type, 6069 PyObject *v) 6070{ 6071 /* fmt = '%#.' + `prec` + 'l' + `type` 6072 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine) 6073 * + 1 + 1 6074 * = 24 6075 */ 6076 char fmt[64]; /* plenty big enough! */ 6077 long x; 6078 6079 x = PyInt_AsLong(v); 6080 if (x == -1 && PyErr_Occurred()) 6081 return -1; 6082 if (x < 0 && type != 'd' && type != 'i') { 6083 if (PyErr_Warn(PyExc_FutureWarning, 6084 "%u/%o/%x/%X of negative int will return " 6085 "a signed string in Python 2.4 and up") < 0) 6086 return -1; 6087 } 6088 if (prec < 0) 6089 prec = 1; 6090 6091 /* buf = '+'/'-'/'0'/'0x' + '[0-9]'*max(prec,len(x in octal)) 6092 * worst case buf = '0x' + [0-9]*prec, where prec >= 11 6093 */ 6094 if (buflen <= 13 || buflen <= (size_t)2 + (size_t)prec) { 6095 PyErr_SetString(PyExc_OverflowError, 6096 "formatted integer is too long (precision too large?)"); 6097 return -1; 6098 } 6099 6100 if ((flags & F_ALT) && 6101 (type == 'x' || type == 'X')) { 6102 /* When converting under %#x or %#X, there are a number 6103 * of issues that cause pain: 6104 * - when 0 is being converted, the C standard leaves off 6105 * the '0x' or '0X', which is inconsistent with other 6106 * %#x/%#X conversions and inconsistent with Python's 6107 * hex() function 6108 * - there are platforms that violate the standard and 6109 * convert 0 with the '0x' or '0X' 6110 * (Metrowerks, Compaq Tru64) 6111 * - there are platforms that give '0x' when converting 6112 * under %#X, but convert 0 in accordance with the 6113 * standard (OS/2 EMX) 6114 * 6115 * We can achieve the desired consistency by inserting our 6116 * own '0x' or '0X' prefix, and substituting %x/%X in place 6117 * of %#x/%#X. 6118 * 6119 * Note that this is the same approach as used in 6120 * formatint() in stringobject.c 6121 */ 6122 PyOS_snprintf(fmt, sizeof(fmt), "0%c%%.%dl%c", 6123 type, prec, type); 6124 } 6125 else { 6126 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%dl%c", 6127 (flags&F_ALT) ? "#" : "", 6128 prec, type); 6129 } 6130 return usprintf(buf, fmt, x); 6131} 6132 6133static int 6134formatchar(Py_UNICODE *buf, 6135 size_t buflen, 6136 PyObject *v) 6137{ 6138 /* presume that the buffer is at least 2 characters long */ 6139 if (PyUnicode_Check(v)) { 6140 if (PyUnicode_GET_SIZE(v) != 1) 6141 goto onError; 6142 buf[0] = PyUnicode_AS_UNICODE(v)[0]; 6143 } 6144 6145 else if (PyString_Check(v)) { 6146 if (PyString_GET_SIZE(v) != 1) 6147 goto onError; 6148 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0]; 6149 } 6150 6151 else { 6152 /* Integer input truncated to a character */ 6153 long x; 6154 x = PyInt_AsLong(v); 6155 if (x == -1 && PyErr_Occurred()) 6156 goto onError; 6157#ifdef Py_UNICODE_WIDE 6158 if (x < 0 || x > 0x10ffff) { 6159 PyErr_SetString(PyExc_ValueError, 6160 "%c arg not in range(0x110000) " 6161 "(wide Python build)"); 6162 return -1; 6163 } 6164#else 6165 if (x < 0 || x > 0xffff) { 6166 PyErr_SetString(PyExc_ValueError, 6167 "%c arg not in range(0x10000) " 6168 "(narrow Python build)"); 6169 return -1; 6170 } 6171#endif 6172 buf[0] = (Py_UNICODE) x; 6173 } 6174 buf[1] = '\0'; 6175 return 1; 6176 6177 onError: 6178 PyErr_SetString(PyExc_TypeError, 6179 "%c requires int or char"); 6180 return -1; 6181} 6182 6183/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...) 6184 6185 FORMATBUFLEN is the length of the buffer in which the floats, ints, & 6186 chars are formatted. XXX This is a magic number. Each formatting 6187 routine does bounds checking to ensure no overflow, but a better 6188 solution may be to malloc a buffer of appropriate size for each 6189 format. For now, the current solution is sufficient. 6190*/ 6191#define FORMATBUFLEN (size_t)120 6192 6193PyObject *PyUnicode_Format(PyObject *format, 6194 PyObject *args) 6195{ 6196 Py_UNICODE *fmt, *res; 6197 int fmtcnt, rescnt, reslen, arglen, argidx; 6198 int args_owned = 0; 6199 PyUnicodeObject *result = NULL; 6200 PyObject *dict = NULL; 6201 PyObject *uformat; 6202 6203 if (format == NULL || args == NULL) { 6204 PyErr_BadInternalCall(); 6205 return NULL; 6206 } 6207 uformat = PyUnicode_FromObject(format); 6208 if (uformat == NULL) 6209 return NULL; 6210 fmt = PyUnicode_AS_UNICODE(uformat); 6211 fmtcnt = PyUnicode_GET_SIZE(uformat); 6212 6213 reslen = rescnt = fmtcnt + 100; 6214 result = _PyUnicode_New(reslen); 6215 if (result == NULL) 6216 goto onError; 6217 res = PyUnicode_AS_UNICODE(result); 6218 6219 if (PyTuple_Check(args)) { 6220 arglen = PyTuple_Size(args); 6221 argidx = 0; 6222 } 6223 else { 6224 arglen = -1; 6225 argidx = -2; 6226 } 6227 if (args->ob_type->tp_as_mapping && !PyTuple_Check(args) && 6228 !PyObject_TypeCheck(args, &PyBaseString_Type)) 6229 dict = args; 6230 6231 while (--fmtcnt >= 0) { 6232 if (*fmt != '%') { 6233 if (--rescnt < 0) { 6234 rescnt = fmtcnt + 100; 6235 reslen += rescnt; 6236 if (_PyUnicode_Resize(&result, reslen) < 0) 6237 return NULL; 6238 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt; 6239 --rescnt; 6240 } 6241 *res++ = *fmt++; 6242 } 6243 else { 6244 /* Got a format specifier */ 6245 int flags = 0; 6246 int width = -1; 6247 int prec = -1; 6248 Py_UNICODE c = '\0'; 6249 Py_UNICODE fill; 6250 PyObject *v = NULL; 6251 PyObject *temp = NULL; 6252 Py_UNICODE *pbuf; 6253 Py_UNICODE sign; 6254 int len; 6255 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */ 6256 6257 fmt++; 6258 if (*fmt == '(') { 6259 Py_UNICODE *keystart; 6260 int keylen; 6261 PyObject *key; 6262 int pcount = 1; 6263 6264 if (dict == NULL) { 6265 PyErr_SetString(PyExc_TypeError, 6266 "format requires a mapping"); 6267 goto onError; 6268 } 6269 ++fmt; 6270 --fmtcnt; 6271 keystart = fmt; 6272 /* Skip over balanced parentheses */ 6273 while (pcount > 0 && --fmtcnt >= 0) { 6274 if (*fmt == ')') 6275 --pcount; 6276 else if (*fmt == '(') 6277 ++pcount; 6278 fmt++; 6279 } 6280 keylen = fmt - keystart - 1; 6281 if (fmtcnt < 0 || pcount > 0) { 6282 PyErr_SetString(PyExc_ValueError, 6283 "incomplete format key"); 6284 goto onError; 6285 } 6286#if 0 6287 /* keys are converted to strings using UTF-8 and 6288 then looked up since Python uses strings to hold 6289 variables names etc. in its namespaces and we 6290 wouldn't want to break common idioms. */ 6291 key = PyUnicode_EncodeUTF8(keystart, 6292 keylen, 6293 NULL); 6294#else 6295 key = PyUnicode_FromUnicode(keystart, keylen); 6296#endif 6297 if (key == NULL) 6298 goto onError; 6299 if (args_owned) { 6300 Py_DECREF(args); 6301 args_owned = 0; 6302 } 6303 args = PyObject_GetItem(dict, key); 6304 Py_DECREF(key); 6305 if (args == NULL) { 6306 goto onError; 6307 } 6308 args_owned = 1; 6309 arglen = -1; 6310 argidx = -2; 6311 } 6312 while (--fmtcnt >= 0) { 6313 switch (c = *fmt++) { 6314 case '-': flags |= F_LJUST; continue; 6315 case '+': flags |= F_SIGN; continue; 6316 case ' ': flags |= F_BLANK; continue; 6317 case '#': flags |= F_ALT; continue; 6318 case '0': flags |= F_ZERO; continue; 6319 } 6320 break; 6321 } 6322 if (c == '*') { 6323 v = getnextarg(args, arglen, &argidx); 6324 if (v == NULL) 6325 goto onError; 6326 if (!PyInt_Check(v)) { 6327 PyErr_SetString(PyExc_TypeError, 6328 "* wants int"); 6329 goto onError; 6330 } 6331 width = PyInt_AsLong(v); 6332 if (width < 0) { 6333 flags |= F_LJUST; 6334 width = -width; 6335 } 6336 if (--fmtcnt >= 0) 6337 c = *fmt++; 6338 } 6339 else if (c >= '0' && c <= '9') { 6340 width = c - '0'; 6341 while (--fmtcnt >= 0) { 6342 c = *fmt++; 6343 if (c < '0' || c > '9') 6344 break; 6345 if ((width*10) / 10 != width) { 6346 PyErr_SetString(PyExc_ValueError, 6347 "width too big"); 6348 goto onError; 6349 } 6350 width = width*10 + (c - '0'); 6351 } 6352 } 6353 if (c == '.') { 6354 prec = 0; 6355 if (--fmtcnt >= 0) 6356 c = *fmt++; 6357 if (c == '*') { 6358 v = getnextarg(args, arglen, &argidx); 6359 if (v == NULL) 6360 goto onError; 6361 if (!PyInt_Check(v)) { 6362 PyErr_SetString(PyExc_TypeError, 6363 "* wants int"); 6364 goto onError; 6365 } 6366 prec = PyInt_AsLong(v); 6367 if (prec < 0) 6368 prec = 0; 6369 if (--fmtcnt >= 0) 6370 c = *fmt++; 6371 } 6372 else if (c >= '0' && c <= '9') { 6373 prec = c - '0'; 6374 while (--fmtcnt >= 0) { 6375 c = Py_CHARMASK(*fmt++); 6376 if (c < '0' || c > '9') 6377 break; 6378 if ((prec*10) / 10 != prec) { 6379 PyErr_SetString(PyExc_ValueError, 6380 "prec too big"); 6381 goto onError; 6382 } 6383 prec = prec*10 + (c - '0'); 6384 } 6385 } 6386 } /* prec */ 6387 if (fmtcnt >= 0) { 6388 if (c == 'h' || c == 'l' || c == 'L') { 6389 if (--fmtcnt >= 0) 6390 c = *fmt++; 6391 } 6392 } 6393 if (fmtcnt < 0) { 6394 PyErr_SetString(PyExc_ValueError, 6395 "incomplete format"); 6396 goto onError; 6397 } 6398 if (c != '%') { 6399 v = getnextarg(args, arglen, &argidx); 6400 if (v == NULL) 6401 goto onError; 6402 } 6403 sign = 0; 6404 fill = ' '; 6405 switch (c) { 6406 6407 case '%': 6408 pbuf = formatbuf; 6409 /* presume that buffer length is at least 1 */ 6410 pbuf[0] = '%'; 6411 len = 1; 6412 break; 6413 6414 case 's': 6415 case 'r': 6416 if (PyUnicode_Check(v) && c == 's') { 6417 temp = v; 6418 Py_INCREF(temp); 6419 } 6420 else { 6421 PyObject *unicode; 6422 if (c == 's') 6423 temp = PyObject_Str(v); 6424 else 6425 temp = PyObject_Repr(v); 6426 if (temp == NULL) 6427 goto onError; 6428 if (!PyString_Check(temp)) { 6429 /* XXX Note: this should never happen, since 6430 PyObject_Repr() and PyObject_Str() assure 6431 this */ 6432 Py_DECREF(temp); 6433 PyErr_SetString(PyExc_TypeError, 6434 "%s argument has non-string str()"); 6435 goto onError; 6436 } 6437 unicode = PyUnicode_Decode(PyString_AS_STRING(temp), 6438 PyString_GET_SIZE(temp), 6439 NULL, 6440 "strict"); 6441 Py_DECREF(temp); 6442 temp = unicode; 6443 if (temp == NULL) 6444 goto onError; 6445 } 6446 pbuf = PyUnicode_AS_UNICODE(temp); 6447 len = PyUnicode_GET_SIZE(temp); 6448 if (prec >= 0 && len > prec) 6449 len = prec; 6450 break; 6451 6452 case 'i': 6453 case 'd': 6454 case 'u': 6455 case 'o': 6456 case 'x': 6457 case 'X': 6458 if (c == 'i') 6459 c = 'd'; 6460 if (PyLong_Check(v)) { 6461 temp = formatlong(v, flags, prec, c); 6462 if (!temp) 6463 goto onError; 6464 pbuf = PyUnicode_AS_UNICODE(temp); 6465 len = PyUnicode_GET_SIZE(temp); 6466 /* unbounded ints can always produce 6467 a sign character! */ 6468 sign = 1; 6469 } 6470 else { 6471 pbuf = formatbuf; 6472 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), 6473 flags, prec, c, v); 6474 if (len < 0) 6475 goto onError; 6476 /* only d conversion is signed */ 6477 sign = c == 'd'; 6478 } 6479 if (flags & F_ZERO) 6480 fill = '0'; 6481 break; 6482 6483 case 'e': 6484 case 'E': 6485 case 'f': 6486 case 'g': 6487 case 'G': 6488 pbuf = formatbuf; 6489 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), 6490 flags, prec, c, v); 6491 if (len < 0) 6492 goto onError; 6493 sign = 1; 6494 if (flags & F_ZERO) 6495 fill = '0'; 6496 break; 6497 6498 case 'c': 6499 pbuf = formatbuf; 6500 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v); 6501 if (len < 0) 6502 goto onError; 6503 break; 6504 6505 default: 6506 PyErr_Format(PyExc_ValueError, 6507 "unsupported format character '%c' (0x%x) " 6508 "at index %i", 6509 (31<=c && c<=126) ? (char)c : '?', 6510 (int)c, 6511 (int)(fmt -1 - PyUnicode_AS_UNICODE(uformat))); 6512 goto onError; 6513 } 6514 if (sign) { 6515 if (*pbuf == '-' || *pbuf == '+') { 6516 sign = *pbuf++; 6517 len--; 6518 } 6519 else if (flags & F_SIGN) 6520 sign = '+'; 6521 else if (flags & F_BLANK) 6522 sign = ' '; 6523 else 6524 sign = 0; 6525 } 6526 if (width < len) 6527 width = len; 6528 if (rescnt - (sign != 0) < width) { 6529 reslen -= rescnt; 6530 rescnt = width + fmtcnt + 100; 6531 reslen += rescnt; 6532 if (reslen < 0) { 6533 Py_DECREF(result); 6534 return PyErr_NoMemory(); 6535 } 6536 if (_PyUnicode_Resize(&result, reslen) < 0) 6537 return NULL; 6538 res = PyUnicode_AS_UNICODE(result) 6539 + reslen - rescnt; 6540 } 6541 if (sign) { 6542 if (fill != ' ') 6543 *res++ = sign; 6544 rescnt--; 6545 if (width > len) 6546 width--; 6547 } 6548 if ((flags & F_ALT) && (c == 'x' || c == 'X')) { 6549 assert(pbuf[0] == '0'); 6550 assert(pbuf[1] == c); 6551 if (fill != ' ') { 6552 *res++ = *pbuf++; 6553 *res++ = *pbuf++; 6554 } 6555 rescnt -= 2; 6556 width -= 2; 6557 if (width < 0) 6558 width = 0; 6559 len -= 2; 6560 } 6561 if (width > len && !(flags & F_LJUST)) { 6562 do { 6563 --rescnt; 6564 *res++ = fill; 6565 } while (--width > len); 6566 } 6567 if (fill == ' ') { 6568 if (sign) 6569 *res++ = sign; 6570 if ((flags & F_ALT) && (c == 'x' || c == 'X')) { 6571 assert(pbuf[0] == '0'); 6572 assert(pbuf[1] == c); 6573 *res++ = *pbuf++; 6574 *res++ = *pbuf++; 6575 } 6576 } 6577 Py_UNICODE_COPY(res, pbuf, len); 6578 res += len; 6579 rescnt -= len; 6580 while (--width >= len) { 6581 --rescnt; 6582 *res++ = ' '; 6583 } 6584 if (dict && (argidx < arglen) && c != '%') { 6585 PyErr_SetString(PyExc_TypeError, 6586 "not all arguments converted during string formatting"); 6587 goto onError; 6588 } 6589 Py_XDECREF(temp); 6590 } /* '%' */ 6591 } /* until end */ 6592 if (argidx < arglen && !dict) { 6593 PyErr_SetString(PyExc_TypeError, 6594 "not all arguments converted during string formatting"); 6595 goto onError; 6596 } 6597 6598 if (args_owned) { 6599 Py_DECREF(args); 6600 } 6601 Py_DECREF(uformat); 6602 if (_PyUnicode_Resize(&result, reslen - rescnt)) 6603 goto onError; 6604 return (PyObject *)result; 6605 6606 onError: 6607 Py_XDECREF(result); 6608 Py_DECREF(uformat); 6609 if (args_owned) { 6610 Py_DECREF(args); 6611 } 6612 return NULL; 6613} 6614 6615static PyBufferProcs unicode_as_buffer = { 6616 (getreadbufferproc) unicode_buffer_getreadbuf, 6617 (getwritebufferproc) unicode_buffer_getwritebuf, 6618 (getsegcountproc) unicode_buffer_getsegcount, 6619 (getcharbufferproc) unicode_buffer_getcharbuf, 6620}; 6621 6622static PyObject * 6623unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds); 6624 6625static PyObject * 6626unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds) 6627{ 6628 PyObject *x = NULL; 6629 static char *kwlist[] = {"string", "encoding", "errors", 0}; 6630 char *encoding = NULL; 6631 char *errors = NULL; 6632 6633 if (type != &PyUnicode_Type) 6634 return unicode_subtype_new(type, args, kwds); 6635 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode", 6636 kwlist, &x, &encoding, &errors)) 6637 return NULL; 6638 if (x == NULL) 6639 return (PyObject *)_PyUnicode_New(0); 6640 if (encoding == NULL && errors == NULL) 6641 return PyObject_Unicode(x); 6642 else 6643 return PyUnicode_FromEncodedObject(x, encoding, errors); 6644} 6645 6646static PyObject * 6647unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds) 6648{ 6649 PyUnicodeObject *tmp, *pnew; 6650 int n; 6651 6652 assert(PyType_IsSubtype(type, &PyUnicode_Type)); 6653 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds); 6654 if (tmp == NULL) 6655 return NULL; 6656 assert(PyUnicode_Check(tmp)); 6657 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length); 6658 if (pnew == NULL) 6659 return NULL; 6660 pnew->str = PyMem_NEW(Py_UNICODE, n+1); 6661 if (pnew->str == NULL) { 6662 _Py_ForgetReference((PyObject *)pnew); 6663 PyObject_Del(pnew); 6664 return NULL; 6665 } 6666 Py_UNICODE_COPY(pnew->str, tmp->str, n+1); 6667 pnew->length = n; 6668 pnew->hash = tmp->hash; 6669 Py_DECREF(tmp); 6670 return (PyObject *)pnew; 6671} 6672 6673PyDoc_STRVAR(unicode_doc, 6674"unicode(string [, encoding[, errors]]) -> object\n\ 6675\n\ 6676Create a new Unicode object from the given encoded string.\n\ 6677encoding defaults to the current default string encoding.\n\ 6678errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'."); 6679 6680PyTypeObject PyUnicode_Type = { 6681 PyObject_HEAD_INIT(&PyType_Type) 6682 0, /* ob_size */ 6683 "unicode", /* tp_name */ 6684 sizeof(PyUnicodeObject), /* tp_size */ 6685 0, /* tp_itemsize */ 6686 /* Slots */ 6687 (destructor)unicode_dealloc, /* tp_dealloc */ 6688 0, /* tp_print */ 6689 0, /* tp_getattr */ 6690 0, /* tp_setattr */ 6691 (cmpfunc) unicode_compare, /* tp_compare */ 6692 (reprfunc) unicode_repr, /* tp_repr */ 6693 &unicode_as_number, /* tp_as_number */ 6694 &unicode_as_sequence, /* tp_as_sequence */ 6695 &unicode_as_mapping, /* tp_as_mapping */ 6696 (hashfunc) unicode_hash, /* tp_hash*/ 6697 0, /* tp_call*/ 6698 (reprfunc) unicode_str, /* tp_str */ 6699 PyObject_GenericGetAttr, /* tp_getattro */ 6700 0, /* tp_setattro */ 6701 &unicode_as_buffer, /* tp_as_buffer */ 6702 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_CHECKTYPES | 6703 Py_TPFLAGS_BASETYPE, /* tp_flags */ 6704 unicode_doc, /* tp_doc */ 6705 0, /* tp_traverse */ 6706 0, /* tp_clear */ 6707 0, /* tp_richcompare */ 6708 0, /* tp_weaklistoffset */ 6709 0, /* tp_iter */ 6710 0, /* tp_iternext */ 6711 unicode_methods, /* tp_methods */ 6712 0, /* tp_members */ 6713 0, /* tp_getset */ 6714 &PyBaseString_Type, /* tp_base */ 6715 0, /* tp_dict */ 6716 0, /* tp_descr_get */ 6717 0, /* tp_descr_set */ 6718 0, /* tp_dictoffset */ 6719 0, /* tp_init */ 6720 0, /* tp_alloc */ 6721 unicode_new, /* tp_new */ 6722 PyObject_Del, /* tp_free */ 6723}; 6724 6725/* Initialize the Unicode implementation */ 6726 6727void _PyUnicode_Init(void) 6728{ 6729 int i; 6730 6731 /* Init the implementation */ 6732 unicode_freelist = NULL; 6733 unicode_freelist_size = 0; 6734 unicode_empty = _PyUnicode_New(0); 6735 strcpy(unicode_default_encoding, "ascii"); 6736 for (i = 0; i < 256; i++) 6737 unicode_latin1[i] = NULL; 6738 if (PyType_Ready(&PyUnicode_Type) < 0) 6739 Py_FatalError("Can't initialize 'unicode'"); 6740} 6741 6742/* Finalize the Unicode implementation */ 6743 6744void 6745_PyUnicode_Fini(void) 6746{ 6747 PyUnicodeObject *u; 6748 int i; 6749 6750 Py_XDECREF(unicode_empty); 6751 unicode_empty = NULL; 6752 6753 for (i = 0; i < 256; i++) { 6754 if (unicode_latin1[i]) { 6755 Py_DECREF(unicode_latin1[i]); 6756 unicode_latin1[i] = NULL; 6757 } 6758 } 6759 6760 for (u = unicode_freelist; u != NULL;) { 6761 PyUnicodeObject *v = u; 6762 u = *(PyUnicodeObject **)u; 6763 if (v->str) 6764 PyMem_DEL(v->str); 6765 Py_XDECREF(v->defenc); 6766 PyObject_Del(v); 6767 } 6768 unicode_freelist = NULL; 6769 unicode_freelist_size = 0; 6770} 6771