unicodeobject.c revision 63a28be01693584afcadc39ca650efc5fa8f2880
1/* 2 3Unicode implementation based on original code by Fredrik Lundh, 4modified by Marc-Andre Lemburg <mal@lemburg.com> according to the 5Unicode Integration Proposal (see file Misc/unicode.txt). 6 7Major speed upgrades to the method implementations at the Reykjavik 8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke. 9 10Copyright (c) Corporation for National Research Initiatives. 11 12-------------------------------------------------------------------- 13The original string type implementation is: 14 15 Copyright (c) 1999 by Secret Labs AB 16 Copyright (c) 1999 by Fredrik Lundh 17 18By obtaining, using, and/or copying this software and/or its 19associated documentation, you agree that you have read, understood, 20and will comply with the following terms and conditions: 21 22Permission to use, copy, modify, and distribute this software and its 23associated documentation for any purpose and without fee is hereby 24granted, provided that the above copyright notice appears in all 25copies, and that both that copyright notice and this permission notice 26appear in supporting documentation, and that the name of Secret Labs 27AB or the author not be used in advertising or publicity pertaining to 28distribution of the software without specific, written prior 29permission. 30 31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO 32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND 33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR 34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT 37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
38-------------------------------------------------------------------- 39 40*/ 41 42#define PY_SSIZE_T_CLEAN 43#include "Python.h" 44 45#include "unicodeobject.h" 46#include "ucnhash.h" 47 48#ifdef MS_WINDOWS 49#include <windows.h> 50#endif 51 52/* Limit for the Unicode object free list */ 53 54#define MAX_UNICODE_FREELIST_SIZE 1024 55 56/* Limit for the Unicode object free list stay alive optimization. 57 58 The implementation will keep allocated Unicode memory intact for 59 all objects on the free list having a size less than this 60 limit. This reduces malloc() overhead for small Unicode objects. 61 62 At worst this will result in MAX_UNICODE_FREELIST_SIZE * 63 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT + 64 malloc()-overhead) bytes of unused garbage. 65 66 Setting the limit to 0 effectively turns the feature off. 67 68 Note: This is an experimental feature ! If you get core dumps when 69 using Unicode objects, turn this feature off. 70 71*/ 72 73#define KEEPALIVE_SIZE_LIMIT 9 74 75/* Endianness switches; defaults to little endian */ 76 77#ifdef WORDS_BIGENDIAN 78# define BYTEORDER_IS_BIG_ENDIAN 79#else 80# define BYTEORDER_IS_LITTLE_ENDIAN 81#endif 82 83/* --- Globals ------------------------------------------------------------ 84 85 The globals are initialized by the _PyUnicode_Init() API and should 86 not be used before calling that API. 87 88*/ 89 90 91#ifdef __cplusplus 92extern "C" { 93#endif 94 95/* This dictionary holds all interned unicode strings. Note that references 96 to strings in this dictionary are *not* counted in the string's ob_refcnt. 97 When the interned string reaches a refcnt of 0 the string deallocation 98 function will delete the reference from this dictionary. 
99 100 Another way to look at this is that to say that the actual reference 101 count of a string is: s->ob_refcnt + (s->ob_sstate?2:0) 102*/ 103static PyObject *interned; 104 105/* Free list for Unicode objects */ 106static PyUnicodeObject *unicode_freelist; 107static int unicode_freelist_size; 108 109/* The empty Unicode object is shared to improve performance. */ 110static PyUnicodeObject *unicode_empty; 111 112/* Single character Unicode strings in the Latin-1 range are being 113 shared as well. */ 114static PyUnicodeObject *unicode_latin1[256]; 115 116/* Default encoding to use and assume when NULL is passed as encoding 117 parameter; it is fixed to "utf-8". Always use the 118 PyUnicode_GetDefaultEncoding() API to access this global. */ 119static const char unicode_default_encoding[] = "utf-8"; 120 121Py_UNICODE 122PyUnicode_GetMax(void) 123{ 124#ifdef Py_UNICODE_WIDE 125 return 0x10FFFF; 126#else 127 /* This is actually an illegal character, so it should 128 not be passed to unichr. */ 129 return 0xFFFF; 130#endif 131} 132 133/* --- Bloom Filters ----------------------------------------------------- */ 134 135/* stuff to implement simple "bloom filters" for Unicode characters. 136 to keep things simple, we use a single bitmask, using the least 5 137 bits from each unicode characters as the bit index. 
*/ 138 139/* the linebreak mask is set up by Unicode_Init below */ 140 141#define BLOOM_MASK unsigned long 142 143static BLOOM_MASK bloom_linebreak; 144 145#define BLOOM(mask, ch) ((mask & (1 << ((ch) & 0x1F)))) 146 147#define BLOOM_LINEBREAK(ch)\ 148 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK((ch))) 149 150Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len) 151{ 152 /* calculate simple bloom-style bitmask for a given unicode string */ 153 154 long mask; 155 Py_ssize_t i; 156 157 mask = 0; 158 for (i = 0; i < len; i++) 159 mask |= (1 << (ptr[i] & 0x1F)); 160 161 return mask; 162} 163 164Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen) 165{ 166 Py_ssize_t i; 167 168 for (i = 0; i < setlen; i++) 169 if (set[i] == chr) 170 return 1; 171 172 return 0; 173} 174 175#define BLOOM_MEMBER(mask, chr, set, setlen)\ 176 BLOOM(mask, chr) && unicode_member(chr, set, setlen) 177 178/* --- Unicode Object ----------------------------------------------------- */ 179 180static 181int unicode_resize(register PyUnicodeObject *unicode, 182 Py_ssize_t length) 183{ 184 void *oldstr; 185 186 /* Shortcut if there's nothing much to do. */ 187 if (unicode->length == length) 188 goto reset; 189 190 /* Resizing shared object (unicode_empty or single character 191 objects) in-place is not allowed. Use PyUnicode_Resize() 192 instead ! */ 193 194 if (unicode == unicode_empty || 195 (unicode->length == 1 && 196 unicode->str[0] < 256U && 197 unicode_latin1[unicode->str[0]] == unicode)) { 198 PyErr_SetString(PyExc_SystemError, 199 "can't resize shared unicode objects"); 200 return -1; 201 } 202 203 /* We allocate one more byte to make sure the string is Ux0000 terminated. 204 The overallocation is also used by fastsearch, which assumes that it's 205 safe to look at str[length] (without making any assumptions about what 206 it contains). 
*/ 207 208 oldstr = unicode->str; 209 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1); 210 if (!unicode->str) { 211 unicode->str = (Py_UNICODE *)oldstr; 212 PyErr_NoMemory(); 213 return -1; 214 } 215 unicode->str[length] = 0; 216 unicode->length = length; 217 218 reset: 219 /* Reset the object caches */ 220 if (unicode->defenc) { 221 Py_DECREF(unicode->defenc); 222 unicode->defenc = NULL; 223 } 224 unicode->hash = -1; 225 226 return 0; 227} 228 229/* We allocate one more byte to make sure the string is 230 Ux0000 terminated -- XXX is this needed ? 231 232 XXX This allocator could further be enhanced by assuring that the 233 free list never reduces its size below 1. 234 235*/ 236 237static 238PyUnicodeObject *_PyUnicode_New(Py_ssize_t length) 239{ 240 register PyUnicodeObject *unicode; 241 242 /* Optimization for empty strings */ 243 if (length == 0 && unicode_empty != NULL) { 244 Py_INCREF(unicode_empty); 245 return unicode_empty; 246 } 247 248 /* Unicode freelist & memory allocation */ 249 if (unicode_freelist) { 250 unicode = unicode_freelist; 251 unicode_freelist = *(PyUnicodeObject **)unicode; 252 unicode_freelist_size--; 253 if (unicode->str) { 254 /* Keep-Alive optimization: we only upsize the buffer, 255 never downsize it. 
*/ 256 if ((unicode->length < length) && 257 unicode_resize(unicode, length) < 0) { 258 PyMem_DEL(unicode->str); 259 goto onError; 260 } 261 } 262 else { 263 unicode->str = PyMem_NEW(Py_UNICODE, length + 1); 264 } 265 PyObject_INIT(unicode, &PyUnicode_Type); 266 } 267 else { 268 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type); 269 if (unicode == NULL) 270 return NULL; 271 unicode->str = PyMem_NEW(Py_UNICODE, length + 1); 272 } 273 274 if (!unicode->str) { 275 PyErr_NoMemory(); 276 goto onError; 277 } 278 /* Initialize the first element to guard against cases where 279 * the caller fails before initializing str -- unicode_resize() 280 * reads str[0], and the Keep-Alive optimization can keep memory 281 * allocated for str alive across a call to unicode_dealloc(unicode). 282 * We don't want unicode_resize to read uninitialized memory in 283 * that case. 284 */ 285 unicode->str[0] = 0; 286 unicode->str[length] = 0; 287 unicode->length = length; 288 unicode->hash = -1; 289 unicode->state = 0; 290 unicode->defenc = NULL; 291 return unicode; 292 293 onError: 294 _Py_ForgetReference((PyObject *)unicode); 295 PyObject_Del(unicode); 296 return NULL; 297} 298 299static 300void unicode_dealloc(register PyUnicodeObject *unicode) 301{ 302 switch (PyUnicode_CHECK_INTERNED(unicode)) { 303 case SSTATE_NOT_INTERNED: 304 break; 305 306 case SSTATE_INTERNED_MORTAL: 307 /* revive dead object temporarily for DelItem */ 308 unicode->ob_refcnt = 3; 309 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0) 310 Py_FatalError( 311 "deletion of interned unicode string failed"); 312 break; 313 314 case SSTATE_INTERNED_IMMORTAL: 315 Py_FatalError("Immortal interned unicode string died."); 316 317 default: 318 Py_FatalError("Inconsistent interned unicode string state."); 319 } 320 321 if (PyUnicode_CheckExact(unicode) && 322 unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) { 323 /* Keep-Alive optimization */ 324 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) { 325 
PyMem_DEL(unicode->str); 326 unicode->str = NULL; 327 unicode->length = 0; 328 } 329 if (unicode->defenc) { 330 Py_DECREF(unicode->defenc); 331 unicode->defenc = NULL; 332 } 333 /* Add to free list */ 334 *(PyUnicodeObject **)unicode = unicode_freelist; 335 unicode_freelist = unicode; 336 unicode_freelist_size++; 337 } 338 else { 339 PyMem_DEL(unicode->str); 340 Py_XDECREF(unicode->defenc); 341 unicode->ob_type->tp_free((PyObject *)unicode); 342 } 343} 344 345int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length) 346{ 347 register PyUnicodeObject *v; 348 349 /* Argument checks */ 350 if (unicode == NULL) { 351 PyErr_BadInternalCall(); 352 return -1; 353 } 354 v = (PyUnicodeObject *)*unicode; 355 if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1 || length < 0) { 356 PyErr_BadInternalCall(); 357 return -1; 358 } 359 360 /* Resizing unicode_empty and single character objects is not 361 possible since these are being shared. We simply return a fresh 362 copy with the same Unicode content. */ 363 if (v->length != length && 364 (v == unicode_empty || v->length == 1)) { 365 PyUnicodeObject *w = _PyUnicode_New(length); 366 if (w == NULL) 367 return -1; 368 Py_UNICODE_COPY(w->str, v->str, 369 length < v->length ? length : v->length); 370 Py_DECREF(*unicode); 371 *unicode = (PyObject *)w; 372 return 0; 373 } 374 375 /* Note that we don't have to modify *unicode for unshared Unicode 376 objects, since we can modify them in-place. */ 377 return unicode_resize(v, length); 378} 379 380/* Internal API for use in unicodeobject.c only ! */ 381#define _PyUnicode_Resize(unicodevar, length) \ 382 PyUnicode_Resize(((PyObject **)(unicodevar)), length) 383 384PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u, 385 Py_ssize_t size) 386{ 387 PyUnicodeObject *unicode; 388 389 /* If the Unicode data is known at construction time, we can apply 390 some optimizations which share commonly used objects. 
*/ 391 if (u != NULL) { 392 393 /* Optimization for empty strings */ 394 if (size == 0 && unicode_empty != NULL) { 395 Py_INCREF(unicode_empty); 396 return (PyObject *)unicode_empty; 397 } 398 399 /* Single character Unicode objects in the Latin-1 range are 400 shared when using this constructor */ 401 if (size == 1 && *u < 256) { 402 unicode = unicode_latin1[*u]; 403 if (!unicode) { 404 unicode = _PyUnicode_New(1); 405 if (!unicode) 406 return NULL; 407 unicode->str[0] = *u; 408 unicode_latin1[*u] = unicode; 409 } 410 Py_INCREF(unicode); 411 return (PyObject *)unicode; 412 } 413 } 414 415 unicode = _PyUnicode_New(size); 416 if (!unicode) 417 return NULL; 418 419 /* Copy the Unicode data into the new object */ 420 if (u != NULL) 421 Py_UNICODE_COPY(unicode->str, u, size); 422 423 return (PyObject *)unicode; 424} 425 426PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size) 427{ 428 PyUnicodeObject *unicode; 429 /* If the Unicode data is known at construction time, we can apply 430 some optimizations which share commonly used objects. 
*/ 431 if (u != NULL) { 432 433 /* Optimization for empty strings */ 434 if (size == 0 && unicode_empty != NULL) { 435 Py_INCREF(unicode_empty); 436 return (PyObject *)unicode_empty; 437 } 438 439 /* Single characters are shared when using this constructor */ 440 if (size == 1) { 441 unicode = unicode_latin1[(int)*u]; 442 if (!unicode) { 443 unicode = _PyUnicode_New(1); 444 if (!unicode) 445 return NULL; 446 unicode->str[0] = *u; 447 unicode_latin1[(int)*u] = unicode; 448 } 449 Py_INCREF(unicode); 450 return (PyObject *)unicode; 451 } 452 } 453 454 unicode = _PyUnicode_New(size); 455 if (!unicode) 456 return NULL; 457 458 /* Copy the Unicode data into the new object */ 459 if (u != NULL) { 460 Py_UNICODE *p = unicode->str; 461 while (size--) 462 *p++ = *u++; 463 /* Don't need to write trailing 0 because 464 that's already done by _PyUnicode_New */ 465 } 466 467 return (PyObject *)unicode; 468} 469 470PyObject *PyUnicode_FromString(const char *u) 471{ 472 size_t size = strlen(u); 473 if (size > PY_SSIZE_T_MAX) { 474 PyErr_SetString(PyExc_OverflowError, "input too long"); 475 return NULL; 476 } 477 478 return PyUnicode_FromStringAndSize(u, size); 479} 480 481#ifdef HAVE_WCHAR_H 482 483PyObject *PyUnicode_FromWideChar(register const wchar_t *w, 484 Py_ssize_t size) 485{ 486 PyUnicodeObject *unicode; 487 488 if (w == NULL) { 489 PyErr_BadInternalCall(); 490 return NULL; 491 } 492 493 unicode = _PyUnicode_New(size); 494 if (!unicode) 495 return NULL; 496 497 /* Copy the wchar_t data into the new object */ 498#ifdef HAVE_USABLE_WCHAR_T 499 memcpy(unicode->str, w, size * sizeof(wchar_t)); 500#else 501 { 502 register Py_UNICODE *u; 503 register Py_ssize_t i; 504 u = PyUnicode_AS_UNICODE(unicode); 505 for (i = size; i > 0; i--) 506 *u++ = *w++; 507 } 508#endif 509 510 return (PyObject *)unicode; 511} 512 513static void 514makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c) 515{ 516 *fmt++ = '%'; 517 if (width) { 518 if (zeropad) 
519 *fmt++ = '0'; 520 fmt += sprintf(fmt, "%d", width); 521 } 522 if (precision) 523 fmt += sprintf(fmt, ".%d", precision); 524 if (longflag) 525 *fmt++ = 'l'; 526 else if (size_tflag) { 527 char *f = PY_FORMAT_SIZE_T; 528 while (*f) 529 *fmt++ = *f++; 530 } 531 *fmt++ = c; 532 *fmt = '\0'; 533} 534 535#define appendstring(string) {for (copy = string;*copy;) *s++ = *copy++;} 536 537PyObject * 538PyUnicode_FromFormatV(const char *format, va_list vargs) 539{ 540 va_list count; 541 Py_ssize_t callcount = 0; 542 PyObject **callresults = NULL; 543 PyObject **callresult = NULL; 544 Py_ssize_t n = 0; 545 int width = 0; 546 int precision = 0; 547 int zeropad; 548 const char* f; 549 Py_UNICODE *s; 550 PyObject *string; 551 /* used by sprintf */ 552 char buffer[21]; 553 /* use abuffer instead of buffer, if we need more space 554 * (which can happen if there's a format specifier with width). */ 555 char *abuffer = NULL; 556 char *realbuffer; 557 Py_ssize_t abuffersize = 0; 558 char fmt[60]; /* should be enough for %0width.precisionld */ 559 const char *copy; 560 561#ifdef VA_LIST_IS_ARRAY 562 Py_MEMCPY(count, vargs, sizeof(va_list)); 563#else 564#ifdef __va_copy 565 __va_copy(count, vargs); 566#else 567 count = vargs; 568#endif 569#endif 570 /* step 1: count the number of %S/%R format specifications 571 * (we call PyObject_Unicode()/PyObject_Repr() for these objects 572 * once during step 3 and put the result in an array) */ 573 for (f = format; *f; f++) { 574 if (*f == '%' && (*(f+1)=='S' || *(f+1)=='R')) 575 ++callcount; 576 } 577 /* step 2: allocate memory for the results of 578 * PyObject_Unicode()/PyObject_Repr() calls */ 579 if (callcount) { 580 callresults = PyMem_Malloc(sizeof(PyObject *)*callcount); 581 if (!callresults) { 582 PyErr_NoMemory(); 583 return NULL; 584 } 585 callresult = callresults; 586 } 587 /* step 3: figure out how large a buffer we need */ 588 for (f = format; *f; f++) { 589 if (*f == '%') { 590 const char* p = f; 591 width = 0; 592 while 
(isdigit(Py_CHARMASK(*f))) 593 width = (width*10) + *f++ - '0'; 594 while (*++f && *f != '%' && !isalpha(Py_CHARMASK(*f))) 595 ; 596 597 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since 598 * they don't affect the amount of space we reserve. 599 */ 600 if ((*f == 'l' || *f == 'z') && 601 (f[1] == 'd' || f[1] == 'u')) 602 ++f; 603 604 switch (*f) { 605 case 'c': 606 (void)va_arg(count, int); 607 /* fall through... */ 608 case '%': 609 n++; 610 break; 611 case 'd': case 'u': case 'i': case 'x': 612 (void) va_arg(count, int); 613 /* 20 bytes is enough to hold a 64-bit 614 integer. Decimal takes the most space. 615 This isn't enough for octal. 616 If a width is specified we need more 617 (which we allocate later). */ 618 if (width < 20) 619 width = 20; 620 n += width; 621 if (abuffersize < width) 622 abuffersize = width; 623 break; 624 case 's': 625 n += strlen(va_arg(count, char*)); 626 break; 627 case 'U': 628 { 629 PyObject *obj = va_arg(count, PyObject *); 630 assert(obj && PyUnicode_Check(obj)); 631 n += PyUnicode_GET_SIZE(obj); 632 break; 633 } 634 case 'V': 635 { 636 PyObject *obj = va_arg(count, PyObject *); 637 const char *str = va_arg(count, const char *); 638 assert(obj || str); 639 assert(!obj || PyUnicode_Check(obj)); 640 if (obj) 641 n += PyUnicode_GET_SIZE(obj); 642 else 643 n += strlen(str); 644 break; 645 } 646 case 'S': 647 { 648 PyObject *obj = va_arg(count, PyObject *); 649 PyObject *str; 650 assert(obj); 651 str = PyObject_Unicode(obj); 652 if (!str) 653 goto fail; 654 n += PyUnicode_GET_SIZE(str); 655 /* Remember the str and switch to the next slot */ 656 *callresult++ = str; 657 break; 658 } 659 case 'R': 660 { 661 PyObject *obj = va_arg(count, PyObject *); 662 PyObject *repr; 663 assert(obj); 664 repr = PyObject_Repr(obj); 665 if (!repr) 666 goto fail; 667 n += PyUnicode_GET_SIZE(repr); 668 /* Remember the repr and switch to the next slot */ 669 *callresult++ = repr; 670 break; 671 } 672 case 'p': 673 (void) va_arg(count, int); 674 /* 
maximum 64-bit pointer representation: 675 * 0xffffffffffffffff 676 * so 19 characters is enough. 677 * XXX I count 18 -- what's the extra for? 678 */ 679 n += 19; 680 break; 681 default: 682 /* if we stumble upon an unknown 683 formatting code, copy the rest of 684 the format string to the output 685 string. (we cannot just skip the 686 code, since there's no way to know 687 what's in the argument list) */ 688 n += strlen(p); 689 goto expand; 690 } 691 } else 692 n++; 693 } 694 expand: 695 if (abuffersize > 20) { 696 abuffer = PyMem_Malloc(abuffersize); 697 if (!abuffer) { 698 PyErr_NoMemory(); 699 goto fail; 700 } 701 realbuffer = abuffer; 702 } 703 else 704 realbuffer = buffer; 705 /* step 4: fill the buffer */ 706 /* Since we've analyzed how much space we need for the worst case, 707 we don't have to resize the string. 708 There can be no errors beyond this point. */ 709 string = PyUnicode_FromUnicode(NULL, n); 710 if (!string) 711 goto fail; 712 713 s = PyUnicode_AS_UNICODE(string); 714 callresult = callresults; 715 716 for (f = format; *f; f++) { 717 if (*f == '%') { 718 const char* p = f++; 719 int longflag = 0; 720 int size_tflag = 0; 721 zeropad = (*f == '0'); 722 /* parse the width.precision part */ 723 width = 0; 724 while (isdigit(Py_CHARMASK(*f))) 725 width = (width*10) + *f++ - '0'; 726 precision = 0; 727 if (*f == '.') { 728 f++; 729 while (isdigit(Py_CHARMASK(*f))) 730 precision = (precision*10) + *f++ - '0'; 731 } 732 /* handle the long flag, but only for %ld and %lu. 733 others can be added when necessary. */ 734 if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) { 735 longflag = 1; 736 ++f; 737 } 738 /* handle the size_t flag. 
*/ 739 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) { 740 size_tflag = 1; 741 ++f; 742 } 743 744 switch (*f) { 745 case 'c': 746 *s++ = va_arg(vargs, int); 747 break; 748 case 'd': 749 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd'); 750 if (longflag) 751 sprintf(realbuffer, fmt, va_arg(vargs, long)); 752 else if (size_tflag) 753 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t)); 754 else 755 sprintf(realbuffer, fmt, va_arg(vargs, int)); 756 appendstring(realbuffer); 757 break; 758 case 'u': 759 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u'); 760 if (longflag) 761 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long)); 762 else if (size_tflag) 763 sprintf(realbuffer, fmt, va_arg(vargs, size_t)); 764 else 765 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int)); 766 appendstring(realbuffer); 767 break; 768 case 'i': 769 makefmt(fmt, 0, 0, zeropad, width, precision, 'i'); 770 sprintf(realbuffer, fmt, va_arg(vargs, int)); 771 appendstring(realbuffer); 772 break; 773 case 'x': 774 makefmt(fmt, 0, 0, zeropad, width, precision, 'x'); 775 sprintf(realbuffer, fmt, va_arg(vargs, int)); 776 appendstring(realbuffer); 777 break; 778 case 's': 779 p = va_arg(vargs, char*); 780 appendstring(p); 781 break; 782 case 'U': 783 { 784 PyObject *obj = va_arg(vargs, PyObject *); 785 Py_ssize_t size = PyUnicode_GET_SIZE(obj); 786 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size); 787 s += size; 788 break; 789 } 790 case 'V': 791 { 792 PyObject *obj = va_arg(vargs, PyObject *); 793 const char *str = va_arg(vargs, const char *); 794 if (obj) { 795 Py_ssize_t size = PyUnicode_GET_SIZE(obj); 796 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size); 797 s += size; 798 } else { 799 appendstring(str); 800 } 801 break; 802 } 803 case 'S': 804 case 'R': 805 { 806 Py_UNICODE *ucopy; 807 Py_ssize_t usize; 808 Py_ssize_t upos; 809 /* unused, since we already have the result */ 810 (void) va_arg(vargs, PyObject *); 811 ucopy = 
PyUnicode_AS_UNICODE(*callresult); 812 usize = PyUnicode_GET_SIZE(*callresult); 813 for (upos = 0; upos<usize;) 814 *s++ = ucopy[upos++]; 815 /* We're done with the unicode()/repr() => forget it */ 816 Py_DECREF(*callresult); 817 /* switch to next unicode()/repr() result */ 818 ++callresult; 819 break; 820 } 821 case 'p': 822 sprintf(buffer, "%p", va_arg(vargs, void*)); 823 /* %p is ill-defined: ensure leading 0x. */ 824 if (buffer[1] == 'X') 825 buffer[1] = 'x'; 826 else if (buffer[1] != 'x') { 827 memmove(buffer+2, buffer, strlen(buffer)+1); 828 buffer[0] = '0'; 829 buffer[1] = 'x'; 830 } 831 appendstring(buffer); 832 break; 833 case '%': 834 *s++ = '%'; 835 break; 836 default: 837 appendstring(p); 838 goto end; 839 } 840 } else 841 *s++ = *f; 842 } 843 844 end: 845 if (callresults) 846 PyMem_Free(callresults); 847 if (abuffer) 848 PyMem_Free(abuffer); 849 _PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string)); 850 return string; 851 fail: 852 if (callresults) { 853 PyObject **callresult2 = callresults; 854 while (callresult2 <= callresult) { 855 Py_DECREF(*callresult2); 856 ++callresult2; 857 } 858 PyMem_Free(callresults); 859 } 860 if (abuffer) 861 PyMem_Free(abuffer); 862 return NULL; 863} 864 865#undef appendstring 866 867PyObject * 868PyUnicode_FromFormat(const char *format, ...) 
869{ 870 PyObject* ret; 871 va_list vargs; 872 873#ifdef HAVE_STDARG_PROTOTYPES 874 va_start(vargs, format); 875#else 876 va_start(vargs); 877#endif 878 ret = PyUnicode_FromFormatV(format, vargs); 879 va_end(vargs); 880 return ret; 881} 882 883Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode, 884 wchar_t *w, 885 Py_ssize_t size) 886{ 887 if (unicode == NULL) { 888 PyErr_BadInternalCall(); 889 return -1; 890 } 891 892 /* If possible, try to copy the 0-termination as well */ 893 if (size > PyUnicode_GET_SIZE(unicode)) 894 size = PyUnicode_GET_SIZE(unicode) + 1; 895 896#ifdef HAVE_USABLE_WCHAR_T 897 memcpy(w, unicode->str, size * sizeof(wchar_t)); 898#else 899 { 900 register Py_UNICODE *u; 901 register Py_ssize_t i; 902 u = PyUnicode_AS_UNICODE(unicode); 903 for (i = size; i > 0; i--) 904 *w++ = *u++; 905 } 906#endif 907 908 if (size > PyUnicode_GET_SIZE(unicode)) 909 return PyUnicode_GET_SIZE(unicode); 910 else 911 return size; 912} 913 914#endif 915 916PyObject *PyUnicode_FromOrdinal(int ordinal) 917{ 918 Py_UNICODE s[1]; 919 920#ifdef Py_UNICODE_WIDE 921 if (ordinal < 0 || ordinal > 0x10ffff) { 922 PyErr_SetString(PyExc_ValueError, 923 "chr() arg not in range(0x110000) " 924 "(wide Python build)"); 925 return NULL; 926 } 927#else 928 if (ordinal < 0 || ordinal > 0xffff) { 929 PyErr_SetString(PyExc_ValueError, 930 "chr() arg not in range(0x10000) " 931 "(narrow Python build)"); 932 return NULL; 933 } 934#endif 935 936 s[0] = (Py_UNICODE)ordinal; 937 return PyUnicode_FromUnicode(s, 1); 938} 939 940PyObject *PyUnicode_FromObject(register PyObject *obj) 941{ 942 /* XXX Perhaps we should make this API an alias of 943 PyObject_Unicode() instead ?! */ 944 if (PyUnicode_CheckExact(obj)) { 945 Py_INCREF(obj); 946 return obj; 947 } 948 if (PyUnicode_Check(obj)) { 949 /* For a Unicode subtype that's not a Unicode object, 950 return a true Unicode object with the same data. 
*/ 951 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj), 952 PyUnicode_GET_SIZE(obj)); 953 } 954 return PyUnicode_FromEncodedObject(obj, NULL, "strict"); 955} 956 957PyObject *PyUnicode_FromEncodedObject(register PyObject *obj, 958 const char *encoding, 959 const char *errors) 960{ 961 const char *s = NULL; 962 Py_ssize_t len; 963 PyObject *v; 964 965 if (obj == NULL) { 966 PyErr_BadInternalCall(); 967 return NULL; 968 } 969 970#if 0 971 /* For b/w compatibility we also accept Unicode objects provided 972 that no encodings is given and then redirect to 973 PyObject_Unicode() which then applies the additional logic for 974 Unicode subclasses. 975 976 NOTE: This API should really only be used for object which 977 represent *encoded* Unicode ! 978 979 */ 980 if (PyUnicode_Check(obj)) { 981 if (encoding) { 982 PyErr_SetString(PyExc_TypeError, 983 "decoding Unicode is not supported"); 984 return NULL; 985 } 986 return PyObject_Unicode(obj); 987 } 988#else 989 if (PyUnicode_Check(obj)) { 990 PyErr_SetString(PyExc_TypeError, 991 "decoding Unicode is not supported"); 992 return NULL; 993 } 994#endif 995 996 /* Coerce object */ 997 if (PyString_Check(obj)) { 998 s = PyString_AS_STRING(obj); 999 len = PyString_GET_SIZE(obj); 1000 } 1001 else if (PyObject_AsCharBuffer(obj, &s, &len)) { 1002 /* Overwrite the error message with something more useful in 1003 case of a TypeError. 
*/ 1004 if (PyErr_ExceptionMatches(PyExc_TypeError)) 1005 PyErr_Format(PyExc_TypeError, 1006 "coercing to Unicode: need string or buffer, " 1007 "%.80s found", 1008 obj->ob_type->tp_name); 1009 goto onError; 1010 } 1011 1012 /* Convert to Unicode */ 1013 if (len == 0) { 1014 Py_INCREF(unicode_empty); 1015 v = (PyObject *)unicode_empty; 1016 } 1017 else 1018 v = PyUnicode_Decode(s, len, encoding, errors); 1019 1020 return v; 1021 1022 onError: 1023 return NULL; 1024} 1025 1026PyObject *PyUnicode_Decode(const char *s, 1027 Py_ssize_t size, 1028 const char *encoding, 1029 const char *errors) 1030{ 1031 PyObject *buffer = NULL, *unicode; 1032 1033 if (encoding == NULL) 1034 encoding = PyUnicode_GetDefaultEncoding(); 1035 1036 /* Shortcuts for common default encodings */ 1037 if (strcmp(encoding, "utf-8") == 0) 1038 return PyUnicode_DecodeUTF8(s, size, errors); 1039 else if (strcmp(encoding, "latin-1") == 0) 1040 return PyUnicode_DecodeLatin1(s, size, errors); 1041#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T) 1042 else if (strcmp(encoding, "mbcs") == 0) 1043 return PyUnicode_DecodeMBCS(s, size, errors); 1044#endif 1045 else if (strcmp(encoding, "ascii") == 0) 1046 return PyUnicode_DecodeASCII(s, size, errors); 1047 1048 /* Decode via the codec registry */ 1049 buffer = PyBuffer_FromMemory((void *)s, size); 1050 if (buffer == NULL) 1051 goto onError; 1052 unicode = PyCodec_Decode(buffer, encoding, errors); 1053 if (unicode == NULL) 1054 goto onError; 1055 if (!PyUnicode_Check(unicode)) { 1056 PyErr_Format(PyExc_TypeError, 1057 "decoder did not return an unicode object (type=%.400s)", 1058 unicode->ob_type->tp_name); 1059 Py_DECREF(unicode); 1060 goto onError; 1061 } 1062 Py_DECREF(buffer); 1063 return unicode; 1064 1065 onError: 1066 Py_XDECREF(buffer); 1067 return NULL; 1068} 1069 1070PyObject *PyUnicode_AsDecodedObject(PyObject *unicode, 1071 const char *encoding, 1072 const char *errors) 1073{ 1074 PyObject *v; 1075 1076 if (!PyUnicode_Check(unicode)) { 1077 
PyErr_BadArgument(); 1078 goto onError; 1079 } 1080 1081 if (encoding == NULL) 1082 encoding = PyUnicode_GetDefaultEncoding(); 1083 1084 /* Decode via the codec registry */ 1085 v = PyCodec_Decode(unicode, encoding, errors); 1086 if (v == NULL) 1087 goto onError; 1088 return v; 1089 1090 onError: 1091 return NULL; 1092} 1093 1094PyObject *PyUnicode_Encode(const Py_UNICODE *s, 1095 Py_ssize_t size, 1096 const char *encoding, 1097 const char *errors) 1098{ 1099 PyObject *v, *unicode; 1100 1101 unicode = PyUnicode_FromUnicode(s, size); 1102 if (unicode == NULL) 1103 return NULL; 1104 v = PyUnicode_AsEncodedString(unicode, encoding, errors); 1105 Py_DECREF(unicode); 1106 return v; 1107} 1108 1109PyObject *PyUnicode_AsEncodedObject(PyObject *unicode, 1110 const char *encoding, 1111 const char *errors) 1112{ 1113 PyObject *v; 1114 1115 if (!PyUnicode_Check(unicode)) { 1116 PyErr_BadArgument(); 1117 goto onError; 1118 } 1119 1120 if (encoding == NULL) 1121 encoding = PyUnicode_GetDefaultEncoding(); 1122 1123 /* Encode via the codec registry */ 1124 v = PyCodec_Encode(unicode, encoding, errors); 1125 if (v == NULL) 1126 goto onError; 1127 return v; 1128 1129 onError: 1130 return NULL; 1131} 1132 1133PyObject *PyUnicode_AsEncodedString(PyObject *unicode, 1134 const char *encoding, 1135 const char *errors) 1136{ 1137 PyObject *v; 1138 1139 if (!PyUnicode_Check(unicode)) { 1140 PyErr_BadArgument(); 1141 goto onError; 1142 } 1143 1144 if (encoding == NULL) 1145 encoding = PyUnicode_GetDefaultEncoding(); 1146 1147 /* Shortcuts for common default encodings */ 1148 if (errors == NULL) { 1149 if (strcmp(encoding, "utf-8") == 0) 1150 return PyUnicode_AsUTF8String(unicode); 1151 else if (strcmp(encoding, "latin-1") == 0) 1152 return PyUnicode_AsLatin1String(unicode); 1153#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T) 1154 else if (strcmp(encoding, "mbcs") == 0) 1155 return PyUnicode_AsMBCSString(unicode); 1156#endif 1157 else if (strcmp(encoding, "ascii") == 0) 1158 return 
PyUnicode_AsASCIIString(unicode); 1159 } 1160 1161 /* Encode via the codec registry */ 1162 v = PyCodec_Encode(unicode, encoding, errors); 1163 if (v == NULL) 1164 goto onError; 1165 if (!PyBytes_Check(v)) { 1166 if (PyString_Check(v)) { 1167 /* Old codec, turn it into bytes */ 1168 PyObject *b = PyBytes_FromObject(v); 1169 Py_DECREF(v); 1170 return b; 1171 } 1172 PyErr_Format(PyExc_TypeError, 1173 "encoder did not return a bytes object " 1174 "(type=%.400s, encoding=%.20s, errors=%.20s)", 1175 v->ob_type->tp_name, 1176 encoding ? encoding : "NULL", 1177 errors ? errors : "NULL"); 1178 Py_DECREF(v); 1179 goto onError; 1180 } 1181 return v; 1182 1183 onError: 1184 return NULL; 1185} 1186 1187PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode, 1188 const char *errors) 1189{ 1190 PyObject *v = ((PyUnicodeObject *)unicode)->defenc; 1191 PyObject *b; 1192 if (v) 1193 return v; 1194 if (errors != NULL) 1195 Py_FatalError("non-NULL encoding in _PyUnicode_AsDefaultEncodedString"); 1196 if (errors == NULL) { 1197 b = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode), 1198 PyUnicode_GET_SIZE(unicode), 1199 NULL); 1200 } 1201 else { 1202 b = PyUnicode_AsEncodedString(unicode, NULL, errors); 1203 } 1204 if (!b) 1205 return NULL; 1206 v = PyString_FromStringAndSize(PyBytes_AsString(b), 1207 PyBytes_Size(b)); 1208 Py_DECREF(b); 1209 if (!errors) { 1210 Py_XINCREF(v); 1211 ((PyUnicodeObject *)unicode)->defenc = v; 1212 } 1213 return v; 1214} 1215 1216char* 1217PyUnicode_AsString(PyObject *unicode) 1218{ 1219 assert(PyUnicode_Check(unicode)); 1220 unicode = _PyUnicode_AsDefaultEncodedString(unicode, NULL); 1221 if (!unicode) 1222 return NULL; 1223 return PyString_AsString(unicode); 1224} 1225 1226Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode) 1227{ 1228 if (!PyUnicode_Check(unicode)) { 1229 PyErr_BadArgument(); 1230 goto onError; 1231 } 1232 return PyUnicode_AS_UNICODE(unicode); 1233 1234 onError: 1235 return NULL; 1236} 1237 1238Py_ssize_t 
PyUnicode_GetSize(PyObject *unicode) 1239{ 1240 if (!PyUnicode_Check(unicode)) { 1241 PyErr_BadArgument(); 1242 goto onError; 1243 } 1244 return PyUnicode_GET_SIZE(unicode); 1245 1246 onError: 1247 return -1; 1248} 1249 1250const char *PyUnicode_GetDefaultEncoding(void) 1251{ 1252 return unicode_default_encoding; 1253} 1254 1255int PyUnicode_SetDefaultEncoding(const char *encoding) 1256{ 1257 if (strcmp(encoding, unicode_default_encoding) != 0) { 1258 PyErr_Format(PyExc_ValueError, 1259 "Can only set default encoding to %s", 1260 unicode_default_encoding); 1261 return -1; 1262 } 1263 return 0; 1264} 1265 1266/* error handling callback helper: 1267 build arguments, call the callback and check the arguments, 1268 if no exception occurred, copy the replacement to the output 1269 and adjust various state variables. 1270 return 0 on success, -1 on error 1271*/ 1272 1273static 1274int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler, 1275 const char *encoding, const char *reason, 1276 const char *input, Py_ssize_t insize, Py_ssize_t *startinpos, Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr, 1277 PyObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr) 1278{ 1279 static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple"; 1280 1281 PyObject *restuple = NULL; 1282 PyObject *repunicode = NULL; 1283 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output); 1284 Py_ssize_t requiredsize; 1285 Py_ssize_t newpos; 1286 Py_UNICODE *repptr; 1287 Py_ssize_t repsize; 1288 int res = -1; 1289 1290 if (*errorHandler == NULL) { 1291 *errorHandler = PyCodec_LookupError(errors); 1292 if (*errorHandler == NULL) 1293 goto onError; 1294 } 1295 1296 if (*exceptionObject == NULL) { 1297 *exceptionObject = PyUnicodeDecodeError_Create( 1298 encoding, input, insize, *startinpos, *endinpos, reason); 1299 if (*exceptionObject == NULL) 1300 goto onError; 1301 } 1302 else { 1303 if 
(PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos)) 1304 goto onError; 1305 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos)) 1306 goto onError; 1307 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason)) 1308 goto onError; 1309 } 1310 1311 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL); 1312 if (restuple == NULL) 1313 goto onError; 1314 if (!PyTuple_Check(restuple)) { 1315 PyErr_Format(PyExc_TypeError, &argparse[4]); 1316 goto onError; 1317 } 1318 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos)) 1319 goto onError; 1320 if (newpos<0) 1321 newpos = insize+newpos; 1322 if (newpos<0 || newpos>insize) { 1323 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos); 1324 goto onError; 1325 } 1326 1327 /* need more space? (at least enough for what we 1328 have+the replacement+the rest of the string (starting 1329 at the new input position), so we won't have to check space 1330 when there are no errors in the rest of the string) */ 1331 repptr = PyUnicode_AS_UNICODE(repunicode); 1332 repsize = PyUnicode_GET_SIZE(repunicode); 1333 requiredsize = *outpos + repsize + insize-newpos; 1334 if (requiredsize > outsize) { 1335 if (requiredsize<2*outsize) 1336 requiredsize = 2*outsize; 1337 if (PyUnicode_Resize(output, requiredsize) < 0) 1338 goto onError; 1339 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos; 1340 } 1341 *endinpos = newpos; 1342 *inptr = input + newpos; 1343 Py_UNICODE_COPY(*outptr, repptr, repsize); 1344 *outptr += repsize; 1345 *outpos += repsize; 1346 /* we made it! */ 1347 res = 0; 1348 1349 onError: 1350 Py_XDECREF(restuple); 1351 return res; 1352} 1353 1354/* --- UTF-7 Codec -------------------------------------------------------- */ 1355 1356/* see RFC2152 for details */ 1357 1358static 1359char utf7_special[128] = { 1360 /* indicate whether a UTF-7 character is special i.e. 
cannot be directly 1361 encoded: 1362 0 - not special 1363 1 - special 1364 2 - whitespace (optional) 1365 3 - RFC2152 Set O (optional) */ 1366 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1, 1367 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1368 2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1, 1369 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0, 1370 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1371 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3, 1372 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1373 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1, 1374 1375}; 1376 1377/* Note: The comparison (c) <= 0 is a trick to work-around gcc 1378 warnings about the comparison always being false; since 1379 utf7_special[0] is 1, we can safely make that one comparison 1380 true */ 1381 1382#define SPECIAL(c, encodeO, encodeWS) \ 1383 ((c) > 127 || (c) <= 0 || utf7_special[(c)] == 1 || \ 1384 (encodeWS && (utf7_special[(c)] == 2)) || \ 1385 (encodeO && (utf7_special[(c)] == 3))) 1386 1387#define B64(n) \ 1388 ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f]) 1389#define B64CHAR(c) \ 1390 (isalnum(c) || (c) == '+' || (c) == '/') 1391#define UB64(c) \ 1392 ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ? \ 1393 (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4 ) 1394 1395#define ENCODE(out, ch, bits) \ 1396 while (bits >= 6) { \ 1397 *out++ = B64(ch >> (bits-6)); \ 1398 bits -= 6; \ 1399 } 1400 1401#define DECODE(out, ch, bits, surrogate) \ 1402 while (bits >= 16) { \ 1403 Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff); \ 1404 bits -= 16; \ 1405 if (surrogate) { \ 1406 /* We have already generated an error for the high surrogate \ 1407 so let's not bother seeing if the low surrogate is correct or not */ \ 1408 surrogate = 0; \ 1409 } else if (0xDC00 <= outCh && outCh <= 0xDFFF) { \ 1410 /* This is a surrogate pair. 
Unfortunately we can't represent \ 1411 it in a 16-bit character */ \ 1412 surrogate = 1; \ 1413 errmsg = "code pairs are not supported"; \ 1414 goto utf7Error; \ 1415 } else { \ 1416 *out++ = outCh; \ 1417 } \ 1418 } 1419 1420PyObject *PyUnicode_DecodeUTF7(const char *s, 1421 Py_ssize_t size, 1422 const char *errors) 1423{ 1424 const char *starts = s; 1425 Py_ssize_t startinpos; 1426 Py_ssize_t endinpos; 1427 Py_ssize_t outpos; 1428 const char *e; 1429 PyUnicodeObject *unicode; 1430 Py_UNICODE *p; 1431 const char *errmsg = ""; 1432 int inShift = 0; 1433 unsigned int bitsleft = 0; 1434 unsigned long charsleft = 0; 1435 int surrogate = 0; 1436 PyObject *errorHandler = NULL; 1437 PyObject *exc = NULL; 1438 1439 unicode = _PyUnicode_New(size); 1440 if (!unicode) 1441 return NULL; 1442 if (size == 0) 1443 return (PyObject *)unicode; 1444 1445 p = unicode->str; 1446 e = s + size; 1447 1448 while (s < e) { 1449 Py_UNICODE ch; 1450 restart: 1451 ch = *s; 1452 1453 if (inShift) { 1454 if ((ch == '-') || !B64CHAR(ch)) { 1455 inShift = 0; 1456 s++; 1457 1458 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate); 1459 if (bitsleft >= 6) { 1460 /* The shift sequence has a partial character in it. If 1461 bitsleft < 6 then we could just classify it as padding 1462 but that is not the case here */ 1463 1464 errmsg = "partial character in shift sequence"; 1465 goto utf7Error; 1466 } 1467 /* According to RFC2152 the remaining bits should be zero. We 1468 choose to signal an error/insert a replacement character 1469 here so indicate the potential of a misencoded character. 
*/ 1470 1471 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */ 1472 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) { 1473 errmsg = "non-zero padding bits in shift sequence"; 1474 goto utf7Error; 1475 } 1476 1477 if (ch == '-') { 1478 if ((s < e) && (*(s) == '-')) { 1479 *p++ = '-'; 1480 inShift = 1; 1481 } 1482 } else if (SPECIAL(ch,0,0)) { 1483 errmsg = "unexpected special character"; 1484 goto utf7Error; 1485 } else { 1486 *p++ = ch; 1487 } 1488 } else { 1489 charsleft = (charsleft << 6) | UB64(ch); 1490 bitsleft += 6; 1491 s++; 1492 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate); 1493 } 1494 } 1495 else if ( ch == '+' ) { 1496 startinpos = s-starts; 1497 s++; 1498 if (s < e && *s == '-') { 1499 s++; 1500 *p++ = '+'; 1501 } else 1502 { 1503 inShift = 1; 1504 bitsleft = 0; 1505 } 1506 } 1507 else if (SPECIAL(ch,0,0)) { 1508 errmsg = "unexpected special character"; 1509 s++; 1510 goto utf7Error; 1511 } 1512 else { 1513 *p++ = ch; 1514 s++; 1515 } 1516 continue; 1517 utf7Error: 1518 outpos = p-PyUnicode_AS_UNICODE(unicode); 1519 endinpos = s-starts; 1520 if (unicode_decode_call_errorhandler( 1521 errors, &errorHandler, 1522 "utf7", errmsg, 1523 starts, size, &startinpos, &endinpos, &exc, &s, 1524 (PyObject **)&unicode, &outpos, &p)) 1525 goto onError; 1526 } 1527 1528 if (inShift) { 1529 outpos = p-PyUnicode_AS_UNICODE(unicode); 1530 endinpos = size; 1531 if (unicode_decode_call_errorhandler( 1532 errors, &errorHandler, 1533 "utf7", "unterminated shift sequence", 1534 starts, size, &startinpos, &endinpos, &exc, &s, 1535 (PyObject **)&unicode, &outpos, &p)) 1536 goto onError; 1537 if (s < e) 1538 goto restart; 1539 } 1540 1541 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0) 1542 goto onError; 1543 1544 Py_XDECREF(errorHandler); 1545 Py_XDECREF(exc); 1546 return (PyObject *)unicode; 1547 1548onError: 1549 Py_XDECREF(errorHandler); 1550 Py_XDECREF(exc); 1551 
Py_DECREF(unicode); 1552 return NULL; 1553} 1554 1555 1556PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s, 1557 Py_ssize_t size, 1558 int encodeSetO, 1559 int encodeWhiteSpace, 1560 const char *errors) 1561{ 1562 PyObject *v; 1563 /* It might be possible to tighten this worst case */ 1564 Py_ssize_t cbAllocated = 5 * size; 1565 int inShift = 0; 1566 Py_ssize_t i = 0; 1567 unsigned int bitsleft = 0; 1568 unsigned long charsleft = 0; 1569 char * out; 1570 char * start; 1571 1572 if (size == 0) 1573 return PyBytes_FromStringAndSize(NULL, 0); 1574 1575 v = PyBytes_FromStringAndSize(NULL, cbAllocated); 1576 if (v == NULL) 1577 return NULL; 1578 1579 start = out = PyBytes_AS_STRING(v); 1580 for (;i < size; ++i) { 1581 Py_UNICODE ch = s[i]; 1582 1583 if (!inShift) { 1584 if (ch == '+') { 1585 *out++ = '+'; 1586 *out++ = '-'; 1587 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) { 1588 charsleft = ch; 1589 bitsleft = 16; 1590 *out++ = '+'; 1591 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft); 1592 inShift = bitsleft > 0; 1593 } else { 1594 *out++ = (char) ch; 1595 } 1596 } else { 1597 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) { 1598 *out++ = B64(charsleft << (6-bitsleft)); 1599 charsleft = 0; 1600 bitsleft = 0; 1601 /* Characters not in the BASE64 set implicitly unshift the sequence 1602 so no '-' is required, except if the character is itself a '-' */ 1603 if (B64CHAR(ch) || ch == '-') { 1604 *out++ = '-'; 1605 } 1606 inShift = 0; 1607 *out++ = (char) ch; 1608 } else { 1609 bitsleft += 16; 1610 charsleft = (charsleft << 16) | ch; 1611 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft); 1612 1613 /* If the next character is special then we dont' need to terminate 1614 the shift sequence. If the next character is not a BASE64 character 1615 or '-' then the shift sequence will be terminated implicitly and we 1616 don't have to insert a '-'. 
*/ 1617 1618 if (bitsleft == 0) { 1619 if (i + 1 < size) { 1620 Py_UNICODE ch2 = s[i+1]; 1621 1622 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) { 1623 1624 } else if (B64CHAR(ch2) || ch2 == '-') { 1625 *out++ = '-'; 1626 inShift = 0; 1627 } else { 1628 inShift = 0; 1629 } 1630 1631 } 1632 else { 1633 *out++ = '-'; 1634 inShift = 0; 1635 } 1636 } 1637 } 1638 } 1639 } 1640 if (bitsleft) { 1641 *out++= B64(charsleft << (6-bitsleft) ); 1642 *out++ = '-'; 1643 } 1644 1645 if (PyBytes_Resize(v, out - start)) { 1646 Py_DECREF(v); 1647 return NULL; 1648 } 1649 return v; 1650} 1651 1652#undef SPECIAL 1653#undef B64 1654#undef B64CHAR 1655#undef UB64 1656#undef ENCODE 1657#undef DECODE 1658 1659/* --- UTF-8 Codec -------------------------------------------------------- */ 1660 1661static 1662char utf8_code_length[256] = { 1663 /* Map UTF-8 encoded prefix byte to sequence length. zero means 1664 illegal prefix. see RFC 2279 for details */ 1665 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1666 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1667 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1668 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1669 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1670 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1671 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1672 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1673 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1674 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1675 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1676 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1677 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1678 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1679 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1680 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0 1681}; 1682 1683PyObject *PyUnicode_DecodeUTF8(const char *s, 1684 Py_ssize_t size, 1685 const char *errors) 1686{ 1687 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL); 1688} 1689 1690PyObject 
*PyUnicode_DecodeUTF8Stateful(const char *s, 1691 Py_ssize_t size, 1692 const char *errors, 1693 Py_ssize_t *consumed) 1694{ 1695 const char *starts = s; 1696 int n; 1697 Py_ssize_t startinpos; 1698 Py_ssize_t endinpos; 1699 Py_ssize_t outpos; 1700 const char *e; 1701 PyUnicodeObject *unicode; 1702 Py_UNICODE *p; 1703 const char *errmsg = ""; 1704 PyObject *errorHandler = NULL; 1705 PyObject *exc = NULL; 1706 1707 /* Note: size will always be longer than the resulting Unicode 1708 character count */ 1709 unicode = _PyUnicode_New(size); 1710 if (!unicode) 1711 return NULL; 1712 if (size == 0) { 1713 if (consumed) 1714 *consumed = 0; 1715 return (PyObject *)unicode; 1716 } 1717 1718 /* Unpack UTF-8 encoded data */ 1719 p = unicode->str; 1720 e = s + size; 1721 1722 while (s < e) { 1723 Py_UCS4 ch = (unsigned char)*s; 1724 1725 if (ch < 0x80) { 1726 *p++ = (Py_UNICODE)ch; 1727 s++; 1728 continue; 1729 } 1730 1731 n = utf8_code_length[ch]; 1732 1733 if (s + n > e) { 1734 if (consumed) 1735 break; 1736 else { 1737 errmsg = "unexpected end of data"; 1738 startinpos = s-starts; 1739 endinpos = size; 1740 goto utf8Error; 1741 } 1742 } 1743 1744 switch (n) { 1745 1746 case 0: 1747 errmsg = "unexpected code byte"; 1748 startinpos = s-starts; 1749 endinpos = startinpos+1; 1750 goto utf8Error; 1751 1752 case 1: 1753 errmsg = "internal error"; 1754 startinpos = s-starts; 1755 endinpos = startinpos+1; 1756 goto utf8Error; 1757 1758 case 2: 1759 if ((s[1] & 0xc0) != 0x80) { 1760 errmsg = "invalid data"; 1761 startinpos = s-starts; 1762 endinpos = startinpos+2; 1763 goto utf8Error; 1764 } 1765 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f); 1766 if (ch < 0x80) { 1767 startinpos = s-starts; 1768 endinpos = startinpos+2; 1769 errmsg = "illegal encoding"; 1770 goto utf8Error; 1771 } 1772 else 1773 *p++ = (Py_UNICODE)ch; 1774 break; 1775 1776 case 3: 1777 if ((s[1] & 0xc0) != 0x80 || 1778 (s[2] & 0xc0) != 0x80) { 1779 errmsg = "invalid data"; 1780 startinpos = s-starts; 1781 endinpos = 
startinpos+3; 1782 goto utf8Error; 1783 } 1784 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f); 1785 if (ch < 0x0800) { 1786 /* Note: UTF-8 encodings of surrogates are considered 1787 legal UTF-8 sequences; 1788 1789 XXX For wide builds (UCS-4) we should probably try 1790 to recombine the surrogates into a single code 1791 unit. 1792 */ 1793 errmsg = "illegal encoding"; 1794 startinpos = s-starts; 1795 endinpos = startinpos+3; 1796 goto utf8Error; 1797 } 1798 else 1799 *p++ = (Py_UNICODE)ch; 1800 break; 1801 1802 case 4: 1803 if ((s[1] & 0xc0) != 0x80 || 1804 (s[2] & 0xc0) != 0x80 || 1805 (s[3] & 0xc0) != 0x80) { 1806 errmsg = "invalid data"; 1807 startinpos = s-starts; 1808 endinpos = startinpos+4; 1809 goto utf8Error; 1810 } 1811 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) + 1812 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f); 1813 /* validate and convert to UTF-16 */ 1814 if ((ch < 0x10000) /* minimum value allowed for 4 1815 byte encoding */ 1816 || (ch > 0x10ffff)) /* maximum value allowed for 1817 UTF-16 */ 1818 { 1819 errmsg = "illegal encoding"; 1820 startinpos = s-starts; 1821 endinpos = startinpos+4; 1822 goto utf8Error; 1823 } 1824#ifdef Py_UNICODE_WIDE 1825 *p++ = (Py_UNICODE)ch; 1826#else 1827 /* compute and append the two surrogates: */ 1828 1829 /* translate from 10000..10FFFF to 0..FFFF */ 1830 ch -= 0x10000; 1831 1832 /* high surrogate = top 10 bits added to D800 */ 1833 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10)); 1834 1835 /* low surrogate = bottom 10 bits added to DC00 */ 1836 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF)); 1837#endif 1838 break; 1839 1840 default: 1841 /* Other sizes are only needed for UCS-4 */ 1842 errmsg = "unsupported Unicode code range"; 1843 startinpos = s-starts; 1844 endinpos = startinpos+n; 1845 goto utf8Error; 1846 } 1847 s += n; 1848 continue; 1849 1850 utf8Error: 1851 outpos = p-PyUnicode_AS_UNICODE(unicode); 1852 if (unicode_decode_call_errorhandler( 1853 errors, &errorHandler, 1854 "utf8", errmsg, 1855 
starts, size, &startinpos, &endinpos, &exc, &s, 1856 (PyObject **)&unicode, &outpos, &p)) 1857 goto onError; 1858 } 1859 if (consumed) 1860 *consumed = s-starts; 1861 1862 /* Adjust length */ 1863 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0) 1864 goto onError; 1865 1866 Py_XDECREF(errorHandler); 1867 Py_XDECREF(exc); 1868 return (PyObject *)unicode; 1869 1870onError: 1871 Py_XDECREF(errorHandler); 1872 Py_XDECREF(exc); 1873 Py_DECREF(unicode); 1874 return NULL; 1875} 1876 1877/* Allocation strategy: if the string is short, convert into a stack buffer 1878 and allocate exactly as much space needed at the end. Else allocate the 1879 maximum possible needed (4 result bytes per Unicode character), and return 1880 the excess memory at the end. 1881*/ 1882PyObject * 1883PyUnicode_EncodeUTF8(const Py_UNICODE *s, 1884 Py_ssize_t size, 1885 const char *errors) 1886{ 1887#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */ 1888 1889 Py_ssize_t i; /* index into s of next input byte */ 1890 PyObject *v; /* result string object */ 1891 char *p; /* next free byte in output buffer */ 1892 Py_ssize_t nallocated; /* number of result bytes allocated */ 1893 Py_ssize_t nneeded; /* number of result bytes needed */ 1894 char stackbuf[MAX_SHORT_UNICHARS * 4]; 1895 1896 assert(s != NULL); 1897 assert(size >= 0); 1898 1899 if (size <= MAX_SHORT_UNICHARS) { 1900 /* Write into the stack buffer; nallocated can't overflow. 1901 * At the end, we'll allocate exactly as much heap space as it 1902 * turns out we need. 1903 */ 1904 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int); 1905 v = NULL; /* will allocate after we're done */ 1906 p = stackbuf; 1907 } 1908 else { 1909 /* Overallocate on the heap, and give the excess back at the end. */ 1910 nallocated = size * 4; 1911 if (nallocated / 4 != size) /* overflow! 
*/ 1912 return PyErr_NoMemory(); 1913 v = PyBytes_FromStringAndSize(NULL, nallocated); 1914 if (v == NULL) 1915 return NULL; 1916 p = PyBytes_AS_STRING(v); 1917 } 1918 1919 for (i = 0; i < size;) { 1920 Py_UCS4 ch = s[i++]; 1921 1922 if (ch < 0x80) 1923 /* Encode ASCII */ 1924 *p++ = (char) ch; 1925 1926 else if (ch < 0x0800) { 1927 /* Encode Latin-1 */ 1928 *p++ = (char)(0xc0 | (ch >> 6)); 1929 *p++ = (char)(0x80 | (ch & 0x3f)); 1930 } 1931 else { 1932 /* Encode UCS2 Unicode ordinals */ 1933 if (ch < 0x10000) { 1934 /* Special case: check for high surrogate */ 1935 if (0xD800 <= ch && ch <= 0xDBFF && i != size) { 1936 Py_UCS4 ch2 = s[i]; 1937 /* Check for low surrogate and combine the two to 1938 form a UCS4 value */ 1939 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) { 1940 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000; 1941 i++; 1942 goto encodeUCS4; 1943 } 1944 /* Fall through: handles isolated high surrogates */ 1945 } 1946 *p++ = (char)(0xe0 | (ch >> 12)); 1947 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); 1948 *p++ = (char)(0x80 | (ch & 0x3f)); 1949 continue; 1950 } 1951encodeUCS4: 1952 /* Encode UCS4 Unicode ordinals */ 1953 *p++ = (char)(0xf0 | (ch >> 18)); 1954 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f)); 1955 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); 1956 *p++ = (char)(0x80 | (ch & 0x3f)); 1957 } 1958 } 1959 1960 if (v == NULL) { 1961 /* This was stack allocated. */ 1962 nneeded = p - stackbuf; 1963 assert(nneeded <= nallocated); 1964 v = PyBytes_FromStringAndSize(stackbuf, nneeded); 1965 } 1966 else { 1967 /* Cut back to size actually needed. 
*/ 1968 nneeded = p - PyBytes_AS_STRING(v); 1969 assert(nneeded <= nallocated); 1970 PyBytes_Resize(v, nneeded); 1971 } 1972 return v; 1973 1974#undef MAX_SHORT_UNICHARS 1975} 1976 1977PyObject *PyUnicode_AsUTF8String(PyObject *unicode) 1978{ 1979 if (!PyUnicode_Check(unicode)) { 1980 PyErr_BadArgument(); 1981 return NULL; 1982 } 1983 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode), 1984 PyUnicode_GET_SIZE(unicode), 1985 NULL); 1986} 1987 1988/* --- UTF-16 Codec ------------------------------------------------------- */ 1989 1990PyObject * 1991PyUnicode_DecodeUTF16(const char *s, 1992 Py_ssize_t size, 1993 const char *errors, 1994 int *byteorder) 1995{ 1996 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL); 1997} 1998 1999PyObject * 2000PyUnicode_DecodeUTF16Stateful(const char *s, 2001 Py_ssize_t size, 2002 const char *errors, 2003 int *byteorder, 2004 Py_ssize_t *consumed) 2005{ 2006 const char *starts = s; 2007 Py_ssize_t startinpos; 2008 Py_ssize_t endinpos; 2009 Py_ssize_t outpos; 2010 PyUnicodeObject *unicode; 2011 Py_UNICODE *p; 2012 const unsigned char *q, *e; 2013 int bo = 0; /* assume native ordering by default */ 2014 const char *errmsg = ""; 2015 /* Offsets from q for retrieving byte pairs in the right order. */ 2016#ifdef BYTEORDER_IS_LITTLE_ENDIAN 2017 int ihi = 1, ilo = 0; 2018#else 2019 int ihi = 0, ilo = 1; 2020#endif 2021 PyObject *errorHandler = NULL; 2022 PyObject *exc = NULL; 2023 2024 /* Note: size will always be longer than the resulting Unicode 2025 character count */ 2026 unicode = _PyUnicode_New(size); 2027 if (!unicode) 2028 return NULL; 2029 if (size == 0) 2030 return (PyObject *)unicode; 2031 2032 /* Unpack UTF-16 encoded data */ 2033 p = unicode->str; 2034 q = (unsigned char *)s; 2035 e = q + size; 2036 2037 if (byteorder) 2038 bo = *byteorder; 2039 2040 /* Check for BOM marks (U+FEFF) in the input and adjust current 2041 byte order setting accordingly. 
In native mode, the leading BOM 2042 mark is skipped, in all other modes, it is copied to the output 2043 stream as-is (giving a ZWNBSP character). */ 2044 if (bo == 0) { 2045 if (size >= 2) { 2046 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo]; 2047#ifdef BYTEORDER_IS_LITTLE_ENDIAN 2048 if (bom == 0xFEFF) { 2049 q += 2; 2050 bo = -1; 2051 } 2052 else if (bom == 0xFFFE) { 2053 q += 2; 2054 bo = 1; 2055 } 2056#else 2057 if (bom == 0xFEFF) { 2058 q += 2; 2059 bo = 1; 2060 } 2061 else if (bom == 0xFFFE) { 2062 q += 2; 2063 bo = -1; 2064 } 2065#endif 2066 } 2067 } 2068 2069 if (bo == -1) { 2070 /* force LE */ 2071 ihi = 1; 2072 ilo = 0; 2073 } 2074 else if (bo == 1) { 2075 /* force BE */ 2076 ihi = 0; 2077 ilo = 1; 2078 } 2079 2080 while (q < e) { 2081 Py_UNICODE ch; 2082 /* remaining bytes at the end? (size should be even) */ 2083 if (e-q<2) { 2084 if (consumed) 2085 break; 2086 errmsg = "truncated data"; 2087 startinpos = ((const char *)q)-starts; 2088 endinpos = ((const char *)e)-starts; 2089 goto utf16Error; 2090 /* The remaining input chars are ignored if the callback 2091 chooses to skip the input */ 2092 } 2093 ch = (q[ihi] << 8) | q[ilo]; 2094 2095 q += 2; 2096 2097 if (ch < 0xD800 || ch > 0xDFFF) { 2098 *p++ = ch; 2099 continue; 2100 } 2101 2102 /* UTF-16 code pair: */ 2103 if (q >= e) { 2104 errmsg = "unexpected end of data"; 2105 startinpos = (((const char *)q)-2)-starts; 2106 endinpos = ((const char *)e)-starts; 2107 goto utf16Error; 2108 } 2109 if (0xD800 <= ch && ch <= 0xDBFF) { 2110 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo]; 2111 q += 2; 2112 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) { 2113#ifndef Py_UNICODE_WIDE 2114 *p++ = ch; 2115 *p++ = ch2; 2116#else 2117 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000; 2118#endif 2119 continue; 2120 } 2121 else { 2122 errmsg = "illegal UTF-16 surrogate"; 2123 startinpos = (((const char *)q)-4)-starts; 2124 endinpos = startinpos+2; 2125 goto utf16Error; 2126 } 2127 2128 } 2129 errmsg = "illegal encoding"; 2130 
startinpos = (((const char *)q)-2)-starts; 2131 endinpos = startinpos+2; 2132 /* Fall through to report the error */ 2133 2134 utf16Error: 2135 outpos = p-PyUnicode_AS_UNICODE(unicode); 2136 if (unicode_decode_call_errorhandler( 2137 errors, &errorHandler, 2138 "utf16", errmsg, 2139 starts, size, &startinpos, &endinpos, &exc, (const char **)&q, 2140 (PyObject **)&unicode, &outpos, &p)) 2141 goto onError; 2142 } 2143 2144 if (byteorder) 2145 *byteorder = bo; 2146 2147 if (consumed) 2148 *consumed = (const char *)q-starts; 2149 2150 /* Adjust length */ 2151 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0) 2152 goto onError; 2153 2154 Py_XDECREF(errorHandler); 2155 Py_XDECREF(exc); 2156 return (PyObject *)unicode; 2157 2158onError: 2159 Py_DECREF(unicode); 2160 Py_XDECREF(errorHandler); 2161 Py_XDECREF(exc); 2162 return NULL; 2163} 2164 2165PyObject * 2166PyUnicode_EncodeUTF16(const Py_UNICODE *s, 2167 Py_ssize_t size, 2168 const char *errors, 2169 int byteorder) 2170{ 2171 PyObject *v; 2172 unsigned char *p; 2173#ifdef Py_UNICODE_WIDE 2174 int i, pairs; 2175#else 2176 const int pairs = 0; 2177#endif 2178 /* Offsets from p for storing byte pairs in the right order. 
*/ 2179#ifdef BYTEORDER_IS_LITTLE_ENDIAN 2180 int ihi = 1, ilo = 0; 2181#else 2182 int ihi = 0, ilo = 1; 2183#endif 2184 2185#define STORECHAR(CH) \ 2186 do { \ 2187 p[ihi] = ((CH) >> 8) & 0xff; \ 2188 p[ilo] = (CH) & 0xff; \ 2189 p += 2; \ 2190 } while(0) 2191 2192#ifdef Py_UNICODE_WIDE 2193 for (i = pairs = 0; i < size; i++) 2194 if (s[i] >= 0x10000) 2195 pairs++; 2196#endif 2197 v = PyBytes_FromStringAndSize(NULL, 2198 2 * (size + pairs + (byteorder == 0))); 2199 if (v == NULL) 2200 return NULL; 2201 2202 p = (unsigned char *)PyBytes_AS_STRING(v); 2203 if (byteorder == 0) 2204 STORECHAR(0xFEFF); 2205 if (size == 0) 2206 return v; 2207 2208 if (byteorder == -1) { 2209 /* force LE */ 2210 ihi = 1; 2211 ilo = 0; 2212 } 2213 else if (byteorder == 1) { 2214 /* force BE */ 2215 ihi = 0; 2216 ilo = 1; 2217 } 2218 2219 while (size-- > 0) { 2220 Py_UNICODE ch = *s++; 2221 Py_UNICODE ch2 = 0; 2222#ifdef Py_UNICODE_WIDE 2223 if (ch >= 0x10000) { 2224 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF); 2225 ch = 0xD800 | ((ch-0x10000) >> 10); 2226 } 2227#endif 2228 STORECHAR(ch); 2229 if (ch2) 2230 STORECHAR(ch2); 2231 } 2232 return v; 2233#undef STORECHAR 2234} 2235 2236PyObject *PyUnicode_AsUTF16String(PyObject *unicode) 2237{ 2238 if (!PyUnicode_Check(unicode)) { 2239 PyErr_BadArgument(); 2240 return NULL; 2241 } 2242 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode), 2243 PyUnicode_GET_SIZE(unicode), 2244 NULL, 2245 0); 2246} 2247 2248/* --- Unicode Escape Codec ----------------------------------------------- */ 2249 2250static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL; 2251 2252PyObject *PyUnicode_DecodeUnicodeEscape(const char *s, 2253 Py_ssize_t size, 2254 const char *errors) 2255{ 2256 const char *starts = s; 2257 Py_ssize_t startinpos; 2258 Py_ssize_t endinpos; 2259 Py_ssize_t outpos; 2260 int i; 2261 PyUnicodeObject *v; 2262 Py_UNICODE *p; 2263 const char *end; 2264 char* message; 2265 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */ 2266 PyObject 
*errorHandler = NULL; 2267 PyObject *exc = NULL; 2268 2269 /* Escaped strings will always be longer than the resulting 2270 Unicode string, so we start with size here and then reduce the 2271 length after conversion to the true value. 2272 (but if the error callback returns a long replacement string 2273 we'll have to allocate more space) */ 2274 v = _PyUnicode_New(size); 2275 if (v == NULL) 2276 goto onError; 2277 if (size == 0) 2278 return (PyObject *)v; 2279 2280 p = PyUnicode_AS_UNICODE(v); 2281 end = s + size; 2282 2283 while (s < end) { 2284 unsigned char c; 2285 Py_UNICODE x; 2286 int digits; 2287 2288 /* Non-escape characters are interpreted as Unicode ordinals */ 2289 if (*s != '\\') { 2290 *p++ = (unsigned char) *s++; 2291 continue; 2292 } 2293 2294 startinpos = s-starts; 2295 /* \ - Escapes */ 2296 s++; 2297 switch (*s++) { 2298 2299 /* \x escapes */ 2300 case '\n': break; 2301 case '\\': *p++ = '\\'; break; 2302 case '\'': *p++ = '\''; break; 2303 case '\"': *p++ = '\"'; break; 2304 case 'b': *p++ = '\b'; break; 2305 case 'f': *p++ = '\014'; break; /* FF */ 2306 case 't': *p++ = '\t'; break; 2307 case 'n': *p++ = '\n'; break; 2308 case 'r': *p++ = '\r'; break; 2309 case 'v': *p++ = '\013'; break; /* VT */ 2310 case 'a': *p++ = '\007'; break; /* BEL, not classic C */ 2311 2312 /* \OOO (octal) escapes */ 2313 case '0': case '1': case '2': case '3': 2314 case '4': case '5': case '6': case '7': 2315 x = s[-1] - '0'; 2316 if ('0' <= *s && *s <= '7') { 2317 x = (x<<3) + *s++ - '0'; 2318 if ('0' <= *s && *s <= '7') 2319 x = (x<<3) + *s++ - '0'; 2320 } 2321 *p++ = x; 2322 break; 2323 2324 /* hex escapes */ 2325 /* \xXX */ 2326 case 'x': 2327 digits = 2; 2328 message = "truncated \\xXX escape"; 2329 goto hexescape; 2330 2331 /* \uXXXX */ 2332 case 'u': 2333 digits = 4; 2334 message = "truncated \\uXXXX escape"; 2335 goto hexescape; 2336 2337 /* \UXXXXXXXX */ 2338 case 'U': 2339 digits = 8; 2340 message = "truncated \\UXXXXXXXX escape"; 2341 hexescape: 2342 chr = 
0; 2343 outpos = p-PyUnicode_AS_UNICODE(v); 2344 if (s+digits>end) { 2345 endinpos = size; 2346 if (unicode_decode_call_errorhandler( 2347 errors, &errorHandler, 2348 "unicodeescape", "end of string in escape sequence", 2349 starts, size, &startinpos, &endinpos, &exc, &s, 2350 (PyObject **)&v, &outpos, &p)) 2351 goto onError; 2352 goto nextByte; 2353 } 2354 for (i = 0; i < digits; ++i) { 2355 c = (unsigned char) s[i]; 2356 if (!isxdigit(c)) { 2357 endinpos = (s+i+1)-starts; 2358 if (unicode_decode_call_errorhandler( 2359 errors, &errorHandler, 2360 "unicodeescape", message, 2361 starts, size, &startinpos, &endinpos, &exc, &s, 2362 (PyObject **)&v, &outpos, &p)) 2363 goto onError; 2364 goto nextByte; 2365 } 2366 chr = (chr<<4) & ~0xF; 2367 if (c >= '0' && c <= '9') 2368 chr += c - '0'; 2369 else if (c >= 'a' && c <= 'f') 2370 chr += 10 + c - 'a'; 2371 else 2372 chr += 10 + c - 'A'; 2373 } 2374 s += i; 2375 if (chr == 0xffffffff && PyErr_Occurred()) 2376 /* _decoding_error will have already written into the 2377 target buffer. */ 2378 break; 2379 store: 2380 /* when we get here, chr is a 32-bit unicode character */ 2381 if (chr <= 0xffff) 2382 /* UCS-2 character */ 2383 *p++ = (Py_UNICODE) chr; 2384 else if (chr <= 0x10ffff) { 2385 /* UCS-4 character. Either store directly, or as 2386 surrogate pair. 
*/ 2387#ifdef Py_UNICODE_WIDE 2388 *p++ = chr; 2389#else 2390 chr -= 0x10000L; 2391 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10); 2392 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF); 2393#endif 2394 } else { 2395 endinpos = s-starts; 2396 outpos = p-PyUnicode_AS_UNICODE(v); 2397 if (unicode_decode_call_errorhandler( 2398 errors, &errorHandler, 2399 "unicodeescape", "illegal Unicode character", 2400 starts, size, &startinpos, &endinpos, &exc, &s, 2401 (PyObject **)&v, &outpos, &p)) 2402 goto onError; 2403 } 2404 break; 2405 2406 /* \N{name} */ 2407 case 'N': 2408 message = "malformed \\N character escape"; 2409 if (ucnhash_CAPI == NULL) { 2410 /* load the unicode data module */ 2411 PyObject *m, *api; 2412 m = PyImport_ImportModule("unicodedata"); 2413 if (m == NULL) 2414 goto ucnhashError; 2415 api = PyObject_GetAttrString(m, "ucnhash_CAPI"); 2416 Py_DECREF(m); 2417 if (api == NULL) 2418 goto ucnhashError; 2419 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api); 2420 Py_DECREF(api); 2421 if (ucnhash_CAPI == NULL) 2422 goto ucnhashError; 2423 } 2424 if (*s == '{') { 2425 const char *start = s+1; 2426 /* look for the closing brace */ 2427 while (*s != '}' && s < end) 2428 s++; 2429 if (s > start && s < end && *s == '}') { 2430 /* found a name. 
look it up in the unicode database */ 2431 message = "unknown Unicode character name"; 2432 s++; 2433 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr)) 2434 goto store; 2435 } 2436 } 2437 endinpos = s-starts; 2438 outpos = p-PyUnicode_AS_UNICODE(v); 2439 if (unicode_decode_call_errorhandler( 2440 errors, &errorHandler, 2441 "unicodeescape", message, 2442 starts, size, &startinpos, &endinpos, &exc, &s, 2443 (PyObject **)&v, &outpos, &p)) 2444 goto onError; 2445 break; 2446 2447 default: 2448 if (s > end) { 2449 message = "\\ at end of string"; 2450 s--; 2451 endinpos = s-starts; 2452 outpos = p-PyUnicode_AS_UNICODE(v); 2453 if (unicode_decode_call_errorhandler( 2454 errors, &errorHandler, 2455 "unicodeescape", message, 2456 starts, size, &startinpos, &endinpos, &exc, &s, 2457 (PyObject **)&v, &outpos, &p)) 2458 goto onError; 2459 } 2460 else { 2461 *p++ = '\\'; 2462 *p++ = (unsigned char)s[-1]; 2463 } 2464 break; 2465 } 2466 nextByte: 2467 ; 2468 } 2469 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0) 2470 goto onError; 2471 Py_XDECREF(errorHandler); 2472 Py_XDECREF(exc); 2473 return (PyObject *)v; 2474 2475ucnhashError: 2476 PyErr_SetString( 2477 PyExc_UnicodeError, 2478 "\\N escapes not supported (can't load unicodedata module)" 2479 ); 2480 Py_XDECREF(v); 2481 Py_XDECREF(errorHandler); 2482 Py_XDECREF(exc); 2483 return NULL; 2484 2485onError: 2486 Py_XDECREF(v); 2487 Py_XDECREF(errorHandler); 2488 Py_XDECREF(exc); 2489 return NULL; 2490} 2491 2492/* Return a Unicode-Escape string version of the Unicode object. 2493 2494 If quotes is true, the string is enclosed in u"" or u'' quotes as 2495 appropriate. 
2496 2497*/ 2498 2499Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s, 2500 Py_ssize_t size, 2501 Py_UNICODE ch) 2502{ 2503 /* like wcschr, but doesn't stop at NULL characters */ 2504 2505 while (size-- > 0) { 2506 if (*s == ch) 2507 return s; 2508 s++; 2509 } 2510 2511 return NULL; 2512} 2513 2514static const char *hexdigits = "0123456789abcdef"; 2515 2516PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s, 2517 Py_ssize_t size) 2518{ 2519 PyObject *repr; 2520 char *p; 2521 2522 /* XXX(nnorwitz): rather than over-allocating, it would be 2523 better to choose a different scheme. Perhaps scan the 2524 first N-chars of the string and allocate based on that size. 2525 */ 2526 /* Initial allocation is based on the longest-possible unichr 2527 escape. 2528 2529 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source 2530 unichr, so in this case it's the longest unichr escape. In 2531 narrow (UTF-16) builds this is five chars per source unichr 2532 since there are two unichrs in the surrogate pair, so in narrow 2533 (UTF-16) builds it's not the longest unichr escape. 2534 2535 In wide or narrow builds '\uxxxx' is 6 chars per source unichr, 2536 so in the narrow (UTF-16) build case it's the longest unichr 2537 escape. 
2538 */ 2539 2540 repr = PyBytes_FromStringAndSize(NULL, 2541#ifdef Py_UNICODE_WIDE 2542 + 10*size 2543#else 2544 + 6*size 2545#endif 2546 + 1); 2547 if (repr == NULL) 2548 return NULL; 2549 2550 p = PyBytes_AS_STRING(repr); 2551 2552 while (size-- > 0) { 2553 Py_UNICODE ch = *s++; 2554 2555 /* Escape backslashes */ 2556 if (ch == '\\') { 2557 *p++ = '\\'; 2558 *p++ = (char) ch; 2559 continue; 2560 } 2561 2562#ifdef Py_UNICODE_WIDE 2563 /* Map 21-bit characters to '\U00xxxxxx' */ 2564 else if (ch >= 0x10000) { 2565 *p++ = '\\'; 2566 *p++ = 'U'; 2567 *p++ = hexdigits[(ch >> 28) & 0x0000000F]; 2568 *p++ = hexdigits[(ch >> 24) & 0x0000000F]; 2569 *p++ = hexdigits[(ch >> 20) & 0x0000000F]; 2570 *p++ = hexdigits[(ch >> 16) & 0x0000000F]; 2571 *p++ = hexdigits[(ch >> 12) & 0x0000000F]; 2572 *p++ = hexdigits[(ch >> 8) & 0x0000000F]; 2573 *p++ = hexdigits[(ch >> 4) & 0x0000000F]; 2574 *p++ = hexdigits[ch & 0x0000000F]; 2575 continue; 2576 } 2577#else 2578 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */ 2579 else if (ch >= 0xD800 && ch < 0xDC00) { 2580 Py_UNICODE ch2; 2581 Py_UCS4 ucs; 2582 2583 ch2 = *s++; 2584 size--; 2585 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) { 2586 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000; 2587 *p++ = '\\'; 2588 *p++ = 'U'; 2589 *p++ = hexdigits[(ucs >> 28) & 0x0000000F]; 2590 *p++ = hexdigits[(ucs >> 24) & 0x0000000F]; 2591 *p++ = hexdigits[(ucs >> 20) & 0x0000000F]; 2592 *p++ = hexdigits[(ucs >> 16) & 0x0000000F]; 2593 *p++ = hexdigits[(ucs >> 12) & 0x0000000F]; 2594 *p++ = hexdigits[(ucs >> 8) & 0x0000000F]; 2595 *p++ = hexdigits[(ucs >> 4) & 0x0000000F]; 2596 *p++ = hexdigits[ucs & 0x0000000F]; 2597 continue; 2598 } 2599 /* Fall through: isolated surrogates are copied as-is */ 2600 s--; 2601 size++; 2602 } 2603#endif 2604 2605 /* Map 16-bit characters to '\uxxxx' */ 2606 if (ch >= 256) { 2607 *p++ = '\\'; 2608 *p++ = 'u'; 2609 *p++ = hexdigits[(ch >> 12) & 0x000F]; 2610 *p++ = hexdigits[(ch >> 8) & 0x000F]; 2611 *p++ = 
hexdigits[(ch >> 4) & 0x000F]; 2612 *p++ = hexdigits[ch & 0x000F]; 2613 } 2614 2615 /* Map special whitespace to '\t', \n', '\r' */ 2616 else if (ch == '\t') { 2617 *p++ = '\\'; 2618 *p++ = 't'; 2619 } 2620 else if (ch == '\n') { 2621 *p++ = '\\'; 2622 *p++ = 'n'; 2623 } 2624 else if (ch == '\r') { 2625 *p++ = '\\'; 2626 *p++ = 'r'; 2627 } 2628 2629 /* Map non-printable US ASCII to '\xhh' */ 2630 else if (ch < ' ' || ch >= 0x7F) { 2631 *p++ = '\\'; 2632 *p++ = 'x'; 2633 *p++ = hexdigits[(ch >> 4) & 0x000F]; 2634 *p++ = hexdigits[ch & 0x000F]; 2635 } 2636 2637 /* Copy everything else as-is */ 2638 else 2639 *p++ = (char) ch; 2640 } 2641 2642 *p = '\0'; 2643 if (PyBytes_Resize(repr, p - PyBytes_AS_STRING(repr))) { 2644 Py_DECREF(repr); 2645 return NULL; 2646 } 2647 return repr; 2648} 2649 2650PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode) 2651{ 2652 PyObject *s, *result; 2653 if (!PyUnicode_Check(unicode)) { 2654 PyErr_BadArgument(); 2655 return NULL; 2656 } 2657 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode), 2658 PyUnicode_GET_SIZE(unicode)); 2659 2660 if (!s) 2661 return NULL; 2662 result = PyString_FromStringAndSize(PyBytes_AS_STRING(s), 2663 PyBytes_GET_SIZE(s)); 2664 Py_DECREF(s); 2665 return result; 2666} 2667 2668/* --- Raw Unicode Escape Codec ------------------------------------------- */ 2669 2670PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s, 2671 Py_ssize_t size, 2672 const char *errors) 2673{ 2674 const char *starts = s; 2675 Py_ssize_t startinpos; 2676 Py_ssize_t endinpos; 2677 Py_ssize_t outpos; 2678 PyUnicodeObject *v; 2679 Py_UNICODE *p; 2680 const char *end; 2681 const char *bs; 2682 PyObject *errorHandler = NULL; 2683 PyObject *exc = NULL; 2684 2685 /* Escaped strings will always be longer than the resulting 2686 Unicode string, so we start with size here and then reduce the 2687 length after conversion to the true value. 
(But decoding error 2688 handler might have to resize the string) */ 2689 v = _PyUnicode_New(size); 2690 if (v == NULL) 2691 goto onError; 2692 if (size == 0) 2693 return (PyObject *)v; 2694 p = PyUnicode_AS_UNICODE(v); 2695 end = s + size; 2696 while (s < end) { 2697 unsigned char c; 2698 Py_UCS4 x; 2699 int i; 2700 int count; 2701 2702 /* Non-escape characters are interpreted as Unicode ordinals */ 2703 if (*s != '\\') { 2704 *p++ = (unsigned char)*s++; 2705 continue; 2706 } 2707 startinpos = s-starts; 2708 2709 /* \u-escapes are only interpreted iff the number of leading 2710 backslashes if odd */ 2711 bs = s; 2712 for (;s < end;) { 2713 if (*s != '\\') 2714 break; 2715 *p++ = (unsigned char)*s++; 2716 } 2717 if (((s - bs) & 1) == 0 || 2718 s >= end || 2719 (*s != 'u' && *s != 'U')) { 2720 continue; 2721 } 2722 p--; 2723 count = *s=='u' ? 4 : 8; 2724 s++; 2725 2726 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */ 2727 outpos = p-PyUnicode_AS_UNICODE(v); 2728 for (x = 0, i = 0; i < count; ++i, ++s) { 2729 c = (unsigned char)*s; 2730 if (!isxdigit(c)) { 2731 endinpos = s-starts; 2732 if (unicode_decode_call_errorhandler( 2733 errors, &errorHandler, 2734 "rawunicodeescape", "truncated \\uXXXX", 2735 starts, size, &startinpos, &endinpos, &exc, &s, 2736 (PyObject **)&v, &outpos, &p)) 2737 goto onError; 2738 goto nextByte; 2739 } 2740 x = (x<<4) & ~0xF; 2741 if (c >= '0' && c <= '9') 2742 x += c - '0'; 2743 else if (c >= 'a' && c <= 'f') 2744 x += 10 + c - 'a'; 2745 else 2746 x += 10 + c - 'A'; 2747 } 2748#ifndef Py_UNICODE_WIDE 2749 if (x > 0x10000) { 2750 if (unicode_decode_call_errorhandler( 2751 errors, &errorHandler, 2752 "rawunicodeescape", "\\Uxxxxxxxx out of range", 2753 starts, size, &startinpos, &endinpos, &exc, &s, 2754 (PyObject **)&v, &outpos, &p)) 2755 goto onError; 2756 } 2757#endif 2758 *p++ = x; 2759 nextByte: 2760 ; 2761 } 2762 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0) 2763 goto onError; 2764 Py_XDECREF(errorHandler); 2765 
Py_XDECREF(exc); 2766 return (PyObject *)v; 2767 2768 onError: 2769 Py_XDECREF(v); 2770 Py_XDECREF(errorHandler); 2771 Py_XDECREF(exc); 2772 return NULL; 2773} 2774 2775PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s, 2776 Py_ssize_t size) 2777{ 2778 PyObject *repr; 2779 char *p; 2780 char *q; 2781 2782#ifdef Py_UNICODE_WIDE 2783 repr = PyBytes_FromStringAndSize(NULL, 10 * size); 2784#else 2785 repr = PyBytes_FromStringAndSize(NULL, 6 * size); 2786#endif 2787 if (repr == NULL) 2788 return NULL; 2789 if (size == 0) 2790 return repr; 2791 2792 p = q = PyBytes_AS_STRING(repr); 2793 while (size-- > 0) { 2794 Py_UNICODE ch = *s++; 2795#ifdef Py_UNICODE_WIDE 2796 /* Map 32-bit characters to '\Uxxxxxxxx' */ 2797 if (ch >= 0x10000) { 2798 *p++ = '\\'; 2799 *p++ = 'U'; 2800 *p++ = hexdigits[(ch >> 28) & 0xf]; 2801 *p++ = hexdigits[(ch >> 24) & 0xf]; 2802 *p++ = hexdigits[(ch >> 20) & 0xf]; 2803 *p++ = hexdigits[(ch >> 16) & 0xf]; 2804 *p++ = hexdigits[(ch >> 12) & 0xf]; 2805 *p++ = hexdigits[(ch >> 8) & 0xf]; 2806 *p++ = hexdigits[(ch >> 4) & 0xf]; 2807 *p++ = hexdigits[ch & 15]; 2808 } 2809 else 2810#endif 2811 /* Map 16-bit characters to '\uxxxx' */ 2812 if (ch >= 256) { 2813 *p++ = '\\'; 2814 *p++ = 'u'; 2815 *p++ = hexdigits[(ch >> 12) & 0xf]; 2816 *p++ = hexdigits[(ch >> 8) & 0xf]; 2817 *p++ = hexdigits[(ch >> 4) & 0xf]; 2818 *p++ = hexdigits[ch & 15]; 2819 } 2820 /* Copy everything else as-is */ 2821 else 2822 *p++ = (char) ch; 2823 } 2824 *p = '\0'; 2825 if (PyBytes_Resize(repr, p - q)) { 2826 Py_DECREF(repr); 2827 return NULL; 2828 } 2829 return repr; 2830} 2831 2832PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode) 2833{ 2834 PyObject *s, *result; 2835 if (!PyUnicode_Check(unicode)) { 2836 PyErr_BadArgument(); 2837 return NULL; 2838 } 2839 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode), 2840 PyUnicode_GET_SIZE(unicode)); 2841 2842 if (!s) 2843 return NULL; 2844 result = PyString_FromStringAndSize(PyBytes_AS_STRING(s), 
2845 PyBytes_GET_SIZE(s)); 2846 Py_DECREF(s); 2847 return result; 2848} 2849 2850/* --- Unicode Internal Codec ------------------------------------------- */ 2851 2852PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s, 2853 Py_ssize_t size, 2854 const char *errors) 2855{ 2856 const char *starts = s; 2857 Py_ssize_t startinpos; 2858 Py_ssize_t endinpos; 2859 Py_ssize_t outpos; 2860 PyUnicodeObject *v; 2861 Py_UNICODE *p; 2862 const char *end; 2863 const char *reason; 2864 PyObject *errorHandler = NULL; 2865 PyObject *exc = NULL; 2866 2867#ifdef Py_UNICODE_WIDE 2868 Py_UNICODE unimax = PyUnicode_GetMax(); 2869#endif 2870 2871 /* XXX overflow detection missing */ 2872 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE); 2873 if (v == NULL) 2874 goto onError; 2875 if (PyUnicode_GetSize((PyObject *)v) == 0) 2876 return (PyObject *)v; 2877 p = PyUnicode_AS_UNICODE(v); 2878 end = s + size; 2879 2880 while (s < end) { 2881 memcpy(p, s, sizeof(Py_UNICODE)); 2882 /* We have to sanity check the raw data, otherwise doom looms for 2883 some malformed UCS-4 data. 
*/ 2884 if ( 2885 #ifdef Py_UNICODE_WIDE 2886 *p > unimax || *p < 0 || 2887 #endif 2888 end-s < Py_UNICODE_SIZE 2889 ) 2890 { 2891 startinpos = s - starts; 2892 if (end-s < Py_UNICODE_SIZE) { 2893 endinpos = end-starts; 2894 reason = "truncated input"; 2895 } 2896 else { 2897 endinpos = s - starts + Py_UNICODE_SIZE; 2898 reason = "illegal code point (> 0x10FFFF)"; 2899 } 2900 outpos = p - PyUnicode_AS_UNICODE(v); 2901 if (unicode_decode_call_errorhandler( 2902 errors, &errorHandler, 2903 "unicode_internal", reason, 2904 starts, size, &startinpos, &endinpos, &exc, &s, 2905 (PyObject **)&v, &outpos, &p)) { 2906 goto onError; 2907 } 2908 } 2909 else { 2910 p++; 2911 s += Py_UNICODE_SIZE; 2912 } 2913 } 2914 2915 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0) 2916 goto onError; 2917 Py_XDECREF(errorHandler); 2918 Py_XDECREF(exc); 2919 return (PyObject *)v; 2920 2921 onError: 2922 Py_XDECREF(v); 2923 Py_XDECREF(errorHandler); 2924 Py_XDECREF(exc); 2925 return NULL; 2926} 2927 2928/* --- Latin-1 Codec ------------------------------------------------------ */ 2929 2930PyObject *PyUnicode_DecodeLatin1(const char *s, 2931 Py_ssize_t size, 2932 const char *errors) 2933{ 2934 PyUnicodeObject *v; 2935 Py_UNICODE *p; 2936 2937 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. 
*/ 2938 if (size == 1) { 2939 Py_UNICODE r = *(unsigned char*)s; 2940 return PyUnicode_FromUnicode(&r, 1); 2941 } 2942 2943 v = _PyUnicode_New(size); 2944 if (v == NULL) 2945 goto onError; 2946 if (size == 0) 2947 return (PyObject *)v; 2948 p = PyUnicode_AS_UNICODE(v); 2949 while (size-- > 0) 2950 *p++ = (unsigned char)*s++; 2951 return (PyObject *)v; 2952 2953 onError: 2954 Py_XDECREF(v); 2955 return NULL; 2956} 2957 2958/* create or adjust a UnicodeEncodeError */ 2959static void make_encode_exception(PyObject **exceptionObject, 2960 const char *encoding, 2961 const Py_UNICODE *unicode, Py_ssize_t size, 2962 Py_ssize_t startpos, Py_ssize_t endpos, 2963 const char *reason) 2964{ 2965 if (*exceptionObject == NULL) { 2966 *exceptionObject = PyUnicodeEncodeError_Create( 2967 encoding, unicode, size, startpos, endpos, reason); 2968 } 2969 else { 2970 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos)) 2971 goto onError; 2972 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos)) 2973 goto onError; 2974 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason)) 2975 goto onError; 2976 return; 2977 onError: 2978 Py_DECREF(*exceptionObject); 2979 *exceptionObject = NULL; 2980 } 2981} 2982 2983/* raises a UnicodeEncodeError */ 2984static void raise_encode_exception(PyObject **exceptionObject, 2985 const char *encoding, 2986 const Py_UNICODE *unicode, Py_ssize_t size, 2987 Py_ssize_t startpos, Py_ssize_t endpos, 2988 const char *reason) 2989{ 2990 make_encode_exception(exceptionObject, 2991 encoding, unicode, size, startpos, endpos, reason); 2992 if (*exceptionObject != NULL) 2993 PyCodec_StrictErrors(*exceptionObject); 2994} 2995 2996/* error handling callback helper: 2997 build arguments, call the callback and check the arguments, 2998 put the result into newpos and return the replacement string, which 2999 has to be freed by the caller */ 3000static PyObject *unicode_encode_call_errorhandler(const char *errors, 3001 PyObject **errorHandler, 3002 const 
char *encoding, const char *reason, 3003 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject, 3004 Py_ssize_t startpos, Py_ssize_t endpos, 3005 Py_ssize_t *newpos) 3006{ 3007 static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple"; 3008 3009 PyObject *restuple; 3010 PyObject *resunicode; 3011 3012 if (*errorHandler == NULL) { 3013 *errorHandler = PyCodec_LookupError(errors); 3014 if (*errorHandler == NULL) 3015 return NULL; 3016 } 3017 3018 make_encode_exception(exceptionObject, 3019 encoding, unicode, size, startpos, endpos, reason); 3020 if (*exceptionObject == NULL) 3021 return NULL; 3022 3023 restuple = PyObject_CallFunctionObjArgs( 3024 *errorHandler, *exceptionObject, NULL); 3025 if (restuple == NULL) 3026 return NULL; 3027 if (!PyTuple_Check(restuple)) { 3028 PyErr_Format(PyExc_TypeError, &argparse[4]); 3029 Py_DECREF(restuple); 3030 return NULL; 3031 } 3032 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, 3033 &resunicode, newpos)) { 3034 Py_DECREF(restuple); 3035 return NULL; 3036 } 3037 if (*newpos<0) 3038 *newpos = size+*newpos; 3039 if (*newpos<0 || *newpos>size) { 3040 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos); 3041 Py_DECREF(restuple); 3042 return NULL; 3043 } 3044 Py_INCREF(resunicode); 3045 Py_DECREF(restuple); 3046 return resunicode; 3047} 3048 3049static PyObject *unicode_encode_ucs1(const Py_UNICODE *p, 3050 Py_ssize_t size, 3051 const char *errors, 3052 int limit) 3053{ 3054 /* output object */ 3055 PyObject *res; 3056 /* pointers to the beginning and end+1 of input */ 3057 const Py_UNICODE *startp = p; 3058 const Py_UNICODE *endp = p + size; 3059 /* pointer to the beginning of the unencodable characters */ 3060 /* const Py_UNICODE *badp = NULL; */ 3061 /* pointer into the output */ 3062 char *str; 3063 /* current output position */ 3064 Py_ssize_t respos = 0; 3065 Py_ssize_t ressize; 3066 const char *encoding = (limit == 256) ? 
"latin-1" : "ascii"; 3067 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)"; 3068 PyObject *errorHandler = NULL; 3069 PyObject *exc = NULL; 3070 /* the following variable is used for caching string comparisons 3071 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */ 3072 int known_errorHandler = -1; 3073 3074 /* allocate enough for a simple encoding without 3075 replacements, if we need more, we'll resize */ 3076 res = PyBytes_FromStringAndSize(NULL, size); 3077 if (res == NULL) 3078 goto onError; 3079 if (size == 0) 3080 return res; 3081 str = PyBytes_AS_STRING(res); 3082 ressize = size; 3083 3084 while (p<endp) { 3085 Py_UNICODE c = *p; 3086 3087 /* can we encode this? */ 3088 if (c<limit) { 3089 /* no overflow check, because we know that the space is enough */ 3090 *str++ = (char)c; 3091 ++p; 3092 } 3093 else { 3094 Py_ssize_t unicodepos = p-startp; 3095 Py_ssize_t requiredsize; 3096 PyObject *repunicode; 3097 Py_ssize_t repsize; 3098 Py_ssize_t newpos; 3099 Py_ssize_t respos; 3100 Py_UNICODE *uni2; 3101 /* startpos for collecting unencodable chars */ 3102 const Py_UNICODE *collstart = p; 3103 const Py_UNICODE *collend = p; 3104 /* find all unecodable characters */ 3105 while ((collend < endp) && ((*collend)>=limit)) 3106 ++collend; 3107 /* cache callback name lookup (if not done yet, i.e. 
it's the first error) */ 3108 if (known_errorHandler==-1) { 3109 if ((errors==NULL) || (!strcmp(errors, "strict"))) 3110 known_errorHandler = 1; 3111 else if (!strcmp(errors, "replace")) 3112 known_errorHandler = 2; 3113 else if (!strcmp(errors, "ignore")) 3114 known_errorHandler = 3; 3115 else if (!strcmp(errors, "xmlcharrefreplace")) 3116 known_errorHandler = 4; 3117 else 3118 known_errorHandler = 0; 3119 } 3120 switch (known_errorHandler) { 3121 case 1: /* strict */ 3122 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason); 3123 goto onError; 3124 case 2: /* replace */ 3125 while (collstart++<collend) 3126 *str++ = '?'; /* fall through */ 3127 case 3: /* ignore */ 3128 p = collend; 3129 break; 3130 case 4: /* xmlcharrefreplace */ 3131 respos = str - PyBytes_AS_STRING(res); 3132 /* determine replacement size (temporarily (mis)uses p) */ 3133 for (p = collstart, repsize = 0; p < collend; ++p) { 3134 if (*p<10) 3135 repsize += 2+1+1; 3136 else if (*p<100) 3137 repsize += 2+2+1; 3138 else if (*p<1000) 3139 repsize += 2+3+1; 3140 else if (*p<10000) 3141 repsize += 2+4+1; 3142#ifndef Py_UNICODE_WIDE 3143 else 3144 repsize += 2+5+1; 3145#else 3146 else if (*p<100000) 3147 repsize += 2+5+1; 3148 else if (*p<1000000) 3149 repsize += 2+6+1; 3150 else 3151 repsize += 2+7+1; 3152#endif 3153 } 3154 requiredsize = respos+repsize+(endp-collend); 3155 if (requiredsize > ressize) { 3156 if (requiredsize<2*ressize) 3157 requiredsize = 2*ressize; 3158 if (PyBytes_Resize(res, requiredsize)) 3159 goto onError; 3160 str = PyBytes_AS_STRING(res) + respos; 3161 ressize = requiredsize; 3162 } 3163 /* generate replacement (temporarily (mis)uses p) */ 3164 for (p = collstart; p < collend; ++p) { 3165 str += sprintf(str, "&#%d;", (int)*p); 3166 } 3167 p = collend; 3168 break; 3169 default: 3170 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler, 3171 encoding, reason, startp, size, &exc, 3172 collstart-startp, collend-startp, 
&newpos); 3173 if (repunicode == NULL) 3174 goto onError; 3175 /* need more space? (at least enough for what we 3176 have+the replacement+the rest of the string, so 3177 we won't have to check space for encodable characters) */ 3178 respos = str - PyBytes_AS_STRING(res); 3179 repsize = PyUnicode_GET_SIZE(repunicode); 3180 requiredsize = respos+repsize+(endp-collend); 3181 if (requiredsize > ressize) { 3182 if (requiredsize<2*ressize) 3183 requiredsize = 2*ressize; 3184 if (PyBytes_Resize(res, requiredsize)) { 3185 Py_DECREF(repunicode); 3186 goto onError; 3187 } 3188 str = PyBytes_AS_STRING(res) + respos; 3189 ressize = requiredsize; 3190 } 3191 /* check if there is anything unencodable in the replacement 3192 and copy it to the output */ 3193 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) { 3194 c = *uni2; 3195 if (c >= limit) { 3196 raise_encode_exception(&exc, encoding, startp, size, 3197 unicodepos, unicodepos+1, reason); 3198 Py_DECREF(repunicode); 3199 goto onError; 3200 } 3201 *str = (char)c; 3202 } 3203 p = startp + newpos; 3204 Py_DECREF(repunicode); 3205 } 3206 } 3207 } 3208 /* Resize if we allocated to much */ 3209 respos = str - PyBytes_AS_STRING(res); 3210 if (respos<ressize) 3211 /* If this falls res will be NULL */ 3212 PyBytes_Resize(res, respos); 3213 Py_XDECREF(errorHandler); 3214 Py_XDECREF(exc); 3215 return res; 3216 3217 onError: 3218 Py_XDECREF(res); 3219 Py_XDECREF(errorHandler); 3220 Py_XDECREF(exc); 3221 return NULL; 3222} 3223 3224PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p, 3225 Py_ssize_t size, 3226 const char *errors) 3227{ 3228 return unicode_encode_ucs1(p, size, errors, 256); 3229} 3230 3231PyObject *PyUnicode_AsLatin1String(PyObject *unicode) 3232{ 3233 if (!PyUnicode_Check(unicode)) { 3234 PyErr_BadArgument(); 3235 return NULL; 3236 } 3237 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode), 3238 PyUnicode_GET_SIZE(unicode), 3239 NULL); 3240} 3241 3242/* --- 7-bit ASCII Codec 
-------------------------------------------------- */ 3243 3244PyObject *PyUnicode_DecodeASCII(const char *s, 3245 Py_ssize_t size, 3246 const char *errors) 3247{ 3248 const char *starts = s; 3249 PyUnicodeObject *v; 3250 Py_UNICODE *p; 3251 Py_ssize_t startinpos; 3252 Py_ssize_t endinpos; 3253 Py_ssize_t outpos; 3254 const char *e; 3255 PyObject *errorHandler = NULL; 3256 PyObject *exc = NULL; 3257 3258 /* ASCII is equivalent to the first 128 ordinals in Unicode. */ 3259 if (size == 1 && *(unsigned char*)s < 128) { 3260 Py_UNICODE r = *(unsigned char*)s; 3261 return PyUnicode_FromUnicode(&r, 1); 3262 } 3263 3264 v = _PyUnicode_New(size); 3265 if (v == NULL) 3266 goto onError; 3267 if (size == 0) 3268 return (PyObject *)v; 3269 p = PyUnicode_AS_UNICODE(v); 3270 e = s + size; 3271 while (s < e) { 3272 register unsigned char c = (unsigned char)*s; 3273 if (c < 128) { 3274 *p++ = c; 3275 ++s; 3276 } 3277 else { 3278 startinpos = s-starts; 3279 endinpos = startinpos + 1; 3280 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v); 3281 if (unicode_decode_call_errorhandler( 3282 errors, &errorHandler, 3283 "ascii", "ordinal not in range(128)", 3284 starts, size, &startinpos, &endinpos, &exc, &s, 3285 (PyObject **)&v, &outpos, &p)) 3286 goto onError; 3287 } 3288 } 3289 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v)) 3290 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0) 3291 goto onError; 3292 Py_XDECREF(errorHandler); 3293 Py_XDECREF(exc); 3294 return (PyObject *)v; 3295 3296 onError: 3297 Py_XDECREF(v); 3298 Py_XDECREF(errorHandler); 3299 Py_XDECREF(exc); 3300 return NULL; 3301} 3302 3303PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p, 3304 Py_ssize_t size, 3305 const char *errors) 3306{ 3307 return unicode_encode_ucs1(p, size, errors, 128); 3308} 3309 3310PyObject *PyUnicode_AsASCIIString(PyObject *unicode) 3311{ 3312 if (!PyUnicode_Check(unicode)) { 3313 PyErr_BadArgument(); 3314 return NULL; 3315 } 3316 return 
#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)

/* --- MBCS codecs for Windows -------------------------------------------- */

/* The conversion helpers below take `int` sizes.  When Py_ssize_t is
   wider than int, the public entry points have to feed them the data in
   chunks of at most INT_MAX elements. */
#if SIZEOF_INT < SIZEOF_SSIZE_T
#define NEED_RETRY
#endif

/* XXX This code is limited to "true" double-byte encodings, as
   a) it assumes an incomplete character consists of a single byte, and
   b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
   encodings, see IsDBCSLeadByteEx documentation. */

/* Return non-zero if the byte at s[offset] is a DBCS lead byte according
   to IsDBCSLeadByte() and the position of the preceding character (as
   reported by CharPrev()) does not mark it as the second byte of a
   two-byte character; return 0 otherwise. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;

    if (IsDBCSLeadByte(*curr)) {
        const char *prev = CharPrev(s, curr);
        /* True when curr is at the start of the string, when the
           previous character does not begin with a lead byte, or when
           the previous character is a complete two-byte character. */
        return (prev == curr) || !IsDBCSLeadByte(*prev) || (curr - prev == 2);
    }
    return 0;
}

/*
 * Decode MBCS string into unicode object. If 'final' is set, converts
 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
 *
 * If *v is NULL a new unicode object is created; otherwise the decoded
 * text is appended to the existing object, which is resized.
 */
static int decode_mbcs(PyUnicodeObject **v,
                       const char *s, /* MBCS string */
                       int size, /* sizeof MBCS string */
                       int final)
{
    Py_UNICODE *p;
    Py_ssize_t n = 0;   /* length of *v before appending */
    int usize = 0;      /* number of wide characters to be produced */

    assert(size >= 0);

    /* Skip trailing lead-byte unless 'final' is set */
    if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
        --size;

    /* First get the size of the result */
    if (size > 0) {
        usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
        if (usize == 0) {
            PyErr_SetFromWindowsErrWithFilename(0, NULL);
            return -1;
        }
    }

    if (*v == NULL) {
        /* Create unicode object */
        *v = _PyUnicode_New(usize);
        if (*v == NULL)
            return -1;
    }
    else {
        /* Extend unicode object */
        n = PyUnicode_GET_SIZE(*v);
        if (_PyUnicode_Resize(v, n + usize) < 0)
            return -1;
    }

    /* Do the conversion */
    if (size > 0) {
        /* write after the n code units that were already present */
        p = PyUnicode_AS_UNICODE(*v) + n;
        if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
            PyErr_SetFromWindowsErrWithFilename(0, NULL);
            return -1;
        }
    }

    /* number of input bytes actually consumed */
    return size;
}

/* Decode `size` MBCS bytes at `s` into a new unicode object.

   If `consumed` is not NULL, a trailing lead byte is left undecoded and
   *consumed receives the number of bytes that were decoded; if it is
   NULL the whole input is converted.  `errors` is not used by this
   function.  Returns NULL with an exception set on failure. */
PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
                                       Py_ssize_t size,
                                       const char *errors,
                                       Py_ssize_t *consumed)
{
    PyUnicodeObject *v = NULL;
    int done;

    if (consumed)
        *consumed = 0;

#ifdef NEED_RETRY
  retry:
    /* Inputs longer than INT_MAX are decoded chunk by chunk; all chunks
       except the last are decoded as non-final. */
    if (size > INT_MAX)
        done = decode_mbcs(&v, s, INT_MAX, 0);
    else
#endif
        done = decode_mbcs(&v, s, (int)size, !consumed);

    if (done < 0) {
        Py_XDECREF(v);
        return NULL;
    }

    if (consumed)
        *consumed += done;

#ifdef NEED_RETRY
    if (size > INT_MAX) {
        /* advance by what the last chunk really consumed */
        s += done;
        size -= done;
        goto retry;
    }
#endif

    return (PyObject *)v;
}

/* Decode `size` MBCS bytes at `s`, converting the complete input
   (equivalent to PyUnicode_DecodeMBCSStateful() with consumed == NULL). */
PyObject *PyUnicode_DecodeMBCS(const char *s,
                               Py_ssize_t size,
                               const char *errors)
{
    return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
}

/*
 * Convert unicode into string object (MBCS).
 * Returns 0 if succeed, -1 otherwise.
 *
 * If *repr is NULL a new object is created; otherwise the encoded bytes
 * are appended to the existing object, which is resized.
 */
static int encode_mbcs(PyObject **repr,
                       const Py_UNICODE *p, /* unicode */
                       int size) /* size of unicode */
{
    int mbcssize = 0;   /* number of bytes to be produced */
    Py_ssize_t n = 0;   /* length of *repr before appending */

    assert(size >= 0);

    /* First get the size of the result */
    if (size > 0) {
        mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
        if (mbcssize == 0) {
            PyErr_SetFromWindowsErrWithFilename(0, NULL);
            return -1;
        }
    }

    if (*repr == NULL) {
        /* Create string object */
        *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
        if (*repr == NULL)
            return -1;
    }
    else {
        /* Extend string object */
        n = PyBytes_Size(*repr);
        if (PyBytes_Resize(*repr, n + mbcssize) < 0)
            return -1;
    }

    /* Do the conversion */
    if (size > 0) {
        /* write after the n bytes that were already present */
        char *s = PyBytes_AS_STRING(*repr) + n;
        if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
            PyErr_SetFromWindowsErrWithFilename(0, NULL);
            return -1;
        }
    }

    return 0;
}

/* Encode `size` code units at `p` as MBCS.  `errors` is not used by this
   function.  Returns a new reference to the result object, or NULL with
   an exception set. */
PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
                               Py_ssize_t size,
                               const char *errors)
{
    PyObject *repr = NULL;
    int ret;

#ifdef NEED_RETRY
  retry:
    /* Inputs longer than INT_MAX are encoded in INT_MAX-sized chunks. */
    if (size > INT_MAX)
        ret = encode_mbcs(&repr, p, INT_MAX);
    else
#endif
        ret = encode_mbcs(&repr, p, (int)size);

    if (ret < 0) {
        Py_XDECREF(repr);
        return NULL;
    }

#ifdef NEED_RETRY
    if (size > INT_MAX) {
        p += INT_MAX;
        size -= INT_MAX;
        goto retry;
    }
#endif

    return repr;
}

/* Encode a unicode object as MBCS.  Sets an error via
   PyErr_BadArgument() and returns NULL when `unicode` is not a unicode
   object. */
PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
{
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }
    return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
                                PyUnicode_GET_SIZE(unicode),
                                NULL);
}

#undef NEED_RETRY

#endif /* MS_WINDOWS */
/* --- Character Mapping Codec -------------------------------------------- */

/* Decode `size` bytes at `s` through a character mapping.

   mapping == NULL        : decode as Latin-1.
   exact unicode object   : byte b maps to mapping[b]; a byte outside the
                            string or mapped to 0xFFFE is undefined.
   any other object       : mapping[b] is looked up with
                            PyObject_GetItem() and must be an integer in
                            range(65536), a unicode string of any length,
                            or None; a LookupError counts as None
                            (undefined).

   Undefined mappings are reported to the error handler selected by
   `errors`.  Returns a new unicode object, or NULL with an exception
   set. */
PyObject *PyUnicode_DecodeCharmap(const char *s,
                                  Py_ssize_t size,
                                  PyObject *mapping,
                                  const char *errors)
{
    const char *starts = s;
    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    Py_ssize_t outpos;
    const char *e;
    PyUnicodeObject *v;
    Py_UNICODE *p;
    /* spare output slots reserved for 1-n mappings */
    Py_ssize_t extrachars = 0;
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    Py_UNICODE *mapstring = NULL;
    Py_ssize_t maplen = 0;

    /* Default to Latin-1 */
    if (mapping == NULL)
        return PyUnicode_DecodeLatin1(s, size, errors);

    /* Start with one output slot per input byte. */
    v = _PyUnicode_New(size);
    if (v == NULL)
        goto onError;
    if (size == 0)
        return (PyObject *)v;
    p = PyUnicode_AS_UNICODE(v);
    e = s + size;
    if (PyUnicode_CheckExact(mapping)) {
        /* Fast path: index directly into the mapping string. */
        mapstring = PyUnicode_AS_UNICODE(mapping);
        maplen = PyUnicode_GET_SIZE(mapping);
        while (s < e) {
            unsigned char ch = *s;
            Py_UNICODE x = 0xfffe; /* illegal value */

            if (ch < maplen)
                x = mapstring[ch];

            if (x == 0xfffe) {
                /* undefined mapping */
                outpos = p-PyUnicode_AS_UNICODE(v);
                startinpos = s-starts;
                endinpos = startinpos+1;
                if (unicode_decode_call_errorhandler(
                        errors, &errorHandler,
                        "charmap", "character maps to <undefined>",
                        starts, size, &startinpos, &endinpos, &exc, &s,
                        (PyObject **)&v, &outpos, &p)) {
                    goto onError;
                }
                /* s and p were updated by the error handler */
                continue;
            }
            *p++ = x;
            ++s;
        }
    }
    else {
        while (s < e) {
            unsigned char ch = *s;
            PyObject *w, *x;

            /* Get mapping (char ordinal -> integer, Unicode char or None) */
            w = PyInt_FromLong((long)ch);
            if (w == NULL)
                goto onError;
            x = PyObject_GetItem(mapping, w);
            Py_DECREF(w);
            if (x == NULL) {
                if (PyErr_ExceptionMatches(PyExc_LookupError)) {
                    /* No mapping found means: mapping is undefined. */
                    PyErr_Clear();
                    x = Py_None;
                    Py_INCREF(x);
                } else
                    goto onError;
            }

            /* Apply mapping */
            if (PyInt_Check(x)) {
                long value = PyInt_AS_LONG(x);
                if (value < 0 || value > 65535) {
                    PyErr_SetString(PyExc_TypeError,
                                    "character mapping must be in range(65536)");
                    Py_DECREF(x);
                    goto onError;
                }
                *p++ = (Py_UNICODE)value;
            }
            else if (x == Py_None) {
                /* undefined mapping */
                outpos = p-PyUnicode_AS_UNICODE(v);
                startinpos = s-starts;
                endinpos = startinpos+1;
                /* NOTE(review): extrachars is not adjusted after the
                   error handler runs; if the handler's replacement is
                   longer than the input it covers, the reserve tracked
                   here may be overstated -- confirm against
                   unicode_decode_call_errorhandler. */
                if (unicode_decode_call_errorhandler(
                        errors, &errorHandler,
                        "charmap", "character maps to <undefined>",
                        starts, size, &startinpos, &endinpos, &exc, &s,
                        (PyObject **)&v, &outpos, &p)) {
                    Py_DECREF(x);
                    goto onError;
                }
                Py_DECREF(x);
                /* s and p were updated by the error handler */
                continue;
            }
            else if (PyUnicode_Check(x)) {
                Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);

                if (targetsize == 1)
                    /* 1-1 mapping */
                    *p++ = *PyUnicode_AS_UNICODE(x);

                else if (targetsize > 1) {
                    /* 1-n mapping */
                    if (targetsize > extrachars) {
                        /* resize first */
                        Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
                        /* grow by the shortfall plus 4*targetsize of
                           headroom for later 1-n mappings */
                        Py_ssize_t needed = (targetsize - extrachars) + \
                            (targetsize << 2);
                        extrachars += needed;
                        /* XXX overflow detection missing */
                        if (_PyUnicode_Resize(&v,
                                              PyUnicode_GET_SIZE(v) + needed) < 0) {
                            Py_DECREF(x);
                            goto onError;
                        }
                        /* the buffer may have moved: recompute p */
                        p = PyUnicode_AS_UNICODE(v) + oldpos;
                    }
                    Py_UNICODE_COPY(p,
                                    PyUnicode_AS_UNICODE(x),
                                    targetsize);
                    p += targetsize;
                    extrachars -= targetsize;
                }
                /* 1-0 mapping: skip the character */
            }
            else {
                /* wrong return value */
                PyErr_SetString(PyExc_TypeError,
                                "character mapping must return integer, None or unicode");
                Py_DECREF(x);
                goto onError;
            }
            Py_DECREF(x);
            ++s;
        }
    }
    /* Trim the result to the number of code units actually written. */
    if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
        if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
            goto onError;
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return (PyObject *)v;

  onError:
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    Py_XDECREF(v);
    return NULL;
}
return integer, None or unicode"); 3678 Py_DECREF(x); 3679 goto onError; 3680 } 3681 Py_DECREF(x); 3682 ++s; 3683 } 3684 } 3685 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v)) 3686 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0) 3687 goto onError; 3688 Py_XDECREF(errorHandler); 3689 Py_XDECREF(exc); 3690 return (PyObject *)v; 3691 3692 onError: 3693 Py_XDECREF(errorHandler); 3694 Py_XDECREF(exc); 3695 Py_XDECREF(v); 3696 return NULL; 3697} 3698 3699/* Charmap encoding: the lookup table */ 3700 3701struct encoding_map{ 3702 PyObject_HEAD 3703 unsigned char level1[32]; 3704 int count2, count3; 3705 unsigned char level23[1]; 3706}; 3707 3708static PyObject* 3709encoding_map_size(PyObject *obj, PyObject* args) 3710{ 3711 struct encoding_map *map = (struct encoding_map*)obj; 3712 return PyInt_FromLong(sizeof(*map) - 1 + 16*map->count2 + 3713 128*map->count3); 3714} 3715 3716static PyMethodDef encoding_map_methods[] = { 3717 {"size", encoding_map_size, METH_NOARGS, 3718 PyDoc_STR("Return the size (in bytes) of this object") }, 3719 { 0 } 3720}; 3721 3722static void 3723encoding_map_dealloc(PyObject* o) 3724{ 3725 PyObject_FREE(o); 3726} 3727 3728static PyTypeObject EncodingMapType = { 3729 PyObject_HEAD_INIT(NULL) 3730 0, /*ob_size*/ 3731 "EncodingMap", /*tp_name*/ 3732 sizeof(struct encoding_map), /*tp_basicsize*/ 3733 0, /*tp_itemsize*/ 3734 /* methods */ 3735 encoding_map_dealloc, /*tp_dealloc*/ 3736 0, /*tp_print*/ 3737 0, /*tp_getattr*/ 3738 0, /*tp_setattr*/ 3739 0, /*tp_compare*/ 3740 0, /*tp_repr*/ 3741 0, /*tp_as_number*/ 3742 0, /*tp_as_sequence*/ 3743 0, /*tp_as_mapping*/ 3744 0, /*tp_hash*/ 3745 0, /*tp_call*/ 3746 0, /*tp_str*/ 3747 0, /*tp_getattro*/ 3748 0, /*tp_setattro*/ 3749 0, /*tp_as_buffer*/ 3750 Py_TPFLAGS_DEFAULT, /*tp_flags*/ 3751 0, /*tp_doc*/ 3752 0, /*tp_traverse*/ 3753 0, /*tp_clear*/ 3754 0, /*tp_richcompare*/ 3755 0, /*tp_weaklistoffset*/ 3756 0, /*tp_iter*/ 3757 0, /*tp_iternext*/ 3758 encoding_map_methods, 
/*tp_methods*/ 3759 0, /*tp_members*/ 3760 0, /*tp_getset*/ 3761 0, /*tp_base*/ 3762 0, /*tp_dict*/ 3763 0, /*tp_descr_get*/ 3764 0, /*tp_descr_set*/ 3765 0, /*tp_dictoffset*/ 3766 0, /*tp_init*/ 3767 0, /*tp_alloc*/ 3768 0, /*tp_new*/ 3769 0, /*tp_free*/ 3770 0, /*tp_is_gc*/ 3771}; 3772 3773PyObject* 3774PyUnicode_BuildEncodingMap(PyObject* string) 3775{ 3776 Py_UNICODE *decode; 3777 PyObject *result; 3778 struct encoding_map *mresult; 3779 int i; 3780 int need_dict = 0; 3781 unsigned char level1[32]; 3782 unsigned char level2[512]; 3783 unsigned char *mlevel1, *mlevel2, *mlevel3; 3784 int count2 = 0, count3 = 0; 3785 3786 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) { 3787 PyErr_BadArgument(); 3788 return NULL; 3789 } 3790 decode = PyUnicode_AS_UNICODE(string); 3791 memset(level1, 0xFF, sizeof level1); 3792 memset(level2, 0xFF, sizeof level2); 3793 3794 /* If there isn't a one-to-one mapping of NULL to \0, 3795 or if there are non-BMP characters, we need to use 3796 a mapping dictionary. 
*/ 3797 if (decode[0] != 0) 3798 need_dict = 1; 3799 for (i = 1; i < 256; i++) { 3800 int l1, l2; 3801 if (decode[i] == 0 3802 #ifdef Py_UNICODE_WIDE 3803 || decode[i] > 0xFFFF 3804 #endif 3805 ) { 3806 need_dict = 1; 3807 break; 3808 } 3809 if (decode[i] == 0xFFFE) 3810 /* unmapped character */ 3811 continue; 3812 l1 = decode[i] >> 11; 3813 l2 = decode[i] >> 7; 3814 if (level1[l1] == 0xFF) 3815 level1[l1] = count2++; 3816 if (level2[l2] == 0xFF) 3817 level2[l2] = count3++; 3818 } 3819 3820 if (count2 >= 0xFF || count3 >= 0xFF) 3821 need_dict = 1; 3822 3823 if (need_dict) { 3824 PyObject *result = PyDict_New(); 3825 PyObject *key, *value; 3826 if (!result) 3827 return NULL; 3828 for (i = 0; i < 256; i++) { 3829 key = value = NULL; 3830 key = PyInt_FromLong(decode[i]); 3831 value = PyInt_FromLong(i); 3832 if (!key || !value) 3833 goto failed1; 3834 if (PyDict_SetItem(result, key, value) == -1) 3835 goto failed1; 3836 Py_DECREF(key); 3837 Py_DECREF(value); 3838 } 3839 return result; 3840 failed1: 3841 Py_XDECREF(key); 3842 Py_XDECREF(value); 3843 Py_DECREF(result); 3844 return NULL; 3845 } 3846 3847 /* Create a three-level trie */ 3848 result = PyObject_MALLOC(sizeof(struct encoding_map) + 3849 16*count2 + 128*count3 - 1); 3850 if (!result) 3851 return PyErr_NoMemory(); 3852 PyObject_Init(result, &EncodingMapType); 3853 mresult = (struct encoding_map*)result; 3854 mresult->count2 = count2; 3855 mresult->count3 = count3; 3856 mlevel1 = mresult->level1; 3857 mlevel2 = mresult->level23; 3858 mlevel3 = mresult->level23 + 16*count2; 3859 memcpy(mlevel1, level1, 32); 3860 memset(mlevel2, 0xFF, 16*count2); 3861 memset(mlevel3, 0, 128*count3); 3862 count3 = 0; 3863 for (i = 1; i < 256; i++) { 3864 int o1, o2, o3, i2, i3; 3865 if (decode[i] == 0xFFFE) 3866 /* unmapped character */ 3867 continue; 3868 o1 = decode[i]>>11; 3869 o2 = (decode[i]>>7) & 0xF; 3870 i2 = 16*mlevel1[o1] + o2; 3871 if (mlevel2[i2] == 0xFF) 3872 mlevel2[i2] = count3++; 3873 o3 = decode[i] & 0x7F; 3874 i3 
= 128*mlevel2[i2] + o3; 3875 mlevel3[i3] = i; 3876 } 3877 return result; 3878} 3879 3880static int 3881encoding_map_lookup(Py_UNICODE c, PyObject *mapping) 3882{ 3883 struct encoding_map *map = (struct encoding_map*)mapping; 3884 int l1 = c>>11; 3885 int l2 = (c>>7) & 0xF; 3886 int l3 = c & 0x7F; 3887 int i; 3888 3889#ifdef Py_UNICODE_WIDE 3890 if (c > 0xFFFF) { 3891 return -1; 3892 } 3893#endif 3894 if (c == 0) 3895 return 0; 3896 /* level 1*/ 3897 i = map->level1[l1]; 3898 if (i == 0xFF) { 3899 return -1; 3900 } 3901 /* level 2*/ 3902 i = map->level23[16*i+l2]; 3903 if (i == 0xFF) { 3904 return -1; 3905 } 3906 /* level 3 */ 3907 i = map->level23[16*map->count2 + 128*i + l3]; 3908 if (i == 0) { 3909 return -1; 3910 } 3911 return i; 3912} 3913 3914/* Lookup the character ch in the mapping. If the character 3915 can't be found, Py_None is returned (or NULL, if another 3916 error occurred). */ 3917static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping) 3918{ 3919 PyObject *w = PyInt_FromLong((long)c); 3920 PyObject *x; 3921 3922 if (w == NULL) 3923 return NULL; 3924 x = PyObject_GetItem(mapping, w); 3925 Py_DECREF(w); 3926 if (x == NULL) { 3927 if (PyErr_ExceptionMatches(PyExc_LookupError)) { 3928 /* No mapping found means: mapping is undefined. 
*/ 3929 PyErr_Clear(); 3930 x = Py_None; 3931 Py_INCREF(x); 3932 return x; 3933 } else 3934 return NULL; 3935 } 3936 else if (x == Py_None) 3937 return x; 3938 else if (PyInt_Check(x)) { 3939 long value = PyInt_AS_LONG(x); 3940 if (value < 0 || value > 255) { 3941 PyErr_SetString(PyExc_TypeError, 3942 "character mapping must be in range(256)"); 3943 Py_DECREF(x); 3944 return NULL; 3945 } 3946 return x; 3947 } 3948 else if (PyString_Check(x)) 3949 return x; 3950 else { 3951 /* wrong return value */ 3952 PyErr_Format(PyExc_TypeError, 3953 "character mapping must return integer, None or str8, not %.400s", 3954 x->ob_type->tp_name); 3955 Py_DECREF(x); 3956 return NULL; 3957 } 3958} 3959 3960static int 3961charmapencode_resize(PyObject *outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize) 3962{ 3963 Py_ssize_t outsize = PyBytes_GET_SIZE( outobj); 3964 /* exponentially overallocate to minimize reallocations */ 3965 if (requiredsize < 2*outsize) 3966 requiredsize = 2*outsize; 3967 if (PyBytes_Resize(outobj, requiredsize)) { 3968 Py_DECREF(outobj); 3969 return -1; 3970 } 3971 return 0; 3972} 3973 3974typedef enum charmapencode_result { 3975 enc_SUCCESS, enc_FAILED, enc_EXCEPTION 3976}charmapencode_result; 3977/* lookup the character, put the result in the output string and adjust 3978 various state variables. Resize the output bytes object if not enough 3979 space is available. Return a new reference to the object that 3980 was put in the output buffer, or Py_None, if the mapping was undefined 3981 (in which case no character was written) or NULL, if a 3982 reallocation error occurred. 
The caller must decref the result */ 3983static 3984charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping, 3985 PyObject *outobj, Py_ssize_t *outpos) 3986{ 3987 PyObject *rep; 3988 char *outstart; 3989 Py_ssize_t outsize = PyBytes_GET_SIZE(outobj); 3990 3991 if (mapping->ob_type == &EncodingMapType) { 3992 int res = encoding_map_lookup(c, mapping); 3993 Py_ssize_t requiredsize = *outpos+1; 3994 if (res == -1) 3995 return enc_FAILED; 3996 if (outsize<requiredsize) 3997 if (charmapencode_resize(outobj, outpos, requiredsize)) 3998 return enc_EXCEPTION; 3999 outstart = PyBytes_AS_STRING(outobj); 4000 outstart[(*outpos)++] = (char)res; 4001 return enc_SUCCESS; 4002 } 4003 4004 rep = charmapencode_lookup(c, mapping); 4005 if (rep==NULL) 4006 return enc_EXCEPTION; 4007 else if (rep==Py_None) { 4008 Py_DECREF(rep); 4009 return enc_FAILED; 4010 } else { 4011 if (PyInt_Check(rep)) { 4012 Py_ssize_t requiredsize = *outpos+1; 4013 if (outsize<requiredsize) 4014 if (charmapencode_resize(outobj, outpos, requiredsize)) { 4015 Py_DECREF(rep); 4016 return enc_EXCEPTION; 4017 } 4018 outstart = PyBytes_AS_STRING(outobj); 4019 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep); 4020 } 4021 else { 4022 const char *repchars = PyString_AS_STRING(rep); 4023 Py_ssize_t repsize = PyString_GET_SIZE(rep); 4024 Py_ssize_t requiredsize = *outpos+repsize; 4025 if (outsize<requiredsize) 4026 if (charmapencode_resize(outobj, outpos, requiredsize)) { 4027 Py_DECREF(rep); 4028 return enc_EXCEPTION; 4029 } 4030 outstart = PyBytes_AS_STRING(outobj); 4031 memcpy(outstart + *outpos, repchars, repsize); 4032 *outpos += repsize; 4033 } 4034 } 4035 Py_DECREF(rep); 4036 return enc_SUCCESS; 4037} 4038 4039/* handle an error in PyUnicode_EncodeCharmap 4040 Return 0 on success, -1 on error */ 4041static 4042int charmap_encoding_error( 4043 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping, 4044 PyObject **exceptionObject, 4045 int *known_errorHandler, PyObject 
**errorHandler, const char *errors, 4046 PyObject *res, Py_ssize_t *respos) 4047{ 4048 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */ 4049 Py_ssize_t repsize; 4050 Py_ssize_t newpos; 4051 Py_UNICODE *uni2; 4052 /* startpos for collecting unencodable chars */ 4053 Py_ssize_t collstartpos = *inpos; 4054 Py_ssize_t collendpos = *inpos+1; 4055 Py_ssize_t collpos; 4056 char *encoding = "charmap"; 4057 char *reason = "character maps to <undefined>"; 4058 charmapencode_result x; 4059 4060 /* find all unencodable characters */ 4061 while (collendpos < size) { 4062 PyObject *rep; 4063 if (mapping->ob_type == &EncodingMapType) { 4064 int res = encoding_map_lookup(p[collendpos], mapping); 4065 if (res != -1) 4066 break; 4067 ++collendpos; 4068 continue; 4069 } 4070 4071 rep = charmapencode_lookup(p[collendpos], mapping); 4072 if (rep==NULL) 4073 return -1; 4074 else if (rep!=Py_None) { 4075 Py_DECREF(rep); 4076 break; 4077 } 4078 Py_DECREF(rep); 4079 ++collendpos; 4080 } 4081 /* cache callback name lookup 4082 * (if not done yet, i.e. 
it's the first error) */ 4083 if (*known_errorHandler==-1) { 4084 if ((errors==NULL) || (!strcmp(errors, "strict"))) 4085 *known_errorHandler = 1; 4086 else if (!strcmp(errors, "replace")) 4087 *known_errorHandler = 2; 4088 else if (!strcmp(errors, "ignore")) 4089 *known_errorHandler = 3; 4090 else if (!strcmp(errors, "xmlcharrefreplace")) 4091 *known_errorHandler = 4; 4092 else 4093 *known_errorHandler = 0; 4094 } 4095 switch (*known_errorHandler) { 4096 case 1: /* strict */ 4097 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason); 4098 return -1; 4099 case 2: /* replace */ 4100 for (collpos = collstartpos; collpos<collendpos; ++collpos) { 4101 x = charmapencode_output('?', mapping, res, respos); 4102 if (x==enc_EXCEPTION) { 4103 return -1; 4104 } 4105 else if (x==enc_FAILED) { 4106 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason); 4107 return -1; 4108 } 4109 } 4110 /* fall through */ 4111 case 3: /* ignore */ 4112 *inpos = collendpos; 4113 break; 4114 case 4: /* xmlcharrefreplace */ 4115 /* generate replacement (temporarily (mis)uses p) */ 4116 for (collpos = collstartpos; collpos < collendpos; ++collpos) { 4117 char buffer[2+29+1+1]; 4118 char *cp; 4119 sprintf(buffer, "&#%d;", (int)p[collpos]); 4120 for (cp = buffer; *cp; ++cp) { 4121 x = charmapencode_output(*cp, mapping, res, respos); 4122 if (x==enc_EXCEPTION) 4123 return -1; 4124 else if (x==enc_FAILED) { 4125 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason); 4126 return -1; 4127 } 4128 } 4129 } 4130 *inpos = collendpos; 4131 break; 4132 default: 4133 repunicode = unicode_encode_call_errorhandler(errors, errorHandler, 4134 encoding, reason, p, size, exceptionObject, 4135 collstartpos, collendpos, &newpos); 4136 if (repunicode == NULL) 4137 return -1; 4138 /* generate replacement */ 4139 repsize = PyUnicode_GET_SIZE(repunicode); 4140 for (uni2 = PyUnicode_AS_UNICODE(repunicode); 
repsize-->0; ++uni2) { 4141 x = charmapencode_output(*uni2, mapping, res, respos); 4142 if (x==enc_EXCEPTION) { 4143 return -1; 4144 } 4145 else if (x==enc_FAILED) { 4146 Py_DECREF(repunicode); 4147 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason); 4148 return -1; 4149 } 4150 } 4151 *inpos = newpos; 4152 Py_DECREF(repunicode); 4153 } 4154 return 0; 4155} 4156 4157PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p, 4158 Py_ssize_t size, 4159 PyObject *mapping, 4160 const char *errors) 4161{ 4162 /* output object */ 4163 PyObject *res = NULL; 4164 /* current input position */ 4165 Py_ssize_t inpos = 0; 4166 /* current output position */ 4167 Py_ssize_t respos = 0; 4168 PyObject *errorHandler = NULL; 4169 PyObject *exc = NULL; 4170 /* the following variable is used for caching string comparisons 4171 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 4172 * 3=ignore, 4=xmlcharrefreplace */ 4173 int known_errorHandler = -1; 4174 4175 /* Default to Latin-1 */ 4176 if (mapping == NULL) 4177 return PyUnicode_EncodeLatin1(p, size, errors); 4178 4179 /* allocate enough for a simple encoding without 4180 replacements, if we need more, we'll resize */ 4181 res = PyBytes_FromStringAndSize(NULL, size); 4182 if (res == NULL) 4183 goto onError; 4184 if (size == 0) 4185 return res; 4186 4187 while (inpos<size) { 4188 /* try to encode it */ 4189 charmapencode_result x = charmapencode_output(p[inpos], mapping, res, &respos); 4190 if (x==enc_EXCEPTION) /* error */ 4191 goto onError; 4192 if (x==enc_FAILED) { /* unencodable character */ 4193 if (charmap_encoding_error(p, size, &inpos, mapping, 4194 &exc, 4195 &known_errorHandler, &errorHandler, errors, 4196 res, &respos)) { 4197 goto onError; 4198 } 4199 } 4200 else 4201 /* done with this character => adjust input position */ 4202 ++inpos; 4203 } 4204 4205 /* Resize if we allocated to much */ 4206 if (respos<PyBytes_GET_SIZE(res)) { 4207 if (PyBytes_Resize(res, respos)) 4208 goto 
onError; 4209 } 4210 Py_XDECREF(exc); 4211 Py_XDECREF(errorHandler); 4212 return res; 4213 4214 onError: 4215 Py_XDECREF(res); 4216 Py_XDECREF(exc); 4217 Py_XDECREF(errorHandler); 4218 return NULL; 4219} 4220 4221PyObject *PyUnicode_AsCharmapString(PyObject *unicode, 4222 PyObject *mapping) 4223{ 4224 if (!PyUnicode_Check(unicode) || mapping == NULL) { 4225 PyErr_BadArgument(); 4226 return NULL; 4227 } 4228 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode), 4229 PyUnicode_GET_SIZE(unicode), 4230 mapping, 4231 NULL); 4232} 4233 4234/* create or adjust a UnicodeTranslateError */ 4235static void make_translate_exception(PyObject **exceptionObject, 4236 const Py_UNICODE *unicode, Py_ssize_t size, 4237 Py_ssize_t startpos, Py_ssize_t endpos, 4238 const char *reason) 4239{ 4240 if (*exceptionObject == NULL) { 4241 *exceptionObject = PyUnicodeTranslateError_Create( 4242 unicode, size, startpos, endpos, reason); 4243 } 4244 else { 4245 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos)) 4246 goto onError; 4247 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos)) 4248 goto onError; 4249 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason)) 4250 goto onError; 4251 return; 4252 onError: 4253 Py_DECREF(*exceptionObject); 4254 *exceptionObject = NULL; 4255 } 4256} 4257 4258/* raises a UnicodeTranslateError */ 4259static void raise_translate_exception(PyObject **exceptionObject, 4260 const Py_UNICODE *unicode, Py_ssize_t size, 4261 Py_ssize_t startpos, Py_ssize_t endpos, 4262 const char *reason) 4263{ 4264 make_translate_exception(exceptionObject, 4265 unicode, size, startpos, endpos, reason); 4266 if (*exceptionObject != NULL) 4267 PyCodec_StrictErrors(*exceptionObject); 4268} 4269 4270/* error handling callback helper: 4271 build arguments, call the callback and check the arguments, 4272 put the result into newpos and return the replacement string, which 4273 has to be freed by the caller */ 4274static PyObject 
*unicode_translate_call_errorhandler(const char *errors, 4275 PyObject **errorHandler, 4276 const char *reason, 4277 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject, 4278 Py_ssize_t startpos, Py_ssize_t endpos, 4279 Py_ssize_t *newpos) 4280{ 4281 static char *argparse = "O!n;translating error handler must return (unicode, int) tuple"; 4282 4283 Py_ssize_t i_newpos; 4284 PyObject *restuple; 4285 PyObject *resunicode; 4286 4287 if (*errorHandler == NULL) { 4288 *errorHandler = PyCodec_LookupError(errors); 4289 if (*errorHandler == NULL) 4290 return NULL; 4291 } 4292 4293 make_translate_exception(exceptionObject, 4294 unicode, size, startpos, endpos, reason); 4295 if (*exceptionObject == NULL) 4296 return NULL; 4297 4298 restuple = PyObject_CallFunctionObjArgs( 4299 *errorHandler, *exceptionObject, NULL); 4300 if (restuple == NULL) 4301 return NULL; 4302 if (!PyTuple_Check(restuple)) { 4303 PyErr_Format(PyExc_TypeError, &argparse[4]); 4304 Py_DECREF(restuple); 4305 return NULL; 4306 } 4307 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, 4308 &resunicode, &i_newpos)) { 4309 Py_DECREF(restuple); 4310 return NULL; 4311 } 4312 if (i_newpos<0) 4313 *newpos = size+i_newpos; 4314 else 4315 *newpos = i_newpos; 4316 if (*newpos<0 || *newpos>size) { 4317 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos); 4318 Py_DECREF(restuple); 4319 return NULL; 4320 } 4321 Py_INCREF(resunicode); 4322 Py_DECREF(restuple); 4323 return resunicode; 4324} 4325 4326/* Lookup the character ch in the mapping and put the result in result, 4327 which must be decrefed by the caller. 
4328 Return 0 on success, -1 on error */ 4329static 4330int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result) 4331{ 4332 PyObject *w = PyInt_FromLong((long)c); 4333 PyObject *x; 4334 4335 if (w == NULL) 4336 return -1; 4337 x = PyObject_GetItem(mapping, w); 4338 Py_DECREF(w); 4339 if (x == NULL) { 4340 if (PyErr_ExceptionMatches(PyExc_LookupError)) { 4341 /* No mapping found means: use 1:1 mapping. */ 4342 PyErr_Clear(); 4343 *result = NULL; 4344 return 0; 4345 } else 4346 return -1; 4347 } 4348 else if (x == Py_None) { 4349 *result = x; 4350 return 0; 4351 } 4352 else if (PyInt_Check(x)) { 4353 long value = PyInt_AS_LONG(x); 4354 long max = PyUnicode_GetMax(); 4355 if (value < 0 || value > max) { 4356 PyErr_Format(PyExc_TypeError, 4357 "character mapping must be in range(0x%lx)", max+1); 4358 Py_DECREF(x); 4359 return -1; 4360 } 4361 *result = x; 4362 return 0; 4363 } 4364 else if (PyUnicode_Check(x)) { 4365 *result = x; 4366 return 0; 4367 } 4368 else { 4369 /* wrong return value */ 4370 PyErr_SetString(PyExc_TypeError, 4371 "character mapping must return integer, None or unicode"); 4372 Py_DECREF(x); 4373 return -1; 4374 } 4375} 4376/* ensure that *outobj is at least requiredsize characters long, 4377if not reallocate and adjust various state variables. 
4378Return 0 on success, -1 on error */ 4379static 4380int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp, 4381 Py_ssize_t requiredsize) 4382{ 4383 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj); 4384 if (requiredsize > oldsize) { 4385 /* remember old output position */ 4386 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj); 4387 /* exponentially overallocate to minimize reallocations */ 4388 if (requiredsize < 2 * oldsize) 4389 requiredsize = 2 * oldsize; 4390 if (_PyUnicode_Resize(outobj, requiredsize) < 0) 4391 return -1; 4392 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos; 4393 } 4394 return 0; 4395} 4396/* lookup the character, put the result in the output string and adjust 4397 various state variables. Return a new reference to the object that 4398 was put in the output buffer in *result, or Py_None, if the mapping was 4399 undefined (in which case no character was written). 4400 The called must decref result. 4401 Return 0 on success, -1 on error. */ 4402static 4403int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp, 4404 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp, 4405 PyObject **res) 4406{ 4407 if (charmaptranslate_lookup(*curinp, mapping, res)) 4408 return -1; 4409 if (*res==NULL) { 4410 /* not found => default to 1:1 mapping */ 4411 *(*outp)++ = *curinp; 4412 } 4413 else if (*res==Py_None) 4414 ; 4415 else if (PyInt_Check(*res)) { 4416 /* no overflow check, because we know that the space is enough */ 4417 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res); 4418 } 4419 else if (PyUnicode_Check(*res)) { 4420 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res); 4421 if (repsize==1) { 4422 /* no overflow check, because we know that the space is enough */ 4423 *(*outp)++ = *PyUnicode_AS_UNICODE(*res); 4424 } 4425 else if (repsize!=0) { 4426 /* more than one character */ 4427 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) + 4428 (insize - (curinp-startinp)) + 4429 
repsize - 1; 4430 if (charmaptranslate_makespace(outobj, outp, requiredsize)) 4431 return -1; 4432 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize); 4433 *outp += repsize; 4434 } 4435 } 4436 else 4437 return -1; 4438 return 0; 4439} 4440 4441PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p, 4442 Py_ssize_t size, 4443 PyObject *mapping, 4444 const char *errors) 4445{ 4446 /* output object */ 4447 PyObject *res = NULL; 4448 /* pointers to the beginning and end+1 of input */ 4449 const Py_UNICODE *startp = p; 4450 const Py_UNICODE *endp = p + size; 4451 /* pointer into the output */ 4452 Py_UNICODE *str; 4453 /* current output position */ 4454 Py_ssize_t respos = 0; 4455 char *reason = "character maps to <undefined>"; 4456 PyObject *errorHandler = NULL; 4457 PyObject *exc = NULL; 4458 /* the following variable is used for caching string comparisons 4459 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 4460 * 3=ignore, 4=xmlcharrefreplace */ 4461 int known_errorHandler = -1; 4462 4463 if (mapping == NULL) { 4464 PyErr_BadArgument(); 4465 return NULL; 4466 } 4467 4468 /* allocate enough for a simple 1:1 translation without 4469 replacements, if we need more, we'll resize */ 4470 res = PyUnicode_FromUnicode(NULL, size); 4471 if (res == NULL) 4472 goto onError; 4473 if (size == 0) 4474 return res; 4475 str = PyUnicode_AS_UNICODE(res); 4476 4477 while (p<endp) { 4478 /* try to encode it */ 4479 PyObject *x = NULL; 4480 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) { 4481 Py_XDECREF(x); 4482 goto onError; 4483 } 4484 Py_XDECREF(x); 4485 if (x!=Py_None) /* it worked => adjust input pointer */ 4486 ++p; 4487 else { /* untranslatable character */ 4488 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */ 4489 Py_ssize_t repsize; 4490 Py_ssize_t newpos; 4491 Py_UNICODE *uni2; 4492 /* startpos for collecting untranslatable chars */ 4493 const Py_UNICODE *collstart = p; 4494 const Py_UNICODE *collend = p+1; 
4495 const Py_UNICODE *coll; 4496 4497 /* find all untranslatable characters */ 4498 while (collend < endp) { 4499 if (charmaptranslate_lookup(*collend, mapping, &x)) 4500 goto onError; 4501 Py_XDECREF(x); 4502 if (x!=Py_None) 4503 break; 4504 ++collend; 4505 } 4506 /* cache callback name lookup 4507 * (if not done yet, i.e. it's the first error) */ 4508 if (known_errorHandler==-1) { 4509 if ((errors==NULL) || (!strcmp(errors, "strict"))) 4510 known_errorHandler = 1; 4511 else if (!strcmp(errors, "replace")) 4512 known_errorHandler = 2; 4513 else if (!strcmp(errors, "ignore")) 4514 known_errorHandler = 3; 4515 else if (!strcmp(errors, "xmlcharrefreplace")) 4516 known_errorHandler = 4; 4517 else 4518 known_errorHandler = 0; 4519 } 4520 switch (known_errorHandler) { 4521 case 1: /* strict */ 4522 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason); 4523 goto onError; 4524 case 2: /* replace */ 4525 /* No need to check for space, this is a 1:1 replacement */ 4526 for (coll = collstart; coll<collend; ++coll) 4527 *str++ = '?'; 4528 /* fall through */ 4529 case 3: /* ignore */ 4530 p = collend; 4531 break; 4532 case 4: /* xmlcharrefreplace */ 4533 /* generate replacement (temporarily (mis)uses p) */ 4534 for (p = collstart; p < collend; ++p) { 4535 char buffer[2+29+1+1]; 4536 char *cp; 4537 sprintf(buffer, "&#%d;", (int)*p); 4538 if (charmaptranslate_makespace(&res, &str, 4539 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend))) 4540 goto onError; 4541 for (cp = buffer; *cp; ++cp) 4542 *str++ = *cp; 4543 } 4544 p = collend; 4545 break; 4546 default: 4547 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler, 4548 reason, startp, size, &exc, 4549 collstart-startp, collend-startp, &newpos); 4550 if (repunicode == NULL) 4551 goto onError; 4552 /* generate replacement */ 4553 repsize = PyUnicode_GET_SIZE(repunicode); 4554 if (charmaptranslate_makespace(&res, &str, 4555 
(str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) { 4556 Py_DECREF(repunicode); 4557 goto onError; 4558 } 4559 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) 4560 *str++ = *uni2; 4561 p = startp + newpos; 4562 Py_DECREF(repunicode); 4563 } 4564 } 4565 } 4566 /* Resize if we allocated to much */ 4567 respos = str-PyUnicode_AS_UNICODE(res); 4568 if (respos<PyUnicode_GET_SIZE(res)) { 4569 if (_PyUnicode_Resize(&res, respos) < 0) 4570 goto onError; 4571 } 4572 Py_XDECREF(exc); 4573 Py_XDECREF(errorHandler); 4574 return res; 4575 4576 onError: 4577 Py_XDECREF(res); 4578 Py_XDECREF(exc); 4579 Py_XDECREF(errorHandler); 4580 return NULL; 4581} 4582 4583PyObject *PyUnicode_Translate(PyObject *str, 4584 PyObject *mapping, 4585 const char *errors) 4586{ 4587 PyObject *result; 4588 4589 str = PyUnicode_FromObject(str); 4590 if (str == NULL) 4591 goto onError; 4592 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str), 4593 PyUnicode_GET_SIZE(str), 4594 mapping, 4595 errors); 4596 Py_DECREF(str); 4597 return result; 4598 4599 onError: 4600 Py_XDECREF(str); 4601 return NULL; 4602} 4603 4604/* --- Decimal Encoder ---------------------------------------------------- */ 4605 4606int PyUnicode_EncodeDecimal(Py_UNICODE *s, 4607 Py_ssize_t length, 4608 char *output, 4609 const char *errors) 4610{ 4611 Py_UNICODE *p, *end; 4612 PyObject *errorHandler = NULL; 4613 PyObject *exc = NULL; 4614 const char *encoding = "decimal"; 4615 const char *reason = "invalid decimal Unicode string"; 4616 /* the following variable is used for caching string comparisons 4617 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */ 4618 int known_errorHandler = -1; 4619 4620 if (output == NULL) { 4621 PyErr_BadArgument(); 4622 return -1; 4623 } 4624 4625 p = s; 4626 end = s + length; 4627 while (p < end) { 4628 register Py_UNICODE ch = *p; 4629 int decimal; 4630 PyObject *repunicode; 4631 Py_ssize_t repsize; 4632 Py_ssize_t newpos; 4633 
Py_UNICODE *uni2; 4634 Py_UNICODE *collstart; 4635 Py_UNICODE *collend; 4636 4637 if (Py_UNICODE_ISSPACE(ch)) { 4638 *output++ = ' '; 4639 ++p; 4640 continue; 4641 } 4642 decimal = Py_UNICODE_TODECIMAL(ch); 4643 if (decimal >= 0) { 4644 *output++ = '0' + decimal; 4645 ++p; 4646 continue; 4647 } 4648 if (0 < ch && ch < 256) { 4649 *output++ = (char)ch; 4650 ++p; 4651 continue; 4652 } 4653 /* All other characters are considered unencodable */ 4654 collstart = p; 4655 collend = p+1; 4656 while (collend < end) { 4657 if ((0 < *collend && *collend < 256) || 4658 !Py_UNICODE_ISSPACE(*collend) || 4659 Py_UNICODE_TODECIMAL(*collend)) 4660 break; 4661 } 4662 /* cache callback name lookup 4663 * (if not done yet, i.e. it's the first error) */ 4664 if (known_errorHandler==-1) { 4665 if ((errors==NULL) || (!strcmp(errors, "strict"))) 4666 known_errorHandler = 1; 4667 else if (!strcmp(errors, "replace")) 4668 known_errorHandler = 2; 4669 else if (!strcmp(errors, "ignore")) 4670 known_errorHandler = 3; 4671 else if (!strcmp(errors, "xmlcharrefreplace")) 4672 known_errorHandler = 4; 4673 else 4674 known_errorHandler = 0; 4675 } 4676 switch (known_errorHandler) { 4677 case 1: /* strict */ 4678 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason); 4679 goto onError; 4680 case 2: /* replace */ 4681 for (p = collstart; p < collend; ++p) 4682 *output++ = '?'; 4683 /* fall through */ 4684 case 3: /* ignore */ 4685 p = collend; 4686 break; 4687 case 4: /* xmlcharrefreplace */ 4688 /* generate replacement (temporarily (mis)uses p) */ 4689 for (p = collstart; p < collend; ++p) 4690 output += sprintf(output, "&#%d;", (int)*p); 4691 p = collend; 4692 break; 4693 default: 4694 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler, 4695 encoding, reason, s, length, &exc, 4696 collstart-s, collend-s, &newpos); 4697 if (repunicode == NULL) 4698 goto onError; 4699 /* generate replacement */ 4700 repsize = PyUnicode_GET_SIZE(repunicode); 4701 for (uni2 = 
PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) { 4702 Py_UNICODE ch = *uni2; 4703 if (Py_UNICODE_ISSPACE(ch)) 4704 *output++ = ' '; 4705 else { 4706 decimal = Py_UNICODE_TODECIMAL(ch); 4707 if (decimal >= 0) 4708 *output++ = '0' + decimal; 4709 else if (0 < ch && ch < 256) 4710 *output++ = (char)ch; 4711 else { 4712 Py_DECREF(repunicode); 4713 raise_encode_exception(&exc, encoding, 4714 s, length, collstart-s, collend-s, reason); 4715 goto onError; 4716 } 4717 } 4718 } 4719 p = s + newpos; 4720 Py_DECREF(repunicode); 4721 } 4722 } 4723 /* 0-terminate the output string */ 4724 *output++ = '\0'; 4725 Py_XDECREF(exc); 4726 Py_XDECREF(errorHandler); 4727 return 0; 4728 4729 onError: 4730 Py_XDECREF(exc); 4731 Py_XDECREF(errorHandler); 4732 return -1; 4733} 4734 4735/* --- Helpers ------------------------------------------------------------ */ 4736 4737#define STRINGLIB_CHAR Py_UNICODE 4738 4739#define STRINGLIB_LEN PyUnicode_GET_SIZE 4740#define STRINGLIB_NEW PyUnicode_FromUnicode 4741#define STRINGLIB_STR PyUnicode_AS_UNICODE 4742 4743Py_LOCAL_INLINE(int) 4744STRINGLIB_CMP(const Py_UNICODE* str, const Py_UNICODE* other, Py_ssize_t len) 4745{ 4746 if (str[0] != other[0]) 4747 return 1; 4748 return memcmp((void*) str, (void*) other, len * sizeof(Py_UNICODE)); 4749} 4750 4751#define STRINGLIB_EMPTY unicode_empty 4752 4753#include "stringlib/fastsearch.h" 4754 4755#include "stringlib/count.h" 4756#include "stringlib/find.h" 4757#include "stringlib/partition.h" 4758 4759/* helper macro to fixup start/end slice values */ 4760#define FIX_START_END(obj) \ 4761 if (start < 0) \ 4762 start += (obj)->length; \ 4763 if (start < 0) \ 4764 start = 0; \ 4765 if (end > (obj)->length) \ 4766 end = (obj)->length; \ 4767 if (end < 0) \ 4768 end += (obj)->length; \ 4769 if (end < 0) \ 4770 end = 0; 4771 4772Py_ssize_t PyUnicode_Count(PyObject *str, 4773 PyObject *substr, 4774 Py_ssize_t start, 4775 Py_ssize_t end) 4776{ 4777 Py_ssize_t result; 4778 PyUnicodeObject* str_obj; 4779 
PyUnicodeObject* sub_obj; 4780 4781 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str); 4782 if (!str_obj) 4783 return -1; 4784 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr); 4785 if (!sub_obj) { 4786 Py_DECREF(str_obj); 4787 return -1; 4788 } 4789 4790 FIX_START_END(str_obj); 4791 4792 result = stringlib_count( 4793 str_obj->str + start, end - start, sub_obj->str, sub_obj->length 4794 ); 4795 4796 Py_DECREF(sub_obj); 4797 Py_DECREF(str_obj); 4798 4799 return result; 4800} 4801 4802Py_ssize_t PyUnicode_Find(PyObject *str, 4803 PyObject *sub, 4804 Py_ssize_t start, 4805 Py_ssize_t end, 4806 int direction) 4807{ 4808 Py_ssize_t result; 4809 4810 str = PyUnicode_FromObject(str); 4811 if (!str) 4812 return -2; 4813 sub = PyUnicode_FromObject(sub); 4814 if (!sub) { 4815 Py_DECREF(str); 4816 return -2; 4817 } 4818 4819 if (direction > 0) 4820 result = stringlib_find_slice( 4821 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str), 4822 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub), 4823 start, end 4824 ); 4825 else 4826 result = stringlib_rfind_slice( 4827 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str), 4828 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub), 4829 start, end 4830 ); 4831 4832 Py_DECREF(str); 4833 Py_DECREF(sub); 4834 4835 return result; 4836} 4837 4838static 4839int tailmatch(PyUnicodeObject *self, 4840 PyUnicodeObject *substring, 4841 Py_ssize_t start, 4842 Py_ssize_t end, 4843 int direction) 4844{ 4845 if (substring->length == 0) 4846 return 1; 4847 4848 FIX_START_END(self); 4849 4850 end -= substring->length; 4851 if (end < start) 4852 return 0; 4853 4854 if (direction > 0) { 4855 if (Py_UNICODE_MATCH(self, end, substring)) 4856 return 1; 4857 } else { 4858 if (Py_UNICODE_MATCH(self, start, substring)) 4859 return 1; 4860 } 4861 4862 return 0; 4863} 4864 4865Py_ssize_t PyUnicode_Tailmatch(PyObject *str, 4866 PyObject *substr, 4867 Py_ssize_t start, 4868 Py_ssize_t end, 4869 int direction) 4870{ 4871 Py_ssize_t result; 4872 4873 
str = PyUnicode_FromObject(str); 4874 if (str == NULL) 4875 return -1; 4876 substr = PyUnicode_FromObject(substr); 4877 if (substr == NULL) { 4878 Py_DECREF(str); 4879 return -1; 4880 } 4881 4882 result = tailmatch((PyUnicodeObject *)str, 4883 (PyUnicodeObject *)substr, 4884 start, end, direction); 4885 Py_DECREF(str); 4886 Py_DECREF(substr); 4887 return result; 4888} 4889 4890/* Apply fixfct filter to the Unicode object self and return a 4891 reference to the modified object */ 4892 4893static 4894PyObject *fixup(PyUnicodeObject *self, 4895 int (*fixfct)(PyUnicodeObject *s)) 4896{ 4897 4898 PyUnicodeObject *u; 4899 4900 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length); 4901 if (u == NULL) 4902 return NULL; 4903 4904 Py_UNICODE_COPY(u->str, self->str, self->length); 4905 4906 if (!fixfct(u) && PyUnicode_CheckExact(self)) { 4907 /* fixfct should return TRUE if it modified the buffer. If 4908 FALSE, return a reference to the original buffer instead 4909 (to save space, not time) */ 4910 Py_INCREF(self); 4911 Py_DECREF(u); 4912 return (PyObject*) self; 4913 } 4914 return (PyObject*) u; 4915} 4916 4917static 4918int fixupper(PyUnicodeObject *self) 4919{ 4920 Py_ssize_t len = self->length; 4921 Py_UNICODE *s = self->str; 4922 int status = 0; 4923 4924 while (len-- > 0) { 4925 register Py_UNICODE ch; 4926 4927 ch = Py_UNICODE_TOUPPER(*s); 4928 if (ch != *s) { 4929 status = 1; 4930 *s = ch; 4931 } 4932 s++; 4933 } 4934 4935 return status; 4936} 4937 4938static 4939int fixlower(PyUnicodeObject *self) 4940{ 4941 Py_ssize_t len = self->length; 4942 Py_UNICODE *s = self->str; 4943 int status = 0; 4944 4945 while (len-- > 0) { 4946 register Py_UNICODE ch; 4947 4948 ch = Py_UNICODE_TOLOWER(*s); 4949 if (ch != *s) { 4950 status = 1; 4951 *s = ch; 4952 } 4953 s++; 4954 } 4955 4956 return status; 4957} 4958 4959static 4960int fixswapcase(PyUnicodeObject *self) 4961{ 4962 Py_ssize_t len = self->length; 4963 Py_UNICODE *s = self->str; 4964 int status = 0; 4965 4966 
while (len-- > 0) { 4967 if (Py_UNICODE_ISUPPER(*s)) { 4968 *s = Py_UNICODE_TOLOWER(*s); 4969 status = 1; 4970 } else if (Py_UNICODE_ISLOWER(*s)) { 4971 *s = Py_UNICODE_TOUPPER(*s); 4972 status = 1; 4973 } 4974 s++; 4975 } 4976 4977 return status; 4978} 4979 4980static 4981int fixcapitalize(PyUnicodeObject *self) 4982{ 4983 Py_ssize_t len = self->length; 4984 Py_UNICODE *s = self->str; 4985 int status = 0; 4986 4987 if (len == 0) 4988 return 0; 4989 if (Py_UNICODE_ISLOWER(*s)) { 4990 *s = Py_UNICODE_TOUPPER(*s); 4991 status = 1; 4992 } 4993 s++; 4994 while (--len > 0) { 4995 if (Py_UNICODE_ISUPPER(*s)) { 4996 *s = Py_UNICODE_TOLOWER(*s); 4997 status = 1; 4998 } 4999 s++; 5000 } 5001 return status; 5002} 5003 5004static 5005int fixtitle(PyUnicodeObject *self) 5006{ 5007 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 5008 register Py_UNICODE *e; 5009 int previous_is_cased; 5010 5011 /* Shortcut for single character strings */ 5012 if (PyUnicode_GET_SIZE(self) == 1) { 5013 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p); 5014 if (*p != ch) { 5015 *p = ch; 5016 return 1; 5017 } 5018 else 5019 return 0; 5020 } 5021 5022 e = p + PyUnicode_GET_SIZE(self); 5023 previous_is_cased = 0; 5024 for (; p < e; p++) { 5025 register const Py_UNICODE ch = *p; 5026 5027 if (previous_is_cased) 5028 *p = Py_UNICODE_TOLOWER(ch); 5029 else 5030 *p = Py_UNICODE_TOTITLE(ch); 5031 5032 if (Py_UNICODE_ISLOWER(ch) || 5033 Py_UNICODE_ISUPPER(ch) || 5034 Py_UNICODE_ISTITLE(ch)) 5035 previous_is_cased = 1; 5036 else 5037 previous_is_cased = 0; 5038 } 5039 return 1; 5040} 5041 5042PyObject * 5043PyUnicode_Join(PyObject *separator, PyObject *seq) 5044{ 5045 PyObject *internal_separator = NULL; 5046 const Py_UNICODE blank = ' '; 5047 const Py_UNICODE *sep = ␣ 5048 Py_ssize_t seplen = 1; 5049 PyUnicodeObject *res = NULL; /* the result */ 5050 Py_ssize_t res_alloc = 100; /* # allocated bytes for string in res */ 5051 Py_ssize_t res_used; /* # used bytes */ 5052 Py_UNICODE *res_p; /* pointer to free byte 
in res's string area */ 5053 PyObject *fseq; /* PySequence_Fast(seq) */ 5054 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */ 5055 PyObject *item; 5056 Py_ssize_t i; 5057 5058 fseq = PySequence_Fast(seq, ""); 5059 if (fseq == NULL) { 5060 return NULL; 5061 } 5062 5063 /* Grrrr. A codec may be invoked to convert str objects to 5064 * Unicode, and so it's possible to call back into Python code 5065 * during PyUnicode_FromObject(), and so it's possible for a sick 5066 * codec to change the size of fseq (if seq is a list). Therefore 5067 * we have to keep refetching the size -- can't assume seqlen 5068 * is invariant. 5069 */ 5070 seqlen = PySequence_Fast_GET_SIZE(fseq); 5071 /* If empty sequence, return u"". */ 5072 if (seqlen == 0) { 5073 res = _PyUnicode_New(0); /* empty sequence; return u"" */ 5074 goto Done; 5075 } 5076 /* If singleton sequence with an exact Unicode, return that. */ 5077 if (seqlen == 1) { 5078 item = PySequence_Fast_GET_ITEM(fseq, 0); 5079 if (PyUnicode_CheckExact(item)) { 5080 Py_INCREF(item); 5081 res = (PyUnicodeObject *)item; 5082 goto Done; 5083 } 5084 } 5085 5086 /* At least two items to join, or one that isn't exact Unicode. */ 5087 if (seqlen > 1) { 5088 /* Set up sep and seplen -- they're needed. */ 5089 if (separator == NULL) { 5090 sep = ␣ 5091 seplen = 1; 5092 } 5093 else { 5094 internal_separator = PyUnicode_FromObject(separator); 5095 if (internal_separator == NULL) 5096 goto onError; 5097 sep = PyUnicode_AS_UNICODE(internal_separator); 5098 seplen = PyUnicode_GET_SIZE(internal_separator); 5099 /* In case PyUnicode_FromObject() mutated seq. */ 5100 seqlen = PySequence_Fast_GET_SIZE(fseq); 5101 } 5102 } 5103 5104 /* Get space. 
*/ 5105 res = _PyUnicode_New(res_alloc); 5106 if (res == NULL) 5107 goto onError; 5108 res_p = PyUnicode_AS_UNICODE(res); 5109 res_used = 0; 5110 5111 for (i = 0; i < seqlen; ++i) { 5112 Py_ssize_t itemlen; 5113 Py_ssize_t new_res_used; 5114 5115 item = PySequence_Fast_GET_ITEM(fseq, i); 5116 /* Convert item to Unicode. */ 5117 if (! PyUnicode_Check(item) && ! PyString_Check(item)) { 5118 PyErr_Format(PyExc_TypeError, 5119 "sequence item %zd: expected string or Unicode," 5120 " %.80s found", 5121 i, item->ob_type->tp_name); 5122 goto onError; 5123 } 5124 item = PyUnicode_FromObject(item); 5125 if (item == NULL) 5126 goto onError; 5127 /* We own a reference to item from here on. */ 5128 5129 /* In case PyUnicode_FromObject() mutated seq. */ 5130 seqlen = PySequence_Fast_GET_SIZE(fseq); 5131 5132 /* Make sure we have enough space for the separator and the item. */ 5133 itemlen = PyUnicode_GET_SIZE(item); 5134 new_res_used = res_used + itemlen; 5135 if (new_res_used < 0) 5136 goto Overflow; 5137 if (i < seqlen - 1) { 5138 new_res_used += seplen; 5139 if (new_res_used < 0) 5140 goto Overflow; 5141 } 5142 if (new_res_used > res_alloc) { 5143 /* double allocated size until it's big enough */ 5144 do { 5145 res_alloc += res_alloc; 5146 if (res_alloc <= 0) 5147 goto Overflow; 5148 } while (new_res_used > res_alloc); 5149 if (_PyUnicode_Resize(&res, res_alloc) < 0) { 5150 Py_DECREF(item); 5151 goto onError; 5152 } 5153 res_p = PyUnicode_AS_UNICODE(res) + res_used; 5154 } 5155 5156 /* Copy item, and maybe the separator. */ 5157 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen); 5158 res_p += itemlen; 5159 if (i < seqlen - 1) { 5160 Py_UNICODE_COPY(res_p, sep, seplen); 5161 res_p += seplen; 5162 } 5163 Py_DECREF(item); 5164 res_used = new_res_used; 5165 } 5166 5167 /* Shrink res to match the used area; this probably can't fail, 5168 * but it's cheap to check. 
5169 */ 5170 if (_PyUnicode_Resize(&res, res_used) < 0) 5171 goto onError; 5172 5173 Done: 5174 Py_XDECREF(internal_separator); 5175 Py_DECREF(fseq); 5176 return (PyObject *)res; 5177 5178 Overflow: 5179 PyErr_SetString(PyExc_OverflowError, 5180 "join() result is too long for a Python string"); 5181 Py_DECREF(item); 5182 /* fall through */ 5183 5184 onError: 5185 Py_XDECREF(internal_separator); 5186 Py_DECREF(fseq); 5187 Py_XDECREF(res); 5188 return NULL; 5189} 5190 5191static 5192PyUnicodeObject *pad(PyUnicodeObject *self, 5193 Py_ssize_t left, 5194 Py_ssize_t right, 5195 Py_UNICODE fill) 5196{ 5197 PyUnicodeObject *u; 5198 5199 if (left < 0) 5200 left = 0; 5201 if (right < 0) 5202 right = 0; 5203 5204 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) { 5205 Py_INCREF(self); 5206 return self; 5207 } 5208 5209 u = _PyUnicode_New(left + self->length + right); 5210 if (u) { 5211 if (left) 5212 Py_UNICODE_FILL(u->str, fill, left); 5213 Py_UNICODE_COPY(u->str + left, self->str, self->length); 5214 if (right) 5215 Py_UNICODE_FILL(u->str + left + self->length, fill, right); 5216 } 5217 5218 return u; 5219} 5220 5221#define SPLIT_APPEND(data, left, right) \ 5222 str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \ 5223 if (!str) \ 5224 goto onError; \ 5225 if (PyList_Append(list, str)) { \ 5226 Py_DECREF(str); \ 5227 goto onError; \ 5228 } \ 5229 else \ 5230 Py_DECREF(str); 5231 5232static 5233PyObject *split_whitespace(PyUnicodeObject *self, 5234 PyObject *list, 5235 Py_ssize_t maxcount) 5236{ 5237 register Py_ssize_t i; 5238 register Py_ssize_t j; 5239 Py_ssize_t len = self->length; 5240 PyObject *str; 5241 5242 for (i = j = 0; i < len; ) { 5243 /* find a token */ 5244 while (i < len && Py_UNICODE_ISSPACE(self->str[i])) 5245 i++; 5246 j = i; 5247 while (i < len && !Py_UNICODE_ISSPACE(self->str[i])) 5248 i++; 5249 if (j < i) { 5250 if (maxcount-- <= 0) 5251 break; 5252 SPLIT_APPEND(self->str, j, i); 5253 while (i < len && 
Py_UNICODE_ISSPACE(self->str[i])) 5254 i++; 5255 j = i; 5256 } 5257 } 5258 if (j < len) { 5259 SPLIT_APPEND(self->str, j, len); 5260 } 5261 return list; 5262 5263 onError: 5264 Py_DECREF(list); 5265 return NULL; 5266} 5267 5268PyObject *PyUnicode_Splitlines(PyObject *string, 5269 int keepends) 5270{ 5271 register Py_ssize_t i; 5272 register Py_ssize_t j; 5273 Py_ssize_t len; 5274 PyObject *list; 5275 PyObject *str; 5276 Py_UNICODE *data; 5277 5278 string = PyUnicode_FromObject(string); 5279 if (string == NULL) 5280 return NULL; 5281 data = PyUnicode_AS_UNICODE(string); 5282 len = PyUnicode_GET_SIZE(string); 5283 5284 list = PyList_New(0); 5285 if (!list) 5286 goto onError; 5287 5288 for (i = j = 0; i < len; ) { 5289 Py_ssize_t eol; 5290 5291 /* Find a line and append it */ 5292 while (i < len && !BLOOM_LINEBREAK(data[i])) 5293 i++; 5294 5295 /* Skip the line break reading CRLF as one line break */ 5296 eol = i; 5297 if (i < len) { 5298 if (data[i] == '\r' && i + 1 < len && 5299 data[i+1] == '\n') 5300 i += 2; 5301 else 5302 i++; 5303 if (keepends) 5304 eol = i; 5305 } 5306 SPLIT_APPEND(data, j, eol); 5307 j = i; 5308 } 5309 if (j < len) { 5310 SPLIT_APPEND(data, j, len); 5311 } 5312 5313 Py_DECREF(string); 5314 return list; 5315 5316 onError: 5317 Py_XDECREF(list); 5318 Py_DECREF(string); 5319 return NULL; 5320} 5321 5322static 5323PyObject *split_char(PyUnicodeObject *self, 5324 PyObject *list, 5325 Py_UNICODE ch, 5326 Py_ssize_t maxcount) 5327{ 5328 register Py_ssize_t i; 5329 register Py_ssize_t j; 5330 Py_ssize_t len = self->length; 5331 PyObject *str; 5332 5333 for (i = j = 0; i < len; ) { 5334 if (self->str[i] == ch) { 5335 if (maxcount-- <= 0) 5336 break; 5337 SPLIT_APPEND(self->str, j, i); 5338 i = j = i + 1; 5339 } else 5340 i++; 5341 } 5342 if (j <= len) { 5343 SPLIT_APPEND(self->str, j, len); 5344 } 5345 return list; 5346 5347 onError: 5348 Py_DECREF(list); 5349 return NULL; 5350} 5351 5352static 5353PyObject *split_substring(PyUnicodeObject *self, 5354 
PyObject *list, 5355 PyUnicodeObject *substring, 5356 Py_ssize_t maxcount) 5357{ 5358 register Py_ssize_t i; 5359 register Py_ssize_t j; 5360 Py_ssize_t len = self->length; 5361 Py_ssize_t sublen = substring->length; 5362 PyObject *str; 5363 5364 for (i = j = 0; i <= len - sublen; ) { 5365 if (Py_UNICODE_MATCH(self, i, substring)) { 5366 if (maxcount-- <= 0) 5367 break; 5368 SPLIT_APPEND(self->str, j, i); 5369 i = j = i + sublen; 5370 } else 5371 i++; 5372 } 5373 if (j <= len) { 5374 SPLIT_APPEND(self->str, j, len); 5375 } 5376 return list; 5377 5378 onError: 5379 Py_DECREF(list); 5380 return NULL; 5381} 5382 5383static 5384PyObject *rsplit_whitespace(PyUnicodeObject *self, 5385 PyObject *list, 5386 Py_ssize_t maxcount) 5387{ 5388 register Py_ssize_t i; 5389 register Py_ssize_t j; 5390 Py_ssize_t len = self->length; 5391 PyObject *str; 5392 5393 for (i = j = len - 1; i >= 0; ) { 5394 /* find a token */ 5395 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i])) 5396 i--; 5397 j = i; 5398 while (i >= 0 && !Py_UNICODE_ISSPACE(self->str[i])) 5399 i--; 5400 if (j > i) { 5401 if (maxcount-- <= 0) 5402 break; 5403 SPLIT_APPEND(self->str, i + 1, j + 1); 5404 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i])) 5405 i--; 5406 j = i; 5407 } 5408 } 5409 if (j >= 0) { 5410 SPLIT_APPEND(self->str, 0, j + 1); 5411 } 5412 if (PyList_Reverse(list) < 0) 5413 goto onError; 5414 return list; 5415 5416 onError: 5417 Py_DECREF(list); 5418 return NULL; 5419} 5420 5421static 5422PyObject *rsplit_char(PyUnicodeObject *self, 5423 PyObject *list, 5424 Py_UNICODE ch, 5425 Py_ssize_t maxcount) 5426{ 5427 register Py_ssize_t i; 5428 register Py_ssize_t j; 5429 Py_ssize_t len = self->length; 5430 PyObject *str; 5431 5432 for (i = j = len - 1; i >= 0; ) { 5433 if (self->str[i] == ch) { 5434 if (maxcount-- <= 0) 5435 break; 5436 SPLIT_APPEND(self->str, i + 1, j + 1); 5437 j = i = i - 1; 5438 } else 5439 i--; 5440 } 5441 if (j >= -1) { 5442 SPLIT_APPEND(self->str, 0, j + 1); 5443 } 5444 if 
(PyList_Reverse(list) < 0) 5445 goto onError; 5446 return list; 5447 5448 onError: 5449 Py_DECREF(list); 5450 return NULL; 5451} 5452 5453static 5454PyObject *rsplit_substring(PyUnicodeObject *self, 5455 PyObject *list, 5456 PyUnicodeObject *substring, 5457 Py_ssize_t maxcount) 5458{ 5459 register Py_ssize_t i; 5460 register Py_ssize_t j; 5461 Py_ssize_t len = self->length; 5462 Py_ssize_t sublen = substring->length; 5463 PyObject *str; 5464 5465 for (i = len - sublen, j = len; i >= 0; ) { 5466 if (Py_UNICODE_MATCH(self, i, substring)) { 5467 if (maxcount-- <= 0) 5468 break; 5469 SPLIT_APPEND(self->str, i + sublen, j); 5470 j = i; 5471 i -= sublen; 5472 } else 5473 i--; 5474 } 5475 if (j >= 0) { 5476 SPLIT_APPEND(self->str, 0, j); 5477 } 5478 if (PyList_Reverse(list) < 0) 5479 goto onError; 5480 return list; 5481 5482 onError: 5483 Py_DECREF(list); 5484 return NULL; 5485} 5486 5487#undef SPLIT_APPEND 5488 5489static 5490PyObject *split(PyUnicodeObject *self, 5491 PyUnicodeObject *substring, 5492 Py_ssize_t maxcount) 5493{ 5494 PyObject *list; 5495 5496 if (maxcount < 0) 5497 maxcount = PY_SSIZE_T_MAX; 5498 5499 list = PyList_New(0); 5500 if (!list) 5501 return NULL; 5502 5503 if (substring == NULL) 5504 return split_whitespace(self,list,maxcount); 5505 5506 else if (substring->length == 1) 5507 return split_char(self,list,substring->str[0],maxcount); 5508 5509 else if (substring->length == 0) { 5510 Py_DECREF(list); 5511 PyErr_SetString(PyExc_ValueError, "empty separator"); 5512 return NULL; 5513 } 5514 else 5515 return split_substring(self,list,substring,maxcount); 5516} 5517 5518static 5519PyObject *rsplit(PyUnicodeObject *self, 5520 PyUnicodeObject *substring, 5521 Py_ssize_t maxcount) 5522{ 5523 PyObject *list; 5524 5525 if (maxcount < 0) 5526 maxcount = PY_SSIZE_T_MAX; 5527 5528 list = PyList_New(0); 5529 if (!list) 5530 return NULL; 5531 5532 if (substring == NULL) 5533 return rsplit_whitespace(self,list,maxcount); 5534 5535 else if (substring->length == 1) 
5536 return rsplit_char(self,list,substring->str[0],maxcount); 5537 5538 else if (substring->length == 0) { 5539 Py_DECREF(list); 5540 PyErr_SetString(PyExc_ValueError, "empty separator"); 5541 return NULL; 5542 } 5543 else 5544 return rsplit_substring(self,list,substring,maxcount); 5545} 5546 5547static 5548PyObject *replace(PyUnicodeObject *self, 5549 PyUnicodeObject *str1, 5550 PyUnicodeObject *str2, 5551 Py_ssize_t maxcount) 5552{ 5553 PyUnicodeObject *u; 5554 5555 if (maxcount < 0) 5556 maxcount = PY_SSIZE_T_MAX; 5557 5558 if (str1->length == str2->length) { 5559 /* same length */ 5560 Py_ssize_t i; 5561 if (str1->length == 1) { 5562 /* replace characters */ 5563 Py_UNICODE u1, u2; 5564 if (!findchar(self->str, self->length, str1->str[0])) 5565 goto nothing; 5566 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length); 5567 if (!u) 5568 return NULL; 5569 Py_UNICODE_COPY(u->str, self->str, self->length); 5570 u1 = str1->str[0]; 5571 u2 = str2->str[0]; 5572 for (i = 0; i < u->length; i++) 5573 if (u->str[i] == u1) { 5574 if (--maxcount < 0) 5575 break; 5576 u->str[i] = u2; 5577 } 5578 } else { 5579 i = fastsearch( 5580 self->str, self->length, str1->str, str1->length, FAST_SEARCH 5581 ); 5582 if (i < 0) 5583 goto nothing; 5584 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length); 5585 if (!u) 5586 return NULL; 5587 Py_UNICODE_COPY(u->str, self->str, self->length); 5588 while (i <= self->length - str1->length) 5589 if (Py_UNICODE_MATCH(self, i, str1)) { 5590 if (--maxcount < 0) 5591 break; 5592 Py_UNICODE_COPY(u->str+i, str2->str, str2->length); 5593 i += str1->length; 5594 } else 5595 i++; 5596 } 5597 } else { 5598 5599 Py_ssize_t n, i, j, e; 5600 Py_ssize_t product, new_size, delta; 5601 Py_UNICODE *p; 5602 5603 /* replace strings */ 5604 n = stringlib_count(self->str, self->length, str1->str, str1->length); 5605 if (n > maxcount) 5606 n = maxcount; 5607 if (n == 0) 5608 goto nothing; 5609 /* new_size = self->length + n * (str2->length - 
str1->length)); */ 5610 delta = (str2->length - str1->length); 5611 if (delta == 0) { 5612 new_size = self->length; 5613 } else { 5614 product = n * (str2->length - str1->length); 5615 if ((product / (str2->length - str1->length)) != n) { 5616 PyErr_SetString(PyExc_OverflowError, 5617 "replace string is too long"); 5618 return NULL; 5619 } 5620 new_size = self->length + product; 5621 if (new_size < 0) { 5622 PyErr_SetString(PyExc_OverflowError, 5623 "replace string is too long"); 5624 return NULL; 5625 } 5626 } 5627 u = _PyUnicode_New(new_size); 5628 if (!u) 5629 return NULL; 5630 i = 0; 5631 p = u->str; 5632 e = self->length - str1->length; 5633 if (str1->length > 0) { 5634 while (n-- > 0) { 5635 /* look for next match */ 5636 j = i; 5637 while (j <= e) { 5638 if (Py_UNICODE_MATCH(self, j, str1)) 5639 break; 5640 j++; 5641 } 5642 if (j > i) { 5643 if (j > e) 5644 break; 5645 /* copy unchanged part [i:j] */ 5646 Py_UNICODE_COPY(p, self->str+i, j-i); 5647 p += j - i; 5648 } 5649 /* copy substitution string */ 5650 if (str2->length > 0) { 5651 Py_UNICODE_COPY(p, str2->str, str2->length); 5652 p += str2->length; 5653 } 5654 i = j + str1->length; 5655 } 5656 if (i < self->length) 5657 /* copy tail [i:] */ 5658 Py_UNICODE_COPY(p, self->str+i, self->length-i); 5659 } else { 5660 /* interleave */ 5661 while (n > 0) { 5662 Py_UNICODE_COPY(p, str2->str, str2->length); 5663 p += str2->length; 5664 if (--n <= 0) 5665 break; 5666 *p++ = self->str[i++]; 5667 } 5668 Py_UNICODE_COPY(p, self->str+i, self->length-i); 5669 } 5670 } 5671 return (PyObject *) u; 5672 5673nothing: 5674 /* nothing to replace; return original string (when possible) */ 5675 if (PyUnicode_CheckExact(self)) { 5676 Py_INCREF(self); 5677 return (PyObject *) self; 5678 } 5679 return PyUnicode_FromUnicode(self->str, self->length); 5680} 5681 5682/* --- Unicode Object Methods --------------------------------------------- */ 5683 5684PyDoc_STRVAR(title__doc__, 5685"S.title() -> unicode\n\ 5686\n\ 5687Return a 
titlecased version of S, i.e. words start with title case\n\ 5688characters, all remaining cased characters have lower case."); 5689 5690static PyObject* 5691unicode_title(PyUnicodeObject *self) 5692{ 5693 return fixup(self, fixtitle); 5694} 5695 5696PyDoc_STRVAR(capitalize__doc__, 5697"S.capitalize() -> unicode\n\ 5698\n\ 5699Return a capitalized version of S, i.e. make the first character\n\ 5700have upper case."); 5701 5702static PyObject* 5703unicode_capitalize(PyUnicodeObject *self) 5704{ 5705 return fixup(self, fixcapitalize); 5706} 5707 5708#if 0 5709PyDoc_STRVAR(capwords__doc__, 5710"S.capwords() -> unicode\n\ 5711\n\ 5712Apply .capitalize() to all words in S and return the result with\n\ 5713normalized whitespace (all whitespace strings are replaced by ' ')."); 5714 5715static PyObject* 5716unicode_capwords(PyUnicodeObject *self) 5717{ 5718 PyObject *list; 5719 PyObject *item; 5720 Py_ssize_t i; 5721 5722 /* Split into words */ 5723 list = split(self, NULL, -1); 5724 if (!list) 5725 return NULL; 5726 5727 /* Capitalize each word */ 5728 for (i = 0; i < PyList_GET_SIZE(list); i++) { 5729 item = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, i), 5730 fixcapitalize); 5731 if (item == NULL) 5732 goto onError; 5733 Py_DECREF(PyList_GET_ITEM(list, i)); 5734 PyList_SET_ITEM(list, i, item); 5735 } 5736 5737 /* Join the words to form a new string */ 5738 item = PyUnicode_Join(NULL, list); 5739 5740onError: 5741 Py_DECREF(list); 5742 return (PyObject *)item; 5743} 5744#endif 5745 5746/* Argument converter. 
Coerces to a single unicode character */ 5747 5748static int 5749convert_uc(PyObject *obj, void *addr) 5750{ 5751 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr; 5752 PyObject *uniobj; 5753 Py_UNICODE *unistr; 5754 5755 uniobj = PyUnicode_FromObject(obj); 5756 if (uniobj == NULL) { 5757 PyErr_SetString(PyExc_TypeError, 5758 "The fill character cannot be converted to Unicode"); 5759 return 0; 5760 } 5761 if (PyUnicode_GET_SIZE(uniobj) != 1) { 5762 PyErr_SetString(PyExc_TypeError, 5763 "The fill character must be exactly one character long"); 5764 Py_DECREF(uniobj); 5765 return 0; 5766 } 5767 unistr = PyUnicode_AS_UNICODE(uniobj); 5768 *fillcharloc = unistr[0]; 5769 Py_DECREF(uniobj); 5770 return 1; 5771} 5772 5773PyDoc_STRVAR(center__doc__, 5774"S.center(width[, fillchar]) -> unicode\n\ 5775\n\ 5776Return S centered in a Unicode string of length width. Padding is\n\ 5777done using the specified fill character (default is a space)"); 5778 5779static PyObject * 5780unicode_center(PyUnicodeObject *self, PyObject *args) 5781{ 5782 Py_ssize_t marg, left; 5783 Py_ssize_t width; 5784 Py_UNICODE fillchar = ' '; 5785 5786 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar)) 5787 return NULL; 5788 5789 if (self->length >= width && PyUnicode_CheckExact(self)) { 5790 Py_INCREF(self); 5791 return (PyObject*) self; 5792 } 5793 5794 marg = width - self->length; 5795 left = marg / 2 + (marg & width & 1); 5796 5797 return (PyObject*) pad(self, left, marg - left, fillchar); 5798} 5799 5800#if 0 5801 5802/* This code should go into some future Unicode collation support 5803 module. The basic comparison should compare ordinals on a naive 5804 basis (this is what Java does and thus JPython too). 
*/ 5805 5806/* speedy UTF-16 code point order comparison */ 5807/* gleaned from: */ 5808/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */ 5809 5810static short utf16Fixup[32] = 5811{ 5812 0, 0, 0, 0, 0, 0, 0, 0, 5813 0, 0, 0, 0, 0, 0, 0, 0, 5814 0, 0, 0, 0, 0, 0, 0, 0, 5815 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800 5816}; 5817 5818static int 5819unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2) 5820{ 5821 Py_ssize_t len1, len2; 5822 5823 Py_UNICODE *s1 = str1->str; 5824 Py_UNICODE *s2 = str2->str; 5825 5826 len1 = str1->length; 5827 len2 = str2->length; 5828 5829 while (len1 > 0 && len2 > 0) { 5830 Py_UNICODE c1, c2; 5831 5832 c1 = *s1++; 5833 c2 = *s2++; 5834 5835 if (c1 > (1<<11) * 26) 5836 c1 += utf16Fixup[c1>>11]; 5837 if (c2 > (1<<11) * 26) 5838 c2 += utf16Fixup[c2>>11]; 5839 /* now c1 and c2 are in UTF-32-compatible order */ 5840 5841 if (c1 != c2) 5842 return (c1 < c2) ? -1 : 1; 5843 5844 len1--; len2--; 5845 } 5846 5847 return (len1 < len2) ? -1 : (len1 != len2); 5848} 5849 5850#else 5851 5852static int 5853unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2) 5854{ 5855 register Py_ssize_t len1, len2; 5856 5857 Py_UNICODE *s1 = str1->str; 5858 Py_UNICODE *s2 = str2->str; 5859 5860 len1 = str1->length; 5861 len2 = str2->length; 5862 5863 while (len1 > 0 && len2 > 0) { 5864 Py_UNICODE c1, c2; 5865 5866 c1 = *s1++; 5867 c2 = *s2++; 5868 5869 if (c1 != c2) 5870 return (c1 < c2) ? -1 : 1; 5871 5872 len1--; len2--; 5873 } 5874 5875 return (len1 < len2) ? 
-1 : (len1 != len2); 5876} 5877 5878#endif 5879 5880int PyUnicode_Compare(PyObject *left, 5881 PyObject *right) 5882{ 5883 if (PyUnicode_Check(left) && PyUnicode_Check(right)) 5884 return unicode_compare((PyUnicodeObject *)left, 5885 (PyUnicodeObject *)right); 5886 if ((PyString_Check(left) && PyUnicode_Check(right)) || 5887 (PyUnicode_Check(left) && PyString_Check(right))) { 5888 if (PyUnicode_Check(left)) 5889 left = _PyUnicode_AsDefaultEncodedString(left, NULL); 5890 if (PyUnicode_Check(right)) 5891 right = _PyUnicode_AsDefaultEncodedString(right, NULL); 5892 assert(PyString_Check(left)); 5893 assert(PyString_Check(right)); 5894 return PyObject_Compare(left, right); 5895 } 5896 PyErr_Format(PyExc_TypeError, 5897 "Can't compare %.100s and %.100s", 5898 left->ob_type->tp_name, 5899 right->ob_type->tp_name); 5900 return -1; 5901} 5902 5903int 5904PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str) 5905{ 5906 int i; 5907 Py_UNICODE *id; 5908 assert(PyUnicode_Check(uni)); 5909 id = PyUnicode_AS_UNICODE(uni); 5910 /* Compare Unicode string and source character set string */ 5911 for (i = 0; id[i] && str[i]; i++) 5912 if (id[i] != str[i]) 5913 return ((int)id[i] < (int)str[i]) ? 
-1 : 1; 5914 if (id[i]) 5915 return 1; /* uni is longer */ 5916 if (str[i]) 5917 return -1; /* str is longer */ 5918 return 0; 5919} 5920 5921PyObject *PyUnicode_RichCompare(PyObject *left, 5922 PyObject *right, 5923 int op) 5924{ 5925 int result; 5926 5927 result = PyUnicode_Compare(left, right); 5928 if (result == -1 && PyErr_Occurred()) 5929 goto onError; 5930 5931 /* Convert the return value to a Boolean */ 5932 switch (op) { 5933 case Py_EQ: 5934 result = (result == 0); 5935 break; 5936 case Py_NE: 5937 result = (result != 0); 5938 break; 5939 case Py_LE: 5940 result = (result <= 0); 5941 break; 5942 case Py_GE: 5943 result = (result >= 0); 5944 break; 5945 case Py_LT: 5946 result = (result == -1); 5947 break; 5948 case Py_GT: 5949 result = (result == 1); 5950 break; 5951 } 5952 return PyBool_FromLong(result); 5953 5954 onError: 5955 5956 /* Standard case 5957 5958 Type errors mean that PyUnicode_FromObject() could not convert 5959 one of the arguments (usually the right hand side) to Unicode, 5960 ie. we can't handle the comparison request. However, it is 5961 possible that the other object knows a comparison method, which 5962 is why we return Py_NotImplemented to give the other object a 5963 chance. 5964 5965 */ 5966 if (PyErr_ExceptionMatches(PyExc_TypeError)) { 5967 PyErr_Clear(); 5968 Py_INCREF(Py_NotImplemented); 5969 return Py_NotImplemented; 5970 } 5971 if (op != Py_EQ && op != Py_NE) 5972 return NULL; 5973 5974 /* Equality comparison. 5975 5976 This is a special case: we silence any PyExc_UnicodeDecodeError 5977 and instead turn it into a PyErr_UnicodeWarning. 5978 5979 */ 5980 if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) 5981 return NULL; 5982 PyErr_Clear(); 5983 if (PyErr_Warn(PyExc_UnicodeWarning, 5984 (op == Py_EQ) ? 
5985 "Unicode equal comparison " 5986 "failed to convert both arguments to Unicode - " 5987 "interpreting them as being unequal" : 5988 "Unicode unequal comparison " 5989 "failed to convert both arguments to Unicode - " 5990 "interpreting them as being unequal" 5991 ) < 0) 5992 return NULL; 5993 result = (op == Py_NE); 5994 return PyBool_FromLong(result); 5995} 5996 5997int PyUnicode_Contains(PyObject *container, 5998 PyObject *element) 5999{ 6000 PyObject *str, *sub; 6001 int result; 6002 6003 /* Coerce the two arguments */ 6004 sub = PyUnicode_FromObject(element); 6005 if (!sub) { 6006 PyErr_Format(PyExc_TypeError, 6007 "'in <string>' requires string as left operand, not %s", 6008 element->ob_type->tp_name); 6009 return -1; 6010 } 6011 6012 str = PyUnicode_FromObject(container); 6013 if (!str) { 6014 Py_DECREF(sub); 6015 return -1; 6016 } 6017 6018 result = stringlib_contains_obj(str, sub); 6019 6020 Py_DECREF(str); 6021 Py_DECREF(sub); 6022 6023 return result; 6024} 6025 6026/* Concat to string or Unicode object giving a new Unicode object. 
*/ 6027 6028PyObject *PyUnicode_Concat(PyObject *left, 6029 PyObject *right) 6030{ 6031 PyUnicodeObject *u = NULL, *v = NULL, *w; 6032 6033 if (PyBytes_Check(left) || PyBytes_Check(right)) 6034 return PyBytes_Concat(left, right); 6035 6036 /* Coerce the two arguments */ 6037 u = (PyUnicodeObject *)PyUnicode_FromObject(left); 6038 if (u == NULL) 6039 goto onError; 6040 v = (PyUnicodeObject *)PyUnicode_FromObject(right); 6041 if (v == NULL) 6042 goto onError; 6043 6044 /* Shortcuts */ 6045 if (v == unicode_empty) { 6046 Py_DECREF(v); 6047 return (PyObject *)u; 6048 } 6049 if (u == unicode_empty) { 6050 Py_DECREF(u); 6051 return (PyObject *)v; 6052 } 6053 6054 /* Concat the two Unicode strings */ 6055 w = _PyUnicode_New(u->length + v->length); 6056 if (w == NULL) 6057 goto onError; 6058 Py_UNICODE_COPY(w->str, u->str, u->length); 6059 Py_UNICODE_COPY(w->str + u->length, v->str, v->length); 6060 6061 Py_DECREF(u); 6062 Py_DECREF(v); 6063 return (PyObject *)w; 6064 6065onError: 6066 Py_XDECREF(u); 6067 Py_XDECREF(v); 6068 return NULL; 6069} 6070 6071void 6072PyUnicode_Append(PyObject **pleft, PyObject *right) 6073{ 6074 PyObject *new; 6075 if (*pleft == NULL) 6076 return; 6077 if (right == NULL || !PyUnicode_Check(*pleft)) { 6078 Py_DECREF(*pleft); 6079 *pleft = NULL; 6080 return; 6081 } 6082 new = PyUnicode_Concat(*pleft, right); 6083 Py_DECREF(*pleft); 6084 *pleft = new; 6085} 6086 6087void 6088PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right) 6089{ 6090 PyUnicode_Append(pleft, right); 6091 Py_XDECREF(right); 6092} 6093 6094PyDoc_STRVAR(count__doc__, 6095"S.count(sub[, start[, end]]) -> int\n\ 6096\n\ 6097Return the number of non-overlapping occurrences of substring sub in\n\ 6098Unicode string S[start:end]. 
Optional arguments start and end are\n\ 6099interpreted as in slice notation."); 6100 6101static PyObject * 6102unicode_count(PyUnicodeObject *self, PyObject *args) 6103{ 6104 PyUnicodeObject *substring; 6105 Py_ssize_t start = 0; 6106 Py_ssize_t end = PY_SSIZE_T_MAX; 6107 PyObject *result; 6108 6109 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring, 6110 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end)) 6111 return NULL; 6112 6113 substring = (PyUnicodeObject *)PyUnicode_FromObject( 6114 (PyObject *)substring); 6115 if (substring == NULL) 6116 return NULL; 6117 6118 FIX_START_END(self); 6119 6120 result = PyInt_FromSsize_t( 6121 stringlib_count(self->str + start, end - start, 6122 substring->str, substring->length) 6123 ); 6124 6125 Py_DECREF(substring); 6126 6127 return result; 6128} 6129 6130PyDoc_STRVAR(encode__doc__, 6131"S.encode([encoding[,errors]]) -> string or unicode\n\ 6132\n\ 6133Encodes S using the codec registered for encoding. encoding defaults\n\ 6134to the default encoding. errors may be given to set a different error\n\ 6135handling scheme. Default is 'strict' meaning that encoding errors raise\n\ 6136a UnicodeEncodeError. 
Other possible values are 'ignore', 'replace' and\n\ 6137'xmlcharrefreplace' as well as any other name registered with\n\ 6138codecs.register_error that can handle UnicodeEncodeErrors."); 6139 6140static PyObject * 6141unicode_encode(PyUnicodeObject *self, PyObject *args) 6142{ 6143 char *encoding = NULL; 6144 char *errors = NULL; 6145 PyObject *v; 6146 6147 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors)) 6148 return NULL; 6149 v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors); 6150 if (v == NULL) 6151 goto onError; 6152 if (!PyBytes_Check(v)) { 6153 if (PyString_Check(v)) { 6154 /* Old codec, turn it into bytes */ 6155 PyObject *b = PyBytes_FromObject(v); 6156 Py_DECREF(v); 6157 return b; 6158 } 6159 PyErr_Format(PyExc_TypeError, 6160 "encoder did not return a bytes object " 6161 "(type=%.400s)", 6162 v->ob_type->tp_name); 6163 Py_DECREF(v); 6164 return NULL; 6165 } 6166 return v; 6167 6168 onError: 6169 return NULL; 6170} 6171 6172PyDoc_STRVAR(decode__doc__, 6173"S.decode([encoding[,errors]]) -> string or unicode\n\ 6174\n\ 6175Decodes S using the codec registered for encoding. encoding defaults\n\ 6176to the default encoding. errors may be given to set a different error\n\ 6177handling scheme. Default is 'strict' meaning that encoding errors raise\n\ 6178a UnicodeDecodeError. 
Other possible values are 'ignore' and 'replace'\n\ 6179as well as any other name registerd with codecs.register_error that is\n\ 6180able to handle UnicodeDecodeErrors."); 6181 6182static PyObject * 6183unicode_decode(PyUnicodeObject *self, PyObject *args) 6184{ 6185 char *encoding = NULL; 6186 char *errors = NULL; 6187 PyObject *v; 6188 6189 if (!PyArg_ParseTuple(args, "|ss:decode", &encoding, &errors)) 6190 return NULL; 6191 v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors); 6192 if (v == NULL) 6193 goto onError; 6194 if (!PyString_Check(v) && !PyUnicode_Check(v)) { 6195 PyErr_Format(PyExc_TypeError, 6196 "decoder did not return a string/unicode object " 6197 "(type=%.400s)", 6198 v->ob_type->tp_name); 6199 Py_DECREF(v); 6200 return NULL; 6201 } 6202 return v; 6203 6204 onError: 6205 return NULL; 6206} 6207 6208PyDoc_STRVAR(expandtabs__doc__, 6209"S.expandtabs([tabsize]) -> unicode\n\ 6210\n\ 6211Return a copy of S where all tab characters are expanded using spaces.\n\ 6212If tabsize is not given, a tab size of 8 characters is assumed."); 6213 6214static PyObject* 6215unicode_expandtabs(PyUnicodeObject *self, PyObject *args) 6216{ 6217 Py_UNICODE *e; 6218 Py_UNICODE *p; 6219 Py_UNICODE *q; 6220 Py_ssize_t i, j, old_j; 6221 PyUnicodeObject *u; 6222 int tabsize = 8; 6223 6224 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize)) 6225 return NULL; 6226 6227 /* First pass: determine size of output string */ 6228 i = j = old_j = 0; 6229 e = self->str + self->length; 6230 for (p = self->str; p < e; p++) 6231 if (*p == '\t') { 6232 if (tabsize > 0) { 6233 j += tabsize - (j % tabsize); 6234 if (old_j > j) { 6235 PyErr_SetString(PyExc_OverflowError, 6236 "new string is too long"); 6237 return NULL; 6238 } 6239 old_j = j; 6240 } 6241 } 6242 else { 6243 j++; 6244 if (*p == '\n' || *p == '\r') { 6245 i += j; 6246 old_j = j = 0; 6247 if (i < 0) { 6248 PyErr_SetString(PyExc_OverflowError, 6249 "new string is too long"); 6250 return NULL; 6251 } 6252 } 6253 } 
6254 6255 if ((i + j) < 0) { 6256 PyErr_SetString(PyExc_OverflowError, "new string is too long"); 6257 return NULL; 6258 } 6259 6260 /* Second pass: create output string and fill it */ 6261 u = _PyUnicode_New(i + j); 6262 if (!u) 6263 return NULL; 6264 6265 j = 0; 6266 q = u->str; 6267 6268 for (p = self->str; p < e; p++) 6269 if (*p == '\t') { 6270 if (tabsize > 0) { 6271 i = tabsize - (j % tabsize); 6272 j += i; 6273 while (i--) 6274 *q++ = ' '; 6275 } 6276 } 6277 else { 6278 j++; 6279 *q++ = *p; 6280 if (*p == '\n' || *p == '\r') 6281 j = 0; 6282 } 6283 6284 return (PyObject*) u; 6285} 6286 6287PyDoc_STRVAR(find__doc__, 6288"S.find(sub [,start [,end]]) -> int\n\ 6289\n\ 6290Return the lowest index in S where substring sub is found,\n\ 6291such that sub is contained within s[start,end]. Optional\n\ 6292arguments start and end are interpreted as in slice notation.\n\ 6293\n\ 6294Return -1 on failure."); 6295 6296static PyObject * 6297unicode_find(PyUnicodeObject *self, PyObject *args) 6298{ 6299 PyObject *substring; 6300 Py_ssize_t start = 0; 6301 Py_ssize_t end = PY_SSIZE_T_MAX; 6302 Py_ssize_t result; 6303 6304 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring, 6305 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end)) 6306 return NULL; 6307 substring = PyUnicode_FromObject(substring); 6308 if (!substring) 6309 return NULL; 6310 6311 result = stringlib_find_slice( 6312 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self), 6313 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring), 6314 start, end 6315 ); 6316 6317 Py_DECREF(substring); 6318 6319 return PyInt_FromSsize_t(result); 6320} 6321 6322static PyObject * 6323unicode_getitem(PyUnicodeObject *self, Py_ssize_t index) 6324{ 6325 if (index < 0 || index >= self->length) { 6326 PyErr_SetString(PyExc_IndexError, "string index out of range"); 6327 return NULL; 6328 } 6329 6330 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1); 6331} 6332 6333static long 6334unicode_hash(PyObject *self) 
6335{ 6336 /* Since Unicode objects compare equal to their UTF-8 string 6337 counterparts, we hash the UTF-8 string. */ 6338 PyObject *v = _PyUnicode_AsDefaultEncodedString(self, NULL); 6339 return PyObject_Hash(v); 6340} 6341 6342PyDoc_STRVAR(index__doc__, 6343"S.index(sub [,start [,end]]) -> int\n\ 6344\n\ 6345Like S.find() but raise ValueError when the substring is not found."); 6346 6347static PyObject * 6348unicode_index(PyUnicodeObject *self, PyObject *args) 6349{ 6350 Py_ssize_t result; 6351 PyObject *substring; 6352 Py_ssize_t start = 0; 6353 Py_ssize_t end = PY_SSIZE_T_MAX; 6354 6355 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring, 6356 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end)) 6357 return NULL; 6358 substring = PyUnicode_FromObject(substring); 6359 if (!substring) 6360 return NULL; 6361 6362 result = stringlib_find_slice( 6363 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self), 6364 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring), 6365 start, end 6366 ); 6367 6368 Py_DECREF(substring); 6369 6370 if (result < 0) { 6371 PyErr_SetString(PyExc_ValueError, "substring not found"); 6372 return NULL; 6373 } 6374 6375 return PyInt_FromSsize_t(result); 6376} 6377 6378PyDoc_STRVAR(islower__doc__, 6379"S.islower() -> bool\n\ 6380\n\ 6381Return True if all cased characters in S are lowercase and there is\n\ 6382at least one cased character in S, False otherwise."); 6383 6384static PyObject* 6385unicode_islower(PyUnicodeObject *self) 6386{ 6387 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 6388 register const Py_UNICODE *e; 6389 int cased; 6390 6391 /* Shortcut for single character strings */ 6392 if (PyUnicode_GET_SIZE(self) == 1) 6393 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p)); 6394 6395 /* Special case for empty strings */ 6396 if (PyUnicode_GET_SIZE(self) == 0) 6397 return PyBool_FromLong(0); 6398 6399 e = p + PyUnicode_GET_SIZE(self); 6400 cased = 0; 6401 for (; p < e; p++) { 6402 register const Py_UNICODE ch = 
*p; 6403 6404 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) 6405 return PyBool_FromLong(0); 6406 else if (!cased && Py_UNICODE_ISLOWER(ch)) 6407 cased = 1; 6408 } 6409 return PyBool_FromLong(cased); 6410} 6411 6412PyDoc_STRVAR(isupper__doc__, 6413"S.isupper() -> bool\n\ 6414\n\ 6415Return True if all cased characters in S are uppercase and there is\n\ 6416at least one cased character in S, False otherwise."); 6417 6418static PyObject* 6419unicode_isupper(PyUnicodeObject *self) 6420{ 6421 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 6422 register const Py_UNICODE *e; 6423 int cased; 6424 6425 /* Shortcut for single character strings */ 6426 if (PyUnicode_GET_SIZE(self) == 1) 6427 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0); 6428 6429 /* Special case for empty strings */ 6430 if (PyUnicode_GET_SIZE(self) == 0) 6431 return PyBool_FromLong(0); 6432 6433 e = p + PyUnicode_GET_SIZE(self); 6434 cased = 0; 6435 for (; p < e; p++) { 6436 register const Py_UNICODE ch = *p; 6437 6438 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch)) 6439 return PyBool_FromLong(0); 6440 else if (!cased && Py_UNICODE_ISUPPER(ch)) 6441 cased = 1; 6442 } 6443 return PyBool_FromLong(cased); 6444} 6445 6446PyDoc_STRVAR(istitle__doc__, 6447"S.istitle() -> bool\n\ 6448\n\ 6449Return True if S is a titlecased string and there is at least one\n\ 6450character in S, i.e. 
upper- and titlecase characters may only\n\ 6451follow uncased characters and lowercase characters only cased ones.\n\ 6452Return False otherwise."); 6453 6454static PyObject* 6455unicode_istitle(PyUnicodeObject *self) 6456{ 6457 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 6458 register const Py_UNICODE *e; 6459 int cased, previous_is_cased; 6460 6461 /* Shortcut for single character strings */ 6462 if (PyUnicode_GET_SIZE(self) == 1) 6463 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) || 6464 (Py_UNICODE_ISUPPER(*p) != 0)); 6465 6466 /* Special case for empty strings */ 6467 if (PyUnicode_GET_SIZE(self) == 0) 6468 return PyBool_FromLong(0); 6469 6470 e = p + PyUnicode_GET_SIZE(self); 6471 cased = 0; 6472 previous_is_cased = 0; 6473 for (; p < e; p++) { 6474 register const Py_UNICODE ch = *p; 6475 6476 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) { 6477 if (previous_is_cased) 6478 return PyBool_FromLong(0); 6479 previous_is_cased = 1; 6480 cased = 1; 6481 } 6482 else if (Py_UNICODE_ISLOWER(ch)) { 6483 if (!previous_is_cased) 6484 return PyBool_FromLong(0); 6485 previous_is_cased = 1; 6486 cased = 1; 6487 } 6488 else 6489 previous_is_cased = 0; 6490 } 6491 return PyBool_FromLong(cased); 6492} 6493 6494PyDoc_STRVAR(isspace__doc__, 6495"S.isspace() -> bool\n\ 6496\n\ 6497Return True if all characters in S are whitespace\n\ 6498and there is at least one character in S, False otherwise."); 6499 6500static PyObject* 6501unicode_isspace(PyUnicodeObject *self) 6502{ 6503 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 6504 register const Py_UNICODE *e; 6505 6506 /* Shortcut for single character strings */ 6507 if (PyUnicode_GET_SIZE(self) == 1 && 6508 Py_UNICODE_ISSPACE(*p)) 6509 return PyBool_FromLong(1); 6510 6511 /* Special case for empty strings */ 6512 if (PyUnicode_GET_SIZE(self) == 0) 6513 return PyBool_FromLong(0); 6514 6515 e = p + PyUnicode_GET_SIZE(self); 6516 for (; p < e; p++) { 6517 if (!Py_UNICODE_ISSPACE(*p)) 6518 
return PyBool_FromLong(0); 6519 } 6520 return PyBool_FromLong(1); 6521} 6522 6523PyDoc_STRVAR(isalpha__doc__, 6524"S.isalpha() -> bool\n\ 6525\n\ 6526Return True if all characters in S are alphabetic\n\ 6527and there is at least one character in S, False otherwise."); 6528 6529static PyObject* 6530unicode_isalpha(PyUnicodeObject *self) 6531{ 6532 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 6533 register const Py_UNICODE *e; 6534 6535 /* Shortcut for single character strings */ 6536 if (PyUnicode_GET_SIZE(self) == 1 && 6537 Py_UNICODE_ISALPHA(*p)) 6538 return PyBool_FromLong(1); 6539 6540 /* Special case for empty strings */ 6541 if (PyUnicode_GET_SIZE(self) == 0) 6542 return PyBool_FromLong(0); 6543 6544 e = p + PyUnicode_GET_SIZE(self); 6545 for (; p < e; p++) { 6546 if (!Py_UNICODE_ISALPHA(*p)) 6547 return PyBool_FromLong(0); 6548 } 6549 return PyBool_FromLong(1); 6550} 6551 6552PyDoc_STRVAR(isalnum__doc__, 6553"S.isalnum() -> bool\n\ 6554\n\ 6555Return True if all characters in S are alphanumeric\n\ 6556and there is at least one character in S, False otherwise."); 6557 6558static PyObject* 6559unicode_isalnum(PyUnicodeObject *self) 6560{ 6561 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 6562 register const Py_UNICODE *e; 6563 6564 /* Shortcut for single character strings */ 6565 if (PyUnicode_GET_SIZE(self) == 1 && 6566 Py_UNICODE_ISALNUM(*p)) 6567 return PyBool_FromLong(1); 6568 6569 /* Special case for empty strings */ 6570 if (PyUnicode_GET_SIZE(self) == 0) 6571 return PyBool_FromLong(0); 6572 6573 e = p + PyUnicode_GET_SIZE(self); 6574 for (; p < e; p++) { 6575 if (!Py_UNICODE_ISALNUM(*p)) 6576 return PyBool_FromLong(0); 6577 } 6578 return PyBool_FromLong(1); 6579} 6580 6581PyDoc_STRVAR(isdecimal__doc__, 6582"S.isdecimal() -> bool\n\ 6583\n\ 6584Return True if there are only decimal characters in S,\n\ 6585False otherwise."); 6586 6587static PyObject* 6588unicode_isdecimal(PyUnicodeObject *self) 6589{ 6590 register const 
Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 6591 register const Py_UNICODE *e; 6592 6593 /* Shortcut for single character strings */ 6594 if (PyUnicode_GET_SIZE(self) == 1 && 6595 Py_UNICODE_ISDECIMAL(*p)) 6596 return PyBool_FromLong(1); 6597 6598 /* Special case for empty strings */ 6599 if (PyUnicode_GET_SIZE(self) == 0) 6600 return PyBool_FromLong(0); 6601 6602 e = p + PyUnicode_GET_SIZE(self); 6603 for (; p < e; p++) { 6604 if (!Py_UNICODE_ISDECIMAL(*p)) 6605 return PyBool_FromLong(0); 6606 } 6607 return PyBool_FromLong(1); 6608} 6609 6610PyDoc_STRVAR(isdigit__doc__, 6611"S.isdigit() -> bool\n\ 6612\n\ 6613Return True if all characters in S are digits\n\ 6614and there is at least one character in S, False otherwise."); 6615 6616static PyObject* 6617unicode_isdigit(PyUnicodeObject *self) 6618{ 6619 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 6620 register const Py_UNICODE *e; 6621 6622 /* Shortcut for single character strings */ 6623 if (PyUnicode_GET_SIZE(self) == 1 && 6624 Py_UNICODE_ISDIGIT(*p)) 6625 return PyBool_FromLong(1); 6626 6627 /* Special case for empty strings */ 6628 if (PyUnicode_GET_SIZE(self) == 0) 6629 return PyBool_FromLong(0); 6630 6631 e = p + PyUnicode_GET_SIZE(self); 6632 for (; p < e; p++) { 6633 if (!Py_UNICODE_ISDIGIT(*p)) 6634 return PyBool_FromLong(0); 6635 } 6636 return PyBool_FromLong(1); 6637} 6638 6639PyDoc_STRVAR(isnumeric__doc__, 6640"S.isnumeric() -> bool\n\ 6641\n\ 6642Return True if there are only numeric characters in S,\n\ 6643False otherwise."); 6644 6645static PyObject* 6646unicode_isnumeric(PyUnicodeObject *self) 6647{ 6648 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 6649 register const Py_UNICODE *e; 6650 6651 /* Shortcut for single character strings */ 6652 if (PyUnicode_GET_SIZE(self) == 1 && 6653 Py_UNICODE_ISNUMERIC(*p)) 6654 return PyBool_FromLong(1); 6655 6656 /* Special case for empty strings */ 6657 if (PyUnicode_GET_SIZE(self) == 0) 6658 return PyBool_FromLong(0); 6659 6660 e = p + 
PyUnicode_GET_SIZE(self); 6661 for (; p < e; p++) { 6662 if (!Py_UNICODE_ISNUMERIC(*p)) 6663 return PyBool_FromLong(0); 6664 } 6665 return PyBool_FromLong(1); 6666} 6667 6668PyDoc_STRVAR(join__doc__, 6669"S.join(sequence) -> unicode\n\ 6670\n\ 6671Return a string which is the concatenation of the strings in the\n\ 6672sequence. The separator between elements is S."); 6673 6674static PyObject* 6675unicode_join(PyObject *self, PyObject *data) 6676{ 6677 return PyUnicode_Join(self, data); 6678} 6679 6680static Py_ssize_t 6681unicode_length(PyUnicodeObject *self) 6682{ 6683 return self->length; 6684} 6685 6686PyDoc_STRVAR(ljust__doc__, 6687"S.ljust(width[, fillchar]) -> int\n\ 6688\n\ 6689Return S left justified in a Unicode string of length width. Padding is\n\ 6690done using the specified fill character (default is a space)."); 6691 6692static PyObject * 6693unicode_ljust(PyUnicodeObject *self, PyObject *args) 6694{ 6695 Py_ssize_t width; 6696 Py_UNICODE fillchar = ' '; 6697 6698 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar)) 6699 return NULL; 6700 6701 if (self->length >= width && PyUnicode_CheckExact(self)) { 6702 Py_INCREF(self); 6703 return (PyObject*) self; 6704 } 6705 6706 return (PyObject*) pad(self, 0, width - self->length, fillchar); 6707} 6708 6709PyDoc_STRVAR(lower__doc__, 6710"S.lower() -> unicode\n\ 6711\n\ 6712Return a copy of the string S converted to lowercase."); 6713 6714static PyObject* 6715unicode_lower(PyUnicodeObject *self) 6716{ 6717 return fixup(self, fixlower); 6718} 6719 6720#define LEFTSTRIP 0 6721#define RIGHTSTRIP 1 6722#define BOTHSTRIP 2 6723 6724/* Arrays indexed by above */ 6725static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"}; 6726 6727#define STRIPNAME(i) (stripformat[i]+3) 6728 6729/* externally visible for str.strip(unicode) */ 6730PyObject * 6731_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj) 6732{ 6733 Py_UNICODE *s = PyUnicode_AS_UNICODE(self); 6734 
Py_ssize_t len = PyUnicode_GET_SIZE(self); 6735 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj); 6736 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj); 6737 Py_ssize_t i, j; 6738 6739 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen); 6740 6741 i = 0; 6742 if (striptype != RIGHTSTRIP) { 6743 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) { 6744 i++; 6745 } 6746 } 6747 6748 j = len; 6749 if (striptype != LEFTSTRIP) { 6750 do { 6751 j--; 6752 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen)); 6753 j++; 6754 } 6755 6756 if (i == 0 && j == len && PyUnicode_CheckExact(self)) { 6757 Py_INCREF(self); 6758 return (PyObject*)self; 6759 } 6760 else 6761 return PyUnicode_FromUnicode(s+i, j-i); 6762} 6763 6764 6765static PyObject * 6766do_strip(PyUnicodeObject *self, int striptype) 6767{ 6768 Py_UNICODE *s = PyUnicode_AS_UNICODE(self); 6769 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j; 6770 6771 i = 0; 6772 if (striptype != RIGHTSTRIP) { 6773 while (i < len && Py_UNICODE_ISSPACE(s[i])) { 6774 i++; 6775 } 6776 } 6777 6778 j = len; 6779 if (striptype != LEFTSTRIP) { 6780 do { 6781 j--; 6782 } while (j >= i && Py_UNICODE_ISSPACE(s[j])); 6783 j++; 6784 } 6785 6786 if (i == 0 && j == len && PyUnicode_CheckExact(self)) { 6787 Py_INCREF(self); 6788 return (PyObject*)self; 6789 } 6790 else 6791 return PyUnicode_FromUnicode(s+i, j-i); 6792} 6793 6794 6795static PyObject * 6796do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args) 6797{ 6798 PyObject *sep = NULL; 6799 6800 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep)) 6801 return NULL; 6802 6803 if (sep != NULL && sep != Py_None) { 6804 if (PyUnicode_Check(sep)) 6805 return _PyUnicode_XStrip(self, striptype, sep); 6806 else if (PyString_Check(sep)) { 6807 PyObject *res; 6808 sep = PyUnicode_FromObject(sep); 6809 if (sep==NULL) 6810 return NULL; 6811 res = _PyUnicode_XStrip(self, striptype, sep); 6812 Py_DECREF(sep); 6813 return res; 6814 } 6815 else { 6816 
PyErr_Format(PyExc_TypeError, 6817 "%s arg must be None, unicode or str", 6818 STRIPNAME(striptype)); 6819 return NULL; 6820 } 6821 } 6822 6823 return do_strip(self, striptype); 6824} 6825 6826 6827PyDoc_STRVAR(strip__doc__, 6828"S.strip([chars]) -> unicode\n\ 6829\n\ 6830Return a copy of the string S with leading and trailing\n\ 6831whitespace removed.\n\ 6832If chars is given and not None, remove characters in chars instead.\n\ 6833If chars is a str, it will be converted to unicode before stripping"); 6834 6835static PyObject * 6836unicode_strip(PyUnicodeObject *self, PyObject *args) 6837{ 6838 if (PyTuple_GET_SIZE(args) == 0) 6839 return do_strip(self, BOTHSTRIP); /* Common case */ 6840 else 6841 return do_argstrip(self, BOTHSTRIP, args); 6842} 6843 6844 6845PyDoc_STRVAR(lstrip__doc__, 6846"S.lstrip([chars]) -> unicode\n\ 6847\n\ 6848Return a copy of the string S with leading whitespace removed.\n\ 6849If chars is given and not None, remove characters in chars instead.\n\ 6850If chars is a str, it will be converted to unicode before stripping"); 6851 6852static PyObject * 6853unicode_lstrip(PyUnicodeObject *self, PyObject *args) 6854{ 6855 if (PyTuple_GET_SIZE(args) == 0) 6856 return do_strip(self, LEFTSTRIP); /* Common case */ 6857 else 6858 return do_argstrip(self, LEFTSTRIP, args); 6859} 6860 6861 6862PyDoc_STRVAR(rstrip__doc__, 6863"S.rstrip([chars]) -> unicode\n\ 6864\n\ 6865Return a copy of the string S with trailing whitespace removed.\n\ 6866If chars is given and not None, remove characters in chars instead.\n\ 6867If chars is a str, it will be converted to unicode before stripping"); 6868 6869static PyObject * 6870unicode_rstrip(PyUnicodeObject *self, PyObject *args) 6871{ 6872 if (PyTuple_GET_SIZE(args) == 0) 6873 return do_strip(self, RIGHTSTRIP); /* Common case */ 6874 else 6875 return do_argstrip(self, RIGHTSTRIP, args); 6876} 6877 6878 6879static PyObject* 6880unicode_repeat(PyUnicodeObject *str, Py_ssize_t len) 6881{ 6882 PyUnicodeObject *u; 6883 
Py_UNICODE *p; 6884 Py_ssize_t nchars; 6885 size_t nbytes; 6886 6887 if (len < 0) 6888 len = 0; 6889 6890 if (len == 1 && PyUnicode_CheckExact(str)) { 6891 /* no repeat, return original string */ 6892 Py_INCREF(str); 6893 return (PyObject*) str; 6894 } 6895 6896 /* ensure # of chars needed doesn't overflow int and # of bytes 6897 * needed doesn't overflow size_t 6898 */ 6899 nchars = len * str->length; 6900 if (len && nchars / len != str->length) { 6901 PyErr_SetString(PyExc_OverflowError, 6902 "repeated string is too long"); 6903 return NULL; 6904 } 6905 nbytes = (nchars + 1) * sizeof(Py_UNICODE); 6906 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) { 6907 PyErr_SetString(PyExc_OverflowError, 6908 "repeated string is too long"); 6909 return NULL; 6910 } 6911 u = _PyUnicode_New(nchars); 6912 if (!u) 6913 return NULL; 6914 6915 p = u->str; 6916 6917 if (str->length == 1 && len > 0) { 6918 Py_UNICODE_FILL(p, str->str[0], len); 6919 } else { 6920 Py_ssize_t done = 0; /* number of characters copied this far */ 6921 if (done < nchars) { 6922 Py_UNICODE_COPY(p, str->str, str->length); 6923 done = str->length; 6924 } 6925 while (done < nchars) { 6926 int n = (done <= nchars-done) ? 
done : nchars-done; 6927 Py_UNICODE_COPY(p+done, p, n); 6928 done += n; 6929 } 6930 } 6931 6932 return (PyObject*) u; 6933} 6934 6935PyObject *PyUnicode_Replace(PyObject *obj, 6936 PyObject *subobj, 6937 PyObject *replobj, 6938 Py_ssize_t maxcount) 6939{ 6940 PyObject *self; 6941 PyObject *str1; 6942 PyObject *str2; 6943 PyObject *result; 6944 6945 self = PyUnicode_FromObject(obj); 6946 if (self == NULL) 6947 return NULL; 6948 str1 = PyUnicode_FromObject(subobj); 6949 if (str1 == NULL) { 6950 Py_DECREF(self); 6951 return NULL; 6952 } 6953 str2 = PyUnicode_FromObject(replobj); 6954 if (str2 == NULL) { 6955 Py_DECREF(self); 6956 Py_DECREF(str1); 6957 return NULL; 6958 } 6959 result = replace((PyUnicodeObject *)self, 6960 (PyUnicodeObject *)str1, 6961 (PyUnicodeObject *)str2, 6962 maxcount); 6963 Py_DECREF(self); 6964 Py_DECREF(str1); 6965 Py_DECREF(str2); 6966 return result; 6967} 6968 6969PyDoc_STRVAR(replace__doc__, 6970"S.replace (old, new[, maxsplit]) -> unicode\n\ 6971\n\ 6972Return a copy of S with all occurrences of substring\n\ 6973old replaced by new. 
If the optional argument maxsplit is\n\ 6974given, only the first maxsplit occurrences are replaced."); 6975 6976static PyObject* 6977unicode_replace(PyUnicodeObject *self, PyObject *args) 6978{ 6979 PyUnicodeObject *str1; 6980 PyUnicodeObject *str2; 6981 Py_ssize_t maxcount = -1; 6982 PyObject *result; 6983 6984 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount)) 6985 return NULL; 6986 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1); 6987 if (str1 == NULL) 6988 return NULL; 6989 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2); 6990 if (str2 == NULL) { 6991 Py_DECREF(str1); 6992 return NULL; 6993 } 6994 6995 result = replace(self, str1, str2, maxcount); 6996 6997 Py_DECREF(str1); 6998 Py_DECREF(str2); 6999 return result; 7000} 7001 7002static 7003PyObject *unicode_repr(PyObject *unicode) 7004{ 7005 PyObject *repr; 7006 Py_UNICODE *p; 7007 Py_UNICODE *s = PyUnicode_AS_UNICODE(unicode); 7008 Py_ssize_t size = PyUnicode_GET_SIZE(unicode); 7009 7010 /* XXX(nnorwitz): rather than over-allocating, it would be 7011 better to choose a different scheme. Perhaps scan the 7012 first N-chars of the string and allocate based on that size. 7013 */ 7014 /* Initial allocation is based on the longest-possible unichr 7015 escape. 7016 7017 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source 7018 unichr, so in this case it's the longest unichr escape. In 7019 narrow (UTF-16) builds this is five chars per source unichr 7020 since there are two unichrs in the surrogate pair, so in narrow 7021 (UTF-16) builds it's not the longest unichr escape. 7022 7023 In wide or narrow builds '\uxxxx' is 6 chars per source unichr, 7024 so in the narrow (UTF-16) build case it's the longest unichr 7025 escape. 
7026 */ 7027 7028 repr = PyUnicode_FromUnicode(NULL, 7029 2 /* quotes */ 7030#ifdef Py_UNICODE_WIDE 7031 + 10*size 7032#else 7033 + 6*size 7034#endif 7035 + 1); 7036 if (repr == NULL) 7037 return NULL; 7038 7039 p = PyUnicode_AS_UNICODE(repr); 7040 7041 /* Add quote */ 7042 *p++ = (findchar(s, size, '\'') && 7043 !findchar(s, size, '"')) ? '"' : '\''; 7044 while (size-- > 0) { 7045 Py_UNICODE ch = *s++; 7046 7047 /* Escape quotes and backslashes */ 7048 if ((ch == PyUnicode_AS_UNICODE(repr)[0]) || (ch == '\\')) { 7049 *p++ = '\\'; 7050 *p++ = ch; 7051 continue; 7052 } 7053 7054#ifdef Py_UNICODE_WIDE 7055 /* Map 21-bit characters to '\U00xxxxxx' */ 7056 else if (ch >= 0x10000) { 7057 *p++ = '\\'; 7058 *p++ = 'U'; 7059 *p++ = hexdigits[(ch >> 28) & 0x0000000F]; 7060 *p++ = hexdigits[(ch >> 24) & 0x0000000F]; 7061 *p++ = hexdigits[(ch >> 20) & 0x0000000F]; 7062 *p++ = hexdigits[(ch >> 16) & 0x0000000F]; 7063 *p++ = hexdigits[(ch >> 12) & 0x0000000F]; 7064 *p++ = hexdigits[(ch >> 8) & 0x0000000F]; 7065 *p++ = hexdigits[(ch >> 4) & 0x0000000F]; 7066 *p++ = hexdigits[ch & 0x0000000F]; 7067 continue; 7068 } 7069#else 7070 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */ 7071 else if (ch >= 0xD800 && ch < 0xDC00) { 7072 Py_UNICODE ch2; 7073 Py_UCS4 ucs; 7074 7075 ch2 = *s++; 7076 size--; 7077 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) { 7078 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000; 7079 *p++ = '\\'; 7080 *p++ = 'U'; 7081 *p++ = hexdigits[(ucs >> 28) & 0x0000000F]; 7082 *p++ = hexdigits[(ucs >> 24) & 0x0000000F]; 7083 *p++ = hexdigits[(ucs >> 20) & 0x0000000F]; 7084 *p++ = hexdigits[(ucs >> 16) & 0x0000000F]; 7085 *p++ = hexdigits[(ucs >> 12) & 0x0000000F]; 7086 *p++ = hexdigits[(ucs >> 8) & 0x0000000F]; 7087 *p++ = hexdigits[(ucs >> 4) & 0x0000000F]; 7088 *p++ = hexdigits[ucs & 0x0000000F]; 7089 continue; 7090 } 7091 /* Fall through: isolated surrogates are copied as-is */ 7092 s--; 7093 size++; 7094 } 7095#endif 7096 7097 /* Map 16-bit characters to '\uxxxx' 
*/ 7098 if (ch >= 256) { 7099 *p++ = '\\'; 7100 *p++ = 'u'; 7101 *p++ = hexdigits[(ch >> 12) & 0x000F]; 7102 *p++ = hexdigits[(ch >> 8) & 0x000F]; 7103 *p++ = hexdigits[(ch >> 4) & 0x000F]; 7104 *p++ = hexdigits[ch & 0x000F]; 7105 } 7106 7107 /* Map special whitespace to '\t', \n', '\r' */ 7108 else if (ch == '\t') { 7109 *p++ = '\\'; 7110 *p++ = 't'; 7111 } 7112 else if (ch == '\n') { 7113 *p++ = '\\'; 7114 *p++ = 'n'; 7115 } 7116 else if (ch == '\r') { 7117 *p++ = '\\'; 7118 *p++ = 'r'; 7119 } 7120 7121 /* Map non-printable US ASCII to '\xhh' */ 7122 else if (ch < ' ' || ch >= 0x7F) { 7123 *p++ = '\\'; 7124 *p++ = 'x'; 7125 *p++ = hexdigits[(ch >> 4) & 0x000F]; 7126 *p++ = hexdigits[ch & 0x000F]; 7127 } 7128 7129 /* Copy everything else as-is */ 7130 else 7131 *p++ = (char) ch; 7132 } 7133 /* Add quote */ 7134 *p++ = PyUnicode_AS_UNICODE(repr)[0]; 7135 7136 *p = '\0'; 7137 _PyUnicode_Resize(&repr, p - PyUnicode_AS_UNICODE(repr)); 7138 return repr; 7139} 7140 7141PyDoc_STRVAR(rfind__doc__, 7142"S.rfind(sub [,start [,end]]) -> int\n\ 7143\n\ 7144Return the highest index in S where substring sub is found,\n\ 7145such that sub is contained within s[start,end]. 
Optional\n\ 7146arguments start and end are interpreted as in slice notation.\n\ 7147\n\ 7148Return -1 on failure."); 7149 7150static PyObject * 7151unicode_rfind(PyUnicodeObject *self, PyObject *args) 7152{ 7153 PyObject *substring; 7154 Py_ssize_t start = 0; 7155 Py_ssize_t end = PY_SSIZE_T_MAX; 7156 Py_ssize_t result; 7157 7158 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring, 7159 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end)) 7160 return NULL; 7161 substring = PyUnicode_FromObject(substring); 7162 if (!substring) 7163 return NULL; 7164 7165 result = stringlib_rfind_slice( 7166 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self), 7167 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring), 7168 start, end 7169 ); 7170 7171 Py_DECREF(substring); 7172 7173 return PyInt_FromSsize_t(result); 7174} 7175 7176PyDoc_STRVAR(rindex__doc__, 7177"S.rindex(sub [,start [,end]]) -> int\n\ 7178\n\ 7179Like S.rfind() but raise ValueError when the substring is not found."); 7180 7181static PyObject * 7182unicode_rindex(PyUnicodeObject *self, PyObject *args) 7183{ 7184 PyObject *substring; 7185 Py_ssize_t start = 0; 7186 Py_ssize_t end = PY_SSIZE_T_MAX; 7187 Py_ssize_t result; 7188 7189 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring, 7190 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end)) 7191 return NULL; 7192 substring = PyUnicode_FromObject(substring); 7193 if (!substring) 7194 return NULL; 7195 7196 result = stringlib_rfind_slice( 7197 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self), 7198 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring), 7199 start, end 7200 ); 7201 7202 Py_DECREF(substring); 7203 7204 if (result < 0) { 7205 PyErr_SetString(PyExc_ValueError, "substring not found"); 7206 return NULL; 7207 } 7208 return PyInt_FromSsize_t(result); 7209} 7210 7211PyDoc_STRVAR(rjust__doc__, 7212"S.rjust(width[, fillchar]) -> unicode\n\ 7213\n\ 7214Return S right justified in a Unicode string of length width. 
Padding is\n\ 7215done using the specified fill character (default is a space)."); 7216 7217static PyObject * 7218unicode_rjust(PyUnicodeObject *self, PyObject *args) 7219{ 7220 Py_ssize_t width; 7221 Py_UNICODE fillchar = ' '; 7222 7223 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar)) 7224 return NULL; 7225 7226 if (self->length >= width && PyUnicode_CheckExact(self)) { 7227 Py_INCREF(self); 7228 return (PyObject*) self; 7229 } 7230 7231 return (PyObject*) pad(self, width - self->length, 0, fillchar); 7232} 7233 7234static PyObject* 7235unicode_slice(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t end) 7236{ 7237 /* standard clamping */ 7238 if (start < 0) 7239 start = 0; 7240 if (end < 0) 7241 end = 0; 7242 if (end > self->length) 7243 end = self->length; 7244 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) { 7245 /* full slice, return original string */ 7246 Py_INCREF(self); 7247 return (PyObject*) self; 7248 } 7249 if (start > end) 7250 start = end; 7251 /* copy slice */ 7252 return (PyObject*) PyUnicode_FromUnicode(self->str + start, 7253 end - start); 7254} 7255 7256PyObject *PyUnicode_Split(PyObject *s, 7257 PyObject *sep, 7258 Py_ssize_t maxsplit) 7259{ 7260 PyObject *result; 7261 7262 s = PyUnicode_FromObject(s); 7263 if (s == NULL) 7264 return NULL; 7265 if (sep != NULL) { 7266 sep = PyUnicode_FromObject(sep); 7267 if (sep == NULL) { 7268 Py_DECREF(s); 7269 return NULL; 7270 } 7271 } 7272 7273 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit); 7274 7275 Py_DECREF(s); 7276 Py_XDECREF(sep); 7277 return result; 7278} 7279 7280PyDoc_STRVAR(split__doc__, 7281"S.split([sep [,maxsplit]]) -> list of strings\n\ 7282\n\ 7283Return a list of the words in S, using sep as the\n\ 7284delimiter string. If maxsplit is given, at most maxsplit\n\ 7285splits are done. 
If sep is not specified or is None,\n\ 7286any whitespace string is a separator."); 7287 7288static PyObject* 7289unicode_split(PyUnicodeObject *self, PyObject *args) 7290{ 7291 PyObject *substring = Py_None; 7292 Py_ssize_t maxcount = -1; 7293 7294 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount)) 7295 return NULL; 7296 7297 if (substring == Py_None) 7298 return split(self, NULL, maxcount); 7299 else if (PyUnicode_Check(substring)) 7300 return split(self, (PyUnicodeObject *)substring, maxcount); 7301 else 7302 return PyUnicode_Split((PyObject *)self, substring, maxcount); 7303} 7304 7305PyObject * 7306PyUnicode_Partition(PyObject *str_in, PyObject *sep_in) 7307{ 7308 PyObject* str_obj; 7309 PyObject* sep_obj; 7310 PyObject* out; 7311 7312 str_obj = PyUnicode_FromObject(str_in); 7313 if (!str_obj) 7314 return NULL; 7315 sep_obj = PyUnicode_FromObject(sep_in); 7316 if (!sep_obj) { 7317 Py_DECREF(str_obj); 7318 return NULL; 7319 } 7320 7321 out = stringlib_partition( 7322 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj), 7323 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj) 7324 ); 7325 7326 Py_DECREF(sep_obj); 7327 Py_DECREF(str_obj); 7328 7329 return out; 7330} 7331 7332 7333PyObject * 7334PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in) 7335{ 7336 PyObject* str_obj; 7337 PyObject* sep_obj; 7338 PyObject* out; 7339 7340 str_obj = PyUnicode_FromObject(str_in); 7341 if (!str_obj) 7342 return NULL; 7343 sep_obj = PyUnicode_FromObject(sep_in); 7344 if (!sep_obj) { 7345 Py_DECREF(str_obj); 7346 return NULL; 7347 } 7348 7349 out = stringlib_rpartition( 7350 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj), 7351 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj) 7352 ); 7353 7354 Py_DECREF(sep_obj); 7355 Py_DECREF(str_obj); 7356 7357 return out; 7358} 7359 7360PyDoc_STRVAR(partition__doc__, 7361"S.partition(sep) -> (head, sep, tail)\n\ 7362\n\ 7363Searches for the separator 
sep in S, and returns the part before it,\n\ 7364the separator itself, and the part after it. If the separator is not\n\ 7365found, returns S and two empty strings."); 7366 7367static PyObject* 7368unicode_partition(PyUnicodeObject *self, PyObject *separator) 7369{ 7370 return PyUnicode_Partition((PyObject *)self, separator); 7371} 7372 7373PyDoc_STRVAR(rpartition__doc__, 7374"S.rpartition(sep) -> (tail, sep, head)\n\ 7375\n\ 7376Searches for the separator sep in S, starting at the end of S, and returns\n\ 7377the part before it, the separator itself, and the part after it. If the\n\ 7378separator is not found, returns two empty strings and S."); 7379 7380static PyObject* 7381unicode_rpartition(PyUnicodeObject *self, PyObject *separator) 7382{ 7383 return PyUnicode_RPartition((PyObject *)self, separator); 7384} 7385 7386PyObject *PyUnicode_RSplit(PyObject *s, 7387 PyObject *sep, 7388 Py_ssize_t maxsplit) 7389{ 7390 PyObject *result; 7391 7392 s = PyUnicode_FromObject(s); 7393 if (s == NULL) 7394 return NULL; 7395 if (sep != NULL) { 7396 sep = PyUnicode_FromObject(sep); 7397 if (sep == NULL) { 7398 Py_DECREF(s); 7399 return NULL; 7400 } 7401 } 7402 7403 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit); 7404 7405 Py_DECREF(s); 7406 Py_XDECREF(sep); 7407 return result; 7408} 7409 7410PyDoc_STRVAR(rsplit__doc__, 7411"S.rsplit([sep [,maxsplit]]) -> list of strings\n\ 7412\n\ 7413Return a list of the words in S, using sep as the\n\ 7414delimiter string, starting at the end of the string and\n\ 7415working to the front. If maxsplit is given, at most maxsplit\n\ 7416splits are done. 
If sep is not specified, any whitespace string\n\ 7417is a separator."); 7418 7419static PyObject* 7420unicode_rsplit(PyUnicodeObject *self, PyObject *args) 7421{ 7422 PyObject *substring = Py_None; 7423 Py_ssize_t maxcount = -1; 7424 7425 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount)) 7426 return NULL; 7427 7428 if (substring == Py_None) 7429 return rsplit(self, NULL, maxcount); 7430 else if (PyUnicode_Check(substring)) 7431 return rsplit(self, (PyUnicodeObject *)substring, maxcount); 7432 else 7433 return PyUnicode_RSplit((PyObject *)self, substring, maxcount); 7434} 7435 7436PyDoc_STRVAR(splitlines__doc__, 7437"S.splitlines([keepends]]) -> list of strings\n\ 7438\n\ 7439Return a list of the lines in S, breaking at line boundaries.\n\ 7440Line breaks are not included in the resulting list unless keepends\n\ 7441is given and true."); 7442 7443static PyObject* 7444unicode_splitlines(PyUnicodeObject *self, PyObject *args) 7445{ 7446 int keepends = 0; 7447 7448 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends)) 7449 return NULL; 7450 7451 return PyUnicode_Splitlines((PyObject *)self, keepends); 7452} 7453 7454static 7455PyObject *unicode_str(PyObject *self) 7456{ 7457 if (PyUnicode_CheckExact(self)) { 7458 Py_INCREF(self); 7459 return self; 7460 } else 7461 /* Subtype -- return genuine unicode string with the same value. 
*/ 7462 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(self), 7463 PyUnicode_GET_SIZE(self)); 7464} 7465 7466PyDoc_STRVAR(swapcase__doc__, 7467"S.swapcase() -> unicode\n\ 7468\n\ 7469Return a copy of S with uppercase characters converted to lowercase\n\ 7470and vice versa."); 7471 7472static PyObject* 7473unicode_swapcase(PyUnicodeObject *self) 7474{ 7475 return fixup(self, fixswapcase); 7476} 7477 7478PyDoc_STRVAR(translate__doc__, 7479"S.translate(table) -> unicode\n\ 7480\n\ 7481Return a copy of the string S, where all characters have been mapped\n\ 7482through the given translation table, which must be a mapping of\n\ 7483Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\ 7484Unmapped characters are left untouched. Characters mapped to None\n\ 7485are deleted."); 7486 7487static PyObject* 7488unicode_translate(PyUnicodeObject *self, PyObject *table) 7489{ 7490 return PyUnicode_TranslateCharmap(self->str, 7491 self->length, 7492 table, 7493 "ignore"); 7494} 7495 7496PyDoc_STRVAR(upper__doc__, 7497"S.upper() -> unicode\n\ 7498\n\ 7499Return a copy of S converted to uppercase."); 7500 7501static PyObject* 7502unicode_upper(PyUnicodeObject *self) 7503{ 7504 return fixup(self, fixupper); 7505} 7506 7507PyDoc_STRVAR(zfill__doc__, 7508"S.zfill(width) -> unicode\n\ 7509\n\ 7510Pad a numeric string x with zeros on the left, to fill a field\n\ 7511of the specified width. 
The string x is never truncated."); 7512 7513static PyObject * 7514unicode_zfill(PyUnicodeObject *self, PyObject *args) 7515{ 7516 Py_ssize_t fill; 7517 PyUnicodeObject *u; 7518 7519 Py_ssize_t width; 7520 if (!PyArg_ParseTuple(args, "n:zfill", &width)) 7521 return NULL; 7522 7523 if (self->length >= width) { 7524 if (PyUnicode_CheckExact(self)) { 7525 Py_INCREF(self); 7526 return (PyObject*) self; 7527 } 7528 else 7529 return PyUnicode_FromUnicode( 7530 PyUnicode_AS_UNICODE(self), 7531 PyUnicode_GET_SIZE(self) 7532 ); 7533 } 7534 7535 fill = width - self->length; 7536 7537 u = pad(self, fill, 0, '0'); 7538 7539 if (u == NULL) 7540 return NULL; 7541 7542 if (u->str[fill] == '+' || u->str[fill] == '-') { 7543 /* move sign to beginning of string */ 7544 u->str[0] = u->str[fill]; 7545 u->str[fill] = '0'; 7546 } 7547 7548 return (PyObject*) u; 7549} 7550 7551#if 0 7552static PyObject* 7553unicode_freelistsize(PyUnicodeObject *self) 7554{ 7555 return PyInt_FromLong(unicode_freelist_size); 7556} 7557#endif 7558 7559PyDoc_STRVAR(startswith__doc__, 7560"S.startswith(prefix[, start[, end]]) -> bool\n\ 7561\n\ 7562Return True if S starts with the specified prefix, False otherwise.\n\ 7563With optional start, test S beginning at that position.\n\ 7564With optional end, stop comparing S at that position.\n\ 7565prefix can also be a tuple of strings to try."); 7566 7567static PyObject * 7568unicode_startswith(PyUnicodeObject *self, 7569 PyObject *args) 7570{ 7571 PyObject *subobj; 7572 PyUnicodeObject *substring; 7573 Py_ssize_t start = 0; 7574 Py_ssize_t end = PY_SSIZE_T_MAX; 7575 int result; 7576 7577 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj, 7578 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end)) 7579 return NULL; 7580 if (PyTuple_Check(subobj)) { 7581 Py_ssize_t i; 7582 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) { 7583 substring = (PyUnicodeObject *)PyUnicode_FromObject( 7584 PyTuple_GET_ITEM(subobj, i)); 7585 if (substring == NULL) 7586 return NULL; 
7587 result = tailmatch(self, substring, start, end, -1); 7588 Py_DECREF(substring); 7589 if (result) { 7590 Py_RETURN_TRUE; 7591 } 7592 } 7593 /* nothing matched */ 7594 Py_RETURN_FALSE; 7595 } 7596 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj); 7597 if (substring == NULL) 7598 return NULL; 7599 result = tailmatch(self, substring, start, end, -1); 7600 Py_DECREF(substring); 7601 return PyBool_FromLong(result); 7602} 7603 7604 7605PyDoc_STRVAR(endswith__doc__, 7606"S.endswith(suffix[, start[, end]]) -> bool\n\ 7607\n\ 7608Return True if S ends with the specified suffix, False otherwise.\n\ 7609With optional start, test S beginning at that position.\n\ 7610With optional end, stop comparing S at that position.\n\ 7611suffix can also be a tuple of strings to try."); 7612 7613static PyObject * 7614unicode_endswith(PyUnicodeObject *self, 7615 PyObject *args) 7616{ 7617 PyObject *subobj; 7618 PyUnicodeObject *substring; 7619 Py_ssize_t start = 0; 7620 Py_ssize_t end = PY_SSIZE_T_MAX; 7621 int result; 7622 7623 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj, 7624 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end)) 7625 return NULL; 7626 if (PyTuple_Check(subobj)) { 7627 Py_ssize_t i; 7628 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) { 7629 substring = (PyUnicodeObject *)PyUnicode_FromObject( 7630 PyTuple_GET_ITEM(subobj, i)); 7631 if (substring == NULL) 7632 return NULL; 7633 result = tailmatch(self, substring, start, end, +1); 7634 Py_DECREF(substring); 7635 if (result) { 7636 Py_RETURN_TRUE; 7637 } 7638 } 7639 Py_RETURN_FALSE; 7640 } 7641 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj); 7642 if (substring == NULL) 7643 return NULL; 7644 7645 result = tailmatch(self, substring, start, end, +1); 7646 Py_DECREF(substring); 7647 return PyBool_FromLong(result); 7648} 7649 7650 7651 7652static PyObject * 7653unicode_getnewargs(PyUnicodeObject *v) 7654{ 7655 return Py_BuildValue("(u#)", v->str, v->length); 7656} 7657 7658 7659static 
PyMethodDef unicode_methods[] = { 7660 7661 /* Order is according to common usage: often used methods should 7662 appear first, since lookup is done sequentially. */ 7663 7664 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__}, 7665 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__}, 7666 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__}, 7667 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__}, 7668 {"join", (PyCFunction) unicode_join, METH_O, join__doc__}, 7669 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__}, 7670 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__}, 7671 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__}, 7672 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__}, 7673 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__}, 7674 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__}, 7675 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__}, 7676 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__}, 7677 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__}, 7678 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__}, 7679 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__}, 7680 {"decode", (PyCFunction) unicode_decode, METH_VARARGS, decode__doc__}, 7681/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */ 7682 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__}, 7683 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__}, 7684 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__}, 7685 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__}, 7686 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__}, 7687 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, 
splitlines__doc__}, 7688 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__}, 7689 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__}, 7690 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__}, 7691 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__}, 7692 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__}, 7693 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__}, 7694 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__}, 7695 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__}, 7696 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__}, 7697 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__}, 7698 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__}, 7699 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__}, 7700 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__}, 7701 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__}, 7702 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__}, 7703 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__}, 7704#if 0 7705 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__}, 7706#endif 7707 7708#if 0 7709 /* This one is just used for debugging the implementation. 
*/ 7710 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS}, 7711#endif 7712 7713 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS}, 7714 {NULL, NULL} 7715}; 7716 7717static PyObject * 7718unicode_mod(PyObject *v, PyObject *w) 7719{ 7720 if (!PyUnicode_Check(v)) { 7721 Py_INCREF(Py_NotImplemented); 7722 return Py_NotImplemented; 7723 } 7724 return PyUnicode_Format(v, w); 7725} 7726 7727static PyNumberMethods unicode_as_number = { 7728 0, /*nb_add*/ 7729 0, /*nb_subtract*/ 7730 0, /*nb_multiply*/ 7731 unicode_mod, /*nb_remainder*/ 7732}; 7733 7734static PySequenceMethods unicode_as_sequence = { 7735 (lenfunc) unicode_length, /* sq_length */ 7736 PyUnicode_Concat, /* sq_concat */ 7737 (ssizeargfunc) unicode_repeat, /* sq_repeat */ 7738 (ssizeargfunc) unicode_getitem, /* sq_item */ 7739 (ssizessizeargfunc) unicode_slice, /* sq_slice */ 7740 0, /* sq_ass_item */ 7741 0, /* sq_ass_slice */ 7742 PyUnicode_Contains, /* sq_contains */ 7743}; 7744 7745static PyObject* 7746unicode_subscript(PyUnicodeObject* self, PyObject* item) 7747{ 7748 if (PyIndex_Check(item)) { 7749 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError); 7750 if (i == -1 && PyErr_Occurred()) 7751 return NULL; 7752 if (i < 0) 7753 i += PyUnicode_GET_SIZE(self); 7754 return unicode_getitem(self, i); 7755 } else if (PySlice_Check(item)) { 7756 Py_ssize_t start, stop, step, slicelength, cur, i; 7757 Py_UNICODE* source_buf; 7758 Py_UNICODE* result_buf; 7759 PyObject* result; 7760 7761 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self), 7762 &start, &stop, &step, &slicelength) < 0) { 7763 return NULL; 7764 } 7765 7766 if (slicelength <= 0) { 7767 return PyUnicode_FromUnicode(NULL, 0); 7768 } else { 7769 source_buf = PyUnicode_AS_UNICODE((PyObject*)self); 7770 result_buf = (Py_UNICODE *)PyMem_MALLOC(slicelength* 7771 sizeof(Py_UNICODE)); 7772 7773 if (result_buf == NULL) 7774 return PyErr_NoMemory(); 7775 7776 for (cur = start, i = 0; i < slicelength; cur += 
step, i++) { 7777 result_buf[i] = source_buf[cur]; 7778 } 7779 7780 result = PyUnicode_FromUnicode(result_buf, slicelength); 7781 PyMem_FREE(result_buf); 7782 return result; 7783 } 7784 } else { 7785 PyErr_SetString(PyExc_TypeError, "string indices must be integers"); 7786 return NULL; 7787 } 7788} 7789 7790static PyMappingMethods unicode_as_mapping = { 7791 (lenfunc)unicode_length, /* mp_length */ 7792 (binaryfunc)unicode_subscript, /* mp_subscript */ 7793 (objobjargproc)0, /* mp_ass_subscript */ 7794}; 7795 7796static Py_ssize_t 7797unicode_buffer_getreadbuf(PyUnicodeObject *self, 7798 Py_ssize_t index, 7799 const void **ptr) 7800{ 7801 if (index != 0) { 7802 PyErr_SetString(PyExc_SystemError, 7803 "accessing non-existent unicode segment"); 7804 return -1; 7805 } 7806 *ptr = (void *) self->str; 7807 return PyUnicode_GET_DATA_SIZE(self); 7808} 7809 7810static Py_ssize_t 7811unicode_buffer_getwritebuf(PyUnicodeObject *self, Py_ssize_t index, 7812 const void **ptr) 7813{ 7814 PyErr_SetString(PyExc_TypeError, 7815 "cannot use unicode as modifiable buffer"); 7816 return -1; 7817} 7818 7819static int 7820unicode_buffer_getsegcount(PyUnicodeObject *self, 7821 Py_ssize_t *lenp) 7822{ 7823 if (lenp) 7824 *lenp = PyUnicode_GET_DATA_SIZE(self); 7825 return 1; 7826} 7827 7828static Py_ssize_t 7829unicode_buffer_getcharbuf(PyUnicodeObject *self, 7830 Py_ssize_t index, 7831 const void **ptr) 7832{ 7833 PyObject *str; 7834 7835 if (index != 0) { 7836 PyErr_SetString(PyExc_SystemError, 7837 "accessing non-existent unicode segment"); 7838 return -1; 7839 } 7840 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL); 7841 if (str == NULL) 7842 return -1; 7843 *ptr = (void *) PyString_AS_STRING(str); 7844 return PyString_GET_SIZE(str); 7845} 7846 7847/* Helpers for PyUnicode_Format() */ 7848 7849static PyObject * 7850getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx) 7851{ 7852 Py_ssize_t argidx = *p_argidx; 7853 if (argidx < arglen) { 7854 (*p_argidx)++; 
7855 if (arglen < 0) 7856 return args; 7857 else 7858 return PyTuple_GetItem(args, argidx); 7859 } 7860 PyErr_SetString(PyExc_TypeError, 7861 "not enough arguments for format string"); 7862 return NULL; 7863} 7864 7865#define F_LJUST (1<<0) 7866#define F_SIGN (1<<1) 7867#define F_BLANK (1<<2) 7868#define F_ALT (1<<3) 7869#define F_ZERO (1<<4) 7870 7871static Py_ssize_t 7872strtounicode(Py_UNICODE *buffer, const char *charbuffer) 7873{ 7874 register Py_ssize_t i; 7875 Py_ssize_t len = strlen(charbuffer); 7876 for (i = len - 1; i >= 0; i--) 7877 buffer[i] = (Py_UNICODE) charbuffer[i]; 7878 7879 return len; 7880} 7881 7882static int 7883doubletounicode(Py_UNICODE *buffer, size_t len, const char *format, double x) 7884{ 7885 Py_ssize_t result; 7886 7887 PyOS_ascii_formatd((char *)buffer, len, format, x); 7888 result = strtounicode(buffer, (char *)buffer); 7889 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int); 7890} 7891 7892static int 7893longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x) 7894{ 7895 Py_ssize_t result; 7896 7897 PyOS_snprintf((char *)buffer, len, format, x); 7898 result = strtounicode(buffer, (char *)buffer); 7899 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int); 7900} 7901 7902/* XXX To save some code duplication, formatfloat/long/int could have been 7903 shared with stringobject.c, converting from 8-bit to Unicode after the 7904 formatting is done. */ 7905 7906static int 7907formatfloat(Py_UNICODE *buf, 7908 size_t buflen, 7909 int flags, 7910 int prec, 7911 int type, 7912 PyObject *v) 7913{ 7914 /* fmt = '%#.' 
+ `prec` + `type` 7915 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/ 7916 char fmt[20]; 7917 double x; 7918 7919 x = PyFloat_AsDouble(v); 7920 if (x == -1.0 && PyErr_Occurred()) 7921 return -1; 7922 if (prec < 0) 7923 prec = 6; 7924 if (type == 'f' && (fabs(x) / 1e25) >= 1e25) 7925 type = 'g'; 7926 /* Worst case length calc to ensure no buffer overrun: 7927 7928 'g' formats: 7929 fmt = %#.<prec>g 7930 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp 7931 for any double rep.) 7932 len = 1 + prec + 1 + 2 + 5 = 9 + prec 7933 7934 'f' formats: 7935 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50) 7936 len = 1 + 50 + 1 + prec = 52 + prec 7937 7938 If prec=0 the effective precision is 1 (the leading digit is 7939 always given), therefore increase the length by one. 7940 7941 */ 7942 if ((type == 'g' && buflen <= (size_t)10 + (size_t)prec) || 7943 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) { 7944 PyErr_SetString(PyExc_OverflowError, 7945 "formatted float is too long (precision too large?)"); 7946 return -1; 7947 } 7948 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c", 7949 (flags&F_ALT) ? "#" : "", 7950 prec, type); 7951 return doubletounicode(buf, buflen, fmt, x); 7952} 7953 7954static PyObject* 7955formatlong(PyObject *val, int flags, int prec, int type) 7956{ 7957 char *buf; 7958 int len; 7959 PyObject *str; /* temporary string object. */ 7960 PyObject *result; 7961 7962 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len); 7963 if (!str) 7964 return NULL; 7965 result = PyUnicode_FromStringAndSize(buf, len); 7966 Py_DECREF(str); 7967 return result; 7968} 7969 7970static int 7971formatint(Py_UNICODE *buf, 7972 size_t buflen, 7973 int flags, 7974 int prec, 7975 int type, 7976 PyObject *v) 7977{ 7978 /* fmt = '%#.' + `prec` + 'l' + `type` 7979 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine) 7980 * + 1 + 1 7981 * = 24 7982 */ 7983 char fmt[64]; /* plenty big enough! 
*/ 7984 char *sign; 7985 long x; 7986 7987 x = PyInt_AsLong(v); 7988 if (x == -1 && PyErr_Occurred()) 7989 return -1; 7990 if (x < 0 && type == 'u') { 7991 type = 'd'; 7992 } 7993 if (x < 0 && (type == 'x' || type == 'X' || type == 'o')) 7994 sign = "-"; 7995 else 7996 sign = ""; 7997 if (prec < 0) 7998 prec = 1; 7999 8000 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal)) 8001 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11 8002 */ 8003 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) { 8004 PyErr_SetString(PyExc_OverflowError, 8005 "formatted integer is too long (precision too large?)"); 8006 return -1; 8007 } 8008 8009 if ((flags & F_ALT) && 8010 (type == 'x' || type == 'X' || type == 'o')) { 8011 /* When converting under %#o, %#x or %#X, there are a number 8012 * of issues that cause pain: 8013 * - for %#o, we want a different base marker than C 8014 * - when 0 is being converted, the C standard leaves off 8015 * the '0x' or '0X', which is inconsistent with other 8016 * %#x/%#X conversions and inconsistent with Python's 8017 * hex() function 8018 * - there are platforms that violate the standard and 8019 * convert 0 with the '0x' or '0X' 8020 * (Metrowerks, Compaq Tru64) 8021 * - there are platforms that give '0x' when converting 8022 * under %#X, but convert 0 in accordance with the 8023 * standard (OS/2 EMX) 8024 * 8025 * We can achieve the desired consistency by inserting our 8026 * own '0x' or '0X' prefix, and substituting %x/%X in place 8027 * of %#x/%#X. 8028 * 8029 * Note that this is the same approach as used in 8030 * formatint() in stringobject.c 8031 */ 8032 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c", 8033 sign, type, prec, type); 8034 } 8035 else { 8036 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c", 8037 sign, (flags&F_ALT) ? 
"#" : "", 8038 prec, type); 8039 } 8040 if (sign[0]) 8041 return longtounicode(buf, buflen, fmt, -x); 8042 else 8043 return longtounicode(buf, buflen, fmt, x); 8044} 8045 8046static int 8047formatchar(Py_UNICODE *buf, 8048 size_t buflen, 8049 PyObject *v) 8050{ 8051 /* presume that the buffer is at least 2 characters long */ 8052 if (PyUnicode_Check(v)) { 8053 if (PyUnicode_GET_SIZE(v) != 1) 8054 goto onError; 8055 buf[0] = PyUnicode_AS_UNICODE(v)[0]; 8056 } 8057 8058 else if (PyString_Check(v)) { 8059 if (PyString_GET_SIZE(v) != 1) 8060 goto onError; 8061 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0]; 8062 } 8063 8064 else { 8065 /* Integer input truncated to a character */ 8066 long x; 8067 x = PyInt_AsLong(v); 8068 if (x == -1 && PyErr_Occurred()) 8069 goto onError; 8070#ifdef Py_UNICODE_WIDE 8071 if (x < 0 || x > 0x10ffff) { 8072 PyErr_SetString(PyExc_OverflowError, 8073 "%c arg not in range(0x110000) " 8074 "(wide Python build)"); 8075 return -1; 8076 } 8077#else 8078 if (x < 0 || x > 0xffff) { 8079 PyErr_SetString(PyExc_OverflowError, 8080 "%c arg not in range(0x10000) " 8081 "(narrow Python build)"); 8082 return -1; 8083 } 8084#endif 8085 buf[0] = (Py_UNICODE) x; 8086 } 8087 buf[1] = '\0'; 8088 return 1; 8089 8090 onError: 8091 PyErr_SetString(PyExc_TypeError, 8092 "%c requires int or char"); 8093 return -1; 8094} 8095 8096/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...) 8097 8098 FORMATBUFLEN is the length of the buffer in which the floats, ints, & 8099 chars are formatted. XXX This is a magic number. Each formatting 8100 routine does bounds checking to ensure no overflow, but a better 8101 solution may be to malloc a buffer of appropriate size for each 8102 format. For now, the current solution is sufficient. 
8103*/ 8104#define FORMATBUFLEN (size_t)120 8105 8106PyObject *PyUnicode_Format(PyObject *format, 8107 PyObject *args) 8108{ 8109 Py_UNICODE *fmt, *res; 8110 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx; 8111 int args_owned = 0; 8112 PyUnicodeObject *result = NULL; 8113 PyObject *dict = NULL; 8114 PyObject *uformat; 8115 8116 if (format == NULL || args == NULL) { 8117 PyErr_BadInternalCall(); 8118 return NULL; 8119 } 8120 uformat = PyUnicode_FromObject(format); 8121 if (uformat == NULL) 8122 return NULL; 8123 fmt = PyUnicode_AS_UNICODE(uformat); 8124 fmtcnt = PyUnicode_GET_SIZE(uformat); 8125 8126 reslen = rescnt = fmtcnt + 100; 8127 result = _PyUnicode_New(reslen); 8128 if (result == NULL) 8129 goto onError; 8130 res = PyUnicode_AS_UNICODE(result); 8131 8132 if (PyTuple_Check(args)) { 8133 arglen = PyTuple_Size(args); 8134 argidx = 0; 8135 } 8136 else { 8137 arglen = -1; 8138 argidx = -2; 8139 } 8140 if (args->ob_type->tp_as_mapping && !PyTuple_Check(args) && 8141 !PyObject_TypeCheck(args, &PyBaseString_Type)) 8142 dict = args; 8143 8144 while (--fmtcnt >= 0) { 8145 if (*fmt != '%') { 8146 if (--rescnt < 0) { 8147 rescnt = fmtcnt + 100; 8148 reslen += rescnt; 8149 if (_PyUnicode_Resize(&result, reslen) < 0) 8150 goto onError; 8151 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt; 8152 --rescnt; 8153 } 8154 *res++ = *fmt++; 8155 } 8156 else { 8157 /* Got a format specifier */ 8158 int flags = 0; 8159 Py_ssize_t width = -1; 8160 int prec = -1; 8161 Py_UNICODE c = '\0'; 8162 Py_UNICODE fill; 8163 PyObject *v = NULL; 8164 PyObject *temp = NULL; 8165 Py_UNICODE *pbuf; 8166 Py_UNICODE sign; 8167 Py_ssize_t len; 8168 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */ 8169 8170 fmt++; 8171 if (*fmt == '(') { 8172 Py_UNICODE *keystart; 8173 Py_ssize_t keylen; 8174 PyObject *key; 8175 int pcount = 1; 8176 8177 if (dict == NULL) { 8178 PyErr_SetString(PyExc_TypeError, 8179 "format requires a mapping"); 8180 goto onError; 8181 } 8182 ++fmt; 8183 
--fmtcnt; 8184 keystart = fmt; 8185 /* Skip over balanced parentheses */ 8186 while (pcount > 0 && --fmtcnt >= 0) { 8187 if (*fmt == ')') 8188 --pcount; 8189 else if (*fmt == '(') 8190 ++pcount; 8191 fmt++; 8192 } 8193 keylen = fmt - keystart - 1; 8194 if (fmtcnt < 0 || pcount > 0) { 8195 PyErr_SetString(PyExc_ValueError, 8196 "incomplete format key"); 8197 goto onError; 8198 } 8199#if 0 8200 /* keys are converted to strings using UTF-8 and 8201 then looked up since Python uses strings to hold 8202 variables names etc. in its namespaces and we 8203 wouldn't want to break common idioms. */ 8204 key = PyUnicode_EncodeUTF8(keystart, 8205 keylen, 8206 NULL); 8207#else 8208 key = PyUnicode_FromUnicode(keystart, keylen); 8209#endif 8210 if (key == NULL) 8211 goto onError; 8212 if (args_owned) { 8213 Py_DECREF(args); 8214 args_owned = 0; 8215 } 8216 args = PyObject_GetItem(dict, key); 8217 Py_DECREF(key); 8218 if (args == NULL) { 8219 goto onError; 8220 } 8221 args_owned = 1; 8222 arglen = -1; 8223 argidx = -2; 8224 } 8225 while (--fmtcnt >= 0) { 8226 switch (c = *fmt++) { 8227 case '-': flags |= F_LJUST; continue; 8228 case '+': flags |= F_SIGN; continue; 8229 case ' ': flags |= F_BLANK; continue; 8230 case '#': flags |= F_ALT; continue; 8231 case '0': flags |= F_ZERO; continue; 8232 } 8233 break; 8234 } 8235 if (c == '*') { 8236 v = getnextarg(args, arglen, &argidx); 8237 if (v == NULL) 8238 goto onError; 8239 if (!PyInt_Check(v)) { 8240 PyErr_SetString(PyExc_TypeError, 8241 "* wants int"); 8242 goto onError; 8243 } 8244 width = PyInt_AsLong(v); 8245 if (width == -1 && PyErr_Occurred()) 8246 goto onError; 8247 if (width < 0) { 8248 flags |= F_LJUST; 8249 width = -width; 8250 } 8251 if (--fmtcnt >= 0) 8252 c = *fmt++; 8253 } 8254 else if (c >= '0' && c <= '9') { 8255 width = c - '0'; 8256 while (--fmtcnt >= 0) { 8257 c = *fmt++; 8258 if (c < '0' || c > '9') 8259 break; 8260 if ((width*10) / 10 != width) { 8261 PyErr_SetString(PyExc_ValueError, 8262 "width too big"); 8263 
goto onError; 8264 } 8265 width = width*10 + (c - '0'); 8266 } 8267 } 8268 if (c == '.') { 8269 prec = 0; 8270 if (--fmtcnt >= 0) 8271 c = *fmt++; 8272 if (c == '*') { 8273 v = getnextarg(args, arglen, &argidx); 8274 if (v == NULL) 8275 goto onError; 8276 if (!PyInt_Check(v)) { 8277 PyErr_SetString(PyExc_TypeError, 8278 "* wants int"); 8279 goto onError; 8280 } 8281 prec = PyInt_AsLong(v); 8282 if (prec == -1 && PyErr_Occurred()) 8283 goto onError; 8284 if (prec < 0) 8285 prec = 0; 8286 if (--fmtcnt >= 0) 8287 c = *fmt++; 8288 } 8289 else if (c >= '0' && c <= '9') { 8290 prec = c - '0'; 8291 while (--fmtcnt >= 0) { 8292 c = Py_CHARMASK(*fmt++); 8293 if (c < '0' || c > '9') 8294 break; 8295 if ((prec*10) / 10 != prec) { 8296 PyErr_SetString(PyExc_ValueError, 8297 "prec too big"); 8298 goto onError; 8299 } 8300 prec = prec*10 + (c - '0'); 8301 } 8302 } 8303 } /* prec */ 8304 if (fmtcnt >= 0) { 8305 if (c == 'h' || c == 'l' || c == 'L') { 8306 if (--fmtcnt >= 0) 8307 c = *fmt++; 8308 } 8309 } 8310 if (fmtcnt < 0) { 8311 PyErr_SetString(PyExc_ValueError, 8312 "incomplete format"); 8313 goto onError; 8314 } 8315 if (c != '%') { 8316 v = getnextarg(args, arglen, &argidx); 8317 if (v == NULL) 8318 goto onError; 8319 } 8320 sign = 0; 8321 fill = ' '; 8322 switch (c) { 8323 8324 case '%': 8325 pbuf = formatbuf; 8326 /* presume that buffer length is at least 1 */ 8327 pbuf[0] = '%'; 8328 len = 1; 8329 break; 8330 8331 case 's': 8332 case 'r': 8333 if (PyUnicode_Check(v) && c == 's') { 8334 temp = v; 8335 Py_INCREF(temp); 8336 } 8337 else { 8338 PyObject *unicode; 8339 if (c == 's') 8340 temp = PyObject_Unicode(v); 8341 else 8342 temp = PyObject_Repr(v); 8343 if (temp == NULL) 8344 goto onError; 8345 if (PyUnicode_Check(temp)) 8346 /* nothing to do */; 8347 else if (PyString_Check(temp)) { 8348 /* convert to string to Unicode */ 8349 unicode = PyUnicode_Decode(PyString_AS_STRING(temp), 8350 PyString_GET_SIZE(temp), 8351 NULL, 8352 "strict"); 8353 Py_DECREF(temp); 8354 temp = 
unicode; 8355 if (temp == NULL) 8356 goto onError; 8357 } 8358 else { 8359 Py_DECREF(temp); 8360 PyErr_SetString(PyExc_TypeError, 8361 "%s argument has non-string str()"); 8362 goto onError; 8363 } 8364 } 8365 pbuf = PyUnicode_AS_UNICODE(temp); 8366 len = PyUnicode_GET_SIZE(temp); 8367 if (prec >= 0 && len > prec) 8368 len = prec; 8369 break; 8370 8371 case 'i': 8372 case 'd': 8373 case 'u': 8374 case 'o': 8375 case 'x': 8376 case 'X': 8377 if (c == 'i') 8378 c = 'd'; 8379 if (PyLong_Check(v)) { 8380 temp = formatlong(v, flags, prec, c); 8381 if (!temp) 8382 goto onError; 8383 pbuf = PyUnicode_AS_UNICODE(temp); 8384 len = PyUnicode_GET_SIZE(temp); 8385 sign = 1; 8386 } 8387 else { 8388 pbuf = formatbuf; 8389 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), 8390 flags, prec, c, v); 8391 if (len < 0) 8392 goto onError; 8393 sign = 1; 8394 } 8395 if (flags & F_ZERO) 8396 fill = '0'; 8397 break; 8398 8399 case 'e': 8400 case 'E': 8401 case 'f': 8402 case 'F': 8403 case 'g': 8404 case 'G': 8405 if (c == 'F') 8406 c = 'f'; 8407 pbuf = formatbuf; 8408 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), 8409 flags, prec, c, v); 8410 if (len < 0) 8411 goto onError; 8412 sign = 1; 8413 if (flags & F_ZERO) 8414 fill = '0'; 8415 break; 8416 8417 case 'c': 8418 pbuf = formatbuf; 8419 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v); 8420 if (len < 0) 8421 goto onError; 8422 break; 8423 8424 default: 8425 PyErr_Format(PyExc_ValueError, 8426 "unsupported format character '%c' (0x%x) " 8427 "at index %zd", 8428 (31<=c && c<=126) ? 
(char)c : '?', 8429 (int)c, 8430 (Py_ssize_t)(fmt - 1 - 8431 PyUnicode_AS_UNICODE(uformat))); 8432 goto onError; 8433 } 8434 if (sign) { 8435 if (*pbuf == '-' || *pbuf == '+') { 8436 sign = *pbuf++; 8437 len--; 8438 } 8439 else if (flags & F_SIGN) 8440 sign = '+'; 8441 else if (flags & F_BLANK) 8442 sign = ' '; 8443 else 8444 sign = 0; 8445 } 8446 if (width < len) 8447 width = len; 8448 if (rescnt - (sign != 0) < width) { 8449 reslen -= rescnt; 8450 rescnt = width + fmtcnt + 100; 8451 reslen += rescnt; 8452 if (reslen < 0) { 8453 Py_XDECREF(temp); 8454 PyErr_NoMemory(); 8455 goto onError; 8456 } 8457 if (_PyUnicode_Resize(&result, reslen) < 0) { 8458 Py_XDECREF(temp); 8459 goto onError; 8460 } 8461 res = PyUnicode_AS_UNICODE(result) 8462 + reslen - rescnt; 8463 } 8464 if (sign) { 8465 if (fill != ' ') 8466 *res++ = sign; 8467 rescnt--; 8468 if (width > len) 8469 width--; 8470 } 8471 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) { 8472 assert(pbuf[0] == '0'); 8473 assert(pbuf[1] == c); 8474 if (fill != ' ') { 8475 *res++ = *pbuf++; 8476 *res++ = *pbuf++; 8477 } 8478 rescnt -= 2; 8479 width -= 2; 8480 if (width < 0) 8481 width = 0; 8482 len -= 2; 8483 } 8484 if (width > len && !(flags & F_LJUST)) { 8485 do { 8486 --rescnt; 8487 *res++ = fill; 8488 } while (--width > len); 8489 } 8490 if (fill == ' ') { 8491 if (sign) 8492 *res++ = sign; 8493 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) { 8494 assert(pbuf[0] == '0'); 8495 assert(pbuf[1] == c); 8496 *res++ = *pbuf++; 8497 *res++ = *pbuf++; 8498 } 8499 } 8500 Py_UNICODE_COPY(res, pbuf, len); 8501 res += len; 8502 rescnt -= len; 8503 while (--width >= len) { 8504 --rescnt; 8505 *res++ = ' '; 8506 } 8507 if (dict && (argidx < arglen) && c != '%') { 8508 PyErr_SetString(PyExc_TypeError, 8509 "not all arguments converted during string formatting"); 8510 Py_XDECREF(temp); 8511 goto onError; 8512 } 8513 Py_XDECREF(temp); 8514 } /* '%' */ 8515 } /* until end */ 8516 if (argidx < arglen && !dict) { 
PyErr_SetString(PyExc_TypeError,
                "not all arguments converted during string formatting");
        goto onError;
    }

    /* Success path: resize the result to reslen - rescnt (presumably the
       allocated length minus the space left unused), then drop the
       references this function still holds. */
    if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
        goto onError;
    if (args_owned) {
        Py_DECREF(args);
    }
    Py_DECREF(uformat);
    return (PyObject *)result;

    /* Failure path: release the partial result and the same references
       as above, and return NULL. */
 onError:
    Py_XDECREF(result);
    Py_DECREF(uformat);
    if (args_owned) {
        Py_DECREF(args);
    }
    return NULL;
}

/* Buffer-interface slots for the type; installed as tp_as_buffer. */
static PyBufferProcs unicode_as_buffer = {
    (readbufferproc) unicode_buffer_getreadbuf,
    (writebufferproc) unicode_buffer_getwritebuf,
    (segcountproc) unicode_buffer_getsegcount,
    (charbufferproc) unicode_buffer_getcharbuf,
};

/* Forward declaration: unicode_new() delegates to it for subtypes. */
static PyObject *
unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);

/* tp_new implementation.

   Accepts up to three optional arguments (positional or by keyword):
   "string", "encoding" and "errors".

   - If `type` is not exactly PyUnicode_Type, the call is forwarded to
     unicode_subtype_new().
   - With no "string" argument, a new zero-length object is returned.
   - With neither encoding nor errors, the result is PyObject_Unicode(x).
   - Otherwise the result is PyUnicode_FromEncodedObject(x, encoding, errors).

   Returns a new reference, or NULL if argument parsing or one of the
   called constructors fails. */
static PyObject *
unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
{
    PyObject *x = NULL;
    static char *kwlist[] = {"string", "encoding", "errors", 0};
    char *encoding = NULL;
    char *errors = NULL;

    if (type != &PyUnicode_Type)
        return unicode_subtype_new(type, args, kwds);
    if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
                                     kwlist, &x, &encoding, &errors))
        return NULL;
    if (x == NULL)
        return (PyObject *)_PyUnicode_New(0);
    if (encoding == NULL && errors == NULL)
        return PyObject_Unicode(x);
    else
        return PyUnicode_FromEncodedObject(x, encoding, errors);
}

/* Create an instance of a subtype of PyUnicode_Type: first build a plain
   object from the arguments via unicode_new(), then allocate an instance
   of `type` sized by that object's length. */
static PyObject *
unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
{
    PyUnicodeObject *tmp, *pnew;
    Py_ssize_t n;

    assert(PyType_IsSubtype(type, &PyUnicode_Type));
    tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
    if (tmp == NULL)
        return NULL;
    assert(PyUnicode_Check(tmp));
    /* n is captured here and reused below as the length of the new object. */
    pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
    if (pnew == NULL) {
        Py_DECREF(tmp);
        return
NULL; 8585 } 8586 pnew->str = PyMem_NEW(Py_UNICODE, n+1); 8587 if (pnew->str == NULL) { 8588 _Py_ForgetReference((PyObject *)pnew); 8589 PyObject_Del(pnew); 8590 Py_DECREF(tmp); 8591 return PyErr_NoMemory(); 8592 } 8593 Py_UNICODE_COPY(pnew->str, tmp->str, n+1); 8594 pnew->length = n; 8595 pnew->hash = tmp->hash; 8596 Py_DECREF(tmp); 8597 return (PyObject *)pnew; 8598} 8599 8600PyDoc_STRVAR(unicode_doc, 8601"unicode(string [, encoding[, errors]]) -> object\n\ 8602\n\ 8603Create a new Unicode object from the given encoded string.\n\ 8604encoding defaults to the current default string encoding.\n\ 8605errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'."); 8606 8607static PyObject *unicode_iter(PyObject *seq); 8608 8609PyTypeObject PyUnicode_Type = { 8610 PyObject_HEAD_INIT(&PyType_Type) 8611 0, /* ob_size */ 8612 "str", /* tp_name */ 8613 sizeof(PyUnicodeObject), /* tp_size */ 8614 0, /* tp_itemsize */ 8615 /* Slots */ 8616 (destructor)unicode_dealloc, /* tp_dealloc */ 8617 0, /* tp_print */ 8618 0, /* tp_getattr */ 8619 0, /* tp_setattr */ 8620 0, /* tp_compare */ 8621 unicode_repr, /* tp_repr */ 8622 &unicode_as_number, /* tp_as_number */ 8623 &unicode_as_sequence, /* tp_as_sequence */ 8624 &unicode_as_mapping, /* tp_as_mapping */ 8625 (hashfunc) unicode_hash, /* tp_hash*/ 8626 0, /* tp_call*/ 8627 (reprfunc) unicode_str, /* tp_str */ 8628 PyObject_GenericGetAttr, /* tp_getattro */ 8629 0, /* tp_setattro */ 8630 &unicode_as_buffer, /* tp_as_buffer */ 8631 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE | 8632 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */ 8633 unicode_doc, /* tp_doc */ 8634 0, /* tp_traverse */ 8635 0, /* tp_clear */ 8636 PyUnicode_RichCompare, /* tp_richcompare */ 8637 0, /* tp_weaklistoffset */ 8638 unicode_iter, /* tp_iter */ 8639 0, /* tp_iternext */ 8640 unicode_methods, /* tp_methods */ 8641 0, /* tp_members */ 8642 0, /* tp_getset */ 8643 &PyBaseString_Type, /* tp_base */ 8644 0, /* tp_dict */ 8645 0, /* tp_descr_get */ 8646 0, /* 
tp_descr_set */ 8647 0, /* tp_dictoffset */ 8648 0, /* tp_init */ 8649 0, /* tp_alloc */ 8650 unicode_new, /* tp_new */ 8651 PyObject_Del, /* tp_free */ 8652}; 8653 8654/* Initialize the Unicode implementation */ 8655 8656void _PyUnicode_Init(void) 8657{ 8658 int i; 8659 8660 /* XXX - move this array to unicodectype.c ? */ 8661 Py_UNICODE linebreak[] = { 8662 0x000A, /* LINE FEED */ 8663 0x000D, /* CARRIAGE RETURN */ 8664 0x001C, /* FILE SEPARATOR */ 8665 0x001D, /* GROUP SEPARATOR */ 8666 0x001E, /* RECORD SEPARATOR */ 8667 0x0085, /* NEXT LINE */ 8668 0x2028, /* LINE SEPARATOR */ 8669 0x2029, /* PARAGRAPH SEPARATOR */ 8670 }; 8671 8672 /* Init the implementation */ 8673 unicode_freelist = NULL; 8674 unicode_freelist_size = 0; 8675 unicode_empty = _PyUnicode_New(0); 8676 if (!unicode_empty) 8677 return; 8678 8679 for (i = 0; i < 256; i++) 8680 unicode_latin1[i] = NULL; 8681 if (PyType_Ready(&PyUnicode_Type) < 0) 8682 Py_FatalError("Can't initialize 'unicode'"); 8683 8684 /* initialize the linebreak bloom filter */ 8685 bloom_linebreak = make_bloom_mask( 8686 linebreak, sizeof(linebreak) / sizeof(linebreak[0]) 8687 ); 8688 8689 PyType_Ready(&EncodingMapType); 8690} 8691 8692/* Finalize the Unicode implementation */ 8693 8694void 8695_PyUnicode_Fini(void) 8696{ 8697 PyUnicodeObject *u; 8698 int i; 8699 8700 Py_XDECREF(unicode_empty); 8701 unicode_empty = NULL; 8702 8703 for (i = 0; i < 256; i++) { 8704 if (unicode_latin1[i]) { 8705 Py_DECREF(unicode_latin1[i]); 8706 unicode_latin1[i] = NULL; 8707 } 8708 } 8709 8710 for (u = unicode_freelist; u != NULL;) { 8711 PyUnicodeObject *v = u; 8712 u = *(PyUnicodeObject **)u; 8713 if (v->str) 8714 PyMem_DEL(v->str); 8715 Py_XDECREF(v->defenc); 8716 PyObject_Del(v); 8717 } 8718 unicode_freelist = NULL; 8719 unicode_freelist_size = 0; 8720} 8721 8722void 8723PyUnicode_InternInPlace(PyObject **p) 8724{ 8725 register PyUnicodeObject *s = (PyUnicodeObject *)(*p); 8726 PyObject *t; 8727 if (s == NULL || !PyUnicode_Check(s)) 8728 
Py_FatalError( 8729 "PyUnicode_InternInPlace: unicode strings only please!"); 8730 /* If it's a subclass, we don't really know what putting 8731 it in the interned dict might do. */ 8732 if (!PyUnicode_CheckExact(s)) 8733 return; 8734 if (PyUnicode_CHECK_INTERNED(s)) 8735 return; 8736 if (interned == NULL) { 8737 interned = PyDict_New(); 8738 if (interned == NULL) { 8739 PyErr_Clear(); /* Don't leave an exception */ 8740 return; 8741 } 8742 } 8743 /* It might be that the GetItem call fails even 8744 though the key is present in the dictionary, 8745 namely when this happens during a stack overflow. */ 8746 Py_ALLOW_RECURSION 8747 t = PyDict_GetItem(interned, (PyObject *)s); 8748 Py_END_ALLOW_RECURSION 8749 8750 if (t) { 8751 Py_INCREF(t); 8752 Py_DECREF(*p); 8753 *p = t; 8754 return; 8755 } 8756 8757 PyThreadState_GET()->recursion_critical = 1; 8758 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) { 8759 PyErr_Clear(); 8760 PyThreadState_GET()->recursion_critical = 0; 8761 return; 8762 } 8763 PyThreadState_GET()->recursion_critical = 0; 8764 /* The two references in interned are not counted by refcnt. 
8765 The deallocator will take care of this */ 8766 s->ob_refcnt -= 2; 8767 PyUnicode_CHECK_INTERNED(s) = SSTATE_INTERNED_MORTAL; 8768} 8769 8770void 8771PyUnicode_InternImmortal(PyObject **p) 8772{ 8773 PyUnicode_InternInPlace(p); 8774 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) { 8775 PyUnicode_CHECK_INTERNED(*p) = SSTATE_INTERNED_IMMORTAL; 8776 Py_INCREF(*p); 8777 } 8778} 8779 8780PyObject * 8781PyUnicode_InternFromString(const char *cp) 8782{ 8783 PyObject *s = PyUnicode_FromString(cp); 8784 if (s == NULL) 8785 return NULL; 8786 PyUnicode_InternInPlace(&s); 8787 return s; 8788} 8789 8790void _Py_ReleaseInternedUnicodeStrings(void) 8791{ 8792 PyObject *keys; 8793 PyUnicodeObject *s; 8794 Py_ssize_t i, n; 8795 Py_ssize_t immortal_size = 0, mortal_size = 0; 8796 8797 if (interned == NULL || !PyDict_Check(interned)) 8798 return; 8799 keys = PyDict_Keys(interned); 8800 if (keys == NULL || !PyList_Check(keys)) { 8801 PyErr_Clear(); 8802 return; 8803 } 8804 8805 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak 8806 detector, interned unicode strings are not forcibly deallocated; 8807 rather, we give them their stolen references back, and then clear 8808 and DECREF the interned dict. 
*/ 8809 8810 n = PyList_GET_SIZE(keys); 8811 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n", 8812 n); 8813 for (i = 0; i < n; i++) { 8814 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i); 8815 switch (s->state) { 8816 case SSTATE_NOT_INTERNED: 8817 /* XXX Shouldn't happen */ 8818 break; 8819 case SSTATE_INTERNED_IMMORTAL: 8820 s->ob_refcnt += 1; 8821 immortal_size += s->length; 8822 break; 8823 case SSTATE_INTERNED_MORTAL: 8824 s->ob_refcnt += 2; 8825 mortal_size += s->length; 8826 break; 8827 default: 8828 Py_FatalError("Inconsistent interned string state."); 8829 } 8830 s->state = SSTATE_NOT_INTERNED; 8831 } 8832 fprintf(stderr, "total size of all interned strings: " 8833 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d " 8834 "mortal/immortal\n", mortal_size, immortal_size); 8835 Py_DECREF(keys); 8836 PyDict_Clear(interned); 8837 Py_DECREF(interned); 8838 interned = NULL; 8839} 8840 8841 8842/********************* Unicode Iterator **************************/ 8843 8844typedef struct { 8845 PyObject_HEAD 8846 Py_ssize_t it_index; 8847 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */ 8848} unicodeiterobject; 8849 8850static void 8851unicodeiter_dealloc(unicodeiterobject *it) 8852{ 8853 _PyObject_GC_UNTRACK(it); 8854 Py_XDECREF(it->it_seq); 8855 PyObject_GC_Del(it); 8856} 8857 8858static int 8859unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg) 8860{ 8861 Py_VISIT(it->it_seq); 8862 return 0; 8863} 8864 8865static PyObject * 8866unicodeiter_next(unicodeiterobject *it) 8867{ 8868 PyUnicodeObject *seq; 8869 PyObject *item; 8870 8871 assert(it != NULL); 8872 seq = it->it_seq; 8873 if (seq == NULL) 8874 return NULL; 8875 assert(PyUnicode_Check(seq)); 8876 8877 if (it->it_index < PyUnicode_GET_SIZE(seq)) { 8878 item = PyUnicode_FromUnicode( 8879 PyUnicode_AS_UNICODE(seq)+it->it_index, 1); 8880 if (item != NULL) 8881 ++it->it_index; 8882 return item; 8883 } 8884 8885 Py_DECREF(seq); 8886 it->it_seq = NULL; 
return NULL;
}

/* __length_hint__ implementation: the number of elements not yet
   returned (string size minus current index), or 0 once it_seq has been
   cleared.  The count is returned via PyInt_FromSsize_t(). */
static PyObject *
unicodeiter_len(unicodeiterobject *it)
{
    Py_ssize_t len = 0;
    if (it->it_seq)
        len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
    return PyInt_FromSsize_t(len);
}

PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");

/* Method table of the iterator type. */
static PyMethodDef unicodeiter_methods[] = {
    {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
     length_hint_doc},
    {NULL,      NULL}       /* sentinel */
};

/* Type object for the iterator returned by unicode_iter(); it is its own
   iterator (tp_iter is PyObject_SelfIter) and takes part in GC. */
PyTypeObject PyUnicodeIter_Type = {
    PyObject_HEAD_INIT(&PyType_Type)
    0,                                  /* ob_size */
    "unicodeiterator",                  /* tp_name */
    sizeof(unicodeiterobject),          /* tp_basicsize */
    0,                                  /* tp_itemsize */
    /* methods */
    (destructor)unicodeiter_dealloc,    /* tp_dealloc */
    0,                                  /* tp_print */
    0,                                  /* tp_getattr */
    0,                                  /* tp_setattr */
    0,                                  /* tp_compare */
    0,                                  /* tp_repr */
    0,                                  /* tp_as_number */
    0,                                  /* tp_as_sequence */
    0,                                  /* tp_as_mapping */
    0,                                  /* tp_hash */
    0,                                  /* tp_call */
    0,                                  /* tp_str */
    PyObject_GenericGetAttr,            /* tp_getattro */
    0,                                  /* tp_setattro */
    0,                                  /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
    0,                                  /* tp_doc */
    (traverseproc)unicodeiter_traverse, /* tp_traverse */
    0,                                  /* tp_clear */
    0,                                  /* tp_richcompare */
    0,                                  /* tp_weaklistoffset */
    PyObject_SelfIter,                  /* tp_iter */
    (iternextfunc)unicodeiter_next,     /* tp_iternext */
    unicodeiter_methods,                /* tp_methods */
    0,
};

/* tp_iter implementation: create an iterator positioned at index 0 that
   holds a new reference to seq.  Sets a bad-internal-call error and
   returns NULL if seq fails PyUnicode_Check(); returns NULL if the
   iterator cannot be allocated. */
static PyObject *
unicode_iter(PyObject *seq)
{
    unicodeiterobject *it;

    if (!PyUnicode_Check(seq)) {
        PyErr_BadInternalCall();
        return NULL;
    }
    it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
    if (it == NULL)
        return NULL;
    it->it_index = 0;
    Py_INCREF(seq);
    it->it_seq = (PyUnicodeObject *)seq;
    /* Start GC tracking only after all fields are filled in. */
    _PyObject_GC_TRACK(it);
    return (PyObject
*)it; 8958} 8959 8960size_t 8961Py_UNICODE_strlen(const Py_UNICODE *u) 8962{ 8963 int res = 0; 8964 while(*u++) 8965 res++; 8966 return res; 8967} 8968 8969Py_UNICODE* 8970Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2) 8971{ 8972 Py_UNICODE *u = s1; 8973 while ((*u++ = *s2++)); 8974 return s1; 8975} 8976 8977Py_UNICODE* 8978Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n) 8979{ 8980 Py_UNICODE *u = s1; 8981 while ((*u++ = *s2++)) 8982 if (n-- == 0) 8983 break; 8984 return s1; 8985} 8986 8987int 8988Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2) 8989{ 8990 while (*s1 && *s2 && *s1 == *s2) 8991 s1++, s2++; 8992 if (*s1 && *s2) 8993 return (*s1 < *s2) ? -1 : +1; 8994 if (*s1) 8995 return 1; 8996 if (*s2) 8997 return -1; 8998 return 0; 8999} 9000 9001Py_UNICODE* 9002Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c) 9003{ 9004 const Py_UNICODE *p; 9005 for (p = s; *p; p++) 9006 if (*p == c) 9007 return (Py_UNICODE*)p; 9008 return NULL; 9009} 9010 9011 9012#ifdef __cplusplus 9013} 9014#endif 9015 9016 9017/* 9018Local variables: 9019c-basic-offset: 4 9020indent-tabs-mode: nil 9021End: 9022*/ 9023