unicodeobject.c revision ce4dc41b1a2be5b5335bcbc0865b145852a5c0e5
1/* 2 3Unicode implementation based on original code by Fredrik Lundh, 4modified by Marc-Andre Lemburg <mal@lemburg.com> according to the 5Unicode Integration Proposal (see file Misc/unicode.txt). 6 7Copyright (c) Corporation for National Research Initiatives. 8 9 10 Original header: 11 -------------------------------------------------------------------- 12 13 * Yet another Unicode string type for Python. This type supports the 14 * 16-bit Basic Multilingual Plane (BMP) only. 15 * 16 * Note that this string class supports embedded NULL characters. End 17 * of string is given by the length attribute. However, the internal 18 * representation always stores a trailing NULL to make it easier to 19 * use unicode strings with standard APIs. 20 * 21 * History: 22 * 1999-01-23 fl Created 23 * 1999-01-24 fl Added split, join, capwords; basic UTF-8 support 24 * 1999-01-24 fl Basic UCS-2 support, buffer interface, etc. 25 * 1999-03-06 fl Moved declarations to separate file, etc. 26 * 1999-06-13 fl Changed join method semantics according to Tim's proposal 27 * 1999-08-10 fl Some minor tweaks 28 * 29 * Written by Fredrik Lundh, January 1999. 30 * 31 * Copyright (c) 1999 by Secret Labs AB. 32 * Copyright (c) 1999 by Fredrik Lundh. 
33 * 34 * fredrik@pythonware.com 35 * http://www.pythonware.com 36 * 37 * -------------------------------------------------------------------- 38 * This Unicode String Type is 39 * 40 * Copyright (c) 1999 by Secret Labs AB 41 * Copyright (c) 1999 by Fredrik Lundh 42 * 43 * By obtaining, using, and/or copying this software and/or its 44 * associated documentation, you agree that you have read, understood, 45 * and will comply with the following terms and conditions: 46 * 47 * Permission to use, copy, modify, and distribute this software and its 48 * associated documentation for any purpose and without fee is hereby 49 * granted, provided that the above copyright notice appears in all 50 * copies, and that both that copyright notice and this permission notice 51 * appear in supporting documentation, and that the name of Secret Labs 52 * AB or the author not be used in advertising or publicity pertaining to 53 * distribution of the software without specific, written prior 54 * permission. 55 * 56 * SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO 57 * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND 58 * FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR 59 * ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 60 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 61 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT 62 * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 63 * -------------------------------------------------------------------- */ 64 65#include "Python.h" 66 67#include "unicodeobject.h" 68#include "ucnhash.h" 69 70#if defined(HAVE_LIMITS_H) 71#include <limits.h> 72#else 73#define INT_MAX 2147483647 74#endif 75 76#ifdef MS_WIN32 77#include <windows.h> 78#endif 79 80/* Limit for the Unicode object free list */ 81 82#define MAX_UNICODE_FREELIST_SIZE 1024 83 84/* Limit for the Unicode object free list stay alive optimization. 
85 86 The implementation will keep allocated Unicode memory intact for 87 all objects on the free list having a size less than this 88 limit. This reduces malloc() overhead for small Unicode objects. 89 90 At worst this will result in MAX_UNICODE_FREELIST_SIZE * 91 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT + 92 malloc()-overhead) bytes of unused garbage. 93 94 Setting the limit to 0 effectively turns the feature off. 95 96 Note: This is an experimental feature ! If you get core dumps when 97 using Unicode objects, turn this feature off. 98 99*/ 100 101#define KEEPALIVE_SIZE_LIMIT 9 102 103/* Endianness switches; defaults to little endian */ 104 105#ifdef WORDS_BIGENDIAN 106# define BYTEORDER_IS_BIG_ENDIAN 107#else 108# define BYTEORDER_IS_LITTLE_ENDIAN 109#endif 110 111/* --- Globals ------------------------------------------------------------ 112 113 The globals are initialized by the _PyUnicode_Init() API and should 114 not be used before calling that API. 115 116*/ 117 118/* The empty Unicode object */ 119static PyUnicodeObject *unicode_empty; 120 121/* Free list for Unicode objects */ 122static PyUnicodeObject *unicode_freelist; 123static int unicode_freelist_size; 124 125/* Default encoding to use and assume when NULL is passed as encoding 126 parameter; it is initialized by _PyUnicode_Init(). 127 128 Always use the PyUnicode_SetDefaultEncoding() and 129 PyUnicode_GetDefaultEncoding() APIs to access this global. 130 131*/ 132 133static char unicode_default_encoding[100]; 134 135/* --- Unicode Object ----------------------------------------------------- */ 136 137static 138int _PyUnicode_Resize(register PyUnicodeObject *unicode, 139 int length) 140{ 141 void *oldstr; 142 143 /* Shortcut if there's nothing much to do. */ 144 if (unicode->length == length) 145 goto reset; 146 147 /* Resizing unicode_empty is not allowed. 
*/ 148 if (unicode == unicode_empty) { 149 PyErr_SetString(PyExc_SystemError, 150 "can't resize empty unicode object"); 151 return -1; 152 } 153 154 /* We allocate one more byte to make sure the string is 155 Ux0000 terminated -- XXX is this needed ? */ 156 oldstr = unicode->str; 157 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1); 158 if (!unicode->str) { 159 unicode->str = oldstr; 160 PyErr_NoMemory(); 161 return -1; 162 } 163 unicode->str[length] = 0; 164 unicode->length = length; 165 166 reset: 167 /* Reset the object caches */ 168 if (unicode->defenc) { 169 Py_DECREF(unicode->defenc); 170 unicode->defenc = NULL; 171 } 172 unicode->hash = -1; 173 174 return 0; 175} 176 177int PyUnicode_Resize(PyObject **unicode, 178 int length) 179{ 180 PyUnicodeObject *v; 181 182 if (unicode == NULL) { 183 PyErr_BadInternalCall(); 184 return -1; 185 } 186 v = (PyUnicodeObject *)*unicode; 187 if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1) { 188 PyErr_BadInternalCall(); 189 return -1; 190 } 191 return _PyUnicode_Resize(v, length); 192} 193 194/* We allocate one more byte to make sure the string is 195 Ux0000 terminated -- XXX is this needed ? 196 197 XXX This allocator could further be enhanced by assuring that the 198 free list never reduces its size below 1. 199 200*/ 201 202static 203PyUnicodeObject *_PyUnicode_New(int length) 204{ 205 register PyUnicodeObject *unicode; 206 207 /* Optimization for empty strings */ 208 if (length == 0 && unicode_empty != NULL) { 209 Py_INCREF(unicode_empty); 210 return unicode_empty; 211 } 212 213 /* Unicode freelist & memory allocation */ 214 if (unicode_freelist) { 215 unicode = unicode_freelist; 216 unicode_freelist = *(PyUnicodeObject **)unicode; 217 unicode_freelist_size--; 218 if (unicode->str) { 219 /* Keep-Alive optimization: we only upsize the buffer, 220 never downsize it. 
*/ 221 if ((unicode->length < length) && 222 _PyUnicode_Resize(unicode, length)) { 223 PyMem_DEL(unicode->str); 224 goto onError; 225 } 226 } 227 else { 228 unicode->str = PyMem_NEW(Py_UNICODE, length + 1); 229 } 230 PyObject_INIT(unicode, &PyUnicode_Type); 231 } 232 else { 233 unicode = PyObject_NEW(PyUnicodeObject, &PyUnicode_Type); 234 if (unicode == NULL) 235 return NULL; 236 unicode->str = PyMem_NEW(Py_UNICODE, length + 1); 237 } 238 239 if (!unicode->str) { 240 PyErr_NoMemory(); 241 goto onError; 242 } 243 unicode->str[length] = 0; 244 unicode->length = length; 245 unicode->hash = -1; 246 unicode->defenc = NULL; 247 return unicode; 248 249 onError: 250 _Py_ForgetReference((PyObject *)unicode); 251 PyObject_DEL(unicode); 252 return NULL; 253} 254 255static 256void _PyUnicode_Free(register PyUnicodeObject *unicode) 257{ 258 if (unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) { 259 /* Keep-Alive optimization */ 260 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) { 261 PyMem_DEL(unicode->str); 262 unicode->str = NULL; 263 unicode->length = 0; 264 } 265 if (unicode->defenc) { 266 Py_DECREF(unicode->defenc); 267 unicode->defenc = NULL; 268 } 269 /* Add to free list */ 270 *(PyUnicodeObject **)unicode = unicode_freelist; 271 unicode_freelist = unicode; 272 unicode_freelist_size++; 273 } 274 else { 275 PyMem_DEL(unicode->str); 276 Py_XDECREF(unicode->defenc); 277 PyObject_DEL(unicode); 278 } 279} 280 281PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u, 282 int size) 283{ 284 PyUnicodeObject *unicode; 285 286 unicode = _PyUnicode_New(size); 287 if (!unicode) 288 return NULL; 289 290 /* Copy the Unicode data into the new object */ 291 if (u != NULL) 292 memcpy(unicode->str, u, size * sizeof(Py_UNICODE)); 293 294 return (PyObject *)unicode; 295} 296 297#ifdef HAVE_WCHAR_H 298 299PyObject *PyUnicode_FromWideChar(register const wchar_t *w, 300 int size) 301{ 302 PyUnicodeObject *unicode; 303 304 if (w == NULL) { 305 PyErr_BadInternalCall(); 306 return NULL; 307 } 308 
309 unicode = _PyUnicode_New(size); 310 if (!unicode) 311 return NULL; 312 313 /* Copy the wchar_t data into the new object */ 314#ifdef HAVE_USABLE_WCHAR_T 315 memcpy(unicode->str, w, size * sizeof(wchar_t)); 316#else 317 { 318 register Py_UNICODE *u; 319 register int i; 320 u = PyUnicode_AS_UNICODE(unicode); 321 for (i = size; i >= 0; i--) 322 *u++ = *w++; 323 } 324#endif 325 326 return (PyObject *)unicode; 327} 328 329int PyUnicode_AsWideChar(PyUnicodeObject *unicode, 330 register wchar_t *w, 331 int size) 332{ 333 if (unicode == NULL) { 334 PyErr_BadInternalCall(); 335 return -1; 336 } 337 if (size > PyUnicode_GET_SIZE(unicode)) 338 size = PyUnicode_GET_SIZE(unicode); 339#ifdef HAVE_USABLE_WCHAR_T 340 memcpy(w, unicode->str, size * sizeof(wchar_t)); 341#else 342 { 343 register Py_UNICODE *u; 344 register int i; 345 u = PyUnicode_AS_UNICODE(unicode); 346 for (i = size; i >= 0; i--) 347 *w++ = *u++; 348 } 349#endif 350 351 return size; 352} 353 354#endif 355 356PyObject *PyUnicode_FromObject(register PyObject *obj) 357{ 358 return PyUnicode_FromEncodedObject(obj, NULL, "strict"); 359} 360 361PyObject *PyUnicode_FromEncodedObject(register PyObject *obj, 362 const char *encoding, 363 const char *errors) 364{ 365 const char *s; 366 int len; 367 int owned = 0; 368 PyObject *v; 369 370 if (obj == NULL) { 371 PyErr_BadInternalCall(); 372 return NULL; 373 } 374 375 /* Coerce object */ 376 if (PyInstance_Check(obj)) { 377 PyObject *func; 378 func = PyObject_GetAttrString(obj, "__str__"); 379 if (func == NULL) { 380 PyErr_SetString(PyExc_TypeError, 381 "coercing to Unicode: instance doesn't define __str__"); 382 return NULL; 383 } 384 obj = PyEval_CallObject(func, NULL); 385 Py_DECREF(func); 386 if (obj == NULL) 387 return NULL; 388 owned = 1; 389 } 390 if (PyUnicode_Check(obj)) { 391 Py_INCREF(obj); 392 v = obj; 393 if (encoding) { 394 PyErr_SetString(PyExc_TypeError, 395 "decoding Unicode is not supported"); 396 return NULL; 397 } 398 goto done; 399 } 400 else if 
(PyString_Check(obj)) { 401 s = PyString_AS_STRING(obj); 402 len = PyString_GET_SIZE(obj); 403 } 404 else if (PyObject_AsCharBuffer(obj, &s, &len)) { 405 /* Overwrite the error message with something more useful in 406 case of a TypeError. */ 407 if (PyErr_ExceptionMatches(PyExc_TypeError)) 408 PyErr_Format(PyExc_TypeError, 409 "coercing to Unicode: need string or buffer, " 410 "%.80s found", 411 obj->ob_type->tp_name); 412 goto onError; 413 } 414 415 /* Convert to Unicode */ 416 if (len == 0) { 417 Py_INCREF(unicode_empty); 418 v = (PyObject *)unicode_empty; 419 } 420 else 421 v = PyUnicode_Decode(s, len, encoding, errors); 422 done: 423 if (owned) { 424 Py_DECREF(obj); 425 } 426 return v; 427 428 onError: 429 if (owned) { 430 Py_DECREF(obj); 431 } 432 return NULL; 433} 434 435PyObject *PyUnicode_Decode(const char *s, 436 int size, 437 const char *encoding, 438 const char *errors) 439{ 440 PyObject *buffer = NULL, *unicode; 441 442 if (encoding == NULL) 443 encoding = PyUnicode_GetDefaultEncoding(); 444 445 /* Shortcuts for common default encodings */ 446 if (strcmp(encoding, "utf-8") == 0) 447 return PyUnicode_DecodeUTF8(s, size, errors); 448 else if (strcmp(encoding, "latin-1") == 0) 449 return PyUnicode_DecodeLatin1(s, size, errors); 450 else if (strcmp(encoding, "ascii") == 0) 451 return PyUnicode_DecodeASCII(s, size, errors); 452 453 /* Decode via the codec registry */ 454 buffer = PyBuffer_FromMemory((void *)s, size); 455 if (buffer == NULL) 456 goto onError; 457 unicode = PyCodec_Decode(buffer, encoding, errors); 458 if (unicode == NULL) 459 goto onError; 460 if (!PyUnicode_Check(unicode)) { 461 PyErr_Format(PyExc_TypeError, 462 "decoder did not return an unicode object (type=%.400s)", 463 unicode->ob_type->tp_name); 464 Py_DECREF(unicode); 465 goto onError; 466 } 467 Py_DECREF(buffer); 468 return unicode; 469 470 onError: 471 Py_XDECREF(buffer); 472 return NULL; 473} 474 475PyObject *PyUnicode_Encode(const Py_UNICODE *s, 476 int size, 477 const char 
*encoding, 478 const char *errors) 479{ 480 PyObject *v, *unicode; 481 482 unicode = PyUnicode_FromUnicode(s, size); 483 if (unicode == NULL) 484 return NULL; 485 v = PyUnicode_AsEncodedString(unicode, encoding, errors); 486 Py_DECREF(unicode); 487 return v; 488} 489 490PyObject *PyUnicode_AsEncodedString(PyObject *unicode, 491 const char *encoding, 492 const char *errors) 493{ 494 PyObject *v; 495 496 if (!PyUnicode_Check(unicode)) { 497 PyErr_BadArgument(); 498 goto onError; 499 } 500 501 if (encoding == NULL) 502 encoding = PyUnicode_GetDefaultEncoding(); 503 504 /* Shortcuts for common default encodings */ 505 if (errors == NULL) { 506 if (strcmp(encoding, "utf-8") == 0) 507 return PyUnicode_AsUTF8String(unicode); 508 else if (strcmp(encoding, "latin-1") == 0) 509 return PyUnicode_AsLatin1String(unicode); 510 else if (strcmp(encoding, "ascii") == 0) 511 return PyUnicode_AsASCIIString(unicode); 512 } 513 514 /* Encode via the codec registry */ 515 v = PyCodec_Encode(unicode, encoding, errors); 516 if (v == NULL) 517 goto onError; 518 /* XXX Should we really enforce this ? */ 519 if (!PyString_Check(v)) { 520 PyErr_Format(PyExc_TypeError, 521 "encoder did not return a string object (type=%.400s)", 522 v->ob_type->tp_name); 523 Py_DECREF(v); 524 goto onError; 525 } 526 return v; 527 528 onError: 529 return NULL; 530} 531 532/* Return a Python string holding the default encoded value of the 533 Unicode object. 534 535 The resulting string is cached in the Unicode object for subsequent 536 usage by this function. The cached version is needed to implement 537 the character buffer interface and will live (at least) as long as 538 the Unicode object itself. 539 540 The refcount of the string is *not* incremented. 541 542 *** Exported for internal use by the interpreter only !!! 
*** 543 544*/ 545 546PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode, 547 const char *errors) 548{ 549 PyObject *v = ((PyUnicodeObject *)unicode)->defenc; 550 551 if (v) 552 return v; 553 v = PyUnicode_AsEncodedString(unicode, NULL, errors); 554 if (v && errors == NULL) 555 ((PyUnicodeObject *)unicode)->defenc = v; 556 return v; 557} 558 559Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode) 560{ 561 if (!PyUnicode_Check(unicode)) { 562 PyErr_BadArgument(); 563 goto onError; 564 } 565 return PyUnicode_AS_UNICODE(unicode); 566 567 onError: 568 return NULL; 569} 570 571int PyUnicode_GetSize(PyObject *unicode) 572{ 573 if (!PyUnicode_Check(unicode)) { 574 PyErr_BadArgument(); 575 goto onError; 576 } 577 return PyUnicode_GET_SIZE(unicode); 578 579 onError: 580 return -1; 581} 582 583const char *PyUnicode_GetDefaultEncoding(void) 584{ 585 return unicode_default_encoding; 586} 587 588int PyUnicode_SetDefaultEncoding(const char *encoding) 589{ 590 PyObject *v; 591 592 /* Make sure the encoding is valid. As side effect, this also 593 loads the encoding into the codec registry cache. */ 594 v = _PyCodec_Lookup(encoding); 595 if (v == NULL) 596 goto onError; 597 Py_DECREF(v); 598 strncpy(unicode_default_encoding, 599 encoding, 600 sizeof(unicode_default_encoding)); 601 return 0; 602 603 onError: 604 return -1; 605} 606 607/* --- UTF-8 Codec -------------------------------------------------------- */ 608 609static 610char utf8_code_length[256] = { 611 /* Map UTF-8 encoded prefix byte to sequence length. zero means 612 illegal prefix. 
see RFC 2279 for details */ 613 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 614 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 615 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 616 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 617 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 618 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 619 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 620 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 621 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 622 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 623 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 624 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 625 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 626 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 627 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 628 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0 629}; 630 631static 632int utf8_decoding_error(const char **source, 633 Py_UNICODE **dest, 634 const char *errors, 635 const char *details) 636{ 637 if ((errors == NULL) || 638 (strcmp(errors,"strict") == 0)) { 639 PyErr_Format(PyExc_UnicodeError, 640 "UTF-8 decoding error: %.400s", 641 details); 642 return -1; 643 } 644 else if (strcmp(errors,"ignore") == 0) { 645 (*source)++; 646 return 0; 647 } 648 else if (strcmp(errors,"replace") == 0) { 649 (*source)++; 650 **dest = Py_UNICODE_REPLACEMENT_CHARACTER; 651 (*dest)++; 652 return 0; 653 } 654 else { 655 PyErr_Format(PyExc_ValueError, 656 "UTF-8 decoding error; unknown error handling code: %.400s", 657 errors); 658 return -1; 659 } 660} 661 662PyObject *PyUnicode_DecodeUTF8(const char *s, 663 int size, 664 const char *errors) 665{ 666 int n; 667 const char *e; 668 PyUnicodeObject *unicode; 669 Py_UNICODE *p; 670 const char *errmsg = ""; 671 672 /* Note: size will always be longer than the resulting Unicode 673 character count */ 674 unicode = _PyUnicode_New(size); 675 if (!unicode) 676 return NULL; 677 if (size == 0) 678 return (PyObject *)unicode; 679 680 /* Unpack UTF-8 
encoded data */ 681 p = unicode->str; 682 e = s + size; 683 684 while (s < e) { 685 Py_UCS4 ch = (unsigned char)*s; 686 687 if (ch < 0x80) { 688 *p++ = (Py_UNICODE)ch; 689 s++; 690 continue; 691 } 692 693 n = utf8_code_length[ch]; 694 695 if (s + n > e) { 696 errmsg = "unexpected end of data"; 697 goto utf8Error; 698 } 699 700 switch (n) { 701 702 case 0: 703 errmsg = "unexpected code byte"; 704 goto utf8Error; 705 break; 706 707 case 1: 708 errmsg = "internal error"; 709 goto utf8Error; 710 break; 711 712 case 2: 713 if ((s[1] & 0xc0) != 0x80) { 714 errmsg = "invalid data"; 715 goto utf8Error; 716 } 717 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f); 718 if (ch < 0x80) { 719 errmsg = "illegal encoding"; 720 goto utf8Error; 721 } 722 else 723 *p++ = (Py_UNICODE)ch; 724 break; 725 726 case 3: 727 if ((s[1] & 0xc0) != 0x80 || 728 (s[2] & 0xc0) != 0x80) { 729 errmsg = "invalid data"; 730 goto utf8Error; 731 } 732 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f); 733 if (ch < 0x800 || (ch >= 0xd800 && ch < 0xe000)) { 734 errmsg = "illegal encoding"; 735 goto utf8Error; 736 } 737 else 738 *p++ = (Py_UNICODE)ch; 739 break; 740 741 case 4: 742 if ((s[1] & 0xc0) != 0x80 || 743 (s[2] & 0xc0) != 0x80 || 744 (s[3] & 0xc0) != 0x80) { 745 errmsg = "invalid data"; 746 goto utf8Error; 747 } 748 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) + 749 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f); 750 /* validate and convert to UTF-16 */ 751 if ((ch < 0x10000) || /* minimum value allowed for 4 752 byte encoding */ 753 (ch > 0x10ffff)) { /* maximum value allowed for 754 UTF-16 */ 755 errmsg = "illegal encoding"; 756 goto utf8Error; 757 } 758 /* compute and append the two surrogates: */ 759 760 /* translate from 10000..10FFFF to 0..FFFF */ 761 ch -= 0x10000; 762 763 /* high surrogate = top 10 bits added to D800 */ 764 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10)); 765 766 /* low surrogate = bottom 10 bits added to DC00 */ 767 *p++ = (Py_UNICODE)(0xDC00 + (ch & ~0xFC00)); 768 break; 769 
770 default: 771 /* Other sizes are only needed for UCS-4 */ 772 errmsg = "unsupported Unicode code range"; 773 goto utf8Error; 774 break; 775 } 776 s += n; 777 continue; 778 779 utf8Error: 780 if (utf8_decoding_error(&s, &p, errors, errmsg)) 781 goto onError; 782 } 783 784 /* Adjust length */ 785 if (_PyUnicode_Resize(unicode, p - unicode->str)) 786 goto onError; 787 788 return (PyObject *)unicode; 789 790onError: 791 Py_DECREF(unicode); 792 return NULL; 793} 794 795/* Not used anymore, now that the encoder supports UTF-16 796 surrogates. */ 797#if 0 798static 799int utf8_encoding_error(const Py_UNICODE **source, 800 char **dest, 801 const char *errors, 802 const char *details) 803{ 804 if ((errors == NULL) || 805 (strcmp(errors,"strict") == 0)) { 806 PyErr_Format(PyExc_UnicodeError, 807 "UTF-8 encoding error: %.400s", 808 details); 809 return -1; 810 } 811 else if (strcmp(errors,"ignore") == 0) { 812 return 0; 813 } 814 else if (strcmp(errors,"replace") == 0) { 815 **dest = '?'; 816 (*dest)++; 817 return 0; 818 } 819 else { 820 PyErr_Format(PyExc_ValueError, 821 "UTF-8 encoding error; " 822 "unknown error handling code: %.400s", 823 errors); 824 return -1; 825 } 826} 827#endif 828 829PyObject *PyUnicode_EncodeUTF8(const Py_UNICODE *s, 830 int size, 831 const char *errors) 832{ 833 PyObject *v; 834 char *p; 835 char *q; 836 Py_UCS4 ch2; 837 unsigned int cbAllocated = 3 * size; 838 unsigned int cbWritten = 0; 839 int i = 0; 840 841 v = PyString_FromStringAndSize(NULL, cbAllocated); 842 if (v == NULL) 843 return NULL; 844 if (size == 0) 845 return v; 846 847 p = q = PyString_AS_STRING(v); 848 while (i < size) { 849 Py_UCS4 ch = s[i++]; 850 if (ch < 0x80) { 851 *p++ = (char) ch; 852 cbWritten++; 853 } 854 else if (ch < 0x0800) { 855 *p++ = 0xc0 | (ch >> 6); 856 *p++ = 0x80 | (ch & 0x3f); 857 cbWritten += 2; 858 } 859 else { 860 /* Check for high surrogate */ 861 if (0xD800 <= ch && ch <= 0xDBFF) { 862 if (i != size) { 863 ch2 = s[i]; 864 if (0xDC00 <= ch2 && ch2 <= 
0xDFFF) { 865 866 if (cbWritten >= (cbAllocated - 4)) { 867 /* Provide enough room for some more 868 surrogates */ 869 cbAllocated += 4*10; 870 if (_PyString_Resize(&v, cbAllocated)) 871 goto onError; 872 } 873 874 /* combine the two values */ 875 ch = ((ch - 0xD800)<<10 | (ch2-0xDC00))+0x10000; 876 877 *p++ = (char)((ch >> 18) | 0xf0); 878 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f)); 879 i++; 880 cbWritten += 4; 881 } 882 } 883 } 884 else { 885 *p++ = (char)(0xe0 | (ch >> 12)); 886 cbWritten += 3; 887 } 888 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); 889 *p++ = (char)(0x80 | (ch & 0x3f)); 890 } 891 } 892 *p = '\0'; 893 if (_PyString_Resize(&v, p - q)) 894 goto onError; 895 return v; 896 897 onError: 898 Py_DECREF(v); 899 return NULL; 900} 901 902PyObject *PyUnicode_AsUTF8String(PyObject *unicode) 903{ 904 if (!PyUnicode_Check(unicode)) { 905 PyErr_BadArgument(); 906 return NULL; 907 } 908 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode), 909 PyUnicode_GET_SIZE(unicode), 910 NULL); 911} 912 913/* --- UTF-16 Codec ------------------------------------------------------- */ 914 915static 916int utf16_decoding_error(const Py_UNICODE **source, 917 Py_UNICODE **dest, 918 const char *errors, 919 const char *details) 920{ 921 if ((errors == NULL) || 922 (strcmp(errors,"strict") == 0)) { 923 PyErr_Format(PyExc_UnicodeError, 924 "UTF-16 decoding error: %.400s", 925 details); 926 return -1; 927 } 928 else if (strcmp(errors,"ignore") == 0) { 929 return 0; 930 } 931 else if (strcmp(errors,"replace") == 0) { 932 if (dest) { 933 **dest = Py_UNICODE_REPLACEMENT_CHARACTER; 934 (*dest)++; 935 } 936 return 0; 937 } 938 else { 939 PyErr_Format(PyExc_ValueError, 940 "UTF-16 decoding error; " 941 "unknown error handling code: %.400s", 942 errors); 943 return -1; 944 } 945} 946 947PyObject *PyUnicode_DecodeUTF16(const char *s, 948 int size, 949 const char *errors, 950 int *byteorder) 951{ 952 PyUnicodeObject *unicode; 953 Py_UNICODE *p; 954 const Py_UNICODE *q, *e; 955 int bo = 0; 
956 const char *errmsg = ""; 957 958 /* size should be an even number */ 959 if (size % sizeof(Py_UNICODE) != 0) { 960 if (utf16_decoding_error(NULL, NULL, errors, "truncated data")) 961 return NULL; 962 /* The remaining input chars are ignored if we fall through 963 here... */ 964 } 965 966 /* Note: size will always be longer than the resulting Unicode 967 character count */ 968 unicode = _PyUnicode_New(size); 969 if (!unicode) 970 return NULL; 971 if (size == 0) 972 return (PyObject *)unicode; 973 974 /* Unpack UTF-16 encoded data */ 975 p = unicode->str; 976 q = (Py_UNICODE *)s; 977 e = q + (size / sizeof(Py_UNICODE)); 978 979 if (byteorder) 980 bo = *byteorder; 981 982 while (q < e) { 983 register Py_UNICODE ch = *q++; 984 985 /* Check for BOM marks (U+FEFF) in the input and adjust 986 current byte order setting accordingly. Swap input 987 bytes if needed. (This assumes sizeof(Py_UNICODE) == 2 988 !) */ 989#ifdef BYTEORDER_IS_LITTLE_ENDIAN 990 if (ch == 0xFEFF) { 991 bo = -1; 992 continue; 993 } else if (ch == 0xFFFE) { 994 bo = 1; 995 continue; 996 } 997 if (bo == 1) 998 ch = (ch >> 8) | (ch << 8); 999#else 1000 if (ch == 0xFEFF) { 1001 bo = 1; 1002 continue; 1003 } else if (ch == 0xFFFE) { 1004 bo = -1; 1005 continue; 1006 } 1007 if (bo == -1) 1008 ch = (ch >> 8) | (ch << 8); 1009#endif 1010 if (ch < 0xD800 || ch > 0xDFFF) { 1011 *p++ = ch; 1012 continue; 1013 } 1014 1015 /* UTF-16 code pair: */ 1016 if (q >= e) { 1017 errmsg = "unexpected end of data"; 1018 goto utf16Error; 1019 } 1020 if (0xDC00 <= *q && *q <= 0xDFFF) { 1021 q++; 1022 if (0xD800 <= *q && *q <= 0xDBFF) { 1023 /* This is valid data (a UTF-16 surrogate pair), but 1024 we are not able to store this information since our 1025 Py_UNICODE type only has 16 bits... this might 1026 change someday, even though it's unlikely. 
*/ 1027 errmsg = "code pairs are not supported"; 1028 goto utf16Error; 1029 } 1030 else 1031 continue; 1032 } 1033 errmsg = "illegal encoding"; 1034 /* Fall through to report the error */ 1035 1036 utf16Error: 1037 if (utf16_decoding_error(&q, &p, errors, errmsg)) 1038 goto onError; 1039 } 1040 1041 if (byteorder) 1042 *byteorder = bo; 1043 1044 /* Adjust length */ 1045 if (_PyUnicode_Resize(unicode, p - unicode->str)) 1046 goto onError; 1047 1048 return (PyObject *)unicode; 1049 1050onError: 1051 Py_DECREF(unicode); 1052 return NULL; 1053} 1054 1055#undef UTF16_ERROR 1056 1057PyObject *PyUnicode_EncodeUTF16(const Py_UNICODE *s, 1058 int size, 1059 const char *errors, 1060 int byteorder) 1061{ 1062 PyObject *v; 1063 Py_UNICODE *p; 1064 char *q; 1065 1066 /* We don't create UTF-16 pairs... */ 1067 v = PyString_FromStringAndSize(NULL, 1068 sizeof(Py_UNICODE) * (size + (byteorder == 0))); 1069 if (v == NULL) 1070 return NULL; 1071 1072 q = PyString_AS_STRING(v); 1073 p = (Py_UNICODE *)q; 1074 if (byteorder == 0) 1075 *p++ = 0xFEFF; 1076 if (size == 0) 1077 return v; 1078 if (byteorder == 0 || 1079#ifdef BYTEORDER_IS_LITTLE_ENDIAN 1080 byteorder == -1 1081#else 1082 byteorder == 1 1083#endif 1084 ) 1085 memcpy(p, s, size * sizeof(Py_UNICODE)); 1086 else 1087 while (size-- > 0) { 1088 Py_UNICODE ch = *s++; 1089 *p++ = (ch >> 8) | (ch << 8); 1090 } 1091 return v; 1092} 1093 1094PyObject *PyUnicode_AsUTF16String(PyObject *unicode) 1095{ 1096 if (!PyUnicode_Check(unicode)) { 1097 PyErr_BadArgument(); 1098 return NULL; 1099 } 1100 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode), 1101 PyUnicode_GET_SIZE(unicode), 1102 NULL, 1103 0); 1104} 1105 1106/* --- Unicode Escape Codec ----------------------------------------------- */ 1107 1108static 1109int unicodeescape_decoding_error(const char **source, 1110 Py_UNICODE *x, 1111 const char *errors, 1112 const char *details) 1113{ 1114 if ((errors == NULL) || 1115 (strcmp(errors,"strict") == 0)) { 1116 
PyErr_Format(PyExc_UnicodeError, 1117 "Unicode-Escape decoding error: %.400s", 1118 details); 1119 return -1; 1120 } 1121 else if (strcmp(errors,"ignore") == 0) { 1122 return 0; 1123 } 1124 else if (strcmp(errors,"replace") == 0) { 1125 *x = Py_UNICODE_REPLACEMENT_CHARACTER; 1126 return 0; 1127 } 1128 else { 1129 PyErr_Format(PyExc_ValueError, 1130 "Unicode-Escape decoding error; " 1131 "unknown error handling code: %.400s", 1132 errors); 1133 return -1; 1134 } 1135} 1136 1137static _Py_UCNHashAPI *pucnHash = NULL; 1138 1139static 1140int mystrnicmp(const char *s1, const char *s2, size_t count) 1141{ 1142 char c1, c2; 1143 1144 if (count) 1145 { 1146 do 1147 { 1148 c1 = tolower(*(s1++)); 1149 c2 = tolower(*(s2++)); 1150 } 1151 while(--count && c1 == c2); 1152 1153 return c1 - c2; 1154 } 1155 1156 return 0; 1157} 1158 1159PyObject *PyUnicode_DecodeUnicodeEscape(const char *s, 1160 int size, 1161 const char *errors) 1162{ 1163 PyUnicodeObject *v; 1164 Py_UNICODE *p = NULL, *buf = NULL; 1165 const char *end; 1166 1167 /* Escaped strings will always be longer than the resulting 1168 Unicode string, so we start with size here and then reduce the 1169 length after conversion to the true value. 
*/ 1170 v = _PyUnicode_New(size); 1171 if (v == NULL) 1172 goto onError; 1173 if (size == 0) 1174 return (PyObject *)v; 1175 p = buf = PyUnicode_AS_UNICODE(v); 1176 end = s + size; 1177 while (s < end) { 1178 unsigned char c; 1179 Py_UNICODE x; 1180 int i; 1181 1182 /* Non-escape characters are interpreted as Unicode ordinals */ 1183 if (*s != '\\') { 1184 *p++ = (unsigned char)*s++; 1185 continue; 1186 } 1187 1188 /* \ - Escapes */ 1189 s++; 1190 switch (*s++) { 1191 1192 /* \x escapes */ 1193 case '\n': break; 1194 case '\\': *p++ = '\\'; break; 1195 case '\'': *p++ = '\''; break; 1196 case '\"': *p++ = '\"'; break; 1197 case 'b': *p++ = '\b'; break; 1198 case 'f': *p++ = '\014'; break; /* FF */ 1199 case 't': *p++ = '\t'; break; 1200 case 'n': *p++ = '\n'; break; 1201 case 'r': *p++ = '\r'; break; 1202 case 'v': *p++ = '\013'; break; /* VT */ 1203 case 'a': *p++ = '\007'; break; /* BEL, not classic C */ 1204 1205 /* \OOO (octal) escapes */ 1206 case '0': case '1': case '2': case '3': 1207 case '4': case '5': case '6': case '7': 1208 x = s[-1] - '0'; 1209 if ('0' <= *s && *s <= '7') { 1210 x = (x<<3) + *s++ - '0'; 1211 if ('0' <= *s && *s <= '7') 1212 x = (x<<3) + *s++ - '0'; 1213 } 1214 *p++ = x; 1215 break; 1216 1217 /* \xXXXX escape with 1-n hex digits. 
for compatibility 1218 with 8-bit strings, this code ignores all but the last 1219 two digits */ 1220 case 'x': 1221 x = 0; 1222 c = (unsigned char)*s; 1223 if (isxdigit(c)) { 1224 do { 1225 x = (x<<4) & 0xF0; 1226 if ('0' <= c && c <= '9') 1227 x += c - '0'; 1228 else if ('a' <= c && c <= 'f') 1229 x += 10 + c - 'a'; 1230 else 1231 x += 10 + c - 'A'; 1232 c = (unsigned char)*++s; 1233 } while (isxdigit(c)); 1234 *p++ = (unsigned char) x; 1235 } else { 1236 *p++ = '\\'; 1237 *p++ = (unsigned char)s[-1]; 1238 } 1239 break; 1240 1241 /* \uXXXX with 4 hex digits */ 1242 case 'u': 1243 for (x = 0, i = 0; i < 4; i++) { 1244 c = (unsigned char)s[i]; 1245 if (!isxdigit(c)) { 1246 if (unicodeescape_decoding_error(&s, &x, errors, 1247 "truncated \\uXXXX")) 1248 goto onError; 1249 i++; 1250 break; 1251 } 1252 x = (x<<4) & ~0xF; 1253 if (c >= '0' && c <= '9') 1254 x += c - '0'; 1255 else if (c >= 'a' && c <= 'f') 1256 x += 10 + c - 'a'; 1257 else 1258 x += 10 + c - 'A'; 1259 } 1260 s += i; 1261 *p++ = x; 1262 break; 1263 1264 case 'N': 1265 /* Ok, we need to deal with Unicode Character Names now, 1266 * make sure we've imported the hash table data... 
1267 */ 1268 if (pucnHash == NULL) 1269 { 1270 PyObject *mod = 0, *v = 0; 1271 1272 mod = PyImport_ImportModule("ucnhash"); 1273 if (mod == NULL) 1274 goto onError; 1275 v = PyObject_GetAttrString(mod,"ucnhashAPI"); 1276 Py_DECREF(mod); 1277 if (v == NULL) 1278 { 1279 goto onError; 1280 } 1281 pucnHash = PyCObject_AsVoidPtr(v); 1282 Py_DECREF(v); 1283 if (pucnHash == NULL) 1284 { 1285 goto onError; 1286 } 1287 } 1288 1289 if (*s == '{') 1290 { 1291 const char *start = s + 1; 1292 const char *endBrace = start; 1293 Py_UCS4 value; 1294 unsigned long j; 1295 1296 /* look for either the closing brace, or we 1297 * exceed the maximum length of the unicode character names 1298 */ 1299 while (*endBrace != '}' && 1300 (unsigned int)(endBrace - start) <= 1301 pucnHash->cchMax && 1302 endBrace < end) 1303 { 1304 endBrace++; 1305 } 1306 if (endBrace != end && *endBrace == '}') 1307 { 1308 j = pucnHash->hash(start, endBrace - start); 1309 if (j > pucnHash->cKeys || 1310 mystrnicmp( 1311 start, 1312 ((_Py_UnicodeCharacterName *) 1313 (pucnHash->getValue(j)))->pszUCN, 1314 (int)(endBrace - start)) != 0) 1315 { 1316 if (unicodeescape_decoding_error( 1317 &s, &x, errors, 1318 "Invalid Unicode Character Name")) 1319 { 1320 goto onError; 1321 } 1322 goto ucnFallthrough; 1323 } 1324 value = ((_Py_UnicodeCharacterName *) 1325 (pucnHash->getValue(j)))->value; 1326 if (value < 1<<16) 1327 { 1328 /* In UCS-2 range, easy solution.. 
*/ 1329 *p++ = value; 1330 } 1331 else 1332 { 1333 /* Oops, its in UCS-4 space, */ 1334 /* compute and append the two surrogates: */ 1335 /* translate from 10000..10FFFF to 0..FFFFF */ 1336 value -= 0x10000; 1337 1338 /* high surrogate = top 10 bits added to D800 */ 1339 *p++ = 0xD800 + (value >> 10); 1340 1341 /* low surrogate = bottom 10 bits added to DC00 */ 1342 *p++ = 0xDC00 + (value & ~0xFC00); 1343 } 1344 s = endBrace + 1; 1345 } 1346 else 1347 { 1348 if (unicodeescape_decoding_error( 1349 &s, &x, errors, 1350 "Unicode name missing closing brace")) 1351 goto onError; 1352 goto ucnFallthrough; 1353 } 1354 break; 1355 } 1356 if (unicodeescape_decoding_error( 1357 &s, &x, errors, 1358 "Missing opening brace for Unicode Character Name escape")) 1359 goto onError; 1360ucnFallthrough: 1361 /* fall through on purpose */ 1362 default: 1363 *p++ = '\\'; 1364 *p++ = (unsigned char)s[-1]; 1365 break; 1366 } 1367 } 1368 if (_PyUnicode_Resize(v, (int)(p - buf))) 1369 goto onError; 1370 return (PyObject *)v; 1371 1372 onError: 1373 Py_XDECREF(v); 1374 return NULL; 1375} 1376 1377/* Return a Unicode-Escape string version of the Unicode object. 1378 1379 If quotes is true, the string is enclosed in u"" or u'' quotes as 1380 appropriate. 1381 1382*/ 1383 1384static const Py_UNICODE *findchar(const Py_UNICODE *s, 1385 int size, 1386 Py_UNICODE ch); 1387 1388static 1389PyObject *unicodeescape_string(const Py_UNICODE *s, 1390 int size, 1391 int quotes) 1392{ 1393 PyObject *repr; 1394 char *p; 1395 char *q; 1396 1397 static const char *hexdigit = "0123456789ABCDEF"; 1398 1399 repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1); 1400 if (repr == NULL) 1401 return NULL; 1402 1403 p = q = PyString_AS_STRING(repr); 1404 1405 if (quotes) { 1406 *p++ = 'u'; 1407 *p++ = (findchar(s, size, '\'') && 1408 !findchar(s, size, '"')) ? 
'"' : '\''; 1409 } 1410 while (size-- > 0) { 1411 Py_UNICODE ch = *s++; 1412 /* Escape quotes */ 1413 if (quotes && (ch == q[1] || ch == '\\')) { 1414 *p++ = '\\'; 1415 *p++ = (char) ch; 1416 } 1417 /* Map 16-bit characters to '\uxxxx' */ 1418 else if (ch >= 256) { 1419 *p++ = '\\'; 1420 *p++ = 'u'; 1421 *p++ = hexdigit[(ch >> 12) & 0xf]; 1422 *p++ = hexdigit[(ch >> 8) & 0xf]; 1423 *p++ = hexdigit[(ch >> 4) & 0xf]; 1424 *p++ = hexdigit[ch & 15]; 1425 } 1426 /* Map non-printable US ASCII to '\ooo' */ 1427 else if (ch < ' ' || ch >= 128) { 1428 *p++ = '\\'; 1429 *p++ = hexdigit[(ch >> 6) & 7]; 1430 *p++ = hexdigit[(ch >> 3) & 7]; 1431 *p++ = hexdigit[ch & 7]; 1432 } 1433 /* Copy everything else as-is */ 1434 else 1435 *p++ = (char) ch; 1436 } 1437 if (quotes) 1438 *p++ = q[1]; 1439 1440 *p = '\0'; 1441 if (_PyString_Resize(&repr, p - q)) 1442 goto onError; 1443 1444 return repr; 1445 1446 onError: 1447 Py_DECREF(repr); 1448 return NULL; 1449} 1450 1451PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s, 1452 int size) 1453{ 1454 return unicodeescape_string(s, size, 0); 1455} 1456 1457PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode) 1458{ 1459 if (!PyUnicode_Check(unicode)) { 1460 PyErr_BadArgument(); 1461 return NULL; 1462 } 1463 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode), 1464 PyUnicode_GET_SIZE(unicode)); 1465} 1466 1467/* --- Raw Unicode Escape Codec ------------------------------------------- */ 1468 1469PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s, 1470 int size, 1471 const char *errors) 1472{ 1473 PyUnicodeObject *v; 1474 Py_UNICODE *p, *buf; 1475 const char *end; 1476 const char *bs; 1477 1478 /* Escaped strings will always be longer than the resulting 1479 Unicode string, so we start with size here and then reduce the 1480 length after conversion to the true value. 
*/ 1481 v = _PyUnicode_New(size); 1482 if (v == NULL) 1483 goto onError; 1484 if (size == 0) 1485 return (PyObject *)v; 1486 p = buf = PyUnicode_AS_UNICODE(v); 1487 end = s + size; 1488 while (s < end) { 1489 unsigned char c; 1490 Py_UNICODE x; 1491 int i; 1492 1493 /* Non-escape characters are interpreted as Unicode ordinals */ 1494 if (*s != '\\') { 1495 *p++ = (unsigned char)*s++; 1496 continue; 1497 } 1498 1499 /* \u-escapes are only interpreted iff the number of leading 1500 backslashes if odd */ 1501 bs = s; 1502 for (;s < end;) { 1503 if (*s != '\\') 1504 break; 1505 *p++ = (unsigned char)*s++; 1506 } 1507 if (((s - bs) & 1) == 0 || 1508 s >= end || 1509 *s != 'u') { 1510 continue; 1511 } 1512 p--; 1513 s++; 1514 1515 /* \uXXXX with 4 hex digits */ 1516 for (x = 0, i = 0; i < 4; i++) { 1517 c = (unsigned char)s[i]; 1518 if (!isxdigit(c)) { 1519 if (unicodeescape_decoding_error(&s, &x, errors, 1520 "truncated \\uXXXX")) 1521 goto onError; 1522 i++; 1523 break; 1524 } 1525 x = (x<<4) & ~0xF; 1526 if (c >= '0' && c <= '9') 1527 x += c - '0'; 1528 else if (c >= 'a' && c <= 'f') 1529 x += 10 + c - 'a'; 1530 else 1531 x += 10 + c - 'A'; 1532 } 1533 s += i; 1534 *p++ = x; 1535 } 1536 if (_PyUnicode_Resize(v, (int)(p - buf))) 1537 goto onError; 1538 return (PyObject *)v; 1539 1540 onError: 1541 Py_XDECREF(v); 1542 return NULL; 1543} 1544 1545PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s, 1546 int size) 1547{ 1548 PyObject *repr; 1549 char *p; 1550 char *q; 1551 1552 static const char *hexdigit = "0123456789ABCDEF"; 1553 1554 repr = PyString_FromStringAndSize(NULL, 6 * size); 1555 if (repr == NULL) 1556 return NULL; 1557 if (size == 0) 1558 return repr; 1559 1560 p = q = PyString_AS_STRING(repr); 1561 while (size-- > 0) { 1562 Py_UNICODE ch = *s++; 1563 /* Map 16-bit characters to '\uxxxx' */ 1564 if (ch >= 256) { 1565 *p++ = '\\'; 1566 *p++ = 'u'; 1567 *p++ = hexdigit[(ch >> 12) & 0xf]; 1568 *p++ = hexdigit[(ch >> 8) & 0xf]; 1569 *p++ = hexdigit[(ch 
>> 4) & 0xf]; 1570 *p++ = hexdigit[ch & 15]; 1571 } 1572 /* Copy everything else as-is */ 1573 else 1574 *p++ = (char) ch; 1575 } 1576 *p = '\0'; 1577 if (_PyString_Resize(&repr, p - q)) 1578 goto onError; 1579 1580 return repr; 1581 1582 onError: 1583 Py_DECREF(repr); 1584 return NULL; 1585} 1586 1587PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode) 1588{ 1589 if (!PyUnicode_Check(unicode)) { 1590 PyErr_BadArgument(); 1591 return NULL; 1592 } 1593 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode), 1594 PyUnicode_GET_SIZE(unicode)); 1595} 1596 1597/* --- Latin-1 Codec ------------------------------------------------------ */ 1598 1599PyObject *PyUnicode_DecodeLatin1(const char *s, 1600 int size, 1601 const char *errors) 1602{ 1603 PyUnicodeObject *v; 1604 Py_UNICODE *p; 1605 1606 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */ 1607 v = _PyUnicode_New(size); 1608 if (v == NULL) 1609 goto onError; 1610 if (size == 0) 1611 return (PyObject *)v; 1612 p = PyUnicode_AS_UNICODE(v); 1613 while (size-- > 0) 1614 *p++ = (unsigned char)*s++; 1615 return (PyObject *)v; 1616 1617 onError: 1618 Py_XDECREF(v); 1619 return NULL; 1620} 1621 1622static 1623int latin1_encoding_error(const Py_UNICODE **source, 1624 char **dest, 1625 const char *errors, 1626 const char *details) 1627{ 1628 if ((errors == NULL) || 1629 (strcmp(errors,"strict") == 0)) { 1630 PyErr_Format(PyExc_UnicodeError, 1631 "Latin-1 encoding error: %.400s", 1632 details); 1633 return -1; 1634 } 1635 else if (strcmp(errors,"ignore") == 0) { 1636 return 0; 1637 } 1638 else if (strcmp(errors,"replace") == 0) { 1639 **dest = '?'; 1640 (*dest)++; 1641 return 0; 1642 } 1643 else { 1644 PyErr_Format(PyExc_ValueError, 1645 "Latin-1 encoding error; " 1646 "unknown error handling code: %.400s", 1647 errors); 1648 return -1; 1649 } 1650} 1651 1652PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p, 1653 int size, 1654 const char *errors) 1655{ 1656 PyObject *repr; 1657 char 
*s, *start; 1658 1659 repr = PyString_FromStringAndSize(NULL, size); 1660 if (repr == NULL) 1661 return NULL; 1662 if (size == 0) 1663 return repr; 1664 1665 s = PyString_AS_STRING(repr); 1666 start = s; 1667 while (size-- > 0) { 1668 Py_UNICODE ch = *p++; 1669 if (ch >= 256) { 1670 if (latin1_encoding_error(&p, &s, errors, 1671 "ordinal not in range(256)")) 1672 goto onError; 1673 } 1674 else 1675 *s++ = (char)ch; 1676 } 1677 /* Resize if error handling skipped some characters */ 1678 if (s - start < PyString_GET_SIZE(repr)) 1679 if (_PyString_Resize(&repr, s - start)) 1680 goto onError; 1681 return repr; 1682 1683 onError: 1684 Py_DECREF(repr); 1685 return NULL; 1686} 1687 1688PyObject *PyUnicode_AsLatin1String(PyObject *unicode) 1689{ 1690 if (!PyUnicode_Check(unicode)) { 1691 PyErr_BadArgument(); 1692 return NULL; 1693 } 1694 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode), 1695 PyUnicode_GET_SIZE(unicode), 1696 NULL); 1697} 1698 1699/* --- 7-bit ASCII Codec -------------------------------------------------- */ 1700 1701static 1702int ascii_decoding_error(const char **source, 1703 Py_UNICODE **dest, 1704 const char *errors, 1705 const char *details) 1706{ 1707 if ((errors == NULL) || 1708 (strcmp(errors,"strict") == 0)) { 1709 PyErr_Format(PyExc_UnicodeError, 1710 "ASCII decoding error: %.400s", 1711 details); 1712 return -1; 1713 } 1714 else if (strcmp(errors,"ignore") == 0) { 1715 return 0; 1716 } 1717 else if (strcmp(errors,"replace") == 0) { 1718 **dest = Py_UNICODE_REPLACEMENT_CHARACTER; 1719 (*dest)++; 1720 return 0; 1721 } 1722 else { 1723 PyErr_Format(PyExc_ValueError, 1724 "ASCII decoding error; " 1725 "unknown error handling code: %.400s", 1726 errors); 1727 return -1; 1728 } 1729} 1730 1731PyObject *PyUnicode_DecodeASCII(const char *s, 1732 int size, 1733 const char *errors) 1734{ 1735 PyUnicodeObject *v; 1736 Py_UNICODE *p; 1737 1738 /* ASCII is equivalent to the first 128 ordinals in Unicode. 
*/ 1739 v = _PyUnicode_New(size); 1740 if (v == NULL) 1741 goto onError; 1742 if (size == 0) 1743 return (PyObject *)v; 1744 p = PyUnicode_AS_UNICODE(v); 1745 while (size-- > 0) { 1746 register unsigned char c; 1747 1748 c = (unsigned char)*s++; 1749 if (c < 128) 1750 *p++ = c; 1751 else if (ascii_decoding_error(&s, &p, errors, 1752 "ordinal not in range(128)")) 1753 goto onError; 1754 } 1755 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v)) 1756 if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v)))) 1757 goto onError; 1758 return (PyObject *)v; 1759 1760 onError: 1761 Py_XDECREF(v); 1762 return NULL; 1763} 1764 1765static 1766int ascii_encoding_error(const Py_UNICODE **source, 1767 char **dest, 1768 const char *errors, 1769 const char *details) 1770{ 1771 if ((errors == NULL) || 1772 (strcmp(errors,"strict") == 0)) { 1773 PyErr_Format(PyExc_UnicodeError, 1774 "ASCII encoding error: %.400s", 1775 details); 1776 return -1; 1777 } 1778 else if (strcmp(errors,"ignore") == 0) { 1779 return 0; 1780 } 1781 else if (strcmp(errors,"replace") == 0) { 1782 **dest = '?'; 1783 (*dest)++; 1784 return 0; 1785 } 1786 else { 1787 PyErr_Format(PyExc_ValueError, 1788 "ASCII encoding error; " 1789 "unknown error handling code: %.400s", 1790 errors); 1791 return -1; 1792 } 1793} 1794 1795PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p, 1796 int size, 1797 const char *errors) 1798{ 1799 PyObject *repr; 1800 char *s, *start; 1801 1802 repr = PyString_FromStringAndSize(NULL, size); 1803 if (repr == NULL) 1804 return NULL; 1805 if (size == 0) 1806 return repr; 1807 1808 s = PyString_AS_STRING(repr); 1809 start = s; 1810 while (size-- > 0) { 1811 Py_UNICODE ch = *p++; 1812 if (ch >= 128) { 1813 if (ascii_encoding_error(&p, &s, errors, 1814 "ordinal not in range(128)")) 1815 goto onError; 1816 } 1817 else 1818 *s++ = (char)ch; 1819 } 1820 /* Resize if error handling skipped some characters */ 1821 if (s - start < PyString_GET_SIZE(repr)) 1822 if (_PyString_Resize(&repr, s - 
start)) 1823 goto onError; 1824 return repr; 1825 1826 onError: 1827 Py_DECREF(repr); 1828 return NULL; 1829} 1830 1831PyObject *PyUnicode_AsASCIIString(PyObject *unicode) 1832{ 1833 if (!PyUnicode_Check(unicode)) { 1834 PyErr_BadArgument(); 1835 return NULL; 1836 } 1837 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode), 1838 PyUnicode_GET_SIZE(unicode), 1839 NULL); 1840} 1841 1842#ifdef MS_WIN32 1843 1844/* --- MBCS codecs for Windows -------------------------------------------- */ 1845 1846PyObject *PyUnicode_DecodeMBCS(const char *s, 1847 int size, 1848 const char *errors) 1849{ 1850 PyUnicodeObject *v; 1851 Py_UNICODE *p; 1852 1853 /* First get the size of the result */ 1854 DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0); 1855 if (size > 0 && usize==0) 1856 return PyErr_SetFromWindowsErrWithFilename(0, NULL); 1857 1858 v = _PyUnicode_New(usize); 1859 if (v == NULL) 1860 return NULL; 1861 if (usize == 0) 1862 return (PyObject *)v; 1863 p = PyUnicode_AS_UNICODE(v); 1864 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) { 1865 Py_DECREF(v); 1866 return PyErr_SetFromWindowsErrWithFilename(0, NULL); 1867 } 1868 1869 return (PyObject *)v; 1870} 1871 1872PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p, 1873 int size, 1874 const char *errors) 1875{ 1876 PyObject *repr; 1877 char *s; 1878 DWORD mbcssize; 1879 1880 /* If there are no characters, bail now! 
*/ 1881 if (size==0) 1882 return PyString_FromString(""); 1883 1884 /* First get the size of the result */ 1885 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL); 1886 if (mbcssize==0) 1887 return PyErr_SetFromWindowsErrWithFilename(0, NULL); 1888 1889 repr = PyString_FromStringAndSize(NULL, mbcssize); 1890 if (repr == NULL) 1891 return NULL; 1892 if (mbcssize == 0) 1893 return repr; 1894 1895 /* Do the conversion */ 1896 s = PyString_AS_STRING(repr); 1897 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) { 1898 Py_DECREF(repr); 1899 return PyErr_SetFromWindowsErrWithFilename(0, NULL); 1900 } 1901 return repr; 1902} 1903 1904#endif /* MS_WIN32 */ 1905 1906/* --- Character Mapping Codec -------------------------------------------- */ 1907 1908static 1909int charmap_decoding_error(const char **source, 1910 Py_UNICODE **dest, 1911 const char *errors, 1912 const char *details) 1913{ 1914 if ((errors == NULL) || 1915 (strcmp(errors,"strict") == 0)) { 1916 PyErr_Format(PyExc_UnicodeError, 1917 "charmap decoding error: %.400s", 1918 details); 1919 return -1; 1920 } 1921 else if (strcmp(errors,"ignore") == 0) { 1922 return 0; 1923 } 1924 else if (strcmp(errors,"replace") == 0) { 1925 **dest = Py_UNICODE_REPLACEMENT_CHARACTER; 1926 (*dest)++; 1927 return 0; 1928 } 1929 else { 1930 PyErr_Format(PyExc_ValueError, 1931 "charmap decoding error; " 1932 "unknown error handling code: %.400s", 1933 errors); 1934 return -1; 1935 } 1936} 1937 1938PyObject *PyUnicode_DecodeCharmap(const char *s, 1939 int size, 1940 PyObject *mapping, 1941 const char *errors) 1942{ 1943 PyUnicodeObject *v; 1944 Py_UNICODE *p; 1945 1946 /* Default to Latin-1 */ 1947 if (mapping == NULL) 1948 return PyUnicode_DecodeLatin1(s, size, errors); 1949 1950 v = _PyUnicode_New(size); 1951 if (v == NULL) 1952 goto onError; 1953 if (size == 0) 1954 return (PyObject *)v; 1955 p = PyUnicode_AS_UNICODE(v); 1956 while (size-- > 0) { 1957 unsigned char ch = *s++; 1958 
PyObject *w, *x;

        /* Get mapping (char ordinal -> integer, Unicode char or None) */
        w = PyInt_FromLong((long)ch);
        if (w == NULL)
            goto onError;
        x = PyObject_GetItem(mapping, w);
        Py_DECREF(w);
        if (x == NULL) {
            if (PyErr_ExceptionMatches(PyExc_LookupError)) {
                /* No mapping found: default to Latin-1 mapping */
                PyErr_Clear();
                *p++ = (Py_UNICODE)ch;
                continue;
            }
            /* Any other exception raised by the lookup is propagated */
            goto onError;
        }

        /* Apply mapping */
        if (PyInt_Check(x)) {
            /* Integer result: used directly as the Unicode ordinal */
            long value = PyInt_AS_LONG(x);
            if (value < 0 || value > 65535) {
                PyErr_SetString(PyExc_TypeError,
                                "character mapping must be in range(65536)");
                Py_DECREF(x);
                goto onError;
            }
            *p++ = (Py_UNICODE)value;
        }
        else if (x == Py_None) {
            /* undefined mapping */
            if (charmap_decoding_error(&s, &p, errors,
                                       "character maps to <undefined>")) {
                Py_DECREF(x);
                goto onError;
            }
        }
        else if (PyUnicode_Check(x)) {
            /* Unicode result: only single-character strings are
               accepted */
            if (PyUnicode_GET_SIZE(x) != 1) {
                /* 1-n mapping */
                PyErr_SetString(PyExc_NotImplementedError,
                                "1-n mappings are currently not implemented");
                Py_DECREF(x);
                goto onError;
            }
            *p++ = *PyUnicode_AS_UNICODE(x);
        }
        else {
            /* wrong return value */
            PyErr_SetString(PyExc_TypeError,
                  "character mapping must return integer, None or unicode");
            Py_DECREF(x);
            goto onError;
        }
        Py_DECREF(x);
    }
    /* Shrink the result if error handling skipped some characters */
    if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
        if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
            goto onError;
    return (PyObject *)v;

 onError:
    Py_XDECREF(v);
    return NULL;
}

/* Apply the error handling named by errors to a character that has no
   usable mapping while encoding. Returns 0 when encoding may
   continue, -1 with an exception set otherwise. source is not used. */
static
int charmap_encoding_error(const Py_UNICODE **source,
                           char **dest,
                           const char *errors,
                           const char *details)
{
    if ((errors == NULL) ||
        (strcmp(errors,"strict") == 0)) {
        PyErr_Format(PyExc_UnicodeError,
                     "charmap encoding error: %.400s",
details);
        return -1;
    }
    else if (strcmp(errors,"ignore") == 0) {
        /* Drop the character silently */
        return 0;
    }
    else if (strcmp(errors,"replace") == 0) {
        /* Emit '?' in place of the character and advance the output */
        **dest = '?';
        (*dest)++;
        return 0;
    }
    else {
        PyErr_Format(PyExc_ValueError,
                     "charmap encoding error; "
                     "unknown error handling code: %.400s",
                     errors);
        return -1;
    }
}

/* Encode size Py_UNICODE characters starting at p by looking each
   ordinal up in mapping. A NULL mapping selects plain Latin-1
   encoding. */
PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
                                  int size,
                                  PyObject *mapping,
                                  const char *errors)
{
    PyObject *v;
    char *s;

    /* Default to Latin-1 */
    if (mapping == NULL)
        return PyUnicode_EncodeLatin1(p, size, errors);

    v = PyString_FromStringAndSize(NULL, size);
    if (v == NULL)
        return NULL;
    if (size == 0)
        return v;
    s = PyString_AS_STRING(v);
    while (size-- > 0) {
        Py_UNICODE ch = *p++;
        PyObject *w, *x;

        /* Get mapping (Unicode ordinal -> string char, integer or None) */
        w = PyInt_FromLong((long)ch);
        if (w == NULL)
            goto onError;
        x = PyObject_GetItem(mapping, w);
        Py_DECREF(w);
        if (x == NULL) {
            if (PyErr_ExceptionMatches(PyExc_LookupError)) {
                /* No mapping found: default to Latin-1 mapping if possible */
                PyErr_Clear();
                if (ch < 256) {
                    *s++ = (char)ch;
                    continue;
                }
                else if (!charmap_encoding_error(&p, &s, errors,
                                                 "missing character mapping"))
                    continue;
            }
            /* Either the lookup raised something other than
               LookupError or the error handler reported failure */
            goto onError;
        }

        /* Apply mapping */
        if (PyInt_Check(x)) {
            /* Integer result: used directly as the output byte */
            long value = PyInt_AS_LONG(x);
            if (value < 0 || value > 255) {
                PyErr_SetString(PyExc_TypeError,
                                "character mapping must be in range(256)");
                Py_DECREF(x);
                goto onError;
            }
            *s++ = (char)value;
        }
        else if (x == Py_None) {
            /* undefined mapping */
            if (charmap_encoding_error(&p, &s, errors,
                                       "character maps to <undefined>")) {
                Py_DECREF(x);
                goto onError;
            }
        }
        else if (PyString_Check(x)) {
            if
(PyString_GET_SIZE(x) != 1) { 2118 /* 1-n mapping */ 2119 PyErr_SetString(PyExc_NotImplementedError, 2120 "1-n mappings are currently not implemented"); 2121 Py_DECREF(x); 2122 goto onError; 2123 } 2124 *s++ = *PyString_AS_STRING(x); 2125 } 2126 else { 2127 /* wrong return value */ 2128 PyErr_SetString(PyExc_TypeError, 2129 "character mapping must return integer, None or unicode"); 2130 Py_DECREF(x); 2131 goto onError; 2132 } 2133 Py_DECREF(x); 2134 } 2135 if (s - PyString_AS_STRING(v) < PyString_GET_SIZE(v)) 2136 if (_PyString_Resize(&v, (int)(s - PyString_AS_STRING(v)))) 2137 goto onError; 2138 return v; 2139 2140 onError: 2141 Py_DECREF(v); 2142 return NULL; 2143} 2144 2145PyObject *PyUnicode_AsCharmapString(PyObject *unicode, 2146 PyObject *mapping) 2147{ 2148 if (!PyUnicode_Check(unicode) || mapping == NULL) { 2149 PyErr_BadArgument(); 2150 return NULL; 2151 } 2152 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode), 2153 PyUnicode_GET_SIZE(unicode), 2154 mapping, 2155 NULL); 2156} 2157 2158static 2159int translate_error(const Py_UNICODE **source, 2160 Py_UNICODE **dest, 2161 const char *errors, 2162 const char *details) 2163{ 2164 if ((errors == NULL) || 2165 (strcmp(errors,"strict") == 0)) { 2166 PyErr_Format(PyExc_UnicodeError, 2167 "translate error: %.400s", 2168 details); 2169 return -1; 2170 } 2171 else if (strcmp(errors,"ignore") == 0) { 2172 return 0; 2173 } 2174 else if (strcmp(errors,"replace") == 0) { 2175 **dest = '?'; 2176 (*dest)++; 2177 return 0; 2178 } 2179 else { 2180 PyErr_Format(PyExc_ValueError, 2181 "translate error; " 2182 "unknown error handling code: %.400s", 2183 errors); 2184 return -1; 2185 } 2186} 2187 2188PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *s, 2189 int size, 2190 PyObject *mapping, 2191 const char *errors) 2192{ 2193 PyUnicodeObject *v; 2194 Py_UNICODE *p; 2195 2196 if (mapping == NULL) { 2197 PyErr_BadArgument(); 2198 return NULL; 2199 } 2200 2201 /* Output will never be longer than input */ 2202 v = 
_PyUnicode_New(size);
    if (v == NULL)
        goto onError;
    if (size == 0)
        goto done;
    p = PyUnicode_AS_UNICODE(v);
    while (size-- > 0) {
        Py_UNICODE ch = *s++;
        PyObject *w, *x;

        /* Get mapping */
        w = PyInt_FromLong(ch);
        if (w == NULL)
            goto onError;
        x = PyObject_GetItem(mapping, w);
        Py_DECREF(w);
        if (x == NULL) {
            if (PyErr_ExceptionMatches(PyExc_LookupError)) {
                /* No mapping found: default to 1-1 mapping */
                PyErr_Clear();
                *p++ = ch;
                continue;
            }
            /* Any other exception raised by the lookup is propagated */
            goto onError;
        }

        /* Apply mapping */
        /* NOTE(review): unlike PyUnicode_DecodeCharmap, an integer
           result is cast to Py_UNICODE without a range check, so an
           out-of-range value is silently truncated -- confirm whether
           that is intended. */
        if (PyInt_Check(x))
            *p++ = (Py_UNICODE)PyInt_AS_LONG(x);
        else if (x == Py_None) {
            /* undefined mapping */
            if (translate_error(&s, &p, errors,
                                "character maps to <undefined>")) {
                Py_DECREF(x);
                goto onError;
            }
        }
        else if (PyUnicode_Check(x)) {
            /* Unicode result: only single-character strings are
               accepted */
            if (PyUnicode_GET_SIZE(x) != 1) {
                /* 1-n mapping */
                PyErr_SetString(PyExc_NotImplementedError,
                                "1-n mappings are currently not implemented");
                Py_DECREF(x);
                goto onError;
            }
            *p++ = *PyUnicode_AS_UNICODE(x);
        }
        else {
            /* wrong return value */
            PyErr_SetString(PyExc_TypeError,
                  "translate mapping must return integer, None or unicode");
            Py_DECREF(x);
            goto onError;
        }
        Py_DECREF(x);
    }
    /* Shrink the result if error handling skipped some characters */
    if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
        if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
            goto onError;

 done:
    return (PyObject *)v;

 onError:
    Py_XDECREF(v);
    return NULL;
}

/* Translate str through mapping. str is first converted with
   PyUnicode_FromObject(); the converted object is released again
   before returning. */
PyObject *PyUnicode_Translate(PyObject *str,
                              PyObject *mapping,
                              const char *errors)
{
    PyObject *result;

    str = PyUnicode_FromObject(str);
    if (str == NULL)
        goto onError;
    result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
                                        PyUnicode_GET_SIZE(str),
                                        mapping,
                                        errors);
    Py_DECREF(str);
    return
result; 2285 2286 onError: 2287 Py_XDECREF(str); 2288 return NULL; 2289} 2290 2291/* --- Decimal Encoder ---------------------------------------------------- */ 2292 2293int PyUnicode_EncodeDecimal(Py_UNICODE *s, 2294 int length, 2295 char *output, 2296 const char *errors) 2297{ 2298 Py_UNICODE *p, *end; 2299 2300 if (output == NULL) { 2301 PyErr_BadArgument(); 2302 return -1; 2303 } 2304 2305 p = s; 2306 end = s + length; 2307 while (p < end) { 2308 register Py_UNICODE ch = *p++; 2309 int decimal; 2310 2311 if (Py_UNICODE_ISSPACE(ch)) { 2312 *output++ = ' '; 2313 continue; 2314 } 2315 decimal = Py_UNICODE_TODECIMAL(ch); 2316 if (decimal >= 0) { 2317 *output++ = '0' + decimal; 2318 continue; 2319 } 2320 if (0 < ch && ch < 256) { 2321 *output++ = (char)ch; 2322 continue; 2323 } 2324 /* All other characters are considered invalid */ 2325 if (errors == NULL || strcmp(errors, "strict") == 0) { 2326 PyErr_SetString(PyExc_ValueError, 2327 "invalid decimal Unicode string"); 2328 goto onError; 2329 } 2330 else if (strcmp(errors, "ignore") == 0) 2331 continue; 2332 else if (strcmp(errors, "replace") == 0) { 2333 *output++ = '?'; 2334 continue; 2335 } 2336 } 2337 /* 0-terminate the output string */ 2338 *output++ = '\0'; 2339 return 0; 2340 2341 onError: 2342 return -1; 2343} 2344 2345/* --- Helpers ------------------------------------------------------------ */ 2346 2347static 2348int count(PyUnicodeObject *self, 2349 int start, 2350 int end, 2351 PyUnicodeObject *substring) 2352{ 2353 int count = 0; 2354 2355 if (substring->length == 0) 2356 return (end - start + 1); 2357 2358 end -= substring->length; 2359 2360 while (start <= end) 2361 if (Py_UNICODE_MATCH(self, start, substring)) { 2362 count++; 2363 start += substring->length; 2364 } else 2365 start++; 2366 2367 return count; 2368} 2369 2370int PyUnicode_Count(PyObject *str, 2371 PyObject *substr, 2372 int start, 2373 int end) 2374{ 2375 int result; 2376 2377 str = PyUnicode_FromObject(str); 2378 if (str == NULL) 2379 
return -1; 2380 substr = PyUnicode_FromObject(substr); 2381 if (substr == NULL) { 2382 Py_DECREF(str); 2383 return -1; 2384 } 2385 2386 result = count((PyUnicodeObject *)str, 2387 start, end, 2388 (PyUnicodeObject *)substr); 2389 2390 Py_DECREF(str); 2391 Py_DECREF(substr); 2392 return result; 2393} 2394 2395static 2396int findstring(PyUnicodeObject *self, 2397 PyUnicodeObject *substring, 2398 int start, 2399 int end, 2400 int direction) 2401{ 2402 if (start < 0) 2403 start += self->length; 2404 if (start < 0) 2405 start = 0; 2406 2407 if (substring->length == 0) 2408 return start; 2409 2410 if (end > self->length) 2411 end = self->length; 2412 if (end < 0) 2413 end += self->length; 2414 if (end < 0) 2415 end = 0; 2416 2417 end -= substring->length; 2418 2419 if (direction < 0) { 2420 for (; end >= start; end--) 2421 if (Py_UNICODE_MATCH(self, end, substring)) 2422 return end; 2423 } else { 2424 for (; start <= end; start++) 2425 if (Py_UNICODE_MATCH(self, start, substring)) 2426 return start; 2427 } 2428 2429 return -1; 2430} 2431 2432int PyUnicode_Find(PyObject *str, 2433 PyObject *substr, 2434 int start, 2435 int end, 2436 int direction) 2437{ 2438 int result; 2439 2440 str = PyUnicode_FromObject(str); 2441 if (str == NULL) 2442 return -1; 2443 substr = PyUnicode_FromObject(substr); 2444 if (substr == NULL) { 2445 Py_DECREF(substr); 2446 return -1; 2447 } 2448 2449 result = findstring((PyUnicodeObject *)str, 2450 (PyUnicodeObject *)substr, 2451 start, end, direction); 2452 Py_DECREF(str); 2453 Py_DECREF(substr); 2454 return result; 2455} 2456 2457static 2458int tailmatch(PyUnicodeObject *self, 2459 PyUnicodeObject *substring, 2460 int start, 2461 int end, 2462 int direction) 2463{ 2464 if (start < 0) 2465 start += self->length; 2466 if (start < 0) 2467 start = 0; 2468 2469 if (substring->length == 0) 2470 return 1; 2471 2472 if (end > self->length) 2473 end = self->length; 2474 if (end < 0) 2475 end += self->length; 2476 if (end < 0) 2477 end = 0; 2478 2479 end 
-= substring->length; 2480 if (end < start) 2481 return 0; 2482 2483 if (direction > 0) { 2484 if (Py_UNICODE_MATCH(self, end, substring)) 2485 return 1; 2486 } else { 2487 if (Py_UNICODE_MATCH(self, start, substring)) 2488 return 1; 2489 } 2490 2491 return 0; 2492} 2493 2494int PyUnicode_Tailmatch(PyObject *str, 2495 PyObject *substr, 2496 int start, 2497 int end, 2498 int direction) 2499{ 2500 int result; 2501 2502 str = PyUnicode_FromObject(str); 2503 if (str == NULL) 2504 return -1; 2505 substr = PyUnicode_FromObject(substr); 2506 if (substr == NULL) { 2507 Py_DECREF(substr); 2508 return -1; 2509 } 2510 2511 result = tailmatch((PyUnicodeObject *)str, 2512 (PyUnicodeObject *)substr, 2513 start, end, direction); 2514 Py_DECREF(str); 2515 Py_DECREF(substr); 2516 return result; 2517} 2518 2519static 2520const Py_UNICODE *findchar(const Py_UNICODE *s, 2521 int size, 2522 Py_UNICODE ch) 2523{ 2524 /* like wcschr, but doesn't stop at NULL characters */ 2525 2526 while (size-- > 0) { 2527 if (*s == ch) 2528 return s; 2529 s++; 2530 } 2531 2532 return NULL; 2533} 2534 2535/* Apply fixfct filter to the Unicode object self and return a 2536 reference to the modified object */ 2537 2538static 2539PyObject *fixup(PyUnicodeObject *self, 2540 int (*fixfct)(PyUnicodeObject *s)) 2541{ 2542 2543 PyUnicodeObject *u; 2544 2545 u = (PyUnicodeObject*) PyUnicode_FromUnicode(self->str, 2546 self->length); 2547 if (u == NULL) 2548 return NULL; 2549 if (!fixfct(u)) { 2550 /* fixfct should return TRUE if it modified the buffer. 
If
           FALSE, return a reference to the original buffer instead
           (to save space, not time) */
        Py_INCREF(self);
        Py_DECREF(u);
        return (PyObject*) self;
    }
    return (PyObject*) u;
}

/* Convert every character of self to upper case in place.
   Returns 1 if at least one character changed, 0 otherwise. */
static
int fixupper(PyUnicodeObject *self)
{
    int len = self->length;
    Py_UNICODE *s = self->str;
    int status = 0;

    while (len-- > 0) {
        register Py_UNICODE ch;

        ch = Py_UNICODE_TOUPPER(*s);
        if (ch != *s) {
            status = 1;
            *s = ch;
        }
        s++;
    }

    return status;
}

/* Convert every character of self to lower case in place.
   Returns 1 if at least one character changed, 0 otherwise. */
static
int fixlower(PyUnicodeObject *self)
{
    int len = self->length;
    Py_UNICODE *s = self->str;
    int status = 0;

    while (len-- > 0) {
        register Py_UNICODE ch;

        ch = Py_UNICODE_TOLOWER(*s);
        if (ch != *s) {
            status = 1;
            *s = ch;
        }
        s++;
    }

    return status;
}

/* Swap the case of every upper or lower case character of self in
   place. Returns 1 if at least one cased character was found,
   0 otherwise. */
static
int fixswapcase(PyUnicodeObject *self)
{
    int len = self->length;
    Py_UNICODE *s = self->str;
    int status = 0;

    while (len-- > 0) {
        if (Py_UNICODE_ISUPPER(*s)) {
            *s = Py_UNICODE_TOLOWER(*s);
            status = 1;
        } else if (Py_UNICODE_ISLOWER(*s)) {
            *s = Py_UNICODE_TOUPPER(*s);
            status = 1;
        }
        s++;
    }

    return status;
}

/* Upper-case the first character of self in place if it is lower
   case; the rest of the string is left alone. Returns 1 if the
   character was changed, 0 otherwise. */
static
int fixcapitalize(PyUnicodeObject *self)
{
    if (self->length > 0 && Py_UNICODE_ISLOWER(self->str[0])) {
        self->str[0] = Py_UNICODE_TOUPPER(self->str[0]);
        return 1;
    }
    return 0;
}

/* Title-case self in place: a character that follows a cased
   character is lower-cased, every other character is title-cased. */
static
int fixtitle(PyUnicodeObject *self)
{
    register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
    register Py_UNICODE *e;
    int previous_is_cased;

    /* Shortcut for single character strings */
    if (PyUnicode_GET_SIZE(self) == 1) {
        Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
        if (*p != ch) {
            *p = ch;
            return 1;
        }
        else
            return 0;
    }

    e = p + PyUnicode_GET_SIZE(self);
2652 previous_is_cased = 0; 2653 for (; p < e; p++) { 2654 register const Py_UNICODE ch = *p; 2655 2656 if (previous_is_cased) 2657 *p = Py_UNICODE_TOLOWER(ch); 2658 else 2659 *p = Py_UNICODE_TOTITLE(ch); 2660 2661 if (Py_UNICODE_ISLOWER(ch) || 2662 Py_UNICODE_ISUPPER(ch) || 2663 Py_UNICODE_ISTITLE(ch)) 2664 previous_is_cased = 1; 2665 else 2666 previous_is_cased = 0; 2667 } 2668 return 1; 2669} 2670 2671PyObject *PyUnicode_Join(PyObject *separator, 2672 PyObject *seq) 2673{ 2674 Py_UNICODE *sep; 2675 int seplen; 2676 PyUnicodeObject *res = NULL; 2677 int reslen = 0; 2678 Py_UNICODE *p; 2679 int seqlen = 0; 2680 int sz = 100; 2681 int i; 2682 2683 seqlen = PySequence_Size(seq); 2684 if (seqlen < 0 && PyErr_Occurred()) 2685 return NULL; 2686 2687 if (separator == NULL) { 2688 Py_UNICODE blank = ' '; 2689 sep = ␣ 2690 seplen = 1; 2691 } 2692 else { 2693 separator = PyUnicode_FromObject(separator); 2694 if (separator == NULL) 2695 return NULL; 2696 sep = PyUnicode_AS_UNICODE(separator); 2697 seplen = PyUnicode_GET_SIZE(separator); 2698 } 2699 2700 res = _PyUnicode_New(sz); 2701 if (res == NULL) 2702 goto onError; 2703 p = PyUnicode_AS_UNICODE(res); 2704 reslen = 0; 2705 2706 for (i = 0; i < seqlen; i++) { 2707 int itemlen; 2708 PyObject *item; 2709 2710 item = PySequence_GetItem(seq, i); 2711 if (item == NULL) 2712 goto onError; 2713 if (!PyUnicode_Check(item)) { 2714 PyObject *v; 2715 v = PyUnicode_FromObject(item); 2716 Py_DECREF(item); 2717 item = v; 2718 if (item == NULL) 2719 goto onError; 2720 } 2721 itemlen = PyUnicode_GET_SIZE(item); 2722 while (reslen + itemlen + seplen >= sz) { 2723 if (_PyUnicode_Resize(res, sz*2)) 2724 goto onError; 2725 sz *= 2; 2726 p = PyUnicode_AS_UNICODE(res) + reslen; 2727 } 2728 if (i > 0) { 2729 memcpy(p, sep, seplen * sizeof(Py_UNICODE)); 2730 p += seplen; 2731 reslen += seplen; 2732 } 2733 memcpy(p, PyUnicode_AS_UNICODE(item), itemlen * sizeof(Py_UNICODE)); 2734 p += itemlen; 2735 reslen += itemlen; 2736 Py_DECREF(item); 2737 } 
2738 if (_PyUnicode_Resize(res, reslen)) 2739 goto onError; 2740 2741 Py_XDECREF(separator); 2742 return (PyObject *)res; 2743 2744 onError: 2745 Py_XDECREF(separator); 2746 Py_DECREF(res); 2747 return NULL; 2748} 2749 2750static 2751PyUnicodeObject *pad(PyUnicodeObject *self, 2752 int left, 2753 int right, 2754 Py_UNICODE fill) 2755{ 2756 PyUnicodeObject *u; 2757 2758 if (left < 0) 2759 left = 0; 2760 if (right < 0) 2761 right = 0; 2762 2763 if (left == 0 && right == 0) { 2764 Py_INCREF(self); 2765 return self; 2766 } 2767 2768 u = _PyUnicode_New(left + self->length + right); 2769 if (u) { 2770 if (left) 2771 Py_UNICODE_FILL(u->str, fill, left); 2772 Py_UNICODE_COPY(u->str + left, self->str, self->length); 2773 if (right) 2774 Py_UNICODE_FILL(u->str + left + self->length, fill, right); 2775 } 2776 2777 return u; 2778} 2779 2780#define SPLIT_APPEND(data, left, right) \ 2781 str = PyUnicode_FromUnicode(data + left, right - left); \ 2782 if (!str) \ 2783 goto onError; \ 2784 if (PyList_Append(list, str)) { \ 2785 Py_DECREF(str); \ 2786 goto onError; \ 2787 } \ 2788 else \ 2789 Py_DECREF(str); 2790 2791static 2792PyObject *split_whitespace(PyUnicodeObject *self, 2793 PyObject *list, 2794 int maxcount) 2795{ 2796 register int i; 2797 register int j; 2798 int len = self->length; 2799 PyObject *str; 2800 2801 for (i = j = 0; i < len; ) { 2802 /* find a token */ 2803 while (i < len && Py_UNICODE_ISSPACE(self->str[i])) 2804 i++; 2805 j = i; 2806 while (i < len && !Py_UNICODE_ISSPACE(self->str[i])) 2807 i++; 2808 if (j < i) { 2809 if (maxcount-- <= 0) 2810 break; 2811 SPLIT_APPEND(self->str, j, i); 2812 while (i < len && Py_UNICODE_ISSPACE(self->str[i])) 2813 i++; 2814 j = i; 2815 } 2816 } 2817 if (j < len) { 2818 SPLIT_APPEND(self->str, j, len); 2819 } 2820 return list; 2821 2822 onError: 2823 Py_DECREF(list); 2824 return NULL; 2825} 2826 2827PyObject *PyUnicode_Splitlines(PyObject *string, 2828 int keepends) 2829{ 2830 register int i; 2831 register int j; 2832 int len; 
2833 PyObject *list; 2834 PyObject *str; 2835 Py_UNICODE *data; 2836 2837 string = PyUnicode_FromObject(string); 2838 if (string == NULL) 2839 return NULL; 2840 data = PyUnicode_AS_UNICODE(string); 2841 len = PyUnicode_GET_SIZE(string); 2842 2843 list = PyList_New(0); 2844 if (!list) 2845 goto onError; 2846 2847 for (i = j = 0; i < len; ) { 2848 int eol; 2849 2850 /* Find a line and append it */ 2851 while (i < len && !Py_UNICODE_ISLINEBREAK(data[i])) 2852 i++; 2853 2854 /* Skip the line break reading CRLF as one line break */ 2855 eol = i; 2856 if (i < len) { 2857 if (data[i] == '\r' && i + 1 < len && 2858 data[i+1] == '\n') 2859 i += 2; 2860 else 2861 i++; 2862 if (keepends) 2863 eol = i; 2864 } 2865 SPLIT_APPEND(data, j, eol); 2866 j = i; 2867 } 2868 if (j < len) { 2869 SPLIT_APPEND(data, j, len); 2870 } 2871 2872 Py_DECREF(string); 2873 return list; 2874 2875 onError: 2876 Py_DECREF(list); 2877 Py_DECREF(string); 2878 return NULL; 2879} 2880 2881static 2882PyObject *split_char(PyUnicodeObject *self, 2883 PyObject *list, 2884 Py_UNICODE ch, 2885 int maxcount) 2886{ 2887 register int i; 2888 register int j; 2889 int len = self->length; 2890 PyObject *str; 2891 2892 for (i = j = 0; i < len; ) { 2893 if (self->str[i] == ch) { 2894 if (maxcount-- <= 0) 2895 break; 2896 SPLIT_APPEND(self->str, j, i); 2897 i = j = i + 1; 2898 } else 2899 i++; 2900 } 2901 if (j <= len) { 2902 SPLIT_APPEND(self->str, j, len); 2903 } 2904 return list; 2905 2906 onError: 2907 Py_DECREF(list); 2908 return NULL; 2909} 2910 2911static 2912PyObject *split_substring(PyUnicodeObject *self, 2913 PyObject *list, 2914 PyUnicodeObject *substring, 2915 int maxcount) 2916{ 2917 register int i; 2918 register int j; 2919 int len = self->length; 2920 int sublen = substring->length; 2921 PyObject *str; 2922 2923 for (i = j = 0; i < len - sublen; ) { 2924 if (Py_UNICODE_MATCH(self, i, substring)) { 2925 if (maxcount-- <= 0) 2926 break; 2927 SPLIT_APPEND(self->str, j, i); 2928 i = j = i + sublen; 2929 } 
else 2930 i++; 2931 } 2932 if (j <= len) { 2933 SPLIT_APPEND(self->str, j, len); 2934 } 2935 return list; 2936 2937 onError: 2938 Py_DECREF(list); 2939 return NULL; 2940} 2941 2942#undef SPLIT_APPEND 2943 2944static 2945PyObject *split(PyUnicodeObject *self, 2946 PyUnicodeObject *substring, 2947 int maxcount) 2948{ 2949 PyObject *list; 2950 2951 if (maxcount < 0) 2952 maxcount = INT_MAX; 2953 2954 list = PyList_New(0); 2955 if (!list) 2956 return NULL; 2957 2958 if (substring == NULL) 2959 return split_whitespace(self,list,maxcount); 2960 2961 else if (substring->length == 1) 2962 return split_char(self,list,substring->str[0],maxcount); 2963 2964 else if (substring->length == 0) { 2965 Py_DECREF(list); 2966 PyErr_SetString(PyExc_ValueError, "empty separator"); 2967 return NULL; 2968 } 2969 else 2970 return split_substring(self,list,substring,maxcount); 2971} 2972 2973static 2974PyObject *strip(PyUnicodeObject *self, 2975 int left, 2976 int right) 2977{ 2978 Py_UNICODE *p = self->str; 2979 int start = 0; 2980 int end = self->length; 2981 2982 if (left) 2983 while (start < end && Py_UNICODE_ISSPACE(p[start])) 2984 start++; 2985 2986 if (right) 2987 while (end > start && Py_UNICODE_ISSPACE(p[end-1])) 2988 end--; 2989 2990 if (start == 0 && end == self->length) { 2991 /* couldn't strip anything off, return original string */ 2992 Py_INCREF(self); 2993 return (PyObject*) self; 2994 } 2995 2996 return (PyObject*) PyUnicode_FromUnicode( 2997 self->str + start, 2998 end - start 2999 ); 3000} 3001 3002static 3003PyObject *replace(PyUnicodeObject *self, 3004 PyUnicodeObject *str1, 3005 PyUnicodeObject *str2, 3006 int maxcount) 3007{ 3008 PyUnicodeObject *u; 3009 3010 if (maxcount < 0) 3011 maxcount = INT_MAX; 3012 3013 if (str1->length == 1 && str2->length == 1) { 3014 int i; 3015 3016 /* replace characters */ 3017 if (!findchar(self->str, self->length, str1->str[0])) { 3018 /* nothing to replace, return original string */ 3019 Py_INCREF(self); 3020 u = self; 3021 } else { 
3022 Py_UNICODE u1 = str1->str[0]; 3023 Py_UNICODE u2 = str2->str[0]; 3024 3025 u = (PyUnicodeObject*) PyUnicode_FromUnicode( 3026 self->str, 3027 self->length 3028 ); 3029 if (u) 3030 for (i = 0; i < u->length; i++) 3031 if (u->str[i] == u1) { 3032 if (--maxcount < 0) 3033 break; 3034 u->str[i] = u2; 3035 } 3036 } 3037 3038 } else { 3039 int n, i; 3040 Py_UNICODE *p; 3041 3042 /* replace strings */ 3043 n = count(self, 0, self->length, str1); 3044 if (n > maxcount) 3045 n = maxcount; 3046 if (n == 0) { 3047 /* nothing to replace, return original string */ 3048 Py_INCREF(self); 3049 u = self; 3050 } else { 3051 u = _PyUnicode_New( 3052 self->length + n * (str2->length - str1->length)); 3053 if (u) { 3054 i = 0; 3055 p = u->str; 3056 while (i <= self->length - str1->length) 3057 if (Py_UNICODE_MATCH(self, i, str1)) { 3058 /* replace string segment */ 3059 Py_UNICODE_COPY(p, str2->str, str2->length); 3060 p += str2->length; 3061 i += str1->length; 3062 if (--n <= 0) { 3063 /* copy remaining part */ 3064 Py_UNICODE_COPY(p, self->str+i, self->length-i); 3065 break; 3066 } 3067 } else 3068 *p++ = self->str[i++]; 3069 } 3070 } 3071 } 3072 3073 return (PyObject *) u; 3074} 3075 3076/* --- Unicode Object Methods --------------------------------------------- */ 3077 3078static char title__doc__[] = 3079"S.title() -> unicode\n\ 3080\n\ 3081Return a titlecased version of S, i.e. words start with title case\n\ 3082characters, all remaining cased characters have lower case."; 3083 3084static PyObject* 3085unicode_title(PyUnicodeObject *self, PyObject *args) 3086{ 3087 if (!PyArg_NoArgs(args)) 3088 return NULL; 3089 return fixup(self, fixtitle); 3090} 3091 3092static char capitalize__doc__[] = 3093"S.capitalize() -> unicode\n\ 3094\n\ 3095Return a capitalized version of S, i.e. 
make the first character\n\ 3096have upper case."; 3097 3098static PyObject* 3099unicode_capitalize(PyUnicodeObject *self, PyObject *args) 3100{ 3101 if (!PyArg_NoArgs(args)) 3102 return NULL; 3103 return fixup(self, fixcapitalize); 3104} 3105 3106#if 0 3107static char capwords__doc__[] = 3108"S.capwords() -> unicode\n\ 3109\n\ 3110Apply .capitalize() to all words in S and return the result with\n\ 3111normalized whitespace (all whitespace strings are replaced by ' ')."; 3112 3113static PyObject* 3114unicode_capwords(PyUnicodeObject *self, PyObject *args) 3115{ 3116 PyObject *list; 3117 PyObject *item; 3118 int i; 3119 3120 if (!PyArg_NoArgs(args)) 3121 return NULL; 3122 3123 /* Split into words */ 3124 list = split(self, NULL, -1); 3125 if (!list) 3126 return NULL; 3127 3128 /* Capitalize each word */ 3129 for (i = 0; i < PyList_GET_SIZE(list); i++) { 3130 item = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, i), 3131 fixcapitalize); 3132 if (item == NULL) 3133 goto onError; 3134 Py_DECREF(PyList_GET_ITEM(list, i)); 3135 PyList_SET_ITEM(list, i, item); 3136 } 3137 3138 /* Join the words to form a new string */ 3139 item = PyUnicode_Join(NULL, list); 3140 3141onError: 3142 Py_DECREF(list); 3143 return (PyObject *)item; 3144} 3145#endif 3146 3147static char center__doc__[] = 3148"S.center(width) -> unicode\n\ 3149\n\ 3150Return S centered in a Unicode string of length width. Padding is done\n\ 3151using spaces."; 3152 3153static PyObject * 3154unicode_center(PyUnicodeObject *self, PyObject *args) 3155{ 3156 int marg, left; 3157 int width; 3158 3159 if (!PyArg_ParseTuple(args, "i:center", &width)) 3160 return NULL; 3161 3162 if (self->length >= width) { 3163 Py_INCREF(self); 3164 return (PyObject*) self; 3165 } 3166 3167 marg = width - self->length; 3168 left = marg / 2 + (marg & width & 1); 3169 3170 return (PyObject*) pad(self, left, marg - left, ' '); 3171} 3172 3173#if 0 3174 3175/* This code should go into some future Unicode collation support 3176 module. 
The basic comparison should compare ordinals on a naive 3177 basis (this is what Java does and thus JPython too). */ 3178 3179/* speedy UTF-16 code point order comparison */ 3180/* gleaned from: */ 3181/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */ 3182 3183static short utf16Fixup[32] = 3184{ 3185 0, 0, 0, 0, 0, 0, 0, 0, 3186 0, 0, 0, 0, 0, 0, 0, 0, 3187 0, 0, 0, 0, 0, 0, 0, 0, 3188 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800 3189}; 3190 3191static int 3192unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2) 3193{ 3194 int len1, len2; 3195 3196 Py_UNICODE *s1 = str1->str; 3197 Py_UNICODE *s2 = str2->str; 3198 3199 len1 = str1->length; 3200 len2 = str2->length; 3201 3202 while (len1 > 0 && len2 > 0) { 3203 Py_UNICODE c1, c2; 3204 long diff; 3205 3206 c1 = *s1++; 3207 c2 = *s2++; 3208 if (c1 > (1<<11) * 26) 3209 c1 += utf16Fixup[c1>>11]; 3210 if (c2 > (1<<11) * 26) 3211 c2 += utf16Fixup[c2>>11]; 3212 3213 /* now c1 and c2 are in UTF-32-compatible order */ 3214 diff = (long)c1 - (long)c2; 3215 if (diff) 3216 return (diff < 0) ? -1 : (diff != 0); 3217 len1--; len2--; 3218 } 3219 3220 return (len1 < len2) ? -1 : (len1 != len2); 3221} 3222 3223#else 3224 3225static int 3226unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2) 3227{ 3228 register int len1, len2; 3229 3230 Py_UNICODE *s1 = str1->str; 3231 Py_UNICODE *s2 = str2->str; 3232 3233 len1 = str1->length; 3234 len2 = str2->length; 3235 3236 while (len1 > 0 && len2 > 0) { 3237 register long diff; 3238 3239 diff = (long)*s1++ - (long)*s2++; 3240 if (diff) 3241 return (diff < 0) ? -1 : (diff != 0); 3242 len1--; len2--; 3243 } 3244 3245 return (len1 < len2) ? 
-1 : (len1 != len2); 3246} 3247 3248#endif 3249 3250int PyUnicode_Compare(PyObject *left, 3251 PyObject *right) 3252{ 3253 PyUnicodeObject *u = NULL, *v = NULL; 3254 int result; 3255 3256 /* Coerce the two arguments */ 3257 u = (PyUnicodeObject *)PyUnicode_FromObject(left); 3258 if (u == NULL) 3259 goto onError; 3260 v = (PyUnicodeObject *)PyUnicode_FromObject(right); 3261 if (v == NULL) 3262 goto onError; 3263 3264 /* Shortcut for empty or interned objects */ 3265 if (v == u) { 3266 Py_DECREF(u); 3267 Py_DECREF(v); 3268 return 0; 3269 } 3270 3271 result = unicode_compare(u, v); 3272 3273 Py_DECREF(u); 3274 Py_DECREF(v); 3275 return result; 3276 3277onError: 3278 Py_XDECREF(u); 3279 Py_XDECREF(v); 3280 return -1; 3281} 3282 3283int PyUnicode_Contains(PyObject *container, 3284 PyObject *element) 3285{ 3286 PyUnicodeObject *u = NULL, *v = NULL; 3287 int result; 3288 register const Py_UNICODE *p, *e; 3289 register Py_UNICODE ch; 3290 3291 /* Coerce the two arguments */ 3292 v = (PyUnicodeObject *)PyUnicode_FromObject(element); 3293 if (v == NULL) { 3294 PyErr_SetString(PyExc_TypeError, 3295 "'in <string>' requires character as left operand"); 3296 goto onError; 3297 } 3298 u = (PyUnicodeObject *)PyUnicode_FromObject(container); 3299 if (u == NULL) { 3300 Py_DECREF(v); 3301 goto onError; 3302 } 3303 3304 /* Check v in u */ 3305 if (PyUnicode_GET_SIZE(v) != 1) { 3306 PyErr_SetString(PyExc_TypeError, 3307 "'in <string>' requires character as left operand"); 3308 goto onError; 3309 } 3310 ch = *PyUnicode_AS_UNICODE(v); 3311 p = PyUnicode_AS_UNICODE(u); 3312 e = p + PyUnicode_GET_SIZE(u); 3313 result = 0; 3314 while (p < e) { 3315 if (*p++ == ch) { 3316 result = 1; 3317 break; 3318 } 3319 } 3320 3321 Py_DECREF(u); 3322 Py_DECREF(v); 3323 return result; 3324 3325onError: 3326 Py_XDECREF(u); 3327 Py_XDECREF(v); 3328 return -1; 3329} 3330 3331/* Concat to string or Unicode object giving a new Unicode object. 
*/ 3332 3333PyObject *PyUnicode_Concat(PyObject *left, 3334 PyObject *right) 3335{ 3336 PyUnicodeObject *u = NULL, *v = NULL, *w; 3337 3338 /* Coerce the two arguments */ 3339 u = (PyUnicodeObject *)PyUnicode_FromObject(left); 3340 if (u == NULL) 3341 goto onError; 3342 v = (PyUnicodeObject *)PyUnicode_FromObject(right); 3343 if (v == NULL) 3344 goto onError; 3345 3346 /* Shortcuts */ 3347 if (v == unicode_empty) { 3348 Py_DECREF(v); 3349 return (PyObject *)u; 3350 } 3351 if (u == unicode_empty) { 3352 Py_DECREF(u); 3353 return (PyObject *)v; 3354 } 3355 3356 /* Concat the two Unicode strings */ 3357 w = _PyUnicode_New(u->length + v->length); 3358 if (w == NULL) 3359 goto onError; 3360 Py_UNICODE_COPY(w->str, u->str, u->length); 3361 Py_UNICODE_COPY(w->str + u->length, v->str, v->length); 3362 3363 Py_DECREF(u); 3364 Py_DECREF(v); 3365 return (PyObject *)w; 3366 3367onError: 3368 Py_XDECREF(u); 3369 Py_XDECREF(v); 3370 return NULL; 3371} 3372 3373static char count__doc__[] = 3374"S.count(sub[, start[, end]]) -> int\n\ 3375\n\ 3376Return the number of occurrences of substring sub in Unicode string\n\ 3377S[start:end]. 
Optional arguments start and end are\n\ 3378interpreted as in slice notation."; 3379 3380static PyObject * 3381unicode_count(PyUnicodeObject *self, PyObject *args) 3382{ 3383 PyUnicodeObject *substring; 3384 int start = 0; 3385 int end = INT_MAX; 3386 PyObject *result; 3387 3388 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring, 3389 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end)) 3390 return NULL; 3391 3392 substring = (PyUnicodeObject *)PyUnicode_FromObject( 3393 (PyObject *)substring); 3394 if (substring == NULL) 3395 return NULL; 3396 3397 if (start < 0) 3398 start += self->length; 3399 if (start < 0) 3400 start = 0; 3401 if (end > self->length) 3402 end = self->length; 3403 if (end < 0) 3404 end += self->length; 3405 if (end < 0) 3406 end = 0; 3407 3408 result = PyInt_FromLong((long) count(self, start, end, substring)); 3409 3410 Py_DECREF(substring); 3411 return result; 3412} 3413 3414static char encode__doc__[] = 3415"S.encode([encoding[,errors]]) -> string\n\ 3416\n\ 3417Return an encoded string version of S. Default encoding is the current\n\ 3418default string encoding. errors may be given to set a different error\n\ 3419handling scheme. Default is 'strict' meaning that encoding errors raise\n\ 3420a ValueError. 
Other possible values are 'ignore' and 'replace'."; 3421 3422static PyObject * 3423unicode_encode(PyUnicodeObject *self, PyObject *args) 3424{ 3425 char *encoding = NULL; 3426 char *errors = NULL; 3427 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors)) 3428 return NULL; 3429 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors); 3430} 3431 3432static char expandtabs__doc__[] = 3433"S.expandtabs([tabsize]) -> unicode\n\ 3434\n\ 3435Return a copy of S where all tab characters are expanded using spaces.\n\ 3436If tabsize is not given, a tab size of 8 characters is assumed."; 3437 3438static PyObject* 3439unicode_expandtabs(PyUnicodeObject *self, PyObject *args) 3440{ 3441 Py_UNICODE *e; 3442 Py_UNICODE *p; 3443 Py_UNICODE *q; 3444 int i, j; 3445 PyUnicodeObject *u; 3446 int tabsize = 8; 3447 3448 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize)) 3449 return NULL; 3450 3451 /* First pass: determine size of output string */ 3452 i = j = 0; 3453 e = self->str + self->length; 3454 for (p = self->str; p < e; p++) 3455 if (*p == '\t') { 3456 if (tabsize > 0) 3457 j += tabsize - (j % tabsize); 3458 } 3459 else { 3460 j++; 3461 if (*p == '\n' || *p == '\r') { 3462 i += j; 3463 j = 0; 3464 } 3465 } 3466 3467 /* Second pass: create output string and fill it */ 3468 u = _PyUnicode_New(i + j); 3469 if (!u) 3470 return NULL; 3471 3472 j = 0; 3473 q = u->str; 3474 3475 for (p = self->str; p < e; p++) 3476 if (*p == '\t') { 3477 if (tabsize > 0) { 3478 i = tabsize - (j % tabsize); 3479 j += i; 3480 while (i--) 3481 *q++ = ' '; 3482 } 3483 } 3484 else { 3485 j++; 3486 *q++ = *p; 3487 if (*p == '\n' || *p == '\r') 3488 j = 0; 3489 } 3490 3491 return (PyObject*) u; 3492} 3493 3494static char find__doc__[] = 3495"S.find(sub [,start [,end]]) -> int\n\ 3496\n\ 3497Return the lowest index in S where substring sub is found,\n\ 3498such that sub is contained within s[start,end]. 
Optional\n\ 3499arguments start and end are interpreted as in slice notation.\n\ 3500\n\ 3501Return -1 on failure."; 3502 3503static PyObject * 3504unicode_find(PyUnicodeObject *self, PyObject *args) 3505{ 3506 PyUnicodeObject *substring; 3507 int start = 0; 3508 int end = INT_MAX; 3509 PyObject *result; 3510 3511 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring, 3512 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end)) 3513 return NULL; 3514 substring = (PyUnicodeObject *)PyUnicode_FromObject( 3515 (PyObject *)substring); 3516 if (substring == NULL) 3517 return NULL; 3518 3519 result = PyInt_FromLong(findstring(self, substring, start, end, 1)); 3520 3521 Py_DECREF(substring); 3522 return result; 3523} 3524 3525static PyObject * 3526unicode_getitem(PyUnicodeObject *self, int index) 3527{ 3528 if (index < 0 || index >= self->length) { 3529 PyErr_SetString(PyExc_IndexError, "string index out of range"); 3530 return NULL; 3531 } 3532 3533 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1); 3534} 3535 3536static long 3537unicode_hash(PyUnicodeObject *self) 3538{ 3539 /* Since Unicode objects compare equal to their ASCII string 3540 counterparts, they should use the individual character values 3541 as basis for their hash value. This is needed to assure that 3542 strings and Unicode objects behave in the same way as 3543 dictionary keys. 
*/ 3544 3545 register int len; 3546 register Py_UNICODE *p; 3547 register long x; 3548 3549 if (self->hash != -1) 3550 return self->hash; 3551 len = PyUnicode_GET_SIZE(self); 3552 p = PyUnicode_AS_UNICODE(self); 3553 x = *p << 7; 3554 while (--len >= 0) 3555 x = (1000003*x) ^ *p++; 3556 x ^= PyUnicode_GET_SIZE(self); 3557 if (x == -1) 3558 x = -2; 3559 self->hash = x; 3560 return x; 3561} 3562 3563static char index__doc__[] = 3564"S.index(sub [,start [,end]]) -> int\n\ 3565\n\ 3566Like S.find() but raise ValueError when the substring is not found."; 3567 3568static PyObject * 3569unicode_index(PyUnicodeObject *self, PyObject *args) 3570{ 3571 int result; 3572 PyUnicodeObject *substring; 3573 int start = 0; 3574 int end = INT_MAX; 3575 3576 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring, 3577 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end)) 3578 return NULL; 3579 3580 substring = (PyUnicodeObject *)PyUnicode_FromObject( 3581 (PyObject *)substring); 3582 if (substring == NULL) 3583 return NULL; 3584 3585 result = findstring(self, substring, start, end, 1); 3586 3587 Py_DECREF(substring); 3588 if (result < 0) { 3589 PyErr_SetString(PyExc_ValueError, "substring not found"); 3590 return NULL; 3591 } 3592 return PyInt_FromLong(result); 3593} 3594 3595static char islower__doc__[] = 3596"S.islower() -> int\n\ 3597\n\ 3598Return 1 if all cased characters in S are lowercase and there is\n\ 3599at least one cased character in S, 0 otherwise."; 3600 3601static PyObject* 3602unicode_islower(PyUnicodeObject *self, PyObject *args) 3603{ 3604 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 3605 register const Py_UNICODE *e; 3606 int cased; 3607 3608 if (!PyArg_NoArgs(args)) 3609 return NULL; 3610 3611 /* Shortcut for single character strings */ 3612 if (PyUnicode_GET_SIZE(self) == 1) 3613 return PyInt_FromLong(Py_UNICODE_ISLOWER(*p) != 0); 3614 3615 /* Special case for empty strings */ 3616 if (PyString_GET_SIZE(self) == 0) 3617 return PyInt_FromLong(0); 
3618 3619 e = p + PyUnicode_GET_SIZE(self); 3620 cased = 0; 3621 for (; p < e; p++) { 3622 register const Py_UNICODE ch = *p; 3623 3624 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) 3625 return PyInt_FromLong(0); 3626 else if (!cased && Py_UNICODE_ISLOWER(ch)) 3627 cased = 1; 3628 } 3629 return PyInt_FromLong(cased); 3630} 3631 3632static char isupper__doc__[] = 3633"S.isupper() -> int\n\ 3634\n\ 3635Return 1 if all cased characters in S are uppercase and there is\n\ 3636at least one cased character in S, 0 otherwise."; 3637 3638static PyObject* 3639unicode_isupper(PyUnicodeObject *self, PyObject *args) 3640{ 3641 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 3642 register const Py_UNICODE *e; 3643 int cased; 3644 3645 if (!PyArg_NoArgs(args)) 3646 return NULL; 3647 3648 /* Shortcut for single character strings */ 3649 if (PyUnicode_GET_SIZE(self) == 1) 3650 return PyInt_FromLong(Py_UNICODE_ISUPPER(*p) != 0); 3651 3652 /* Special case for empty strings */ 3653 if (PyString_GET_SIZE(self) == 0) 3654 return PyInt_FromLong(0); 3655 3656 e = p + PyUnicode_GET_SIZE(self); 3657 cased = 0; 3658 for (; p < e; p++) { 3659 register const Py_UNICODE ch = *p; 3660 3661 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch)) 3662 return PyInt_FromLong(0); 3663 else if (!cased && Py_UNICODE_ISUPPER(ch)) 3664 cased = 1; 3665 } 3666 return PyInt_FromLong(cased); 3667} 3668 3669static char istitle__doc__[] = 3670"S.istitle() -> int\n\ 3671\n\ 3672Return 1 if S is a titlecased string, i.e. upper- and titlecase characters\n\ 3673may only follow uncased characters and lowercase characters only cased\n\ 3674ones. 
Return 0 otherwise."; 3675 3676static PyObject* 3677unicode_istitle(PyUnicodeObject *self, PyObject *args) 3678{ 3679 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 3680 register const Py_UNICODE *e; 3681 int cased, previous_is_cased; 3682 3683 if (!PyArg_NoArgs(args)) 3684 return NULL; 3685 3686 /* Shortcut for single character strings */ 3687 if (PyUnicode_GET_SIZE(self) == 1) 3688 return PyInt_FromLong((Py_UNICODE_ISTITLE(*p) != 0) || 3689 (Py_UNICODE_ISUPPER(*p) != 0)); 3690 3691 /* Special case for empty strings */ 3692 if (PyString_GET_SIZE(self) == 0) 3693 return PyInt_FromLong(0); 3694 3695 e = p + PyUnicode_GET_SIZE(self); 3696 cased = 0; 3697 previous_is_cased = 0; 3698 for (; p < e; p++) { 3699 register const Py_UNICODE ch = *p; 3700 3701 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) { 3702 if (previous_is_cased) 3703 return PyInt_FromLong(0); 3704 previous_is_cased = 1; 3705 cased = 1; 3706 } 3707 else if (Py_UNICODE_ISLOWER(ch)) { 3708 if (!previous_is_cased) 3709 return PyInt_FromLong(0); 3710 previous_is_cased = 1; 3711 cased = 1; 3712 } 3713 else 3714 previous_is_cased = 0; 3715 } 3716 return PyInt_FromLong(cased); 3717} 3718 3719static char isspace__doc__[] = 3720"S.isspace() -> int\n\ 3721\n\ 3722Return 1 if there are only whitespace characters in S,\n\ 37230 otherwise."; 3724 3725static PyObject* 3726unicode_isspace(PyUnicodeObject *self, PyObject *args) 3727{ 3728 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 3729 register const Py_UNICODE *e; 3730 3731 if (!PyArg_NoArgs(args)) 3732 return NULL; 3733 3734 /* Shortcut for single character strings */ 3735 if (PyUnicode_GET_SIZE(self) == 1 && 3736 Py_UNICODE_ISSPACE(*p)) 3737 return PyInt_FromLong(1); 3738 3739 /* Special case for empty strings */ 3740 if (PyString_GET_SIZE(self) == 0) 3741 return PyInt_FromLong(0); 3742 3743 e = p + PyUnicode_GET_SIZE(self); 3744 for (; p < e; p++) { 3745 if (!Py_UNICODE_ISSPACE(*p)) 3746 return PyInt_FromLong(0); 3747 } 3748 
return PyInt_FromLong(1); 3749} 3750 3751static char isalpha__doc__[] = 3752"S.isalpha() -> int\n\ 3753\n\ 3754Return 1 if all characters in S are alphabetic\n\ 3755and there is at least one character in S, 0 otherwise."; 3756 3757static PyObject* 3758unicode_isalpha(PyUnicodeObject *self, PyObject *args) 3759{ 3760 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 3761 register const Py_UNICODE *e; 3762 3763 if (!PyArg_NoArgs(args)) 3764 return NULL; 3765 3766 /* Shortcut for single character strings */ 3767 if (PyUnicode_GET_SIZE(self) == 1 && 3768 Py_UNICODE_ISALPHA(*p)) 3769 return PyInt_FromLong(1); 3770 3771 /* Special case for empty strings */ 3772 if (PyString_GET_SIZE(self) == 0) 3773 return PyInt_FromLong(0); 3774 3775 e = p + PyUnicode_GET_SIZE(self); 3776 for (; p < e; p++) { 3777 if (!Py_UNICODE_ISALPHA(*p)) 3778 return PyInt_FromLong(0); 3779 } 3780 return PyInt_FromLong(1); 3781} 3782 3783static char isalnum__doc__[] = 3784"S.isalnum() -> int\n\ 3785\n\ 3786Return 1 if all characters in S are alphanumeric\n\ 3787and there is at least one character in S, 0 otherwise."; 3788 3789static PyObject* 3790unicode_isalnum(PyUnicodeObject *self, PyObject *args) 3791{ 3792 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 3793 register const Py_UNICODE *e; 3794 3795 if (!PyArg_NoArgs(args)) 3796 return NULL; 3797 3798 /* Shortcut for single character strings */ 3799 if (PyUnicode_GET_SIZE(self) == 1 && 3800 Py_UNICODE_ISALNUM(*p)) 3801 return PyInt_FromLong(1); 3802 3803 /* Special case for empty strings */ 3804 if (PyString_GET_SIZE(self) == 0) 3805 return PyInt_FromLong(0); 3806 3807 e = p + PyUnicode_GET_SIZE(self); 3808 for (; p < e; p++) { 3809 if (!Py_UNICODE_ISALNUM(*p)) 3810 return PyInt_FromLong(0); 3811 } 3812 return PyInt_FromLong(1); 3813} 3814 3815static char isdecimal__doc__[] = 3816"S.isdecimal() -> int\n\ 3817\n\ 3818Return 1 if there are only decimal characters in S,\n\ 38190 otherwise."; 3820 3821static PyObject* 
3822unicode_isdecimal(PyUnicodeObject *self, PyObject *args) 3823{ 3824 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 3825 register const Py_UNICODE *e; 3826 3827 if (!PyArg_NoArgs(args)) 3828 return NULL; 3829 3830 /* Shortcut for single character strings */ 3831 if (PyUnicode_GET_SIZE(self) == 1 && 3832 Py_UNICODE_ISDECIMAL(*p)) 3833 return PyInt_FromLong(1); 3834 3835 /* Special case for empty strings */ 3836 if (PyString_GET_SIZE(self) == 0) 3837 return PyInt_FromLong(0); 3838 3839 e = p + PyUnicode_GET_SIZE(self); 3840 for (; p < e; p++) { 3841 if (!Py_UNICODE_ISDECIMAL(*p)) 3842 return PyInt_FromLong(0); 3843 } 3844 return PyInt_FromLong(1); 3845} 3846 3847static char isdigit__doc__[] = 3848"S.isdigit() -> int\n\ 3849\n\ 3850Return 1 if there are only digit characters in S,\n\ 38510 otherwise."; 3852 3853static PyObject* 3854unicode_isdigit(PyUnicodeObject *self, PyObject *args) 3855{ 3856 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 3857 register const Py_UNICODE *e; 3858 3859 if (!PyArg_NoArgs(args)) 3860 return NULL; 3861 3862 /* Shortcut for single character strings */ 3863 if (PyUnicode_GET_SIZE(self) == 1 && 3864 Py_UNICODE_ISDIGIT(*p)) 3865 return PyInt_FromLong(1); 3866 3867 /* Special case for empty strings */ 3868 if (PyString_GET_SIZE(self) == 0) 3869 return PyInt_FromLong(0); 3870 3871 e = p + PyUnicode_GET_SIZE(self); 3872 for (; p < e; p++) { 3873 if (!Py_UNICODE_ISDIGIT(*p)) 3874 return PyInt_FromLong(0); 3875 } 3876 return PyInt_FromLong(1); 3877} 3878 3879static char isnumeric__doc__[] = 3880"S.isnumeric() -> int\n\ 3881\n\ 3882Return 1 if there are only numeric characters in S,\n\ 38830 otherwise."; 3884 3885static PyObject* 3886unicode_isnumeric(PyUnicodeObject *self, PyObject *args) 3887{ 3888 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 3889 register const Py_UNICODE *e; 3890 3891 if (!PyArg_NoArgs(args)) 3892 return NULL; 3893 3894 /* Shortcut for single character strings */ 3895 if 
(PyUnicode_GET_SIZE(self) == 1 && 3896 Py_UNICODE_ISNUMERIC(*p)) 3897 return PyInt_FromLong(1); 3898 3899 /* Special case for empty strings */ 3900 if (PyString_GET_SIZE(self) == 0) 3901 return PyInt_FromLong(0); 3902 3903 e = p + PyUnicode_GET_SIZE(self); 3904 for (; p < e; p++) { 3905 if (!Py_UNICODE_ISNUMERIC(*p)) 3906 return PyInt_FromLong(0); 3907 } 3908 return PyInt_FromLong(1); 3909} 3910 3911static char join__doc__[] = 3912"S.join(sequence) -> unicode\n\ 3913\n\ 3914Return a string which is the concatenation of the strings in the\n\ 3915sequence. The separator between elements is S."; 3916 3917static PyObject* 3918unicode_join(PyUnicodeObject *self, PyObject *args) 3919{ 3920 PyObject *data; 3921 if (!PyArg_ParseTuple(args, "O:join", &data)) 3922 return NULL; 3923 3924 return PyUnicode_Join((PyObject *)self, data); 3925} 3926 3927static int 3928unicode_length(PyUnicodeObject *self) 3929{ 3930 return self->length; 3931} 3932 3933static char ljust__doc__[] = 3934"S.ljust(width) -> unicode\n\ 3935\n\ 3936Return S left justified in a Unicode string of length width. 
Padding is\n\ 3937done using spaces."; 3938 3939static PyObject * 3940unicode_ljust(PyUnicodeObject *self, PyObject *args) 3941{ 3942 int width; 3943 if (!PyArg_ParseTuple(args, "i:ljust", &width)) 3944 return NULL; 3945 3946 if (self->length >= width) { 3947 Py_INCREF(self); 3948 return (PyObject*) self; 3949 } 3950 3951 return (PyObject*) pad(self, 0, width - self->length, ' '); 3952} 3953 3954static char lower__doc__[] = 3955"S.lower() -> unicode\n\ 3956\n\ 3957Return a copy of the string S converted to lowercase."; 3958 3959static PyObject* 3960unicode_lower(PyUnicodeObject *self, PyObject *args) 3961{ 3962 if (!PyArg_NoArgs(args)) 3963 return NULL; 3964 return fixup(self, fixlower); 3965} 3966 3967static char lstrip__doc__[] = 3968"S.lstrip() -> unicode\n\ 3969\n\ 3970Return a copy of the string S with leading whitespace removed."; 3971 3972static PyObject * 3973unicode_lstrip(PyUnicodeObject *self, PyObject *args) 3974{ 3975 if (!PyArg_NoArgs(args)) 3976 return NULL; 3977 return strip(self, 1, 0); 3978} 3979 3980static PyObject* 3981unicode_repeat(PyUnicodeObject *str, int len) 3982{ 3983 PyUnicodeObject *u; 3984 Py_UNICODE *p; 3985 3986 if (len < 0) 3987 len = 0; 3988 3989 if (len == 1) { 3990 /* no repeat, return original string */ 3991 Py_INCREF(str); 3992 return (PyObject*) str; 3993 } 3994 3995 u = _PyUnicode_New(len * str->length); 3996 if (!u) 3997 return NULL; 3998 3999 p = u->str; 4000 4001 while (len-- > 0) { 4002 Py_UNICODE_COPY(p, str->str, str->length); 4003 p += str->length; 4004 } 4005 4006 return (PyObject*) u; 4007} 4008 4009PyObject *PyUnicode_Replace(PyObject *obj, 4010 PyObject *subobj, 4011 PyObject *replobj, 4012 int maxcount) 4013{ 4014 PyObject *self; 4015 PyObject *str1; 4016 PyObject *str2; 4017 PyObject *result; 4018 4019 self = PyUnicode_FromObject(obj); 4020 if (self == NULL) 4021 return NULL; 4022 str1 = PyUnicode_FromObject(subobj); 4023 if (str1 == NULL) { 4024 Py_DECREF(self); 4025 return NULL; 4026 } 4027 str2 = 
PyUnicode_FromObject(replobj); 4028 if (str2 == NULL) { 4029 Py_DECREF(self); 4030 Py_DECREF(str1); 4031 return NULL; 4032 } 4033 result = replace((PyUnicodeObject *)self, 4034 (PyUnicodeObject *)str1, 4035 (PyUnicodeObject *)str2, 4036 maxcount); 4037 Py_DECREF(self); 4038 Py_DECREF(str1); 4039 Py_DECREF(str2); 4040 return result; 4041} 4042 4043static char replace__doc__[] = 4044"S.replace (old, new[, maxsplit]) -> unicode\n\ 4045\n\ 4046Return a copy of S with all occurrences of substring\n\ 4047old replaced by new. If the optional argument maxsplit is\n\ 4048given, only the first maxsplit occurrences are replaced."; 4049 4050static PyObject* 4051unicode_replace(PyUnicodeObject *self, PyObject *args) 4052{ 4053 PyUnicodeObject *str1; 4054 PyUnicodeObject *str2; 4055 int maxcount = -1; 4056 PyObject *result; 4057 4058 if (!PyArg_ParseTuple(args, "OO|i:replace", &str1, &str2, &maxcount)) 4059 return NULL; 4060 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1); 4061 if (str1 == NULL) 4062 return NULL; 4063 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2); 4064 if (str2 == NULL) 4065 return NULL; 4066 4067 result = replace(self, str1, str2, maxcount); 4068 4069 Py_DECREF(str1); 4070 Py_DECREF(str2); 4071 return result; 4072} 4073 4074static 4075PyObject *unicode_repr(PyObject *unicode) 4076{ 4077 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode), 4078 PyUnicode_GET_SIZE(unicode), 4079 1); 4080} 4081 4082static char rfind__doc__[] = 4083"S.rfind(sub [,start [,end]]) -> int\n\ 4084\n\ 4085Return the highest index in S where substring sub is found,\n\ 4086such that sub is contained within s[start,end]. 
Optional\n\ 4087arguments start and end are interpreted as in slice notation.\n\ 4088\n\ 4089Return -1 on failure."; 4090 4091static PyObject * 4092unicode_rfind(PyUnicodeObject *self, PyObject *args) 4093{ 4094 PyUnicodeObject *substring; 4095 int start = 0; 4096 int end = INT_MAX; 4097 PyObject *result; 4098 4099 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring, 4100 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end)) 4101 return NULL; 4102 substring = (PyUnicodeObject *)PyUnicode_FromObject( 4103 (PyObject *)substring); 4104 if (substring == NULL) 4105 return NULL; 4106 4107 result = PyInt_FromLong(findstring(self, substring, start, end, -1)); 4108 4109 Py_DECREF(substring); 4110 return result; 4111} 4112 4113static char rindex__doc__[] = 4114"S.rindex(sub [,start [,end]]) -> int\n\ 4115\n\ 4116Like S.rfind() but raise ValueError when the substring is not found."; 4117 4118static PyObject * 4119unicode_rindex(PyUnicodeObject *self, PyObject *args) 4120{ 4121 int result; 4122 PyUnicodeObject *substring; 4123 int start = 0; 4124 int end = INT_MAX; 4125 4126 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring, 4127 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end)) 4128 return NULL; 4129 substring = (PyUnicodeObject *)PyUnicode_FromObject( 4130 (PyObject *)substring); 4131 if (substring == NULL) 4132 return NULL; 4133 4134 result = findstring(self, substring, start, end, -1); 4135 4136 Py_DECREF(substring); 4137 if (result < 0) { 4138 PyErr_SetString(PyExc_ValueError, "substring not found"); 4139 return NULL; 4140 } 4141 return PyInt_FromLong(result); 4142} 4143 4144static char rjust__doc__[] = 4145"S.rjust(width) -> unicode\n\ 4146\n\ 4147Return S right justified in a Unicode string of length width. 
Padding is\n\ 4148done using spaces."; 4149 4150static PyObject * 4151unicode_rjust(PyUnicodeObject *self, PyObject *args) 4152{ 4153 int width; 4154 if (!PyArg_ParseTuple(args, "i:rjust", &width)) 4155 return NULL; 4156 4157 if (self->length >= width) { 4158 Py_INCREF(self); 4159 return (PyObject*) self; 4160 } 4161 4162 return (PyObject*) pad(self, width - self->length, 0, ' '); 4163} 4164 4165static char rstrip__doc__[] = 4166"S.rstrip() -> unicode\n\ 4167\n\ 4168Return a copy of the string S with trailing whitespace removed."; 4169 4170static PyObject * 4171unicode_rstrip(PyUnicodeObject *self, PyObject *args) 4172{ 4173 if (!PyArg_NoArgs(args)) 4174 return NULL; 4175 return strip(self, 0, 1); 4176} 4177 4178static PyObject* 4179unicode_slice(PyUnicodeObject *self, int start, int end) 4180{ 4181 /* standard clamping */ 4182 if (start < 0) 4183 start = 0; 4184 if (end < 0) 4185 end = 0; 4186 if (end > self->length) 4187 end = self->length; 4188 if (start == 0 && end == self->length) { 4189 /* full slice, return original string */ 4190 Py_INCREF(self); 4191 return (PyObject*) self; 4192 } 4193 if (start > end) 4194 start = end; 4195 /* copy slice */ 4196 return (PyObject*) PyUnicode_FromUnicode(self->str + start, 4197 end - start); 4198} 4199 4200PyObject *PyUnicode_Split(PyObject *s, 4201 PyObject *sep, 4202 int maxsplit) 4203{ 4204 PyObject *result; 4205 4206 s = PyUnicode_FromObject(s); 4207 if (s == NULL) 4208 return NULL; 4209 if (sep != NULL) { 4210 sep = PyUnicode_FromObject(sep); 4211 if (sep == NULL) { 4212 Py_DECREF(s); 4213 return NULL; 4214 } 4215 } 4216 4217 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit); 4218 4219 Py_DECREF(s); 4220 Py_XDECREF(sep); 4221 return result; 4222} 4223 4224static char split__doc__[] = 4225"S.split([sep [,maxsplit]]) -> list of strings\n\ 4226\n\ 4227Return a list of the words in S, using sep as the\n\ 4228delimiter string. If maxsplit is given, at most maxsplit\n\ 4229splits are done. 
If sep is not specified, any whitespace string\n\ 4230is a separator."; 4231 4232static PyObject* 4233unicode_split(PyUnicodeObject *self, PyObject *args) 4234{ 4235 PyObject *substring = Py_None; 4236 int maxcount = -1; 4237 4238 if (!PyArg_ParseTuple(args, "|Oi:split", &substring, &maxcount)) 4239 return NULL; 4240 4241 if (substring == Py_None) 4242 return split(self, NULL, maxcount); 4243 else if (PyUnicode_Check(substring)) 4244 return split(self, (PyUnicodeObject *)substring, maxcount); 4245 else 4246 return PyUnicode_Split((PyObject *)self, substring, maxcount); 4247} 4248 4249static char splitlines__doc__[] = 4250"S.splitlines([keepends]]) -> list of strings\n\ 4251\n\ 4252Return a list of the lines in S, breaking at line boundaries.\n\ 4253Line breaks are not included in the resulting list unless keepends\n\ 4254is given and true."; 4255 4256static PyObject* 4257unicode_splitlines(PyUnicodeObject *self, PyObject *args) 4258{ 4259 int keepends = 0; 4260 4261 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends)) 4262 return NULL; 4263 4264 return PyUnicode_Splitlines((PyObject *)self, keepends); 4265} 4266 4267static 4268PyObject *unicode_str(PyUnicodeObject *self) 4269{ 4270 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL); 4271} 4272 4273static char strip__doc__[] = 4274"S.strip() -> unicode\n\ 4275\n\ 4276Return a copy of S with leading and trailing whitespace removed."; 4277 4278static PyObject * 4279unicode_strip(PyUnicodeObject *self, PyObject *args) 4280{ 4281 if (!PyArg_NoArgs(args)) 4282 return NULL; 4283 return strip(self, 1, 1); 4284} 4285 4286static char swapcase__doc__[] = 4287"S.swapcase() -> unicode\n\ 4288\n\ 4289Return a copy of S with uppercase characters converted to lowercase\n\ 4290and vice versa."; 4291 4292static PyObject* 4293unicode_swapcase(PyUnicodeObject *self, PyObject *args) 4294{ 4295 if (!PyArg_NoArgs(args)) 4296 return NULL; 4297 return fixup(self, fixswapcase); 4298} 4299 4300static char translate__doc__[] = 
4301"S.translate(table) -> unicode\n\ 4302\n\ 4303Return a copy of the string S, where all characters have been mapped\n\ 4304through the given translation table, which must be a mapping of\n\ 4305Unicode ordinals to Unicode ordinals or None. Unmapped characters\n\ 4306are left untouched. Characters mapped to None are deleted."; 4307 4308static PyObject* 4309unicode_translate(PyUnicodeObject *self, PyObject *args) 4310{ 4311 PyObject *table; 4312 4313 if (!PyArg_ParseTuple(args, "O:translate", &table)) 4314 return NULL; 4315 return PyUnicode_TranslateCharmap(self->str, 4316 self->length, 4317 table, 4318 "ignore"); 4319} 4320 4321static char upper__doc__[] = 4322"S.upper() -> unicode\n\ 4323\n\ 4324Return a copy of S converted to uppercase."; 4325 4326static PyObject* 4327unicode_upper(PyUnicodeObject *self, PyObject *args) 4328{ 4329 if (!PyArg_NoArgs(args)) 4330 return NULL; 4331 return fixup(self, fixupper); 4332} 4333 4334#if 0 4335static char zfill__doc__[] = 4336"S.zfill(width) -> unicode\n\ 4337\n\ 4338Pad a numeric string x with zeros on the left, to fill a field\n\ 4339of the specified width. 
The string x is never truncated."; 4340 4341static PyObject * 4342unicode_zfill(PyUnicodeObject *self, PyObject *args) 4343{ 4344 int fill; 4345 PyUnicodeObject *u; 4346 4347 int width; 4348 if (!PyArg_ParseTuple(args, "i:zfill", &width)) 4349 return NULL; 4350 4351 if (self->length >= width) { 4352 Py_INCREF(self); 4353 return (PyObject*) self; 4354 } 4355 4356 fill = width - self->length; 4357 4358 u = pad(self, fill, 0, '0'); 4359 4360 if (u->str[fill] == '+' || u->str[fill] == '-') { 4361 /* move sign to beginning of string */ 4362 u->str[0] = u->str[fill]; 4363 u->str[fill] = '0'; 4364 } 4365 4366 return (PyObject*) u; 4367} 4368#endif 4369 4370#if 0 4371static PyObject* 4372unicode_freelistsize(PyUnicodeObject *self, PyObject *args) 4373{ 4374 if (!PyArg_NoArgs(args)) 4375 return NULL; 4376 return PyInt_FromLong(unicode_freelist_size); 4377} 4378#endif 4379 4380static char startswith__doc__[] = 4381"S.startswith(prefix[, start[, end]]) -> int\n\ 4382\n\ 4383Return 1 if S starts with the specified prefix, otherwise return 0. With\n\ 4384optional start, test S beginning at that position. With optional end, stop\n\ 4385comparing S at that position."; 4386 4387static PyObject * 4388unicode_startswith(PyUnicodeObject *self, 4389 PyObject *args) 4390{ 4391 PyUnicodeObject *substring; 4392 int start = 0; 4393 int end = INT_MAX; 4394 PyObject *result; 4395 4396 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &substring, 4397 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end)) 4398 return NULL; 4399 substring = (PyUnicodeObject *)PyUnicode_FromObject( 4400 (PyObject *)substring); 4401 if (substring == NULL) 4402 return NULL; 4403 4404 result = PyInt_FromLong(tailmatch(self, substring, start, end, -1)); 4405 4406 Py_DECREF(substring); 4407 return result; 4408} 4409 4410 4411static char endswith__doc__[] = 4412"S.endswith(suffix[, start[, end]]) -> int\n\ 4413\n\ 4414Return 1 if S ends with the specified suffix, otherwise return 0. 
/* Method table of the unicode type; unicode_getattr() hands it to
   Py_FindMethod() to resolve attribute names.

   Each entry is { method name, C implementation, flags, docstring }.
   In the handlers visible in this part of the file, entries flagged 1
   parse their argument tuple with PyArg_ParseTuple() and entries
   flagged 0 accept no arguments and verify that with PyArg_NoArgs().

   Entries inside "#if 0" are compiled out.  The { NULL, NULL } entry
   marks the end of the table. */
static PyMethodDef unicode_methods[] = {

    /* Order is according to common usage: often used methods should
       appear first, since lookup is done sequentially. */

    {"encode", (PyCFunction) unicode_encode, 1, encode__doc__},
    {"replace", (PyCFunction) unicode_replace, 1, replace__doc__},
    {"split", (PyCFunction) unicode_split, 1, split__doc__},
    {"join", (PyCFunction) unicode_join, 1, join__doc__},
    {"capitalize", (PyCFunction) unicode_capitalize, 0, capitalize__doc__},
    {"title", (PyCFunction) unicode_title, 0, title__doc__},
    {"center", (PyCFunction) unicode_center, 1, center__doc__},
    {"count", (PyCFunction) unicode_count, 1, count__doc__},
    {"expandtabs", (PyCFunction) unicode_expandtabs, 1, expandtabs__doc__},
    {"find", (PyCFunction) unicode_find, 1, find__doc__},
    {"index", (PyCFunction) unicode_index, 1, index__doc__},
    {"ljust", (PyCFunction) unicode_ljust, 1, ljust__doc__},
    {"lower", (PyCFunction) unicode_lower, 0, lower__doc__},
    {"lstrip", (PyCFunction) unicode_lstrip, 0, lstrip__doc__},
/*  {"maketrans", (PyCFunction) unicode_maketrans, 1, maketrans__doc__}, */
    {"rfind", (PyCFunction) unicode_rfind, 1, rfind__doc__},
    {"rindex", (PyCFunction) unicode_rindex, 1, rindex__doc__},
    {"rjust", (PyCFunction) unicode_rjust, 1, rjust__doc__},
    {"rstrip", (PyCFunction) unicode_rstrip, 0, rstrip__doc__},
    {"splitlines", (PyCFunction) unicode_splitlines, 1, splitlines__doc__},
    {"strip", (PyCFunction) unicode_strip, 0, strip__doc__},
    {"swapcase", (PyCFunction) unicode_swapcase, 0, swapcase__doc__},
    {"translate", (PyCFunction) unicode_translate, 1, translate__doc__},
    {"upper", (PyCFunction) unicode_upper, 0, upper__doc__},
    {"startswith", (PyCFunction) unicode_startswith, 1, startswith__doc__},
    {"endswith", (PyCFunction) unicode_endswith, 1, endswith__doc__},
    {"islower", (PyCFunction) unicode_islower, 0, islower__doc__},
    {"isupper", (PyCFunction) unicode_isupper, 0, isupper__doc__},
    {"istitle", (PyCFunction) unicode_istitle, 0, istitle__doc__},
    {"isspace", (PyCFunction) unicode_isspace, 0, isspace__doc__},
    {"isdecimal", (PyCFunction) unicode_isdecimal, 0, isdecimal__doc__},
    {"isdigit", (PyCFunction) unicode_isdigit, 0, isdigit__doc__},
    {"isnumeric", (PyCFunction) unicode_isnumeric, 0, isnumeric__doc__},
    {"isalpha", (PyCFunction) unicode_isalpha, 0, isalpha__doc__},
    {"isalnum", (PyCFunction) unicode_isalnum, 0, isalnum__doc__},
#if 0
    {"zfill", (PyCFunction) unicode_zfill, 1, zfill__doc__},
    {"capwords", (PyCFunction) unicode_capwords, 0, capwords__doc__},
#endif

#if 0
    /* This one is just used for debugging the implementation. */
    {"freelistsize", (PyCFunction) unicode_freelistsize, 0},
#endif

    {NULL, NULL}
};
int arglen, int *p_argidx) 4567{ 4568 int argidx = *p_argidx; 4569 if (argidx < arglen) { 4570 (*p_argidx)++; 4571 if (arglen < 0) 4572 return args; 4573 else 4574 return PyTuple_GetItem(args, argidx); 4575 } 4576 PyErr_SetString(PyExc_TypeError, 4577 "not enough arguments for format string"); 4578 return NULL; 4579} 4580 4581#define F_LJUST (1<<0) 4582#define F_SIGN (1<<1) 4583#define F_BLANK (1<<2) 4584#define F_ALT (1<<3) 4585#define F_ZERO (1<<4) 4586 4587static 4588int usprintf(register Py_UNICODE *buffer, char *format, ...) 4589{ 4590 register int i; 4591 int len; 4592 va_list va; 4593 char *charbuffer; 4594 va_start(va, format); 4595 4596 /* First, format the string as char array, then expand to Py_UNICODE 4597 array. */ 4598 charbuffer = (char *)buffer; 4599 len = vsprintf(charbuffer, format, va); 4600 for (i = len - 1; i >= 0; i--) 4601 buffer[i] = (Py_UNICODE) charbuffer[i]; 4602 4603 va_end(va); 4604 return len; 4605} 4606 4607static int 4608formatfloat(Py_UNICODE *buf, 4609 size_t buflen, 4610 int flags, 4611 int prec, 4612 int type, 4613 PyObject *v) 4614{ 4615 /* fmt = '%#.' + `prec` + `type` 4616 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/ 4617 char fmt[20]; 4618 double x; 4619 4620 x = PyFloat_AsDouble(v); 4621 if (x == -1.0 && PyErr_Occurred()) 4622 return -1; 4623 if (prec < 0) 4624 prec = 6; 4625 if (type == 'f' && (fabs(x) / 1e25) >= 1e25) 4626 type = 'g'; 4627 sprintf(fmt, "%%%s.%d%c", (flags & F_ALT) ? "#" : "", prec, type); 4628 /* worst case length calc to ensure no buffer overrun: 4629 fmt = %#.<prec>g 4630 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp 4631 for any double rep.) 4632 len = 1 + prec + 1 + 2 + 5 = 9 + prec 4633 If prec=0 the effective precision is 1 (the leading digit is 4634 always given), therefore increase by one to 10+prec. 
*/ 4635 if (buflen <= (size_t)10 + (size_t)prec) { 4636 PyErr_SetString(PyExc_OverflowError, 4637 "formatted float is too long (precision too long?)"); 4638 return -1; 4639 } 4640 return usprintf(buf, fmt, x); 4641} 4642 4643static int 4644formatint(Py_UNICODE *buf, 4645 size_t buflen, 4646 int flags, 4647 int prec, 4648 int type, 4649 PyObject *v) 4650{ 4651 /* fmt = '%#.' + `prec` + 'l' + `type` 4652 worst case length = 3 + 10 (len of INT_MAX) + 1 + 1 = 15 (use 20)*/ 4653 char fmt[20]; 4654 long x; 4655 4656 x = PyInt_AsLong(v); 4657 if (x == -1 && PyErr_Occurred()) 4658 return -1; 4659 if (prec < 0) 4660 prec = 1; 4661 /* buf = '+'/'-'/'0'/'0x' + '[0-9]'*max(prec,len(x in octal)) 4662 worst case buf = '0x' + [0-9]*prec, where prec >= 11 */ 4663 if (buflen <= 13 || buflen <= (size_t)2+(size_t)prec) { 4664 PyErr_SetString(PyExc_OverflowError, 4665 "formatted integer is too long (precision too long?)"); 4666 return -1; 4667 } 4668 sprintf(fmt, "%%%s.%dl%c", (flags & F_ALT) ? "#" : "", prec, type); 4669 return usprintf(buf, fmt, x); 4670} 4671 4672static int 4673formatchar(Py_UNICODE *buf, 4674 size_t buflen, 4675 PyObject *v) 4676{ 4677 /* presume that the buffer is at least 2 characters long */ 4678 if (PyUnicode_Check(v)) { 4679 if (PyUnicode_GET_SIZE(v) != 1) 4680 goto onError; 4681 buf[0] = PyUnicode_AS_UNICODE(v)[0]; 4682 } 4683 4684 else if (PyString_Check(v)) { 4685 if (PyString_GET_SIZE(v) != 1) 4686 goto onError; 4687 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0]; 4688 } 4689 4690 else { 4691 /* Integer input truncated to a character */ 4692 long x; 4693 x = PyInt_AsLong(v); 4694 if (x == -1 && PyErr_Occurred()) 4695 goto onError; 4696 buf[0] = (char) x; 4697 } 4698 buf[1] = '\0'; 4699 return 1; 4700 4701 onError: 4702 PyErr_SetString(PyExc_TypeError, 4703 "%c requires int or char"); 4704 return -1; 4705} 4706 4707/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...) 
4708 4709 FORMATBUFLEN is the length of the buffer in which the floats, ints, & 4710 chars are formatted. XXX This is a magic number. Each formatting 4711 routine does bounds checking to ensure no overflow, but a better 4712 solution may be to malloc a buffer of appropriate size for each 4713 format. For now, the current solution is sufficient. 4714*/ 4715#define FORMATBUFLEN (size_t)120 4716 4717PyObject *PyUnicode_Format(PyObject *format, 4718 PyObject *args) 4719{ 4720 Py_UNICODE *fmt, *res; 4721 int fmtcnt, rescnt, reslen, arglen, argidx; 4722 int args_owned = 0; 4723 PyUnicodeObject *result = NULL; 4724 PyObject *dict = NULL; 4725 PyObject *uformat; 4726 4727 if (format == NULL || args == NULL) { 4728 PyErr_BadInternalCall(); 4729 return NULL; 4730 } 4731 uformat = PyUnicode_FromObject(format); 4732 if (uformat == NULL) 4733 return NULL; 4734 fmt = PyUnicode_AS_UNICODE(uformat); 4735 fmtcnt = PyUnicode_GET_SIZE(uformat); 4736 4737 reslen = rescnt = fmtcnt + 100; 4738 result = _PyUnicode_New(reslen); 4739 if (result == NULL) 4740 goto onError; 4741 res = PyUnicode_AS_UNICODE(result); 4742 4743 if (PyTuple_Check(args)) { 4744 arglen = PyTuple_Size(args); 4745 argidx = 0; 4746 } 4747 else { 4748 arglen = -1; 4749 argidx = -2; 4750 } 4751 if (args->ob_type->tp_as_mapping) 4752 dict = args; 4753 4754 while (--fmtcnt >= 0) { 4755 if (*fmt != '%') { 4756 if (--rescnt < 0) { 4757 rescnt = fmtcnt + 100; 4758 reslen += rescnt; 4759 if (_PyUnicode_Resize(result, reslen) < 0) 4760 return NULL; 4761 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt; 4762 --rescnt; 4763 } 4764 *res++ = *fmt++; 4765 } 4766 else { 4767 /* Got a format specifier */ 4768 int flags = 0; 4769 int width = -1; 4770 int prec = -1; 4771 int size = 0; 4772 Py_UNICODE c = '\0'; 4773 Py_UNICODE fill; 4774 PyObject *v = NULL; 4775 PyObject *temp = NULL; 4776 Py_UNICODE *pbuf; 4777 Py_UNICODE sign; 4778 int len; 4779 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */ 4780 4781 
fmt++; 4782 if (*fmt == '(') { 4783 Py_UNICODE *keystart; 4784 int keylen; 4785 PyObject *key; 4786 int pcount = 1; 4787 4788 if (dict == NULL) { 4789 PyErr_SetString(PyExc_TypeError, 4790 "format requires a mapping"); 4791 goto onError; 4792 } 4793 ++fmt; 4794 --fmtcnt; 4795 keystart = fmt; 4796 /* Skip over balanced parentheses */ 4797 while (pcount > 0 && --fmtcnt >= 0) { 4798 if (*fmt == ')') 4799 --pcount; 4800 else if (*fmt == '(') 4801 ++pcount; 4802 fmt++; 4803 } 4804 keylen = fmt - keystart - 1; 4805 if (fmtcnt < 0 || pcount > 0) { 4806 PyErr_SetString(PyExc_ValueError, 4807 "incomplete format key"); 4808 goto onError; 4809 } 4810 /* keys are converted to strings using UTF-8 and 4811 then looked up since Python uses strings to hold 4812 variables names etc. in its namespaces and we 4813 wouldn't want to break common idioms. */ 4814 key = PyUnicode_EncodeUTF8(keystart, 4815 keylen, 4816 NULL); 4817 if (key == NULL) 4818 goto onError; 4819 if (args_owned) { 4820 Py_DECREF(args); 4821 args_owned = 0; 4822 } 4823 args = PyObject_GetItem(dict, key); 4824 Py_DECREF(key); 4825 if (args == NULL) { 4826 goto onError; 4827 } 4828 args_owned = 1; 4829 arglen = -1; 4830 argidx = -2; 4831 } 4832 while (--fmtcnt >= 0) { 4833 switch (c = *fmt++) { 4834 case '-': flags |= F_LJUST; continue; 4835 case '+': flags |= F_SIGN; continue; 4836 case ' ': flags |= F_BLANK; continue; 4837 case '#': flags |= F_ALT; continue; 4838 case '0': flags |= F_ZERO; continue; 4839 } 4840 break; 4841 } 4842 if (c == '*') { 4843 v = getnextarg(args, arglen, &argidx); 4844 if (v == NULL) 4845 goto onError; 4846 if (!PyInt_Check(v)) { 4847 PyErr_SetString(PyExc_TypeError, 4848 "* wants int"); 4849 goto onError; 4850 } 4851 width = PyInt_AsLong(v); 4852 if (width < 0) { 4853 flags |= F_LJUST; 4854 width = -width; 4855 } 4856 if (--fmtcnt >= 0) 4857 c = *fmt++; 4858 } 4859 else if (c >= '0' && c <= '9') { 4860 width = c - '0'; 4861 while (--fmtcnt >= 0) { 4862 c = *fmt++; 4863 if (c < '0' || c > 
'9') 4864 break; 4865 if ((width*10) / 10 != width) { 4866 PyErr_SetString(PyExc_ValueError, 4867 "width too big"); 4868 goto onError; 4869 } 4870 width = width*10 + (c - '0'); 4871 } 4872 } 4873 if (c == '.') { 4874 prec = 0; 4875 if (--fmtcnt >= 0) 4876 c = *fmt++; 4877 if (c == '*') { 4878 v = getnextarg(args, arglen, &argidx); 4879 if (v == NULL) 4880 goto onError; 4881 if (!PyInt_Check(v)) { 4882 PyErr_SetString(PyExc_TypeError, 4883 "* wants int"); 4884 goto onError; 4885 } 4886 prec = PyInt_AsLong(v); 4887 if (prec < 0) 4888 prec = 0; 4889 if (--fmtcnt >= 0) 4890 c = *fmt++; 4891 } 4892 else if (c >= '0' && c <= '9') { 4893 prec = c - '0'; 4894 while (--fmtcnt >= 0) { 4895 c = Py_CHARMASK(*fmt++); 4896 if (c < '0' || c > '9') 4897 break; 4898 if ((prec*10) / 10 != prec) { 4899 PyErr_SetString(PyExc_ValueError, 4900 "prec too big"); 4901 goto onError; 4902 } 4903 prec = prec*10 + (c - '0'); 4904 } 4905 } 4906 } /* prec */ 4907 if (fmtcnt >= 0) { 4908 if (c == 'h' || c == 'l' || c == 'L') { 4909 size = c; 4910 if (--fmtcnt >= 0) 4911 c = *fmt++; 4912 } 4913 } 4914 if (fmtcnt < 0) { 4915 PyErr_SetString(PyExc_ValueError, 4916 "incomplete format"); 4917 goto onError; 4918 } 4919 if (c != '%') { 4920 v = getnextarg(args, arglen, &argidx); 4921 if (v == NULL) 4922 goto onError; 4923 } 4924 sign = 0; 4925 fill = ' '; 4926 switch (c) { 4927 4928 case '%': 4929 pbuf = formatbuf; 4930 /* presume that buffer length is at least 1 */ 4931 pbuf[0] = '%'; 4932 len = 1; 4933 break; 4934 4935 case 's': 4936 case 'r': 4937 if (PyUnicode_Check(v) && c == 's') { 4938 temp = v; 4939 Py_INCREF(temp); 4940 } 4941 else { 4942 PyObject *unicode; 4943 if (c == 's') 4944 temp = PyObject_Str(v); 4945 else 4946 temp = PyObject_Repr(v); 4947 if (temp == NULL) 4948 goto onError; 4949 if (!PyString_Check(temp)) { 4950 /* XXX Note: this should never happen, since 4951 PyObject_Repr() and PyObject_Str() assure 4952 this */ 4953 Py_DECREF(temp); 4954 PyErr_SetString(PyExc_TypeError, 4955 "%s 
argument has non-string str()"); 4956 goto onError; 4957 } 4958 unicode = PyUnicode_Decode(PyString_AS_STRING(temp), 4959 PyString_GET_SIZE(temp), 4960 NULL, 4961 "strict"); 4962 Py_DECREF(temp); 4963 temp = unicode; 4964 if (temp == NULL) 4965 goto onError; 4966 } 4967 pbuf = PyUnicode_AS_UNICODE(temp); 4968 len = PyUnicode_GET_SIZE(temp); 4969 if (prec >= 0 && len > prec) 4970 len = prec; 4971 break; 4972 4973 case 'i': 4974 case 'd': 4975 case 'u': 4976 case 'o': 4977 case 'x': 4978 case 'X': 4979 if (c == 'i') 4980 c = 'd'; 4981 pbuf = formatbuf; 4982 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), 4983 flags, prec, c, v); 4984 if (len < 0) 4985 goto onError; 4986 sign = (c == 'd'); 4987 if (flags & F_ZERO) { 4988 fill = '0'; 4989 if ((flags&F_ALT) && 4990 (c == 'x' || c == 'X') && 4991 pbuf[0] == '0' && pbuf[1] == c) { 4992 *res++ = *pbuf++; 4993 *res++ = *pbuf++; 4994 rescnt -= 2; 4995 len -= 2; 4996 width -= 2; 4997 if (width < 0) 4998 width = 0; 4999 } 5000 } 5001 break; 5002 5003 case 'e': 5004 case 'E': 5005 case 'f': 5006 case 'g': 5007 case 'G': 5008 pbuf = formatbuf; 5009 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), 5010 flags, prec, c, v); 5011 if (len < 0) 5012 goto onError; 5013 sign = 1; 5014 if (flags&F_ZERO) 5015 fill = '0'; 5016 break; 5017 5018 case 'c': 5019 pbuf = formatbuf; 5020 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v); 5021 if (len < 0) 5022 goto onError; 5023 break; 5024 5025 default: 5026 PyErr_Format(PyExc_ValueError, 5027 "unsupported format character '%c' (0x%x)", 5028 c, c); 5029 goto onError; 5030 } 5031 if (sign) { 5032 if (*pbuf == '-' || *pbuf == '+') { 5033 sign = *pbuf++; 5034 len--; 5035 } 5036 else if (flags & F_SIGN) 5037 sign = '+'; 5038 else if (flags & F_BLANK) 5039 sign = ' '; 5040 else 5041 sign = 0; 5042 } 5043 if (width < len) 5044 width = len; 5045 if (rescnt < width + (sign != 0)) { 5046 reslen -= rescnt; 5047 rescnt = width + fmtcnt + 100; 5048 reslen += rescnt; 
5049 if (_PyUnicode_Resize(result, reslen) < 0) 5050 return NULL; 5051 res = PyUnicode_AS_UNICODE(result) 5052 + reslen - rescnt; 5053 } 5054 if (sign) { 5055 if (fill != ' ') 5056 *res++ = sign; 5057 rescnt--; 5058 if (width > len) 5059 width--; 5060 } 5061 if (width > len && !(flags & F_LJUST)) { 5062 do { 5063 --rescnt; 5064 *res++ = fill; 5065 } while (--width > len); 5066 } 5067 if (sign && fill == ' ') 5068 *res++ = sign; 5069 memcpy(res, pbuf, len * sizeof(Py_UNICODE)); 5070 res += len; 5071 rescnt -= len; 5072 while (--width >= len) { 5073 --rescnt; 5074 *res++ = ' '; 5075 } 5076 if (dict && (argidx < arglen) && c != '%') { 5077 PyErr_SetString(PyExc_TypeError, 5078 "not all arguments converted"); 5079 goto onError; 5080 } 5081 Py_XDECREF(temp); 5082 } /* '%' */ 5083 } /* until end */ 5084 if (argidx < arglen && !dict) { 5085 PyErr_SetString(PyExc_TypeError, 5086 "not all arguments converted"); 5087 goto onError; 5088 } 5089 5090 if (args_owned) { 5091 Py_DECREF(args); 5092 } 5093 Py_DECREF(uformat); 5094 if (_PyUnicode_Resize(result, reslen - rescnt)) 5095 goto onError; 5096 return (PyObject *)result; 5097 5098 onError: 5099 Py_XDECREF(result); 5100 Py_DECREF(uformat); 5101 if (args_owned) { 5102 Py_DECREF(args); 5103 } 5104 return NULL; 5105} 5106 5107static PyBufferProcs unicode_as_buffer = { 5108 (getreadbufferproc) unicode_buffer_getreadbuf, 5109 (getwritebufferproc) unicode_buffer_getwritebuf, 5110 (getsegcountproc) unicode_buffer_getsegcount, 5111 (getcharbufferproc) unicode_buffer_getcharbuf, 5112}; 5113 5114PyTypeObject PyUnicode_Type = { 5115 PyObject_HEAD_INIT(&PyType_Type) 5116 0, /* ob_size */ 5117 "unicode", /* tp_name */ 5118 sizeof(PyUnicodeObject), /* tp_size */ 5119 0, /* tp_itemsize */ 5120 /* Slots */ 5121 (destructor)_PyUnicode_Free, /* tp_dealloc */ 5122 0, /* tp_print */ 5123 (getattrfunc)unicode_getattr, /* tp_getattr */ 5124 0, /* tp_setattr */ 5125 (cmpfunc) unicode_compare, /* tp_compare */ 5126 (reprfunc) unicode_repr, /* tp_repr 
*/ 5127 0, /* tp_as_number */ 5128 &unicode_as_sequence, /* tp_as_sequence */ 5129 0, /* tp_as_mapping */ 5130 (hashfunc) unicode_hash, /* tp_hash*/ 5131 0, /* tp_call*/ 5132 (reprfunc) unicode_str, /* tp_str */ 5133 (getattrofunc) NULL, /* tp_getattro */ 5134 (setattrofunc) NULL, /* tp_setattro */ 5135 &unicode_as_buffer, /* tp_as_buffer */ 5136 Py_TPFLAGS_DEFAULT, /* tp_flags */ 5137}; 5138 5139/* Initialize the Unicode implementation */ 5140 5141void _PyUnicode_Init(void) 5142{ 5143 /* Doublecheck the configuration... */ 5144 if (sizeof(Py_UNICODE) != 2) 5145 Py_FatalError("Unicode configuration error: " 5146 "sizeof(Py_UNICODE) != 2 bytes"); 5147 5148 /* Init the implementation */ 5149 unicode_freelist = NULL; 5150 unicode_freelist_size = 0; 5151 unicode_empty = _PyUnicode_New(0); 5152 strcpy(unicode_default_encoding, "ascii"); 5153} 5154 5155/* Finalize the Unicode implementation */ 5156 5157void 5158_PyUnicode_Fini(void) 5159{ 5160 PyUnicodeObject *u = unicode_freelist; 5161 5162 while (u != NULL) { 5163 PyUnicodeObject *v = u; 5164 u = *(PyUnicodeObject **)u; 5165 if (v->str) 5166 PyMem_DEL(v->str); 5167 Py_XDECREF(v->defenc); 5168 PyObject_DEL(v); 5169 } 5170 unicode_freelist = NULL; 5171 unicode_freelist_size = 0; 5172 Py_XDECREF(unicode_empty); 5173 unicode_empty = NULL; 5174} 5175