unicodeobject.c revision ec233e58038b222ec4cedc07ec46bed1f40468d7
1/* 2 3Unicode implementation based on original code by Fredrik Lundh, 4modified by Marc-Andre Lemburg <mal@lemburg.com> according to the 5Unicode Integration Proposal (see file Misc/unicode.txt). 6 7Copyright (c) Corporation for National Research Initiatives. 8 9 10 Original header: 11 -------------------------------------------------------------------- 12 13 * Yet another Unicode string type for Python. This type supports the 14 * 16-bit Basic Multilingual Plane (BMP) only. 15 * 16 * Note that this string class supports embedded NULL characters. End 17 * of string is given by the length attribute. However, the internal 18 * representation always stores a trailing NULL to make it easier to 19 * use unicode strings with standard APIs. 20 * 21 * History: 22 * 1999-01-23 fl Created 23 * 1999-01-24 fl Added split, join, capwords; basic UTF-8 support 24 * 1999-01-24 fl Basic UCS-2 support, buffer interface, etc. 25 * 1999-03-06 fl Moved declarations to separate file, etc. 26 * 1999-06-13 fl Changed join method semantics according to Tim's proposal 27 * 1999-08-10 fl Some minor tweaks 28 * 29 * Written by Fredrik Lundh, January 1999. 30 * 31 * Copyright (c) 1999 by Secret Labs AB. 32 * Copyright (c) 1999 by Fredrik Lundh. 33 * 34 * fredrik@pythonware.com 35 * http://www.pythonware.com 36 * 37 * -------------------------------------------------------------------- 38 * This Unicode String Type is 39 * 40 * Copyright (c) 1999 by Secret Labs AB 41 * Copyright (c) 1999 by Fredrik Lundh 42 * 43 * By obtaining, using, and/or copying this software and/or its 44 * associated documentation, you agree that you have read, understood, 45 * and will comply with the following terms and conditions: 46 * 47 * Permission to use, copy, modify, and distribute this software and its 48 * associated documentation for any purpose and without fee is hereby 49 * granted, provided that the above copyright notice appears in all 50 * copies, and that both that copyright notice and this permission notice 51 * appear in supporting documentation, and that the name of Secret Labs 52 * AB or the author not be used in advertising or publicity pertaining to 53 * distribution of the software without specific, written prior 54 * permission. 55 * 56 * SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO 57 * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND 58 * FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR 59 * ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 60 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 61 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT 62 * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 63 * -------------------------------------------------------------------- */ 64 65#include "Python.h" 66 67#include "unicodeobject.h" 68#include "ucnhash.h" 69 70#ifdef MS_WIN32 71#include <windows.h> 72#endif 73 74/* Limit for the Unicode object free list */ 75 76#define MAX_UNICODE_FREELIST_SIZE 1024 77 78/* Limit for the Unicode object free list stay alive optimization. 79 80 The implementation will keep allocated Unicode memory intact for 81 all objects on the free list having a size less than this 82 limit. This reduces malloc() overhead for small Unicode objects. 83 84 At worst this will result in MAX_UNICODE_FREELIST_SIZE * 85 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT + 86 malloc()-overhead) bytes of unused garbage. 87 88 Setting the limit to 0 effectively turns the feature off. 89 90 Note: This is an experimental feature ! If you get core dumps when 91 using Unicode objects, turn this feature off. 92 93*/ 94 95#define KEEPALIVE_SIZE_LIMIT 9 96 97/* Endianness switches; defaults to little endian */ 98 99#ifdef WORDS_BIGENDIAN 100# define BYTEORDER_IS_BIG_ENDIAN 101#else 102# define BYTEORDER_IS_LITTLE_ENDIAN 103#endif 104 105/* --- Globals ------------------------------------------------------------ 106 107 The globals are initialized by the _PyUnicode_Init() API and should 108 not be used before calling that API. 109 110*/ 111 112/* The empty Unicode object */ 113static PyUnicodeObject *unicode_empty; 114 115/* Free list for Unicode objects */ 116static PyUnicodeObject *unicode_freelist; 117static int unicode_freelist_size; 118 119/* Default encoding to use and assume when NULL is passed as encoding 120 parameter; it is initialized by _PyUnicode_Init(). 121 122 Always use the PyUnicode_SetDefaultEncoding() and 123 PyUnicode_GetDefaultEncoding() APIs to access this global. 124 125*/ 126 127static char unicode_default_encoding[100]; 128 129/* --- Unicode Object ----------------------------------------------------- */ 130 131static 132int _PyUnicode_Resize(register PyUnicodeObject *unicode, 133 int length) 134{ 135 void *oldstr; 136 137 /* Shortcut if there's nothing much to do. */ 138 if (unicode->length == length) 139 goto reset; 140 141 /* Resizing unicode_empty is not allowed. */ 142 if (unicode == unicode_empty) { 143 PyErr_SetString(PyExc_SystemError, 144 "can't resize empty unicode object"); 145 return -1; 146 } 147 148 /* We allocate one more byte to make sure the string is 149 Ux0000 terminated -- XXX is this needed ? */ 150 oldstr = unicode->str; 151 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1); 152 if (!unicode->str) { 153 unicode->str = oldstr; 154 PyErr_NoMemory(); 155 return -1; 156 } 157 unicode->str[length] = 0; 158 unicode->length = length; 159 160 reset: 161 /* Reset the object caches */ 162 if (unicode->defenc) { 163 Py_DECREF(unicode->defenc); 164 unicode->defenc = NULL; 165 } 166 unicode->hash = -1; 167 168 return 0; 169} 170 171int PyUnicode_Resize(PyObject **unicode, 172 int length) 173{ 174 PyUnicodeObject *v; 175 176 if (unicode == NULL) { 177 PyErr_BadInternalCall(); 178 return -1; 179 } 180 v = (PyUnicodeObject *)*unicode; 181 if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1) { 182 PyErr_BadInternalCall(); 183 return -1; 184 } 185 return _PyUnicode_Resize(v, length); 186} 187 188/* We allocate one more byte to make sure the string is 189 Ux0000 terminated -- XXX is this needed ? 190 191 XXX This allocator could further be enhanced by assuring that the 192 free list never reduces its size below 1. 193 194*/ 195 196static 197PyUnicodeObject *_PyUnicode_New(int length) 198{ 199 register PyUnicodeObject *unicode; 200 201 /* Optimization for empty strings */ 202 if (length == 0 && unicode_empty != NULL) { 203 Py_INCREF(unicode_empty); 204 return unicode_empty; 205 } 206 207 /* Unicode freelist & memory allocation */ 208 if (unicode_freelist) { 209 unicode = unicode_freelist; 210 unicode_freelist = *(PyUnicodeObject **)unicode; 211 unicode_freelist_size--; 212 if (unicode->str) { 213 /* Keep-Alive optimization: we only upsize the buffer, 214 never downsize it. */ 215 if ((unicode->length < length) && 216 _PyUnicode_Resize(unicode, length)) { 217 PyMem_DEL(unicode->str); 218 goto onError; 219 } 220 } 221 else { 222 unicode->str = PyMem_NEW(Py_UNICODE, length + 1); 223 } 224 PyObject_INIT(unicode, &PyUnicode_Type); 225 } 226 else { 227 unicode = PyObject_NEW(PyUnicodeObject, &PyUnicode_Type); 228 if (unicode == NULL) 229 return NULL; 230 unicode->str = PyMem_NEW(Py_UNICODE, length + 1); 231 } 232 233 if (!unicode->str) { 234 PyErr_NoMemory(); 235 goto onError; 236 } 237 unicode->str[length] = 0; 238 unicode->length = length; 239 unicode->hash = -1; 240 unicode->defenc = NULL; 241 return unicode; 242 243 onError: 244 _Py_ForgetReference((PyObject *)unicode); 245 PyObject_DEL(unicode); 246 return NULL; 247} 248 249static 250void _PyUnicode_Free(register PyUnicodeObject *unicode) 251{ 252 if (unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) { 253 /* Keep-Alive optimization */ 254 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) { 255 PyMem_DEL(unicode->str); 256 unicode->str = NULL; 257 unicode->length = 0; 258 } 259 if (unicode->defenc) { 260 Py_DECREF(unicode->defenc); 261 unicode->defenc = NULL; 262 } 263 /* Add to free list */ 264 *(PyUnicodeObject **)unicode = unicode_freelist; 265 unicode_freelist = unicode; 266 unicode_freelist_size++; 267 } 268 else { 269 PyMem_DEL(unicode->str); 270 Py_XDECREF(unicode->defenc); 271 PyObject_DEL(unicode); 272 } 273} 274 275PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u, 276 int size) 277{ 278 PyUnicodeObject *unicode; 279 280 unicode = _PyUnicode_New(size); 281 if (!unicode) 282 return NULL; 283 284 /* Copy the Unicode data into the new object */ 285 if (u != NULL) 286 memcpy(unicode->str, u, size * sizeof(Py_UNICODE)); 287 288 return (PyObject *)unicode; 289} 290 291#ifdef HAVE_WCHAR_H 292 293PyObject *PyUnicode_FromWideChar(register const wchar_t *w, 294 int size) 295{ 296 PyUnicodeObject *unicode; 297 298 if (w == NULL) { 299 PyErr_BadInternalCall(); 300 return NULL; 301 } 302 303 unicode = _PyUnicode_New(size); 304 if (!unicode) 305 return NULL; 306 307 /* Copy the wchar_t data into the new object */ 308#ifdef HAVE_USABLE_WCHAR_T 309 memcpy(unicode->str, w, size * sizeof(wchar_t)); 310#else 311 { 312 register Py_UNICODE *u; 313 register int i; 314 u = PyUnicode_AS_UNICODE(unicode); 315 for (i = size; i >= 0; i--) 316 *u++ = *w++; 317 } 318#endif 319 320 return (PyObject *)unicode; 321} 322 323int PyUnicode_AsWideChar(PyUnicodeObject *unicode, 324 register wchar_t *w, 325 int size) 326{ 327 if (unicode == NULL) { 328 PyErr_BadInternalCall(); 329 return -1; 330 } 331 if (size > PyUnicode_GET_SIZE(unicode)) 332 size = PyUnicode_GET_SIZE(unicode); 333#ifdef HAVE_USABLE_WCHAR_T 334 memcpy(w, unicode->str, size * sizeof(wchar_t)); 335#else 336 { 337 register Py_UNICODE *u; 338 register int i; 339 u = PyUnicode_AS_UNICODE(unicode); 340 for (i = size; i >= 0; i--) 341 *w++ = *u++; 342 } 343#endif 344 345 return size; 346} 347 348#endif 349 350PyObject *PyUnicode_FromObject(register PyObject *obj) 351{ 352 return PyUnicode_FromEncodedObject(obj, NULL, "strict"); 353} 354 355PyObject *PyUnicode_FromEncodedObject(register PyObject *obj, 356 const char *encoding, 357 const char *errors) 358{ 359 const char *s; 360 int len; 361 int owned = 0; 362 PyObject *v; 363 364 if (obj == NULL) { 365 PyErr_BadInternalCall(); 366 return NULL; 367 } 368 369 /* Coerce object */ 370 if (PyInstance_Check(obj)) { 371 PyObject *func; 372 func = PyObject_GetAttrString(obj, "__str__"); 373 if (func == NULL) { 374 PyErr_SetString(PyExc_TypeError, 375 "coercing to Unicode: instance doesn't define __str__"); 376 return NULL; 377 } 378 obj = PyEval_CallObject(func, NULL); 379 Py_DECREF(func); 380 if (obj == NULL) 381 return NULL; 382 owned = 1; 383 } 384 if (PyUnicode_Check(obj)) { 385 Py_INCREF(obj); 386 v = obj; 387 if (encoding) { 388 PyErr_SetString(PyExc_TypeError, 389 "decoding Unicode is not supported"); 390 return NULL; 391 } 392 goto done; 393 } 394 else if (PyString_Check(obj)) { 395 s = PyString_AS_STRING(obj); 396 len = PyString_GET_SIZE(obj); 397 } 398 else if (PyObject_AsCharBuffer(obj, &s, &len)) { 399 /* Overwrite the error message with something more useful in 400 case of a TypeError. */ 401 if (PyErr_ExceptionMatches(PyExc_TypeError)) 402 PyErr_Format(PyExc_TypeError, 403 "coercing to Unicode: need string or buffer, " 404 "%.80s found", 405 obj->ob_type->tp_name); 406 goto onError; 407 } 408 409 /* Convert to Unicode */ 410 if (len == 0) { 411 Py_INCREF(unicode_empty); 412 v = (PyObject *)unicode_empty; 413 } 414 else 415 v = PyUnicode_Decode(s, len, encoding, errors); 416 done: 417 if (owned) { 418 Py_DECREF(obj); 419 } 420 return v; 421 422 onError: 423 if (owned) { 424 Py_DECREF(obj); 425 } 426 return NULL; 427} 428 429PyObject *PyUnicode_Decode(const char *s, 430 int size, 431 const char *encoding, 432 const char *errors) 433{ 434 PyObject *buffer = NULL, *unicode; 435 436 if (encoding == NULL) 437 encoding = PyUnicode_GetDefaultEncoding(); 438 439 /* Shortcuts for common default encodings */ 440 if (strcmp(encoding, "utf-8") == 0) 441 return PyUnicode_DecodeUTF8(s, size, errors); 442 else if (strcmp(encoding, "latin-1") == 0) 443 return PyUnicode_DecodeLatin1(s, size, errors); 444 else if (strcmp(encoding, "ascii") == 0) 445 return PyUnicode_DecodeASCII(s, size, errors); 446 447 /* Decode via the codec registry */ 448 buffer = PyBuffer_FromMemory((void *)s, size); 449 if (buffer == NULL) 450 goto onError; 451 unicode = PyCodec_Decode(buffer, encoding, errors); 452 if (unicode == NULL) 453 goto onError; 454 if (!PyUnicode_Check(unicode)) { 455 PyErr_Format(PyExc_TypeError, 456 "decoder did not return an unicode object (type=%.400s)", 457 unicode->ob_type->tp_name); 458 Py_DECREF(unicode); 459 goto onError; 460 } 461 Py_DECREF(buffer); 462 return unicode; 463 464 onError: 465 Py_XDECREF(buffer); 466 return NULL; 467} 468 469PyObject *PyUnicode_Encode(const Py_UNICODE *s, 470 int size, 471 const char *encoding, 472 const char *errors) 473{ 474 PyObject *v, *unicode; 475 476 unicode = PyUnicode_FromUnicode(s, size); 477 if (unicode == NULL) 478 return NULL; 479 v = PyUnicode_AsEncodedString(unicode, encoding, errors); 480 Py_DECREF(unicode); 481 return v; 482} 483 484PyObject *PyUnicode_AsEncodedString(PyObject *unicode, 485 const char *encoding, 486 const char *errors) 487{ 488 PyObject *v; 489 490 if (!PyUnicode_Check(unicode)) { 491 PyErr_BadArgument(); 492 goto onError; 493 } 494 495 if (encoding == NULL) 496 encoding = PyUnicode_GetDefaultEncoding(); 497 498 /* Shortcuts for common default encodings */ 499 if (errors == NULL) { 500 if (strcmp(encoding, "utf-8") == 0) 501 return PyUnicode_AsUTF8String(unicode); 502 else if (strcmp(encoding, "latin-1") == 0) 503 return PyUnicode_AsLatin1String(unicode); 504 else if (strcmp(encoding, "ascii") == 0) 505 return PyUnicode_AsASCIIString(unicode); 506 } 507 508 /* Encode via the codec registry */ 509 v = PyCodec_Encode(unicode, encoding, errors); 510 if (v == NULL) 511 goto onError; 512 /* XXX Should we really enforce this ? */ 513 if (!PyString_Check(v)) { 514 PyErr_Format(PyExc_TypeError, 515 "encoder did not return a string object (type=%.400s)", 516 v->ob_type->tp_name); 517 Py_DECREF(v); 518 goto onError; 519 } 520 return v; 521 522 onError: 523 return NULL; 524} 525 526/* Return a Python string holding the default encoded value of the 527 Unicode object. 528 529 The resulting string is cached in the Unicode object for subsequent 530 usage by this function. The cached version is needed to implement 531 the character buffer interface and will live (at least) as long as 532 the Unicode object itself. 533 534 The refcount of the string is *not* incremented. 535 536 *** Exported for internal use by the interpreter only !!! *** 537 538*/ 539 540PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode, 541 const char *errors) 542{ 543 PyObject *v = ((PyUnicodeObject *)unicode)->defenc; 544 545 if (v) 546 return v; 547 v = PyUnicode_AsEncodedString(unicode, NULL, errors); 548 if (v && errors == NULL) 549 ((PyUnicodeObject *)unicode)->defenc = v; 550 return v; 551} 552 553Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode) 554{ 555 if (!PyUnicode_Check(unicode)) { 556 PyErr_BadArgument(); 557 goto onError; 558 } 559 return PyUnicode_AS_UNICODE(unicode); 560 561 onError: 562 return NULL; 563} 564 565int PyUnicode_GetSize(PyObject *unicode) 566{ 567 if (!PyUnicode_Check(unicode)) { 568 PyErr_BadArgument(); 569 goto onError; 570 } 571 return PyUnicode_GET_SIZE(unicode); 572 573 onError: 574 return -1; 575} 576 577const char *PyUnicode_GetDefaultEncoding(void) 578{ 579 return unicode_default_encoding; 580} 581 582int PyUnicode_SetDefaultEncoding(const char *encoding) 583{ 584 PyObject *v; 585 586 /* Make sure the encoding is valid. As side effect, this also 587 loads the encoding into the codec registry cache. */ 588 v = _PyCodec_Lookup(encoding); 589 if (v == NULL) 590 goto onError; 591 Py_DECREF(v); 592 strncpy(unicode_default_encoding, 593 encoding, 594 sizeof(unicode_default_encoding)); 595 return 0; 596 597 onError: 598 return -1; 599} 600 601/* --- UTF-8 Codec -------------------------------------------------------- */ 602 603static 604char utf8_code_length[256] = { 605 /* Map UTF-8 encoded prefix byte to sequence length. zero means 606 illegal prefix. see RFC 2279 for details */ 607 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 608 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 609 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 610 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 611 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 612 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 613 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 614 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 615 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 616 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 617 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 618 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 619 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 620 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 621 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 622 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0 623}; 624 625static 626int utf8_decoding_error(const char **source, 627 Py_UNICODE **dest, 628 const char *errors, 629 const char *details) 630{ 631 if ((errors == NULL) || 632 (strcmp(errors,"strict") == 0)) { 633 PyErr_Format(PyExc_UnicodeError, 634 "UTF-8 decoding error: %.400s", 635 details); 636 return -1; 637 } 638 else if (strcmp(errors,"ignore") == 0) { 639 (*source)++; 640 return 0; 641 } 642 else if (strcmp(errors,"replace") == 0) { 643 (*source)++; 644 **dest = Py_UNICODE_REPLACEMENT_CHARACTER; 645 (*dest)++; 646 return 0; 647 } 648 else { 649 PyErr_Format(PyExc_ValueError, 650 "UTF-8 decoding error; unknown error handling code: %.400s", 651 errors); 652 return -1; 653 } 654} 655 656PyObject *PyUnicode_DecodeUTF8(const char *s, 657 int size, 658 const char *errors) 659{ 660 int n; 661 const char *e; 662 PyUnicodeObject *unicode; 663 Py_UNICODE *p; 664 const char *errmsg = ""; 665 666 /* Note: size will always be longer than the resulting Unicode 667 character count */ 668 unicode = _PyUnicode_New(size); 669 if (!unicode) 670 return NULL; 671 if (size == 0) 672 return (PyObject *)unicode; 673 674 /* Unpack UTF-8 encoded data */ 675 p = unicode->str; 676 e = s + size; 677 678 while (s < e) { 679 Py_UCS4 ch = (unsigned char)*s; 680 681 if (ch < 0x80) { 682 *p++ = (Py_UNICODE)ch; 683 s++; 684 continue; 685 } 686 687 n = utf8_code_length[ch]; 688 689 if (s + n > e) { 690 errmsg = "unexpected end of data"; 691 goto utf8Error; 692 } 693 694 switch (n) { 695 696 case 0: 697 errmsg = "unexpected code byte"; 698 goto utf8Error; 699 break; 700 701 case 1: 702 errmsg = "internal error"; 703 goto utf8Error; 704 break; 705 706 case 2: 707 if ((s[1] & 0xc0) != 0x80) { 708 errmsg = "invalid data"; 709 goto utf8Error; 710 } 711 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f); 712 if (ch < 0x80) { 713 errmsg = "illegal encoding"; 714 goto utf8Error; 715 } 716 else 717 *p++ = (Py_UNICODE)ch; 718 break; 719 720 case 3: 721 if ((s[1] & 0xc0) != 0x80 || 722 (s[2] & 0xc0) != 0x80) { 723 errmsg = "invalid data"; 724 goto utf8Error; 725 } 726 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f); 727 if (ch < 0x800 || (ch >= 0xd800 && ch < 0xe000)) { 728 errmsg = "illegal encoding"; 729 goto utf8Error; 730 } 731 else 732 *p++ = (Py_UNICODE)ch; 733 break; 734 735 case 4: 736 if ((s[1] & 0xc0) != 0x80 || 737 (s[2] & 0xc0) != 0x80 || 738 (s[3] & 0xc0) != 0x80) { 739 errmsg = "invalid data"; 740 goto utf8Error; 741 } 742 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) + 743 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f); 744 /* validate and convert to UTF-16 */ 745 if ((ch < 0x10000) || /* minimum value allowed for 4 746 byte encoding */ 747 (ch > 0x10ffff)) { /* maximum value allowed for 748 UTF-16 */ 749 errmsg = "illegal encoding"; 750 goto utf8Error; 751 } 752 /* compute and append the two surrogates: */ 753 754 /* translate from 10000..10FFFF to 0..FFFF */ 755 ch -= 0x10000; 756 757 /* high surrogate = top 10 bits added to D800 */ 758 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10)); 759 760 /* low surrogate = bottom 10 bits added to DC00 */ 761 *p++ = (Py_UNICODE)(0xDC00 + (ch & ~0xFC00)); 762 break; 763 764 default: 765 /* Other sizes are only needed for UCS-4 */ 766 errmsg = "unsupported Unicode code range"; 767 goto utf8Error; 768 break; 769 } 770 s += n; 771 continue; 772 773 utf8Error: 774 if (utf8_decoding_error(&s, &p, errors, errmsg)) 775 goto onError; 776 } 777 778 /* Adjust length */ 779 if (_PyUnicode_Resize(unicode, p - unicode->str)) 780 goto onError; 781 782 return (PyObject *)unicode; 783 784onError: 785 Py_DECREF(unicode); 786 return NULL; 787} 788 789/* Not used anymore, now that the encoder supports UTF-16 790 surrogates. */ 791#if 0 792static 793int utf8_encoding_error(const Py_UNICODE **source, 794 char **dest, 795 const char *errors, 796 const char *details) 797{ 798 if ((errors == NULL) || 799 (strcmp(errors,"strict") == 0)) { 800 PyErr_Format(PyExc_UnicodeError, 801 "UTF-8 encoding error: %.400s", 802 details); 803 return -1; 804 } 805 else if (strcmp(errors,"ignore") == 0) { 806 return 0; 807 } 808 else if (strcmp(errors,"replace") == 0) { 809 **dest = '?'; 810 (*dest)++; 811 return 0; 812 } 813 else { 814 PyErr_Format(PyExc_ValueError, 815 "UTF-8 encoding error; " 816 "unknown error handling code: %.400s", 817 errors); 818 return -1; 819 } 820} 821#endif 822 823PyObject *PyUnicode_EncodeUTF8(const Py_UNICODE *s, 824 int size, 825 const char *errors) 826{ 827 PyObject *v; 828 char *p; 829 char *q; 830 Py_UCS4 ch2; 831 unsigned int cbAllocated = 3 * size; 832 unsigned int cbWritten = 0; 833 int i = 0; 834 835 v = PyString_FromStringAndSize(NULL, cbAllocated); 836 if (v == NULL) 837 return NULL; 838 if (size == 0) 839 return v; 840 841 p = q = PyString_AS_STRING(v); 842 while (i < size) { 843 Py_UCS4 ch = s[i++]; 844 if (ch < 0x80) { 845 *p++ = (char) ch; 846 cbWritten++; 847 } 848 else if (ch < 0x0800) { 849 *p++ = 0xc0 | (ch >> 6); 850 *p++ = 0x80 | (ch & 0x3f); 851 cbWritten += 2; 852 } 853 else { 854 /* Check for high surrogate */ 855 if (0xD800 <= ch && ch <= 0xDBFF) { 856 if (i != size) { 857 ch2 = s[i]; 858 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) { 859 860 if (cbWritten >= (cbAllocated - 4)) { 861 /* Provide enough room for some more 862 surrogates */ 863 cbAllocated += 4*10; 864 if (_PyString_Resize(&v, cbAllocated)) 865 goto onError; 866 } 867 868 /* combine the two values */ 869 ch = ((ch - 0xD800)<<10 | (ch2-0xDC00))+0x10000; 870 871 *p++ = (char)((ch >> 18) | 0xf0); 872 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f)); 873 i++; 874 cbWritten += 4; 875 } 876 } 877 } 878 else { 879 *p++ = (char)(0xe0 | (ch >> 12)); 880 cbWritten += 3; 881 } 882 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); 883 *p++ = (char)(0x80 | (ch & 0x3f)); 884 } 885 } 886 *p = '\0'; 887 if (_PyString_Resize(&v, p - q)) 888 goto onError; 889 return v; 890 891 onError: 892 Py_DECREF(v); 893 return NULL; 894} 895 896PyObject *PyUnicode_AsUTF8String(PyObject *unicode) 897{ 898 if (!PyUnicode_Check(unicode)) { 899 PyErr_BadArgument(); 900 return NULL; 901 } 902 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode), 903 PyUnicode_GET_SIZE(unicode), 904 NULL); 905} 906 907/* --- UTF-16 Codec ------------------------------------------------------- */ 908 909static 910int utf16_decoding_error(const Py_UNICODE **source, 911 Py_UNICODE **dest, 912 const char *errors, 913 const char *details) 914{ 915 if ((errors == NULL) || 916 (strcmp(errors,"strict") == 0)) { 917 PyErr_Format(PyExc_UnicodeError, 918 "UTF-16 decoding error: %.400s", 919 details); 920 return -1; 921 } 922 else if (strcmp(errors,"ignore") == 0) { 923 return 0; 924 } 925 else if (strcmp(errors,"replace") == 0) { 926 if (dest) { 927 **dest = Py_UNICODE_REPLACEMENT_CHARACTER; 928 (*dest)++; 929 } 930 return 0; 931 } 932 else { 933 PyErr_Format(PyExc_ValueError, 934 "UTF-16 decoding error; " 935 "unknown error handling code: %.400s", 936 errors); 937 return -1; 938 } 939} 940 941PyObject *PyUnicode_DecodeUTF16(const char *s, 942 int size, 943 const char *errors, 944 int *byteorder) 945{ 946 PyUnicodeObject *unicode; 947 Py_UNICODE *p; 948 const Py_UNICODE *q, *e; 949 int bo = 0; 950 const char *errmsg = ""; 951 952 /* size should be an even number */ 953 if (size % sizeof(Py_UNICODE) != 0) { 954 if (utf16_decoding_error(NULL, NULL, errors, "truncated data")) 955 return NULL; 956 /* The remaining input chars are ignored if we fall through 957 here... */ 958 } 959 960 /* Note: size will always be longer than the resulting Unicode 961 character count */ 962 unicode = _PyUnicode_New(size); 963 if (!unicode) 964 return NULL; 965 if (size == 0) 966 return (PyObject *)unicode; 967 968 /* Unpack UTF-16 encoded data */ 969 p = unicode->str; 970 q = (Py_UNICODE *)s; 971 e = q + (size / sizeof(Py_UNICODE)); 972 973 if (byteorder) 974 bo = *byteorder; 975 976 while (q < e) { 977 register Py_UNICODE ch = *q++; 978 979 /* Check for BOM marks (U+FEFF) in the input and adjust 980 current byte order setting accordingly. Swap input 981 bytes if needed. (This assumes sizeof(Py_UNICODE) == 2 982 !) */ 983#ifdef BYTEORDER_IS_LITTLE_ENDIAN 984 if (ch == 0xFEFF) { 985 bo = -1; 986 continue; 987 } else if (ch == 0xFFFE) { 988 bo = 1; 989 continue; 990 } 991 if (bo == 1) 992 ch = (ch >> 8) | (ch << 8); 993#else 994 if (ch == 0xFEFF) { 995 bo = 1; 996 continue; 997 } else if (ch == 0xFFFE) { 998 bo = -1; 999 continue; 1000 } 1001 if (bo == -1) 1002 ch = (ch >> 8) | (ch << 8); 1003#endif 1004 if (ch < 0xD800 || ch > 0xDFFF) { 1005 *p++ = ch; 1006 continue; 1007 } 1008 1009 /* UTF-16 code pair: */ 1010 if (q >= e) { 1011 errmsg = "unexpected end of data"; 1012 goto utf16Error; 1013 } 1014 if (0xDC00 <= *q && *q <= 0xDFFF) { 1015 q++; 1016 if (0xD800 <= *q && *q <= 0xDBFF) { 1017 /* This is valid data (a UTF-16 surrogate pair), but 1018 we are not able to store this information since our 1019 Py_UNICODE type only has 16 bits... this might 1020 change someday, even though it's unlikely. */ 1021 errmsg = "code pairs are not supported"; 1022 goto utf16Error; 1023 } 1024 else 1025 continue; 1026 } 1027 errmsg = "illegal encoding"; 1028 /* Fall through to report the error */ 1029 1030 utf16Error: 1031 if (utf16_decoding_error(&q, &p, errors, errmsg)) 1032 goto onError; 1033 } 1034 1035 if (byteorder) 1036 *byteorder = bo; 1037 1038 /* Adjust length */ 1039 if (_PyUnicode_Resize(unicode, p - unicode->str)) 1040 goto onError; 1041 1042 return (PyObject *)unicode; 1043 1044onError: 1045 Py_DECREF(unicode); 1046 return NULL; 1047} 1048 1049#undef UTF16_ERROR 1050 1051PyObject *PyUnicode_EncodeUTF16(const Py_UNICODE *s, 1052 int size, 1053 const char *errors, 1054 int byteorder) 1055{ 1056 PyObject *v; 1057 Py_UNICODE *p; 1058 char *q; 1059 1060 /* We don't create UTF-16 pairs... */ 1061 v = PyString_FromStringAndSize(NULL, 1062 sizeof(Py_UNICODE) * (size + (byteorder == 0))); 1063 if (v == NULL) 1064 return NULL; 1065 1066 q = PyString_AS_STRING(v); 1067 p = (Py_UNICODE *)q; 1068 if (byteorder == 0) 1069 *p++ = 0xFEFF; 1070 if (size == 0) 1071 return v; 1072 if (byteorder == 0 || 1073#ifdef BYTEORDER_IS_LITTLE_ENDIAN 1074 byteorder == -1 1075#else 1076 byteorder == 1 1077#endif 1078 ) 1079 memcpy(p, s, size * sizeof(Py_UNICODE)); 1080 else 1081 while (size-- > 0) { 1082 Py_UNICODE ch = *s++; 1083 *p++ = (ch >> 8) | (ch << 8); 1084 } 1085 return v; 1086} 1087 1088PyObject *PyUnicode_AsUTF16String(PyObject *unicode) 1089{ 1090 if (!PyUnicode_Check(unicode)) { 1091 PyErr_BadArgument(); 1092 return NULL; 1093 } 1094 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode), 1095 PyUnicode_GET_SIZE(unicode), 1096 NULL, 1097 0); 1098} 1099 1100/* --- Unicode Escape Codec ----------------------------------------------- */ 1101 1102static 1103int unicodeescape_decoding_error(const char **source, 1104 Py_UNICODE *x, 1105 const char *errors, 1106 const char *details) 1107{ 1108 if ((errors == NULL) || 1109 (strcmp(errors,"strict") == 0)) { 1110 PyErr_Format(PyExc_UnicodeError, 1111 "Unicode-Escape decoding error: %.400s", 1112 details); 1113 return -1; 1114 } 1115 else if (strcmp(errors,"ignore") == 0) { 1116 return 0; 1117 } 1118 else if (strcmp(errors,"replace") == 0) { 1119 *x = Py_UNICODE_REPLACEMENT_CHARACTER; 1120 return 0; 1121 } 1122 else { 1123 PyErr_Format(PyExc_ValueError, 1124 "Unicode-Escape decoding error; " 1125 "unknown error handling code: %.400s", 1126 errors); 1127 return -1; 1128 } 1129} 1130 1131static _Py_UCNHashAPI *pucnHash = NULL; 1132 1133static 1134int mystrnicmp(const char *s1, const char *s2, size_t count) 1135{ 1136 char c1, c2; 1137 1138 if (count) 1139 { 1140 do 1141 { 1142 c1 = tolower(*(s1++)); 1143 c2 = tolower(*(s2++)); 1144 } 1145 while(--count && c1 == c2); 1146 1147 return c1 - c2; 1148 } 1149 1150 return 0; 1151} 1152 1153PyObject *PyUnicode_DecodeUnicodeEscape(const char *s, 1154 int size, 1155 const char *errors) 1156{ 1157 PyUnicodeObject *v; 1158 Py_UNICODE *p = NULL, *buf = NULL; 1159 const char *end; 1160 Py_UCS4 chr; 1161 1162 /* Escaped strings will always be longer than the resulting 1163 Unicode string, so we start with size here and then reduce the 1164 length after conversion to the true value. */ 1165 v = _PyUnicode_New(size); 1166 if (v == NULL) 1167 goto onError; 1168 if (size == 0) 1169 return (PyObject *)v; 1170 p = buf = PyUnicode_AS_UNICODE(v); 1171 end = s + size; 1172 while (s < end) { 1173 unsigned char c; 1174 Py_UNICODE x; 1175 int i; 1176 1177 /* Non-escape characters are interpreted as Unicode ordinals */ 1178 if (*s != '\\') { 1179 *p++ = (unsigned char)*s++; 1180 continue; 1181 } 1182 1183 /* \ - Escapes */ 1184 s++; 1185 switch (*s++) { 1186 1187 /* \x escapes */ 1188 case '\n': break; 1189 case '\\': *p++ = '\\'; break; 1190 case '\'': *p++ = '\''; break; 1191 case '\"': *p++ = '\"'; break; 1192 case 'b': *p++ = '\b'; break; 1193 case 'f': *p++ = '\014'; break; /* FF */ 1194 case 't': *p++ = '\t'; break; 1195 case 'n': *p++ = '\n'; break; 1196 case 'r': *p++ = '\r'; break; 1197 case 'v': *p++ = '\013'; break; /* VT */ 1198 case 'a': *p++ = '\007'; break; /* BEL, not classic C */ 1199 1200 /* \OOO (octal) escapes */ 1201 case '0': case '1': case '2': case '3': 1202 case '4': case '5': case '6': case '7': 1203 x = s[-1] - '0'; 1204 if ('0' <= *s && *s <= '7') { 1205 x = (x<<3) + *s++ - '0'; 1206 if ('0' <= *s && *s <= '7') 1207 x = (x<<3) + *s++ - '0'; 1208 } 1209 *p++ = x; 1210 break; 1211 1212 /* \xXX with two hex digits */ 1213 case 'x': 1214 for (x = 0, i = 0; i < 2; i++) { 1215 c = (unsigned char)s[i]; 1216 if (!isxdigit(c)) { 1217 if (unicodeescape_decoding_error(&s, &x, errors, 1218 "truncated \\xXX")) 1219 goto onError; 1220 i++; 1221 break; 1222 } 1223 x = (x<<4) & ~0xF; 1224 if (c >= '0' && c <= '9') 1225 x += c - '0'; 1226 else if (c >= 'a' && c <= 'f') 1227 x += 10 + c - 'a'; 1228 else 1229 x += 10 + c - 'A'; 1230 } 1231 s += i; 1232 *p++ = x; 1233 break; 1234 1235 /* \uXXXX with 4 hex digits */ 1236 case 'u': 1237 for (x = 0, i = 0; i < 4; i++) { 1238 c = (unsigned char)s[i]; 1239 if (!isxdigit(c)) { 1240 if (unicodeescape_decoding_error(&s, &x, errors, 1241 "truncated \\uXXXX")) 1242 goto onError; 1243 i++; 1244 break; 1245 } 1246 x = (x<<4) & ~0xF; 1247 if (c >= '0' && c <= '9') 1248 x += c - '0'; 1249 else if (c >= 'a' && c <= 'f') 1250 x += 10 + c - 'a'; 1251 else 1252 x += 10 + c - 'A'; 1253 } 1254 s += i; 1255 *p++ = x; 1256 break; 1257 1258 /* \UXXXXXXXX with 8 hex digits */ 1259 case 'U': 1260 for (chr = 0, i = 0; i < 8; i++) { 1261 c = (unsigned char)s[i]; 1262 if (!isxdigit(c)) { 1263 if (unicodeescape_decoding_error(&s, &x, errors, 1264 "truncated \\uXXXX")) 1265 goto onError; 1266 i++; 1267 break; 1268 } 1269 chr = (chr<<4) & ~0xF; 1270 if (c >= '0' && c <= '9') 1271 chr += c - '0'; 1272 else if (c >= 'a' && c <= 'f') 1273 chr += 10 + c - 'a'; 1274 else 1275 chr += 10 + c - 'A'; 1276 } 1277 s += i; 1278 goto store; 1279 1280 case 'N': 1281 /* Ok, we need to deal with Unicode Character Names now, 1282 * make sure we've imported the hash table data... 1283 */ 1284 if (pucnHash == NULL) { 1285 PyObject *mod = 0, *v = 0; 1286 mod = PyImport_ImportModule("ucnhash"); 1287 if (mod == NULL) 1288 goto onError; 1289 v = PyObject_GetAttrString(mod,"ucnhashAPI"); 1290 Py_DECREF(mod); 1291 if (v == NULL) 1292 goto onError; 1293 pucnHash = PyCObject_AsVoidPtr(v); 1294 Py_DECREF(v); 1295 if (pucnHash == NULL) 1296 goto onError; 1297 } 1298 1299 if (*s == '{') { 1300 const char *start = s + 1; 1301 const char *endBrace = start; 1302 unsigned long j; 1303 1304 /* look for either the closing brace, or we 1305 * exceed the maximum length of the unicode character names 1306 */ 1307 while (*endBrace != '}' && 1308 (unsigned int)(endBrace - start) <= 1309 pucnHash->cchMax && 1310 endBrace < end) 1311 { 1312 endBrace++; 1313 } 1314 if (endBrace != end && *endBrace == '}') { 1315 j = pucnHash->hash(start, endBrace - start); 1316 if (j > pucnHash->cKeys || 1317 mystrnicmp( 1318 start, 1319 ((_Py_UnicodeCharacterName *) 1320 (pucnHash->getValue(j)))->pszUCN, 1321 (int)(endBrace - start)) != 0) 1322 { 1323 if (unicodeescape_decoding_error( 1324 &s, &x, errors, 1325 "Invalid Unicode Character Name")) 1326 { 1327 goto onError; 1328 } 1329 goto ucnFallthrough; 1330 } 1331 chr = ((_Py_UnicodeCharacterName *) 1332 (pucnHash->getValue(j)))->value; 1333 s = endBrace + 1; 1334 goto store; 1335 } else { 1336 if (unicodeescape_decoding_error( 1337 &s, &x, errors, 1338 "Unicode name missing closing brace")) 1339 goto onError; 1340 goto ucnFallthrough; 1341 } 1342 break; 1343 } 1344 if (unicodeescape_decoding_error( 1345 &s, &x, errors, 1346 "Missing opening brace for Unicode Character Name escape")) 1347 goto onError; 1348ucnFallthrough: 1349 /* fall through on purpose */ 1350 default: 1351 *p++ = '\\'; 1352 *p++ = (unsigned char)s[-1]; 1353 break; 1354store: 1355 /* when we get here, chr is a 32-bit unicode character */ 1356 if (chr <= 0xffff) 1357 /* UCS-2 character */ 1358 *p++ = (Py_UNICODE) chr; 1359 else if (chr <= 0x10ffff) { 1360 /* UCS-4 character. store as two surrogate characters */ 1361 chr -= 0x10000L; 1362 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10); 1363 *p++ = 0xDC00 + (Py_UNICODE) (chr & ~0xFC00); 1364 } else { 1365 if (unicodeescape_decoding_error( 1366 &s, &x, errors, 1367 "Illegal Unicode character") 1368 ) 1369 goto onError; 1370 } 1371 } 1372 } 1373 if (_PyUnicode_Resize(v, (int)(p - buf))) 1374 goto onError; 1375 return (PyObject *)v; 1376 1377 onError: 1378 Py_XDECREF(v); 1379 return NULL; 1380} 1381 1382/* Return a Unicode-Escape string version of the Unicode object. 1383 1384 If quotes is true, the string is enclosed in u"" or u'' quotes as 1385 appropriate. 1386 1387*/ 1388 1389static const Py_UNICODE *findchar(const Py_UNICODE *s, 1390 int size, 1391 Py_UNICODE ch); 1392 1393static 1394PyObject *unicodeescape_string(const Py_UNICODE *s, 1395 int size, 1396 int quotes) 1397{ 1398 PyObject *repr; 1399 char *p; 1400 char *q; 1401 1402 static const char *hexdigit = "0123456789ABCDEF"; 1403 1404 repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1); 1405 if (repr == NULL) 1406 return NULL; 1407 1408 p = q = PyString_AS_STRING(repr); 1409 1410 if (quotes) { 1411 *p++ = 'u'; 1412 *p++ = (findchar(s, size, '\'') && 1413 !findchar(s, size, '"')) ? '"' : '\''; 1414 } 1415 while (size-- > 0) { 1416 Py_UNICODE ch = *s++; 1417 /* Escape quotes */ 1418 if (quotes && (ch == q[1] || ch == '\\')) { 1419 *p++ = '\\'; 1420 *p++ = (char) ch; 1421 } 1422 /* Map 16-bit characters to '\uxxxx' */ 1423 else if (ch >= 256) { 1424 *p++ = '\\'; 1425 *p++ = 'u'; 1426 *p++ = hexdigit[(ch >> 12) & 0xf]; 1427 *p++ = hexdigit[(ch >> 8) & 0xf]; 1428 *p++ = hexdigit[(ch >> 4) & 0xf]; 1429 *p++ = hexdigit[ch & 15]; 1430 } 1431 /* Map non-printable US ASCII to '\ooo' */ 1432 else if (ch < ' ' || ch >= 128) { 1433 *p++ = '\\'; 1434 *p++ = hexdigit[(ch >> 6) & 7]; 1435 *p++ = hexdigit[(ch >> 3) & 7]; 1436 *p++ = hexdigit[ch & 7]; 1437 } 1438 /* Copy everything else as-is */ 1439 else 1440 *p++ = (char) ch; 1441 } 1442 if (quotes) 1443 *p++ = q[1]; 1444 1445 *p = '\0'; 1446 if (_PyString_Resize(&repr, p - q)) 1447 goto onError; 1448 1449 return repr; 1450 1451 onError: 1452 Py_DECREF(repr); 1453 return NULL; 1454} 1455 1456PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s, 1457 int size) 1458{ 1459 return unicodeescape_string(s, size, 0); 1460} 1461 1462PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode) 1463{ 1464 if (!PyUnicode_Check(unicode)) { 1465 PyErr_BadArgument(); 1466 return NULL; 1467 } 1468 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode), 1469 PyUnicode_GET_SIZE(unicode)); 1470} 1471 1472/* --- Raw Unicode Escape Codec ------------------------------------------- */ 1473 1474PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s, 1475 int size, 1476 const char *errors) 1477{ 1478 PyUnicodeObject *v; 1479 Py_UNICODE *p, *buf; 1480 const char *end; 1481 const char *bs; 1482 1483 /* Escaped strings will always be longer than the resulting 1484 Unicode string, so we start with size here and then reduce the 1485 length after conversion to the true value. */ 1486 v = _PyUnicode_New(size); 1487 if (v == NULL) 1488 goto onError; 1489 if (size == 0) 1490 return (PyObject *)v; 1491 p = buf = PyUnicode_AS_UNICODE(v); 1492 end = s + size; 1493 while (s < end) { 1494 unsigned char c; 1495 Py_UNICODE x; 1496 int i; 1497 1498 /* Non-escape characters are interpreted as Unicode ordinals */ 1499 if (*s != '\\') { 1500 *p++ = (unsigned char)*s++; 1501 continue; 1502 } 1503 1504 /* \u-escapes are only interpreted iff the number of leading 1505 backslashes if odd */ 1506 bs = s; 1507 for (;s < end;) { 1508 if (*s != '\\') 1509 break; 1510 *p++ = (unsigned char)*s++; 1511 } 1512 if (((s - bs) & 1) == 0 || 1513 s >= end || 1514 *s != 'u') { 1515 continue; 1516 } 1517 p--; 1518 s++; 1519 1520 /* \uXXXX with 4 hex digits */ 1521 for (x = 0, i = 0; i < 4; i++) { 1522 c = (unsigned char)s[i]; 1523 if (!isxdigit(c)) { 1524 if (unicodeescape_decoding_error(&s, &x, errors, 1525 "truncated \\uXXXX")) 1526 goto onError; 1527 i++; 1528 break; 1529 } 1530 x = (x<<4) & ~0xF; 1531 if (c >= '0' && c <= '9') 1532 x += c - '0'; 1533 else if (c >= 'a' && c <= 'f') 1534 x += 10 + c - 'a'; 1535 else 1536 x += 10 + c - 'A'; 1537 } 1538 s += i; 1539 *p++ = x; 1540 } 1541 if (_PyUnicode_Resize(v, (int)(p - buf))) 1542 goto onError; 1543 return (PyObject *)v; 1544 1545 onError: 1546 Py_XDECREF(v); 1547 return NULL; 1548} 1549 1550PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s, 1551 int size) 1552{ 1553 PyObject *repr; 1554 char *p; 1555 char *q; 1556 1557 static const char *hexdigit = "0123456789ABCDEF"; 1558 1559 repr = PyString_FromStringAndSize(NULL, 6 * size); 1560 if (repr == NULL) 1561 return NULL; 1562 if (size == 0) 1563 return repr; 1564 1565 p = q = PyString_AS_STRING(repr); 1566 while (size-- > 0) { 1567 Py_UNICODE ch = *s++; 1568 /* Map 16-bit characters to '\uxxxx' */ 1569 if (ch >= 256) { 1570 *p++ = '\\'; 1571 *p++ = 'u'; 1572 *p++ = hexdigit[(ch >> 12) & 0xf]; 1573 *p++ = hexdigit[(ch >> 8) & 0xf]; 1574 *p++ = hexdigit[(ch >> 4) & 0xf]; 1575 *p++ = hexdigit[ch & 15]; 1576 } 1577 /* Copy everything else as-is */ 1578 else 1579 *p++ = (char) ch; 1580 } 1581 *p = '\0'; 1582 if (_PyString_Resize(&repr, p - q)) 1583 goto onError; 1584 1585 return repr; 1586 1587 onError: 1588 Py_DECREF(repr); 1589 return NULL; 1590} 1591 1592PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode) 1593{ 1594 if (!PyUnicode_Check(unicode)) { 1595 PyErr_BadArgument(); 1596 return NULL; 1597 } 1598 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode), 1599 PyUnicode_GET_SIZE(unicode)); 1600} 1601 1602/* --- Latin-1 Codec ------------------------------------------------------ */ 1603 1604PyObject *PyUnicode_DecodeLatin1(const char *s, 1605 int size, 1606 const char *errors) 1607{ 1608 PyUnicodeObject *v; 1609 Py_UNICODE *p; 1610 1611 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */ 1612 v = _PyUnicode_New(size); 1613 if (v == NULL) 1614 goto onError; 1615 if (size == 0) 1616 return (PyObject *)v; 1617 p = PyUnicode_AS_UNICODE(v); 1618 while (size-- > 0) 1619 *p++ = (unsigned char)*s++; 1620 return (PyObject *)v; 1621 1622 onError: 1623 Py_XDECREF(v); 1624 return NULL; 1625} 1626 1627static 1628int latin1_encoding_error(const Py_UNICODE **source, 1629 char **dest, 1630 const char *errors, 1631 const char *details) 1632{ 1633 if ((errors == NULL) || 1634 (strcmp(errors,"strict") == 0)) { 1635 PyErr_Format(PyExc_UnicodeError, 1636 "Latin-1 encoding error: %.400s", 1637 details); 1638 return -1; 1639 } 1640 else if (strcmp(errors,"ignore") == 0) { 1641 return 0; 1642 } 1643 else if (strcmp(errors,"replace") == 0) { 1644 **dest = '?'; 1645 (*dest)++; 1646 return 0; 1647 } 1648 else { 1649 PyErr_Format(PyExc_ValueError, 1650 "Latin-1 encoding error; " 1651 "unknown error handling code: %.400s", 1652 errors); 1653 return -1; 1654 } 1655} 1656 1657PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p, 1658 int size, 1659 const char *errors) 1660{ 1661 PyObject *repr; 1662 char *s, *start; 1663 1664 repr = PyString_FromStringAndSize(NULL, size); 1665 if (repr == NULL) 1666 return NULL; 1667 if (size == 0) 1668 return repr; 1669 1670 s = PyString_AS_STRING(repr); 1671 start = s; 1672 while (size-- > 0) { 1673 Py_UNICODE ch = *p++; 1674 if (ch >= 256) { 1675 if (latin1_encoding_error(&p, &s, errors, 1676 "ordinal not in range(256)")) 1677 goto onError; 1678 } 1679 else 1680 *s++ = (char)ch; 1681 } 1682 /* Resize if error handling skipped some characters */ 1683 if (s - start < PyString_GET_SIZE(repr)) 1684 if (_PyString_Resize(&repr, s - start)) 1685 goto onError; 1686 return repr; 1687 1688 onError: 1689 Py_DECREF(repr); 1690 return NULL; 1691} 1692 1693PyObject *PyUnicode_AsLatin1String(PyObject *unicode) 1694{ 1695 if (!PyUnicode_Check(unicode)) { 1696 PyErr_BadArgument(); 1697 return NULL; 1698 } 1699 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode), 1700 PyUnicode_GET_SIZE(unicode), 1701 NULL); 1702} 1703 1704/* --- 7-bit ASCII Codec -------------------------------------------------- */ 1705 1706static 1707int ascii_decoding_error(const char **source, 1708 Py_UNICODE **dest, 1709 const char *errors, 1710 const char *details) 1711{ 1712 if ((errors == NULL) || 1713 (strcmp(errors,"strict") == 0)) { 1714 PyErr_Format(PyExc_UnicodeError, 1715 "ASCII decoding error: %.400s", 1716 details); 1717 return -1; 1718 } 1719 else if (strcmp(errors,"ignore") == 0) { 1720 return 0; 1721 } 1722 else if (strcmp(errors,"replace") == 0) { 1723 **dest = Py_UNICODE_REPLACEMENT_CHARACTER; 1724 (*dest)++; 1725 return 0; 1726 } 1727 else { 1728 PyErr_Format(PyExc_ValueError, 1729 "ASCII decoding error; " 1730 "unknown error handling code: %.400s", 1731 errors); 1732 return -1; 1733 } 1734} 1735 1736PyObject *PyUnicode_DecodeASCII(const char *s, 1737 int size, 1738 const char *errors) 1739{ 1740 PyUnicodeObject *v; 1741 Py_UNICODE *p; 1742 1743 /* ASCII is equivalent to the first 128 ordinals in Unicode. */ 1744 v = _PyUnicode_New(size); 1745 if (v == NULL) 1746 goto onError; 1747 if (size == 0) 1748 return (PyObject *)v; 1749 p = PyUnicode_AS_UNICODE(v); 1750 while (size-- > 0) { 1751 register unsigned char c; 1752 1753 c = (unsigned char)*s++; 1754 if (c < 128) 1755 *p++ = c; 1756 else if (ascii_decoding_error(&s, &p, errors, 1757 "ordinal not in range(128)")) 1758 goto onError; 1759 } 1760 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v)) 1761 if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v)))) 1762 goto onError; 1763 return (PyObject *)v; 1764 1765 onError: 1766 Py_XDECREF(v); 1767 return NULL; 1768} 1769 1770static 1771int ascii_encoding_error(const Py_UNICODE **source, 1772 char **dest, 1773 const char *errors, 1774 const char *details) 1775{ 1776 if ((errors == NULL) || 1777 (strcmp(errors,"strict") == 0)) { 1778 PyErr_Format(PyExc_UnicodeError, 1779 "ASCII encoding error: %.400s", 1780 details); 1781 return -1; 1782 } 1783 else if (strcmp(errors,"ignore") == 0) { 1784 return 0; 1785 } 1786 else if (strcmp(errors,"replace") == 0) { 1787 **dest = '?'; 1788 (*dest)++; 1789 return 0; 1790 } 1791 else { 1792 PyErr_Format(PyExc_ValueError, 1793 "ASCII encoding error; " 1794 "unknown error handling code: %.400s", 1795 errors); 1796 return -1; 1797 } 1798} 1799 1800PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p, 1801 int size, 1802 const char *errors) 1803{ 1804 PyObject *repr; 1805 char *s, *start; 1806 1807 repr = PyString_FromStringAndSize(NULL, size); 1808 if (repr == NULL) 1809 return NULL; 1810 if (size == 0) 1811 return repr; 1812 1813 s = PyString_AS_STRING(repr); 1814 start = s; 1815 while (size-- > 0) { 1816 Py_UNICODE ch = *p++; 1817 if (ch >= 128) { 1818 if (ascii_encoding_error(&p, &s, errors, 1819 "ordinal not in range(128)")) 1820 goto onError; 1821 } 1822 else 1823 *s++ = (char)ch; 1824 } 1825 /* Resize if error handling skipped some characters */ 1826 if (s - start < PyString_GET_SIZE(repr)) 1827 if (_PyString_Resize(&repr, s - start)) 1828 goto onError; 1829 return repr; 1830 1831 onError: 1832 Py_DECREF(repr); 1833 return NULL; 1834} 1835 1836PyObject *PyUnicode_AsASCIIString(PyObject *unicode) 1837{ 1838 if (!PyUnicode_Check(unicode)) { 1839 PyErr_BadArgument(); 1840 return NULL; 1841 } 1842 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode), 1843 PyUnicode_GET_SIZE(unicode), 1844 NULL); 1845} 1846 1847#ifdef MS_WIN32 1848 1849/* --- MBCS codecs for Windows -------------------------------------------- */ 1850 1851PyObject *PyUnicode_DecodeMBCS(const char *s, 1852 int size, 1853 const char *errors) 1854{ 1855 PyUnicodeObject *v; 1856 Py_UNICODE *p; 1857 1858 /* First get the size of the result */ 1859 DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0); 1860 if (size > 0 && usize==0) 1861 return PyErr_SetFromWindowsErrWithFilename(0, NULL); 1862 1863 v = _PyUnicode_New(usize); 1864 if (v == NULL) 1865 return NULL; 1866 if (usize == 0) 1867 return (PyObject *)v; 1868 p = PyUnicode_AS_UNICODE(v); 1869 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) { 1870 Py_DECREF(v); 1871 return PyErr_SetFromWindowsErrWithFilename(0, NULL); 1872 } 1873 1874 return (PyObject *)v; 1875} 1876 1877PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p, 1878 int size, 1879 const char *errors) 1880{ 1881 PyObject *repr; 1882 char *s; 1883 DWORD mbcssize; 1884 1885 /* If there are no characters, bail now! */ 1886 if (size==0) 1887 return PyString_FromString(""); 1888 1889 /* First get the size of the result */ 1890 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL); 1891 if (mbcssize==0) 1892 return PyErr_SetFromWindowsErrWithFilename(0, NULL); 1893 1894 repr = PyString_FromStringAndSize(NULL, mbcssize); 1895 if (repr == NULL) 1896 return NULL; 1897 if (mbcssize == 0) 1898 return repr; 1899 1900 /* Do the conversion */ 1901 s = PyString_AS_STRING(repr); 1902 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) { 1903 Py_DECREF(repr); 1904 return PyErr_SetFromWindowsErrWithFilename(0, NULL); 1905 } 1906 return repr; 1907} 1908 1909#endif /* MS_WIN32 */ 1910 1911/* --- Character Mapping Codec -------------------------------------------- */ 1912 1913static 1914int charmap_decoding_error(const char **source, 1915 Py_UNICODE **dest, 1916 const char *errors, 1917 const char *details) 1918{ 1919 if ((errors == NULL) || 1920 (strcmp(errors,"strict") == 0)) { 1921 PyErr_Format(PyExc_UnicodeError, 1922 "charmap decoding error: %.400s", 1923 details); 1924 return -1; 1925 } 1926 else if (strcmp(errors,"ignore") == 0) { 1927 return 0; 1928 } 1929 else if (strcmp(errors,"replace") == 0) { 1930 **dest = Py_UNICODE_REPLACEMENT_CHARACTER; 1931 (*dest)++; 1932 return 0; 1933 } 1934 else { 1935 PyErr_Format(PyExc_ValueError, 1936 "charmap decoding error; " 1937 "unknown error handling code: %.400s", 1938 errors); 1939 return -1; 1940 } 1941} 1942 1943PyObject *PyUnicode_DecodeCharmap(const char *s, 1944 int size, 1945 PyObject *mapping, 1946 const char *errors) 1947{ 1948 PyUnicodeObject *v; 1949 Py_UNICODE *p; 1950 int extrachars = 0; 1951 1952 /* Default to Latin-1 */ 1953 if (mapping == NULL) 1954 return PyUnicode_DecodeLatin1(s, size, errors); 1955 1956 v = _PyUnicode_New(size); 1957 if (v == NULL) 1958 goto onError; 1959 if (size == 0) 1960 return (PyObject *)v; 1961 p = PyUnicode_AS_UNICODE(v); 1962 while (size-- > 0) { 1963 unsigned char ch = *s++; 1964 PyObject *w, *x; 1965 1966 /* Get mapping (char ordinal -> integer, Unicode char or None) */ 1967 w = PyInt_FromLong((long)ch); 1968 if (w == NULL) 1969 goto onError; 1970 x = PyObject_GetItem(mapping, w); 1971 Py_DECREF(w); 1972 if (x == NULL) { 1973 if (PyErr_ExceptionMatches(PyExc_LookupError)) { 1974 /* No mapping found means: mapping is undefined. */ 1975 PyErr_Clear(); 1976 x = Py_None; 1977 Py_INCREF(x); 1978 } else 1979 goto onError; 1980 } 1981 1982 /* Apply mapping */ 1983 if (PyInt_Check(x)) { 1984 long value = PyInt_AS_LONG(x); 1985 if (value < 0 || value > 65535) { 1986 PyErr_SetString(PyExc_TypeError, 1987 "character mapping must be in range(65536)"); 1988 Py_DECREF(x); 1989 goto onError; 1990 } 1991 *p++ = (Py_UNICODE)value; 1992 } 1993 else if (x == Py_None) { 1994 /* undefined mapping */ 1995 if (charmap_decoding_error(&s, &p, errors, 1996 "character maps to <undefined>")) { 1997 Py_DECREF(x); 1998 goto onError; 1999 } 2000 } 2001 else if (PyUnicode_Check(x)) { 2002 int targetsize = PyUnicode_GET_SIZE(x); 2003 2004 if (targetsize == 1) 2005 /* 1-1 mapping */ 2006 *p++ = *PyUnicode_AS_UNICODE(x); 2007 2008 else if (targetsize > 1) { 2009 /* 1-n mapping */ 2010 if (targetsize > extrachars) { 2011 /* resize first */ 2012 int oldpos = (int)(p - PyUnicode_AS_UNICODE(v)); 2013 int needed = (targetsize - extrachars) + \ 2014 (targetsize << 2); 2015 extrachars += needed; 2016 if (_PyUnicode_Resize(v, PyUnicode_GET_SIZE(v) + needed)) { 2017 Py_DECREF(x); 2018 goto onError; 2019 } 2020 p = PyUnicode_AS_UNICODE(v) + oldpos; 2021 } 2022 Py_UNICODE_COPY(p, 2023 PyUnicode_AS_UNICODE(x), 2024 targetsize); 2025 p += targetsize; 2026 extrachars -= targetsize; 2027 } 2028 /* 1-0 mapping: skip the character */ 2029 } 2030 else { 2031 /* wrong return value */ 2032 PyErr_SetString(PyExc_TypeError, 2033 "character mapping must return integer, None or unicode"); 2034 Py_DECREF(x); 2035 goto onError; 2036 } 2037 Py_DECREF(x); 2038 } 2039 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v)) 2040 if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v)))) 2041 goto onError; 2042 return (PyObject *)v; 2043 2044 onError: 2045 Py_XDECREF(v); 2046 return NULL; 2047} 2048 2049static 2050int charmap_encoding_error(const Py_UNICODE **source, 2051 char **dest, 2052 const char *errors, 2053 const char *details) 2054{ 2055 if ((errors == NULL) || 2056 (strcmp(errors,"strict") == 0)) { 2057 PyErr_Format(PyExc_UnicodeError, 2058 "charmap encoding error: %.400s", 2059 details); 2060 return -1; 2061 } 2062 else if (strcmp(errors,"ignore") == 0) { 2063 return 0; 2064 } 2065 else if (strcmp(errors,"replace") == 0) { 2066 **dest = '?'; 2067 (*dest)++; 2068 return 0; 2069 } 2070 else { 2071 PyErr_Format(PyExc_ValueError, 2072 "charmap encoding error; " 2073 "unknown error handling code: %.400s", 2074 errors); 2075 return -1; 2076 } 2077} 2078 2079PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p, 2080 int size, 2081 PyObject *mapping, 2082 const char *errors) 2083{ 2084 PyObject *v; 2085 char *s; 2086 int extrachars = 0; 2087 2088 /* Default to Latin-1 */ 2089 if (mapping == NULL) 2090 return PyUnicode_EncodeLatin1(p, size, errors); 2091 2092 v = PyString_FromStringAndSize(NULL, size); 2093 if (v == NULL) 2094 return NULL; 2095 if (size == 0) 2096 return v; 2097 s = PyString_AS_STRING(v); 2098 while (size-- > 0) { 2099 Py_UNICODE ch = *p++; 2100 PyObject *w, *x; 2101 2102 /* Get mapping (Unicode ordinal -> string char, integer or None) */ 2103 w = PyInt_FromLong((long)ch); 2104 if (w == NULL) 2105 goto onError; 2106 x = PyObject_GetItem(mapping, w); 2107 Py_DECREF(w); 2108 if (x == NULL) { 2109 if (PyErr_ExceptionMatches(PyExc_LookupError)) { 2110 /* No mapping found means: mapping is undefined. */ 2111 PyErr_Clear(); 2112 x = Py_None; 2113 Py_INCREF(x); 2114 } else 2115 goto onError; 2116 } 2117 2118 /* Apply mapping */ 2119 if (PyInt_Check(x)) { 2120 long value = PyInt_AS_LONG(x); 2121 if (value < 0 || value > 255) { 2122 PyErr_SetString(PyExc_TypeError, 2123 "character mapping must be in range(256)"); 2124 Py_DECREF(x); 2125 goto onError; 2126 } 2127 *s++ = (char)value; 2128 } 2129 else if (x == Py_None) { 2130 /* undefined mapping */ 2131 if (charmap_encoding_error(&p, &s, errors, 2132 "character maps to <undefined>")) { 2133 Py_DECREF(x); 2134 goto onError; 2135 } 2136 } 2137 else if (PyString_Check(x)) { 2138 int targetsize = PyString_GET_SIZE(x); 2139 2140 if (targetsize == 1) 2141 /* 1-1 mapping */ 2142 *s++ = *PyString_AS_STRING(x); 2143 2144 else if (targetsize > 1) { 2145 /* 1-n mapping */ 2146 if (targetsize > extrachars) { 2147 /* resize first */ 2148 int oldpos = (int)(s - PyString_AS_STRING(v)); 2149 int needed = (targetsize - extrachars) + \ 2150 (targetsize << 2); 2151 extrachars += needed; 2152 if (_PyString_Resize(&v, PyString_GET_SIZE(v) + needed)) { 2153 Py_DECREF(x); 2154 goto onError; 2155 } 2156 s = PyString_AS_STRING(v) + oldpos; 2157 } 2158 memcpy(s, 2159 PyString_AS_STRING(x), 2160 targetsize); 2161 s += targetsize; 2162 extrachars -= targetsize; 2163 } 2164 /* 1-0 mapping: skip the character */ 2165 } 2166 else { 2167 /* wrong return value */ 2168 PyErr_SetString(PyExc_TypeError, 2169 "character mapping must return integer, None or unicode"); 2170 Py_DECREF(x); 2171 goto onError; 2172 } 2173 Py_DECREF(x); 2174 } 2175 if (s - PyString_AS_STRING(v) < PyString_GET_SIZE(v)) 2176 if (_PyString_Resize(&v, (int)(s - PyString_AS_STRING(v)))) 2177 goto onError; 2178 return v; 2179 2180 onError: 2181 Py_DECREF(v); 2182 return NULL; 2183} 2184 2185PyObject *PyUnicode_AsCharmapString(PyObject *unicode, 2186 PyObject *mapping) 2187{ 2188 if (!PyUnicode_Check(unicode) || mapping == NULL) { 2189 PyErr_BadArgument(); 2190 return NULL; 2191 } 2192 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode), 2193 PyUnicode_GET_SIZE(unicode), 2194 mapping, 2195 NULL); 2196} 2197 2198static 2199int translate_error(const Py_UNICODE **source, 2200 Py_UNICODE **dest, 2201 const char *errors, 2202 const char *details) 2203{ 2204 if ((errors == NULL) || 2205 (strcmp(errors,"strict") == 0)) { 2206 PyErr_Format(PyExc_UnicodeError, 2207 "translate error: %.400s", 2208 details); 2209 return -1; 2210 } 2211 else if (strcmp(errors,"ignore") == 0) { 2212 return 0; 2213 } 2214 else if (strcmp(errors,"replace") == 0) { 2215 **dest = '?'; 2216 (*dest)++; 2217 return 0; 2218 } 2219 else { 2220 PyErr_Format(PyExc_ValueError, 2221 "translate error; " 2222 "unknown error handling code: %.400s", 2223 errors); 2224 return -1; 2225 } 2226} 2227 2228PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *s, 2229 int size, 2230 PyObject *mapping, 2231 const char *errors) 2232{ 2233 PyUnicodeObject *v; 2234 Py_UNICODE *p; 2235 2236 if (mapping == NULL) { 2237 PyErr_BadArgument(); 2238 return NULL; 2239 } 2240 2241 /* Output will never be longer than input */ 2242 v = _PyUnicode_New(size); 2243 if (v == NULL) 2244 goto onError; 2245 if (size == 0) 2246 goto done; 2247 p = PyUnicode_AS_UNICODE(v); 2248 while (size-- > 0) { 2249 Py_UNICODE ch = *s++; 2250 PyObject *w, *x; 2251 2252 /* Get mapping */ 2253 w = PyInt_FromLong(ch); 2254 if (w == NULL) 2255 goto onError; 2256 x = PyObject_GetItem(mapping, w); 2257 Py_DECREF(w); 2258 if (x == NULL) { 2259 if (PyErr_ExceptionMatches(PyExc_LookupError)) { 2260 /* No mapping found: default to 1-1 mapping */ 2261 PyErr_Clear(); 2262 *p++ = ch; 2263 continue; 2264 } 2265 goto onError; 2266 } 2267 2268 /* Apply mapping */ 2269 if (PyInt_Check(x)) 2270 *p++ = (Py_UNICODE)PyInt_AS_LONG(x); 2271 else if (x == Py_None) { 2272 /* undefined mapping */ 2273 if (translate_error(&s, &p, errors, 2274 "character maps to <undefined>")) { 2275 Py_DECREF(x); 2276 goto onError; 2277 } 2278 } 2279 else if (PyUnicode_Check(x)) { 2280 if (PyUnicode_GET_SIZE(x) != 1) { 2281 /* 1-n mapping */ 2282 PyErr_SetString(PyExc_NotImplementedError, 2283 "1-n mappings are currently not implemented"); 2284 Py_DECREF(x); 2285 goto onError; 2286 } 2287 *p++ = *PyUnicode_AS_UNICODE(x); 2288 } 2289 else { 2290 /* wrong return value */ 2291 PyErr_SetString(PyExc_TypeError, 2292 "translate mapping must return integer, None or unicode"); 2293 Py_DECREF(x); 2294 goto onError; 2295 } 2296 Py_DECREF(x); 2297 } 2298 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v)) 2299 if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v)))) 2300 goto onError; 2301 2302 done: 2303 return (PyObject *)v; 2304 2305 onError: 2306 Py_XDECREF(v); 2307 return NULL; 2308} 2309 2310PyObject *PyUnicode_Translate(PyObject *str, 2311 PyObject *mapping, 2312 const char *errors) 2313{ 2314 PyObject *result; 2315 2316 str = PyUnicode_FromObject(str); 2317 if (str == NULL) 2318 goto onError; 2319 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str), 2320 PyUnicode_GET_SIZE(str), 2321 mapping, 2322 errors); 2323 Py_DECREF(str); 2324 return result; 2325 2326 onError: 2327 Py_XDECREF(str); 2328 return NULL; 2329} 2330 2331/* --- Decimal Encoder ---------------------------------------------------- */ 2332 2333int PyUnicode_EncodeDecimal(Py_UNICODE *s, 2334 int length, 2335 char *output, 2336 const char *errors) 2337{ 2338 Py_UNICODE *p, *end; 2339 2340 if (output == NULL) { 2341 PyErr_BadArgument(); 2342 return -1; 2343 } 2344 2345 p = s; 2346 end = s + length; 2347 while (p < end) { 2348 register Py_UNICODE ch = *p++; 2349 int decimal; 2350 2351 if (Py_UNICODE_ISSPACE(ch)) { 2352 *output++ = ' '; 2353 continue; 2354 } 2355 decimal = Py_UNICODE_TODECIMAL(ch); 2356 if (decimal >= 0) { 2357 *output++ = '0' + decimal; 2358 continue; 2359 } 2360 if (0 < ch && ch < 256) { 2361 *output++ = (char)ch; 2362 continue; 2363 } 2364 /* All other characters are considered invalid */ 2365 if (errors == NULL || strcmp(errors, "strict") == 0) { 2366 PyErr_SetString(PyExc_ValueError, 2367 "invalid decimal Unicode string"); 2368 goto onError; 2369 } 2370 else if (strcmp(errors, "ignore") == 0) 2371 continue; 2372 else if (strcmp(errors, "replace") == 0) { 2373 *output++ = '?'; 2374 continue; 2375 } 2376 } 2377 /* 0-terminate the output string */ 2378 *output++ = '\0'; 2379 return 0; 2380 2381 onError: 2382 return -1; 2383} 2384 2385/* --- Helpers ------------------------------------------------------------ */ 2386 2387static 2388int count(PyUnicodeObject *self, 2389 int start, 2390 int end, 2391 PyUnicodeObject *substring) 2392{ 2393 int count = 0; 2394 2395 if (substring->length == 0) 2396 return (end - start + 1); 2397 2398 end -= substring->length; 2399 2400 while (start <= end) 2401 if (Py_UNICODE_MATCH(self, start, substring)) { 2402 count++; 2403 start += substring->length; 2404 } else 2405 start++; 2406 2407 return count; 2408} 2409 2410int PyUnicode_Count(PyObject *str, 2411 PyObject *substr, 2412 int start, 2413 int end) 2414{ 2415 int result; 2416 2417 str = PyUnicode_FromObject(str); 2418 if (str == NULL) 2419 return -1; 2420 substr = PyUnicode_FromObject(substr); 2421 if (substr == NULL) { 2422 Py_DECREF(str); 2423 return -1; 2424 } 2425 2426 result = count((PyUnicodeObject *)str, 2427 start, end, 2428 (PyUnicodeObject *)substr); 2429 2430 Py_DECREF(str); 2431 Py_DECREF(substr); 2432 return result; 2433} 2434 2435static 2436int findstring(PyUnicodeObject *self, 2437 PyUnicodeObject *substring, 2438 int start, 2439 int end, 2440 int direction) 2441{ 2442 if (start < 0) 2443 start += self->length; 2444 if (start < 0) 2445 start = 0; 2446 2447 if (substring->length == 0) 2448 return start; 2449 2450 if (end > self->length) 2451 end = self->length; 2452 if (end < 0) 2453 end += self->length; 2454 if (end < 0) 2455 end = 0; 2456 2457 end -= substring->length; 2458 2459 if (direction < 0) { 2460 for (; end >= start; end--) 2461 if (Py_UNICODE_MATCH(self, end, substring)) 2462 return end; 2463 } else { 2464 for (; start <= end; start++) 2465 if (Py_UNICODE_MATCH(self, start, substring)) 2466 return start; 2467 } 2468 2469 return -1; 2470} 2471 2472int PyUnicode_Find(PyObject *str, 2473 PyObject *substr, 2474 int start, 2475 int end, 2476 int direction) 2477{ 2478 int result; 2479 2480 str = PyUnicode_FromObject(str); 2481 if (str == NULL) 2482 return -1; 2483 substr = PyUnicode_FromObject(substr); 2484 if (substr == NULL) { 2485 Py_DECREF(substr); 2486 return -1; 2487 } 2488 2489 result = findstring((PyUnicodeObject *)str, 2490 (PyUnicodeObject *)substr, 2491 start, end, direction); 2492 Py_DECREF(str); 2493 Py_DECREF(substr); 2494 return result; 2495} 2496 2497static 2498int tailmatch(PyUnicodeObject *self, 2499 PyUnicodeObject *substring, 2500 int start, 2501 int end, 2502 int direction) 2503{ 2504 if (start < 0) 2505 start += self->length; 2506 if (start < 0) 2507 start = 0; 2508 2509 if (substring->length == 0) 2510 return 1; 2511 2512 if (end > self->length) 2513 end = self->length; 2514 if (end < 0) 2515 end += self->length; 2516 if (end < 0) 2517 end = 0; 2518 2519 end -= substring->length; 2520 if (end < start) 2521 return 0; 2522 2523 if (direction > 0) { 2524 if (Py_UNICODE_MATCH(self, end, substring)) 2525 return 1; 2526 } else { 2527 if (Py_UNICODE_MATCH(self, start, substring)) 2528 return 1; 2529 } 2530 2531 return 0; 2532} 2533 2534int PyUnicode_Tailmatch(PyObject *str, 2535 PyObject *substr, 2536 int start, 2537 int end, 2538 int direction) 2539{ 2540 int result; 2541 2542 str = PyUnicode_FromObject(str); 2543 if (str == NULL) 2544 return -1; 2545 substr = PyUnicode_FromObject(substr); 2546 if (substr == NULL) { 2547 Py_DECREF(substr); 2548 return -1; 2549 } 2550 2551 result = tailmatch((PyUnicodeObject *)str, 2552 (PyUnicodeObject *)substr, 2553 start, end, direction); 2554 Py_DECREF(str); 2555 Py_DECREF(substr); 2556 return result; 2557} 2558 2559static 2560const Py_UNICODE *findchar(const Py_UNICODE *s, 2561 int size, 2562 Py_UNICODE ch) 2563{ 2564 /* like wcschr, but doesn't stop at NULL characters */ 2565 2566 while (size-- > 0) { 2567 if (*s == ch) 2568 return s; 2569 s++; 2570 } 2571 2572 return NULL; 2573} 2574 2575/* Apply fixfct filter to the Unicode object self and return a 2576 reference to the modified object */ 2577 2578static 2579PyObject *fixup(PyUnicodeObject *self, 2580 int (*fixfct)(PyUnicodeObject *s)) 2581{ 2582 2583 PyUnicodeObject *u; 2584 2585 u = (PyUnicodeObject*) PyUnicode_FromUnicode(self->str, 2586 self->length); 2587 if (u == NULL) 2588 return NULL; 2589 if (!fixfct(u)) { 2590 /* fixfct should return TRUE if it modified the buffer. If 2591 FALSE, return a reference to the original buffer instead 2592 (to save space, not time) */ 2593 Py_INCREF(self); 2594 Py_DECREF(u); 2595 return (PyObject*) self; 2596 } 2597 return (PyObject*) u; 2598} 2599 2600static 2601int fixupper(PyUnicodeObject *self) 2602{ 2603 int len = self->length; 2604 Py_UNICODE *s = self->str; 2605 int status = 0; 2606 2607 while (len-- > 0) { 2608 register Py_UNICODE ch; 2609 2610 ch = Py_UNICODE_TOUPPER(*s); 2611 if (ch != *s) { 2612 status = 1; 2613 *s = ch; 2614 } 2615 s++; 2616 } 2617 2618 return status; 2619} 2620 2621static 2622int fixlower(PyUnicodeObject *self) 2623{ 2624 int len = self->length; 2625 Py_UNICODE *s = self->str; 2626 int status = 0; 2627 2628 while (len-- > 0) { 2629 register Py_UNICODE ch; 2630 2631 ch = Py_UNICODE_TOLOWER(*s); 2632 if (ch != *s) { 2633 status = 1; 2634 *s = ch; 2635 } 2636 s++; 2637 } 2638 2639 return status; 2640} 2641 2642static 2643int fixswapcase(PyUnicodeObject *self) 2644{ 2645 int len = self->length; 2646 Py_UNICODE *s = self->str; 2647 int status = 0; 2648 2649 while (len-- > 0) { 2650 if (Py_UNICODE_ISUPPER(*s)) { 2651 *s = Py_UNICODE_TOLOWER(*s); 2652 status = 1; 2653 } else if (Py_UNICODE_ISLOWER(*s)) { 2654 *s = Py_UNICODE_TOUPPER(*s); 2655 status = 1; 2656 } 2657 s++; 2658 } 2659 2660 return status; 2661} 2662 2663static 2664int fixcapitalize(PyUnicodeObject *self) 2665{ 2666 if (self->length > 0 && Py_UNICODE_ISLOWER(self->str[0])) { 2667 self->str[0] = Py_UNICODE_TOUPPER(self->str[0]); 2668 return 1; 2669 } 2670 return 0; 2671} 2672 2673static 2674int fixtitle(PyUnicodeObject *self) 2675{ 2676 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 2677 register Py_UNICODE *e; 2678 int previous_is_cased; 2679 2680 /* Shortcut for single character strings */ 2681 if (PyUnicode_GET_SIZE(self) == 1) { 2682 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p); 2683 if (*p != ch) { 2684 *p = ch; 2685 return 1; 2686 } 2687 else 2688 return 0; 2689 } 2690 2691 e = p + PyUnicode_GET_SIZE(self); 2692 previous_is_cased = 0; 2693 for (; p < e; p++) { 2694 register const Py_UNICODE ch = *p; 2695 2696 if (previous_is_cased) 2697 *p = Py_UNICODE_TOLOWER(ch); 2698 else 2699 *p = Py_UNICODE_TOTITLE(ch); 2700 2701 if (Py_UNICODE_ISLOWER(ch) || 2702 Py_UNICODE_ISUPPER(ch) || 2703 Py_UNICODE_ISTITLE(ch)) 2704 previous_is_cased = 1; 2705 else 2706 previous_is_cased = 0; 2707 } 2708 return 1; 2709} 2710 2711PyObject *PyUnicode_Join(PyObject *separator, 2712 PyObject *seq) 2713{ 2714 Py_UNICODE *sep; 2715 int seplen; 2716 PyUnicodeObject *res = NULL; 2717 int reslen = 0; 2718 Py_UNICODE *p; 2719 int seqlen = 0; 2720 int sz = 100; 2721 int i; 2722 2723 seqlen = PySequence_Size(seq); 2724 if (seqlen < 0 && PyErr_Occurred()) 2725 return NULL; 2726 2727 if (separator == NULL) { 2728 Py_UNICODE blank = ' '; 2729 sep = ␣ 2730 seplen = 1; 2731 } 2732 else { 2733 separator = PyUnicode_FromObject(separator); 2734 if (separator == NULL) 2735 return NULL; 2736 sep = PyUnicode_AS_UNICODE(separator); 2737 seplen = PyUnicode_GET_SIZE(separator); 2738 } 2739 2740 res = _PyUnicode_New(sz); 2741 if (res == NULL) 2742 goto onError; 2743 p = PyUnicode_AS_UNICODE(res); 2744 reslen = 0; 2745 2746 for (i = 0; i < seqlen; i++) { 2747 int itemlen; 2748 PyObject *item; 2749 2750 item = PySequence_GetItem(seq, i); 2751 if (item == NULL) 2752 goto onError; 2753 if (!PyUnicode_Check(item)) { 2754 PyObject *v; 2755 v = PyUnicode_FromObject(item); 2756 Py_DECREF(item); 2757 item = v; 2758 if (item == NULL) 2759 goto onError; 2760 } 2761 itemlen = PyUnicode_GET_SIZE(item); 2762 while (reslen + itemlen + seplen >= sz) { 2763 if (_PyUnicode_Resize(res, sz*2)) 2764 goto onError; 2765 sz *= 2; 2766 p = PyUnicode_AS_UNICODE(res) + reslen; 2767 } 2768 if (i > 0) { 2769 memcpy(p, sep, seplen * sizeof(Py_UNICODE)); 2770 p += seplen; 2771 reslen += seplen; 2772 } 2773 memcpy(p, PyUnicode_AS_UNICODE(item), itemlen * sizeof(Py_UNICODE)); 2774 p += itemlen; 2775 reslen += itemlen; 2776 Py_DECREF(item); 2777 } 2778 if (_PyUnicode_Resize(res, reslen)) 2779 goto onError; 2780 2781 Py_XDECREF(separator); 2782 return (PyObject *)res; 2783 2784 onError: 2785 Py_XDECREF(separator); 2786 Py_DECREF(res); 2787 return NULL; 2788} 2789 2790static 2791PyUnicodeObject *pad(PyUnicodeObject *self, 2792 int left, 2793 int right, 2794 Py_UNICODE fill) 2795{ 2796 PyUnicodeObject *u; 2797 2798 if (left < 0) 2799 left = 0; 2800 if (right < 0) 2801 right = 0; 2802 2803 if (left == 0 && right == 0) { 2804 Py_INCREF(self); 2805 return self; 2806 } 2807 2808 u = _PyUnicode_New(left + self->length + right); 2809 if (u) { 2810 if (left) 2811 Py_UNICODE_FILL(u->str, fill, left); 2812 Py_UNICODE_COPY(u->str + left, self->str, self->length); 2813 if (right) 2814 Py_UNICODE_FILL(u->str + left + self->length, fill, right); 2815 } 2816 2817 return u; 2818} 2819 2820#define SPLIT_APPEND(data, left, right) \ 2821 str = PyUnicode_FromUnicode(data + left, right - left); \ 2822 if (!str) \ 2823 goto onError; \ 2824 if (PyList_Append(list, str)) { \ 2825 Py_DECREF(str); \ 2826 goto onError; \ 2827 } \ 2828 else \ 2829 Py_DECREF(str); 2830 2831static 2832PyObject *split_whitespace(PyUnicodeObject *self, 2833 PyObject *list, 2834 int maxcount) 2835{ 2836 register int i; 2837 register int j; 2838 int len = self->length; 2839 PyObject *str; 2840 2841 for (i = j = 0; i < len; ) { 2842 /* find a token */ 2843 while (i < len && Py_UNICODE_ISSPACE(self->str[i])) 2844 i++; 2845 j = i; 2846 while (i < len && !Py_UNICODE_ISSPACE(self->str[i])) 2847 i++; 2848 if (j < i) { 2849 if (maxcount-- <= 0) 2850 break; 2851 SPLIT_APPEND(self->str, j, i); 2852 while (i < len && Py_UNICODE_ISSPACE(self->str[i])) 2853 i++; 2854 j = i; 2855 } 2856 } 2857 if (j < len) { 2858 SPLIT_APPEND(self->str, j, len); 2859 } 2860 return list; 2861 2862 onError: 2863 Py_DECREF(list); 2864 return NULL; 2865} 2866 2867PyObject *PyUnicode_Splitlines(PyObject *string, 2868 int keepends) 2869{ 2870 register int i; 2871 register int j; 2872 int len; 2873 PyObject *list; 2874 PyObject *str; 2875 Py_UNICODE *data; 2876 2877 string = PyUnicode_FromObject(string); 2878 if (string == NULL) 2879 return NULL; 2880 data = PyUnicode_AS_UNICODE(string); 2881 len = PyUnicode_GET_SIZE(string); 2882 2883 list = PyList_New(0); 2884 if (!list) 2885 goto onError; 2886 2887 for (i = j = 0; i < len; ) { 2888 int eol; 2889 2890 /* Find a line and append it */ 2891 while (i < len && !Py_UNICODE_ISLINEBREAK(data[i])) 2892 i++; 2893 2894 /* Skip the line break reading CRLF as one line break */ 2895 eol = i; 2896 if (i < len) { 2897 if (data[i] == '\r' && i + 1 < len && 2898 data[i+1] == '\n') 2899 i += 2; 2900 else 2901 i++; 2902 if (keepends) 2903 eol = i; 2904 } 2905 SPLIT_APPEND(data, j, eol); 2906 j = i; 2907 } 2908 if (j < len) { 2909 SPLIT_APPEND(data, j, len); 2910 } 2911 2912 Py_DECREF(string); 2913 return list; 2914 2915 onError: 2916 Py_DECREF(list); 2917 Py_DECREF(string); 2918 return NULL; 2919} 2920 2921static 2922PyObject *split_char(PyUnicodeObject *self, 2923 PyObject *list, 2924 Py_UNICODE ch, 2925 int maxcount) 2926{ 2927 register int i; 2928 register int j; 2929 int len = self->length; 2930 PyObject *str; 2931 2932 for (i = j = 0; i < len; ) { 2933 if (self->str[i] == ch) { 2934 if (maxcount-- <= 0) 2935 break; 2936 SPLIT_APPEND(self->str, j, i); 2937 i = j = i + 1; 2938 } else 2939 i++; 2940 } 2941 if (j <= len) { 2942 SPLIT_APPEND(self->str, j, len); 2943 } 2944 return list; 2945 2946 onError: 2947 Py_DECREF(list); 2948 return NULL; 2949} 2950 2951static 2952PyObject *split_substring(PyUnicodeObject *self, 2953 PyObject *list, 2954 PyUnicodeObject *substring, 2955 int maxcount) 2956{ 2957 register int i; 2958 register int j; 2959 int len = self->length; 2960 int sublen = substring->length; 2961 PyObject *str; 2962 2963 for (i = j = 0; i <= len - sublen; ) { 2964 if (Py_UNICODE_MATCH(self, i, substring)) { 2965 if (maxcount-- <= 0) 2966 break; 2967 SPLIT_APPEND(self->str, j, i); 2968 i = j = i + sublen; 2969 } else 2970 i++; 2971 } 2972 if (j <= len) { 2973 SPLIT_APPEND(self->str, j, len); 2974 } 2975 return list; 2976 2977 onError: 2978 Py_DECREF(list); 2979 return NULL; 2980} 2981 2982#undef SPLIT_APPEND 2983 2984static 2985PyObject *split(PyUnicodeObject *self, 2986 PyUnicodeObject *substring, 2987 int maxcount) 2988{ 2989 PyObject *list; 2990 2991 if (maxcount < 0) 2992 maxcount = INT_MAX; 2993 2994 list = PyList_New(0); 2995 if (!list) 2996 return NULL; 2997 2998 if (substring == NULL) 2999 return split_whitespace(self,list,maxcount); 3000 3001 else if (substring->length == 1) 3002 return split_char(self,list,substring->str[0],maxcount); 3003 3004 else if (substring->length == 0) { 3005 Py_DECREF(list); 3006 PyErr_SetString(PyExc_ValueError, "empty separator"); 3007 return NULL; 3008 } 3009 else 3010 return split_substring(self,list,substring,maxcount); 3011} 3012 3013static 3014PyObject *strip(PyUnicodeObject *self, 3015 int left, 3016 int right) 3017{ 3018 Py_UNICODE *p = self->str; 3019 int start = 0; 3020 int end = self->length; 3021 3022 if (left) 3023 while (start < end && Py_UNICODE_ISSPACE(p[start])) 3024 start++; 3025 3026 if (right) 3027 while (end > start && Py_UNICODE_ISSPACE(p[end-1])) 3028 end--; 3029 3030 if (start == 0 && end == self->length) { 3031 /* couldn't strip anything off, return original string */ 3032 Py_INCREF(self); 3033 return (PyObject*) self; 3034 } 3035 3036 return (PyObject*) PyUnicode_FromUnicode( 3037 self->str + start, 3038 end - start 3039 ); 3040} 3041 3042static 3043PyObject *replace(PyUnicodeObject *self, 3044 PyUnicodeObject *str1, 3045 PyUnicodeObject *str2, 3046 int maxcount) 3047{ 3048 PyUnicodeObject *u; 3049 3050 if (maxcount < 0) 3051 maxcount = INT_MAX; 3052 3053 if (str1->length == 1 && str2->length == 1) { 3054 int i; 3055 3056 /* replace characters */ 3057 if (!findchar(self->str, self->length, str1->str[0])) { 3058 /* nothing to replace, return original string */ 3059 Py_INCREF(self); 3060 u = self; 3061 } else { 3062 Py_UNICODE u1 = str1->str[0]; 3063 Py_UNICODE u2 = str2->str[0]; 3064 3065 u = (PyUnicodeObject*) PyUnicode_FromUnicode( 3066 self->str, 3067 self->length 3068 ); 3069 if (u) 3070 for (i = 0; i < u->length; i++) 3071 if (u->str[i] == u1) { 3072 if (--maxcount < 0) 3073 break; 3074 u->str[i] = u2; 3075 } 3076 } 3077 3078 } else { 3079 int n, i; 3080 Py_UNICODE *p; 3081 3082 /* replace strings */ 3083 n = count(self, 0, self->length, str1); 3084 if (n > maxcount) 3085 n = maxcount; 3086 if (n == 0) { 3087 /* nothing to replace, return original string */ 3088 Py_INCREF(self); 3089 u = self; 3090 } else { 3091 u = _PyUnicode_New( 3092 self->length + n * (str2->length - str1->length)); 3093 if (u) { 3094 i = 0; 3095 p = u->str; 3096 while (i <= self->length - str1->length) 3097 if (Py_UNICODE_MATCH(self, i, str1)) { 3098 /* replace string segment */ 3099 Py_UNICODE_COPY(p, str2->str, str2->length); 3100 p += str2->length; 3101 i += str1->length; 3102 if (--n <= 0) { 3103 /* copy remaining part */ 3104 Py_UNICODE_COPY(p, self->str+i, self->length-i); 3105 break; 3106 } 3107 } else 3108 *p++ = self->str[i++]; 3109 } 3110 } 3111 } 3112 3113 return (PyObject *) u; 3114} 3115 3116/* --- Unicode Object Methods --------------------------------------------- */ 3117 3118static char title__doc__[] = 3119"S.title() -> unicode\n\ 3120\n\ 3121Return a titlecased version of S, i.e. words start with title case\n\ 3122characters, all remaining cased characters have lower case."; 3123 3124static PyObject* 3125unicode_title(PyUnicodeObject *self, PyObject *args) 3126{ 3127 if (!PyArg_NoArgs(args)) 3128 return NULL; 3129 return fixup(self, fixtitle); 3130} 3131 3132static char capitalize__doc__[] = 3133"S.capitalize() -> unicode\n\ 3134\n\ 3135Return a capitalized version of S, i.e. make the first character\n\ 3136have upper case."; 3137 3138static PyObject* 3139unicode_capitalize(PyUnicodeObject *self, PyObject *args) 3140{ 3141 if (!PyArg_NoArgs(args)) 3142 return NULL; 3143 return fixup(self, fixcapitalize); 3144} 3145 3146#if 0 3147static char capwords__doc__[] = 3148"S.capwords() -> unicode\n\ 3149\n\ 3150Apply .capitalize() to all words in S and return the result with\n\ 3151normalized whitespace (all whitespace strings are replaced by ' ')."; 3152 3153static PyObject* 3154unicode_capwords(PyUnicodeObject *self, PyObject *args) 3155{ 3156 PyObject *list; 3157 PyObject *item; 3158 int i; 3159 3160 if (!PyArg_NoArgs(args)) 3161 return NULL; 3162 3163 /* Split into words */ 3164 list = split(self, NULL, -1); 3165 if (!list) 3166 return NULL; 3167 3168 /* Capitalize each word */ 3169 for (i = 0; i < PyList_GET_SIZE(list); i++) { 3170 item = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, i), 3171 fixcapitalize); 3172 if (item == NULL) 3173 goto onError; 3174 Py_DECREF(PyList_GET_ITEM(list, i)); 3175 PyList_SET_ITEM(list, i, item); 3176 } 3177 3178 /* Join the words to form a new string */ 3179 item = PyUnicode_Join(NULL, list); 3180 3181onError: 3182 Py_DECREF(list); 3183 return (PyObject *)item; 3184} 3185#endif 3186 3187static char center__doc__[] = 3188"S.center(width) -> unicode\n\ 3189\n\ 3190Return S centered in a Unicode string of length width. Padding is done\n\ 3191using spaces."; 3192 3193static PyObject * 3194unicode_center(PyUnicodeObject *self, PyObject *args) 3195{ 3196 int marg, left; 3197 int width; 3198 3199 if (!PyArg_ParseTuple(args, "i:center", &width)) 3200 return NULL; 3201 3202 if (self->length >= width) { 3203 Py_INCREF(self); 3204 return (PyObject*) self; 3205 } 3206 3207 marg = width - self->length; 3208 left = marg / 2 + (marg & width & 1); 3209 3210 return (PyObject*) pad(self, left, marg - left, ' '); 3211} 3212 3213#if 0 3214 3215/* This code should go into some future Unicode collation support 3216 module. The basic comparison should compare ordinals on a naive 3217 basis (this is what Java does and thus JPython too). */ 3218 3219/* speedy UTF-16 code point order comparison */ 3220/* gleaned from: */ 3221/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */ 3222 3223static short utf16Fixup[32] = 3224{ 3225 0, 0, 0, 0, 0, 0, 0, 0, 3226 0, 0, 0, 0, 0, 0, 0, 0, 3227 0, 0, 0, 0, 0, 0, 0, 0, 3228 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800 3229}; 3230 3231static int 3232unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2) 3233{ 3234 int len1, len2; 3235 3236 Py_UNICODE *s1 = str1->str; 3237 Py_UNICODE *s2 = str2->str; 3238 3239 len1 = str1->length; 3240 len2 = str2->length; 3241 3242 while (len1 > 0 && len2 > 0) { 3243 Py_UNICODE c1, c2; 3244 long diff; 3245 3246 c1 = *s1++; 3247 c2 = *s2++; 3248 if (c1 > (1<<11) * 26) 3249 c1 += utf16Fixup[c1>>11]; 3250 if (c2 > (1<<11) * 26) 3251 c2 += utf16Fixup[c2>>11]; 3252 3253 /* now c1 and c2 are in UTF-32-compatible order */ 3254 diff = (long)c1 - (long)c2; 3255 if (diff) 3256 return (diff < 0) ? -1 : (diff != 0); 3257 len1--; len2--; 3258 } 3259 3260 return (len1 < len2) ? -1 : (len1 != len2); 3261} 3262 3263#else 3264 3265static int 3266unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2) 3267{ 3268 register int len1, len2; 3269 3270 Py_UNICODE *s1 = str1->str; 3271 Py_UNICODE *s2 = str2->str; 3272 3273 len1 = str1->length; 3274 len2 = str2->length; 3275 3276 while (len1 > 0 && len2 > 0) { 3277 register long diff; 3278 3279 diff = (long)*s1++ - (long)*s2++; 3280 if (diff) 3281 return (diff < 0) ? -1 : (diff != 0); 3282 len1--; len2--; 3283 } 3284 3285 return (len1 < len2) ? -1 : (len1 != len2); 3286} 3287 3288#endif 3289 3290int PyUnicode_Compare(PyObject *left, 3291 PyObject *right) 3292{ 3293 PyUnicodeObject *u = NULL, *v = NULL; 3294 int result; 3295 3296 /* Coerce the two arguments */ 3297 u = (PyUnicodeObject *)PyUnicode_FromObject(left); 3298 if (u == NULL) 3299 goto onError; 3300 v = (PyUnicodeObject *)PyUnicode_FromObject(right); 3301 if (v == NULL) 3302 goto onError; 3303 3304 /* Shortcut for empty or interned objects */ 3305 if (v == u) { 3306 Py_DECREF(u); 3307 Py_DECREF(v); 3308 return 0; 3309 } 3310 3311 result = unicode_compare(u, v); 3312 3313 Py_DECREF(u); 3314 Py_DECREF(v); 3315 return result; 3316 3317onError: 3318 Py_XDECREF(u); 3319 Py_XDECREF(v); 3320 return -1; 3321} 3322 3323int PyUnicode_Contains(PyObject *container, 3324 PyObject *element) 3325{ 3326 PyUnicodeObject *u = NULL, *v = NULL; 3327 int result; 3328 register const Py_UNICODE *p, *e; 3329 register Py_UNICODE ch; 3330 3331 /* Coerce the two arguments */ 3332 v = (PyUnicodeObject *)PyUnicode_FromObject(element); 3333 if (v == NULL) { 3334 PyErr_SetString(PyExc_TypeError, 3335 "'in <string>' requires character as left operand"); 3336 goto onError; 3337 } 3338 u = (PyUnicodeObject *)PyUnicode_FromObject(container); 3339 if (u == NULL) { 3340 Py_DECREF(v); 3341 goto onError; 3342 } 3343 3344 /* Check v in u */ 3345 if (PyUnicode_GET_SIZE(v) != 1) { 3346 PyErr_SetString(PyExc_TypeError, 3347 "'in <string>' requires character as left operand"); 3348 goto onError; 3349 } 3350 ch = *PyUnicode_AS_UNICODE(v); 3351 p = PyUnicode_AS_UNICODE(u); 3352 e = p + PyUnicode_GET_SIZE(u); 3353 result = 0; 3354 while (p < e) { 3355 if (*p++ == ch) { 3356 result = 1; 3357 break; 3358 } 3359 } 3360 3361 Py_DECREF(u); 3362 Py_DECREF(v); 3363 return result; 3364 3365onError: 3366 Py_XDECREF(u); 3367 Py_XDECREF(v); 3368 return -1; 3369} 3370 3371/* Concat to string or Unicode object giving a new Unicode object. */ 3372 3373PyObject *PyUnicode_Concat(PyObject *left, 3374 PyObject *right) 3375{ 3376 PyUnicodeObject *u = NULL, *v = NULL, *w; 3377 3378 /* Coerce the two arguments */ 3379 u = (PyUnicodeObject *)PyUnicode_FromObject(left); 3380 if (u == NULL) 3381 goto onError; 3382 v = (PyUnicodeObject *)PyUnicode_FromObject(right); 3383 if (v == NULL) 3384 goto onError; 3385 3386 /* Shortcuts */ 3387 if (v == unicode_empty) { 3388 Py_DECREF(v); 3389 return (PyObject *)u; 3390 } 3391 if (u == unicode_empty) { 3392 Py_DECREF(u); 3393 return (PyObject *)v; 3394 } 3395 3396 /* Concat the two Unicode strings */ 3397 w = _PyUnicode_New(u->length + v->length); 3398 if (w == NULL) 3399 goto onError; 3400 Py_UNICODE_COPY(w->str, u->str, u->length); 3401 Py_UNICODE_COPY(w->str + u->length, v->str, v->length); 3402 3403 Py_DECREF(u); 3404 Py_DECREF(v); 3405 return (PyObject *)w; 3406 3407onError: 3408 Py_XDECREF(u); 3409 Py_XDECREF(v); 3410 return NULL; 3411} 3412 3413static char count__doc__[] = 3414"S.count(sub[, start[, end]]) -> int\n\ 3415\n\ 3416Return the number of occurrences of substring sub in Unicode string\n\ 3417S[start:end]. Optional arguments start and end are\n\ 3418interpreted as in slice notation."; 3419 3420static PyObject * 3421unicode_count(PyUnicodeObject *self, PyObject *args) 3422{ 3423 PyUnicodeObject *substring; 3424 int start = 0; 3425 int end = INT_MAX; 3426 PyObject *result; 3427 3428 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring, 3429 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end)) 3430 return NULL; 3431 3432 substring = (PyUnicodeObject *)PyUnicode_FromObject( 3433 (PyObject *)substring); 3434 if (substring == NULL) 3435 return NULL; 3436 3437 if (start < 0) 3438 start += self->length; 3439 if (start < 0) 3440 start = 0; 3441 if (end > self->length) 3442 end = self->length; 3443 if (end < 0) 3444 end += self->length; 3445 if (end < 0) 3446 end = 0; 3447 3448 result = PyInt_FromLong((long) count(self, start, end, substring)); 3449 3450 Py_DECREF(substring); 3451 return result; 3452} 3453 3454static char encode__doc__[] = 3455"S.encode([encoding[,errors]]) -> string\n\ 3456\n\ 3457Return an encoded string version of S. Default encoding is the current\n\ 3458default string encoding. errors may be given to set a different error\n\ 3459handling scheme. Default is 'strict' meaning that encoding errors raise\n\ 3460a ValueError. Other possible values are 'ignore' and 'replace'."; 3461 3462static PyObject * 3463unicode_encode(PyUnicodeObject *self, PyObject *args) 3464{ 3465 char *encoding = NULL; 3466 char *errors = NULL; 3467 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors)) 3468 return NULL; 3469 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors); 3470} 3471 3472static char expandtabs__doc__[] = 3473"S.expandtabs([tabsize]) -> unicode\n\ 3474\n\ 3475Return a copy of S where all tab characters are expanded using spaces.\n\ 3476If tabsize is not given, a tab size of 8 characters is assumed."; 3477 3478static PyObject* 3479unicode_expandtabs(PyUnicodeObject *self, PyObject *args) 3480{ 3481 Py_UNICODE *e; 3482 Py_UNICODE *p; 3483 Py_UNICODE *q; 3484 int i, j; 3485 PyUnicodeObject *u; 3486 int tabsize = 8; 3487 3488 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize)) 3489 return NULL; 3490 3491 /* First pass: determine size of output string */ 3492 i = j = 0; 3493 e = self->str + self->length; 3494 for (p = self->str; p < e; p++) 3495 if (*p == '\t') { 3496 if (tabsize > 0) 3497 j += tabsize - (j % tabsize); 3498 } 3499 else { 3500 j++; 3501 if (*p == '\n' || *p == '\r') { 3502 i += j; 3503 j = 0; 3504 } 3505 } 3506 3507 /* Second pass: create output string and fill it */ 3508 u = _PyUnicode_New(i + j); 3509 if (!u) 3510 return NULL; 3511 3512 j = 0; 3513 q = u->str; 3514 3515 for (p = self->str; p < e; p++) 3516 if (*p == '\t') { 3517 if (tabsize > 0) { 3518 i = tabsize - (j % tabsize); 3519 j += i; 3520 while (i--) 3521 *q++ = ' '; 3522 } 3523 } 3524 else { 3525 j++; 3526 *q++ = *p; 3527 if (*p == '\n' || *p == '\r') 3528 j = 0; 3529 } 3530 3531 return (PyObject*) u; 3532} 3533 3534static char find__doc__[] = 3535"S.find(sub [,start [,end]]) -> int\n\ 3536\n\ 3537Return the lowest index in S where substring sub is found,\n\ 3538such that sub is contained within s[start,end]. Optional\n\ 3539arguments start and end are interpreted as in slice notation.\n\ 3540\n\ 3541Return -1 on failure."; 3542 3543static PyObject * 3544unicode_find(PyUnicodeObject *self, PyObject *args) 3545{ 3546 PyUnicodeObject *substring; 3547 int start = 0; 3548 int end = INT_MAX; 3549 PyObject *result; 3550 3551 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring, 3552 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end)) 3553 return NULL; 3554 substring = (PyUnicodeObject *)PyUnicode_FromObject( 3555 (PyObject *)substring); 3556 if (substring == NULL) 3557 return NULL; 3558 3559 result = PyInt_FromLong(findstring(self, substring, start, end, 1)); 3560 3561 Py_DECREF(substring); 3562 return result; 3563} 3564 3565static PyObject * 3566unicode_getitem(PyUnicodeObject *self, int index) 3567{ 3568 if (index < 0 || index >= self->length) { 3569 PyErr_SetString(PyExc_IndexError, "string index out of range"); 3570 return NULL; 3571 } 3572 3573 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1); 3574} 3575 3576static long 3577unicode_hash(PyUnicodeObject *self) 3578{ 3579 /* Since Unicode objects compare equal to their ASCII string 3580 counterparts, they should use the individual character values 3581 as basis for their hash value. This is needed to assure that 3582 strings and Unicode objects behave in the same way as 3583 dictionary keys. */ 3584 3585 register int len; 3586 register Py_UNICODE *p; 3587 register long x; 3588 3589 if (self->hash != -1) 3590 return self->hash; 3591 len = PyUnicode_GET_SIZE(self); 3592 p = PyUnicode_AS_UNICODE(self); 3593 x = *p << 7; 3594 while (--len >= 0) 3595 x = (1000003*x) ^ *p++; 3596 x ^= PyUnicode_GET_SIZE(self); 3597 if (x == -1) 3598 x = -2; 3599 self->hash = x; 3600 return x; 3601} 3602 3603static char index__doc__[] = 3604"S.index(sub [,start [,end]]) -> int\n\ 3605\n\ 3606Like S.find() but raise ValueError when the substring is not found."; 3607 3608static PyObject * 3609unicode_index(PyUnicodeObject *self, PyObject *args) 3610{ 3611 int result; 3612 PyUnicodeObject *substring; 3613 int start = 0; 3614 int end = INT_MAX; 3615 3616 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring, 3617 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end)) 3618 return NULL; 3619 3620 substring = (PyUnicodeObject *)PyUnicode_FromObject( 3621 (PyObject *)substring); 3622 if (substring == NULL) 3623 return NULL; 3624 3625 result = findstring(self, substring, start, end, 1); 3626 3627 Py_DECREF(substring); 3628 if (result < 0) { 3629 PyErr_SetString(PyExc_ValueError, "substring not found"); 3630 return NULL; 3631 } 3632 return PyInt_FromLong(result); 3633} 3634 3635static char islower__doc__[] = 3636"S.islower() -> int\n\ 3637\n\ 3638Return 1 if all cased characters in S are lowercase and there is\n\ 3639at least one cased character in S, 0 otherwise."; 3640 3641static PyObject* 3642unicode_islower(PyUnicodeObject *self, PyObject *args) 3643{ 3644 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 3645 register const Py_UNICODE *e; 3646 int cased; 3647 3648 if (!PyArg_NoArgs(args)) 3649 return NULL; 3650 3651 /* Shortcut for single character strings */ 3652 if (PyUnicode_GET_SIZE(self) == 1) 3653 return PyInt_FromLong(Py_UNICODE_ISLOWER(*p) != 0); 3654 3655 /* Special case for empty strings */ 3656 if (PyString_GET_SIZE(self) == 0) 3657 return PyInt_FromLong(0); 3658 3659 e = p + PyUnicode_GET_SIZE(self); 3660 cased = 0; 3661 for (; p < e; p++) { 3662 register const Py_UNICODE ch = *p; 3663 3664 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) 3665 return PyInt_FromLong(0); 3666 else if (!cased && Py_UNICODE_ISLOWER(ch)) 3667 cased = 1; 3668 } 3669 return PyInt_FromLong(cased); 3670} 3671 3672static char isupper__doc__[] = 3673"S.isupper() -> int\n\ 3674\n\ 3675Return 1 if all cased characters in S are uppercase and there is\n\ 3676at least one cased character in S, 0 otherwise."; 3677 3678static PyObject* 3679unicode_isupper(PyUnicodeObject *self, PyObject *args) 3680{ 3681 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 3682 register const Py_UNICODE *e; 3683 int cased; 3684 3685 if (!PyArg_NoArgs(args)) 3686 return NULL; 3687 3688 /* Shortcut for single character strings */ 3689 if (PyUnicode_GET_SIZE(self) == 1) 3690 return PyInt_FromLong(Py_UNICODE_ISUPPER(*p) != 0); 3691 3692 /* Special case for empty strings */ 3693 if (PyString_GET_SIZE(self) == 0) 3694 return PyInt_FromLong(0); 3695 3696 e = p + PyUnicode_GET_SIZE(self); 3697 cased = 0; 3698 for (; p < e; p++) { 3699 register const Py_UNICODE ch = *p; 3700 3701 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch)) 3702 return PyInt_FromLong(0); 3703 else if (!cased && Py_UNICODE_ISUPPER(ch)) 3704 cased = 1; 3705 } 3706 return PyInt_FromLong(cased); 3707} 3708 3709static char istitle__doc__[] = 3710"S.istitle() -> int\n\ 3711\n\ 3712Return 1 if S is a titlecased string, i.e. upper- and titlecase characters\n\ 3713may only follow uncased characters and lowercase characters only cased\n\ 3714ones. Return 0 otherwise."; 3715 3716static PyObject* 3717unicode_istitle(PyUnicodeObject *self, PyObject *args) 3718{ 3719 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 3720 register const Py_UNICODE *e; 3721 int cased, previous_is_cased; 3722 3723 if (!PyArg_NoArgs(args)) 3724 return NULL; 3725 3726 /* Shortcut for single character strings */ 3727 if (PyUnicode_GET_SIZE(self) == 1) 3728 return PyInt_FromLong((Py_UNICODE_ISTITLE(*p) != 0) || 3729 (Py_UNICODE_ISUPPER(*p) != 0)); 3730 3731 /* Special case for empty strings */ 3732 if (PyString_GET_SIZE(self) == 0) 3733 return PyInt_FromLong(0); 3734 3735 e = p + PyUnicode_GET_SIZE(self); 3736 cased = 0; 3737 previous_is_cased = 0; 3738 for (; p < e; p++) { 3739 register const Py_UNICODE ch = *p; 3740 3741 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) { 3742 if (previous_is_cased) 3743 return PyInt_FromLong(0); 3744 previous_is_cased = 1; 3745 cased = 1; 3746 } 3747 else if (Py_UNICODE_ISLOWER(ch)) { 3748 if (!previous_is_cased) 3749 return PyInt_FromLong(0); 3750 previous_is_cased = 1; 3751 cased = 1; 3752 } 3753 else 3754 previous_is_cased = 0; 3755 } 3756 return PyInt_FromLong(cased); 3757} 3758 3759static char isspace__doc__[] = 3760"S.isspace() -> int\n\ 3761\n\ 3762Return 1 if there are only whitespace characters in S,\n\ 37630 otherwise."; 3764 3765static PyObject* 3766unicode_isspace(PyUnicodeObject *self, PyObject *args) 3767{ 3768 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 3769 register const Py_UNICODE *e; 3770 3771 if (!PyArg_NoArgs(args)) 3772 return NULL; 3773 3774 /* Shortcut for single character strings */ 3775 if (PyUnicode_GET_SIZE(self) == 1 && 3776 Py_UNICODE_ISSPACE(*p)) 3777 return PyInt_FromLong(1); 3778 3779 /* Special case for empty strings */ 3780 if (PyString_GET_SIZE(self) == 0) 3781 return PyInt_FromLong(0); 3782 3783 e = p + PyUnicode_GET_SIZE(self); 3784 for (; p < e; p++) { 3785 if (!Py_UNICODE_ISSPACE(*p)) 3786 return PyInt_FromLong(0); 3787 } 3788 return PyInt_FromLong(1); 3789} 3790 3791static char isalpha__doc__[] = 3792"S.isalpha() -> int\n\ 3793\n\ 3794Return 1 if all characters in S are alphabetic\n\ 3795and there is at least one character in S, 0 otherwise."; 3796 3797static PyObject* 3798unicode_isalpha(PyUnicodeObject *self, PyObject *args) 3799{ 3800 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 3801 register const Py_UNICODE *e; 3802 3803 if (!PyArg_NoArgs(args)) 3804 return NULL; 3805 3806 /* Shortcut for single character strings */ 3807 if (PyUnicode_GET_SIZE(self) == 1 && 3808 Py_UNICODE_ISALPHA(*p)) 3809 return PyInt_FromLong(1); 3810 3811 /* Special case for empty strings */ 3812 if (PyString_GET_SIZE(self) == 0) 3813 return PyInt_FromLong(0); 3814 3815 e = p + PyUnicode_GET_SIZE(self); 3816 for (; p < e; p++) { 3817 if (!Py_UNICODE_ISALPHA(*p)) 3818 return PyInt_FromLong(0); 3819 } 3820 return PyInt_FromLong(1); 3821} 3822 3823static char isalnum__doc__[] = 3824"S.isalnum() -> int\n\ 3825\n\ 3826Return 1 if all characters in S are alphanumeric\n\ 3827and there is at least one character in S, 0 otherwise."; 3828 3829static PyObject* 3830unicode_isalnum(PyUnicodeObject *self, PyObject *args) 3831{ 3832 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 3833 register const Py_UNICODE *e; 3834 3835 if (!PyArg_NoArgs(args)) 3836 return NULL; 3837 3838 /* Shortcut for single character strings */ 3839 if (PyUnicode_GET_SIZE(self) == 1 && 3840 Py_UNICODE_ISALNUM(*p)) 3841 return PyInt_FromLong(1); 3842 3843 /* Special case for empty strings */ 3844 if (PyString_GET_SIZE(self) == 0) 3845 return PyInt_FromLong(0); 3846 3847 e = p + PyUnicode_GET_SIZE(self); 3848 for (; p < e; p++) { 3849 if (!Py_UNICODE_ISALNUM(*p)) 3850 return PyInt_FromLong(0); 3851 } 3852 return PyInt_FromLong(1); 3853} 3854 3855static char isdecimal__doc__[] = 3856"S.isdecimal() -> int\n\ 3857\n\ 3858Return 1 if there are only decimal characters in S,\n\ 38590 otherwise."; 3860 3861static PyObject* 3862unicode_isdecimal(PyUnicodeObject *self, PyObject *args) 3863{ 3864 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 3865 register const Py_UNICODE *e; 3866 3867 if (!PyArg_NoArgs(args)) 3868 return NULL; 3869 3870 /* Shortcut for single character strings */ 3871 if (PyUnicode_GET_SIZE(self) == 1 && 3872 Py_UNICODE_ISDECIMAL(*p)) 3873 return PyInt_FromLong(1); 3874 3875 /* Special case for empty strings */ 3876 if (PyString_GET_SIZE(self) == 0) 3877 return PyInt_FromLong(0); 3878 3879 e = p + PyUnicode_GET_SIZE(self); 3880 for (; p < e; p++) { 3881 if (!Py_UNICODE_ISDECIMAL(*p)) 3882 return PyInt_FromLong(0); 3883 } 3884 return PyInt_FromLong(1); 3885} 3886 3887static char isdigit__doc__[] = 3888"S.isdigit() -> int\n\ 3889\n\ 3890Return 1 if there are only digit characters in S,\n\ 38910 otherwise."; 3892 3893static PyObject* 3894unicode_isdigit(PyUnicodeObject *self, PyObject *args) 3895{ 3896 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 3897 register const Py_UNICODE *e; 3898 3899 if (!PyArg_NoArgs(args)) 3900 return NULL; 3901 3902 /* Shortcut for single character strings */ 3903 if (PyUnicode_GET_SIZE(self) == 1 && 3904 Py_UNICODE_ISDIGIT(*p)) 3905 return PyInt_FromLong(1); 3906 3907 /* Special case for empty strings */ 3908 if (PyString_GET_SIZE(self) == 0) 3909 return PyInt_FromLong(0); 3910 3911 e = p + PyUnicode_GET_SIZE(self); 3912 for (; p < e; p++) { 3913 if (!Py_UNICODE_ISDIGIT(*p)) 3914 return PyInt_FromLong(0); 3915 } 3916 return PyInt_FromLong(1); 3917} 3918 3919static char isnumeric__doc__[] = 3920"S.isnumeric() -> int\n\ 3921\n\ 3922Return 1 if there are only numeric characters in S,\n\ 39230 otherwise."; 3924 3925static PyObject* 3926unicode_isnumeric(PyUnicodeObject *self, PyObject *args) 3927{ 3928 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); 3929 register const Py_UNICODE *e; 3930 3931 if (!PyArg_NoArgs(args)) 3932 return NULL; 3933 3934 /* Shortcut for single character strings */ 3935 if (PyUnicode_GET_SIZE(self) == 1 && 3936 Py_UNICODE_ISNUMERIC(*p)) 3937 return PyInt_FromLong(1); 3938 3939 /* Special case for empty strings */ 3940 if (PyString_GET_SIZE(self) == 0) 3941 return PyInt_FromLong(0); 3942 3943 e = p + PyUnicode_GET_SIZE(self); 3944 for (; p < e; p++) { 3945 if (!Py_UNICODE_ISNUMERIC(*p)) 3946 return PyInt_FromLong(0); 3947 } 3948 return PyInt_FromLong(1); 3949} 3950 3951static char join__doc__[] = 3952"S.join(sequence) -> unicode\n\ 3953\n\ 3954Return a string which is the concatenation of the strings in the\n\ 3955sequence. The separator between elements is S."; 3956 3957static PyObject* 3958unicode_join(PyUnicodeObject *self, PyObject *args) 3959{ 3960 PyObject *data; 3961 if (!PyArg_ParseTuple(args, "O:join", &data)) 3962 return NULL; 3963 3964 return PyUnicode_Join((PyObject *)self, data); 3965} 3966 3967static int 3968unicode_length(PyUnicodeObject *self) 3969{ 3970 return self->length; 3971} 3972 3973static char ljust__doc__[] = 3974"S.ljust(width) -> unicode\n\ 3975\n\ 3976Return S left justified in a Unicode string of length width. Padding is\n\ 3977done using spaces."; 3978 3979static PyObject * 3980unicode_ljust(PyUnicodeObject *self, PyObject *args) 3981{ 3982 int width; 3983 if (!PyArg_ParseTuple(args, "i:ljust", &width)) 3984 return NULL; 3985 3986 if (self->length >= width) { 3987 Py_INCREF(self); 3988 return (PyObject*) self; 3989 } 3990 3991 return (PyObject*) pad(self, 0, width - self->length, ' '); 3992} 3993 3994static char lower__doc__[] = 3995"S.lower() -> unicode\n\ 3996\n\ 3997Return a copy of the string S converted to lowercase."; 3998 3999static PyObject* 4000unicode_lower(PyUnicodeObject *self, PyObject *args) 4001{ 4002 if (!PyArg_NoArgs(args)) 4003 return NULL; 4004 return fixup(self, fixlower); 4005} 4006 4007static char lstrip__doc__[] = 4008"S.lstrip() -> unicode\n\ 4009\n\ 4010Return a copy of the string S with leading whitespace removed."; 4011 4012static PyObject * 4013unicode_lstrip(PyUnicodeObject *self, PyObject *args) 4014{ 4015 if (!PyArg_NoArgs(args)) 4016 return NULL; 4017 return strip(self, 1, 0); 4018} 4019 4020static PyObject* 4021unicode_repeat(PyUnicodeObject *str, int len) 4022{ 4023 PyUnicodeObject *u; 4024 Py_UNICODE *p; 4025 int nchars; 4026 size_t nbytes; 4027 4028 if (len < 0) 4029 len = 0; 4030 4031 if (len == 1) { 4032 /* no repeat, return original string */ 4033 Py_INCREF(str); 4034 return (PyObject*) str; 4035 } 4036 4037 /* ensure # of chars needed doesn't overflow int and # of bytes 4038 * needed doesn't overflow size_t 4039 */ 4040 nchars = len * str->length; 4041 if (len && nchars / len != str->length) { 4042 PyErr_SetString(PyExc_OverflowError, 4043 "repeated string is too long"); 4044 return NULL; 4045 } 4046 nbytes = (nchars + 1) * sizeof(Py_UNICODE); 4047 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) { 4048 PyErr_SetString(PyExc_OverflowError, 4049 "repeated string is too long"); 4050 return NULL; 4051 } 4052 u = _PyUnicode_New(nchars); 4053 if (!u) 4054 return NULL; 4055 4056 p = u->str; 4057 4058 while (len-- > 0) { 4059 Py_UNICODE_COPY(p, str->str, str->length); 4060 p += str->length; 4061 } 4062 4063 return (PyObject*) u; 4064} 4065 4066PyObject *PyUnicode_Replace(PyObject *obj, 4067 PyObject *subobj, 4068 PyObject *replobj, 4069 int maxcount) 4070{ 4071 PyObject *self; 4072 PyObject *str1; 4073 PyObject *str2; 4074 PyObject *result; 4075 4076 self = PyUnicode_FromObject(obj); 4077 if (self == NULL) 4078 return NULL; 4079 str1 = PyUnicode_FromObject(subobj); 4080 if (str1 == NULL) { 4081 Py_DECREF(self); 4082 return NULL; 4083 } 4084 str2 = PyUnicode_FromObject(replobj); 4085 if (str2 == NULL) { 4086 Py_DECREF(self); 4087 Py_DECREF(str1); 4088 return NULL; 4089 } 4090 result = replace((PyUnicodeObject *)self, 4091 (PyUnicodeObject *)str1, 4092 (PyUnicodeObject *)str2, 4093 maxcount); 4094 Py_DECREF(self); 4095 Py_DECREF(str1); 4096 Py_DECREF(str2); 4097 return result; 4098} 4099 4100static char replace__doc__[] = 4101"S.replace (old, new[, maxsplit]) -> unicode\n\ 4102\n\ 4103Return a copy of S with all occurrences of substring\n\ 4104old replaced by new. If the optional argument maxsplit is\n\ 4105given, only the first maxsplit occurrences are replaced."; 4106 4107static PyObject* 4108unicode_replace(PyUnicodeObject *self, PyObject *args) 4109{ 4110 PyUnicodeObject *str1; 4111 PyUnicodeObject *str2; 4112 int maxcount = -1; 4113 PyObject *result; 4114 4115 if (!PyArg_ParseTuple(args, "OO|i:replace", &str1, &str2, &maxcount)) 4116 return NULL; 4117 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1); 4118 if (str1 == NULL) 4119 return NULL; 4120 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2); 4121 if (str2 == NULL) 4122 return NULL; 4123 4124 result = replace(self, str1, str2, maxcount); 4125 4126 Py_DECREF(str1); 4127 Py_DECREF(str2); 4128 return result; 4129} 4130 4131static 4132PyObject *unicode_repr(PyObject *unicode) 4133{ 4134 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode), 4135 PyUnicode_GET_SIZE(unicode), 4136 1); 4137} 4138 4139static char rfind__doc__[] = 4140"S.rfind(sub [,start [,end]]) -> int\n\ 4141\n\ 4142Return the highest index in S where substring sub is found,\n\ 4143such that sub is contained within s[start,end]. Optional\n\ 4144arguments start and end are interpreted as in slice notation.\n\ 4145\n\ 4146Return -1 on failure."; 4147 4148static PyObject * 4149unicode_rfind(PyUnicodeObject *self, PyObject *args) 4150{ 4151 PyUnicodeObject *substring; 4152 int start = 0; 4153 int end = INT_MAX; 4154 PyObject *result; 4155 4156 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring, 4157 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end)) 4158 return NULL; 4159 substring = (PyUnicodeObject *)PyUnicode_FromObject( 4160 (PyObject *)substring); 4161 if (substring == NULL) 4162 return NULL; 4163 4164 result = PyInt_FromLong(findstring(self, substring, start, end, -1)); 4165 4166 Py_DECREF(substring); 4167 return result; 4168} 4169 4170static char rindex__doc__[] = 4171"S.rindex(sub [,start [,end]]) -> int\n\ 4172\n\ 4173Like S.rfind() but raise ValueError when the substring is not found."; 4174 4175static PyObject * 4176unicode_rindex(PyUnicodeObject *self, PyObject *args) 4177{ 4178 int result; 4179 PyUnicodeObject *substring; 4180 int start = 0; 4181 int end = INT_MAX; 4182 4183 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring, 4184 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end)) 4185 return NULL; 4186 substring = (PyUnicodeObject *)PyUnicode_FromObject( 4187 (PyObject *)substring); 4188 if (substring == NULL) 4189 return NULL; 4190 4191 result = findstring(self, substring, start, end, -1); 4192 4193 Py_DECREF(substring); 4194 if (result < 0) { 4195 PyErr_SetString(PyExc_ValueError, "substring not found"); 4196 return NULL; 4197 } 4198 return PyInt_FromLong(result); 4199} 4200 4201static char rjust__doc__[] = 4202"S.rjust(width) -> unicode\n\ 4203\n\ 4204Return S right justified in a Unicode string of length width. Padding is\n\ 4205done using spaces."; 4206 4207static PyObject * 4208unicode_rjust(PyUnicodeObject *self, PyObject *args) 4209{ 4210 int width; 4211 if (!PyArg_ParseTuple(args, "i:rjust", &width)) 4212 return NULL; 4213 4214 if (self->length >= width) { 4215 Py_INCREF(self); 4216 return (PyObject*) self; 4217 } 4218 4219 return (PyObject*) pad(self, width - self->length, 0, ' '); 4220} 4221 4222static char rstrip__doc__[] = 4223"S.rstrip() -> unicode\n\ 4224\n\ 4225Return a copy of the string S with trailing whitespace removed."; 4226 4227static PyObject * 4228unicode_rstrip(PyUnicodeObject *self, PyObject *args) 4229{ 4230 if (!PyArg_NoArgs(args)) 4231 return NULL; 4232 return strip(self, 0, 1); 4233} 4234 4235static PyObject* 4236unicode_slice(PyUnicodeObject *self, int start, int end) 4237{ 4238 /* standard clamping */ 4239 if (start < 0) 4240 start = 0; 4241 if (end < 0) 4242 end = 0; 4243 if (end > self->length) 4244 end = self->length; 4245 if (start == 0 && end == self->length) { 4246 /* full slice, return original string */ 4247 Py_INCREF(self); 4248 return (PyObject*) self; 4249 } 4250 if (start > end) 4251 start = end; 4252 /* copy slice */ 4253 return (PyObject*) PyUnicode_FromUnicode(self->str + start, 4254 end - start); 4255} 4256 4257PyObject *PyUnicode_Split(PyObject *s, 4258 PyObject *sep, 4259 int maxsplit) 4260{ 4261 PyObject *result; 4262 4263 s = PyUnicode_FromObject(s); 4264 if (s == NULL) 4265 return NULL; 4266 if (sep != NULL) { 4267 sep = PyUnicode_FromObject(sep); 4268 if (sep == NULL) { 4269 Py_DECREF(s); 4270 return NULL; 4271 } 4272 } 4273 4274 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit); 4275 4276 Py_DECREF(s); 4277 Py_XDECREF(sep); 4278 return result; 4279} 4280 4281static char split__doc__[] = 4282"S.split([sep [,maxsplit]]) -> list of strings\n\ 4283\n\ 4284Return a list of the words in S, using sep as the\n\ 4285delimiter string. If maxsplit is given, at most maxsplit\n\ 4286splits are done. If sep is not specified, any whitespace string\n\ 4287is a separator."; 4288 4289static PyObject* 4290unicode_split(PyUnicodeObject *self, PyObject *args) 4291{ 4292 PyObject *substring = Py_None; 4293 int maxcount = -1; 4294 4295 if (!PyArg_ParseTuple(args, "|Oi:split", &substring, &maxcount)) 4296 return NULL; 4297 4298 if (substring == Py_None) 4299 return split(self, NULL, maxcount); 4300 else if (PyUnicode_Check(substring)) 4301 return split(self, (PyUnicodeObject *)substring, maxcount); 4302 else 4303 return PyUnicode_Split((PyObject *)self, substring, maxcount); 4304} 4305 4306static char splitlines__doc__[] = 4307"S.splitlines([keepends]]) -> list of strings\n\ 4308\n\ 4309Return a list of the lines in S, breaking at line boundaries.\n\ 4310Line breaks are not included in the resulting list unless keepends\n\ 4311is given and true."; 4312 4313static PyObject* 4314unicode_splitlines(PyUnicodeObject *self, PyObject *args) 4315{ 4316 int keepends = 0; 4317 4318 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends)) 4319 return NULL; 4320 4321 return PyUnicode_Splitlines((PyObject *)self, keepends); 4322} 4323 4324static 4325PyObject *unicode_str(PyUnicodeObject *self) 4326{ 4327 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL); 4328} 4329 4330static char strip__doc__[] = 4331"S.strip() -> unicode\n\ 4332\n\ 4333Return a copy of S with leading and trailing whitespace removed."; 4334 4335static PyObject * 4336unicode_strip(PyUnicodeObject *self, PyObject *args) 4337{ 4338 if (!PyArg_NoArgs(args)) 4339 return NULL; 4340 return strip(self, 1, 1); 4341} 4342 4343static char swapcase__doc__[] = 4344"S.swapcase() -> unicode\n\ 4345\n\ 4346Return a copy of S with uppercase characters converted to lowercase\n\ 4347and vice versa."; 4348 4349static PyObject* 4350unicode_swapcase(PyUnicodeObject *self, PyObject *args) 4351{ 4352 if (!PyArg_NoArgs(args)) 4353 return NULL; 4354 return fixup(self, fixswapcase); 4355} 4356 4357static char translate__doc__[] = 4358"S.translate(table) -> unicode\n\ 4359\n\ 4360Return a copy of the string S, where all characters have been mapped\n\ 4361through the given translation table, which must be a mapping of\n\ 4362Unicode ordinals to Unicode ordinals or None. Unmapped characters\n\ 4363are left untouched. Characters mapped to None are deleted."; 4364 4365static PyObject* 4366unicode_translate(PyUnicodeObject *self, PyObject *args) 4367{ 4368 PyObject *table; 4369 4370 if (!PyArg_ParseTuple(args, "O:translate", &table)) 4371 return NULL; 4372 return PyUnicode_TranslateCharmap(self->str, 4373 self->length, 4374 table, 4375 "ignore"); 4376} 4377 4378static char upper__doc__[] = 4379"S.upper() -> unicode\n\ 4380\n\ 4381Return a copy of S converted to uppercase."; 4382 4383static PyObject* 4384unicode_upper(PyUnicodeObject *self, PyObject *args) 4385{ 4386 if (!PyArg_NoArgs(args)) 4387 return NULL; 4388 return fixup(self, fixupper); 4389} 4390 4391#if 0 4392static char zfill__doc__[] = 4393"S.zfill(width) -> unicode\n\ 4394\n\ 4395Pad a numeric string x with zeros on the left, to fill a field\n\ 4396of the specified width. The string x is never truncated."; 4397 4398static PyObject * 4399unicode_zfill(PyUnicodeObject *self, PyObject *args) 4400{ 4401 int fill; 4402 PyUnicodeObject *u; 4403 4404 int width; 4405 if (!PyArg_ParseTuple(args, "i:zfill", &width)) 4406 return NULL; 4407 4408 if (self->length >= width) { 4409 Py_INCREF(self); 4410 return (PyObject*) self; 4411 } 4412 4413 fill = width - self->length; 4414 4415 u = pad(self, fill, 0, '0'); 4416 4417 if (u->str[fill] == '+' || u->str[fill] == '-') { 4418 /* move sign to beginning of string */ 4419 u->str[0] = u->str[fill]; 4420 u->str[fill] = '0'; 4421 } 4422 4423 return (PyObject*) u; 4424} 4425#endif 4426 4427#if 0 4428static PyObject* 4429unicode_freelistsize(PyUnicodeObject *self, PyObject *args) 4430{ 4431 if (!PyArg_NoArgs(args)) 4432 return NULL; 4433 return PyInt_FromLong(unicode_freelist_size); 4434} 4435#endif 4436 4437static char startswith__doc__[] = 4438"S.startswith(prefix[, start[, end]]) -> int\n\ 4439\n\ 4440Return 1 if S starts with the specified prefix, otherwise return 0. With\n\ 4441optional start, test S beginning at that position. With optional end, stop\n\ 4442comparing S at that position."; 4443 4444static PyObject * 4445unicode_startswith(PyUnicodeObject *self, 4446 PyObject *args) 4447{ 4448 PyUnicodeObject *substring; 4449 int start = 0; 4450 int end = INT_MAX; 4451 PyObject *result; 4452 4453 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &substring, 4454 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end)) 4455 return NULL; 4456 substring = (PyUnicodeObject *)PyUnicode_FromObject( 4457 (PyObject *)substring); 4458 if (substring == NULL) 4459 return NULL; 4460 4461 result = PyInt_FromLong(tailmatch(self, substring, start, end, -1)); 4462 4463 Py_DECREF(substring); 4464 return result; 4465} 4466 4467 4468static char endswith__doc__[] = 4469"S.endswith(suffix[, start[, end]]) -> int\n\ 4470\n\ 4471Return 1 if S ends with the specified suffix, otherwise return 0. With\n\ 4472optional start, test S beginning at that position. With optional end, stop\n\ 4473comparing S at that position."; 4474 4475static PyObject * 4476unicode_endswith(PyUnicodeObject *self, 4477 PyObject *args) 4478{ 4479 PyUnicodeObject *substring; 4480 int start = 0; 4481 int end = INT_MAX; 4482 PyObject *result; 4483 4484 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &substring, 4485 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end)) 4486 return NULL; 4487 substring = (PyUnicodeObject *)PyUnicode_FromObject( 4488 (PyObject *)substring); 4489 if (substring == NULL) 4490 return NULL; 4491 4492 result = PyInt_FromLong(tailmatch(self, substring, start, end, +1)); 4493 4494 Py_DECREF(substring); 4495 return result; 4496} 4497 4498 4499static PyMethodDef unicode_methods[] = { 4500 4501 /* Order is according to common usage: often used methods should 4502 appear first, since lookup is done sequentially. */ 4503 4504 {"encode", (PyCFunction) unicode_encode, 1, encode__doc__}, 4505 {"replace", (PyCFunction) unicode_replace, 1, replace__doc__}, 4506 {"split", (PyCFunction) unicode_split, 1, split__doc__}, 4507 {"join", (PyCFunction) unicode_join, 1, join__doc__}, 4508 {"capitalize", (PyCFunction) unicode_capitalize, 0, capitalize__doc__}, 4509 {"title", (PyCFunction) unicode_title, 0, title__doc__}, 4510 {"center", (PyCFunction) unicode_center, 1, center__doc__}, 4511 {"count", (PyCFunction) unicode_count, 1, count__doc__}, 4512 {"expandtabs", (PyCFunction) unicode_expandtabs, 1, expandtabs__doc__}, 4513 {"find", (PyCFunction) unicode_find, 1, find__doc__}, 4514 {"index", (PyCFunction) unicode_index, 1, index__doc__}, 4515 {"ljust", (PyCFunction) unicode_ljust, 1, ljust__doc__}, 4516 {"lower", (PyCFunction) unicode_lower, 0, lower__doc__}, 4517 {"lstrip", (PyCFunction) unicode_lstrip, 0, lstrip__doc__}, 4518/* {"maketrans", (PyCFunction) unicode_maketrans, 1, maketrans__doc__}, */ 4519 {"rfind", (PyCFunction) unicode_rfind, 1, rfind__doc__}, 4520 {"rindex", (PyCFunction) unicode_rindex, 1, rindex__doc__}, 4521 {"rjust", (PyCFunction) unicode_rjust, 1, rjust__doc__}, 4522 {"rstrip", (PyCFunction) unicode_rstrip, 0, rstrip__doc__}, 4523 {"splitlines", (PyCFunction) unicode_splitlines, 1, splitlines__doc__}, 4524 {"strip", (PyCFunction) unicode_strip, 0, strip__doc__}, 4525 {"swapcase", (PyCFunction) unicode_swapcase, 0, swapcase__doc__}, 4526 {"translate", (PyCFunction) unicode_translate, 1, translate__doc__}, 4527 {"upper", (PyCFunction) unicode_upper, 0, upper__doc__}, 4528 {"startswith", (PyCFunction) unicode_startswith, 1, startswith__doc__}, 4529 {"endswith", (PyCFunction) unicode_endswith, 1, endswith__doc__}, 4530 {"islower", (PyCFunction) unicode_islower, 0, islower__doc__}, 4531 {"isupper", (PyCFunction) unicode_isupper, 0, isupper__doc__}, 4532 {"istitle", (PyCFunction) unicode_istitle, 0, istitle__doc__}, 4533 {"isspace", (PyCFunction) unicode_isspace, 0, isspace__doc__}, 4534 {"isdecimal", (PyCFunction) unicode_isdecimal, 0, isdecimal__doc__}, 4535 {"isdigit", (PyCFunction) unicode_isdigit, 0, isdigit__doc__}, 4536 {"isnumeric", (PyCFunction) unicode_isnumeric, 0, isnumeric__doc__}, 4537 {"isalpha", (PyCFunction) unicode_isalpha, 0, isalpha__doc__}, 4538 {"isalnum", (PyCFunction) unicode_isalnum, 0, isalnum__doc__}, 4539#if 0 4540 {"zfill", (PyCFunction) unicode_zfill, 1, zfill__doc__}, 4541 {"capwords", (PyCFunction) unicode_capwords, 0, capwords__doc__}, 4542#endif 4543 4544#if 0 4545 /* This one is just used for debugging the implementation. */ 4546 {"freelistsize", (PyCFunction) unicode_freelistsize, 0}, 4547#endif 4548 4549 {NULL, NULL} 4550}; 4551 4552static PyObject * 4553unicode_getattr(PyUnicodeObject *self, char *name) 4554{ 4555 return Py_FindMethod(unicode_methods, (PyObject*) self, name); 4556} 4557 4558static PySequenceMethods unicode_as_sequence = { 4559 (inquiry) unicode_length, /* sq_length */ 4560 (binaryfunc) PyUnicode_Concat, /* sq_concat */ 4561 (intargfunc) unicode_repeat, /* sq_repeat */ 4562 (intargfunc) unicode_getitem, /* sq_item */ 4563 (intintargfunc) unicode_slice, /* sq_slice */ 4564 0, /* sq_ass_item */ 4565 0, /* sq_ass_slice */ 4566 (objobjproc)PyUnicode_Contains, /*sq_contains*/ 4567}; 4568 4569static int 4570unicode_buffer_getreadbuf(PyUnicodeObject *self, 4571 int index, 4572 const void **ptr) 4573{ 4574 if (index != 0) { 4575 PyErr_SetString(PyExc_SystemError, 4576 "accessing non-existent unicode segment"); 4577 return -1; 4578 } 4579 *ptr = (void *) self->str; 4580 return PyUnicode_GET_DATA_SIZE(self); 4581} 4582 4583static int 4584unicode_buffer_getwritebuf(PyUnicodeObject *self, int index, 4585 const void **ptr) 4586{ 4587 PyErr_SetString(PyExc_TypeError, 4588 "cannot use unicode as modifyable buffer"); 4589 return -1; 4590} 4591 4592static int 4593unicode_buffer_getsegcount(PyUnicodeObject *self, 4594 int *lenp) 4595{ 4596 if (lenp) 4597 *lenp = PyUnicode_GET_DATA_SIZE(self); 4598 return 1; 4599} 4600 4601static int 4602unicode_buffer_getcharbuf(PyUnicodeObject *self, 4603 int index, 4604 const void **ptr) 4605{ 4606 PyObject *str; 4607 4608 if (index != 0) { 4609 PyErr_SetString(PyExc_SystemError, 4610 "accessing non-existent unicode segment"); 4611 return -1; 4612 } 4613 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL); 4614 if (str == NULL) 4615 return -1; 4616 *ptr = (void *) PyString_AS_STRING(str); 4617 return PyString_GET_SIZE(str); 4618} 4619 4620/* Helpers for PyUnicode_Format() */ 4621 4622static PyObject * 4623getnextarg(PyObject *args, int arglen, int *p_argidx) 4624{ 4625 int argidx = *p_argidx; 4626 if (argidx < arglen) { 4627 (*p_argidx)++; 4628 if (arglen < 0) 4629 return args; 4630 else 4631 return PyTuple_GetItem(args, argidx); 4632 } 4633 PyErr_SetString(PyExc_TypeError, 4634 "not enough arguments for format string"); 4635 return NULL; 4636} 4637 4638#define F_LJUST (1<<0) 4639#define F_SIGN (1<<1) 4640#define F_BLANK (1<<2) 4641#define F_ALT (1<<3) 4642#define F_ZERO (1<<4) 4643 4644static 4645int usprintf(register Py_UNICODE *buffer, char *format, ...) 4646{ 4647 register int i; 4648 int len; 4649 va_list va; 4650 char *charbuffer; 4651 va_start(va, format); 4652 4653 /* First, format the string as char array, then expand to Py_UNICODE 4654 array. */ 4655 charbuffer = (char *)buffer; 4656 len = vsprintf(charbuffer, format, va); 4657 for (i = len - 1; i >= 0; i--) 4658 buffer[i] = (Py_UNICODE) charbuffer[i]; 4659 4660 va_end(va); 4661 return len; 4662} 4663 4664static int 4665formatfloat(Py_UNICODE *buf, 4666 size_t buflen, 4667 int flags, 4668 int prec, 4669 int type, 4670 PyObject *v) 4671{ 4672 /* fmt = '%#.' + `prec` + `type` 4673 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/ 4674 char fmt[20]; 4675 double x; 4676 4677 x = PyFloat_AsDouble(v); 4678 if (x == -1.0 && PyErr_Occurred()) 4679 return -1; 4680 if (prec < 0) 4681 prec = 6; 4682 if (type == 'f' && (fabs(x) / 1e25) >= 1e25) 4683 type = 'g'; 4684 sprintf(fmt, "%%%s.%d%c", (flags & F_ALT) ? "#" : "", prec, type); 4685 /* worst case length calc to ensure no buffer overrun: 4686 fmt = %#.<prec>g 4687 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp 4688 for any double rep.) 4689 len = 1 + prec + 1 + 2 + 5 = 9 + prec 4690 If prec=0 the effective precision is 1 (the leading digit is 4691 always given), therefore increase by one to 10+prec. */ 4692 if (buflen <= (size_t)10 + (size_t)prec) { 4693 PyErr_SetString(PyExc_OverflowError, 4694 "formatted float is too long (precision too long?)"); 4695 return -1; 4696 } 4697 return usprintf(buf, fmt, x); 4698} 4699 4700static PyObject* 4701formatlong(PyObject *val, int flags, int prec, int type) 4702{ 4703 char *buf; 4704 int i, len; 4705 PyObject *str; /* temporary string object. */ 4706 PyUnicodeObject *result; 4707 4708 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len); 4709 if (!str) 4710 return NULL; 4711 result = _PyUnicode_New(len); 4712 for (i = 0; i < len; i++) 4713 result->str[i] = buf[i]; 4714 result->str[len] = 0; 4715 Py_DECREF(str); 4716 return (PyObject*)result; 4717} 4718 4719static int 4720formatint(Py_UNICODE *buf, 4721 size_t buflen, 4722 int flags, 4723 int prec, 4724 int type, 4725 PyObject *v) 4726{ 4727 /* fmt = '%#.' + `prec` + 'l' + `type` 4728 worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine) 4729 + 1 + 1 = 24*/ 4730 char fmt[64]; /* plenty big enough! */ 4731 long x; 4732 4733 x = PyInt_AsLong(v); 4734 if (x == -1 && PyErr_Occurred()) 4735 return -1; 4736 if (prec < 0) 4737 prec = 1; 4738 /* buf = '+'/'-'/'0'/'0x' + '[0-9]'*max(prec,len(x in octal)) 4739 worst case buf = '0x' + [0-9]*prec, where prec >= 11 */ 4740 if (buflen <= 13 || buflen <= (size_t)2+(size_t)prec) { 4741 PyErr_SetString(PyExc_OverflowError, 4742 "formatted integer is too long (precision too long?)"); 4743 return -1; 4744 } 4745 sprintf(fmt, "%%%s.%dl%c", (flags & F_ALT) ? "#" : "", prec, type); 4746 return usprintf(buf, fmt, x); 4747} 4748 4749static int 4750formatchar(Py_UNICODE *buf, 4751 size_t buflen, 4752 PyObject *v) 4753{ 4754 /* presume that the buffer is at least 2 characters long */ 4755 if (PyUnicode_Check(v)) { 4756 if (PyUnicode_GET_SIZE(v) != 1) 4757 goto onError; 4758 buf[0] = PyUnicode_AS_UNICODE(v)[0]; 4759 } 4760 4761 else if (PyString_Check(v)) { 4762 if (PyString_GET_SIZE(v) != 1) 4763 goto onError; 4764 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0]; 4765 } 4766 4767 else { 4768 /* Integer input truncated to a character */ 4769 long x; 4770 x = PyInt_AsLong(v); 4771 if (x == -1 && PyErr_Occurred()) 4772 goto onError; 4773 buf[0] = (char) x; 4774 } 4775 buf[1] = '\0'; 4776 return 1; 4777 4778 onError: 4779 PyErr_SetString(PyExc_TypeError, 4780 "%c requires int or char"); 4781 return -1; 4782} 4783 4784/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...) 4785 4786 FORMATBUFLEN is the length of the buffer in which the floats, ints, & 4787 chars are formatted. XXX This is a magic number. Each formatting 4788 routine does bounds checking to ensure no overflow, but a better 4789 solution may be to malloc a buffer of appropriate size for each 4790 format. For now, the current solution is sufficient. 4791*/ 4792#define FORMATBUFLEN (size_t)120 4793 4794PyObject *PyUnicode_Format(PyObject *format, 4795 PyObject *args) 4796{ 4797 Py_UNICODE *fmt, *res; 4798 int fmtcnt, rescnt, reslen, arglen, argidx; 4799 int args_owned = 0; 4800 PyUnicodeObject *result = NULL; 4801 PyObject *dict = NULL; 4802 PyObject *uformat; 4803 4804 if (format == NULL || args == NULL) { 4805 PyErr_BadInternalCall(); 4806 return NULL; 4807 } 4808 uformat = PyUnicode_FromObject(format); 4809 if (uformat == NULL) 4810 return NULL; 4811 fmt = PyUnicode_AS_UNICODE(uformat); 4812 fmtcnt = PyUnicode_GET_SIZE(uformat); 4813 4814 reslen = rescnt = fmtcnt + 100; 4815 result = _PyUnicode_New(reslen); 4816 if (result == NULL) 4817 goto onError; 4818 res = PyUnicode_AS_UNICODE(result); 4819 4820 if (PyTuple_Check(args)) { 4821 arglen = PyTuple_Size(args); 4822 argidx = 0; 4823 } 4824 else { 4825 arglen = -1; 4826 argidx = -2; 4827 } 4828 if (args->ob_type->tp_as_mapping) 4829 dict = args; 4830 4831 while (--fmtcnt >= 0) { 4832 if (*fmt != '%') { 4833 if (--rescnt < 0) { 4834 rescnt = fmtcnt + 100; 4835 reslen += rescnt; 4836 if (_PyUnicode_Resize(result, reslen) < 0) 4837 return NULL; 4838 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt; 4839 --rescnt; 4840 } 4841 *res++ = *fmt++; 4842 } 4843 else { 4844 /* Got a format specifier */ 4845 int flags = 0; 4846 int width = -1; 4847 int prec = -1; 4848 int size = 0; 4849 Py_UNICODE c = '\0'; 4850 Py_UNICODE fill; 4851 PyObject *v = NULL; 4852 PyObject *temp = NULL; 4853 Py_UNICODE *pbuf; 4854 Py_UNICODE sign; 4855 int len; 4856 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */ 4857 4858 fmt++; 4859 if (*fmt == '(') { 4860 Py_UNICODE *keystart; 4861 int keylen; 4862 PyObject *key; 4863 int pcount = 1; 4864 4865 if (dict == NULL) { 4866 PyErr_SetString(PyExc_TypeError, 4867 "format requires a mapping"); 4868 goto onError; 4869 } 4870 ++fmt; 4871 --fmtcnt; 4872 keystart = fmt; 4873 /* Skip over balanced parentheses */ 4874 while (pcount > 0 && --fmtcnt >= 0) { 4875 if (*fmt == ')') 4876 --pcount; 4877 else if (*fmt == '(') 4878 ++pcount; 4879 fmt++; 4880 } 4881 keylen = fmt - keystart - 1; 4882 if (fmtcnt < 0 || pcount > 0) { 4883 PyErr_SetString(PyExc_ValueError, 4884 "incomplete format key"); 4885 goto onError; 4886 } 4887 /* keys are converted to strings using UTF-8 and 4888 then looked up since Python uses strings to hold 4889 variables names etc. in its namespaces and we 4890 wouldn't want to break common idioms. */ 4891 key = PyUnicode_EncodeUTF8(keystart, 4892 keylen, 4893 NULL); 4894 if (key == NULL) 4895 goto onError; 4896 if (args_owned) { 4897 Py_DECREF(args); 4898 args_owned = 0; 4899 } 4900 args = PyObject_GetItem(dict, key); 4901 Py_DECREF(key); 4902 if (args == NULL) { 4903 goto onError; 4904 } 4905 args_owned = 1; 4906 arglen = -1; 4907 argidx = -2; 4908 } 4909 while (--fmtcnt >= 0) { 4910 switch (c = *fmt++) { 4911 case '-': flags |= F_LJUST; continue; 4912 case '+': flags |= F_SIGN; continue; 4913 case ' ': flags |= F_BLANK; continue; 4914 case '#': flags |= F_ALT; continue; 4915 case '0': flags |= F_ZERO; continue; 4916 } 4917 break; 4918 } 4919 if (c == '*') { 4920 v = getnextarg(args, arglen, &argidx); 4921 if (v == NULL) 4922 goto onError; 4923 if (!PyInt_Check(v)) { 4924 PyErr_SetString(PyExc_TypeError, 4925 "* wants int"); 4926 goto onError; 4927 } 4928 width = PyInt_AsLong(v); 4929 if (width < 0) { 4930 flags |= F_LJUST; 4931 width = -width; 4932 } 4933 if (--fmtcnt >= 0) 4934 c = *fmt++; 4935 } 4936 else if (c >= '0' && c <= '9') { 4937 width = c - '0'; 4938 while (--fmtcnt >= 0) { 4939 c = *fmt++; 4940 if (c < '0' || c > '9') 4941 break; 4942 if ((width*10) / 10 != width) { 4943 PyErr_SetString(PyExc_ValueError, 4944 "width too big"); 4945 goto onError; 4946 } 4947 width = width*10 + (c - '0'); 4948 } 4949 } 4950 if (c == '.') { 4951 prec = 0; 4952 if (--fmtcnt >= 0) 4953 c = *fmt++; 4954 if (c == '*') { 4955 v = getnextarg(args, arglen, &argidx); 4956 if (v == NULL) 4957 goto onError; 4958 if (!PyInt_Check(v)) { 4959 PyErr_SetString(PyExc_TypeError, 4960 "* wants int"); 4961 goto onError; 4962 } 4963 prec = PyInt_AsLong(v); 4964 if (prec < 0) 4965 prec = 0; 4966 if (--fmtcnt >= 0) 4967 c = *fmt++; 4968 } 4969 else if (c >= '0' && c <= '9') { 4970 prec = c - '0'; 4971 while (--fmtcnt >= 0) { 4972 c = Py_CHARMASK(*fmt++); 4973 if (c < '0' || c > '9') 4974 break; 4975 if ((prec*10) / 10 != prec) { 4976 PyErr_SetString(PyExc_ValueError, 4977 "prec too big"); 4978 goto onError; 4979 } 4980 prec = prec*10 + (c - '0'); 4981 } 4982 } 4983 } /* prec */ 4984 if (fmtcnt >= 0) { 4985 if (c == 'h' || c == 'l' || c == 'L') { 4986 size = c; 4987 if (--fmtcnt >= 0) 4988 c = *fmt++; 4989 } 4990 } 4991 if (fmtcnt < 0) { 4992 PyErr_SetString(PyExc_ValueError, 4993 "incomplete format"); 4994 goto onError; 4995 } 4996 if (c != '%') { 4997 v = getnextarg(args, arglen, &argidx); 4998 if (v == NULL) 4999 goto onError; 5000 } 5001 sign = 0; 5002 fill = ' '; 5003 switch (c) { 5004 5005 case '%': 5006 pbuf = formatbuf; 5007 /* presume that buffer length is at least 1 */ 5008 pbuf[0] = '%'; 5009 len = 1; 5010 break; 5011 5012 case 's': 5013 case 'r': 5014 if (PyUnicode_Check(v) && c == 's') { 5015 temp = v; 5016 Py_INCREF(temp); 5017 } 5018 else { 5019 PyObject *unicode; 5020 if (c == 's') 5021 temp = PyObject_Str(v); 5022 else 5023 temp = PyObject_Repr(v); 5024 if (temp == NULL) 5025 goto onError; 5026 if (!PyString_Check(temp)) { 5027 /* XXX Note: this should never happen, since 5028 PyObject_Repr() and PyObject_Str() assure 5029 this */ 5030 Py_DECREF(temp); 5031 PyErr_SetString(PyExc_TypeError, 5032 "%s argument has non-string str()"); 5033 goto onError; 5034 } 5035 unicode = PyUnicode_Decode(PyString_AS_STRING(temp), 5036 PyString_GET_SIZE(temp), 5037 NULL, 5038 "strict"); 5039 Py_DECREF(temp); 5040 temp = unicode; 5041 if (temp == NULL) 5042 goto onError; 5043 } 5044 pbuf = PyUnicode_AS_UNICODE(temp); 5045 len = PyUnicode_GET_SIZE(temp); 5046 if (prec >= 0 && len > prec) 5047 len = prec; 5048 break; 5049 5050 case 'i': 5051 case 'd': 5052 case 'u': 5053 case 'o': 5054 case 'x': 5055 case 'X': 5056 if (c == 'i') 5057 c = 'd'; 5058 if (PyLong_Check(v)) { 5059 temp = formatlong(v, flags, prec, c); 5060 if (!temp) 5061 goto onError; 5062 pbuf = PyUnicode_AS_UNICODE(temp); 5063 len = PyUnicode_GET_SIZE(temp); 5064 /* unbounded ints can always produce 5065 a sign character! */ 5066 sign = 1; 5067 } 5068 else { 5069 pbuf = formatbuf; 5070 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), 5071 flags, prec, c, v); 5072 if (len < 0) 5073 goto onError; 5074 /* only d conversion is signed */ 5075 sign = c == 'd'; 5076 } 5077 if (flags & F_ZERO) 5078 fill = '0'; 5079 break; 5080 5081 case 'e': 5082 case 'E': 5083 case 'f': 5084 case 'g': 5085 case 'G': 5086 pbuf = formatbuf; 5087 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), 5088 flags, prec, c, v); 5089 if (len < 0) 5090 goto onError; 5091 sign = 1; 5092 if (flags & F_ZERO) 5093 fill = '0'; 5094 break; 5095 5096 case 'c': 5097 pbuf = formatbuf; 5098 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v); 5099 if (len < 0) 5100 goto onError; 5101 break; 5102 5103 default: 5104 PyErr_Format(PyExc_ValueError, 5105 "unsupported format character '%c' (0x%x) " 5106 "at index %i", 5107 (31<=c && c<=126) ? c : '?', 5108 c, fmt -1 - PyUnicode_AS_UNICODE(uformat)); 5109 goto onError; 5110 } 5111 if (sign) { 5112 if (*pbuf == '-' || *pbuf == '+') { 5113 sign = *pbuf++; 5114 len--; 5115 } 5116 else if (flags & F_SIGN) 5117 sign = '+'; 5118 else if (flags & F_BLANK) 5119 sign = ' '; 5120 else 5121 sign = 0; 5122 } 5123 if (width < len) 5124 width = len; 5125 if (rescnt < width + (sign != 0)) { 5126 reslen -= rescnt; 5127 rescnt = width + fmtcnt + 100; 5128 reslen += rescnt; 5129 if (_PyUnicode_Resize(result, reslen) < 0) 5130 return NULL; 5131 res = PyUnicode_AS_UNICODE(result) 5132 + reslen - rescnt; 5133 } 5134 if (sign) { 5135 if (fill != ' ') 5136 *res++ = sign; 5137 rescnt--; 5138 if (width > len) 5139 width--; 5140 } 5141 if ((flags & F_ALT) && (c == 'x' || c == 'X')) { 5142 assert(pbuf[0] == '0'); 5143 assert(pbuf[1] == c); 5144 if (fill != ' ') { 5145 *res++ = *pbuf++; 5146 *res++ = *pbuf++; 5147 } 5148 rescnt -= 2; 5149 width -= 2; 5150 if (width < 0) 5151 width = 0; 5152 len -= 2; 5153 } 5154 if (width > len && !(flags & F_LJUST)) { 5155 do { 5156 --rescnt; 5157 *res++ = fill; 5158 } while (--width > len); 5159 } 5160 if (fill == ' ') { 5161 if (sign) 5162 *res++ = sign; 5163 if ((flags & F_ALT) && (c == 'x' || c == 'X')) { 5164 assert(pbuf[0] == '0'); 5165 assert(pbuf[1] == c); 5166 *res++ = *pbuf++; 5167 *res++ = *pbuf++; 5168 } 5169 } 5170 memcpy(res, pbuf, len * sizeof(Py_UNICODE)); 5171 res += len; 5172 rescnt -= len; 5173 while (--width >= len) { 5174 --rescnt; 5175 *res++ = ' '; 5176 } 5177 if (dict && (argidx < arglen) && c != '%') { 5178 PyErr_SetString(PyExc_TypeError, 5179 "not all arguments converted"); 5180 goto onError; 5181 } 5182 Py_XDECREF(temp); 5183 } /* '%' */ 5184 } /* until end */ 5185 if (argidx < arglen && !dict) { 5186 PyErr_SetString(PyExc_TypeError, 5187 "not all arguments converted"); 5188 goto onError; 5189 } 5190 5191 if (args_owned) { 5192 Py_DECREF(args); 5193 } 5194 Py_DECREF(uformat); 5195 if (_PyUnicode_Resize(result, reslen - rescnt)) 5196 goto onError; 5197 return (PyObject *)result; 5198 5199 onError: 5200 Py_XDECREF(result); 5201 Py_DECREF(uformat); 5202 if (args_owned) { 5203 Py_DECREF(args); 5204 } 5205 return NULL; 5206} 5207 5208static PyBufferProcs unicode_as_buffer = { 5209 (getreadbufferproc) unicode_buffer_getreadbuf, 5210 (getwritebufferproc) unicode_buffer_getwritebuf, 5211 (getsegcountproc) unicode_buffer_getsegcount, 5212 (getcharbufferproc) unicode_buffer_getcharbuf, 5213}; 5214 5215PyTypeObject PyUnicode_Type = { 5216 PyObject_HEAD_INIT(&PyType_Type) 5217 0, /* ob_size */ 5218 "unicode", /* tp_name */ 5219 sizeof(PyUnicodeObject), /* tp_size */ 5220 0, /* tp_itemsize */ 5221 /* Slots */ 5222 (destructor)_PyUnicode_Free, /* tp_dealloc */ 5223 0, /* tp_print */ 5224 (getattrfunc)unicode_getattr, /* tp_getattr */ 5225 0, /* tp_setattr */ 5226 (cmpfunc) unicode_compare, /* tp_compare */ 5227 (reprfunc) unicode_repr, /* tp_repr */ 5228 0, /* tp_as_number */ 5229 &unicode_as_sequence, /* tp_as_sequence */ 5230 0, /* tp_as_mapping */ 5231 (hashfunc) unicode_hash, /* tp_hash*/ 5232 0, /* tp_call*/ 5233 (reprfunc) unicode_str, /* tp_str */ 5234 (getattrofunc) NULL, /* tp_getattro */ 5235 (setattrofunc) NULL, /* tp_setattro */ 5236 &unicode_as_buffer, /* tp_as_buffer */ 5237 Py_TPFLAGS_DEFAULT, /* tp_flags */ 5238}; 5239 5240/* Initialize the Unicode implementation */ 5241 5242void _PyUnicode_Init(void) 5243{ 5244 /* Doublecheck the configuration... */ 5245 if (sizeof(Py_UNICODE) != 2) 5246 Py_FatalError("Unicode configuration error: " 5247 "sizeof(Py_UNICODE) != 2 bytes"); 5248 5249 /* Init the implementation */ 5250 unicode_freelist = NULL; 5251 unicode_freelist_size = 0; 5252 unicode_empty = _PyUnicode_New(0); 5253 strcpy(unicode_default_encoding, "ascii"); 5254} 5255 5256/* Finalize the Unicode implementation */ 5257 5258void 5259_PyUnicode_Fini(void) 5260{ 5261 PyUnicodeObject *u; 5262 5263 Py_XDECREF(unicode_empty); 5264 unicode_empty = NULL; 5265 5266 for (u = unicode_freelist; u != NULL;) { 5267 PyUnicodeObject *v = u; 5268 u = *(PyUnicodeObject **)u; 5269 if (v->str) 5270 PyMem_DEL(v->str); 5271 Py_XDECREF(v->defenc); 5272 PyObject_DEL(v); 5273 } 5274 unicode_freelist = NULL; 5275 unicode_freelist_size = 0; 5276} 5277