unicodeobject.c revision 794d567b173e4cc10ad233aeb8743283ea9c3e6b
1/* 2 3Unicode implementation based on original code by Fredrik Lundh, 4modified by Marc-Andre Lemburg <mal@lemburg.com>. 5 6Major speed upgrades to the method implementations at the Reykjavik 7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke. 8 9Copyright (c) Corporation for National Research Initiatives. 10 11-------------------------------------------------------------------- 12The original string type implementation is: 13 14 Copyright (c) 1999 by Secret Labs AB 15 Copyright (c) 1999 by Fredrik Lundh 16 17By obtaining, using, and/or copying this software and/or its 18associated documentation, you agree that you have read, understood, 19and will comply with the following terms and conditions: 20 21Permission to use, copy, modify, and distribute this software and its 22associated documentation for any purpose and without fee is hereby 23granted, provided that the above copyright notice appears in all 24copies, and that both that copyright notice and this permission notice 25appear in supporting documentation, and that the name of Secret Labs 26AB or the author not be used in advertising or publicity pertaining to 27distribution of the software without specific, written prior 28permission. 29 30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO 31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND 32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR 33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT 36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
37-------------------------------------------------------------------- 38 39*/ 40 41#define PY_SSIZE_T_CLEAN 42#include "Python.h" 43#include "ucnhash.h" 44 45#ifdef MS_WINDOWS 46#include <windows.h> 47#endif 48 49#ifdef Py_DEBUG 50# define DONT_MAKE_RESULT_READY 51#endif 52 53/* Limit for the Unicode object free list */ 54 55#define PyUnicode_MAXFREELIST 1024 56 57/* Limit for the Unicode object free list stay alive optimization. 58 59 The implementation will keep allocated Unicode memory intact for 60 all objects on the free list having a size less than this 61 limit. This reduces malloc() overhead for small Unicode objects. 62 63 At worst this will result in PyUnicode_MAXFREELIST * 64 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT + 65 malloc()-overhead) bytes of unused garbage. 66 67 Setting the limit to 0 effectively turns the feature off. 68 69 Note: This is an experimental feature ! If you get core dumps when 70 using Unicode objects, turn this feature off. 71 72*/ 73 74#define KEEPALIVE_SIZE_LIMIT 9 75 76/* Endianness switches; defaults to little endian */ 77 78#ifdef WORDS_BIGENDIAN 79# define BYTEORDER_IS_BIG_ENDIAN 80#else 81# define BYTEORDER_IS_LITTLE_ENDIAN 82#endif 83 84/* --- Globals ------------------------------------------------------------ 85 86 The globals are initialized by the _PyUnicode_Init() API and should 87 not be used before calling that API. 88 89*/ 90 91 92#ifdef __cplusplus 93extern "C" { 94#endif 95 96#ifdef Py_DEBUG 97# define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0) 98#else 99# define _PyUnicode_CHECK(op) PyUnicode_Check(op) 100#endif 101 102#define _PyUnicode_UTF8(op) \ 103 (((PyCompactUnicodeObject*)(op))->utf8) 104#define PyUnicode_UTF8(op) \ 105 (assert(_PyUnicode_CHECK(op)), \ 106 assert(PyUnicode_IS_READY(op)), \ 107 PyUnicode_IS_COMPACT_ASCII(op) ? 
\ 108 ((char*)((PyASCIIObject*)(op) + 1)) : \ 109 _PyUnicode_UTF8(op)) 110#define _PyUnicode_UTF8_LENGTH(op) \ 111 (((PyCompactUnicodeObject*)(op))->utf8_length) 112#define PyUnicode_UTF8_LENGTH(op) \ 113 (assert(_PyUnicode_CHECK(op)), \ 114 assert(PyUnicode_IS_READY(op)), \ 115 PyUnicode_IS_COMPACT_ASCII(op) ? \ 116 ((PyASCIIObject*)(op))->length : \ 117 _PyUnicode_UTF8_LENGTH(op)) 118#define _PyUnicode_WSTR(op) \ 119 (((PyASCIIObject*)(op))->wstr) 120#define _PyUnicode_WSTR_LENGTH(op) \ 121 (((PyCompactUnicodeObject*)(op))->wstr_length) 122#define _PyUnicode_LENGTH(op) \ 123 (((PyASCIIObject *)(op))->length) 124#define _PyUnicode_STATE(op) \ 125 (((PyASCIIObject *)(op))->state) 126#define _PyUnicode_HASH(op) \ 127 (((PyASCIIObject *)(op))->hash) 128#define _PyUnicode_KIND(op) \ 129 (assert(_PyUnicode_CHECK(op)), \ 130 ((PyASCIIObject *)(op))->state.kind) 131#define _PyUnicode_GET_LENGTH(op) \ 132 (assert(_PyUnicode_CHECK(op)), \ 133 ((PyASCIIObject *)(op))->length) 134#define _PyUnicode_DATA_ANY(op) \ 135 (((PyUnicodeObject*)(op))->data.any) 136 137#undef PyUnicode_READY 138#define PyUnicode_READY(op) \ 139 (assert(_PyUnicode_CHECK(op)), \ 140 (PyUnicode_IS_READY(op) ? \ 141 0 : \ 142 _PyUnicode_Ready((PyObject *)(op)))) 143 144#define _PyUnicode_READY_REPLACE(p_obj) \ 145 (assert(_PyUnicode_CHECK(*p_obj)), \ 146 (PyUnicode_IS_READY(*p_obj) ? 
\ 147 0 : _PyUnicode_ReadyReplace((PyObject **)(p_obj)))) 148 149#define _PyUnicode_SHARE_UTF8(op) \ 150 (assert(_PyUnicode_CHECK(op)), \ 151 assert(!PyUnicode_IS_COMPACT_ASCII(op)), \ 152 (_PyUnicode_UTF8(op) == PyUnicode_DATA(op))) 153#define _PyUnicode_SHARE_WSTR(op) \ 154 (assert(_PyUnicode_CHECK(op)), \ 155 (_PyUnicode_WSTR(unicode) == PyUnicode_DATA(op))) 156 157/* true if the Unicode object has an allocated UTF-8 memory block 158 (not shared with other data) */ 159#define _PyUnicode_HAS_UTF8_MEMORY(op) \ 160 (assert(_PyUnicode_CHECK(op)), \ 161 (!PyUnicode_IS_COMPACT_ASCII(op) \ 162 && _PyUnicode_UTF8(op) \ 163 && _PyUnicode_UTF8(op) != PyUnicode_DATA(op))) 164 165/* true if the Unicode object has an allocated wstr memory block 166 (not shared with other data) */ 167#define _PyUnicode_HAS_WSTR_MEMORY(op) \ 168 (assert(_PyUnicode_CHECK(op)), \ 169 (_PyUnicode_WSTR(op) && \ 170 (!PyUnicode_IS_READY(op) || \ 171 _PyUnicode_WSTR(op) != PyUnicode_DATA(op)))) 172 173/* Generic helper macro to convert characters of different types. 174 from_type and to_type have to be valid type names, begin and end 175 are pointers to the source characters which should be of type 176 "from_type *". to is a pointer of type "to_type *" and points to the 177 buffer where the result characters are written to. */ 178#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \ 179 do { \ 180 const from_type *iter_; to_type *to_; \ 181 for (iter_ = (begin), to_ = (to_type *)(to); \ 182 iter_ < (end); \ 183 ++iter_, ++to_) { \ 184 *to_ = (to_type)*iter_; \ 185 } \ 186 } while (0) 187 188/* The Unicode string has been modified: reset the hash */ 189#define _PyUnicode_DIRTY(op) do { _PyUnicode_HASH(op) = -1; } while (0) 190 191/* This dictionary holds all interned unicode strings. Note that references 192 to strings in this dictionary are *not* counted in the string's ob_refcnt. 
193 When the interned string reaches a refcnt of 0 the string deallocation 194 function will delete the reference from this dictionary. 195 196 Another way to look at this is that to say that the actual reference 197 count of a string is: s->ob_refcnt + (s->state ? 2 : 0) 198*/ 199static PyObject *interned; 200 201/* The empty Unicode object is shared to improve performance. */ 202static PyObject *unicode_empty; 203 204/* List of static strings. */ 205static _Py_Identifier *static_strings; 206 207/* Single character Unicode strings in the Latin-1 range are being 208 shared as well. */ 209static PyObject *unicode_latin1[256]; 210 211/* Fast detection of the most frequent whitespace characters */ 212const unsigned char _Py_ascii_whitespace[] = { 213 0, 0, 0, 0, 0, 0, 0, 0, 214/* case 0x0009: * CHARACTER TABULATION */ 215/* case 0x000A: * LINE FEED */ 216/* case 0x000B: * LINE TABULATION */ 217/* case 0x000C: * FORM FEED */ 218/* case 0x000D: * CARRIAGE RETURN */ 219 0, 1, 1, 1, 1, 1, 0, 0, 220 0, 0, 0, 0, 0, 0, 0, 0, 221/* case 0x001C: * FILE SEPARATOR */ 222/* case 0x001D: * GROUP SEPARATOR */ 223/* case 0x001E: * RECORD SEPARATOR */ 224/* case 0x001F: * UNIT SEPARATOR */ 225 0, 0, 0, 0, 1, 1, 1, 1, 226/* case 0x0020: * SPACE */ 227 1, 0, 0, 0, 0, 0, 0, 0, 228 0, 0, 0, 0, 0, 0, 0, 0, 229 0, 0, 0, 0, 0, 0, 0, 0, 230 0, 0, 0, 0, 0, 0, 0, 0, 231 232 0, 0, 0, 0, 0, 0, 0, 0, 233 0, 0, 0, 0, 0, 0, 0, 0, 234 0, 0, 0, 0, 0, 0, 0, 0, 235 0, 0, 0, 0, 0, 0, 0, 0, 236 0, 0, 0, 0, 0, 0, 0, 0, 237 0, 0, 0, 0, 0, 0, 0, 0, 238 0, 0, 0, 0, 0, 0, 0, 0, 239 0, 0, 0, 0, 0, 0, 0, 0 240}; 241 242/* forward */ 243static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length); 244static PyObject* get_latin1_char(unsigned char ch); 245static void copy_characters( 246 PyObject *to, Py_ssize_t to_start, 247 PyObject *from, Py_ssize_t from_start, 248 Py_ssize_t how_many); 249#ifdef Py_DEBUG 250static int unicode_is_singleton(PyObject *unicode); 251#endif 252 253static PyObject * 
254unicode_encode_call_errorhandler(const char *errors, 255 PyObject **errorHandler,const char *encoding, const char *reason, 256 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject, 257 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos); 258 259static void 260raise_encode_exception(PyObject **exceptionObject, 261 const char *encoding, 262 const Py_UNICODE *unicode, Py_ssize_t size, 263 Py_ssize_t startpos, Py_ssize_t endpos, 264 const char *reason); 265 266/* Same for linebreaks */ 267static unsigned char ascii_linebreak[] = { 268 0, 0, 0, 0, 0, 0, 0, 0, 269/* 0x000A, * LINE FEED */ 270/* 0x000B, * LINE TABULATION */ 271/* 0x000C, * FORM FEED */ 272/* 0x000D, * CARRIAGE RETURN */ 273 0, 0, 1, 1, 1, 1, 0, 0, 274 0, 0, 0, 0, 0, 0, 0, 0, 275/* 0x001C, * FILE SEPARATOR */ 276/* 0x001D, * GROUP SEPARATOR */ 277/* 0x001E, * RECORD SEPARATOR */ 278 0, 0, 0, 0, 1, 1, 1, 0, 279 0, 0, 0, 0, 0, 0, 0, 0, 280 0, 0, 0, 0, 0, 0, 0, 0, 281 0, 0, 0, 0, 0, 0, 0, 0, 282 0, 0, 0, 0, 0, 0, 0, 0, 283 284 0, 0, 0, 0, 0, 0, 0, 0, 285 0, 0, 0, 0, 0, 0, 0, 0, 286 0, 0, 0, 0, 0, 0, 0, 0, 287 0, 0, 0, 0, 0, 0, 0, 0, 288 0, 0, 0, 0, 0, 0, 0, 0, 289 0, 0, 0, 0, 0, 0, 0, 0, 290 0, 0, 0, 0, 0, 0, 0, 0, 291 0, 0, 0, 0, 0, 0, 0, 0 292}; 293 294/* The max unicode value is always 0x10FFFF while using the PEP-393 API. 295 This function is kept for backward compatibility with the old API. */ 296Py_UNICODE 297PyUnicode_GetMax(void) 298{ 299#ifdef Py_UNICODE_WIDE 300 return 0x10FFFF; 301#else 302 /* This is actually an illegal character, so it should 303 not be passed to unichr. 
*/ 304 return 0xFFFF; 305#endif 306} 307 308#ifdef Py_DEBUG 309int 310/* FIXME: use PyObject* type for op */ 311_PyUnicode_CheckConsistency(void *op, int check_content) 312{ 313 PyASCIIObject *ascii; 314 unsigned int kind; 315 316 assert(PyUnicode_Check(op)); 317 318 ascii = (PyASCIIObject *)op; 319 kind = ascii->state.kind; 320 321 if (ascii->state.ascii == 1 && ascii->state.compact == 1) { 322 assert(kind == PyUnicode_1BYTE_KIND); 323 assert(ascii->state.ready == 1); 324 } 325 else { 326 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op; 327 void *data; 328 329 if (ascii->state.compact == 1) { 330 data = compact + 1; 331 assert(kind == PyUnicode_1BYTE_KIND 332 || kind == PyUnicode_2BYTE_KIND 333 || kind == PyUnicode_4BYTE_KIND); 334 assert(ascii->state.ascii == 0); 335 assert(ascii->state.ready == 1); 336 assert (compact->utf8 != data); 337 } else { 338 PyUnicodeObject *unicode = (PyUnicodeObject *)op; 339 340 data = unicode->data.any; 341 if (kind == PyUnicode_WCHAR_KIND) { 342 assert(ascii->state.compact == 0); 343 assert(ascii->state.ascii == 0); 344 assert(ascii->state.ready == 0); 345 assert(ascii->wstr != NULL); 346 assert(data == NULL); 347 assert(compact->utf8 == NULL); 348 assert(ascii->state.interned == SSTATE_NOT_INTERNED); 349 } 350 else { 351 assert(kind == PyUnicode_1BYTE_KIND 352 || kind == PyUnicode_2BYTE_KIND 353 || kind == PyUnicode_4BYTE_KIND); 354 assert(ascii->state.compact == 0); 355 assert(ascii->state.ready == 1); 356 assert(data != NULL); 357 if (ascii->state.ascii) { 358 assert (compact->utf8 == data); 359 assert (compact->utf8_length == ascii->length); 360 } 361 else 362 assert (compact->utf8 != data); 363 } 364 } 365 if (kind != PyUnicode_WCHAR_KIND) { 366 if ( 367#if SIZEOF_WCHAR_T == 2 368 kind == PyUnicode_2BYTE_KIND 369#else 370 kind == PyUnicode_4BYTE_KIND 371#endif 372 ) 373 { 374 assert(ascii->wstr == data); 375 assert(compact->wstr_length == ascii->length); 376 } else 377 assert(ascii->wstr != data); 378 } 379 380 
if (compact->utf8 == NULL) 381 assert(compact->utf8_length == 0); 382 if (ascii->wstr == NULL) 383 assert(compact->wstr_length == 0); 384 } 385 /* check that the best kind is used */ 386 if (check_content && kind != PyUnicode_WCHAR_KIND) 387 { 388 Py_ssize_t i; 389 Py_UCS4 maxchar = 0; 390 void *data = PyUnicode_DATA(ascii); 391 for (i=0; i < ascii->length; i++) 392 { 393 Py_UCS4 ch = PyUnicode_READ(kind, data, i); 394 if (ch > maxchar) 395 maxchar = ch; 396 } 397 if (kind == PyUnicode_1BYTE_KIND) { 398 if (ascii->state.ascii == 0) 399 assert(maxchar >= 128); 400 else 401 assert(maxchar < 128); 402 } 403 else if (kind == PyUnicode_2BYTE_KIND) 404 assert(maxchar >= 0x100); 405 else 406 assert(maxchar >= 0x10000); 407 } 408 if (check_content && !unicode_is_singleton((PyObject*)ascii)) 409 assert(ascii->hash == -1); 410 return 1; 411} 412#endif 413 414/* --- Bloom Filters ----------------------------------------------------- */ 415 416/* stuff to implement simple "bloom filters" for Unicode characters. 417 to keep things simple, we use a single bitmask, using the least 5 418 bits from each unicode characters as the bit index. */ 419 420/* the linebreak mask is set up by Unicode_Init below */ 421 422#if LONG_BIT >= 128 423#define BLOOM_WIDTH 128 424#elif LONG_BIT >= 64 425#define BLOOM_WIDTH 64 426#elif LONG_BIT >= 32 427#define BLOOM_WIDTH 32 428#else 429#error "LONG_BIT is smaller than 32" 430#endif 431 432#define BLOOM_MASK unsigned long 433 434static BLOOM_MASK bloom_linebreak; 435 436#define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1))))) 437#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1))))) 438 439#define BLOOM_LINEBREAK(ch) \ 440 ((ch) < 128U ? 
ascii_linebreak[(ch)] : \ 441 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch))) 442 443Py_LOCAL_INLINE(BLOOM_MASK) 444make_bloom_mask(int kind, void* ptr, Py_ssize_t len) 445{ 446 /* calculate simple bloom-style bitmask for a given unicode string */ 447 448 BLOOM_MASK mask; 449 Py_ssize_t i; 450 451 mask = 0; 452 for (i = 0; i < len; i++) 453 BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i)); 454 455 return mask; 456} 457 458#define BLOOM_MEMBER(mask, chr, str) \ 459 (BLOOM(mask, chr) \ 460 && (PyUnicode_FindChar(str, chr, 0, PyUnicode_GET_LENGTH(str), 1) >= 0)) 461 462/* --- Unicode Object ----------------------------------------------------- */ 463 464static PyObject * 465fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s)); 466 467Py_LOCAL_INLINE(char *) findchar(void *s, int kind, 468 Py_ssize_t size, Py_UCS4 ch, 469 int direction) 470{ 471 /* like wcschr, but doesn't stop at NULL characters */ 472 Py_ssize_t i; 473 if (direction == 1) { 474 for(i = 0; i < size; i++) 475 if (PyUnicode_READ(kind, s, i) == ch) 476 return (char*)s + kind * i; 477 } 478 else { 479 for(i = size-1; i >= 0; i--) 480 if (PyUnicode_READ(kind, s, i) == ch) 481 return (char*)s + kind * i; 482 } 483 return NULL; 484} 485 486static PyObject* 487resize_compact(PyObject *unicode, Py_ssize_t length) 488{ 489 Py_ssize_t char_size; 490 Py_ssize_t struct_size; 491 Py_ssize_t new_size; 492 int share_wstr; 493 494 assert(PyUnicode_IS_READY(unicode)); 495 char_size = PyUnicode_KIND(unicode); 496 if (PyUnicode_IS_COMPACT_ASCII(unicode)) 497 struct_size = sizeof(PyASCIIObject); 498 else 499 struct_size = sizeof(PyCompactUnicodeObject); 500 share_wstr = _PyUnicode_SHARE_WSTR(unicode); 501 502 _Py_DEC_REFTOTAL; 503 _Py_ForgetReference(unicode); 504 505 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) { 506 PyErr_NoMemory(); 507 return NULL; 508 } 509 new_size = (struct_size + (length + 1) * char_size); 510 511 unicode = (PyObject *)PyObject_REALLOC((char *)unicode, new_size); 512 
if (unicode == NULL) { 513 PyObject_Del(unicode); 514 PyErr_NoMemory(); 515 return NULL; 516 } 517 _Py_NewReference(unicode); 518 _PyUnicode_LENGTH(unicode) = length; 519 if (share_wstr) { 520 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode); 521 if (!PyUnicode_IS_COMPACT_ASCII(unicode)) 522 _PyUnicode_WSTR_LENGTH(unicode) = length; 523 } 524 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode), 525 length, 0); 526 return unicode; 527} 528 529static int 530resize_inplace(PyUnicodeObject *unicode, Py_ssize_t length) 531{ 532 wchar_t *wstr; 533 assert(!PyUnicode_IS_COMPACT(unicode)); 534 assert(Py_REFCNT(unicode) == 1); 535 536 _PyUnicode_DIRTY(unicode); 537 538 if (PyUnicode_IS_READY(unicode)) { 539 Py_ssize_t char_size; 540 Py_ssize_t new_size; 541 int share_wstr, share_utf8; 542 void *data; 543 544 data = _PyUnicode_DATA_ANY(unicode); 545 assert(data != NULL); 546 char_size = PyUnicode_KIND(unicode); 547 share_wstr = _PyUnicode_SHARE_WSTR(unicode); 548 share_utf8 = _PyUnicode_SHARE_UTF8(unicode); 549 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode)) 550 { 551 PyObject_DEL(_PyUnicode_UTF8(unicode)); 552 _PyUnicode_UTF8(unicode) = NULL; 553 _PyUnicode_UTF8_LENGTH(unicode) = 0; 554 } 555 556 if (length > (PY_SSIZE_T_MAX / char_size - 1)) { 557 PyErr_NoMemory(); 558 return -1; 559 } 560 new_size = (length + 1) * char_size; 561 562 data = (PyObject *)PyObject_REALLOC(data, new_size); 563 if (data == NULL) { 564 PyErr_NoMemory(); 565 return -1; 566 } 567 _PyUnicode_DATA_ANY(unicode) = data; 568 if (share_wstr) { 569 _PyUnicode_WSTR(unicode) = data; 570 _PyUnicode_WSTR_LENGTH(unicode) = length; 571 } 572 if (share_utf8) { 573 _PyUnicode_UTF8(unicode) = data; 574 _PyUnicode_UTF8_LENGTH(unicode) = length; 575 } 576 _PyUnicode_LENGTH(unicode) = length; 577 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0); 578 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) { 579 assert(_PyUnicode_CheckConsistency(unicode, 0)); 580 return 0; 581 } 582 } 583 
assert(_PyUnicode_WSTR(unicode) != NULL); 584 585 /* check for integer overflow */ 586 if (length > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) { 587 PyErr_NoMemory(); 588 return -1; 589 } 590 wstr = _PyUnicode_WSTR(unicode); 591 wstr = PyObject_REALLOC(wstr, sizeof(wchar_t) * (length + 1)); 592 if (!wstr) { 593 PyErr_NoMemory(); 594 return -1; 595 } 596 _PyUnicode_WSTR(unicode) = wstr; 597 _PyUnicode_WSTR(unicode)[length] = 0; 598 _PyUnicode_WSTR_LENGTH(unicode) = length; 599 assert(_PyUnicode_CheckConsistency(unicode, 0)); 600 return 0; 601} 602 603static PyObject* 604resize_copy(PyObject *unicode, Py_ssize_t length) 605{ 606 Py_ssize_t copy_length; 607 if (PyUnicode_IS_COMPACT(unicode)) { 608 PyObject *copy; 609 assert(PyUnicode_IS_READY(unicode)); 610 611 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode)); 612 if (copy == NULL) 613 return NULL; 614 615 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode)); 616 copy_characters(copy, 0, unicode, 0, copy_length); 617 return copy; 618 } 619 else { 620 PyUnicodeObject *w; 621 assert(_PyUnicode_WSTR(unicode) != NULL); 622 assert(_PyUnicode_DATA_ANY(unicode) == NULL); 623 w = _PyUnicode_New(length); 624 if (w == NULL) 625 return NULL; 626 copy_length = _PyUnicode_WSTR_LENGTH(unicode); 627 copy_length = Py_MIN(copy_length, length); 628 Py_UNICODE_COPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode), 629 copy_length); 630 return (PyObject*)w; 631 } 632} 633 634/* We allocate one more byte to make sure the string is 635 Ux0000 terminated; some code (e.g. new_identifier) 636 relies on that. 637 638 XXX This allocator could further be enhanced by assuring that the 639 free list never reduces its size below 1. 
640 641*/ 642 643#ifdef Py_DEBUG 644int unicode_old_new_calls = 0; 645#endif 646 647static PyUnicodeObject * 648_PyUnicode_New(Py_ssize_t length) 649{ 650 register PyUnicodeObject *unicode; 651 size_t new_size; 652 653 /* Optimization for empty strings */ 654 if (length == 0 && unicode_empty != NULL) { 655 Py_INCREF(unicode_empty); 656 return (PyUnicodeObject*)unicode_empty; 657 } 658 659 /* Ensure we won't overflow the size. */ 660 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) { 661 return (PyUnicodeObject *)PyErr_NoMemory(); 662 } 663 if (length < 0) { 664 PyErr_SetString(PyExc_SystemError, 665 "Negative size passed to _PyUnicode_New"); 666 return NULL; 667 } 668 669#ifdef Py_DEBUG 670 ++unicode_old_new_calls; 671#endif 672 673 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type); 674 if (unicode == NULL) 675 return NULL; 676 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1); 677 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size); 678 if (!_PyUnicode_WSTR(unicode)) { 679 PyErr_NoMemory(); 680 goto onError; 681 } 682 683 /* Initialize the first element to guard against cases where 684 * the caller fails before initializing str -- unicode_resize() 685 * reads str[0], and the Keep-Alive optimization can keep memory 686 * allocated for str alive across a call to unicode_dealloc(unicode). 687 * We don't want unicode_resize to read uninitialized memory in 688 * that case. 
689 */ 690 _PyUnicode_WSTR(unicode)[0] = 0; 691 _PyUnicode_WSTR(unicode)[length] = 0; 692 _PyUnicode_WSTR_LENGTH(unicode) = length; 693 _PyUnicode_HASH(unicode) = -1; 694 _PyUnicode_STATE(unicode).interned = 0; 695 _PyUnicode_STATE(unicode).kind = 0; 696 _PyUnicode_STATE(unicode).compact = 0; 697 _PyUnicode_STATE(unicode).ready = 0; 698 _PyUnicode_STATE(unicode).ascii = 0; 699 _PyUnicode_DATA_ANY(unicode) = NULL; 700 _PyUnicode_LENGTH(unicode) = 0; 701 _PyUnicode_UTF8(unicode) = NULL; 702 _PyUnicode_UTF8_LENGTH(unicode) = 0; 703 return unicode; 704 705 onError: 706 /* XXX UNREF/NEWREF interface should be more symmetrical */ 707 _Py_DEC_REFTOTAL; 708 _Py_ForgetReference((PyObject *)unicode); 709 PyObject_Del(unicode); 710 return NULL; 711} 712 713static const char* 714unicode_kind_name(PyObject *unicode) 715{ 716 /* don't check consistency: unicode_kind_name() is called from 717 _PyUnicode_Dump() */ 718 if (!PyUnicode_IS_COMPACT(unicode)) 719 { 720 if (!PyUnicode_IS_READY(unicode)) 721 return "wstr"; 722 switch(PyUnicode_KIND(unicode)) 723 { 724 case PyUnicode_1BYTE_KIND: 725 if (PyUnicode_IS_ASCII(unicode)) 726 return "legacy ascii"; 727 else 728 return "legacy latin1"; 729 case PyUnicode_2BYTE_KIND: 730 return "legacy UCS2"; 731 case PyUnicode_4BYTE_KIND: 732 return "legacy UCS4"; 733 default: 734 return "<legacy invalid kind>"; 735 } 736 } 737 assert(PyUnicode_IS_READY(unicode)); 738 switch(PyUnicode_KIND(unicode)) 739 { 740 case PyUnicode_1BYTE_KIND: 741 if (PyUnicode_IS_ASCII(unicode)) 742 return "ascii"; 743 else 744 return "latin1"; 745 case PyUnicode_2BYTE_KIND: 746 return "UCS2"; 747 case PyUnicode_4BYTE_KIND: 748 return "UCS4"; 749 default: 750 return "<invalid compact kind>"; 751 } 752} 753 754#ifdef Py_DEBUG 755int unicode_new_new_calls = 0; 756 757/* Functions wrapping macros for use in debugger */ 758char *_PyUnicode_utf8(void *unicode){ 759 return PyUnicode_UTF8(unicode); 760} 761 762void *_PyUnicode_compact_data(void *unicode) { 763 return 
_PyUnicode_COMPACT_DATA(unicode); 764} 765void *_PyUnicode_data(void *unicode){ 766 printf("obj %p\n", unicode); 767 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode)); 768 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode)); 769 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1))); 770 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1))); 771 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode)); 772 return PyUnicode_DATA(unicode); 773} 774 775void 776_PyUnicode_Dump(PyObject *op) 777{ 778 PyASCIIObject *ascii = (PyASCIIObject *)op; 779 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op; 780 PyUnicodeObject *unicode = (PyUnicodeObject *)op; 781 void *data; 782 printf("%s: len=%zu, ",unicode_kind_name(op), ascii->length); 783 if (ascii->state.compact) 784 data = (compact + 1); 785 else 786 data = unicode->data.any; 787 if (ascii->wstr == data) 788 printf("shared "); 789 printf("wstr=%p", ascii->wstr); 790 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) { 791 printf(" (%zu), ", compact->wstr_length); 792 if (!ascii->state.compact && compact->utf8 == unicode->data.any) 793 printf("shared "); 794 printf("utf8=%p (%zu)", compact->utf8, compact->utf8_length); 795 } 796 printf(", data=%p\n", data); 797} 798#endif 799 800PyObject * 801PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar) 802{ 803 PyObject *obj; 804 PyCompactUnicodeObject *unicode; 805 void *data; 806 int kind_state; 807 int is_sharing, is_ascii; 808 Py_ssize_t char_size; 809 Py_ssize_t struct_size; 810 811 /* Optimization for empty strings */ 812 if (size == 0 && unicode_empty != NULL) { 813 Py_INCREF(unicode_empty); 814 return unicode_empty; 815 } 816 817#ifdef Py_DEBUG 818 ++unicode_new_new_calls; 819#endif 820 821 is_ascii = 0; 822 is_sharing = 0; 823 struct_size = sizeof(PyCompactUnicodeObject); 824 if (maxchar < 128) { 825 kind_state = PyUnicode_1BYTE_KIND; 826 char_size = 1; 827 is_ascii = 1; 828 struct_size = 
sizeof(PyASCIIObject); 829 } 830 else if (maxchar < 256) { 831 kind_state = PyUnicode_1BYTE_KIND; 832 char_size = 1; 833 } 834 else if (maxchar < 65536) { 835 kind_state = PyUnicode_2BYTE_KIND; 836 char_size = 2; 837 if (sizeof(wchar_t) == 2) 838 is_sharing = 1; 839 } 840 else { 841 kind_state = PyUnicode_4BYTE_KIND; 842 char_size = 4; 843 if (sizeof(wchar_t) == 4) 844 is_sharing = 1; 845 } 846 847 /* Ensure we won't overflow the size. */ 848 if (size < 0) { 849 PyErr_SetString(PyExc_SystemError, 850 "Negative size passed to PyUnicode_New"); 851 return NULL; 852 } 853 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) 854 return PyErr_NoMemory(); 855 856 /* Duplicated allocation code from _PyObject_New() instead of a call to 857 * PyObject_New() so we are able to allocate space for the object and 858 * it's data buffer. 859 */ 860 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size); 861 if (obj == NULL) 862 return PyErr_NoMemory(); 863 obj = PyObject_INIT(obj, &PyUnicode_Type); 864 if (obj == NULL) 865 return NULL; 866 867 unicode = (PyCompactUnicodeObject *)obj; 868 if (is_ascii) 869 data = ((PyASCIIObject*)obj) + 1; 870 else 871 data = unicode + 1; 872 _PyUnicode_LENGTH(unicode) = size; 873 _PyUnicode_HASH(unicode) = -1; 874 _PyUnicode_STATE(unicode).interned = 0; 875 _PyUnicode_STATE(unicode).kind = kind_state; 876 _PyUnicode_STATE(unicode).compact = 1; 877 _PyUnicode_STATE(unicode).ready = 1; 878 _PyUnicode_STATE(unicode).ascii = is_ascii; 879 if (is_ascii) { 880 ((char*)data)[size] = 0; 881 _PyUnicode_WSTR(unicode) = NULL; 882 } 883 else if (kind_state == PyUnicode_1BYTE_KIND) { 884 ((char*)data)[size] = 0; 885 _PyUnicode_WSTR(unicode) = NULL; 886 _PyUnicode_WSTR_LENGTH(unicode) = 0; 887 unicode->utf8 = NULL; 888 unicode->utf8_length = 0; 889 } 890 else { 891 unicode->utf8 = NULL; 892 unicode->utf8_length = 0; 893 if (kind_state == PyUnicode_2BYTE_KIND) 894 ((Py_UCS2*)data)[size] = 0; 895 else /* kind_state == 
PyUnicode_4BYTE_KIND */ 896 ((Py_UCS4*)data)[size] = 0; 897 if (is_sharing) { 898 _PyUnicode_WSTR_LENGTH(unicode) = size; 899 _PyUnicode_WSTR(unicode) = (wchar_t *)data; 900 } 901 else { 902 _PyUnicode_WSTR_LENGTH(unicode) = 0; 903 _PyUnicode_WSTR(unicode) = NULL; 904 } 905 } 906 assert(_PyUnicode_CheckConsistency(unicode, 0)); 907 return obj; 908} 909 910#if SIZEOF_WCHAR_T == 2 911/* Helper function to convert a 16-bits wchar_t representation to UCS4, this 912 will decode surrogate pairs, the other conversions are implemented as macros 913 for efficiency. 914 915 This function assumes that unicode can hold one more code point than wstr 916 characters for a terminating null character. */ 917static void 918unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end, 919 PyUnicodeObject *unicode) 920{ 921 const wchar_t *iter; 922 Py_UCS4 *ucs4_out; 923 924 assert(unicode != NULL); 925 assert(_PyUnicode_CHECK(unicode)); 926 assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND); 927 ucs4_out = PyUnicode_4BYTE_DATA(unicode); 928 929 for (iter = begin; iter < end; ) { 930 assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) + 931 _PyUnicode_GET_LENGTH(unicode))); 932 if (*iter >= 0xD800 && *iter <= 0xDBFF 933 && (iter+1) < end && iter[1] >= 0xDC00 && iter[1] <= 0xDFFF) 934 { 935 *ucs4_out++ = (((iter[0] & 0x3FF)<<10) | (iter[1] & 0x3FF)) + 0x10000; 936 iter += 2; 937 } 938 else { 939 *ucs4_out++ = *iter; 940 iter++; 941 } 942 } 943 assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) + 944 _PyUnicode_GET_LENGTH(unicode))); 945 946} 947#endif 948 949static int 950_PyUnicode_Dirty(PyObject *unicode) 951{ 952 assert(_PyUnicode_CHECK(unicode)); 953 if (Py_REFCNT(unicode) != 1) { 954 PyErr_SetString(PyExc_SystemError, 955 "Cannot modify a string having more than 1 reference"); 956 return -1; 957 } 958 _PyUnicode_DIRTY(unicode); 959 return 0; 960} 961 962static int 963_copy_characters(PyObject *to, Py_ssize_t to_start, 964 PyObject *from, Py_ssize_t from_start, 965 
Py_ssize_t how_many, int check_maxchar) 966{ 967 unsigned int from_kind, to_kind; 968 void *from_data, *to_data; 969 int fast; 970 971 assert(PyUnicode_Check(from)); 972 assert(PyUnicode_Check(to)); 973 assert(PyUnicode_IS_READY(from)); 974 assert(PyUnicode_IS_READY(to)); 975 976 assert(PyUnicode_GET_LENGTH(from) >= how_many); 977 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to)); 978 assert(0 <= how_many); 979 980 if (how_many == 0) 981 return 0; 982 983 from_kind = PyUnicode_KIND(from); 984 from_data = PyUnicode_DATA(from); 985 to_kind = PyUnicode_KIND(to); 986 to_data = PyUnicode_DATA(to); 987 988#ifdef Py_DEBUG 989 if (!check_maxchar 990 && (from_kind > to_kind 991 || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))) 992 { 993 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to); 994 Py_UCS4 ch; 995 Py_ssize_t i; 996 for (i=0; i < how_many; i++) { 997 ch = PyUnicode_READ(from_kind, from_data, from_start + i); 998 assert(ch <= to_maxchar); 999 } 1000 } 1001#endif 1002 fast = (from_kind == to_kind); 1003 if (check_maxchar 1004 && (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))) 1005 { 1006 /* deny latin1 => ascii */ 1007 fast = 0; 1008 } 1009 1010 if (fast) { 1011 Py_MEMCPY((char*)to_data + to_kind * to_start, 1012 (char*)from_data + from_kind * from_start, 1013 to_kind * how_many); 1014 } 1015 else if (from_kind == PyUnicode_1BYTE_KIND 1016 && to_kind == PyUnicode_2BYTE_KIND) 1017 { 1018 _PyUnicode_CONVERT_BYTES( 1019 Py_UCS1, Py_UCS2, 1020 PyUnicode_1BYTE_DATA(from) + from_start, 1021 PyUnicode_1BYTE_DATA(from) + from_start + how_many, 1022 PyUnicode_2BYTE_DATA(to) + to_start 1023 ); 1024 } 1025 else if (from_kind == PyUnicode_1BYTE_KIND 1026 && to_kind == PyUnicode_4BYTE_KIND) 1027 { 1028 _PyUnicode_CONVERT_BYTES( 1029 Py_UCS1, Py_UCS4, 1030 PyUnicode_1BYTE_DATA(from) + from_start, 1031 PyUnicode_1BYTE_DATA(from) + from_start + how_many, 1032 PyUnicode_4BYTE_DATA(to) + to_start 1033 ); 1034 } 1035 else if (from_kind == PyUnicode_2BYTE_KIND 
1036 && to_kind == PyUnicode_4BYTE_KIND) 1037 { 1038 _PyUnicode_CONVERT_BYTES( 1039 Py_UCS2, Py_UCS4, 1040 PyUnicode_2BYTE_DATA(from) + from_start, 1041 PyUnicode_2BYTE_DATA(from) + from_start + how_many, 1042 PyUnicode_4BYTE_DATA(to) + to_start 1043 ); 1044 } 1045 else { 1046 /* check if max_char(from substring) <= max_char(to) */ 1047 if (from_kind > to_kind 1048 /* latin1 => ascii */ 1049 || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))) 1050 { 1051 /* slow path to check for character overflow */ 1052 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to); 1053 Py_UCS4 ch; 1054 Py_ssize_t i; 1055 1056#ifdef Py_DEBUG 1057 for (i=0; i < how_many; i++) { 1058 ch = PyUnicode_READ(from_kind, from_data, from_start + i); 1059 assert(ch <= to_maxchar); 1060 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch); 1061 } 1062#else 1063 if (!check_maxchar) { 1064 for (i=0; i < how_many; i++) { 1065 ch = PyUnicode_READ(from_kind, from_data, from_start + i); 1066 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch); 1067 } 1068 } 1069 else { 1070 for (i=0; i < how_many; i++) { 1071 ch = PyUnicode_READ(from_kind, from_data, from_start + i); 1072 if (ch > to_maxchar) 1073 return 1; 1074 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch); 1075 } 1076 } 1077#endif 1078 } 1079 else { 1080 assert(0 && "inconsistent state"); 1081 return 1; 1082 } 1083 } 1084 return 0; 1085} 1086 1087static void 1088copy_characters(PyObject *to, Py_ssize_t to_start, 1089 PyObject *from, Py_ssize_t from_start, 1090 Py_ssize_t how_many) 1091{ 1092 (void)_copy_characters(to, to_start, from, from_start, how_many, 0); 1093} 1094 1095Py_ssize_t 1096PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start, 1097 PyObject *from, Py_ssize_t from_start, 1098 Py_ssize_t how_many) 1099{ 1100 int err; 1101 1102 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) { 1103 PyErr_BadInternalCall(); 1104 return -1; 1105 } 1106 1107 if (PyUnicode_READY(from)) 1108 return -1; 1109 if (PyUnicode_READY(to)) 1110 
return -1; 1111 1112 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many); 1113 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) { 1114 PyErr_Format(PyExc_SystemError, 1115 "Cannot write %zi characters at %zi " 1116 "in a string of %zi characters", 1117 how_many, to_start, PyUnicode_GET_LENGTH(to)); 1118 return -1; 1119 } 1120 1121 if (how_many == 0) 1122 return 0; 1123 1124 if (_PyUnicode_Dirty(to)) 1125 return -1; 1126 1127 err = _copy_characters(to, to_start, from, from_start, how_many, 1); 1128 if (err) { 1129 PyErr_Format(PyExc_SystemError, 1130 "Cannot copy %s characters " 1131 "into a string of %s characters", 1132 unicode_kind_name(from), 1133 unicode_kind_name(to)); 1134 return -1; 1135 } 1136 return how_many; 1137} 1138 1139/* Find the maximum code point and count the number of surrogate pairs so a 1140 correct string length can be computed before converting a string to UCS4. 1141 This function counts single surrogates as a character and not as a pair. 1142 1143 Return 0 on success, or -1 on error. 
*/ 1144static int 1145find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end, 1146 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates) 1147{ 1148 const wchar_t *iter; 1149 1150 assert(num_surrogates != NULL && maxchar != NULL); 1151 *num_surrogates = 0; 1152 *maxchar = 0; 1153 1154 for (iter = begin; iter < end; ) { 1155 if (*iter > *maxchar) { 1156 *maxchar = *iter; 1157#if SIZEOF_WCHAR_T != 2 1158 if (*maxchar >= 0x10000) 1159 return 0; 1160#endif 1161 } 1162#if SIZEOF_WCHAR_T == 2 1163 if (*iter >= 0xD800 && *iter <= 0xDBFF 1164 && (iter+1) < end && iter[1] >= 0xDC00 && iter[1] <= 0xDFFF) 1165 { 1166 Py_UCS4 surrogate_val; 1167 surrogate_val = (((iter[0] & 0x3FF)<<10) 1168 | (iter[1] & 0x3FF)) + 0x10000; 1169 ++(*num_surrogates); 1170 if (surrogate_val > *maxchar) 1171 *maxchar = surrogate_val; 1172 iter += 2; 1173 } 1174 else 1175 iter++; 1176#else 1177 iter++; 1178#endif 1179 } 1180 return 0; 1181} 1182 1183#ifdef Py_DEBUG 1184int unicode_ready_calls = 0; 1185#endif 1186 1187static int 1188unicode_ready(PyObject **p_obj, int replace) 1189{ 1190 PyUnicodeObject *unicode; 1191 wchar_t *end; 1192 Py_UCS4 maxchar = 0; 1193 Py_ssize_t num_surrogates; 1194#if SIZEOF_WCHAR_T == 2 1195 Py_ssize_t length_wo_surrogates; 1196#endif 1197 1198 assert(p_obj != NULL); 1199 unicode = (PyUnicodeObject *)*p_obj; 1200 1201 /* _PyUnicode_Ready() is only intended for old-style API usage where 1202 strings were created using _PyObject_New() and where no canonical 1203 representation (the str field) has been set yet aka strings 1204 which are not yet ready. 
*/ 1205 assert(_PyUnicode_CHECK(unicode)); 1206 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND); 1207 assert(_PyUnicode_WSTR(unicode) != NULL); 1208 assert(_PyUnicode_DATA_ANY(unicode) == NULL); 1209 assert(_PyUnicode_UTF8(unicode) == NULL); 1210 /* Actually, it should neither be interned nor be anything else: */ 1211 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED); 1212 1213#ifdef Py_DEBUG 1214 ++unicode_ready_calls; 1215#endif 1216 1217#ifdef Py_DEBUG 1218 assert(!replace || Py_REFCNT(unicode) == 1); 1219#else 1220 if (replace && Py_REFCNT(unicode) != 1) 1221 replace = 0; 1222#endif 1223 if (replace) { 1224 Py_ssize_t len = _PyUnicode_WSTR_LENGTH(unicode); 1225 wchar_t *wstr = _PyUnicode_WSTR(unicode); 1226 /* Optimization for empty strings */ 1227 if (len == 0) { 1228 Py_INCREF(unicode_empty); 1229 Py_DECREF(*p_obj); 1230 *p_obj = unicode_empty; 1231 return 0; 1232 } 1233 if (len == 1 && wstr[0] < 256) { 1234 PyObject *latin1_char = get_latin1_char((unsigned char)wstr[0]); 1235 if (latin1_char == NULL) 1236 return -1; 1237 Py_DECREF(*p_obj); 1238 *p_obj = latin1_char; 1239 return 0; 1240 } 1241 } 1242 1243 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode); 1244 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end, 1245 &maxchar, &num_surrogates) == -1) 1246 return -1; 1247 1248 if (maxchar < 256) { 1249 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1); 1250 if (!_PyUnicode_DATA_ANY(unicode)) { 1251 PyErr_NoMemory(); 1252 return -1; 1253 } 1254 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char, 1255 _PyUnicode_WSTR(unicode), end, 1256 PyUnicode_1BYTE_DATA(unicode)); 1257 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0'; 1258 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode); 1259 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND; 1260 if (maxchar < 128) { 1261 _PyUnicode_STATE(unicode).ascii = 1; 1262 _PyUnicode_UTF8(unicode) = 
_PyUnicode_DATA_ANY(unicode); 1263 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode); 1264 } 1265 else { 1266 _PyUnicode_STATE(unicode).ascii = 0; 1267 _PyUnicode_UTF8(unicode) = NULL; 1268 _PyUnicode_UTF8_LENGTH(unicode) = 0; 1269 } 1270 PyObject_FREE(_PyUnicode_WSTR(unicode)); 1271 _PyUnicode_WSTR(unicode) = NULL; 1272 _PyUnicode_WSTR_LENGTH(unicode) = 0; 1273 } 1274 /* In this case we might have to convert down from 4-byte native 1275 wchar_t to 2-byte unicode. */ 1276 else if (maxchar < 65536) { 1277 assert(num_surrogates == 0 && 1278 "FindMaxCharAndNumSurrogatePairs() messed up"); 1279 1280#if SIZEOF_WCHAR_T == 2 1281 /* We can share representations and are done. */ 1282 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode); 1283 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0'; 1284 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode); 1285 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND; 1286 _PyUnicode_UTF8(unicode) = NULL; 1287 _PyUnicode_UTF8_LENGTH(unicode) = 0; 1288#else 1289 /* sizeof(wchar_t) == 4 */ 1290 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC( 1291 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1)); 1292 if (!_PyUnicode_DATA_ANY(unicode)) { 1293 PyErr_NoMemory(); 1294 return -1; 1295 } 1296 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2, 1297 _PyUnicode_WSTR(unicode), end, 1298 PyUnicode_2BYTE_DATA(unicode)); 1299 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0'; 1300 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode); 1301 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND; 1302 _PyUnicode_UTF8(unicode) = NULL; 1303 _PyUnicode_UTF8_LENGTH(unicode) = 0; 1304 PyObject_FREE(_PyUnicode_WSTR(unicode)); 1305 _PyUnicode_WSTR(unicode) = NULL; 1306 _PyUnicode_WSTR_LENGTH(unicode) = 0; 1307#endif 1308 } 1309 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */ 1310 else { 1311#if SIZEOF_WCHAR_T == 2 1312 /* in case the native representation is 2-bytes, we need to 
allocate a 1313 new normalized 4-byte version. */ 1314 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates; 1315 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1)); 1316 if (!_PyUnicode_DATA_ANY(unicode)) { 1317 PyErr_NoMemory(); 1318 return -1; 1319 } 1320 _PyUnicode_LENGTH(unicode) = length_wo_surrogates; 1321 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND; 1322 _PyUnicode_UTF8(unicode) = NULL; 1323 _PyUnicode_UTF8_LENGTH(unicode) = 0; 1324 /* unicode_convert_wchar_to_ucs4() requires a ready string */ 1325 _PyUnicode_STATE(unicode).ready = 1; 1326 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode); 1327 PyObject_FREE(_PyUnicode_WSTR(unicode)); 1328 _PyUnicode_WSTR(unicode) = NULL; 1329 _PyUnicode_WSTR_LENGTH(unicode) = 0; 1330#else 1331 assert(num_surrogates == 0); 1332 1333 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode); 1334 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode); 1335 _PyUnicode_UTF8(unicode) = NULL; 1336 _PyUnicode_UTF8_LENGTH(unicode) = 0; 1337 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND; 1338#endif 1339 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0'; 1340 } 1341 _PyUnicode_STATE(unicode).ready = 1; 1342 assert(_PyUnicode_CheckConsistency(unicode, 1)); 1343 return 0; 1344} 1345 1346int 1347_PyUnicode_ReadyReplace(PyObject **op) 1348{ 1349 return unicode_ready(op, 1); 1350} 1351 1352int 1353_PyUnicode_Ready(PyObject *op) 1354{ 1355 return unicode_ready(&op, 0); 1356} 1357 1358static void 1359unicode_dealloc(register PyUnicodeObject *unicode) 1360{ 1361 switch (PyUnicode_CHECK_INTERNED(unicode)) { 1362 case SSTATE_NOT_INTERNED: 1363 break; 1364 1365 case SSTATE_INTERNED_MORTAL: 1366 /* revive dead object temporarily for DelItem */ 1367 Py_REFCNT(unicode) = 3; 1368 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0) 1369 Py_FatalError( 1370 "deletion of interned string failed"); 1371 break; 1372 1373 case 
SSTATE_INTERNED_IMMORTAL: 1374 Py_FatalError("Immortal interned string died."); 1375 1376 default: 1377 Py_FatalError("Inconsistent interned string state."); 1378 } 1379 1380 if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) 1381 PyObject_DEL(_PyUnicode_WSTR(unicode)); 1382 if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) 1383 PyObject_DEL(_PyUnicode_UTF8(unicode)); 1384 1385 if (PyUnicode_IS_COMPACT(unicode)) { 1386 Py_TYPE(unicode)->tp_free((PyObject *)unicode); 1387 } 1388 else { 1389 if (_PyUnicode_DATA_ANY(unicode)) 1390 PyObject_DEL(_PyUnicode_DATA_ANY(unicode)); 1391 Py_TYPE(unicode)->tp_free((PyObject *)unicode); 1392 } 1393} 1394 1395#ifdef Py_DEBUG 1396static int 1397unicode_is_singleton(PyObject *unicode) 1398{ 1399 PyASCIIObject *ascii = (PyASCIIObject *)unicode; 1400 if (unicode == unicode_empty) 1401 return 1; 1402 if (ascii->state.kind != PyUnicode_WCHAR_KIND && ascii->length == 1) 1403 { 1404 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0); 1405 if (ch < 256 && unicode_latin1[ch] == unicode) 1406 return 1; 1407 } 1408 return 0; 1409} 1410#endif 1411 1412static int 1413unicode_resizable(PyObject *unicode) 1414{ 1415 if (Py_REFCNT(unicode) != 1) 1416 return 0; 1417 if (PyUnicode_CHECK_INTERNED(unicode)) 1418 return 0; 1419#ifdef Py_DEBUG 1420 /* singleton refcount is greater than 1 */ 1421 assert(!unicode_is_singleton(unicode)); 1422#endif 1423 return 1; 1424} 1425 1426static int 1427unicode_resize(PyObject **p_unicode, Py_ssize_t length) 1428{ 1429 PyObject *unicode; 1430 Py_ssize_t old_length; 1431 1432 assert(p_unicode != NULL); 1433 unicode = *p_unicode; 1434 1435 assert(unicode != NULL); 1436 assert(PyUnicode_Check(unicode)); 1437 assert(0 <= length); 1438 1439 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND) 1440 old_length = PyUnicode_WSTR_LENGTH(unicode); 1441 else 1442 old_length = PyUnicode_GET_LENGTH(unicode); 1443 if (old_length == length) 1444 return 0; 1445 1446 if (!unicode_resizable(unicode)) { 1447 PyObject *copy = resize_copy(unicode, length); 1448 
if (copy == NULL) 1449 return -1; 1450 Py_DECREF(*p_unicode); 1451 *p_unicode = copy; 1452 return 0; 1453 } 1454 1455 if (PyUnicode_IS_COMPACT(unicode)) { 1456 *p_unicode = resize_compact(unicode, length); 1457 if (*p_unicode == NULL) 1458 return -1; 1459 assert(_PyUnicode_CheckConsistency(*p_unicode, 0)); 1460 return 0; 1461 } 1462 return resize_inplace((PyUnicodeObject*)unicode, length); 1463} 1464 1465int 1466PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length) 1467{ 1468 PyObject *unicode; 1469 if (p_unicode == NULL) { 1470 PyErr_BadInternalCall(); 1471 return -1; 1472 } 1473 unicode = *p_unicode; 1474 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0 1475 || _PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) 1476 { 1477 PyErr_BadInternalCall(); 1478 return -1; 1479 } 1480 return unicode_resize(p_unicode, length); 1481} 1482 1483static PyObject* 1484get_latin1_char(unsigned char ch) 1485{ 1486 PyObject *unicode = unicode_latin1[ch]; 1487 if (!unicode) { 1488 unicode = PyUnicode_New(1, ch); 1489 if (!unicode) 1490 return NULL; 1491 PyUnicode_1BYTE_DATA(unicode)[0] = ch; 1492 assert(_PyUnicode_CheckConsistency(unicode, 1)); 1493 unicode_latin1[ch] = unicode; 1494 } 1495 Py_INCREF(unicode); 1496 return unicode; 1497} 1498 1499PyObject * 1500PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size) 1501{ 1502 PyUnicodeObject *unicode; 1503 Py_UCS4 maxchar = 0; 1504 Py_ssize_t num_surrogates; 1505 1506 if (u == NULL) 1507 return (PyObject*)_PyUnicode_New(size); 1508 1509 /* If the Unicode data is known at construction time, we can apply 1510 some optimizations which share commonly used objects. 
*/ 1511 1512 /* Optimization for empty strings */ 1513 if (size == 0 && unicode_empty != NULL) { 1514 Py_INCREF(unicode_empty); 1515 return unicode_empty; 1516 } 1517 1518 /* Single character Unicode objects in the Latin-1 range are 1519 shared when using this constructor */ 1520 if (size == 1 && *u < 256) 1521 return get_latin1_char((unsigned char)*u); 1522 1523 /* If not empty and not single character, copy the Unicode data 1524 into the new object */ 1525 if (find_maxchar_surrogates(u, u + size, 1526 &maxchar, &num_surrogates) == -1) 1527 return NULL; 1528 1529 unicode = (PyUnicodeObject *) PyUnicode_New(size - num_surrogates, 1530 maxchar); 1531 if (!unicode) 1532 return NULL; 1533 1534 switch (PyUnicode_KIND(unicode)) { 1535 case PyUnicode_1BYTE_KIND: 1536 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char, 1537 u, u + size, PyUnicode_1BYTE_DATA(unicode)); 1538 break; 1539 case PyUnicode_2BYTE_KIND: 1540#if Py_UNICODE_SIZE == 2 1541 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2); 1542#else 1543 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2, 1544 u, u + size, PyUnicode_2BYTE_DATA(unicode)); 1545#endif 1546 break; 1547 case PyUnicode_4BYTE_KIND: 1548#if SIZEOF_WCHAR_T == 2 1549 /* This is the only case which has to process surrogates, thus 1550 a simple copy loop is not enough and we need a function. 
*/ 1551 unicode_convert_wchar_to_ucs4(u, u + size, unicode); 1552#else 1553 assert(num_surrogates == 0); 1554 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4); 1555#endif 1556 break; 1557 default: 1558 assert(0 && "Impossible state"); 1559 } 1560 1561 assert(_PyUnicode_CheckConsistency(unicode, 1)); 1562 return (PyObject *)unicode; 1563} 1564 1565PyObject * 1566PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size) 1567{ 1568 PyUnicodeObject *unicode; 1569 1570 if (size < 0) { 1571 PyErr_SetString(PyExc_SystemError, 1572 "Negative size passed to PyUnicode_FromStringAndSize"); 1573 return NULL; 1574 } 1575 1576 /* If the Unicode data is known at construction time, we can apply 1577 some optimizations which share commonly used objects. 1578 Also, this means the input must be UTF-8, so fall back to the 1579 UTF-8 decoder at the end. */ 1580 if (u != NULL) { 1581 1582 /* Optimization for empty strings */ 1583 if (size == 0 && unicode_empty != NULL) { 1584 Py_INCREF(unicode_empty); 1585 return unicode_empty; 1586 } 1587 1588 /* Single characters are shared when using this constructor. 1589 Restrict to ASCII, since the input must be UTF-8. 
*/ 1590 if (size == 1 && Py_CHARMASK(*u) < 128) 1591 return get_latin1_char(Py_CHARMASK(*u)); 1592 1593 return PyUnicode_DecodeUTF8(u, size, NULL); 1594 } 1595 1596 unicode = _PyUnicode_New(size); 1597 if (!unicode) 1598 return NULL; 1599 1600 return (PyObject *)unicode; 1601} 1602 1603PyObject * 1604PyUnicode_FromString(const char *u) 1605{ 1606 size_t size = strlen(u); 1607 if (size > PY_SSIZE_T_MAX) { 1608 PyErr_SetString(PyExc_OverflowError, "input too long"); 1609 return NULL; 1610 } 1611 1612 return PyUnicode_FromStringAndSize(u, size); 1613} 1614 1615PyObject * 1616_PyUnicode_FromId(_Py_Identifier *id) 1617{ 1618 if (!id->object) { 1619 id->object = PyUnicode_FromString(id->string); 1620 if (!id->object) 1621 return NULL; 1622 PyUnicode_InternInPlace(&id->object); 1623 assert(!id->next); 1624 id->next = static_strings; 1625 static_strings = id; 1626 } 1627 Py_INCREF(id->object); 1628 return id->object; 1629} 1630 1631void 1632_PyUnicode_ClearStaticStrings() 1633{ 1634 _Py_Identifier *i; 1635 for (i = static_strings; i; i = i->next) { 1636 Py_DECREF(i->object); 1637 i->object = NULL; 1638 i->next = NULL; 1639 } 1640} 1641 1642static PyObject* 1643unicode_fromascii(const unsigned char* s, Py_ssize_t size) 1644{ 1645 PyObject *res; 1646#ifdef Py_DEBUG 1647 const unsigned char *p; 1648 const unsigned char *end = s + size; 1649 for (p=s; p < end; p++) { 1650 assert(*p < 128); 1651 } 1652#endif 1653 if (size == 1) 1654 return get_latin1_char(s[0]); 1655 res = PyUnicode_New(size, 127); 1656 if (!res) 1657 return NULL; 1658 memcpy(PyUnicode_1BYTE_DATA(res), s, size); 1659 return res; 1660} 1661 1662static Py_UCS4 1663kind_maxchar_limit(unsigned int kind) 1664{ 1665 switch(kind) { 1666 case PyUnicode_1BYTE_KIND: 1667 return 0x80; 1668 case PyUnicode_2BYTE_KIND: 1669 return 0x100; 1670 case PyUnicode_4BYTE_KIND: 1671 return 0x10000; 1672 default: 1673 assert(0 && "invalid kind"); 1674 return 0x10ffff; 1675 } 1676} 1677 1678static PyObject* 
1679_PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size) 1680{ 1681 PyObject *res; 1682 unsigned char max_char = 127; 1683 Py_ssize_t i; 1684 1685 assert(size >= 0); 1686 if (size == 1) 1687 return get_latin1_char(u[0]); 1688 for (i = 0; i < size; i++) { 1689 if (u[i] & 0x80) { 1690 max_char = 255; 1691 break; 1692 } 1693 } 1694 res = PyUnicode_New(size, max_char); 1695 if (!res) 1696 return NULL; 1697 memcpy(PyUnicode_1BYTE_DATA(res), u, size); 1698 assert(_PyUnicode_CheckConsistency(res, 1)); 1699 return res; 1700} 1701 1702static PyObject* 1703_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size) 1704{ 1705 PyObject *res; 1706 Py_UCS2 max_char = 0; 1707 Py_ssize_t i; 1708 1709 assert(size >= 0); 1710 if (size == 1 && u[0] < 256) 1711 return get_latin1_char(u[0]); 1712 for (i = 0; i < size; i++) { 1713 if (u[i] > max_char) { 1714 max_char = u[i]; 1715 if (max_char >= 256) 1716 break; 1717 } 1718 } 1719 res = PyUnicode_New(size, max_char); 1720 if (!res) 1721 return NULL; 1722 if (max_char >= 256) 1723 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size); 1724 else 1725 for (i = 0; i < size; i++) 1726 PyUnicode_1BYTE_DATA(res)[i] = (Py_UCS1)u[i]; 1727 assert(_PyUnicode_CheckConsistency(res, 1)); 1728 return res; 1729} 1730 1731static PyObject* 1732_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size) 1733{ 1734 PyObject *res; 1735 Py_UCS4 max_char = 0; 1736 Py_ssize_t i; 1737 1738 assert(size >= 0); 1739 if (size == 1 && u[0] < 256) 1740 return get_latin1_char(u[0]); 1741 for (i = 0; i < size; i++) { 1742 if (u[i] > max_char) { 1743 max_char = u[i]; 1744 if (max_char >= 0x10000) 1745 break; 1746 } 1747 } 1748 res = PyUnicode_New(size, max_char); 1749 if (!res) 1750 return NULL; 1751 if (max_char >= 0x10000) 1752 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size); 1753 else { 1754 int kind = PyUnicode_KIND(res); 1755 void *data = PyUnicode_DATA(res); 1756 for (i = 0; i < size; i++) 1757 PyUnicode_WRITE(kind, data, i, u[i]); 1758 } 1759 
assert(_PyUnicode_CheckConsistency(res, 1)); 1760 return res; 1761} 1762 1763PyObject* 1764PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size) 1765{ 1766 switch(kind) { 1767 case PyUnicode_1BYTE_KIND: 1768 return _PyUnicode_FromUCS1(buffer, size); 1769 case PyUnicode_2BYTE_KIND: 1770 return _PyUnicode_FromUCS2(buffer, size); 1771 case PyUnicode_4BYTE_KIND: 1772 return _PyUnicode_FromUCS4(buffer, size); 1773 default: 1774 assert(0 && "invalid kind"); 1775 PyErr_SetString(PyExc_SystemError, "invalid kind"); 1776 return NULL; 1777 } 1778} 1779 1780/* Ensure that a string uses the most efficient storage, if it is not the 1781 case: create a new string with of the right kind. Write NULL into *p_unicode 1782 on error. */ 1783void 1784unicode_adjust_maxchar(PyObject **p_unicode) 1785{ 1786 PyObject *unicode, *copy; 1787 Py_UCS4 max_char; 1788 Py_ssize_t i, len; 1789 unsigned int kind; 1790 1791 assert(p_unicode != NULL); 1792 unicode = *p_unicode; 1793 assert(PyUnicode_IS_READY(unicode)); 1794 if (PyUnicode_IS_ASCII(unicode)) 1795 return; 1796 1797 len = PyUnicode_GET_LENGTH(unicode); 1798 kind = PyUnicode_KIND(unicode); 1799 if (kind == PyUnicode_1BYTE_KIND) { 1800 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode); 1801 for (i = 0; i < len; i++) { 1802 if (u[i] & 0x80) 1803 return; 1804 } 1805 max_char = 127; 1806 } 1807 else if (kind == PyUnicode_2BYTE_KIND) { 1808 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode); 1809 max_char = 0; 1810 for (i = 0; i < len; i++) { 1811 if (u[i] > max_char) { 1812 max_char = u[i]; 1813 if (max_char >= 256) 1814 return; 1815 } 1816 } 1817 } 1818 else { 1819 const Py_UCS4 *u; 1820 assert(kind == PyUnicode_4BYTE_KIND); 1821 u = PyUnicode_4BYTE_DATA(unicode); 1822 max_char = 0; 1823 for (i = 0; i < len; i++) { 1824 if (u[i] > max_char) { 1825 max_char = u[i]; 1826 if (max_char >= 0x10000) 1827 return; 1828 } 1829 } 1830 } 1831 assert(max_char < PyUnicode_MAX_CHAR_VALUE(unicode)); 1832 copy = PyUnicode_New(len, max_char); 
1833 copy_characters(copy, 0, unicode, 0, len); 1834 Py_DECREF(unicode); 1835 *p_unicode = copy; 1836} 1837 1838PyObject* 1839PyUnicode_Copy(PyObject *unicode) 1840{ 1841 Py_ssize_t size; 1842 PyObject *copy; 1843 void *data; 1844 1845 if (!PyUnicode_Check(unicode)) { 1846 PyErr_BadInternalCall(); 1847 return NULL; 1848 } 1849 if (PyUnicode_READY(unicode)) 1850 return NULL; 1851 1852 size = PyUnicode_GET_LENGTH(unicode); 1853 copy = PyUnicode_New(size, PyUnicode_MAX_CHAR_VALUE(unicode)); 1854 if (!copy) 1855 return NULL; 1856 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode)); 1857 1858 data = PyUnicode_DATA(unicode); 1859 switch (PyUnicode_KIND(unicode)) 1860 { 1861 case PyUnicode_1BYTE_KIND: 1862 memcpy(PyUnicode_1BYTE_DATA(copy), data, size); 1863 break; 1864 case PyUnicode_2BYTE_KIND: 1865 memcpy(PyUnicode_2BYTE_DATA(copy), data, sizeof(Py_UCS2) * size); 1866 break; 1867 case PyUnicode_4BYTE_KIND: 1868 memcpy(PyUnicode_4BYTE_DATA(copy), data, sizeof(Py_UCS4) * size); 1869 break; 1870 default: 1871 assert(0); 1872 break; 1873 } 1874 assert(_PyUnicode_CheckConsistency(copy, 1)); 1875 return copy; 1876} 1877 1878 1879/* Widen Unicode objects to larger buffers. Don't write terminating null 1880 character. Return NULL on error. 
*/ 1881 1882void* 1883_PyUnicode_AsKind(PyObject *s, unsigned int kind) 1884{ 1885 Py_ssize_t len; 1886 void *result; 1887 unsigned int skind; 1888 1889 if (PyUnicode_READY(s)) 1890 return NULL; 1891 1892 len = PyUnicode_GET_LENGTH(s); 1893 skind = PyUnicode_KIND(s); 1894 if (skind >= kind) { 1895 PyErr_SetString(PyExc_SystemError, "invalid widening attempt"); 1896 return NULL; 1897 } 1898 switch(kind) { 1899 case PyUnicode_2BYTE_KIND: 1900 result = PyMem_Malloc(len * sizeof(Py_UCS2)); 1901 if (!result) 1902 return PyErr_NoMemory(); 1903 assert(skind == PyUnicode_1BYTE_KIND); 1904 _PyUnicode_CONVERT_BYTES( 1905 Py_UCS1, Py_UCS2, 1906 PyUnicode_1BYTE_DATA(s), 1907 PyUnicode_1BYTE_DATA(s) + len, 1908 result); 1909 return result; 1910 case PyUnicode_4BYTE_KIND: 1911 result = PyMem_Malloc(len * sizeof(Py_UCS4)); 1912 if (!result) 1913 return PyErr_NoMemory(); 1914 if (skind == PyUnicode_2BYTE_KIND) { 1915 _PyUnicode_CONVERT_BYTES( 1916 Py_UCS2, Py_UCS4, 1917 PyUnicode_2BYTE_DATA(s), 1918 PyUnicode_2BYTE_DATA(s) + len, 1919 result); 1920 } 1921 else { 1922 assert(skind == PyUnicode_1BYTE_KIND); 1923 _PyUnicode_CONVERT_BYTES( 1924 Py_UCS1, Py_UCS4, 1925 PyUnicode_1BYTE_DATA(s), 1926 PyUnicode_1BYTE_DATA(s) + len, 1927 result); 1928 } 1929 return result; 1930 default: 1931 break; 1932 } 1933 PyErr_SetString(PyExc_SystemError, "invalid kind"); 1934 return NULL; 1935} 1936 1937static Py_UCS4* 1938as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize, 1939 int copy_null) 1940{ 1941 int kind; 1942 void *data; 1943 Py_ssize_t len, targetlen; 1944 if (PyUnicode_READY(string) == -1) 1945 return NULL; 1946 kind = PyUnicode_KIND(string); 1947 data = PyUnicode_DATA(string); 1948 len = PyUnicode_GET_LENGTH(string); 1949 targetlen = len; 1950 if (copy_null) 1951 targetlen++; 1952 if (!target) { 1953 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) { 1954 PyErr_NoMemory(); 1955 return NULL; 1956 } 1957 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4)); 1958 if (!target) { 
1959 PyErr_NoMemory(); 1960 return NULL; 1961 } 1962 } 1963 else { 1964 if (targetsize < targetlen) { 1965 PyErr_Format(PyExc_SystemError, 1966 "string is longer than the buffer"); 1967 if (copy_null && 0 < targetsize) 1968 target[0] = 0; 1969 return NULL; 1970 } 1971 } 1972 if (kind != PyUnicode_4BYTE_KIND) { 1973 Py_ssize_t i; 1974 for (i = 0; i < len; i++) 1975 target[i] = PyUnicode_READ(kind, data, i); 1976 } 1977 else 1978 Py_MEMCPY(target, data, len * sizeof(Py_UCS4)); 1979 if (copy_null) 1980 target[len] = 0; 1981 return target; 1982} 1983 1984Py_UCS4* 1985PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize, 1986 int copy_null) 1987{ 1988 if (target == NULL || targetsize < 1) { 1989 PyErr_BadInternalCall(); 1990 return NULL; 1991 } 1992 return as_ucs4(string, target, targetsize, copy_null); 1993} 1994 1995Py_UCS4* 1996PyUnicode_AsUCS4Copy(PyObject *string) 1997{ 1998 return as_ucs4(string, NULL, 0, 1); 1999} 2000 2001#ifdef HAVE_WCHAR_H 2002 2003PyObject * 2004PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size) 2005{ 2006 if (w == NULL) { 2007 if (size == 0) 2008 return PyUnicode_New(0, 0); 2009 PyErr_BadInternalCall(); 2010 return NULL; 2011 } 2012 2013 if (size == -1) { 2014 size = wcslen(w); 2015 } 2016 2017 return PyUnicode_FromUnicode(w, size); 2018} 2019 2020#endif /* HAVE_WCHAR_H */ 2021 2022static void 2023makefmt(char *fmt, int longflag, int longlongflag, int size_tflag, 2024 int zeropad, int width, int precision, char c) 2025{ 2026 *fmt++ = '%'; 2027 if (width) { 2028 if (zeropad) 2029 *fmt++ = '0'; 2030 fmt += sprintf(fmt, "%d", width); 2031 } 2032 if (precision) 2033 fmt += sprintf(fmt, ".%d", precision); 2034 if (longflag) 2035 *fmt++ = 'l'; 2036 else if (longlongflag) { 2037 /* longlongflag should only ever be nonzero on machines with 2038 HAVE_LONG_LONG defined */ 2039#ifdef HAVE_LONG_LONG 2040 char *f = PY_FORMAT_LONG_LONG; 2041 while (*f) 2042 *fmt++ = *f++; 2043#else 2044 /* we shouldn't ever get here */ 
2045 assert(0); 2046 *fmt++ = 'l'; 2047#endif 2048 } 2049 else if (size_tflag) { 2050 char *f = PY_FORMAT_SIZE_T; 2051 while (*f) 2052 *fmt++ = *f++; 2053 } 2054 *fmt++ = c; 2055 *fmt = '\0'; 2056} 2057 2058/* helper for PyUnicode_FromFormatV() */ 2059 2060static const char* 2061parse_format_flags(const char *f, 2062 int *p_width, int *p_precision, 2063 int *p_longflag, int *p_longlongflag, int *p_size_tflag) 2064{ 2065 int width, precision, longflag, longlongflag, size_tflag; 2066 2067 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */ 2068 f++; 2069 width = 0; 2070 while (Py_ISDIGIT((unsigned)*f)) 2071 width = (width*10) + *f++ - '0'; 2072 precision = 0; 2073 if (*f == '.') { 2074 f++; 2075 while (Py_ISDIGIT((unsigned)*f)) 2076 precision = (precision*10) + *f++ - '0'; 2077 if (*f == '%') { 2078 /* "%.3%s" => f points to "3" */ 2079 f--; 2080 } 2081 } 2082 if (*f == '\0') { 2083 /* bogus format "%.1" => go backward, f points to "1" */ 2084 f--; 2085 } 2086 if (p_width != NULL) 2087 *p_width = width; 2088 if (p_precision != NULL) 2089 *p_precision = precision; 2090 2091 /* Handle %ld, %lu, %lld and %llu. */ 2092 longflag = 0; 2093 longlongflag = 0; 2094 size_tflag = 0; 2095 2096 if (*f == 'l') { 2097 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') { 2098 longflag = 1; 2099 ++f; 2100 } 2101#ifdef HAVE_LONG_LONG 2102 else if (f[1] == 'l' && 2103 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) { 2104 longlongflag = 1; 2105 f += 2; 2106 } 2107#endif 2108 } 2109 /* handle the size_t flag. */ 2110 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) { 2111 size_tflag = 1; 2112 ++f; 2113 } 2114 if (p_longflag != NULL) 2115 *p_longflag = longflag; 2116 if (p_longlongflag != NULL) 2117 *p_longlongflag = longlongflag; 2118 if (p_size_tflag != NULL) 2119 *p_size_tflag = size_tflag; 2120 return f; 2121} 2122 2123/* maximum number of characters required for output of %ld. 
21 characters 2124 allows for 64-bit integers (in decimal) and an optional sign. */ 2125#define MAX_LONG_CHARS 21 2126/* maximum number of characters required for output of %lld. 2127 We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits, 2128 plus 1 for the sign. 53/22 is an upper bound for log10(256). */ 2129#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22) 2130 2131PyObject * 2132PyUnicode_FromFormatV(const char *format, va_list vargs) 2133{ 2134 va_list count; 2135 Py_ssize_t callcount = 0; 2136 PyObject **callresults = NULL; 2137 PyObject **callresult = NULL; 2138 Py_ssize_t n = 0; 2139 int width = 0; 2140 int precision = 0; 2141 int zeropad; 2142 const char* f; 2143 PyObject *string; 2144 /* used by sprintf */ 2145 char fmt[61]; /* should be enough for %0width.precisionlld */ 2146 Py_UCS4 maxchar = 127; /* result is ASCII by default */ 2147 Py_UCS4 argmaxchar; 2148 Py_ssize_t numbersize = 0; 2149 char *numberresults = NULL; 2150 char *numberresult = NULL; 2151 Py_ssize_t i; 2152 int kind; 2153 void *data; 2154 2155 Py_VA_COPY(count, vargs); 2156 /* step 1: count the number of %S/%R/%A/%s format specifications 2157 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/ 2158 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the 2159 * result in an array) 2160 * also estimate a upper bound for all the number formats in the string, 2161 * numbers will be formatted in step 3 and be kept in a '\0'-separated 2162 * buffer before putting everything together. */ 2163 for (f = format; *f; f++) { 2164 if (*f == '%') { 2165 int longlongflag; 2166 /* skip width or width.precision (eg. 
"1.2" of "%1.2f") */ 2167 f = parse_format_flags(f, &width, NULL, NULL, &longlongflag, NULL); 2168 if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V') 2169 ++callcount; 2170 2171 else if (*f == 'd' || *f=='u' || *f=='i' || *f=='x' || *f=='p') { 2172#ifdef HAVE_LONG_LONG 2173 if (longlongflag) { 2174 if (width < MAX_LONG_LONG_CHARS) 2175 width = MAX_LONG_LONG_CHARS; 2176 } 2177 else 2178#endif 2179 /* MAX_LONG_CHARS is enough to hold a 64-bit integer, 2180 including sign. Decimal takes the most space. This 2181 isn't enough for octal. If a width is specified we 2182 need more (which we allocate later). */ 2183 if (width < MAX_LONG_CHARS) 2184 width = MAX_LONG_CHARS; 2185 2186 /* account for the size + '\0' to separate numbers 2187 inside of the numberresults buffer */ 2188 numbersize += (width + 1); 2189 } 2190 } 2191 else if ((unsigned char)*f > 127) { 2192 PyErr_Format(PyExc_ValueError, 2193 "PyUnicode_FromFormatV() expects an ASCII-encoded format " 2194 "string, got a non-ASCII byte: 0x%02x", 2195 (unsigned char)*f); 2196 return NULL; 2197 } 2198 } 2199 /* step 2: allocate memory for the results of 2200 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */ 2201 if (callcount) { 2202 callresults = PyObject_Malloc(sizeof(PyObject *) * callcount); 2203 if (!callresults) { 2204 PyErr_NoMemory(); 2205 return NULL; 2206 } 2207 callresult = callresults; 2208 } 2209 /* step 2.5: allocate memory for the results of formating numbers */ 2210 if (numbersize) { 2211 numberresults = PyObject_Malloc(numbersize); 2212 if (!numberresults) { 2213 PyErr_NoMemory(); 2214 goto fail; 2215 } 2216 numberresult = numberresults; 2217 } 2218 2219 /* step 3: format numbers and figure out how large a buffer we need */ 2220 for (f = format; *f; f++) { 2221 if (*f == '%') { 2222 const char* p; 2223 int longflag; 2224 int longlongflag; 2225 int size_tflag; 2226 int numprinted; 2227 2228 p = f; 2229 zeropad = (f[1] == '0'); 2230 f = parse_format_flags(f, &width, &precision, 
2231 &longflag, &longlongflag, &size_tflag); 2232 switch (*f) { 2233 case 'c': 2234 { 2235 Py_UCS4 ordinal = va_arg(count, int); 2236 maxchar = Py_MAX(maxchar, ordinal); 2237 n++; 2238 break; 2239 } 2240 case '%': 2241 n++; 2242 break; 2243 case 'i': 2244 case 'd': 2245 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad, 2246 width, precision, *f); 2247 if (longflag) 2248 numprinted = sprintf(numberresult, fmt, 2249 va_arg(count, long)); 2250#ifdef HAVE_LONG_LONG 2251 else if (longlongflag) 2252 numprinted = sprintf(numberresult, fmt, 2253 va_arg(count, PY_LONG_LONG)); 2254#endif 2255 else if (size_tflag) 2256 numprinted = sprintf(numberresult, fmt, 2257 va_arg(count, Py_ssize_t)); 2258 else 2259 numprinted = sprintf(numberresult, fmt, 2260 va_arg(count, int)); 2261 n += numprinted; 2262 /* advance by +1 to skip over the '\0' */ 2263 numberresult += (numprinted + 1); 2264 assert(*(numberresult - 1) == '\0'); 2265 assert(*(numberresult - 2) != '\0'); 2266 assert(numprinted >= 0); 2267 assert(numberresult <= numberresults + numbersize); 2268 break; 2269 case 'u': 2270 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad, 2271 width, precision, 'u'); 2272 if (longflag) 2273 numprinted = sprintf(numberresult, fmt, 2274 va_arg(count, unsigned long)); 2275#ifdef HAVE_LONG_LONG 2276 else if (longlongflag) 2277 numprinted = sprintf(numberresult, fmt, 2278 va_arg(count, unsigned PY_LONG_LONG)); 2279#endif 2280 else if (size_tflag) 2281 numprinted = sprintf(numberresult, fmt, 2282 va_arg(count, size_t)); 2283 else 2284 numprinted = sprintf(numberresult, fmt, 2285 va_arg(count, unsigned int)); 2286 n += numprinted; 2287 numberresult += (numprinted + 1); 2288 assert(*(numberresult - 1) == '\0'); 2289 assert(*(numberresult - 2) != '\0'); 2290 assert(numprinted >= 0); 2291 assert(numberresult <= numberresults + numbersize); 2292 break; 2293 case 'x': 2294 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x'); 2295 numprinted = sprintf(numberresult, fmt, va_arg(count, 
int)); 2296 n += numprinted; 2297 numberresult += (numprinted + 1); 2298 assert(*(numberresult - 1) == '\0'); 2299 assert(*(numberresult - 2) != '\0'); 2300 assert(numprinted >= 0); 2301 assert(numberresult <= numberresults + numbersize); 2302 break; 2303 case 'p': 2304 numprinted = sprintf(numberresult, "%p", va_arg(count, void*)); 2305 /* %p is ill-defined: ensure leading 0x. */ 2306 if (numberresult[1] == 'X') 2307 numberresult[1] = 'x'; 2308 else if (numberresult[1] != 'x') { 2309 memmove(numberresult + 2, numberresult, 2310 strlen(numberresult) + 1); 2311 numberresult[0] = '0'; 2312 numberresult[1] = 'x'; 2313 numprinted += 2; 2314 } 2315 n += numprinted; 2316 numberresult += (numprinted + 1); 2317 assert(*(numberresult - 1) == '\0'); 2318 assert(*(numberresult - 2) != '\0'); 2319 assert(numprinted >= 0); 2320 assert(numberresult <= numberresults + numbersize); 2321 break; 2322 case 's': 2323 { 2324 /* UTF-8 */ 2325 const char *s = va_arg(count, const char*); 2326 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace"); 2327 if (!str) 2328 goto fail; 2329 /* since PyUnicode_DecodeUTF8 returns already flexible 2330 unicode objects, there is no need to call ready on them */ 2331 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str); 2332 maxchar = Py_MAX(maxchar, argmaxchar); 2333 n += PyUnicode_GET_LENGTH(str); 2334 /* Remember the str and switch to the next slot */ 2335 *callresult++ = str; 2336 break; 2337 } 2338 case 'U': 2339 { 2340 PyObject *obj = va_arg(count, PyObject *); 2341 assert(obj && _PyUnicode_CHECK(obj)); 2342 if (PyUnicode_READY(obj) == -1) 2343 goto fail; 2344 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj); 2345 maxchar = Py_MAX(maxchar, argmaxchar); 2346 n += PyUnicode_GET_LENGTH(obj); 2347 break; 2348 } 2349 case 'V': 2350 { 2351 PyObject *obj = va_arg(count, PyObject *); 2352 const char *str = va_arg(count, const char *); 2353 PyObject *str_obj; 2354 assert(obj || str); 2355 assert(!obj || _PyUnicode_CHECK(obj)); 2356 if (obj) { 2357 if 
(PyUnicode_READY(obj) == -1) 2358 goto fail; 2359 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj); 2360 maxchar = Py_MAX(maxchar, argmaxchar); 2361 n += PyUnicode_GET_LENGTH(obj); 2362 *callresult++ = NULL; 2363 } 2364 else { 2365 str_obj = PyUnicode_DecodeUTF8(str, strlen(str), "replace"); 2366 if (!str_obj) 2367 goto fail; 2368 if (PyUnicode_READY(str_obj)) { 2369 Py_DECREF(str_obj); 2370 goto fail; 2371 } 2372 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str_obj); 2373 maxchar = Py_MAX(maxchar, argmaxchar); 2374 n += PyUnicode_GET_LENGTH(str_obj); 2375 *callresult++ = str_obj; 2376 } 2377 break; 2378 } 2379 case 'S': 2380 { 2381 PyObject *obj = va_arg(count, PyObject *); 2382 PyObject *str; 2383 assert(obj); 2384 str = PyObject_Str(obj); 2385 if (!str || PyUnicode_READY(str) == -1) 2386 goto fail; 2387 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str); 2388 maxchar = Py_MAX(maxchar, argmaxchar); 2389 n += PyUnicode_GET_LENGTH(str); 2390 /* Remember the str and switch to the next slot */ 2391 *callresult++ = str; 2392 break; 2393 } 2394 case 'R': 2395 { 2396 PyObject *obj = va_arg(count, PyObject *); 2397 PyObject *repr; 2398 assert(obj); 2399 repr = PyObject_Repr(obj); 2400 if (!repr || PyUnicode_READY(repr) == -1) 2401 goto fail; 2402 argmaxchar = PyUnicode_MAX_CHAR_VALUE(repr); 2403 maxchar = Py_MAX(maxchar, argmaxchar); 2404 n += PyUnicode_GET_LENGTH(repr); 2405 /* Remember the repr and switch to the next slot */ 2406 *callresult++ = repr; 2407 break; 2408 } 2409 case 'A': 2410 { 2411 PyObject *obj = va_arg(count, PyObject *); 2412 PyObject *ascii; 2413 assert(obj); 2414 ascii = PyObject_ASCII(obj); 2415 if (!ascii || PyUnicode_READY(ascii) == -1) 2416 goto fail; 2417 argmaxchar = PyUnicode_MAX_CHAR_VALUE(ascii); 2418 maxchar = Py_MAX(maxchar, argmaxchar); 2419 n += PyUnicode_GET_LENGTH(ascii); 2420 /* Remember the repr and switch to the next slot */ 2421 *callresult++ = ascii; 2422 break; 2423 } 2424 default: 2425 /* if we stumble upon an unknown 2426 formatting code, copy the 
rest of 2427 the format string to the output 2428 string. (we cannot just skip the 2429 code, since there's no way to know 2430 what's in the argument list) */ 2431 n += strlen(p); 2432 goto expand; 2433 } 2434 } else 2435 n++; 2436 } 2437 expand: 2438 /* step 4: fill the buffer */ 2439 /* Since we've analyzed how much space we need, 2440 we don't have to resize the string. 2441 There can be no errors beyond this point. */ 2442 string = PyUnicode_New(n, maxchar); 2443 if (!string) 2444 goto fail; 2445 kind = PyUnicode_KIND(string); 2446 data = PyUnicode_DATA(string); 2447 callresult = callresults; 2448 numberresult = numberresults; 2449 2450 for (i = 0, f = format; *f; f++) { 2451 if (*f == '%') { 2452 const char* p; 2453 2454 p = f; 2455 f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL); 2456 /* checking for == because the last argument could be a empty 2457 string, which causes i to point to end, the assert at the end of 2458 the loop */ 2459 assert(i <= PyUnicode_GET_LENGTH(string)); 2460 2461 switch (*f) { 2462 case 'c': 2463 { 2464 const int ordinal = va_arg(vargs, int); 2465 PyUnicode_WRITE(kind, data, i++, ordinal); 2466 break; 2467 } 2468 case 'i': 2469 case 'd': 2470 case 'u': 2471 case 'x': 2472 case 'p': 2473 /* unused, since we already have the result */ 2474 if (*f == 'p') 2475 (void) va_arg(vargs, void *); 2476 else 2477 (void) va_arg(vargs, int); 2478 /* extract the result from numberresults and append. 
*/ 2479 for (; *numberresult; ++i, ++numberresult) 2480 PyUnicode_WRITE(kind, data, i, *numberresult); 2481 /* skip over the separating '\0' */ 2482 assert(*numberresult == '\0'); 2483 numberresult++; 2484 assert(numberresult <= numberresults + numbersize); 2485 break; 2486 case 's': 2487 { 2488 /* unused, since we already have the result */ 2489 Py_ssize_t size; 2490 (void) va_arg(vargs, char *); 2491 size = PyUnicode_GET_LENGTH(*callresult); 2492 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string)); 2493 copy_characters(string, i, *callresult, 0, size); 2494 i += size; 2495 /* We're done with the unicode()/repr() => forget it */ 2496 Py_DECREF(*callresult); 2497 /* switch to next unicode()/repr() result */ 2498 ++callresult; 2499 break; 2500 } 2501 case 'U': 2502 { 2503 PyObject *obj = va_arg(vargs, PyObject *); 2504 Py_ssize_t size; 2505 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string)); 2506 size = PyUnicode_GET_LENGTH(obj); 2507 copy_characters(string, i, obj, 0, size); 2508 i += size; 2509 break; 2510 } 2511 case 'V': 2512 { 2513 Py_ssize_t size; 2514 PyObject *obj = va_arg(vargs, PyObject *); 2515 va_arg(vargs, const char *); 2516 if (obj) { 2517 size = PyUnicode_GET_LENGTH(obj); 2518 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string)); 2519 copy_characters(string, i, obj, 0, size); 2520 i += size; 2521 } else { 2522 size = PyUnicode_GET_LENGTH(*callresult); 2523 assert(PyUnicode_KIND(*callresult) <= 2524 PyUnicode_KIND(string)); 2525 copy_characters(string, i, *callresult, 0, size); 2526 i += size; 2527 Py_DECREF(*callresult); 2528 } 2529 ++callresult; 2530 break; 2531 } 2532 case 'S': 2533 case 'R': 2534 case 'A': 2535 { 2536 Py_ssize_t size = PyUnicode_GET_LENGTH(*callresult); 2537 /* unused, since we already have the result */ 2538 (void) va_arg(vargs, PyObject *); 2539 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string)); 2540 copy_characters(string, i, *callresult, 0, size); 2541 i += size; 2542 /* We're done with the 
unicode()/repr() => forget it */ 2543 Py_DECREF(*callresult); 2544 /* switch to next unicode()/repr() result */ 2545 ++callresult; 2546 break; 2547 } 2548 case '%': 2549 PyUnicode_WRITE(kind, data, i++, '%'); 2550 break; 2551 default: 2552 for (; *p; ++p, ++i) 2553 PyUnicode_WRITE(kind, data, i, *p); 2554 assert(i == PyUnicode_GET_LENGTH(string)); 2555 goto end; 2556 } 2557 } 2558 else { 2559 assert(i < PyUnicode_GET_LENGTH(string)); 2560 PyUnicode_WRITE(kind, data, i++, *f); 2561 } 2562 } 2563 assert(i == PyUnicode_GET_LENGTH(string)); 2564 2565 end: 2566 if (callresults) 2567 PyObject_Free(callresults); 2568 if (numberresults) 2569 PyObject_Free(numberresults); 2570 assert(_PyUnicode_CheckConsistency(string, 1)); 2571 return (PyObject *)string; 2572 fail: 2573 if (callresults) { 2574 PyObject **callresult2 = callresults; 2575 while (callresult2 < callresult) { 2576 Py_XDECREF(*callresult2); 2577 ++callresult2; 2578 } 2579 PyObject_Free(callresults); 2580 } 2581 if (numberresults) 2582 PyObject_Free(numberresults); 2583 return NULL; 2584} 2585 2586PyObject * 2587PyUnicode_FromFormat(const char *format, ...) 2588{ 2589 PyObject* ret; 2590 va_list vargs; 2591 2592#ifdef HAVE_STDARG_PROTOTYPES 2593 va_start(vargs, format); 2594#else 2595 va_start(vargs); 2596#endif 2597 ret = PyUnicode_FromFormatV(format, vargs); 2598 va_end(vargs); 2599 return ret; 2600} 2601 2602#ifdef HAVE_WCHAR_H 2603 2604/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString(): 2605 convert a Unicode object to a wide character string. 2606 2607 - If w is NULL: return the number of wide characters (including the null 2608 character) required to convert the unicode object. Ignore size argument. 2609 2610 - Otherwise: return the number of wide characters (excluding the null 2611 character) written into w. Write at most size wide characters (including 2612 the null character). 
*/ 2613static Py_ssize_t 2614unicode_aswidechar(PyUnicodeObject *unicode, 2615 wchar_t *w, 2616 Py_ssize_t size) 2617{ 2618 Py_ssize_t res; 2619 const wchar_t *wstr; 2620 2621 wstr = PyUnicode_AsUnicodeAndSize((PyObject *)unicode, &res); 2622 if (wstr == NULL) 2623 return -1; 2624 2625 if (w != NULL) { 2626 if (size > res) 2627 size = res + 1; 2628 else 2629 res = size; 2630 Py_MEMCPY(w, wstr, size * sizeof(wchar_t)); 2631 return res; 2632 } 2633 else 2634 return res + 1; 2635} 2636 2637Py_ssize_t 2638PyUnicode_AsWideChar(PyObject *unicode, 2639 wchar_t *w, 2640 Py_ssize_t size) 2641{ 2642 if (unicode == NULL) { 2643 PyErr_BadInternalCall(); 2644 return -1; 2645 } 2646 return unicode_aswidechar((PyUnicodeObject*)unicode, w, size); 2647} 2648 2649wchar_t* 2650PyUnicode_AsWideCharString(PyObject *unicode, 2651 Py_ssize_t *size) 2652{ 2653 wchar_t* buffer; 2654 Py_ssize_t buflen; 2655 2656 if (unicode == NULL) { 2657 PyErr_BadInternalCall(); 2658 return NULL; 2659 } 2660 2661 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, NULL, 0); 2662 if (buflen == -1) 2663 return NULL; 2664 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) { 2665 PyErr_NoMemory(); 2666 return NULL; 2667 } 2668 2669 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t)); 2670 if (buffer == NULL) { 2671 PyErr_NoMemory(); 2672 return NULL; 2673 } 2674 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, buffer, buflen); 2675 if (buflen == -1) 2676 return NULL; 2677 if (size != NULL) 2678 *size = buflen; 2679 return buffer; 2680} 2681 2682#endif /* HAVE_WCHAR_H */ 2683 2684PyObject * 2685PyUnicode_FromOrdinal(int ordinal) 2686{ 2687 PyObject *v; 2688 if (ordinal < 0 || ordinal > 0x10ffff) { 2689 PyErr_SetString(PyExc_ValueError, 2690 "chr() arg not in range(0x110000)"); 2691 return NULL; 2692 } 2693 2694 if (ordinal < 256) 2695 return get_latin1_char(ordinal); 2696 2697 v = PyUnicode_New(1, ordinal); 2698 if (v == NULL) 2699 return NULL; 2700 PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, 
ordinal); 2701 assert(_PyUnicode_CheckConsistency(v, 1)); 2702 return v; 2703} 2704 2705PyObject * 2706PyUnicode_FromObject(register PyObject *obj) 2707{ 2708 /* XXX Perhaps we should make this API an alias of 2709 PyObject_Str() instead ?! */ 2710 if (PyUnicode_CheckExact(obj)) { 2711 if (PyUnicode_READY(obj)) 2712 return NULL; 2713 Py_INCREF(obj); 2714 return obj; 2715 } 2716 if (PyUnicode_Check(obj)) { 2717 /* For a Unicode subtype that's not a Unicode object, 2718 return a true Unicode object with the same data. */ 2719 return PyUnicode_Copy(obj); 2720 } 2721 PyErr_Format(PyExc_TypeError, 2722 "Can't convert '%.100s' object to str implicitly", 2723 Py_TYPE(obj)->tp_name); 2724 return NULL; 2725} 2726 2727PyObject * 2728PyUnicode_FromEncodedObject(register PyObject *obj, 2729 const char *encoding, 2730 const char *errors) 2731{ 2732 Py_buffer buffer; 2733 PyObject *v; 2734 2735 if (obj == NULL) { 2736 PyErr_BadInternalCall(); 2737 return NULL; 2738 } 2739 2740 /* Decoding bytes objects is the most common case and should be fast */ 2741 if (PyBytes_Check(obj)) { 2742 if (PyBytes_GET_SIZE(obj) == 0) { 2743 Py_INCREF(unicode_empty); 2744 v = unicode_empty; 2745 } 2746 else { 2747 v = PyUnicode_Decode( 2748 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj), 2749 encoding, errors); 2750 } 2751 return v; 2752 } 2753 2754 if (PyUnicode_Check(obj)) { 2755 PyErr_SetString(PyExc_TypeError, 2756 "decoding str is not supported"); 2757 return NULL; 2758 } 2759 2760 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */ 2761 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) { 2762 PyErr_Format(PyExc_TypeError, 2763 "coercing to str: need bytes, bytearray " 2764 "or buffer-like object, %.80s found", 2765 Py_TYPE(obj)->tp_name); 2766 return NULL; 2767 } 2768 2769 if (buffer.len == 0) { 2770 Py_INCREF(unicode_empty); 2771 v = unicode_empty; 2772 } 2773 else 2774 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors); 2775 2776 
PyBuffer_Release(&buffer); 2777 return v; 2778} 2779 2780/* Convert encoding to lower case and replace '_' with '-' in order to 2781 catch e.g. UTF_8. Return 0 on error (encoding is longer than lower_len-1), 2782 1 on success. */ 2783static int 2784normalize_encoding(const char *encoding, 2785 char *lower, 2786 size_t lower_len) 2787{ 2788 const char *e; 2789 char *l; 2790 char *l_end; 2791 2792 e = encoding; 2793 l = lower; 2794 l_end = &lower[lower_len - 1]; 2795 while (*e) { 2796 if (l == l_end) 2797 return 0; 2798 if (Py_ISUPPER(*e)) { 2799 *l++ = Py_TOLOWER(*e++); 2800 } 2801 else if (*e == '_') { 2802 *l++ = '-'; 2803 e++; 2804 } 2805 else { 2806 *l++ = *e++; 2807 } 2808 } 2809 *l = '\0'; 2810 return 1; 2811} 2812 2813PyObject * 2814PyUnicode_Decode(const char *s, 2815 Py_ssize_t size, 2816 const char *encoding, 2817 const char *errors) 2818{ 2819 PyObject *buffer = NULL, *unicode; 2820 Py_buffer info; 2821 char lower[11]; /* Enough for any encoding shortcut */ 2822 2823 if (encoding == NULL) 2824 return PyUnicode_DecodeUTF8(s, size, errors); 2825 2826 /* Shortcuts for common default encodings */ 2827 if (normalize_encoding(encoding, lower, sizeof(lower))) { 2828 if ((strcmp(lower, "utf-8") == 0) || 2829 (strcmp(lower, "utf8") == 0)) 2830 return PyUnicode_DecodeUTF8(s, size, errors); 2831 else if ((strcmp(lower, "latin-1") == 0) || 2832 (strcmp(lower, "latin1") == 0) || 2833 (strcmp(lower, "iso-8859-1") == 0)) 2834 return PyUnicode_DecodeLatin1(s, size, errors); 2835#ifdef HAVE_MBCS 2836 else if (strcmp(lower, "mbcs") == 0) 2837 return PyUnicode_DecodeMBCS(s, size, errors); 2838#endif 2839 else if (strcmp(lower, "ascii") == 0) 2840 return PyUnicode_DecodeASCII(s, size, errors); 2841 else if (strcmp(lower, "utf-16") == 0) 2842 return PyUnicode_DecodeUTF16(s, size, errors, 0); 2843 else if (strcmp(lower, "utf-32") == 0) 2844 return PyUnicode_DecodeUTF32(s, size, errors, 0); 2845 } 2846 2847 /* Decode via the codec registry */ 2848 buffer = NULL; 2849 if 
(PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0) 2850 goto onError; 2851 buffer = PyMemoryView_FromBuffer(&info); 2852 if (buffer == NULL) 2853 goto onError; 2854 unicode = PyCodec_Decode(buffer, encoding, errors); 2855 if (unicode == NULL) 2856 goto onError; 2857 if (!PyUnicode_Check(unicode)) { 2858 PyErr_Format(PyExc_TypeError, 2859 "decoder did not return a str object (type=%.400s)", 2860 Py_TYPE(unicode)->tp_name); 2861 Py_DECREF(unicode); 2862 goto onError; 2863 } 2864 Py_DECREF(buffer); 2865#ifndef DONT_MAKE_RESULT_READY 2866 if (_PyUnicode_READY_REPLACE(&unicode)) { 2867 Py_DECREF(unicode); 2868 return NULL; 2869 } 2870#endif 2871 assert(_PyUnicode_CheckConsistency(unicode, 1)); 2872 return unicode; 2873 2874 onError: 2875 Py_XDECREF(buffer); 2876 return NULL; 2877} 2878 2879PyObject * 2880PyUnicode_AsDecodedObject(PyObject *unicode, 2881 const char *encoding, 2882 const char *errors) 2883{ 2884 PyObject *v; 2885 2886 if (!PyUnicode_Check(unicode)) { 2887 PyErr_BadArgument(); 2888 goto onError; 2889 } 2890 2891 if (encoding == NULL) 2892 encoding = PyUnicode_GetDefaultEncoding(); 2893 2894 /* Decode via the codec registry */ 2895 v = PyCodec_Decode(unicode, encoding, errors); 2896 if (v == NULL) 2897 goto onError; 2898 assert(_PyUnicode_CheckConsistency(v, 1)); 2899 return v; 2900 2901 onError: 2902 return NULL; 2903} 2904 2905PyObject * 2906PyUnicode_AsDecodedUnicode(PyObject *unicode, 2907 const char *encoding, 2908 const char *errors) 2909{ 2910 PyObject *v; 2911 2912 if (!PyUnicode_Check(unicode)) { 2913 PyErr_BadArgument(); 2914 goto onError; 2915 } 2916 2917 if (encoding == NULL) 2918 encoding = PyUnicode_GetDefaultEncoding(); 2919 2920 /* Decode via the codec registry */ 2921 v = PyCodec_Decode(unicode, encoding, errors); 2922 if (v == NULL) 2923 goto onError; 2924 if (!PyUnicode_Check(v)) { 2925 PyErr_Format(PyExc_TypeError, 2926 "decoder did not return a str object (type=%.400s)", 2927 Py_TYPE(v)->tp_name); 2928 Py_DECREF(v); 
2929 goto onError; 2930 } 2931 assert(_PyUnicode_CheckConsistency(v, 1)); 2932 return v; 2933 2934 onError: 2935 return NULL; 2936} 2937 2938PyObject * 2939PyUnicode_Encode(const Py_UNICODE *s, 2940 Py_ssize_t size, 2941 const char *encoding, 2942 const char *errors) 2943{ 2944 PyObject *v, *unicode; 2945 2946 unicode = PyUnicode_FromUnicode(s, size); 2947 if (unicode == NULL) 2948 return NULL; 2949 v = PyUnicode_AsEncodedString(unicode, encoding, errors); 2950 Py_DECREF(unicode); 2951 return v; 2952} 2953 2954PyObject * 2955PyUnicode_AsEncodedObject(PyObject *unicode, 2956 const char *encoding, 2957 const char *errors) 2958{ 2959 PyObject *v; 2960 2961 if (!PyUnicode_Check(unicode)) { 2962 PyErr_BadArgument(); 2963 goto onError; 2964 } 2965 2966 if (encoding == NULL) 2967 encoding = PyUnicode_GetDefaultEncoding(); 2968 2969 /* Encode via the codec registry */ 2970 v = PyCodec_Encode(unicode, encoding, errors); 2971 if (v == NULL) 2972 goto onError; 2973 return v; 2974 2975 onError: 2976 return NULL; 2977} 2978 2979PyObject * 2980PyUnicode_EncodeFSDefault(PyObject *unicode) 2981{ 2982#ifdef HAVE_MBCS 2983 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode), 2984 PyUnicode_GET_SIZE(unicode), 2985 NULL); 2986#elif defined(__APPLE__) 2987 return _PyUnicode_AsUTF8String(unicode, "surrogateescape"); 2988#else 2989 PyInterpreterState *interp = PyThreadState_GET()->interp; 2990 /* Bootstrap check: if the filesystem codec is implemented in Python, we 2991 cannot use it to encode and decode filenames before it is loaded. Load 2992 the Python codec requires to encode at least its own filename. Use the C 2993 version of the locale codec until the codec registry is initialized and 2994 the Python codec is loaded. 2995 2996 Py_FileSystemDefaultEncoding is shared between all interpreters, we 2997 cannot only rely on it: check also interp->fscodec_initialized for 2998 subinterpreters. 
*/ 2999 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) { 3000 return PyUnicode_AsEncodedString(unicode, 3001 Py_FileSystemDefaultEncoding, 3002 "surrogateescape"); 3003 } 3004 else { 3005 /* locale encoding with surrogateescape */ 3006 wchar_t *wchar; 3007 char *bytes; 3008 PyObject *bytes_obj; 3009 size_t error_pos; 3010 3011 wchar = PyUnicode_AsWideCharString(unicode, NULL); 3012 if (wchar == NULL) 3013 return NULL; 3014 bytes = _Py_wchar2char(wchar, &error_pos); 3015 if (bytes == NULL) { 3016 if (error_pos != (size_t)-1) { 3017 char *errmsg = strerror(errno); 3018 PyObject *exc = NULL; 3019 if (errmsg == NULL) 3020 errmsg = "Py_wchar2char() failed"; 3021 raise_encode_exception(&exc, 3022 "filesystemencoding", 3023 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode), 3024 error_pos, error_pos+1, 3025 errmsg); 3026 Py_XDECREF(exc); 3027 } 3028 else 3029 PyErr_NoMemory(); 3030 PyMem_Free(wchar); 3031 return NULL; 3032 } 3033 PyMem_Free(wchar); 3034 3035 bytes_obj = PyBytes_FromString(bytes); 3036 PyMem_Free(bytes); 3037 return bytes_obj; 3038 } 3039#endif 3040} 3041 3042PyObject * 3043PyUnicode_AsEncodedString(PyObject *unicode, 3044 const char *encoding, 3045 const char *errors) 3046{ 3047 PyObject *v; 3048 char lower[11]; /* Enough for any encoding shortcut */ 3049 3050 if (!PyUnicode_Check(unicode)) { 3051 PyErr_BadArgument(); 3052 return NULL; 3053 } 3054 3055 if (encoding == NULL) { 3056 if (errors == NULL || strcmp(errors, "strict") == 0) 3057 return _PyUnicode_AsUTF8String(unicode, NULL); 3058 else 3059 return _PyUnicode_AsUTF8String(unicode, errors); 3060 } 3061 3062 /* Shortcuts for common default encodings */ 3063 if (normalize_encoding(encoding, lower, sizeof(lower))) { 3064 if ((strcmp(lower, "utf-8") == 0) || 3065 (strcmp(lower, "utf8") == 0)) 3066 { 3067 if (errors == NULL || strcmp(errors, "strict") == 0) 3068 return _PyUnicode_AsUTF8String(unicode, NULL); 3069 else 3070 return _PyUnicode_AsUTF8String(unicode, errors); 3071 } 
3072 else if ((strcmp(lower, "latin-1") == 0) || 3073 (strcmp(lower, "latin1") == 0) || 3074 (strcmp(lower, "iso-8859-1") == 0)) 3075 return _PyUnicode_AsLatin1String(unicode, errors); 3076#ifdef HAVE_MBCS 3077 else if (strcmp(lower, "mbcs") == 0) 3078 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode), 3079 PyUnicode_GET_SIZE(unicode), 3080 errors); 3081#endif 3082 else if (strcmp(lower, "ascii") == 0) 3083 return _PyUnicode_AsASCIIString(unicode, errors); 3084 } 3085 3086 /* Encode via the codec registry */ 3087 v = PyCodec_Encode(unicode, encoding, errors); 3088 if (v == NULL) 3089 return NULL; 3090 3091 /* The normal path */ 3092 if (PyBytes_Check(v)) 3093 return v; 3094 3095 /* If the codec returns a buffer, raise a warning and convert to bytes */ 3096 if (PyByteArray_Check(v)) { 3097 int error; 3098 PyObject *b; 3099 3100 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1, 3101 "encoder %s returned bytearray instead of bytes", 3102 encoding); 3103 if (error) { 3104 Py_DECREF(v); 3105 return NULL; 3106 } 3107 3108 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v)); 3109 Py_DECREF(v); 3110 return b; 3111 } 3112 3113 PyErr_Format(PyExc_TypeError, 3114 "encoder did not return a bytes object (type=%.400s)", 3115 Py_TYPE(v)->tp_name); 3116 Py_DECREF(v); 3117 return NULL; 3118} 3119 3120PyObject * 3121PyUnicode_AsEncodedUnicode(PyObject *unicode, 3122 const char *encoding, 3123 const char *errors) 3124{ 3125 PyObject *v; 3126 3127 if (!PyUnicode_Check(unicode)) { 3128 PyErr_BadArgument(); 3129 goto onError; 3130 } 3131 3132 if (encoding == NULL) 3133 encoding = PyUnicode_GetDefaultEncoding(); 3134 3135 /* Encode via the codec registry */ 3136 v = PyCodec_Encode(unicode, encoding, errors); 3137 if (v == NULL) 3138 goto onError; 3139 if (!PyUnicode_Check(v)) { 3140 PyErr_Format(PyExc_TypeError, 3141 "encoder did not return an str object (type=%.400s)", 3142 Py_TYPE(v)->tp_name); 3143 Py_DECREF(v); 3144 goto onError; 3145 } 3146 return v; 3147 
3148 onError: 3149 return NULL; 3150} 3151 3152PyObject* 3153PyUnicode_DecodeFSDefault(const char *s) { 3154 Py_ssize_t size = (Py_ssize_t)strlen(s); 3155 return PyUnicode_DecodeFSDefaultAndSize(s, size); 3156} 3157 3158PyObject* 3159PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size) 3160{ 3161#ifdef HAVE_MBCS 3162 return PyUnicode_DecodeMBCS(s, size, NULL); 3163#elif defined(__APPLE__) 3164 return PyUnicode_DecodeUTF8(s, size, "surrogateescape"); 3165#else 3166 PyInterpreterState *interp = PyThreadState_GET()->interp; 3167 /* Bootstrap check: if the filesystem codec is implemented in Python, we 3168 cannot use it to encode and decode filenames before it is loaded. Load 3169 the Python codec requires to encode at least its own filename. Use the C 3170 version of the locale codec until the codec registry is initialized and 3171 the Python codec is loaded. 3172 3173 Py_FileSystemDefaultEncoding is shared between all interpreters, we 3174 cannot only rely on it: check also interp->fscodec_initialized for 3175 subinterpreters. 
*/ 3176 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) { 3177 return PyUnicode_Decode(s, size, 3178 Py_FileSystemDefaultEncoding, 3179 "surrogateescape"); 3180 } 3181 else { 3182 /* locale encoding with surrogateescape */ 3183 wchar_t *wchar; 3184 PyObject *unicode; 3185 size_t len; 3186 3187 if (s[size] != '\0' || size != strlen(s)) { 3188 PyErr_SetString(PyExc_TypeError, "embedded NUL character"); 3189 return NULL; 3190 } 3191 3192 wchar = _Py_char2wchar(s, &len); 3193 if (wchar == NULL) 3194 return PyErr_NoMemory(); 3195 3196 unicode = PyUnicode_FromWideChar(wchar, len); 3197 PyMem_Free(wchar); 3198 return unicode; 3199 } 3200#endif 3201} 3202 3203 3204int 3205PyUnicode_FSConverter(PyObject* arg, void* addr) 3206{ 3207 PyObject *output = NULL; 3208 Py_ssize_t size; 3209 void *data; 3210 if (arg == NULL) { 3211 Py_DECREF(*(PyObject**)addr); 3212 return 1; 3213 } 3214 if (PyBytes_Check(arg)) { 3215 output = arg; 3216 Py_INCREF(output); 3217 } 3218 else { 3219 arg = PyUnicode_FromObject(arg); 3220 if (!arg) 3221 return 0; 3222 output = PyUnicode_EncodeFSDefault(arg); 3223 Py_DECREF(arg); 3224 if (!output) 3225 return 0; 3226 if (!PyBytes_Check(output)) { 3227 Py_DECREF(output); 3228 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes"); 3229 return 0; 3230 } 3231 } 3232 size = PyBytes_GET_SIZE(output); 3233 data = PyBytes_AS_STRING(output); 3234 if (size != strlen(data)) { 3235 PyErr_SetString(PyExc_TypeError, "embedded NUL character"); 3236 Py_DECREF(output); 3237 return 0; 3238 } 3239 *(PyObject**)addr = output; 3240 return Py_CLEANUP_SUPPORTED; 3241} 3242 3243 3244int 3245PyUnicode_FSDecoder(PyObject* arg, void* addr) 3246{ 3247 PyObject *output = NULL; 3248 if (arg == NULL) { 3249 Py_DECREF(*(PyObject**)addr); 3250 return 1; 3251 } 3252 if (PyUnicode_Check(arg)) { 3253 if (PyUnicode_READY(arg)) 3254 return 0; 3255 output = arg; 3256 Py_INCREF(output); 3257 } 3258 else { 3259 arg = PyBytes_FromObject(arg); 3260 if (!arg) 3261 
return 0; 3262 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg), 3263 PyBytes_GET_SIZE(arg)); 3264 Py_DECREF(arg); 3265 if (!output) 3266 return 0; 3267 if (!PyUnicode_Check(output)) { 3268 Py_DECREF(output); 3269 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode"); 3270 return 0; 3271 } 3272 } 3273 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output), 3274 PyUnicode_GET_LENGTH(output), 0, 1)) { 3275 PyErr_SetString(PyExc_TypeError, "embedded NUL character"); 3276 Py_DECREF(output); 3277 return 0; 3278 } 3279 *(PyObject**)addr = output; 3280 return Py_CLEANUP_SUPPORTED; 3281} 3282 3283 3284char* 3285PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize) 3286{ 3287 PyObject *bytes; 3288 PyUnicodeObject *u = (PyUnicodeObject *)unicode; 3289 3290 if (!PyUnicode_Check(unicode)) { 3291 PyErr_BadArgument(); 3292 return NULL; 3293 } 3294 if (PyUnicode_READY(u) == -1) 3295 return NULL; 3296 3297 if (PyUnicode_UTF8(unicode) == NULL) { 3298 assert(!PyUnicode_IS_COMPACT_ASCII(unicode)); 3299 bytes = _PyUnicode_AsUTF8String(unicode, "strict"); 3300 if (bytes == NULL) 3301 return NULL; 3302 _PyUnicode_UTF8(u) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1); 3303 if (_PyUnicode_UTF8(u) == NULL) { 3304 Py_DECREF(bytes); 3305 return NULL; 3306 } 3307 _PyUnicode_UTF8_LENGTH(u) = PyBytes_GET_SIZE(bytes); 3308 Py_MEMCPY(_PyUnicode_UTF8(u), PyBytes_AS_STRING(bytes), _PyUnicode_UTF8_LENGTH(u) + 1); 3309 Py_DECREF(bytes); 3310 } 3311 3312 if (psize) 3313 *psize = PyUnicode_UTF8_LENGTH(unicode); 3314 return PyUnicode_UTF8(unicode); 3315} 3316 3317char* 3318PyUnicode_AsUTF8(PyObject *unicode) 3319{ 3320 return PyUnicode_AsUTF8AndSize(unicode, NULL); 3321} 3322 3323#ifdef Py_DEBUG 3324int unicode_as_unicode_calls = 0; 3325#endif 3326 3327 3328Py_UNICODE * 3329PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size) 3330{ 3331 PyUnicodeObject *u; 3332 const unsigned char *one_byte; 3333#if SIZEOF_WCHAR_T == 4 3334 const Py_UCS2 
*two_bytes; 3335#else 3336 const Py_UCS4 *four_bytes; 3337 const Py_UCS4 *ucs4_end; 3338 Py_ssize_t num_surrogates; 3339#endif 3340 wchar_t *w; 3341 wchar_t *wchar_end; 3342 3343 if (!PyUnicode_Check(unicode)) { 3344 PyErr_BadArgument(); 3345 return NULL; 3346 } 3347 u = (PyUnicodeObject*)unicode; 3348 if (_PyUnicode_WSTR(u) == NULL) { 3349 /* Non-ASCII compact unicode object */ 3350 assert(_PyUnicode_KIND(u) != 0); 3351 assert(PyUnicode_IS_READY(u)); 3352 3353#ifdef Py_DEBUG 3354 ++unicode_as_unicode_calls; 3355#endif 3356 3357 if (PyUnicode_KIND(u) == PyUnicode_4BYTE_KIND) { 3358#if SIZEOF_WCHAR_T == 2 3359 four_bytes = PyUnicode_4BYTE_DATA(u); 3360 ucs4_end = four_bytes + _PyUnicode_LENGTH(u); 3361 num_surrogates = 0; 3362 3363 for (; four_bytes < ucs4_end; ++four_bytes) { 3364 if (*four_bytes > 0xFFFF) 3365 ++num_surrogates; 3366 } 3367 3368 _PyUnicode_WSTR(u) = (wchar_t *) PyObject_MALLOC( 3369 sizeof(wchar_t) * (_PyUnicode_LENGTH(u) + 1 + num_surrogates)); 3370 if (!_PyUnicode_WSTR(u)) { 3371 PyErr_NoMemory(); 3372 return NULL; 3373 } 3374 _PyUnicode_WSTR_LENGTH(u) = _PyUnicode_LENGTH(u) + num_surrogates; 3375 3376 w = _PyUnicode_WSTR(u); 3377 wchar_end = w + _PyUnicode_WSTR_LENGTH(u); 3378 four_bytes = PyUnicode_4BYTE_DATA(u); 3379 for (; four_bytes < ucs4_end; ++four_bytes, ++w) { 3380 if (*four_bytes > 0xFFFF) { 3381 /* encode surrogate pair in this case */ 3382 *w++ = 0xD800 | ((*four_bytes - 0x10000) >> 10); 3383 *w = 0xDC00 | ((*four_bytes - 0x10000) & 0x3FF); 3384 } 3385 else 3386 *w = *four_bytes; 3387 3388 if (w > wchar_end) { 3389 assert(0 && "Miscalculated string end"); 3390 } 3391 } 3392 *w = 0; 3393#else 3394 /* sizeof(wchar_t) == 4 */ 3395 Py_FatalError("Impossible unicode object state, wstr and str " 3396 "should share memory already."); 3397 return NULL; 3398#endif 3399 } 3400 else { 3401 _PyUnicode_WSTR(u) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) * 3402 (_PyUnicode_LENGTH(u) + 1)); 3403 if (!_PyUnicode_WSTR(u)) { 3404 PyErr_NoMemory(); 
3405 return NULL; 3406 } 3407 if (!PyUnicode_IS_COMPACT_ASCII(u)) 3408 _PyUnicode_WSTR_LENGTH(u) = _PyUnicode_LENGTH(u); 3409 w = _PyUnicode_WSTR(u); 3410 wchar_end = w + _PyUnicode_LENGTH(u); 3411 3412 if (PyUnicode_KIND(u) == PyUnicode_1BYTE_KIND) { 3413 one_byte = PyUnicode_1BYTE_DATA(u); 3414 for (; w < wchar_end; ++one_byte, ++w) 3415 *w = *one_byte; 3416 /* null-terminate the wstr */ 3417 *w = 0; 3418 } 3419 else if (PyUnicode_KIND(u) == PyUnicode_2BYTE_KIND) { 3420#if SIZEOF_WCHAR_T == 4 3421 two_bytes = PyUnicode_2BYTE_DATA(u); 3422 for (; w < wchar_end; ++two_bytes, ++w) 3423 *w = *two_bytes; 3424 /* null-terminate the wstr */ 3425 *w = 0; 3426#else 3427 /* sizeof(wchar_t) == 2 */ 3428 PyObject_FREE(_PyUnicode_WSTR(u)); 3429 _PyUnicode_WSTR(u) = NULL; 3430 Py_FatalError("Impossible unicode object state, wstr " 3431 "and str should share memory already."); 3432 return NULL; 3433#endif 3434 } 3435 else { 3436 assert(0 && "This should never happen."); 3437 } 3438 } 3439 } 3440 if (size != NULL) 3441 *size = PyUnicode_WSTR_LENGTH(u); 3442 return _PyUnicode_WSTR(u); 3443} 3444 3445Py_UNICODE * 3446PyUnicode_AsUnicode(PyObject *unicode) 3447{ 3448 return PyUnicode_AsUnicodeAndSize(unicode, NULL); 3449} 3450 3451 3452Py_ssize_t 3453PyUnicode_GetSize(PyObject *unicode) 3454{ 3455 if (!PyUnicode_Check(unicode)) { 3456 PyErr_BadArgument(); 3457 goto onError; 3458 } 3459 return PyUnicode_GET_SIZE(unicode); 3460 3461 onError: 3462 return -1; 3463} 3464 3465Py_ssize_t 3466PyUnicode_GetLength(PyObject *unicode) 3467{ 3468 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) { 3469 PyErr_BadArgument(); 3470 return -1; 3471 } 3472 3473 return PyUnicode_GET_LENGTH(unicode); 3474} 3475 3476Py_UCS4 3477PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index) 3478{ 3479 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) { 3480 PyErr_BadArgument(); 3481 return (Py_UCS4)-1; 3482 } 3483 if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) { 3484 
PyErr_SetString(PyExc_IndexError, "string index out of range"); 3485 return (Py_UCS4)-1; 3486 } 3487 return PyUnicode_READ_CHAR(unicode, index); 3488} 3489 3490int 3491PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch) 3492{ 3493 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) { 3494 PyErr_BadArgument(); 3495 return -1; 3496 } 3497 if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) { 3498 PyErr_SetString(PyExc_IndexError, "string index out of range"); 3499 return -1; 3500 } 3501 if (_PyUnicode_Dirty(unicode)) 3502 return -1; 3503 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode), 3504 index, ch); 3505 return 0; 3506} 3507 3508const char * 3509PyUnicode_GetDefaultEncoding(void) 3510{ 3511 return "utf-8"; 3512} 3513 3514/* create or adjust a UnicodeDecodeError */ 3515static void 3516make_decode_exception(PyObject **exceptionObject, 3517 const char *encoding, 3518 const char *input, Py_ssize_t length, 3519 Py_ssize_t startpos, Py_ssize_t endpos, 3520 const char *reason) 3521{ 3522 if (*exceptionObject == NULL) { 3523 *exceptionObject = PyUnicodeDecodeError_Create( 3524 encoding, input, length, startpos, endpos, reason); 3525 } 3526 else { 3527 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos)) 3528 goto onError; 3529 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos)) 3530 goto onError; 3531 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason)) 3532 goto onError; 3533 } 3534 return; 3535 3536onError: 3537 Py_DECREF(*exceptionObject); 3538 *exceptionObject = NULL; 3539} 3540 3541/* error handling callback helper: 3542 build arguments, call the callback and check the arguments, 3543 if no exception occurred, copy the replacement to the output 3544 and adjust various state variables. 
3545 return 0 on success, -1 on error 3546*/ 3547 3548static int 3549unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler, 3550 const char *encoding, const char *reason, 3551 const char **input, const char **inend, Py_ssize_t *startinpos, 3552 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr, 3553 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr) 3554{ 3555 static char *argparse = "O!n;decoding error handler must return (str, int) tuple"; 3556 3557 PyObject *restuple = NULL; 3558 PyObject *repunicode = NULL; 3559 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output); 3560 Py_ssize_t insize; 3561 Py_ssize_t requiredsize; 3562 Py_ssize_t newpos; 3563 const Py_UNICODE *repptr; 3564 PyObject *inputobj = NULL; 3565 Py_ssize_t repsize; 3566 int res = -1; 3567 3568 if (*errorHandler == NULL) { 3569 *errorHandler = PyCodec_LookupError(errors); 3570 if (*errorHandler == NULL) 3571 goto onError; 3572 } 3573 3574 make_decode_exception(exceptionObject, 3575 encoding, 3576 *input, *inend - *input, 3577 *startinpos, *endinpos, 3578 reason); 3579 if (*exceptionObject == NULL) 3580 goto onError; 3581 3582 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL); 3583 if (restuple == NULL) 3584 goto onError; 3585 if (!PyTuple_Check(restuple)) { 3586 PyErr_SetString(PyExc_TypeError, &argparse[4]); 3587 goto onError; 3588 } 3589 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos)) 3590 goto onError; 3591 3592 /* Copy back the bytes variables, which might have been modified by the 3593 callback */ 3594 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject); 3595 if (!inputobj) 3596 goto onError; 3597 if (!PyBytes_Check(inputobj)) { 3598 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes"); 3599 } 3600 *input = PyBytes_AS_STRING(inputobj); 3601 insize = PyBytes_GET_SIZE(inputobj); 3602 *inend = *input + insize; 3603 /* we can DECREF safely, as the 
exception has another reference, 3604 so the object won't go away. */ 3605 Py_DECREF(inputobj); 3606 3607 if (newpos<0) 3608 newpos = insize+newpos; 3609 if (newpos<0 || newpos>insize) { 3610 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos); 3611 goto onError; 3612 } 3613 3614 /* need more space? (at least enough for what we 3615 have+the replacement+the rest of the string (starting 3616 at the new input position), so we won't have to check space 3617 when there are no errors in the rest of the string) */ 3618 repptr = PyUnicode_AS_UNICODE(repunicode); 3619 repsize = PyUnicode_GET_SIZE(repunicode); 3620 requiredsize = *outpos + repsize + insize-newpos; 3621 if (requiredsize > outsize) { 3622 if (requiredsize<2*outsize) 3623 requiredsize = 2*outsize; 3624 if (PyUnicode_Resize((PyObject**)output, requiredsize) < 0) 3625 goto onError; 3626 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos; 3627 } 3628 *endinpos = newpos; 3629 *inptr = *input + newpos; 3630 Py_UNICODE_COPY(*outptr, repptr, repsize); 3631 *outptr += repsize; 3632 *outpos += repsize; 3633 3634 /* we made it! */ 3635 res = 0; 3636 3637 onError: 3638 Py_XDECREF(restuple); 3639 return res; 3640} 3641 3642/* --- UTF-7 Codec -------------------------------------------------------- */ 3643 3644/* See RFC2152 for details. We encode conservatively and decode liberally. */ 3645 3646/* Three simple macros defining base-64. */ 3647 3648/* Is c a base-64 character? */ 3649 3650#define IS_BASE64(c) \ 3651 (((c) >= 'A' && (c) <= 'Z') || \ 3652 ((c) >= 'a' && (c) <= 'z') || \ 3653 ((c) >= '0' && (c) <= '9') || \ 3654 (c) == '+' || (c) == '/') 3655 3656/* given that c is a base-64 character, what is its base-64 value? */ 3657 3658#define FROM_BASE64(c) \ 3659 (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' : \ 3660 ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 : \ 3661 ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 : \ 3662 (c) == '+' ? 
62 : 63) 3663 3664/* What is the base-64 character of the bottom 6 bits of n? */ 3665 3666#define TO_BASE64(n) \ 3667 ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f]) 3668 3669/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be 3670 * decoded as itself. We are permissive on decoding; the only ASCII 3671 * byte not decoding to itself is the + which begins a base64 3672 * string. */ 3673 3674#define DECODE_DIRECT(c) \ 3675 ((c) <= 127 && (c) != '+') 3676 3677/* The UTF-7 encoder treats ASCII characters differently according to 3678 * whether they are Set D, Set O, Whitespace, or special (i.e. none of 3679 * the above). See RFC2152. This array identifies these different 3680 * sets: 3681 * 0 : "Set D" 3682 * alphanumeric and '(),-./:? 3683 * 1 : "Set O" 3684 * !"#$%&*;<=>@[]^_`{|} 3685 * 2 : "whitespace" 3686 * ht nl cr sp 3687 * 3 : special (must be base64 encoded) 3688 * everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127) 3689 */ 3690 3691static 3692char utf7_category[128] = { 3693/* nul soh stx etx eot enq ack bel bs ht nl vt np cr so si */ 3694 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3, 3695/* dle dc1 dc2 dc3 dc4 nak syn etb can em sub esc fs gs rs us */ 3696 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3697/* sp ! " # $ % & ' ( ) * + , - . / */ 3698 2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 0, 3699/* 0 1 2 3 4 5 6 7 8 9 : ; < = > ? */ 3700 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 3701/* @ A B C D E F G H I J K L M N O */ 3702 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3703/* P Q R S T U V W X Y Z [ \ ] ^ _ */ 3704 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1, 3705/* ` a b c d e f g h i j k l m n o */ 3706 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3707/* p q r s t u v w x y z { | } ~ del */ 3708 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3, 3709}; 3710 3711/* ENCODE_DIRECT: this character should be encoded as itself. 
The 3712 * answer depends on whether we are encoding set O as itself, and also 3713 * on whether we are encoding whitespace as itself. RFC2152 makes it 3714 * clear that the answers to these questions vary between 3715 * applications, so this code needs to be flexible. */ 3716 3717#define ENCODE_DIRECT(c, directO, directWS) \ 3718 ((c) < 128 && (c) > 0 && \ 3719 ((utf7_category[(c)] == 0) || \ 3720 (directWS && (utf7_category[(c)] == 2)) || \ 3721 (directO && (utf7_category[(c)] == 1)))) 3722 3723PyObject * 3724PyUnicode_DecodeUTF7(const char *s, 3725 Py_ssize_t size, 3726 const char *errors) 3727{ 3728 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL); 3729} 3730 3731/* The decoder. The only state we preserve is our read position, 3732 * i.e. how many characters we have consumed. So if we end in the 3733 * middle of a shift sequence we have to back off the read position 3734 * and the output to the beginning of the sequence, otherwise we lose 3735 * all the shift state (seen bits, number of bits seen, high 3736 * surrogate). 
*/ 3737 3738PyObject * 3739PyUnicode_DecodeUTF7Stateful(const char *s, 3740 Py_ssize_t size, 3741 const char *errors, 3742 Py_ssize_t *consumed) 3743{ 3744 const char *starts = s; 3745 Py_ssize_t startinpos; 3746 Py_ssize_t endinpos; 3747 Py_ssize_t outpos; 3748 const char *e; 3749 PyUnicodeObject *unicode; 3750 Py_UNICODE *p; 3751 const char *errmsg = ""; 3752 int inShift = 0; 3753 Py_UNICODE *shiftOutStart; 3754 unsigned int base64bits = 0; 3755 unsigned long base64buffer = 0; 3756 Py_UNICODE surrogate = 0; 3757 PyObject *errorHandler = NULL; 3758 PyObject *exc = NULL; 3759 3760 unicode = _PyUnicode_New(size); 3761 if (!unicode) 3762 return NULL; 3763 if (size == 0) { 3764 if (consumed) 3765 *consumed = 0; 3766 return (PyObject *)unicode; 3767 } 3768 3769 p = PyUnicode_AS_UNICODE(unicode); 3770 shiftOutStart = p; 3771 e = s + size; 3772 3773 while (s < e) { 3774 Py_UNICODE ch; 3775 restart: 3776 ch = (unsigned char) *s; 3777 3778 if (inShift) { /* in a base-64 section */ 3779 if (IS_BASE64(ch)) { /* consume a base-64 character */ 3780 base64buffer = (base64buffer << 6) | FROM_BASE64(ch); 3781 base64bits += 6; 3782 s++; 3783 if (base64bits >= 16) { 3784 /* we have enough bits for a UTF-16 value */ 3785 Py_UNICODE outCh = (Py_UNICODE) 3786 (base64buffer >> (base64bits-16)); 3787 base64bits -= 16; 3788 base64buffer &= (1 << base64bits) - 1; /* clear high bits */ 3789 if (surrogate) { 3790 /* expecting a second surrogate */ 3791 if (outCh >= 0xDC00 && outCh <= 0xDFFF) { 3792#ifdef Py_UNICODE_WIDE 3793 *p++ = (((surrogate & 0x3FF)<<10) 3794 | (outCh & 0x3FF)) + 0x10000; 3795#else 3796 *p++ = surrogate; 3797 *p++ = outCh; 3798#endif 3799 surrogate = 0; 3800 } 3801 else { 3802 surrogate = 0; 3803 errmsg = "second surrogate missing"; 3804 goto utf7Error; 3805 } 3806 } 3807 else if (outCh >= 0xD800 && outCh <= 0xDBFF) { 3808 /* first surrogate */ 3809 surrogate = outCh; 3810 } 3811 else if (outCh >= 0xDC00 && outCh <= 0xDFFF) { 3812 errmsg = "unexpected second surrogate"; 
3813 goto utf7Error; 3814 } 3815 else { 3816 *p++ = outCh; 3817 } 3818 } 3819 } 3820 else { /* now leaving a base-64 section */ 3821 inShift = 0; 3822 s++; 3823 if (surrogate) { 3824 errmsg = "second surrogate missing at end of shift sequence"; 3825 goto utf7Error; 3826 } 3827 if (base64bits > 0) { /* left-over bits */ 3828 if (base64bits >= 6) { 3829 /* We've seen at least one base-64 character */ 3830 errmsg = "partial character in shift sequence"; 3831 goto utf7Error; 3832 } 3833 else { 3834 /* Some bits remain; they should be zero */ 3835 if (base64buffer != 0) { 3836 errmsg = "non-zero padding bits in shift sequence"; 3837 goto utf7Error; 3838 } 3839 } 3840 } 3841 if (ch != '-') { 3842 /* '-' is absorbed; other terminating 3843 characters are preserved */ 3844 *p++ = ch; 3845 } 3846 } 3847 } 3848 else if ( ch == '+' ) { 3849 startinpos = s-starts; 3850 s++; /* consume '+' */ 3851 if (s < e && *s == '-') { /* '+-' encodes '+' */ 3852 s++; 3853 *p++ = '+'; 3854 } 3855 else { /* begin base64-encoded section */ 3856 inShift = 1; 3857 shiftOutStart = p; 3858 base64bits = 0; 3859 } 3860 } 3861 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */ 3862 *p++ = ch; 3863 s++; 3864 } 3865 else { 3866 startinpos = s-starts; 3867 s++; 3868 errmsg = "unexpected special character"; 3869 goto utf7Error; 3870 } 3871 continue; 3872utf7Error: 3873 outpos = p-PyUnicode_AS_UNICODE(unicode); 3874 endinpos = s-starts; 3875 if (unicode_decode_call_errorhandler( 3876 errors, &errorHandler, 3877 "utf7", errmsg, 3878 &starts, &e, &startinpos, &endinpos, &exc, &s, 3879 &unicode, &outpos, &p)) 3880 goto onError; 3881 } 3882 3883 /* end of string */ 3884 3885 if (inShift && !consumed) { /* in shift sequence, no more to follow */ 3886 /* if we're in an inconsistent state, that's an error */ 3887 if (surrogate || 3888 (base64bits >= 6) || 3889 (base64bits > 0 && base64buffer != 0)) { 3890 outpos = p-PyUnicode_AS_UNICODE(unicode); 3891 endinpos = size; 3892 if 
(unicode_decode_call_errorhandler( 3893 errors, &errorHandler, 3894 "utf7", "unterminated shift sequence", 3895 &starts, &e, &startinpos, &endinpos, &exc, &s, 3896 &unicode, &outpos, &p)) 3897 goto onError; 3898 if (s < e) 3899 goto restart; 3900 } 3901 } 3902 3903 /* return state */ 3904 if (consumed) { 3905 if (inShift) { 3906 p = shiftOutStart; /* back off output */ 3907 *consumed = startinpos; 3908 } 3909 else { 3910 *consumed = s-starts; 3911 } 3912 } 3913 3914 if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0) 3915 goto onError; 3916 3917 Py_XDECREF(errorHandler); 3918 Py_XDECREF(exc); 3919#ifndef DONT_MAKE_RESULT_READY 3920 if (_PyUnicode_READY_REPLACE(&unicode)) { 3921 Py_DECREF(unicode); 3922 return NULL; 3923 } 3924#endif 3925 assert(_PyUnicode_CheckConsistency(unicode, 1)); 3926 return (PyObject *)unicode; 3927 3928 onError: 3929 Py_XDECREF(errorHandler); 3930 Py_XDECREF(exc); 3931 Py_DECREF(unicode); 3932 return NULL; 3933} 3934 3935 3936PyObject * 3937PyUnicode_EncodeUTF7(const Py_UNICODE *s, 3938 Py_ssize_t size, 3939 int base64SetO, 3940 int base64WhiteSpace, 3941 const char *errors) 3942{ 3943 PyObject *v; 3944 /* It might be possible to tighten this worst case */ 3945 Py_ssize_t allocated = 8 * size; 3946 int inShift = 0; 3947 Py_ssize_t i = 0; 3948 unsigned int base64bits = 0; 3949 unsigned long base64buffer = 0; 3950 char * out; 3951 char * start; 3952 3953 if (size == 0) 3954 return PyBytes_FromStringAndSize(NULL, 0); 3955 3956 if (allocated / 8 != size) 3957 return PyErr_NoMemory(); 3958 3959 v = PyBytes_FromStringAndSize(NULL, allocated); 3960 if (v == NULL) 3961 return NULL; 3962 3963 start = out = PyBytes_AS_STRING(v); 3964 for (;i < size; ++i) { 3965 Py_UNICODE ch = s[i]; 3966 3967 if (inShift) { 3968 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) { 3969 /* shifting out */ 3970 if (base64bits) { /* output remaining bits */ 3971 *out++ = TO_BASE64(base64buffer << (6-base64bits)); 3972 base64buffer = 0; 
3973 base64bits = 0; 3974 } 3975 inShift = 0; 3976 /* Characters not in the BASE64 set implicitly unshift the sequence 3977 so no '-' is required, except if the character is itself a '-' */ 3978 if (IS_BASE64(ch) || ch == '-') { 3979 *out++ = '-'; 3980 } 3981 *out++ = (char) ch; 3982 } 3983 else { 3984 goto encode_char; 3985 } 3986 } 3987 else { /* not in a shift sequence */ 3988 if (ch == '+') { 3989 *out++ = '+'; 3990 *out++ = '-'; 3991 } 3992 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) { 3993 *out++ = (char) ch; 3994 } 3995 else { 3996 *out++ = '+'; 3997 inShift = 1; 3998 goto encode_char; 3999 } 4000 } 4001 continue; 4002encode_char: 4003#ifdef Py_UNICODE_WIDE 4004 if (ch >= 0x10000) { 4005 /* code first surrogate */ 4006 base64bits += 16; 4007 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10); 4008 while (base64bits >= 6) { 4009 *out++ = TO_BASE64(base64buffer >> (base64bits-6)); 4010 base64bits -= 6; 4011 } 4012 /* prepare second surrogate */ 4013 ch = 0xDC00 | ((ch-0x10000) & 0x3FF); 4014 } 4015#endif 4016 base64bits += 16; 4017 base64buffer = (base64buffer << 16) | ch; 4018 while (base64bits >= 6) { 4019 *out++ = TO_BASE64(base64buffer >> (base64bits-6)); 4020 base64bits -= 6; 4021 } 4022 } 4023 if (base64bits) 4024 *out++= TO_BASE64(base64buffer << (6-base64bits) ); 4025 if (inShift) 4026 *out++ = '-'; 4027 if (_PyBytes_Resize(&v, out - start) < 0) 4028 return NULL; 4029 return v; 4030} 4031 4032#undef IS_BASE64 4033#undef FROM_BASE64 4034#undef TO_BASE64 4035#undef DECODE_DIRECT 4036#undef ENCODE_DIRECT 4037 4038/* --- UTF-8 Codec -------------------------------------------------------- */ 4039 4040static 4041char utf8_code_length[256] = { 4042 /* Map UTF-8 encoded prefix byte to sequence length. Zero means 4043 illegal prefix. 
See RFC 3629 for details */ 4044 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */ 4045 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4046 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4047 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4048 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4049 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4050 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4051 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */ 4052 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */ 4053 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4054 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4055 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */ 4056 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */ 4057 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */ 4058 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */ 4059 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 /* F0-F4 + F5-FF */ 4060}; 4061 4062PyObject * 4063PyUnicode_DecodeUTF8(const char *s, 4064 Py_ssize_t size, 4065 const char *errors) 4066{ 4067 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL); 4068} 4069 4070/* Mask to check or force alignment of a pointer to C 'long' boundaries */ 4071#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1) 4072 4073/* Mask to quickly check whether a C 'long' contains a 4074 non-ASCII, UTF8-encoded char. */ 4075#if (SIZEOF_LONG == 8) 4076# define ASCII_CHAR_MASK 0x8080808080808080L 4077#elif (SIZEOF_LONG == 4) 4078# define ASCII_CHAR_MASK 0x80808080L 4079#else 4080# error C 'long' size should be either 4 or 8! 4081#endif 4082 4083/* Scans a UTF-8 string and returns the maximum character to be expected, 4084 the size of the decoded unicode string and if any major errors were 4085 encountered. 
4086 4087 This function does check basic UTF-8 sanity, it does however NOT CHECK 4088 if the string contains surrogates, and if all continuation bytes are 4089 within the correct ranges, these checks are performed in 4090 PyUnicode_DecodeUTF8Stateful. 4091 4092 If it sets has_errors to 1, it means the value of unicode_size and max_char 4093 will be bogus and you should not rely on useful information in them. 4094 */ 4095static Py_UCS4 4096utf8_max_char_size_and_has_errors(const char *s, Py_ssize_t string_size, 4097 Py_ssize_t *unicode_size, Py_ssize_t* consumed, 4098 int *has_errors) 4099{ 4100 Py_ssize_t n; 4101 Py_ssize_t char_count = 0; 4102 Py_UCS4 max_char = 127, new_max; 4103 Py_UCS4 upper_bound; 4104 const unsigned char *p = (const unsigned char *)s; 4105 const unsigned char *end = p + string_size; 4106 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK); 4107 int err = 0; 4108 4109 for (; p < end && !err; ++p, ++char_count) { 4110 /* Only check value if it's not a ASCII char... */ 4111 if (*p < 0x80) { 4112 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for 4113 an explanation. */ 4114 if (!((size_t) p & LONG_PTR_MASK)) { 4115 /* Help register allocation */ 4116 register const unsigned char *_p = p; 4117 while (_p < aligned_end) { 4118 unsigned long value = *(unsigned long *) _p; 4119 if (value & ASCII_CHAR_MASK) 4120 break; 4121 _p += SIZEOF_LONG; 4122 char_count += SIZEOF_LONG; 4123 } 4124 p = _p; 4125 if (p == end) 4126 break; 4127 } 4128 } 4129 if (*p >= 0x80) { 4130 n = utf8_code_length[*p]; 4131 new_max = max_char; 4132 switch (n) { 4133 /* invalid start byte */ 4134 case 0: 4135 err = 1; 4136 break; 4137 case 2: 4138 /* Code points between 0x00FF and 0x07FF inclusive. 
4139 Approximate the upper bound of the code point, 4140 if this flips over 255 we can be sure it will be more 4141 than 255 and the string will need 2 bytes per code coint, 4142 if it stays under or equal to 255, we can be sure 1 byte 4143 is enough. 4144 ((*p & 0b00011111) << 6) | 0b00111111 */ 4145 upper_bound = ((*p & 0x1F) << 6) | 0x3F; 4146 if (max_char < upper_bound) 4147 new_max = upper_bound; 4148 /* Ensure we track at least that we left ASCII space. */ 4149 if (new_max < 128) 4150 new_max = 128; 4151 break; 4152 case 3: 4153 /* Between 0x0FFF and 0xFFFF inclusive, so values are 4154 always > 255 and <= 65535 and will always need 2 bytes. */ 4155 if (max_char < 65535) 4156 new_max = 65535; 4157 break; 4158 case 4: 4159 /* Code point will be above 0xFFFF for sure in this case. */ 4160 new_max = 65537; 4161 break; 4162 /* Internal error, this should be caught by the first if */ 4163 case 1: 4164 default: 4165 assert(0 && "Impossible case in utf8_max_char_and_size"); 4166 err = 1; 4167 } 4168 /* Instead of number of overall bytes for this code point, 4169 n contains the number of following bytes: */ 4170 --n; 4171 /* Check if the follow up chars are all valid continuation bytes */ 4172 if (n >= 1) { 4173 const unsigned char *cont; 4174 if ((p + n) >= end) { 4175 if (consumed == 0) 4176 /* incomplete data, non-incremental decoding */ 4177 err = 1; 4178 break; 4179 } 4180 for (cont = p + 1; cont < (p + n); ++cont) { 4181 if ((*cont & 0xc0) != 0x80) { 4182 err = 1; 4183 break; 4184 } 4185 } 4186 p += n; 4187 } 4188 else 4189 err = 1; 4190 max_char = new_max; 4191 } 4192 } 4193 4194 if (unicode_size) 4195 *unicode_size = char_count; 4196 if (has_errors) 4197 *has_errors = err; 4198 return max_char; 4199} 4200 4201/* Similar to PyUnicode_WRITE but can also write into wstr field 4202 of the legacy unicode representation */ 4203#define WRITE_FLEXIBLE_OR_WSTR(kind, buf, index, value) \ 4204 do { \ 4205 const int k_ = (kind); \ 4206 if (k_ == PyUnicode_WCHAR_KIND) \ 
4207 ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value); \ 4208 else if (k_ == PyUnicode_1BYTE_KIND) \ 4209 ((unsigned char *)(buf))[(index)] = (unsigned char)(value); \ 4210 else if (k_ == PyUnicode_2BYTE_KIND) \ 4211 ((Py_UCS2 *)(buf))[(index)] = (Py_UCS2)(value); \ 4212 else \ 4213 ((Py_UCS4 *)(buf))[(index)] = (Py_UCS4)(value); \ 4214 } while (0) 4215 4216PyObject * 4217PyUnicode_DecodeUTF8Stateful(const char *s, 4218 Py_ssize_t size, 4219 const char *errors, 4220 Py_ssize_t *consumed) 4221{ 4222 const char *starts = s; 4223 int n; 4224 int k; 4225 Py_ssize_t startinpos; 4226 Py_ssize_t endinpos; 4227 const char *e, *aligned_end; 4228 PyUnicodeObject *unicode; 4229 const char *errmsg = ""; 4230 PyObject *errorHandler = NULL; 4231 PyObject *exc = NULL; 4232 Py_UCS4 maxchar = 0; 4233 Py_ssize_t unicode_size; 4234 Py_ssize_t i; 4235 int kind; 4236 void *data; 4237 int has_errors; 4238 Py_UNICODE *error_outptr; 4239#if SIZEOF_WCHAR_T == 2 4240 Py_ssize_t wchar_offset = 0; 4241#endif 4242 4243 if (size == 0) { 4244 if (consumed) 4245 *consumed = 0; 4246 return (PyObject *)PyUnicode_New(0, 0); 4247 } 4248 maxchar = utf8_max_char_size_and_has_errors(s, size, &unicode_size, 4249 consumed, &has_errors); 4250 if (has_errors) { 4251 unicode = _PyUnicode_New(size); 4252 if (!unicode) 4253 return NULL; 4254 kind = PyUnicode_WCHAR_KIND; 4255 data = PyUnicode_AS_UNICODE(unicode); 4256 assert(data != NULL); 4257 } 4258 else { 4259 unicode = (PyUnicodeObject *)PyUnicode_New(unicode_size, maxchar); 4260 if (!unicode) 4261 return NULL; 4262 /* When the string is ASCII only, just use memcpy and return. 4263 unicode_size may be != size if there is an incomplete UTF-8 4264 sequence at the end of the ASCII block. 
*/ 4265 if (maxchar < 128 && size == unicode_size) { 4266 Py_MEMCPY(PyUnicode_1BYTE_DATA(unicode), s, unicode_size); 4267 return (PyObject *)unicode; 4268 } 4269 kind = PyUnicode_KIND(unicode); 4270 data = PyUnicode_DATA(unicode); 4271 } 4272 /* Unpack UTF-8 encoded data */ 4273 i = 0; 4274 e = s + size; 4275 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK); 4276 4277 while (s < e) { 4278 Py_UCS4 ch = (unsigned char)*s; 4279 4280 if (ch < 0x80) { 4281 /* Fast path for runs of ASCII characters. Given that common UTF-8 4282 input will consist of an overwhelming majority of ASCII 4283 characters, we try to optimize for this case by checking 4284 as many characters as a C 'long' can contain. 4285 First, check if we can do an aligned read, as most CPUs have 4286 a penalty for unaligned reads. 4287 */ 4288 if (!((size_t) s & LONG_PTR_MASK)) { 4289 /* Help register allocation */ 4290 register const char *_s = s; 4291 register Py_ssize_t _i = i; 4292 while (_s < aligned_end) { 4293 /* Read a whole long at a time (either 4 or 8 bytes), 4294 and do a fast unrolled copy if it only contains ASCII 4295 characters. 
*/ 4296 unsigned long value = *(unsigned long *) _s; 4297 if (value & ASCII_CHAR_MASK) 4298 break; 4299 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+0, _s[0]); 4300 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+1, _s[1]); 4301 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+2, _s[2]); 4302 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+3, _s[3]); 4303#if (SIZEOF_LONG == 8) 4304 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+4, _s[4]); 4305 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+5, _s[5]); 4306 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+6, _s[6]); 4307 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+7, _s[7]); 4308#endif 4309 _s += SIZEOF_LONG; 4310 _i += SIZEOF_LONG; 4311 } 4312 s = _s; 4313 i = _i; 4314 if (s == e) 4315 break; 4316 ch = (unsigned char)*s; 4317 } 4318 } 4319 4320 if (ch < 0x80) { 4321 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch); 4322 s++; 4323 continue; 4324 } 4325 4326 n = utf8_code_length[ch]; 4327 4328 if (s + n > e) { 4329 if (consumed) 4330 break; 4331 else { 4332 errmsg = "unexpected end of data"; 4333 startinpos = s-starts; 4334 endinpos = startinpos+1; 4335 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++) 4336 endinpos++; 4337 goto utf8Error; 4338 } 4339 } 4340 4341 switch (n) { 4342 4343 case 0: 4344 errmsg = "invalid start byte"; 4345 startinpos = s-starts; 4346 endinpos = startinpos+1; 4347 goto utf8Error; 4348 4349 case 1: 4350 errmsg = "internal error"; 4351 startinpos = s-starts; 4352 endinpos = startinpos+1; 4353 goto utf8Error; 4354 4355 case 2: 4356 if ((s[1] & 0xc0) != 0x80) { 4357 errmsg = "invalid continuation byte"; 4358 startinpos = s-starts; 4359 endinpos = startinpos + 1; 4360 goto utf8Error; 4361 } 4362 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f); 4363 assert ((ch > 0x007F) && (ch <= 0x07FF)); 4364 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch); 4365 break; 4366 4367 case 3: 4368 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf 4369 will result in surrogates in range d800-dfff. Surrogates are 4370 not valid UTF-8 so they are rejected. 
4371 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf 4372 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */ 4373 if ((s[1] & 0xc0) != 0x80 || 4374 (s[2] & 0xc0) != 0x80 || 4375 ((unsigned char)s[0] == 0xE0 && 4376 (unsigned char)s[1] < 0xA0) || 4377 ((unsigned char)s[0] == 0xED && 4378 (unsigned char)s[1] > 0x9F)) { 4379 errmsg = "invalid continuation byte"; 4380 startinpos = s-starts; 4381 endinpos = startinpos + 1; 4382 4383 /* if s[1] first two bits are 1 and 0, then the invalid 4384 continuation byte is s[2], so increment endinpos by 1, 4385 if not, s[1] is invalid and endinpos doesn't need to 4386 be incremented. */ 4387 if ((s[1] & 0xC0) == 0x80) 4388 endinpos++; 4389 goto utf8Error; 4390 } 4391 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f); 4392 assert ((ch > 0x07FF) && (ch <= 0xFFFF)); 4393 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch); 4394 break; 4395 4396 case 4: 4397 if ((s[1] & 0xc0) != 0x80 || 4398 (s[2] & 0xc0) != 0x80 || 4399 (s[3] & 0xc0) != 0x80 || 4400 ((unsigned char)s[0] == 0xF0 && 4401 (unsigned char)s[1] < 0x90) || 4402 ((unsigned char)s[0] == 0xF4 && 4403 (unsigned char)s[1] > 0x8F)) { 4404 errmsg = "invalid continuation byte"; 4405 startinpos = s-starts; 4406 endinpos = startinpos + 1; 4407 if ((s[1] & 0xC0) == 0x80) { 4408 endinpos++; 4409 if ((s[2] & 0xC0) == 0x80) 4410 endinpos++; 4411 } 4412 goto utf8Error; 4413 } 4414 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) + 4415 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f); 4416 assert ((ch > 0xFFFF) && (ch <= 0x10ffff)); 4417 4418 /* If the string is flexible or we have native UCS-4, write 4419 directly.. 
*/ 4420 if (sizeof(Py_UNICODE) > 2 || kind != PyUnicode_WCHAR_KIND) 4421 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch); 4422 4423 else { 4424 /* compute and append the two surrogates: */ 4425 4426 /* translate from 10000..10FFFF to 0..FFFF */ 4427 ch -= 0x10000; 4428 4429 /* high surrogate = top 10 bits added to D800 */ 4430 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, 4431 (Py_UNICODE)(0xD800 + (ch >> 10))); 4432 4433 /* low surrogate = bottom 10 bits added to DC00 */ 4434 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, 4435 (Py_UNICODE)(0xDC00 + (ch & 0x03FF))); 4436 } 4437#if SIZEOF_WCHAR_T == 2 4438 wchar_offset++; 4439#endif 4440 break; 4441 } 4442 s += n; 4443 continue; 4444 4445 utf8Error: 4446 /* If this is not yet a resizable string, make it one.. */ 4447 if (kind != PyUnicode_WCHAR_KIND) { 4448 const Py_UNICODE *u; 4449 PyUnicodeObject *new_unicode = _PyUnicode_New(size); 4450 if (!new_unicode) 4451 goto onError; 4452 u = PyUnicode_AsUnicode((PyObject *)unicode); 4453 if (!u) 4454 goto onError; 4455#if SIZEOF_WCHAR_T == 2 4456 i += wchar_offset; 4457#endif 4458 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(new_unicode), u, i); 4459 Py_DECREF(unicode); 4460 unicode = new_unicode; 4461 kind = 0; 4462 data = PyUnicode_AS_UNICODE(new_unicode); 4463 assert(data != NULL); 4464 } 4465 error_outptr = PyUnicode_AS_UNICODE(unicode) + i; 4466 if (unicode_decode_call_errorhandler( 4467 errors, &errorHandler, 4468 "utf8", errmsg, 4469 &starts, &e, &startinpos, &endinpos, &exc, &s, 4470 &unicode, &i, &error_outptr)) 4471 goto onError; 4472 /* Update data because unicode_decode_call_errorhandler might have 4473 re-created or resized the unicode object. 
*/ 4474 data = PyUnicode_AS_UNICODE(unicode); 4475 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK); 4476 } 4477 /* Ensure the unicode_size calculation above was correct: */ 4478 assert(kind == PyUnicode_WCHAR_KIND || i == unicode_size); 4479 4480 if (consumed) 4481 *consumed = s-starts; 4482 4483 /* Adjust length and ready string when it contained errors and 4484 is of the old resizable kind. */ 4485 if (kind == PyUnicode_WCHAR_KIND) { 4486 if (PyUnicode_Resize((PyObject**)&unicode, i) < 0) 4487 goto onError; 4488 } 4489 4490 Py_XDECREF(errorHandler); 4491 Py_XDECREF(exc); 4492#ifndef DONT_MAKE_RESULT_READY 4493 if (_PyUnicode_READY_REPLACE(&unicode)) { 4494 Py_DECREF(unicode); 4495 return NULL; 4496 } 4497#endif 4498 assert(_PyUnicode_CheckConsistency(unicode, 1)); 4499 return (PyObject *)unicode; 4500 4501 onError: 4502 Py_XDECREF(errorHandler); 4503 Py_XDECREF(exc); 4504 Py_DECREF(unicode); 4505 return NULL; 4506} 4507 4508#undef WRITE_FLEXIBLE_OR_WSTR 4509 4510#ifdef __APPLE__ 4511 4512/* Simplified UTF-8 decoder using surrogateescape error handler, 4513 used to decode the command line arguments on Mac OS X. 
*/ 4514 4515wchar_t* 4516_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size) 4517{ 4518 int n; 4519 const char *e; 4520 wchar_t *unicode, *p; 4521 4522 /* Note: size will always be longer than the resulting Unicode 4523 character count */ 4524 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) { 4525 PyErr_NoMemory(); 4526 return NULL; 4527 } 4528 unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t)); 4529 if (!unicode) 4530 return NULL; 4531 4532 /* Unpack UTF-8 encoded data */ 4533 p = unicode; 4534 e = s + size; 4535 while (s < e) { 4536 Py_UCS4 ch = (unsigned char)*s; 4537 4538 if (ch < 0x80) { 4539 *p++ = (wchar_t)ch; 4540 s++; 4541 continue; 4542 } 4543 4544 n = utf8_code_length[ch]; 4545 if (s + n > e) { 4546 goto surrogateescape; 4547 } 4548 4549 switch (n) { 4550 case 0: 4551 case 1: 4552 goto surrogateescape; 4553 4554 case 2: 4555 if ((s[1] & 0xc0) != 0x80) 4556 goto surrogateescape; 4557 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f); 4558 assert ((ch > 0x007F) && (ch <= 0x07FF)); 4559 *p++ = (wchar_t)ch; 4560 break; 4561 4562 case 3: 4563 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf 4564 will result in surrogates in range d800-dfff. Surrogates are 4565 not valid UTF-8 so they are rejected. 
4566 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf 4567 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */ 4568 if ((s[1] & 0xc0) != 0x80 || 4569 (s[2] & 0xc0) != 0x80 || 4570 ((unsigned char)s[0] == 0xE0 && 4571 (unsigned char)s[1] < 0xA0) || 4572 ((unsigned char)s[0] == 0xED && 4573 (unsigned char)s[1] > 0x9F)) { 4574 4575 goto surrogateescape; 4576 } 4577 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f); 4578 assert ((ch > 0x07FF) && (ch <= 0xFFFF)); 4579 *p++ = (wchar_t)ch; 4580 break; 4581 4582 case 4: 4583 if ((s[1] & 0xc0) != 0x80 || 4584 (s[2] & 0xc0) != 0x80 || 4585 (s[3] & 0xc0) != 0x80 || 4586 ((unsigned char)s[0] == 0xF0 && 4587 (unsigned char)s[1] < 0x90) || 4588 ((unsigned char)s[0] == 0xF4 && 4589 (unsigned char)s[1] > 0x8F)) { 4590 goto surrogateescape; 4591 } 4592 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) + 4593 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f); 4594 assert ((ch > 0xFFFF) && (ch <= 0x10ffff)); 4595 4596#if SIZEOF_WCHAR_T == 4 4597 *p++ = (wchar_t)ch; 4598#else 4599 /* compute and append the two surrogates: */ 4600 4601 /* translate from 10000..10FFFF to 0..FFFF */ 4602 ch -= 0x10000; 4603 4604 /* high surrogate = top 10 bits added to D800 */ 4605 *p++ = (wchar_t)(0xD800 + (ch >> 10)); 4606 4607 /* low surrogate = bottom 10 bits added to DC00 */ 4608 *p++ = (wchar_t)(0xDC00 + (ch & 0x03FF)); 4609#endif 4610 break; 4611 } 4612 s += n; 4613 continue; 4614 4615 surrogateescape: 4616 *p++ = 0xDC00 + ch; 4617 s++; 4618 } 4619 *p = L'\0'; 4620 return unicode; 4621} 4622 4623#endif /* __APPLE__ */ 4624 4625/* Primary internal function which creates utf8 encoded bytes objects. 4626 4627 Allocation strategy: if the string is short, convert into a stack buffer 4628 and allocate exactly as much space needed at the end. Else allocate the 4629 maximum possible needed (4 result bytes per Unicode character), and return 4630 the excess memory at the end. 
4631*/ 4632PyObject * 4633_PyUnicode_AsUTF8String(PyObject *obj, const char *errors) 4634{ 4635#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */ 4636 4637 Py_ssize_t i; /* index into s of next input byte */ 4638 PyObject *result; /* result string object */ 4639 char *p; /* next free byte in output buffer */ 4640 Py_ssize_t nallocated; /* number of result bytes allocated */ 4641 Py_ssize_t nneeded; /* number of result bytes needed */ 4642 char stackbuf[MAX_SHORT_UNICHARS * 4]; 4643 PyObject *errorHandler = NULL; 4644 PyObject *exc = NULL; 4645 int kind; 4646 void *data; 4647 Py_ssize_t size; 4648 PyUnicodeObject *unicode = (PyUnicodeObject *)obj; 4649#if SIZEOF_WCHAR_T == 2 4650 Py_ssize_t wchar_offset = 0; 4651#endif 4652 4653 if (!PyUnicode_Check(unicode)) { 4654 PyErr_BadArgument(); 4655 return NULL; 4656 } 4657 4658 if (PyUnicode_READY(unicode) == -1) 4659 return NULL; 4660 4661 if (PyUnicode_UTF8(unicode)) 4662 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode), 4663 PyUnicode_UTF8_LENGTH(unicode)); 4664 4665 kind = PyUnicode_KIND(unicode); 4666 data = PyUnicode_DATA(unicode); 4667 size = PyUnicode_GET_LENGTH(unicode); 4668 4669 assert(size >= 0); 4670 4671 if (size <= MAX_SHORT_UNICHARS) { 4672 /* Write into the stack buffer; nallocated can't overflow. 4673 * At the end, we'll allocate exactly as much heap space as it 4674 * turns out we need. 4675 */ 4676 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int); 4677 result = NULL; /* will allocate after we're done */ 4678 p = stackbuf; 4679 } 4680 else { 4681 /* Overallocate on the heap, and give the excess back at the end. */ 4682 nallocated = size * 4; 4683 if (nallocated / 4 != size) /* overflow! 
*/ 4684 return PyErr_NoMemory(); 4685 result = PyBytes_FromStringAndSize(NULL, nallocated); 4686 if (result == NULL) 4687 return NULL; 4688 p = PyBytes_AS_STRING(result); 4689 } 4690 4691 for (i = 0; i < size;) { 4692 Py_UCS4 ch = PyUnicode_READ(kind, data, i++); 4693 4694 if (ch < 0x80) 4695 /* Encode ASCII */ 4696 *p++ = (char) ch; 4697 4698 else if (ch < 0x0800) { 4699 /* Encode Latin-1 */ 4700 *p++ = (char)(0xc0 | (ch >> 6)); 4701 *p++ = (char)(0x80 | (ch & 0x3f)); 4702 } else if (0xD800 <= ch && ch <= 0xDFFF) { 4703 Py_ssize_t newpos; 4704 PyObject *rep; 4705 Py_ssize_t repsize, k, startpos; 4706 startpos = i-1; 4707#if SIZEOF_WCHAR_T == 2 4708 startpos += wchar_offset; 4709#endif 4710 rep = unicode_encode_call_errorhandler( 4711 errors, &errorHandler, "utf-8", "surrogates not allowed", 4712 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode), 4713 &exc, startpos, startpos+1, &newpos); 4714 if (!rep) 4715 goto error; 4716 4717 if (PyBytes_Check(rep)) 4718 repsize = PyBytes_GET_SIZE(rep); 4719 else 4720 repsize = PyUnicode_GET_SIZE(rep); 4721 4722 if (repsize > 4) { 4723 Py_ssize_t offset; 4724 4725 if (result == NULL) 4726 offset = p - stackbuf; 4727 else 4728 offset = p - PyBytes_AS_STRING(result); 4729 4730 if (nallocated > PY_SSIZE_T_MAX - repsize + 4) { 4731 /* integer overflow */ 4732 PyErr_NoMemory(); 4733 goto error; 4734 } 4735 nallocated += repsize - 4; 4736 if (result != NULL) { 4737 if (_PyBytes_Resize(&result, nallocated) < 0) 4738 goto error; 4739 } else { 4740 result = PyBytes_FromStringAndSize(NULL, nallocated); 4741 if (result == NULL) 4742 goto error; 4743 Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset); 4744 } 4745 p = PyBytes_AS_STRING(result) + offset; 4746 } 4747 4748 if (PyBytes_Check(rep)) { 4749 char *prep = PyBytes_AS_STRING(rep); 4750 for(k = repsize; k > 0; k--) 4751 *p++ = *prep++; 4752 } else /* rep is unicode */ { 4753 const Py_UNICODE *prep = PyUnicode_AS_UNICODE(rep); 4754 Py_UNICODE c; 4755 4756 for(k=0; 
k<repsize; k++) { 4757 c = prep[k]; 4758 if (0x80 <= c) { 4759 raise_encode_exception(&exc, "utf-8", 4760 PyUnicode_AS_UNICODE(unicode), 4761 size, i-1, i, 4762 "surrogates not allowed"); 4763 goto error; 4764 } 4765 *p++ = (char)prep[k]; 4766 } 4767 } 4768 Py_DECREF(rep); 4769 } else if (ch < 0x10000) { 4770 *p++ = (char)(0xe0 | (ch >> 12)); 4771 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); 4772 *p++ = (char)(0x80 | (ch & 0x3f)); 4773 } else /* ch >= 0x10000 */ { 4774 /* Encode UCS4 Unicode ordinals */ 4775 *p++ = (char)(0xf0 | (ch >> 18)); 4776 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f)); 4777 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); 4778 *p++ = (char)(0x80 | (ch & 0x3f)); 4779#if SIZEOF_WCHAR_T == 2 4780 wchar_offset++; 4781#endif 4782 } 4783 } 4784 4785 if (result == NULL) { 4786 /* This was stack allocated. */ 4787 nneeded = p - stackbuf; 4788 assert(nneeded <= nallocated); 4789 result = PyBytes_FromStringAndSize(stackbuf, nneeded); 4790 } 4791 else { 4792 /* Cut back to size actually needed. 
*/ 4793 nneeded = p - PyBytes_AS_STRING(result); 4794 assert(nneeded <= nallocated); 4795 _PyBytes_Resize(&result, nneeded); 4796 } 4797 4798 Py_XDECREF(errorHandler); 4799 Py_XDECREF(exc); 4800 return result; 4801 error: 4802 Py_XDECREF(errorHandler); 4803 Py_XDECREF(exc); 4804 Py_XDECREF(result); 4805 return NULL; 4806 4807#undef MAX_SHORT_UNICHARS 4808} 4809 4810PyObject * 4811PyUnicode_EncodeUTF8(const Py_UNICODE *s, 4812 Py_ssize_t size, 4813 const char *errors) 4814{ 4815 PyObject *v, *unicode; 4816 4817 unicode = PyUnicode_FromUnicode(s, size); 4818 if (unicode == NULL) 4819 return NULL; 4820 v = _PyUnicode_AsUTF8String(unicode, errors); 4821 Py_DECREF(unicode); 4822 return v; 4823} 4824 4825PyObject * 4826PyUnicode_AsUTF8String(PyObject *unicode) 4827{ 4828 return _PyUnicode_AsUTF8String(unicode, NULL); 4829} 4830 4831/* --- UTF-32 Codec ------------------------------------------------------- */ 4832 4833PyObject * 4834PyUnicode_DecodeUTF32(const char *s, 4835 Py_ssize_t size, 4836 const char *errors, 4837 int *byteorder) 4838{ 4839 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL); 4840} 4841 4842PyObject * 4843PyUnicode_DecodeUTF32Stateful(const char *s, 4844 Py_ssize_t size, 4845 const char *errors, 4846 int *byteorder, 4847 Py_ssize_t *consumed) 4848{ 4849 const char *starts = s; 4850 Py_ssize_t startinpos; 4851 Py_ssize_t endinpos; 4852 Py_ssize_t outpos; 4853 PyUnicodeObject *unicode; 4854 Py_UNICODE *p; 4855#ifndef Py_UNICODE_WIDE 4856 int pairs = 0; 4857 const unsigned char *qq; 4858#else 4859 const int pairs = 0; 4860#endif 4861 const unsigned char *q, *e; 4862 int bo = 0; /* assume native ordering by default */ 4863 const char *errmsg = ""; 4864 /* Offsets from q for retrieving bytes in the right order. 
*/ 4865#ifdef BYTEORDER_IS_LITTLE_ENDIAN 4866 int iorder[] = {0, 1, 2, 3}; 4867#else 4868 int iorder[] = {3, 2, 1, 0}; 4869#endif 4870 PyObject *errorHandler = NULL; 4871 PyObject *exc = NULL; 4872 4873 q = (unsigned char *)s; 4874 e = q + size; 4875 4876 if (byteorder) 4877 bo = *byteorder; 4878 4879 /* Check for BOM marks (U+FEFF) in the input and adjust current 4880 byte order setting accordingly. In native mode, the leading BOM 4881 mark is skipped, in all other modes, it is copied to the output 4882 stream as-is (giving a ZWNBSP character). */ 4883 if (bo == 0) { 4884 if (size >= 4) { 4885 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) | 4886 (q[iorder[1]] << 8) | q[iorder[0]]; 4887#ifdef BYTEORDER_IS_LITTLE_ENDIAN 4888 if (bom == 0x0000FEFF) { 4889 q += 4; 4890 bo = -1; 4891 } 4892 else if (bom == 0xFFFE0000) { 4893 q += 4; 4894 bo = 1; 4895 } 4896#else 4897 if (bom == 0x0000FEFF) { 4898 q += 4; 4899 bo = 1; 4900 } 4901 else if (bom == 0xFFFE0000) { 4902 q += 4; 4903 bo = -1; 4904 } 4905#endif 4906 } 4907 } 4908 4909 if (bo == -1) { 4910 /* force LE */ 4911 iorder[0] = 0; 4912 iorder[1] = 1; 4913 iorder[2] = 2; 4914 iorder[3] = 3; 4915 } 4916 else if (bo == 1) { 4917 /* force BE */ 4918 iorder[0] = 3; 4919 iorder[1] = 2; 4920 iorder[2] = 1; 4921 iorder[3] = 0; 4922 } 4923 4924 /* On narrow builds we split characters outside the BMP into two 4925 codepoints => count how much extra space we need. */ 4926#ifndef Py_UNICODE_WIDE 4927 for (qq = q; qq < e; qq += 4) 4928 if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0) 4929 pairs++; 4930#endif 4931 4932 /* This might be one to much, because of a BOM */ 4933 unicode = _PyUnicode_New((size+3)/4+pairs); 4934 if (!unicode) 4935 return NULL; 4936 if (size == 0) 4937 return (PyObject *)unicode; 4938 4939 /* Unpack UTF-32 encoded data */ 4940 p = PyUnicode_AS_UNICODE(unicode); 4941 4942 while (q < e) { 4943 Py_UCS4 ch; 4944 /* remaining bytes at the end? 
(size should be divisible by 4) */ 4945 if (e-q<4) { 4946 if (consumed) 4947 break; 4948 errmsg = "truncated data"; 4949 startinpos = ((const char *)q)-starts; 4950 endinpos = ((const char *)e)-starts; 4951 goto utf32Error; 4952 /* The remaining input chars are ignored if the callback 4953 chooses to skip the input */ 4954 } 4955 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) | 4956 (q[iorder[1]] << 8) | q[iorder[0]]; 4957 4958 if (ch >= 0x110000) 4959 { 4960 errmsg = "codepoint not in range(0x110000)"; 4961 startinpos = ((const char *)q)-starts; 4962 endinpos = startinpos+4; 4963 goto utf32Error; 4964 } 4965#ifndef Py_UNICODE_WIDE 4966 if (ch >= 0x10000) 4967 { 4968 *p++ = 0xD800 | ((ch-0x10000) >> 10); 4969 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF); 4970 } 4971 else 4972#endif 4973 *p++ = ch; 4974 q += 4; 4975 continue; 4976 utf32Error: 4977 outpos = p-PyUnicode_AS_UNICODE(unicode); 4978 if (unicode_decode_call_errorhandler( 4979 errors, &errorHandler, 4980 "utf32", errmsg, 4981 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q, 4982 &unicode, &outpos, &p)) 4983 goto onError; 4984 } 4985 4986 if (byteorder) 4987 *byteorder = bo; 4988 4989 if (consumed) 4990 *consumed = (const char *)q-starts; 4991 4992 /* Adjust length */ 4993 if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0) 4994 goto onError; 4995 4996 Py_XDECREF(errorHandler); 4997 Py_XDECREF(exc); 4998#ifndef DONT_MAKE_RESULT_READY 4999 if (_PyUnicode_READY_REPLACE(&unicode)) { 5000 Py_DECREF(unicode); 5001 return NULL; 5002 } 5003#endif 5004 assert(_PyUnicode_CheckConsistency(unicode, 1)); 5005 return (PyObject *)unicode; 5006 5007 onError: 5008 Py_DECREF(unicode); 5009 Py_XDECREF(errorHandler); 5010 Py_XDECREF(exc); 5011 return NULL; 5012} 5013 5014PyObject * 5015PyUnicode_EncodeUTF32(const Py_UNICODE *s, 5016 Py_ssize_t size, 5017 const char *errors, 5018 int byteorder) 5019{ 5020 PyObject *v; 5021 unsigned char *p; 5022 Py_ssize_t nsize, bytesize; 
5023#ifndef Py_UNICODE_WIDE 5024 Py_ssize_t i, pairs; 5025#else 5026 const int pairs = 0; 5027#endif 5028 /* Offsets from p for storing byte pairs in the right order. */ 5029#ifdef BYTEORDER_IS_LITTLE_ENDIAN 5030 int iorder[] = {0, 1, 2, 3}; 5031#else 5032 int iorder[] = {3, 2, 1, 0}; 5033#endif 5034 5035#define STORECHAR(CH) \ 5036 do { \ 5037 p[iorder[3]] = ((CH) >> 24) & 0xff; \ 5038 p[iorder[2]] = ((CH) >> 16) & 0xff; \ 5039 p[iorder[1]] = ((CH) >> 8) & 0xff; \ 5040 p[iorder[0]] = (CH) & 0xff; \ 5041 p += 4; \ 5042 } while(0) 5043 5044 /* In narrow builds we can output surrogate pairs as one codepoint, 5045 so we need less space. */ 5046#ifndef Py_UNICODE_WIDE 5047 for (i = pairs = 0; i < size-1; i++) 5048 if (0xD800 <= s[i] && s[i] <= 0xDBFF && 5049 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF) 5050 pairs++; 5051#endif 5052 nsize = (size - pairs + (byteorder == 0)); 5053 bytesize = nsize * 4; 5054 if (bytesize / 4 != nsize) 5055 return PyErr_NoMemory(); 5056 v = PyBytes_FromStringAndSize(NULL, bytesize); 5057 if (v == NULL) 5058 return NULL; 5059 5060 p = (unsigned char *)PyBytes_AS_STRING(v); 5061 if (byteorder == 0) 5062 STORECHAR(0xFEFF); 5063 if (size == 0) 5064 goto done; 5065 5066 if (byteorder == -1) { 5067 /* force LE */ 5068 iorder[0] = 0; 5069 iorder[1] = 1; 5070 iorder[2] = 2; 5071 iorder[3] = 3; 5072 } 5073 else if (byteorder == 1) { 5074 /* force BE */ 5075 iorder[0] = 3; 5076 iorder[1] = 2; 5077 iorder[2] = 1; 5078 iorder[3] = 0; 5079 } 5080 5081 while (size-- > 0) { 5082 Py_UCS4 ch = *s++; 5083#ifndef Py_UNICODE_WIDE 5084 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) { 5085 Py_UCS4 ch2 = *s; 5086 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) { 5087 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000; 5088 s++; 5089 size--; 5090 } 5091 } 5092#endif 5093 STORECHAR(ch); 5094 } 5095 5096 done: 5097 return v; 5098#undef STORECHAR 5099} 5100 5101PyObject * 5102PyUnicode_AsUTF32String(PyObject *unicode) 5103{ 5104 if (!PyUnicode_Check(unicode)) { 5105 
PyErr_BadArgument(); 5106 return NULL; 5107 } 5108 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode), 5109 PyUnicode_GET_SIZE(unicode), 5110 NULL, 5111 0); 5112} 5113 5114/* --- UTF-16 Codec ------------------------------------------------------- */ 5115 5116PyObject * 5117PyUnicode_DecodeUTF16(const char *s, 5118 Py_ssize_t size, 5119 const char *errors, 5120 int *byteorder) 5121{ 5122 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL); 5123} 5124 5125/* Two masks for fast checking of whether a C 'long' may contain 5126 UTF16-encoded surrogate characters. This is an efficient heuristic, 5127 assuming that non-surrogate characters with a code point >= 0x8000 are 5128 rare in most input. 5129 FAST_CHAR_MASK is used when the input is in native byte ordering, 5130 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering. 5131*/ 5132#if (SIZEOF_LONG == 8) 5133# define FAST_CHAR_MASK 0x8000800080008000L 5134# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L 5135#elif (SIZEOF_LONG == 4) 5136# define FAST_CHAR_MASK 0x80008000L 5137# define SWAPPED_FAST_CHAR_MASK 0x00800080L 5138#else 5139# error C 'long' size should be either 4 or 8! 5140#endif 5141 5142PyObject * 5143PyUnicode_DecodeUTF16Stateful(const char *s, 5144 Py_ssize_t size, 5145 const char *errors, 5146 int *byteorder, 5147 Py_ssize_t *consumed) 5148{ 5149 const char *starts = s; 5150 Py_ssize_t startinpos; 5151 Py_ssize_t endinpos; 5152 Py_ssize_t outpos; 5153 PyUnicodeObject *unicode; 5154 Py_UNICODE *p; 5155 const unsigned char *q, *e, *aligned_end; 5156 int bo = 0; /* assume native ordering by default */ 5157 int native_ordering = 0; 5158 const char *errmsg = ""; 5159 /* Offsets from q for retrieving byte pairs in the right order. 
*/ 5160#ifdef BYTEORDER_IS_LITTLE_ENDIAN 5161 int ihi = 1, ilo = 0; 5162#else 5163 int ihi = 0, ilo = 1; 5164#endif 5165 PyObject *errorHandler = NULL; 5166 PyObject *exc = NULL; 5167 5168 /* Note: size will always be longer than the resulting Unicode 5169 character count */ 5170 unicode = _PyUnicode_New(size); 5171 if (!unicode) 5172 return NULL; 5173 if (size == 0) 5174 return (PyObject *)unicode; 5175 5176 /* Unpack UTF-16 encoded data */ 5177 p = PyUnicode_AS_UNICODE(unicode); 5178 q = (unsigned char *)s; 5179 e = q + size - 1; 5180 5181 if (byteorder) 5182 bo = *byteorder; 5183 5184 /* Check for BOM marks (U+FEFF) in the input and adjust current 5185 byte order setting accordingly. In native mode, the leading BOM 5186 mark is skipped, in all other modes, it is copied to the output 5187 stream as-is (giving a ZWNBSP character). */ 5188 if (bo == 0) { 5189 if (size >= 2) { 5190 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo]; 5191#ifdef BYTEORDER_IS_LITTLE_ENDIAN 5192 if (bom == 0xFEFF) { 5193 q += 2; 5194 bo = -1; 5195 } 5196 else if (bom == 0xFFFE) { 5197 q += 2; 5198 bo = 1; 5199 } 5200#else 5201 if (bom == 0xFEFF) { 5202 q += 2; 5203 bo = 1; 5204 } 5205 else if (bom == 0xFFFE) { 5206 q += 2; 5207 bo = -1; 5208 } 5209#endif 5210 } 5211 } 5212 5213 if (bo == -1) { 5214 /* force LE */ 5215 ihi = 1; 5216 ilo = 0; 5217 } 5218 else if (bo == 1) { 5219 /* force BE */ 5220 ihi = 0; 5221 ilo = 1; 5222 } 5223#ifdef BYTEORDER_IS_LITTLE_ENDIAN 5224 native_ordering = ilo < ihi; 5225#else 5226 native_ordering = ilo > ihi; 5227#endif 5228 5229 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK); 5230 while (q < e) { 5231 Py_UNICODE ch; 5232 /* First check for possible aligned read of a C 'long'. Unaligned 5233 reads are more expensive, better to defer to another iteration. */ 5234 if (!((size_t) q & LONG_PTR_MASK)) { 5235 /* Fast path for runs of non-surrogate chars. 
*/ 5236 register const unsigned char *_q = q; 5237 Py_UNICODE *_p = p; 5238 if (native_ordering) { 5239 /* Native ordering is simple: as long as the input cannot 5240 possibly contain a surrogate char, do an unrolled copy 5241 of several 16-bit code points to the target object. 5242 The non-surrogate check is done on several input bytes 5243 at a time (as many as a C 'long' can contain). */ 5244 while (_q < aligned_end) { 5245 unsigned long data = * (unsigned long *) _q; 5246 if (data & FAST_CHAR_MASK) 5247 break; 5248 _p[0] = ((unsigned short *) _q)[0]; 5249 _p[1] = ((unsigned short *) _q)[1]; 5250#if (SIZEOF_LONG == 8) 5251 _p[2] = ((unsigned short *) _q)[2]; 5252 _p[3] = ((unsigned short *) _q)[3]; 5253#endif 5254 _q += SIZEOF_LONG; 5255 _p += SIZEOF_LONG / 2; 5256 } 5257 } 5258 else { 5259 /* Byteswapped ordering is similar, but we must decompose 5260 the copy bytewise, and take care of zero'ing out the 5261 upper bytes if the target object is in 32-bit units 5262 (that is, in UCS-4 builds). */ 5263 while (_q < aligned_end) { 5264 unsigned long data = * (unsigned long *) _q; 5265 if (data & SWAPPED_FAST_CHAR_MASK) 5266 break; 5267 /* Zero upper bytes in UCS-4 builds */ 5268#if (Py_UNICODE_SIZE > 2) 5269 _p[0] = 0; 5270 _p[1] = 0; 5271#if (SIZEOF_LONG == 8) 5272 _p[2] = 0; 5273 _p[3] = 0; 5274#endif 5275#endif 5276 /* Issue #4916; UCS-4 builds on big endian machines must 5277 fill the two last bytes of each 4-byte unit. 
*/ 5278#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2) 5279# define OFF 2 5280#else 5281# define OFF 0 5282#endif 5283 ((unsigned char *) _p)[OFF + 1] = _q[0]; 5284 ((unsigned char *) _p)[OFF + 0] = _q[1]; 5285 ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2]; 5286 ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3]; 5287#if (SIZEOF_LONG == 8) 5288 ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4]; 5289 ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5]; 5290 ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6]; 5291 ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7]; 5292#endif 5293#undef OFF 5294 _q += SIZEOF_LONG; 5295 _p += SIZEOF_LONG / 2; 5296 } 5297 } 5298 p = _p; 5299 q = _q; 5300 if (q >= e) 5301 break; 5302 } 5303 ch = (q[ihi] << 8) | q[ilo]; 5304 5305 q += 2; 5306 5307 if (ch < 0xD800 || ch > 0xDFFF) { 5308 *p++ = ch; 5309 continue; 5310 } 5311 5312 /* UTF-16 code pair: */ 5313 if (q > e) { 5314 errmsg = "unexpected end of data"; 5315 startinpos = (((const char *)q) - 2) - starts; 5316 endinpos = ((const char *)e) + 1 - starts; 5317 goto utf16Error; 5318 } 5319 if (0xD800 <= ch && ch <= 0xDBFF) { 5320 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo]; 5321 q += 2; 5322 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) { 5323#ifndef Py_UNICODE_WIDE 5324 *p++ = ch; 5325 *p++ = ch2; 5326#else 5327 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000; 5328#endif 5329 continue; 5330 } 5331 else { 5332 errmsg = "illegal UTF-16 surrogate"; 5333 startinpos = (((const char *)q)-4)-starts; 5334 endinpos = startinpos+2; 5335 goto utf16Error; 5336 } 5337 5338 } 5339 errmsg = "illegal encoding"; 5340 startinpos = (((const char *)q)-2)-starts; 5341 endinpos = startinpos+2; 5342 /* Fall through to report the error */ 5343 5344 utf16Error: 5345 outpos = p - PyUnicode_AS_UNICODE(unicode); 5346 if (unicode_decode_call_errorhandler( 5347 errors, 5348 &errorHandler, 5349 "utf16", errmsg, 5350 &starts, 5351 (const char 
**)&e, 5352 &startinpos, 5353 &endinpos, 5354 &exc, 5355 (const char **)&q, 5356 &unicode, 5357 &outpos, 5358 &p)) 5359 goto onError; 5360 } 5361 /* remaining byte at the end? (size should be even) */ 5362 if (e == q) { 5363 if (!consumed) { 5364 errmsg = "truncated data"; 5365 startinpos = ((const char *)q) - starts; 5366 endinpos = ((const char *)e) + 1 - starts; 5367 outpos = p - PyUnicode_AS_UNICODE(unicode); 5368 if (unicode_decode_call_errorhandler( 5369 errors, 5370 &errorHandler, 5371 "utf16", errmsg, 5372 &starts, 5373 (const char **)&e, 5374 &startinpos, 5375 &endinpos, 5376 &exc, 5377 (const char **)&q, 5378 &unicode, 5379 &outpos, 5380 &p)) 5381 goto onError; 5382 /* The remaining input chars are ignored if the callback 5383 chooses to skip the input */ 5384 } 5385 } 5386 5387 if (byteorder) 5388 *byteorder = bo; 5389 5390 if (consumed) 5391 *consumed = (const char *)q-starts; 5392 5393 /* Adjust length */ 5394 if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0) 5395 goto onError; 5396 5397 Py_XDECREF(errorHandler); 5398 Py_XDECREF(exc); 5399#ifndef DONT_MAKE_RESULT_READY 5400 if (_PyUnicode_READY_REPLACE(&unicode)) { 5401 Py_DECREF(unicode); 5402 return NULL; 5403 } 5404#endif 5405 assert(_PyUnicode_CheckConsistency(unicode, 1)); 5406 return (PyObject *)unicode; 5407 5408 onError: 5409 Py_DECREF(unicode); 5410 Py_XDECREF(errorHandler); 5411 Py_XDECREF(exc); 5412 return NULL; 5413} 5414 5415#undef FAST_CHAR_MASK 5416#undef SWAPPED_FAST_CHAR_MASK 5417 5418PyObject * 5419PyUnicode_EncodeUTF16(const Py_UNICODE *s, 5420 Py_ssize_t size, 5421 const char *errors, 5422 int byteorder) 5423{ 5424 PyObject *v; 5425 unsigned char *p; 5426 Py_ssize_t nsize, bytesize; 5427#ifdef Py_UNICODE_WIDE 5428 Py_ssize_t i, pairs; 5429#else 5430 const int pairs = 0; 5431#endif 5432 /* Offsets from p for storing byte pairs in the right order. 
*/ 5433#ifdef BYTEORDER_IS_LITTLE_ENDIAN 5434 int ihi = 1, ilo = 0; 5435#else 5436 int ihi = 0, ilo = 1; 5437#endif 5438 5439#define STORECHAR(CH) \ 5440 do { \ 5441 p[ihi] = ((CH) >> 8) & 0xff; \ 5442 p[ilo] = (CH) & 0xff; \ 5443 p += 2; \ 5444 } while(0) 5445 5446#ifdef Py_UNICODE_WIDE 5447 for (i = pairs = 0; i < size; i++) 5448 if (s[i] >= 0x10000) 5449 pairs++; 5450#endif 5451 /* 2 * (size + pairs + (byteorder == 0)) */ 5452 if (size > PY_SSIZE_T_MAX || 5453 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0)) 5454 return PyErr_NoMemory(); 5455 nsize = size + pairs + (byteorder == 0); 5456 bytesize = nsize * 2; 5457 if (bytesize / 2 != nsize) 5458 return PyErr_NoMemory(); 5459 v = PyBytes_FromStringAndSize(NULL, bytesize); 5460 if (v == NULL) 5461 return NULL; 5462 5463 p = (unsigned char *)PyBytes_AS_STRING(v); 5464 if (byteorder == 0) 5465 STORECHAR(0xFEFF); 5466 if (size == 0) 5467 goto done; 5468 5469 if (byteorder == -1) { 5470 /* force LE */ 5471 ihi = 1; 5472 ilo = 0; 5473 } 5474 else if (byteorder == 1) { 5475 /* force BE */ 5476 ihi = 0; 5477 ilo = 1; 5478 } 5479 5480 while (size-- > 0) { 5481 Py_UNICODE ch = *s++; 5482 Py_UNICODE ch2 = 0; 5483#ifdef Py_UNICODE_WIDE 5484 if (ch >= 0x10000) { 5485 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF); 5486 ch = 0xD800 | ((ch-0x10000) >> 10); 5487 } 5488#endif 5489 STORECHAR(ch); 5490 if (ch2) 5491 STORECHAR(ch2); 5492 } 5493 5494 done: 5495 return v; 5496#undef STORECHAR 5497} 5498 5499PyObject * 5500PyUnicode_AsUTF16String(PyObject *unicode) 5501{ 5502 if (!PyUnicode_Check(unicode)) { 5503 PyErr_BadArgument(); 5504 return NULL; 5505 } 5506 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode), 5507 PyUnicode_GET_SIZE(unicode), 5508 NULL, 5509 0); 5510} 5511 5512/* --- Unicode Escape Codec ----------------------------------------------- */ 5513 5514/* Helper function for PyUnicode_DecodeUnicodeEscape, determines 5515 if all the escapes in the string make it still a valid ASCII string. 
5516 Returns -1 if any escapes were found which cause the string to 5517 pop out of ASCII range. Otherwise returns the length of the 5518 required buffer to hold the string. 5519 */ 5520Py_ssize_t 5521length_of_escaped_ascii_string(const char *s, Py_ssize_t size) 5522{ 5523 const unsigned char *p = (const unsigned char *)s; 5524 const unsigned char *end = p + size; 5525 Py_ssize_t length = 0; 5526 5527 if (size < 0) 5528 return -1; 5529 5530 for (; p < end; ++p) { 5531 if (*p > 127) { 5532 /* Non-ASCII */ 5533 return -1; 5534 } 5535 else if (*p != '\\') { 5536 /* Normal character */ 5537 ++length; 5538 } 5539 else { 5540 /* Backslash-escape, check next char */ 5541 ++p; 5542 /* Escape sequence reaches till end of string or 5543 non-ASCII follow-up. */ 5544 if (p >= end || *p > 127) 5545 return -1; 5546 switch (*p) { 5547 case '\n': 5548 /* backslash + \n result in zero characters */ 5549 break; 5550 case '\\': case '\'': case '\"': 5551 case 'b': case 'f': case 't': 5552 case 'n': case 'r': case 'v': case 'a': 5553 ++length; 5554 break; 5555 case '0': case '1': case '2': case '3': 5556 case '4': case '5': case '6': case '7': 5557 case 'x': case 'u': case 'U': case 'N': 5558 /* these do not guarantee ASCII characters */ 5559 return -1; 5560 default: 5561 /* count the backslash + the other character */ 5562 length += 2; 5563 } 5564 } 5565 } 5566 return length; 5567} 5568 5569/* Similar to PyUnicode_WRITE but either write into wstr field 5570 or treat string as ASCII. 
*/ 5571#define WRITE_ASCII_OR_WSTR(kind, buf, index, value) \ 5572 do { \ 5573 if ((kind) != PyUnicode_WCHAR_KIND) \ 5574 ((unsigned char *)(buf))[(index)] = (unsigned char)(value); \ 5575 else \ 5576 ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value); \ 5577 } while (0) 5578 5579#define WRITE_WSTR(buf, index, value) \ 5580 assert(kind == PyUnicode_WCHAR_KIND), \ 5581 ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value) 5582 5583 5584static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL; 5585 5586PyObject * 5587PyUnicode_DecodeUnicodeEscape(const char *s, 5588 Py_ssize_t size, 5589 const char *errors) 5590{ 5591 const char *starts = s; 5592 Py_ssize_t startinpos; 5593 Py_ssize_t endinpos; 5594 int j; 5595 PyUnicodeObject *v; 5596 Py_UNICODE *p; 5597 const char *end; 5598 char* message; 5599 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */ 5600 PyObject *errorHandler = NULL; 5601 PyObject *exc = NULL; 5602 Py_ssize_t ascii_length; 5603 Py_ssize_t i; 5604 int kind; 5605 void *data; 5606 5607 ascii_length = length_of_escaped_ascii_string(s, size); 5608 5609 /* After length_of_escaped_ascii_string() there are two alternatives, 5610 either the string is pure ASCII with named escapes like \n, etc. 5611 and we determined it's exact size (common case) 5612 or it contains \x, \u, ... escape sequences. then we create a 5613 legacy wchar string and resize it at the end of this function. */ 5614 if (ascii_length >= 0) { 5615 v = (PyUnicodeObject *)PyUnicode_New(ascii_length, 127); 5616 if (!v) 5617 goto onError; 5618 assert(PyUnicode_KIND(v) == PyUnicode_1BYTE_KIND); 5619 kind = PyUnicode_1BYTE_KIND; 5620 data = PyUnicode_DATA(v); 5621 } 5622 else { 5623 /* Escaped strings will always be longer than the resulting 5624 Unicode string, so we start with size here and then reduce the 5625 length after conversion to the true value. 
5626 (but if the error callback returns a long replacement string 5627 we'll have to allocate more space) */ 5628 v = _PyUnicode_New(size); 5629 if (!v) 5630 goto onError; 5631 kind = PyUnicode_WCHAR_KIND; 5632 data = PyUnicode_AS_UNICODE(v); 5633 } 5634 5635 if (size == 0) 5636 return (PyObject *)v; 5637 i = 0; 5638 end = s + size; 5639 5640 while (s < end) { 5641 unsigned char c; 5642 Py_UNICODE x; 5643 int digits; 5644 5645 if (kind == PyUnicode_WCHAR_KIND) { 5646 assert(i < _PyUnicode_WSTR_LENGTH(v)); 5647 } 5648 else { 5649 /* The only case in which i == ascii_length is a backslash 5650 followed by a newline. */ 5651 assert(i <= ascii_length); 5652 } 5653 5654 /* Non-escape characters are interpreted as Unicode ordinals */ 5655 if (*s != '\\') { 5656 WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char) *s++); 5657 continue; 5658 } 5659 5660 startinpos = s-starts; 5661 /* \ - Escapes */ 5662 s++; 5663 c = *s++; 5664 if (s > end) 5665 c = '\0'; /* Invalid after \ */ 5666 5667 if (kind == PyUnicode_WCHAR_KIND) { 5668 assert(i < _PyUnicode_WSTR_LENGTH(v)); 5669 } 5670 else { 5671 /* The only case in which i == ascii_length is a backslash 5672 followed by a newline. 
*/ 5673 assert(i < ascii_length || (i == ascii_length && c == '\n')); 5674 } 5675 5676 switch (c) { 5677 5678 /* \x escapes */ 5679 case '\n': break; 5680 case '\\': WRITE_ASCII_OR_WSTR(kind, data, i++, '\\'); break; 5681 case '\'': WRITE_ASCII_OR_WSTR(kind, data, i++, '\''); break; 5682 case '\"': WRITE_ASCII_OR_WSTR(kind, data, i++, '\"'); break; 5683 case 'b': WRITE_ASCII_OR_WSTR(kind, data, i++, '\b'); break; 5684 /* FF */ 5685 case 'f': WRITE_ASCII_OR_WSTR(kind, data, i++, '\014'); break; 5686 case 't': WRITE_ASCII_OR_WSTR(kind, data, i++, '\t'); break; 5687 case 'n': WRITE_ASCII_OR_WSTR(kind, data, i++, '\n'); break; 5688 case 'r': WRITE_ASCII_OR_WSTR(kind, data, i++, '\r'); break; 5689 /* VT */ 5690 case 'v': WRITE_ASCII_OR_WSTR(kind, data, i++, '\013'); break; 5691 /* BEL, not classic C */ 5692 case 'a': WRITE_ASCII_OR_WSTR(kind, data, i++, '\007'); break; 5693 5694 /* \OOO (octal) escapes */ 5695 case '0': case '1': case '2': case '3': 5696 case '4': case '5': case '6': case '7': 5697 x = s[-1] - '0'; 5698 if (s < end && '0' <= *s && *s <= '7') { 5699 x = (x<<3) + *s++ - '0'; 5700 if (s < end && '0' <= *s && *s <= '7') 5701 x = (x<<3) + *s++ - '0'; 5702 } 5703 WRITE_WSTR(data, i++, x); 5704 break; 5705 5706 /* hex escapes */ 5707 /* \xXX */ 5708 case 'x': 5709 digits = 2; 5710 message = "truncated \\xXX escape"; 5711 goto hexescape; 5712 5713 /* \uXXXX */ 5714 case 'u': 5715 digits = 4; 5716 message = "truncated \\uXXXX escape"; 5717 goto hexescape; 5718 5719 /* \UXXXXXXXX */ 5720 case 'U': 5721 digits = 8; 5722 message = "truncated \\UXXXXXXXX escape"; 5723 hexescape: 5724 chr = 0; 5725 p = PyUnicode_AS_UNICODE(v) + i; 5726 if (s+digits>end) { 5727 endinpos = size; 5728 if (unicode_decode_call_errorhandler( 5729 errors, &errorHandler, 5730 "unicodeescape", "end of string in escape sequence", 5731 &starts, &end, &startinpos, &endinpos, &exc, &s, 5732 &v, &i, &p)) 5733 goto onError; 5734 data = PyUnicode_AS_UNICODE(v); 5735 goto nextByte; 5736 } 5737 for (j 
= 0; j < digits; ++j) { 5738 c = (unsigned char) s[j]; 5739 if (!Py_ISXDIGIT(c)) { 5740 endinpos = (s+j+1)-starts; 5741 p = PyUnicode_AS_UNICODE(v) + i; 5742 if (unicode_decode_call_errorhandler( 5743 errors, &errorHandler, 5744 "unicodeescape", message, 5745 &starts, &end, &startinpos, &endinpos, &exc, &s, 5746 &v, &i, &p)) 5747 goto onError; 5748 data = PyUnicode_AS_UNICODE(v); 5749 goto nextByte; 5750 } 5751 chr = (chr<<4) & ~0xF; 5752 if (c >= '0' && c <= '9') 5753 chr += c - '0'; 5754 else if (c >= 'a' && c <= 'f') 5755 chr += 10 + c - 'a'; 5756 else 5757 chr += 10 + c - 'A'; 5758 } 5759 s += j; 5760 if (chr == 0xffffffff && PyErr_Occurred()) 5761 /* _decoding_error will have already written into the 5762 target buffer. */ 5763 break; 5764 store: 5765 /* when we get here, chr is a 32-bit unicode character */ 5766 if (chr <= 0xffff) 5767 /* UCS-2 character */ 5768 WRITE_WSTR(data, i++, chr); 5769 else if (chr <= 0x10ffff) { 5770 /* UCS-4 character. Either store directly, or as 5771 surrogate pair. 
*/ 5772#ifdef Py_UNICODE_WIDE 5773 WRITE_WSTR(data, i++, chr); 5774#else 5775 chr -= 0x10000L; 5776 WRITE_WSTR(data, i++, 0xD800 + (Py_UNICODE) (chr >> 10)); 5777 WRITE_WSTR(data, i++, 0xDC00 + (Py_UNICODE) (chr & 0x03FF)); 5778#endif 5779 } else { 5780 endinpos = s-starts; 5781 p = PyUnicode_AS_UNICODE(v) + i; 5782 if (unicode_decode_call_errorhandler( 5783 errors, &errorHandler, 5784 "unicodeescape", "illegal Unicode character", 5785 &starts, &end, &startinpos, &endinpos, &exc, &s, 5786 &v, &i, &p)) 5787 goto onError; 5788 data = PyUnicode_AS_UNICODE(v); 5789 } 5790 break; 5791 5792 /* \N{name} */ 5793 case 'N': 5794 message = "malformed \\N character escape"; 5795 if (ucnhash_CAPI == NULL) { 5796 /* load the unicode data module */ 5797 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import( 5798 PyUnicodeData_CAPSULE_NAME, 1); 5799 if (ucnhash_CAPI == NULL) 5800 goto ucnhashError; 5801 } 5802 if (*s == '{') { 5803 const char *start = s+1; 5804 /* look for the closing brace */ 5805 while (*s != '}' && s < end) 5806 s++; 5807 if (s > start && s < end && *s == '}') { 5808 /* found a name. 
look it up in the unicode database */ 5809 message = "unknown Unicode character name"; 5810 s++; 5811 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), 5812 &chr)) 5813 goto store; 5814 } 5815 } 5816 endinpos = s-starts; 5817 p = PyUnicode_AS_UNICODE(v) + i; 5818 if (unicode_decode_call_errorhandler( 5819 errors, &errorHandler, 5820 "unicodeescape", message, 5821 &starts, &end, &startinpos, &endinpos, &exc, &s, 5822 &v, &i, &p)) 5823 goto onError; 5824 data = PyUnicode_AS_UNICODE(v); 5825 break; 5826 5827 default: 5828 if (s > end) { 5829 assert(kind == PyUnicode_WCHAR_KIND); 5830 message = "\\ at end of string"; 5831 s--; 5832 endinpos = s-starts; 5833 p = PyUnicode_AS_UNICODE(v) + i; 5834 if (unicode_decode_call_errorhandler( 5835 errors, &errorHandler, 5836 "unicodeescape", message, 5837 &starts, &end, &startinpos, &endinpos, &exc, &s, 5838 &v, &i, &p)) 5839 goto onError; 5840 data = PyUnicode_AS_UNICODE(v); 5841 } 5842 else { 5843 WRITE_ASCII_OR_WSTR(kind, data, i++, '\\'); 5844 WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char)s[-1]); 5845 } 5846 break; 5847 } 5848 nextByte: 5849 ; 5850 } 5851 /* Ensure the length prediction worked in case of ASCII strings */ 5852 assert(kind == PyUnicode_WCHAR_KIND || i == ascii_length); 5853 5854 if (kind == PyUnicode_WCHAR_KIND) 5855 { 5856 if (PyUnicode_Resize((PyObject**)&v, i) < 0) 5857 goto onError; 5858 } 5859 Py_XDECREF(errorHandler); 5860 Py_XDECREF(exc); 5861#ifndef DONT_MAKE_RESULT_READY 5862 if (_PyUnicode_READY_REPLACE(&v)) { 5863 Py_DECREF(v); 5864 return NULL; 5865 } 5866#endif 5867 assert(_PyUnicode_CheckConsistency(v, 1)); 5868 return (PyObject *)v; 5869 5870 ucnhashError: 5871 PyErr_SetString( 5872 PyExc_UnicodeError, 5873 "\\N escapes not supported (can't load unicodedata module)" 5874 ); 5875 Py_XDECREF(v); 5876 Py_XDECREF(errorHandler); 5877 Py_XDECREF(exc); 5878 return NULL; 5879 5880 onError: 5881 Py_XDECREF(v); 5882 Py_XDECREF(errorHandler); 5883 Py_XDECREF(exc); 5884 return NULL; 5885} 5886 
5887#undef WRITE_ASCII_OR_WSTR 5888#undef WRITE_WSTR 5889 5890/* Return a Unicode-Escape string version of the Unicode object. 5891 5892 If quotes is true, the string is enclosed in u"" or u'' quotes as 5893 appropriate. 5894 5895*/ 5896 5897static const char *hexdigits = "0123456789abcdef"; 5898 5899PyObject * 5900PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s, 5901 Py_ssize_t size) 5902{ 5903 PyObject *repr; 5904 char *p; 5905 5906#ifdef Py_UNICODE_WIDE 5907 const Py_ssize_t expandsize = 10; 5908#else 5909 const Py_ssize_t expandsize = 6; 5910#endif 5911 5912 /* XXX(nnorwitz): rather than over-allocating, it would be 5913 better to choose a different scheme. Perhaps scan the 5914 first N-chars of the string and allocate based on that size. 5915 */ 5916 /* Initial allocation is based on the longest-possible unichr 5917 escape. 5918 5919 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source 5920 unichr, so in this case it's the longest unichr escape. In 5921 narrow (UTF-16) builds this is five chars per source unichr 5922 since there are two unichrs in the surrogate pair, so in narrow 5923 (UTF-16) builds it's not the longest unichr escape. 5924 5925 In wide or narrow builds '\uxxxx' is 6 chars per source unichr, 5926 so in the narrow (UTF-16) build case it's the longest unichr 5927 escape. 
5928 */ 5929 5930 if (size == 0) 5931 return PyBytes_FromStringAndSize(NULL, 0); 5932 5933 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize) 5934 return PyErr_NoMemory(); 5935 5936 repr = PyBytes_FromStringAndSize(NULL, 5937 2 5938 + expandsize*size 5939 + 1); 5940 if (repr == NULL) 5941 return NULL; 5942 5943 p = PyBytes_AS_STRING(repr); 5944 5945 while (size-- > 0) { 5946 Py_UNICODE ch = *s++; 5947 5948 /* Escape backslashes */ 5949 if (ch == '\\') { 5950 *p++ = '\\'; 5951 *p++ = (char) ch; 5952 continue; 5953 } 5954 5955#ifdef Py_UNICODE_WIDE 5956 /* Map 21-bit characters to '\U00xxxxxx' */ 5957 else if (ch >= 0x10000) { 5958 *p++ = '\\'; 5959 *p++ = 'U'; 5960 *p++ = hexdigits[(ch >> 28) & 0x0000000F]; 5961 *p++ = hexdigits[(ch >> 24) & 0x0000000F]; 5962 *p++ = hexdigits[(ch >> 20) & 0x0000000F]; 5963 *p++ = hexdigits[(ch >> 16) & 0x0000000F]; 5964 *p++ = hexdigits[(ch >> 12) & 0x0000000F]; 5965 *p++ = hexdigits[(ch >> 8) & 0x0000000F]; 5966 *p++ = hexdigits[(ch >> 4) & 0x0000000F]; 5967 *p++ = hexdigits[ch & 0x0000000F]; 5968 continue; 5969 } 5970#else 5971 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */ 5972 else if (ch >= 0xD800 && ch < 0xDC00) { 5973 Py_UNICODE ch2; 5974 Py_UCS4 ucs; 5975 5976 ch2 = *s++; 5977 size--; 5978 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) { 5979 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000; 5980 *p++ = '\\'; 5981 *p++ = 'U'; 5982 *p++ = hexdigits[(ucs >> 28) & 0x0000000F]; 5983 *p++ = hexdigits[(ucs >> 24) & 0x0000000F]; 5984 *p++ = hexdigits[(ucs >> 20) & 0x0000000F]; 5985 *p++ = hexdigits[(ucs >> 16) & 0x0000000F]; 5986 *p++ = hexdigits[(ucs >> 12) & 0x0000000F]; 5987 *p++ = hexdigits[(ucs >> 8) & 0x0000000F]; 5988 *p++ = hexdigits[(ucs >> 4) & 0x0000000F]; 5989 *p++ = hexdigits[ucs & 0x0000000F]; 5990 continue; 5991 } 5992 /* Fall through: isolated surrogates are copied as-is */ 5993 s--; 5994 size++; 5995 } 5996#endif 5997 5998 /* Map 16-bit characters to '\uxxxx' */ 5999 if (ch >= 256) { 6000 *p++ = '\\'; 6001 *p++ = 
'u'; 6002 *p++ = hexdigits[(ch >> 12) & 0x000F]; 6003 *p++ = hexdigits[(ch >> 8) & 0x000F]; 6004 *p++ = hexdigits[(ch >> 4) & 0x000F]; 6005 *p++ = hexdigits[ch & 0x000F]; 6006 } 6007 6008 /* Map special whitespace to '\t', \n', '\r' */ 6009 else if (ch == '\t') { 6010 *p++ = '\\'; 6011 *p++ = 't'; 6012 } 6013 else if (ch == '\n') { 6014 *p++ = '\\'; 6015 *p++ = 'n'; 6016 } 6017 else if (ch == '\r') { 6018 *p++ = '\\'; 6019 *p++ = 'r'; 6020 } 6021 6022 /* Map non-printable US ASCII to '\xhh' */ 6023 else if (ch < ' ' || ch >= 0x7F) { 6024 *p++ = '\\'; 6025 *p++ = 'x'; 6026 *p++ = hexdigits[(ch >> 4) & 0x000F]; 6027 *p++ = hexdigits[ch & 0x000F]; 6028 } 6029 6030 /* Copy everything else as-is */ 6031 else 6032 *p++ = (char) ch; 6033 } 6034 6035 assert(p - PyBytes_AS_STRING(repr) > 0); 6036 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) 6037 return NULL; 6038 return repr; 6039} 6040 6041PyObject * 6042PyUnicode_AsUnicodeEscapeString(PyObject *unicode) 6043{ 6044 PyObject *s; 6045 if (!PyUnicode_Check(unicode)) { 6046 PyErr_BadArgument(); 6047 return NULL; 6048 } 6049 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode), 6050 PyUnicode_GET_SIZE(unicode)); 6051 return s; 6052} 6053 6054/* --- Raw Unicode Escape Codec ------------------------------------------- */ 6055 6056PyObject * 6057PyUnicode_DecodeRawUnicodeEscape(const char *s, 6058 Py_ssize_t size, 6059 const char *errors) 6060{ 6061 const char *starts = s; 6062 Py_ssize_t startinpos; 6063 Py_ssize_t endinpos; 6064 Py_ssize_t outpos; 6065 PyUnicodeObject *v; 6066 Py_UNICODE *p; 6067 const char *end; 6068 const char *bs; 6069 PyObject *errorHandler = NULL; 6070 PyObject *exc = NULL; 6071 6072 /* Escaped strings will always be longer than the resulting 6073 Unicode string, so we start with size here and then reduce the 6074 length after conversion to the true value. 
(But decoding error 6075 handler might have to resize the string) */ 6076 v = _PyUnicode_New(size); 6077 if (v == NULL) 6078 goto onError; 6079 if (size == 0) 6080 return (PyObject *)v; 6081 p = PyUnicode_AS_UNICODE(v); 6082 end = s + size; 6083 while (s < end) { 6084 unsigned char c; 6085 Py_UCS4 x; 6086 int i; 6087 int count; 6088 6089 /* Non-escape characters are interpreted as Unicode ordinals */ 6090 if (*s != '\\') { 6091 *p++ = (unsigned char)*s++; 6092 continue; 6093 } 6094 startinpos = s-starts; 6095 6096 /* \u-escapes are only interpreted iff the number of leading 6097 backslashes if odd */ 6098 bs = s; 6099 for (;s < end;) { 6100 if (*s != '\\') 6101 break; 6102 *p++ = (unsigned char)*s++; 6103 } 6104 if (((s - bs) & 1) == 0 || 6105 s >= end || 6106 (*s != 'u' && *s != 'U')) { 6107 continue; 6108 } 6109 p--; 6110 count = *s=='u' ? 4 : 8; 6111 s++; 6112 6113 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */ 6114 outpos = p-PyUnicode_AS_UNICODE(v); 6115 for (x = 0, i = 0; i < count; ++i, ++s) { 6116 c = (unsigned char)*s; 6117 if (!Py_ISXDIGIT(c)) { 6118 endinpos = s-starts; 6119 if (unicode_decode_call_errorhandler( 6120 errors, &errorHandler, 6121 "rawunicodeescape", "truncated \\uXXXX", 6122 &starts, &end, &startinpos, &endinpos, &exc, &s, 6123 &v, &outpos, &p)) 6124 goto onError; 6125 goto nextByte; 6126 } 6127 x = (x<<4) & ~0xF; 6128 if (c >= '0' && c <= '9') 6129 x += c - '0'; 6130 else if (c >= 'a' && c <= 'f') 6131 x += 10 + c - 'a'; 6132 else 6133 x += 10 + c - 'A'; 6134 } 6135 if (x <= 0xffff) 6136 /* UCS-2 character */ 6137 *p++ = (Py_UNICODE) x; 6138 else if (x <= 0x10ffff) { 6139 /* UCS-4 character. Either store directly, or as 6140 surrogate pair. 
*/ 6141#ifdef Py_UNICODE_WIDE 6142 *p++ = (Py_UNICODE) x; 6143#else 6144 x -= 0x10000L; 6145 *p++ = 0xD800 + (Py_UNICODE) (x >> 10); 6146 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF); 6147#endif 6148 } else { 6149 endinpos = s-starts; 6150 outpos = p-PyUnicode_AS_UNICODE(v); 6151 if (unicode_decode_call_errorhandler( 6152 errors, &errorHandler, 6153 "rawunicodeescape", "\\Uxxxxxxxx out of range", 6154 &starts, &end, &startinpos, &endinpos, &exc, &s, 6155 &v, &outpos, &p)) 6156 goto onError; 6157 } 6158 nextByte: 6159 ; 6160 } 6161 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0) 6162 goto onError; 6163 Py_XDECREF(errorHandler); 6164 Py_XDECREF(exc); 6165#ifndef DONT_MAKE_RESULT_READY 6166 if (_PyUnicode_READY_REPLACE(&v)) { 6167 Py_DECREF(v); 6168 return NULL; 6169 } 6170#endif 6171 assert(_PyUnicode_CheckConsistency(v, 1)); 6172 return (PyObject *)v; 6173 6174 onError: 6175 Py_XDECREF(v); 6176 Py_XDECREF(errorHandler); 6177 Py_XDECREF(exc); 6178 return NULL; 6179} 6180 6181PyObject * 6182PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s, 6183 Py_ssize_t size) 6184{ 6185 PyObject *repr; 6186 char *p; 6187 char *q; 6188 6189#ifdef Py_UNICODE_WIDE 6190 const Py_ssize_t expandsize = 10; 6191#else 6192 const Py_ssize_t expandsize = 6; 6193#endif 6194 6195 if (size > PY_SSIZE_T_MAX / expandsize) 6196 return PyErr_NoMemory(); 6197 6198 repr = PyBytes_FromStringAndSize(NULL, expandsize * size); 6199 if (repr == NULL) 6200 return NULL; 6201 if (size == 0) 6202 return repr; 6203 6204 p = q = PyBytes_AS_STRING(repr); 6205 while (size-- > 0) { 6206 Py_UNICODE ch = *s++; 6207#ifdef Py_UNICODE_WIDE 6208 /* Map 32-bit characters to '\Uxxxxxxxx' */ 6209 if (ch >= 0x10000) { 6210 *p++ = '\\'; 6211 *p++ = 'U'; 6212 *p++ = hexdigits[(ch >> 28) & 0xf]; 6213 *p++ = hexdigits[(ch >> 24) & 0xf]; 6214 *p++ = hexdigits[(ch >> 20) & 0xf]; 6215 *p++ = hexdigits[(ch >> 16) & 0xf]; 6216 *p++ = hexdigits[(ch >> 12) & 0xf]; 6217 *p++ = hexdigits[(ch >> 8) & 0xf]; 6218 *p++ 
= hexdigits[(ch >> 4) & 0xf]; 6219 *p++ = hexdigits[ch & 15]; 6220 } 6221 else 6222#else 6223 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */ 6224 if (ch >= 0xD800 && ch < 0xDC00) { 6225 Py_UNICODE ch2; 6226 Py_UCS4 ucs; 6227 6228 ch2 = *s++; 6229 size--; 6230 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) { 6231 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000; 6232 *p++ = '\\'; 6233 *p++ = 'U'; 6234 *p++ = hexdigits[(ucs >> 28) & 0xf]; 6235 *p++ = hexdigits[(ucs >> 24) & 0xf]; 6236 *p++ = hexdigits[(ucs >> 20) & 0xf]; 6237 *p++ = hexdigits[(ucs >> 16) & 0xf]; 6238 *p++ = hexdigits[(ucs >> 12) & 0xf]; 6239 *p++ = hexdigits[(ucs >> 8) & 0xf]; 6240 *p++ = hexdigits[(ucs >> 4) & 0xf]; 6241 *p++ = hexdigits[ucs & 0xf]; 6242 continue; 6243 } 6244 /* Fall through: isolated surrogates are copied as-is */ 6245 s--; 6246 size++; 6247 } 6248#endif 6249 /* Map 16-bit characters to '\uxxxx' */ 6250 if (ch >= 256) { 6251 *p++ = '\\'; 6252 *p++ = 'u'; 6253 *p++ = hexdigits[(ch >> 12) & 0xf]; 6254 *p++ = hexdigits[(ch >> 8) & 0xf]; 6255 *p++ = hexdigits[(ch >> 4) & 0xf]; 6256 *p++ = hexdigits[ch & 15]; 6257 } 6258 /* Copy everything else as-is */ 6259 else 6260 *p++ = (char) ch; 6261 } 6262 size = p - q; 6263 6264 assert(size > 0); 6265 if (_PyBytes_Resize(&repr, size) < 0) 6266 return NULL; 6267 return repr; 6268} 6269 6270PyObject * 6271PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode) 6272{ 6273 PyObject *s; 6274 if (!PyUnicode_Check(unicode)) { 6275 PyErr_BadArgument(); 6276 return NULL; 6277 } 6278 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode), 6279 PyUnicode_GET_SIZE(unicode)); 6280 6281 return s; 6282} 6283 6284/* --- Unicode Internal Codec ------------------------------------------- */ 6285 6286PyObject * 6287_PyUnicode_DecodeUnicodeInternal(const char *s, 6288 Py_ssize_t size, 6289 const char *errors) 6290{ 6291 const char *starts = s; 6292 Py_ssize_t startinpos; 6293 Py_ssize_t endinpos; 6294 Py_ssize_t outpos; 6295 PyUnicodeObject *v; 6296 
Py_UNICODE *p; 6297 const char *end; 6298 const char *reason; 6299 PyObject *errorHandler = NULL; 6300 PyObject *exc = NULL; 6301 6302#ifdef Py_UNICODE_WIDE 6303 Py_UNICODE unimax = PyUnicode_GetMax(); 6304#endif 6305 6306 /* XXX overflow detection missing */ 6307 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE); 6308 if (v == NULL) 6309 goto onError; 6310 /* Intentionally PyUnicode_GET_SIZE instead of PyUnicode_GET_LENGTH 6311 as string was created with the old API. */ 6312 if (PyUnicode_GET_SIZE(v) == 0) 6313 return (PyObject *)v; 6314 p = PyUnicode_AS_UNICODE(v); 6315 end = s + size; 6316 6317 while (s < end) { 6318 memcpy(p, s, sizeof(Py_UNICODE)); 6319 /* We have to sanity check the raw data, otherwise doom looms for 6320 some malformed UCS-4 data. */ 6321 if ( 6322#ifdef Py_UNICODE_WIDE 6323 *p > unimax || *p < 0 || 6324#endif 6325 end-s < Py_UNICODE_SIZE 6326 ) 6327 { 6328 startinpos = s - starts; 6329 if (end-s < Py_UNICODE_SIZE) { 6330 endinpos = end-starts; 6331 reason = "truncated input"; 6332 } 6333 else { 6334 endinpos = s - starts + Py_UNICODE_SIZE; 6335 reason = "illegal code point (> 0x10FFFF)"; 6336 } 6337 outpos = p - PyUnicode_AS_UNICODE(v); 6338 if (unicode_decode_call_errorhandler( 6339 errors, &errorHandler, 6340 "unicode_internal", reason, 6341 &starts, &end, &startinpos, &endinpos, &exc, &s, 6342 &v, &outpos, &p)) { 6343 goto onError; 6344 } 6345 } 6346 else { 6347 p++; 6348 s += Py_UNICODE_SIZE; 6349 } 6350 } 6351 6352 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0) 6353 goto onError; 6354 Py_XDECREF(errorHandler); 6355 Py_XDECREF(exc); 6356#ifndef DONT_MAKE_RESULT_READY 6357 if (_PyUnicode_READY_REPLACE(&v)) { 6358 Py_DECREF(v); 6359 return NULL; 6360 } 6361#endif 6362 assert(_PyUnicode_CheckConsistency(v, 1)); 6363 return (PyObject *)v; 6364 6365 onError: 6366 Py_XDECREF(v); 6367 Py_XDECREF(errorHandler); 6368 Py_XDECREF(exc); 6369 return NULL; 6370} 6371 6372/* --- Latin-1 Codec 
------------------------------------------------------ */ 6373 6374PyObject * 6375PyUnicode_DecodeLatin1(const char *s, 6376 Py_ssize_t size, 6377 const char *errors) 6378{ 6379 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */ 6380 return _PyUnicode_FromUCS1((unsigned char*)s, size); 6381} 6382 6383/* create or adjust a UnicodeEncodeError */ 6384static void 6385make_encode_exception(PyObject **exceptionObject, 6386 const char *encoding, 6387 const Py_UNICODE *unicode, Py_ssize_t size, 6388 Py_ssize_t startpos, Py_ssize_t endpos, 6389 const char *reason) 6390{ 6391 if (*exceptionObject == NULL) { 6392 *exceptionObject = PyUnicodeEncodeError_Create( 6393 encoding, unicode, size, startpos, endpos, reason); 6394 } 6395 else { 6396 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos)) 6397 goto onError; 6398 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos)) 6399 goto onError; 6400 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason)) 6401 goto onError; 6402 return; 6403 onError: 6404 Py_DECREF(*exceptionObject); 6405 *exceptionObject = NULL; 6406 } 6407} 6408 6409/* raises a UnicodeEncodeError */ 6410static void 6411raise_encode_exception(PyObject **exceptionObject, 6412 const char *encoding, 6413 const Py_UNICODE *unicode, Py_ssize_t size, 6414 Py_ssize_t startpos, Py_ssize_t endpos, 6415 const char *reason) 6416{ 6417 make_encode_exception(exceptionObject, 6418 encoding, unicode, size, startpos, endpos, reason); 6419 if (*exceptionObject != NULL) 6420 PyCodec_StrictErrors(*exceptionObject); 6421} 6422 6423/* error handling callback helper: 6424 build arguments, call the callback and check the arguments, 6425 put the result into newpos and return the replacement string, which 6426 has to be freed by the caller */ 6427static PyObject * 6428unicode_encode_call_errorhandler(const char *errors, 6429 PyObject **errorHandler, 6430 const char *encoding, const char *reason, 6431 const Py_UNICODE *unicode, Py_ssize_t size, PyObject 
**exceptionObject, 6432 Py_ssize_t startpos, Py_ssize_t endpos, 6433 Py_ssize_t *newpos) 6434{ 6435 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple"; 6436 6437 PyObject *restuple; 6438 PyObject *resunicode; 6439 6440 if (*errorHandler == NULL) { 6441 *errorHandler = PyCodec_LookupError(errors); 6442 if (*errorHandler == NULL) 6443 return NULL; 6444 } 6445 6446 make_encode_exception(exceptionObject, 6447 encoding, unicode, size, startpos, endpos, reason); 6448 if (*exceptionObject == NULL) 6449 return NULL; 6450 6451 restuple = PyObject_CallFunctionObjArgs( 6452 *errorHandler, *exceptionObject, NULL); 6453 if (restuple == NULL) 6454 return NULL; 6455 if (!PyTuple_Check(restuple)) { 6456 PyErr_SetString(PyExc_TypeError, &argparse[3]); 6457 Py_DECREF(restuple); 6458 return NULL; 6459 } 6460 if (!PyArg_ParseTuple(restuple, argparse, 6461 &resunicode, newpos)) { 6462 Py_DECREF(restuple); 6463 return NULL; 6464 } 6465 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) { 6466 PyErr_SetString(PyExc_TypeError, &argparse[3]); 6467 Py_DECREF(restuple); 6468 return NULL; 6469 } 6470 if (*newpos<0) 6471 *newpos = size+*newpos; 6472 if (*newpos<0 || *newpos>size) { 6473 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos); 6474 Py_DECREF(restuple); 6475 return NULL; 6476 } 6477 Py_INCREF(resunicode); 6478 Py_DECREF(restuple); 6479 return resunicode; 6480} 6481 6482static PyObject * 6483unicode_encode_ucs1(const Py_UNICODE *p, 6484 Py_ssize_t size, 6485 const char *errors, 6486 int limit) 6487{ 6488 /* output object */ 6489 PyObject *res; 6490 /* pointers to the beginning and end+1 of input */ 6491 const Py_UNICODE *startp = p; 6492 const Py_UNICODE *endp = p + size; 6493 /* pointer to the beginning of the unencodable characters */ 6494 /* const Py_UNICODE *badp = NULL; */ 6495 /* pointer into the output */ 6496 char *str; 6497 /* current output position */ 6498 Py_ssize_t ressize; 6499 const 
char *encoding = (limit == 256) ? "latin-1" : "ascii"; 6500 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)"; 6501 PyObject *errorHandler = NULL; 6502 PyObject *exc = NULL; 6503 /* the following variable is used for caching string comparisons 6504 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */ 6505 int known_errorHandler = -1; 6506 6507 /* allocate enough for a simple encoding without 6508 replacements, if we need more, we'll resize */ 6509 if (size == 0) 6510 return PyBytes_FromStringAndSize(NULL, 0); 6511 res = PyBytes_FromStringAndSize(NULL, size); 6512 if (res == NULL) 6513 return NULL; 6514 str = PyBytes_AS_STRING(res); 6515 ressize = size; 6516 6517 while (p<endp) { 6518 Py_UNICODE c = *p; 6519 6520 /* can we encode this? */ 6521 if (c<limit) { 6522 /* no overflow check, because we know that the space is enough */ 6523 *str++ = (char)c; 6524 ++p; 6525 } 6526 else { 6527 Py_ssize_t unicodepos = p-startp; 6528 Py_ssize_t requiredsize; 6529 PyObject *repunicode; 6530 Py_ssize_t repsize; 6531 Py_ssize_t newpos; 6532 Py_ssize_t respos; 6533 Py_UNICODE *uni2; 6534 /* startpos for collecting unencodable chars */ 6535 const Py_UNICODE *collstart = p; 6536 const Py_UNICODE *collend = p; 6537 /* find all unecodable characters */ 6538 while ((collend < endp) && ((*collend)>=limit)) 6539 ++collend; 6540 /* cache callback name lookup (if not done yet, i.e. 
it's the first error) */ 6541 if (known_errorHandler==-1) { 6542 if ((errors==NULL) || (!strcmp(errors, "strict"))) 6543 known_errorHandler = 1; 6544 else if (!strcmp(errors, "replace")) 6545 known_errorHandler = 2; 6546 else if (!strcmp(errors, "ignore")) 6547 known_errorHandler = 3; 6548 else if (!strcmp(errors, "xmlcharrefreplace")) 6549 known_errorHandler = 4; 6550 else 6551 known_errorHandler = 0; 6552 } 6553 switch (known_errorHandler) { 6554 case 1: /* strict */ 6555 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason); 6556 goto onError; 6557 case 2: /* replace */ 6558 while (collstart++<collend) 6559 *str++ = '?'; /* fall through */ 6560 case 3: /* ignore */ 6561 p = collend; 6562 break; 6563 case 4: /* xmlcharrefreplace */ 6564 respos = str - PyBytes_AS_STRING(res); 6565 /* determine replacement size (temporarily (mis)uses p) */ 6566 for (p = collstart, repsize = 0; p < collend; ++p) { 6567 if (*p<10) 6568 repsize += 2+1+1; 6569 else if (*p<100) 6570 repsize += 2+2+1; 6571 else if (*p<1000) 6572 repsize += 2+3+1; 6573 else if (*p<10000) 6574 repsize += 2+4+1; 6575#ifndef Py_UNICODE_WIDE 6576 else 6577 repsize += 2+5+1; 6578#else 6579 else if (*p<100000) 6580 repsize += 2+5+1; 6581 else if (*p<1000000) 6582 repsize += 2+6+1; 6583 else 6584 repsize += 2+7+1; 6585#endif 6586 } 6587 requiredsize = respos+repsize+(endp-collend); 6588 if (requiredsize > ressize) { 6589 if (requiredsize<2*ressize) 6590 requiredsize = 2*ressize; 6591 if (_PyBytes_Resize(&res, requiredsize)) 6592 goto onError; 6593 str = PyBytes_AS_STRING(res) + respos; 6594 ressize = requiredsize; 6595 } 6596 /* generate replacement (temporarily (mis)uses p) */ 6597 for (p = collstart; p < collend; ++p) { 6598 str += sprintf(str, "&#%d;", (int)*p); 6599 } 6600 p = collend; 6601 break; 6602 default: 6603 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler, 6604 encoding, reason, startp, size, &exc, 6605 collstart-startp, collend-startp, 
&newpos); 6606 if (repunicode == NULL) 6607 goto onError; 6608 if (PyBytes_Check(repunicode)) { 6609 /* Directly copy bytes result to output. */ 6610 repsize = PyBytes_Size(repunicode); 6611 if (repsize > 1) { 6612 /* Make room for all additional bytes. */ 6613 respos = str - PyBytes_AS_STRING(res); 6614 if (_PyBytes_Resize(&res, ressize+repsize-1)) { 6615 Py_DECREF(repunicode); 6616 goto onError; 6617 } 6618 str = PyBytes_AS_STRING(res) + respos; 6619 ressize += repsize-1; 6620 } 6621 memcpy(str, PyBytes_AsString(repunicode), repsize); 6622 str += repsize; 6623 p = startp + newpos; 6624 Py_DECREF(repunicode); 6625 break; 6626 } 6627 /* need more space? (at least enough for what we 6628 have+the replacement+the rest of the string, so 6629 we won't have to check space for encodable characters) */ 6630 respos = str - PyBytes_AS_STRING(res); 6631 repsize = PyUnicode_GET_SIZE(repunicode); 6632 requiredsize = respos+repsize+(endp-collend); 6633 if (requiredsize > ressize) { 6634 if (requiredsize<2*ressize) 6635 requiredsize = 2*ressize; 6636 if (_PyBytes_Resize(&res, requiredsize)) { 6637 Py_DECREF(repunicode); 6638 goto onError; 6639 } 6640 str = PyBytes_AS_STRING(res) + respos; 6641 ressize = requiredsize; 6642 } 6643 /* check if there is anything unencodable in the replacement 6644 and copy it to the output */ 6645 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) { 6646 c = *uni2; 6647 if (c >= limit) { 6648 raise_encode_exception(&exc, encoding, startp, size, 6649 unicodepos, unicodepos+1, reason); 6650 Py_DECREF(repunicode); 6651 goto onError; 6652 } 6653 *str = (char)c; 6654 } 6655 p = startp + newpos; 6656 Py_DECREF(repunicode); 6657 } 6658 } 6659 } 6660 /* Resize if we allocated to much */ 6661 size = str - PyBytes_AS_STRING(res); 6662 if (size < ressize) { /* If this falls res will be NULL */ 6663 assert(size >= 0); 6664 if (_PyBytes_Resize(&res, size) < 0) 6665 goto onError; 6666 } 6667 6668 Py_XDECREF(errorHandler); 6669 
Py_XDECREF(exc); 6670 return res; 6671 6672 onError: 6673 Py_XDECREF(res); 6674 Py_XDECREF(errorHandler); 6675 Py_XDECREF(exc); 6676 return NULL; 6677} 6678 6679PyObject * 6680PyUnicode_EncodeLatin1(const Py_UNICODE *p, 6681 Py_ssize_t size, 6682 const char *errors) 6683{ 6684 return unicode_encode_ucs1(p, size, errors, 256); 6685} 6686 6687PyObject * 6688_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors) 6689{ 6690 if (!PyUnicode_Check(unicode)) { 6691 PyErr_BadArgument(); 6692 return NULL; 6693 } 6694 if (PyUnicode_READY(unicode) == -1) 6695 return NULL; 6696 /* Fast path: if it is a one-byte string, construct 6697 bytes object directly. */ 6698 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) 6699 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode), 6700 PyUnicode_GET_LENGTH(unicode)); 6701 /* Non-Latin-1 characters present. Defer to above function to 6702 raise the exception. */ 6703 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode), 6704 PyUnicode_GET_SIZE(unicode), 6705 errors); 6706} 6707 6708PyObject* 6709PyUnicode_AsLatin1String(PyObject *unicode) 6710{ 6711 return _PyUnicode_AsLatin1String(unicode, NULL); 6712} 6713 6714/* --- 7-bit ASCII Codec -------------------------------------------------- */ 6715 6716PyObject * 6717PyUnicode_DecodeASCII(const char *s, 6718 Py_ssize_t size, 6719 const char *errors) 6720{ 6721 const char *starts = s; 6722 PyUnicodeObject *v; 6723 Py_UNICODE *u; 6724 Py_ssize_t startinpos; 6725 Py_ssize_t endinpos; 6726 Py_ssize_t outpos; 6727 const char *e; 6728 int has_error; 6729 const unsigned char *p = (const unsigned char *)s; 6730 const unsigned char *end = p + size; 6731 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK); 6732 PyObject *errorHandler = NULL; 6733 PyObject *exc = NULL; 6734 6735 /* ASCII is equivalent to the first 128 ordinals in Unicode. 
*/ 6736 if (size == 1 && (unsigned char)s[0] < 128) 6737 return get_latin1_char((unsigned char)s[0]); 6738 6739 has_error = 0; 6740 while (p < end && !has_error) { 6741 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for 6742 an explanation. */ 6743 if (!((size_t) p & LONG_PTR_MASK)) { 6744 /* Help register allocation */ 6745 register const unsigned char *_p = p; 6746 while (_p < aligned_end) { 6747 unsigned long value = *(unsigned long *) _p; 6748 if (value & ASCII_CHAR_MASK) { 6749 has_error = 1; 6750 break; 6751 } 6752 _p += SIZEOF_LONG; 6753 } 6754 if (_p == end) 6755 break; 6756 if (has_error) 6757 break; 6758 p = _p; 6759 } 6760 if (*p & 0x80) { 6761 has_error = 1; 6762 break; 6763 } 6764 else { 6765 ++p; 6766 } 6767 } 6768 if (!has_error) 6769 return unicode_fromascii((const unsigned char *)s, size); 6770 6771 v = _PyUnicode_New(size); 6772 if (v == NULL) 6773 goto onError; 6774 if (size == 0) 6775 return (PyObject *)v; 6776 u = PyUnicode_AS_UNICODE(v); 6777 e = s + size; 6778 while (s < e) { 6779 register unsigned char c = (unsigned char)*s; 6780 if (c < 128) { 6781 *u++ = c; 6782 ++s; 6783 } 6784 else { 6785 startinpos = s-starts; 6786 endinpos = startinpos + 1; 6787 outpos = u - (Py_UNICODE *)PyUnicode_AS_UNICODE(v); 6788 if (unicode_decode_call_errorhandler( 6789 errors, &errorHandler, 6790 "ascii", "ordinal not in range(128)", 6791 &starts, &e, &startinpos, &endinpos, &exc, &s, 6792 &v, &outpos, &u)) 6793 goto onError; 6794 } 6795 } 6796 if (u - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v)) 6797 if (PyUnicode_Resize((PyObject**)&v, u - PyUnicode_AS_UNICODE(v)) < 0) 6798 goto onError; 6799 Py_XDECREF(errorHandler); 6800 Py_XDECREF(exc); 6801#ifndef DONT_MAKE_RESULT_READY 6802 if (_PyUnicode_READY_REPLACE(&v)) { 6803 Py_DECREF(v); 6804 return NULL; 6805 } 6806#endif 6807 assert(_PyUnicode_CheckConsistency(v, 1)); 6808 return (PyObject *)v; 6809 6810 onError: 6811 Py_XDECREF(v); 6812 Py_XDECREF(errorHandler); 6813 Py_XDECREF(exc); 6814 return 
NULL; 6815} 6816 6817PyObject * 6818PyUnicode_EncodeASCII(const Py_UNICODE *p, 6819 Py_ssize_t size, 6820 const char *errors) 6821{ 6822 return unicode_encode_ucs1(p, size, errors, 128); 6823} 6824 6825PyObject * 6826_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors) 6827{ 6828 if (!PyUnicode_Check(unicode)) { 6829 PyErr_BadArgument(); 6830 return NULL; 6831 } 6832 if (PyUnicode_READY(unicode) == -1) 6833 return NULL; 6834 /* Fast path: if it is an ASCII-only string, construct bytes object 6835 directly. Else defer to above function to raise the exception. */ 6836 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128) 6837 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode), 6838 PyUnicode_GET_LENGTH(unicode)); 6839 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode), 6840 PyUnicode_GET_SIZE(unicode), 6841 errors); 6842} 6843 6844PyObject * 6845PyUnicode_AsASCIIString(PyObject *unicode) 6846{ 6847 return _PyUnicode_AsASCIIString(unicode, NULL); 6848} 6849 6850#ifdef HAVE_MBCS 6851 6852/* --- MBCS codecs for Windows -------------------------------------------- */ 6853 6854#if SIZEOF_INT < SIZEOF_SIZE_T 6855#define NEED_RETRY 6856#endif 6857 6858/* XXX This code is limited to "true" double-byte encodings, as 6859 a) it assumes an incomplete character consists of a single byte, and 6860 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte 6861 encodings, see IsDBCSLeadByteEx documentation. */ 6862 6863static int 6864is_dbcs_lead_byte(const char *s, int offset) 6865{ 6866 const char *curr = s + offset; 6867 6868 if (IsDBCSLeadByte(*curr)) { 6869 const char *prev = CharPrev(s, curr); 6870 return (prev == curr) || !IsDBCSLeadByte(*prev) || (curr - prev == 2); 6871 } 6872 return 0; 6873} 6874 6875/* 6876 * Decode MBCS string into unicode object. If 'final' is set, converts 6877 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise. 
6878 */ 6879static int 6880decode_mbcs(PyUnicodeObject **v, 6881 const char *s, /* MBCS string */ 6882 int size, /* sizeof MBCS string */ 6883 int final, 6884 const char *errors) 6885{ 6886 Py_UNICODE *p; 6887 Py_ssize_t n; 6888 DWORD usize; 6889 DWORD flags; 6890 6891 assert(size >= 0); 6892 6893 /* check and handle 'errors' arg */ 6894 if (errors==NULL || strcmp(errors, "strict")==0) 6895 flags = MB_ERR_INVALID_CHARS; 6896 else if (strcmp(errors, "ignore")==0) 6897 flags = 0; 6898 else { 6899 PyErr_Format(PyExc_ValueError, 6900 "mbcs encoding does not support errors='%s'", 6901 errors); 6902 return -1; 6903 } 6904 6905 /* Skip trailing lead-byte unless 'final' is set */ 6906 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1)) 6907 --size; 6908 6909 /* First get the size of the result */ 6910 if (size > 0) { 6911 usize = MultiByteToWideChar(CP_ACP, flags, s, size, NULL, 0); 6912 if (usize==0) 6913 goto mbcs_decode_error; 6914 } else 6915 usize = 0; 6916 6917 if (*v == NULL) { 6918 /* Create unicode object */ 6919 *v = _PyUnicode_New(usize); 6920 if (*v == NULL) 6921 return -1; 6922 n = 0; 6923 } 6924 else { 6925 /* Extend unicode object */ 6926 n = PyUnicode_GET_SIZE(*v); 6927 if (PyUnicode_Resize((PyObject**)v, n + usize) < 0) 6928 return -1; 6929 } 6930 6931 /* Do the conversion */ 6932 if (usize > 0) { 6933 p = PyUnicode_AS_UNICODE(*v) + n; 6934 if (0 == MultiByteToWideChar(CP_ACP, flags, s, size, p, usize)) { 6935 goto mbcs_decode_error; 6936 } 6937 } 6938 return size; 6939 6940mbcs_decode_error: 6941 /* If the last error was ERROR_NO_UNICODE_TRANSLATION, then 6942 we raise a UnicodeDecodeError - else it is a 'generic' 6943 windows error 6944 */ 6945 if (GetLastError()==ERROR_NO_UNICODE_TRANSLATION) { 6946 /* Ideally, we should get reason from FormatMessage - this 6947 is the Windows 2000 English version of the message 6948 */ 6949 PyObject *exc = NULL; 6950 const char *reason = "No mapping for the Unicode character exists " 6951 "in the target 
multi-byte code page."; 6952 make_decode_exception(&exc, "mbcs", s, size, 0, 0, reason); 6953 if (exc != NULL) { 6954 PyCodec_StrictErrors(exc); 6955 Py_DECREF(exc); 6956 } 6957 } else { 6958 PyErr_SetFromWindowsErrWithFilename(0, NULL); 6959 } 6960 return -1; 6961} 6962 6963PyObject * 6964PyUnicode_DecodeMBCSStateful(const char *s, 6965 Py_ssize_t size, 6966 const char *errors, 6967 Py_ssize_t *consumed) 6968{ 6969 PyUnicodeObject *v = NULL; 6970 int done; 6971 6972 if (consumed) 6973 *consumed = 0; 6974 6975#ifdef NEED_RETRY 6976 retry: 6977 if (size > INT_MAX) 6978 done = decode_mbcs(&v, s, INT_MAX, 0, errors); 6979 else 6980#endif 6981 done = decode_mbcs(&v, s, (int)size, !consumed, errors); 6982 6983 if (done < 0) { 6984 Py_XDECREF(v); 6985 return NULL; 6986 } 6987 6988 if (consumed) 6989 *consumed += done; 6990 6991#ifdef NEED_RETRY 6992 if (size > INT_MAX) { 6993 s += done; 6994 size -= done; 6995 goto retry; 6996 } 6997#endif 6998#ifndef DONT_MAKE_RESULT_READY 6999 if (_PyUnicode_READY_REPLACE(&v)) { 7000 Py_DECREF(v); 7001 return NULL; 7002 } 7003#endif 7004 assert(_PyUnicode_CheckConsistency(v, 1)); 7005 return (PyObject *)v; 7006} 7007 7008PyObject * 7009PyUnicode_DecodeMBCS(const char *s, 7010 Py_ssize_t size, 7011 const char *errors) 7012{ 7013 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL); 7014} 7015 7016/* 7017 * Convert unicode into string object (MBCS). 7018 * Returns 0 if succeed, -1 otherwise. 
7019 */ 7020static int 7021encode_mbcs(PyObject **repr, 7022 const Py_UNICODE *p, /* unicode */ 7023 int size, /* size of unicode */ 7024 const char* errors) 7025{ 7026 BOOL usedDefaultChar = FALSE; 7027 BOOL *pusedDefaultChar; 7028 int mbcssize; 7029 Py_ssize_t n; 7030 PyObject *exc = NULL; 7031 DWORD flags; 7032 7033 assert(size >= 0); 7034 7035 /* check and handle 'errors' arg */ 7036 if (errors==NULL || strcmp(errors, "strict")==0) { 7037 flags = WC_NO_BEST_FIT_CHARS; 7038 pusedDefaultChar = &usedDefaultChar; 7039 } else if (strcmp(errors, "replace")==0) { 7040 flags = 0; 7041 pusedDefaultChar = NULL; 7042 } else { 7043 PyErr_Format(PyExc_ValueError, 7044 "mbcs encoding does not support errors='%s'", 7045 errors); 7046 return -1; 7047 } 7048 7049 /* First get the size of the result */ 7050 if (size > 0) { 7051 mbcssize = WideCharToMultiByte(CP_ACP, flags, p, size, NULL, 0, 7052 NULL, pusedDefaultChar); 7053 if (mbcssize == 0) { 7054 PyErr_SetFromWindowsErrWithFilename(0, NULL); 7055 return -1; 7056 } 7057 /* If we used a default char, then we failed! 
*/ 7058 if (pusedDefaultChar && *pusedDefaultChar) 7059 goto mbcs_encode_error; 7060 } else { 7061 mbcssize = 0; 7062 } 7063 7064 if (*repr == NULL) { 7065 /* Create string object */ 7066 *repr = PyBytes_FromStringAndSize(NULL, mbcssize); 7067 if (*repr == NULL) 7068 return -1; 7069 n = 0; 7070 } 7071 else { 7072 /* Extend string object */ 7073 n = PyBytes_Size(*repr); 7074 if (_PyBytes_Resize(repr, n + mbcssize) < 0) 7075 return -1; 7076 } 7077 7078 /* Do the conversion */ 7079 if (size > 0) { 7080 char *s = PyBytes_AS_STRING(*repr) + n; 7081 if (0 == WideCharToMultiByte(CP_ACP, flags, p, size, s, mbcssize, 7082 NULL, pusedDefaultChar)) { 7083 PyErr_SetFromWindowsErrWithFilename(0, NULL); 7084 return -1; 7085 } 7086 if (pusedDefaultChar && *pusedDefaultChar) 7087 goto mbcs_encode_error; 7088 } 7089 return 0; 7090 7091mbcs_encode_error: 7092 raise_encode_exception(&exc, "mbcs", p, size, 0, 0, "invalid character"); 7093 Py_XDECREF(exc); 7094 return -1; 7095} 7096 7097PyObject * 7098PyUnicode_EncodeMBCS(const Py_UNICODE *p, 7099 Py_ssize_t size, 7100 const char *errors) 7101{ 7102 PyObject *repr = NULL; 7103 int ret; 7104 7105#ifdef NEED_RETRY 7106 retry: 7107 if (size > INT_MAX) 7108 ret = encode_mbcs(&repr, p, INT_MAX, errors); 7109 else 7110#endif 7111 ret = encode_mbcs(&repr, p, (int)size, errors); 7112 7113 if (ret < 0) { 7114 Py_XDECREF(repr); 7115 return NULL; 7116 } 7117 7118#ifdef NEED_RETRY 7119 if (size > INT_MAX) { 7120 p += INT_MAX; 7121 size -= INT_MAX; 7122 goto retry; 7123 } 7124#endif 7125 7126 return repr; 7127} 7128 7129PyObject * 7130PyUnicode_AsMBCSString(PyObject *unicode) 7131{ 7132 if (!PyUnicode_Check(unicode)) { 7133 PyErr_BadArgument(); 7134 return NULL; 7135 } 7136 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode), 7137 PyUnicode_GET_SIZE(unicode), 7138 NULL); 7139} 7140 7141#undef NEED_RETRY 7142 7143#endif /* HAVE_MBCS */ 7144 7145/* --- Character Mapping Codec -------------------------------------------- */ 7146 7147PyObject * 
7148PyUnicode_DecodeCharmap(const char *s, 7149 Py_ssize_t size, 7150 PyObject *mapping, 7151 const char *errors) 7152{ 7153 const char *starts = s; 7154 Py_ssize_t startinpos; 7155 Py_ssize_t endinpos; 7156 Py_ssize_t outpos; 7157 const char *e; 7158 PyUnicodeObject *v; 7159 Py_UNICODE *p; 7160 Py_ssize_t extrachars = 0; 7161 PyObject *errorHandler = NULL; 7162 PyObject *exc = NULL; 7163 Py_UNICODE *mapstring = NULL; 7164 Py_ssize_t maplen = 0; 7165 7166 /* Default to Latin-1 */ 7167 if (mapping == NULL) 7168 return PyUnicode_DecodeLatin1(s, size, errors); 7169 7170 v = _PyUnicode_New(size); 7171 if (v == NULL) 7172 goto onError; 7173 if (size == 0) 7174 return (PyObject *)v; 7175 p = PyUnicode_AS_UNICODE(v); 7176 e = s + size; 7177 if (PyUnicode_CheckExact(mapping)) { 7178 mapstring = PyUnicode_AS_UNICODE(mapping); 7179 maplen = PyUnicode_GET_SIZE(mapping); 7180 while (s < e) { 7181 unsigned char ch = *s; 7182 Py_UNICODE x = 0xfffe; /* illegal value */ 7183 7184 if (ch < maplen) 7185 x = mapstring[ch]; 7186 7187 if (x == 0xfffe) { 7188 /* undefined mapping */ 7189 outpos = p-PyUnicode_AS_UNICODE(v); 7190 startinpos = s-starts; 7191 endinpos = startinpos+1; 7192 if (unicode_decode_call_errorhandler( 7193 errors, &errorHandler, 7194 "charmap", "character maps to <undefined>", 7195 &starts, &e, &startinpos, &endinpos, &exc, &s, 7196 &v, &outpos, &p)) { 7197 goto onError; 7198 } 7199 continue; 7200 } 7201 *p++ = x; 7202 ++s; 7203 } 7204 } 7205 else { 7206 while (s < e) { 7207 unsigned char ch = *s; 7208 PyObject *w, *x; 7209 7210 /* Get mapping (char ordinal -> integer, Unicode char or None) */ 7211 w = PyLong_FromLong((long)ch); 7212 if (w == NULL) 7213 goto onError; 7214 x = PyObject_GetItem(mapping, w); 7215 Py_DECREF(w); 7216 if (x == NULL) { 7217 if (PyErr_ExceptionMatches(PyExc_LookupError)) { 7218 /* No mapping found means: mapping is undefined. 
*/ 7219 PyErr_Clear(); 7220 x = Py_None; 7221 Py_INCREF(x); 7222 } else 7223 goto onError; 7224 } 7225 7226 /* Apply mapping */ 7227 if (PyLong_Check(x)) { 7228 long value = PyLong_AS_LONG(x); 7229 if (value < 0 || value > 65535) { 7230 PyErr_SetString(PyExc_TypeError, 7231 "character mapping must be in range(65536)"); 7232 Py_DECREF(x); 7233 goto onError; 7234 } 7235 *p++ = (Py_UNICODE)value; 7236 } 7237 else if (x == Py_None) { 7238 /* undefined mapping */ 7239 outpos = p-PyUnicode_AS_UNICODE(v); 7240 startinpos = s-starts; 7241 endinpos = startinpos+1; 7242 if (unicode_decode_call_errorhandler( 7243 errors, &errorHandler, 7244 "charmap", "character maps to <undefined>", 7245 &starts, &e, &startinpos, &endinpos, &exc, &s, 7246 &v, &outpos, &p)) { 7247 Py_DECREF(x); 7248 goto onError; 7249 } 7250 Py_DECREF(x); 7251 continue; 7252 } 7253 else if (PyUnicode_Check(x)) { 7254 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x); 7255 7256 if (targetsize == 1) 7257 /* 1-1 mapping */ 7258 *p++ = *PyUnicode_AS_UNICODE(x); 7259 7260 else if (targetsize > 1) { 7261 /* 1-n mapping */ 7262 if (targetsize > extrachars) { 7263 /* resize first */ 7264 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v); 7265 Py_ssize_t needed = (targetsize - extrachars) + \ 7266 (targetsize << 2); 7267 extrachars += needed; 7268 /* XXX overflow detection missing */ 7269 if (PyUnicode_Resize((PyObject**)&v, 7270 PyUnicode_GET_SIZE(v) + needed) < 0) { 7271 Py_DECREF(x); 7272 goto onError; 7273 } 7274 p = PyUnicode_AS_UNICODE(v) + oldpos; 7275 } 7276 Py_UNICODE_COPY(p, 7277 PyUnicode_AS_UNICODE(x), 7278 targetsize); 7279 p += targetsize; 7280 extrachars -= targetsize; 7281 } 7282 /* 1-0 mapping: skip the character */ 7283 } 7284 else { 7285 /* wrong return value */ 7286 PyErr_SetString(PyExc_TypeError, 7287 "character mapping must return integer, None or str"); 7288 Py_DECREF(x); 7289 goto onError; 7290 } 7291 Py_DECREF(x); 7292 ++s; 7293 } 7294 } 7295 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v)) 
7296 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0) 7297 goto onError; 7298 Py_XDECREF(errorHandler); 7299 Py_XDECREF(exc); 7300#ifndef DONT_MAKE_RESULT_READY 7301 if (_PyUnicode_READY_REPLACE(&v)) { 7302 Py_DECREF(v); 7303 return NULL; 7304 } 7305#endif 7306 assert(_PyUnicode_CheckConsistency(v, 1)); 7307 return (PyObject *)v; 7308 7309 onError: 7310 Py_XDECREF(errorHandler); 7311 Py_XDECREF(exc); 7312 Py_XDECREF(v); 7313 return NULL; 7314} 7315 7316/* Charmap encoding: the lookup table */ 7317 7318struct encoding_map { 7319 PyObject_HEAD 7320 unsigned char level1[32]; 7321 int count2, count3; 7322 unsigned char level23[1]; 7323}; 7324 7325static PyObject* 7326encoding_map_size(PyObject *obj, PyObject* args) 7327{ 7328 struct encoding_map *map = (struct encoding_map*)obj; 7329 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 + 7330 128*map->count3); 7331} 7332 7333static PyMethodDef encoding_map_methods[] = { 7334 {"size", encoding_map_size, METH_NOARGS, 7335 PyDoc_STR("Return the size (in bytes) of this object") }, 7336 { 0 } 7337}; 7338 7339static void 7340encoding_map_dealloc(PyObject* o) 7341{ 7342 PyObject_FREE(o); 7343} 7344 7345static PyTypeObject EncodingMapType = { 7346 PyVarObject_HEAD_INIT(NULL, 0) 7347 "EncodingMap", /*tp_name*/ 7348 sizeof(struct encoding_map), /*tp_basicsize*/ 7349 0, /*tp_itemsize*/ 7350 /* methods */ 7351 encoding_map_dealloc, /*tp_dealloc*/ 7352 0, /*tp_print*/ 7353 0, /*tp_getattr*/ 7354 0, /*tp_setattr*/ 7355 0, /*tp_reserved*/ 7356 0, /*tp_repr*/ 7357 0, /*tp_as_number*/ 7358 0, /*tp_as_sequence*/ 7359 0, /*tp_as_mapping*/ 7360 0, /*tp_hash*/ 7361 0, /*tp_call*/ 7362 0, /*tp_str*/ 7363 0, /*tp_getattro*/ 7364 0, /*tp_setattro*/ 7365 0, /*tp_as_buffer*/ 7366 Py_TPFLAGS_DEFAULT, /*tp_flags*/ 7367 0, /*tp_doc*/ 7368 0, /*tp_traverse*/ 7369 0, /*tp_clear*/ 7370 0, /*tp_richcompare*/ 7371 0, /*tp_weaklistoffset*/ 7372 0, /*tp_iter*/ 7373 0, /*tp_iternext*/ 7374 encoding_map_methods, /*tp_methods*/ 
7375 0, /*tp_members*/ 7376 0, /*tp_getset*/ 7377 0, /*tp_base*/ 7378 0, /*tp_dict*/ 7379 0, /*tp_descr_get*/ 7380 0, /*tp_descr_set*/ 7381 0, /*tp_dictoffset*/ 7382 0, /*tp_init*/ 7383 0, /*tp_alloc*/ 7384 0, /*tp_new*/ 7385 0, /*tp_free*/ 7386 0, /*tp_is_gc*/ 7387}; 7388 7389PyObject* 7390PyUnicode_BuildEncodingMap(PyObject* string) 7391{ 7392 PyObject *result; 7393 struct encoding_map *mresult; 7394 int i; 7395 int need_dict = 0; 7396 unsigned char level1[32]; 7397 unsigned char level2[512]; 7398 unsigned char *mlevel1, *mlevel2, *mlevel3; 7399 int count2 = 0, count3 = 0; 7400 int kind; 7401 void *data; 7402 Py_UCS4 ch; 7403 7404 if (!PyUnicode_Check(string) || PyUnicode_GET_LENGTH(string) != 256) { 7405 PyErr_BadArgument(); 7406 return NULL; 7407 } 7408 kind = PyUnicode_KIND(string); 7409 data = PyUnicode_DATA(string); 7410 memset(level1, 0xFF, sizeof level1); 7411 memset(level2, 0xFF, sizeof level2); 7412 7413 /* If there isn't a one-to-one mapping of NULL to \0, 7414 or if there are non-BMP characters, we need to use 7415 a mapping dictionary. 
*/ 7416 if (PyUnicode_READ(kind, data, 0) != 0) 7417 need_dict = 1; 7418 for (i = 1; i < 256; i++) { 7419 int l1, l2; 7420 ch = PyUnicode_READ(kind, data, i); 7421 if (ch == 0 || ch > 0xFFFF) { 7422 need_dict = 1; 7423 break; 7424 } 7425 if (ch == 0xFFFE) 7426 /* unmapped character */ 7427 continue; 7428 l1 = ch >> 11; 7429 l2 = ch >> 7; 7430 if (level1[l1] == 0xFF) 7431 level1[l1] = count2++; 7432 if (level2[l2] == 0xFF) 7433 level2[l2] = count3++; 7434 } 7435 7436 if (count2 >= 0xFF || count3 >= 0xFF) 7437 need_dict = 1; 7438 7439 if (need_dict) { 7440 PyObject *result = PyDict_New(); 7441 PyObject *key, *value; 7442 if (!result) 7443 return NULL; 7444 for (i = 0; i < 256; i++) { 7445 key = PyLong_FromLong(PyUnicode_READ(kind, data, i)); 7446 value = PyLong_FromLong(i); 7447 if (!key || !value) 7448 goto failed1; 7449 if (PyDict_SetItem(result, key, value) == -1) 7450 goto failed1; 7451 Py_DECREF(key); 7452 Py_DECREF(value); 7453 } 7454 return result; 7455 failed1: 7456 Py_XDECREF(key); 7457 Py_XDECREF(value); 7458 Py_DECREF(result); 7459 return NULL; 7460 } 7461 7462 /* Create a three-level trie */ 7463 result = PyObject_MALLOC(sizeof(struct encoding_map) + 7464 16*count2 + 128*count3 - 1); 7465 if (!result) 7466 return PyErr_NoMemory(); 7467 PyObject_Init(result, &EncodingMapType); 7468 mresult = (struct encoding_map*)result; 7469 mresult->count2 = count2; 7470 mresult->count3 = count3; 7471 mlevel1 = mresult->level1; 7472 mlevel2 = mresult->level23; 7473 mlevel3 = mresult->level23 + 16*count2; 7474 memcpy(mlevel1, level1, 32); 7475 memset(mlevel2, 0xFF, 16*count2); 7476 memset(mlevel3, 0, 128*count3); 7477 count3 = 0; 7478 for (i = 1; i < 256; i++) { 7479 int o1, o2, o3, i2, i3; 7480 if (PyUnicode_READ(kind, data, i) == 0xFFFE) 7481 /* unmapped character */ 7482 continue; 7483 o1 = PyUnicode_READ(kind, data, i)>>11; 7484 o2 = (PyUnicode_READ(kind, data, i)>>7) & 0xF; 7485 i2 = 16*mlevel1[o1] + o2; 7486 if (mlevel2[i2] == 0xFF) 7487 mlevel2[i2] = count3++; 7488 
o3 = PyUnicode_READ(kind, data, i) & 0x7F; 7489 i3 = 128*mlevel2[i2] + o3; 7490 mlevel3[i3] = i; 7491 } 7492 return result; 7493} 7494 7495static int 7496encoding_map_lookup(Py_UNICODE c, PyObject *mapping) 7497{ 7498 struct encoding_map *map = (struct encoding_map*)mapping; 7499 int l1 = c>>11; 7500 int l2 = (c>>7) & 0xF; 7501 int l3 = c & 0x7F; 7502 int i; 7503 7504#ifdef Py_UNICODE_WIDE 7505 if (c > 0xFFFF) { 7506 return -1; 7507 } 7508#endif 7509 if (c == 0) 7510 return 0; 7511 /* level 1*/ 7512 i = map->level1[l1]; 7513 if (i == 0xFF) { 7514 return -1; 7515 } 7516 /* level 2*/ 7517 i = map->level23[16*i+l2]; 7518 if (i == 0xFF) { 7519 return -1; 7520 } 7521 /* level 3 */ 7522 i = map->level23[16*map->count2 + 128*i + l3]; 7523 if (i == 0) { 7524 return -1; 7525 } 7526 return i; 7527} 7528 7529/* Lookup the character ch in the mapping. If the character 7530 can't be found, Py_None is returned (or NULL, if another 7531 error occurred). */ 7532static PyObject * 7533charmapencode_lookup(Py_UNICODE c, PyObject *mapping) 7534{ 7535 PyObject *w = PyLong_FromLong((long)c); 7536 PyObject *x; 7537 7538 if (w == NULL) 7539 return NULL; 7540 x = PyObject_GetItem(mapping, w); 7541 Py_DECREF(w); 7542 if (x == NULL) { 7543 if (PyErr_ExceptionMatches(PyExc_LookupError)) { 7544 /* No mapping found means: mapping is undefined. 
*/ 7545 PyErr_Clear(); 7546 x = Py_None; 7547 Py_INCREF(x); 7548 return x; 7549 } else 7550 return NULL; 7551 } 7552 else if (x == Py_None) 7553 return x; 7554 else if (PyLong_Check(x)) { 7555 long value = PyLong_AS_LONG(x); 7556 if (value < 0 || value > 255) { 7557 PyErr_SetString(PyExc_TypeError, 7558 "character mapping must be in range(256)"); 7559 Py_DECREF(x); 7560 return NULL; 7561 } 7562 return x; 7563 } 7564 else if (PyBytes_Check(x)) 7565 return x; 7566 else { 7567 /* wrong return value */ 7568 PyErr_Format(PyExc_TypeError, 7569 "character mapping must return integer, bytes or None, not %.400s", 7570 x->ob_type->tp_name); 7571 Py_DECREF(x); 7572 return NULL; 7573 } 7574} 7575 7576static int 7577charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize) 7578{ 7579 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj); 7580 /* exponentially overallocate to minimize reallocations */ 7581 if (requiredsize < 2*outsize) 7582 requiredsize = 2*outsize; 7583 if (_PyBytes_Resize(outobj, requiredsize)) 7584 return -1; 7585 return 0; 7586} 7587 7588typedef enum charmapencode_result { 7589 enc_SUCCESS, enc_FAILED, enc_EXCEPTION 7590} charmapencode_result; 7591/* lookup the character, put the result in the output string and adjust 7592 various state variables. Resize the output bytes object if not enough 7593 space is available. Return a new reference to the object that 7594 was put in the output buffer, or Py_None, if the mapping was undefined 7595 (in which case no character was written) or NULL, if a 7596 reallocation error occurred. 
The caller must decref the result */ 7597static charmapencode_result 7598charmapencode_output(Py_UNICODE c, PyObject *mapping, 7599 PyObject **outobj, Py_ssize_t *outpos) 7600{ 7601 PyObject *rep; 7602 char *outstart; 7603 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj); 7604 7605 if (Py_TYPE(mapping) == &EncodingMapType) { 7606 int res = encoding_map_lookup(c, mapping); 7607 Py_ssize_t requiredsize = *outpos+1; 7608 if (res == -1) 7609 return enc_FAILED; 7610 if (outsize<requiredsize) 7611 if (charmapencode_resize(outobj, outpos, requiredsize)) 7612 return enc_EXCEPTION; 7613 outstart = PyBytes_AS_STRING(*outobj); 7614 outstart[(*outpos)++] = (char)res; 7615 return enc_SUCCESS; 7616 } 7617 7618 rep = charmapencode_lookup(c, mapping); 7619 if (rep==NULL) 7620 return enc_EXCEPTION; 7621 else if (rep==Py_None) { 7622 Py_DECREF(rep); 7623 return enc_FAILED; 7624 } else { 7625 if (PyLong_Check(rep)) { 7626 Py_ssize_t requiredsize = *outpos+1; 7627 if (outsize<requiredsize) 7628 if (charmapencode_resize(outobj, outpos, requiredsize)) { 7629 Py_DECREF(rep); 7630 return enc_EXCEPTION; 7631 } 7632 outstart = PyBytes_AS_STRING(*outobj); 7633 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep); 7634 } 7635 else { 7636 const char *repchars = PyBytes_AS_STRING(rep); 7637 Py_ssize_t repsize = PyBytes_GET_SIZE(rep); 7638 Py_ssize_t requiredsize = *outpos+repsize; 7639 if (outsize<requiredsize) 7640 if (charmapencode_resize(outobj, outpos, requiredsize)) { 7641 Py_DECREF(rep); 7642 return enc_EXCEPTION; 7643 } 7644 outstart = PyBytes_AS_STRING(*outobj); 7645 memcpy(outstart + *outpos, repchars, repsize); 7646 *outpos += repsize; 7647 } 7648 } 7649 Py_DECREF(rep); 7650 return enc_SUCCESS; 7651} 7652 7653/* handle an error in PyUnicode_EncodeCharmap 7654 Return 0 on success, -1 on error */ 7655static int 7656charmap_encoding_error( 7657 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping, 7658 PyObject **exceptionObject, 7659 int *known_errorHandler, PyObject 
**errorHandler, const char *errors, 7660 PyObject **res, Py_ssize_t *respos) 7661{ 7662 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */ 7663 Py_ssize_t repsize; 7664 Py_ssize_t newpos; 7665 Py_UNICODE *uni2; 7666 /* startpos for collecting unencodable chars */ 7667 Py_ssize_t collstartpos = *inpos; 7668 Py_ssize_t collendpos = *inpos+1; 7669 Py_ssize_t collpos; 7670 char *encoding = "charmap"; 7671 char *reason = "character maps to <undefined>"; 7672 charmapencode_result x; 7673 7674 /* find all unencodable characters */ 7675 while (collendpos < size) { 7676 PyObject *rep; 7677 if (Py_TYPE(mapping) == &EncodingMapType) { 7678 int res = encoding_map_lookup(p[collendpos], mapping); 7679 if (res != -1) 7680 break; 7681 ++collendpos; 7682 continue; 7683 } 7684 7685 rep = charmapencode_lookup(p[collendpos], mapping); 7686 if (rep==NULL) 7687 return -1; 7688 else if (rep!=Py_None) { 7689 Py_DECREF(rep); 7690 break; 7691 } 7692 Py_DECREF(rep); 7693 ++collendpos; 7694 } 7695 /* cache callback name lookup 7696 * (if not done yet, i.e. 
it's the first error) */ 7697 if (*known_errorHandler==-1) { 7698 if ((errors==NULL) || (!strcmp(errors, "strict"))) 7699 *known_errorHandler = 1; 7700 else if (!strcmp(errors, "replace")) 7701 *known_errorHandler = 2; 7702 else if (!strcmp(errors, "ignore")) 7703 *known_errorHandler = 3; 7704 else if (!strcmp(errors, "xmlcharrefreplace")) 7705 *known_errorHandler = 4; 7706 else 7707 *known_errorHandler = 0; 7708 } 7709 switch (*known_errorHandler) { 7710 case 1: /* strict */ 7711 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason); 7712 return -1; 7713 case 2: /* replace */ 7714 for (collpos = collstartpos; collpos<collendpos; ++collpos) { 7715 x = charmapencode_output('?', mapping, res, respos); 7716 if (x==enc_EXCEPTION) { 7717 return -1; 7718 } 7719 else if (x==enc_FAILED) { 7720 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason); 7721 return -1; 7722 } 7723 } 7724 /* fall through */ 7725 case 3: /* ignore */ 7726 *inpos = collendpos; 7727 break; 7728 case 4: /* xmlcharrefreplace */ 7729 /* generate replacement (temporarily (mis)uses p) */ 7730 for (collpos = collstartpos; collpos < collendpos; ++collpos) { 7731 char buffer[2+29+1+1]; 7732 char *cp; 7733 sprintf(buffer, "&#%d;", (int)p[collpos]); 7734 for (cp = buffer; *cp; ++cp) { 7735 x = charmapencode_output(*cp, mapping, res, respos); 7736 if (x==enc_EXCEPTION) 7737 return -1; 7738 else if (x==enc_FAILED) { 7739 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason); 7740 return -1; 7741 } 7742 } 7743 } 7744 *inpos = collendpos; 7745 break; 7746 default: 7747 repunicode = unicode_encode_call_errorhandler(errors, errorHandler, 7748 encoding, reason, p, size, exceptionObject, 7749 collstartpos, collendpos, &newpos); 7750 if (repunicode == NULL) 7751 return -1; 7752 if (PyBytes_Check(repunicode)) { 7753 /* Directly copy bytes result to output. 
*/ 7754 Py_ssize_t outsize = PyBytes_Size(*res); 7755 Py_ssize_t requiredsize; 7756 repsize = PyBytes_Size(repunicode); 7757 requiredsize = *respos + repsize; 7758 if (requiredsize > outsize) 7759 /* Make room for all additional bytes. */ 7760 if (charmapencode_resize(res, respos, requiredsize)) { 7761 Py_DECREF(repunicode); 7762 return -1; 7763 } 7764 memcpy(PyBytes_AsString(*res) + *respos, 7765 PyBytes_AsString(repunicode), repsize); 7766 *respos += repsize; 7767 *inpos = newpos; 7768 Py_DECREF(repunicode); 7769 break; 7770 } 7771 /* generate replacement */ 7772 repsize = PyUnicode_GET_SIZE(repunicode); 7773 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) { 7774 x = charmapencode_output(*uni2, mapping, res, respos); 7775 if (x==enc_EXCEPTION) { 7776 return -1; 7777 } 7778 else if (x==enc_FAILED) { 7779 Py_DECREF(repunicode); 7780 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason); 7781 return -1; 7782 } 7783 } 7784 *inpos = newpos; 7785 Py_DECREF(repunicode); 7786 } 7787 return 0; 7788} 7789 7790PyObject * 7791PyUnicode_EncodeCharmap(const Py_UNICODE *p, 7792 Py_ssize_t size, 7793 PyObject *mapping, 7794 const char *errors) 7795{ 7796 /* output object */ 7797 PyObject *res = NULL; 7798 /* current input position */ 7799 Py_ssize_t inpos = 0; 7800 /* current output position */ 7801 Py_ssize_t respos = 0; 7802 PyObject *errorHandler = NULL; 7803 PyObject *exc = NULL; 7804 /* the following variable is used for caching string comparisons 7805 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 7806 * 3=ignore, 4=xmlcharrefreplace */ 7807 int known_errorHandler = -1; 7808 7809 /* Default to Latin-1 */ 7810 if (mapping == NULL) 7811 return PyUnicode_EncodeLatin1(p, size, errors); 7812 7813 /* allocate enough for a simple encoding without 7814 replacements, if we need more, we'll resize */ 7815 res = PyBytes_FromStringAndSize(NULL, size); 7816 if (res == NULL) 7817 goto onError; 7818 if (size == 0) 7819 
return res; 7820 7821 while (inpos<size) { 7822 /* try to encode it */ 7823 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos); 7824 if (x==enc_EXCEPTION) /* error */ 7825 goto onError; 7826 if (x==enc_FAILED) { /* unencodable character */ 7827 if (charmap_encoding_error(p, size, &inpos, mapping, 7828 &exc, 7829 &known_errorHandler, &errorHandler, errors, 7830 &res, &respos)) { 7831 goto onError; 7832 } 7833 } 7834 else 7835 /* done with this character => adjust input position */ 7836 ++inpos; 7837 } 7838 7839 /* Resize if we allocated to much */ 7840 if (respos<PyBytes_GET_SIZE(res)) 7841 if (_PyBytes_Resize(&res, respos) < 0) 7842 goto onError; 7843 7844 Py_XDECREF(exc); 7845 Py_XDECREF(errorHandler); 7846 return res; 7847 7848 onError: 7849 Py_XDECREF(res); 7850 Py_XDECREF(exc); 7851 Py_XDECREF(errorHandler); 7852 return NULL; 7853} 7854 7855PyObject * 7856PyUnicode_AsCharmapString(PyObject *unicode, 7857 PyObject *mapping) 7858{ 7859 if (!PyUnicode_Check(unicode) || mapping == NULL) { 7860 PyErr_BadArgument(); 7861 return NULL; 7862 } 7863 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode), 7864 PyUnicode_GET_SIZE(unicode), 7865 mapping, 7866 NULL); 7867} 7868 7869/* create or adjust a UnicodeTranslateError */ 7870static void 7871make_translate_exception(PyObject **exceptionObject, 7872 PyObject *unicode, 7873 Py_ssize_t startpos, Py_ssize_t endpos, 7874 const char *reason) 7875{ 7876 if (*exceptionObject == NULL) { 7877 *exceptionObject = _PyUnicodeTranslateError_Create( 7878 unicode, startpos, endpos, reason); 7879 } 7880 else { 7881 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos)) 7882 goto onError; 7883 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos)) 7884 goto onError; 7885 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason)) 7886 goto onError; 7887 return; 7888 onError: 7889 Py_DECREF(*exceptionObject); 7890 *exceptionObject = NULL; 7891 } 7892} 7893 7894/* raises a 
UnicodeTranslateError */ 7895static void 7896raise_translate_exception(PyObject **exceptionObject, 7897 PyObject *unicode, 7898 Py_ssize_t startpos, Py_ssize_t endpos, 7899 const char *reason) 7900{ 7901 make_translate_exception(exceptionObject, 7902 unicode, startpos, endpos, reason); 7903 if (*exceptionObject != NULL) 7904 PyCodec_StrictErrors(*exceptionObject); 7905} 7906 7907/* error handling callback helper: 7908 build arguments, call the callback and check the arguments, 7909 put the result into newpos and return the replacement string, which 7910 has to be freed by the caller */ 7911static PyObject * 7912unicode_translate_call_errorhandler(const char *errors, 7913 PyObject **errorHandler, 7914 const char *reason, 7915 PyObject *unicode, PyObject **exceptionObject, 7916 Py_ssize_t startpos, Py_ssize_t endpos, 7917 Py_ssize_t *newpos) 7918{ 7919 static char *argparse = "O!n;translating error handler must return (str, int) tuple"; 7920 7921 Py_ssize_t i_newpos; 7922 PyObject *restuple; 7923 PyObject *resunicode; 7924 7925 if (*errorHandler == NULL) { 7926 *errorHandler = PyCodec_LookupError(errors); 7927 if (*errorHandler == NULL) 7928 return NULL; 7929 } 7930 7931 make_translate_exception(exceptionObject, 7932 unicode, startpos, endpos, reason); 7933 if (*exceptionObject == NULL) 7934 return NULL; 7935 7936 restuple = PyObject_CallFunctionObjArgs( 7937 *errorHandler, *exceptionObject, NULL); 7938 if (restuple == NULL) 7939 return NULL; 7940 if (!PyTuple_Check(restuple)) { 7941 PyErr_SetString(PyExc_TypeError, &argparse[4]); 7942 Py_DECREF(restuple); 7943 return NULL; 7944 } 7945 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, 7946 &resunicode, &i_newpos)) { 7947 Py_DECREF(restuple); 7948 return NULL; 7949 } 7950 if (i_newpos<0) 7951 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos; 7952 else 7953 *newpos = i_newpos; 7954 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) { 7955 PyErr_Format(PyExc_IndexError, "position %zd from error handler out 
of bounds", *newpos); 7956 Py_DECREF(restuple); 7957 return NULL; 7958 } 7959 Py_INCREF(resunicode); 7960 Py_DECREF(restuple); 7961 return resunicode; 7962} 7963 7964/* Lookup the character ch in the mapping and put the result in result, 7965 which must be decrefed by the caller. 7966 Return 0 on success, -1 on error */ 7967static int 7968charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result) 7969{ 7970 PyObject *w = PyLong_FromLong((long)c); 7971 PyObject *x; 7972 7973 if (w == NULL) 7974 return -1; 7975 x = PyObject_GetItem(mapping, w); 7976 Py_DECREF(w); 7977 if (x == NULL) { 7978 if (PyErr_ExceptionMatches(PyExc_LookupError)) { 7979 /* No mapping found means: use 1:1 mapping. */ 7980 PyErr_Clear(); 7981 *result = NULL; 7982 return 0; 7983 } else 7984 return -1; 7985 } 7986 else if (x == Py_None) { 7987 *result = x; 7988 return 0; 7989 } 7990 else if (PyLong_Check(x)) { 7991 long value = PyLong_AS_LONG(x); 7992 long max = PyUnicode_GetMax(); 7993 if (value < 0 || value > max) { 7994 PyErr_Format(PyExc_TypeError, 7995 "character mapping must be in range(0x%x)", max+1); 7996 Py_DECREF(x); 7997 return -1; 7998 } 7999 *result = x; 8000 return 0; 8001 } 8002 else if (PyUnicode_Check(x)) { 8003 *result = x; 8004 return 0; 8005 } 8006 else { 8007 /* wrong return value */ 8008 PyErr_SetString(PyExc_TypeError, 8009 "character mapping must return integer, None or str"); 8010 Py_DECREF(x); 8011 return -1; 8012 } 8013} 8014/* ensure that *outobj is at least requiredsize characters long, 8015 if not reallocate and adjust various state variables. 
8016 Return 0 on success, -1 on error */ 8017static int 8018charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize, 8019 Py_ssize_t requiredsize) 8020{ 8021 Py_ssize_t oldsize = *psize; 8022 if (requiredsize > oldsize) { 8023 /* exponentially overallocate to minimize reallocations */ 8024 if (requiredsize < 2 * oldsize) 8025 requiredsize = 2 * oldsize; 8026 *outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4)); 8027 if (*outobj == 0) 8028 return -1; 8029 *psize = requiredsize; 8030 } 8031 return 0; 8032} 8033/* lookup the character, put the result in the output string and adjust 8034 various state variables. Return a new reference to the object that 8035 was put in the output buffer in *result, or Py_None, if the mapping was 8036 undefined (in which case no character was written). 8037 The called must decref result. 8038 Return 0 on success, -1 on error. */ 8039static int 8040charmaptranslate_output(PyObject *input, Py_ssize_t ipos, 8041 PyObject *mapping, Py_UCS4 **output, 8042 Py_ssize_t *osize, Py_ssize_t *opos, 8043 PyObject **res) 8044{ 8045 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos); 8046 if (charmaptranslate_lookup(curinp, mapping, res)) 8047 return -1; 8048 if (*res==NULL) { 8049 /* not found => default to 1:1 mapping */ 8050 (*output)[(*opos)++] = curinp; 8051 } 8052 else if (*res==Py_None) 8053 ; 8054 else if (PyLong_Check(*res)) { 8055 /* no overflow check, because we know that the space is enough */ 8056 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res); 8057 } 8058 else if (PyUnicode_Check(*res)) { 8059 Py_ssize_t repsize; 8060 if (PyUnicode_READY(*res) == -1) 8061 return -1; 8062 repsize = PyUnicode_GET_LENGTH(*res); 8063 if (repsize==1) { 8064 /* no overflow check, because we know that the space is enough */ 8065 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0); 8066 } 8067 else if (repsize!=0) { 8068 /* more than one character */ 8069 Py_ssize_t requiredsize = *opos + 8070 (PyUnicode_GET_LENGTH(input) - ipos) + 8071 
repsize - 1; 8072 Py_ssize_t i; 8073 if (charmaptranslate_makespace(output, osize, requiredsize)) 8074 return -1; 8075 for(i = 0; i < repsize; i++) 8076 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i); 8077 } 8078 } 8079 else 8080 return -1; 8081 return 0; 8082} 8083 8084PyObject * 8085_PyUnicode_TranslateCharmap(PyObject *input, 8086 PyObject *mapping, 8087 const char *errors) 8088{ 8089 /* input object */ 8090 char *idata; 8091 Py_ssize_t size, i; 8092 int kind; 8093 /* output buffer */ 8094 Py_UCS4 *output = NULL; 8095 Py_ssize_t osize; 8096 PyObject *res; 8097 /* current output position */ 8098 Py_ssize_t opos; 8099 char *reason = "character maps to <undefined>"; 8100 PyObject *errorHandler = NULL; 8101 PyObject *exc = NULL; 8102 /* the following variable is used for caching string comparisons 8103 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 8104 * 3=ignore, 4=xmlcharrefreplace */ 8105 int known_errorHandler = -1; 8106 8107 if (mapping == NULL) { 8108 PyErr_BadArgument(); 8109 return NULL; 8110 } 8111 8112 if (PyUnicode_READY(input) == -1) 8113 return NULL; 8114 idata = (char*)PyUnicode_DATA(input); 8115 kind = PyUnicode_KIND(input); 8116 size = PyUnicode_GET_LENGTH(input); 8117 i = 0; 8118 8119 if (size == 0) { 8120 Py_INCREF(input); 8121 return input; 8122 } 8123 8124 /* allocate enough for a simple 1:1 translation without 8125 replacements, if we need more, we'll resize */ 8126 osize = size; 8127 output = PyMem_Malloc(osize * sizeof(Py_UCS4)); 8128 opos = 0; 8129 if (output == NULL) { 8130 PyErr_NoMemory(); 8131 goto onError; 8132 } 8133 8134 while (i<size) { 8135 /* try to encode it */ 8136 PyObject *x = NULL; 8137 if (charmaptranslate_output(input, i, mapping, 8138 &output, &osize, &opos, &x)) { 8139 Py_XDECREF(x); 8140 goto onError; 8141 } 8142 Py_XDECREF(x); 8143 if (x!=Py_None) /* it worked => adjust input pointer */ 8144 ++i; 8145 else { /* untranslatable character */ 8146 PyObject *repunicode = NULL; /* initialize to prevent gcc warning 
*/ 8147 Py_ssize_t repsize; 8148 Py_ssize_t newpos; 8149 Py_ssize_t uni2; 8150 /* startpos for collecting untranslatable chars */ 8151 Py_ssize_t collstart = i; 8152 Py_ssize_t collend = i+1; 8153 Py_ssize_t coll; 8154 8155 /* find all untranslatable characters */ 8156 while (collend < size) { 8157 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x)) 8158 goto onError; 8159 Py_XDECREF(x); 8160 if (x!=Py_None) 8161 break; 8162 ++collend; 8163 } 8164 /* cache callback name lookup 8165 * (if not done yet, i.e. it's the first error) */ 8166 if (known_errorHandler==-1) { 8167 if ((errors==NULL) || (!strcmp(errors, "strict"))) 8168 known_errorHandler = 1; 8169 else if (!strcmp(errors, "replace")) 8170 known_errorHandler = 2; 8171 else if (!strcmp(errors, "ignore")) 8172 known_errorHandler = 3; 8173 else if (!strcmp(errors, "xmlcharrefreplace")) 8174 known_errorHandler = 4; 8175 else 8176 known_errorHandler = 0; 8177 } 8178 switch (known_errorHandler) { 8179 case 1: /* strict */ 8180 raise_translate_exception(&exc, input, collstart, 8181 collend, reason); 8182 goto onError; 8183 case 2: /* replace */ 8184 /* No need to check for space, this is a 1:1 replacement */ 8185 for (coll = collstart; coll<collend; coll++) 8186 output[opos++] = '?'; 8187 /* fall through */ 8188 case 3: /* ignore */ 8189 i = collend; 8190 break; 8191 case 4: /* xmlcharrefreplace */ 8192 /* generate replacement (temporarily (mis)uses i) */ 8193 for (i = collstart; i < collend; ++i) { 8194 char buffer[2+29+1+1]; 8195 char *cp; 8196 sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i)); 8197 if (charmaptranslate_makespace(&output, &osize, 8198 opos+strlen(buffer)+(size-collend))) 8199 goto onError; 8200 for (cp = buffer; *cp; ++cp) 8201 output[opos++] = *cp; 8202 } 8203 i = collend; 8204 break; 8205 default: 8206 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler, 8207 reason, input, &exc, 8208 collstart, collend, &newpos); 8209 if (repunicode == NULL || 
_PyUnicode_READY_REPLACE(&repunicode)) 8210 goto onError; 8211 /* generate replacement */ 8212 repsize = PyUnicode_GET_LENGTH(repunicode); 8213 if (charmaptranslate_makespace(&output, &osize, 8214 opos+repsize+(size-collend))) { 8215 Py_DECREF(repunicode); 8216 goto onError; 8217 } 8218 for (uni2 = 0; repsize-->0; ++uni2) 8219 output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2); 8220 i = newpos; 8221 Py_DECREF(repunicode); 8222 } 8223 } 8224 } 8225 res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos); 8226 if (!res) 8227 goto onError; 8228 PyMem_Free(output); 8229 Py_XDECREF(exc); 8230 Py_XDECREF(errorHandler); 8231 return res; 8232 8233 onError: 8234 PyMem_Free(output); 8235 Py_XDECREF(exc); 8236 Py_XDECREF(errorHandler); 8237 return NULL; 8238} 8239 8240/* Deprecated. Use PyUnicode_Translate instead. */ 8241PyObject * 8242PyUnicode_TranslateCharmap(const Py_UNICODE *p, 8243 Py_ssize_t size, 8244 PyObject *mapping, 8245 const char *errors) 8246{ 8247 PyObject *unicode = PyUnicode_FromUnicode(p, size); 8248 if (!unicode) 8249 return NULL; 8250 return _PyUnicode_TranslateCharmap(unicode, mapping, errors); 8251} 8252 8253PyObject * 8254PyUnicode_Translate(PyObject *str, 8255 PyObject *mapping, 8256 const char *errors) 8257{ 8258 PyObject *result; 8259 8260 str = PyUnicode_FromObject(str); 8261 if (str == NULL) 8262 goto onError; 8263 result = _PyUnicode_TranslateCharmap(str, mapping, errors); 8264 Py_DECREF(str); 8265 return result; 8266 8267 onError: 8268 Py_XDECREF(str); 8269 return NULL; 8270} 8271 8272static Py_UCS4 8273fix_decimal_and_space_to_ascii(PyObject *self) 8274{ 8275 /* No need to call PyUnicode_READY(self) because this function is only 8276 called as a callback from fixup() which does it already. 
*/ 8277 const Py_ssize_t len = PyUnicode_GET_LENGTH(self); 8278 const int kind = PyUnicode_KIND(self); 8279 void *data = PyUnicode_DATA(self); 8280 Py_UCS4 maxchar = 0, ch, fixed; 8281 Py_ssize_t i; 8282 8283 for (i = 0; i < len; ++i) { 8284 ch = PyUnicode_READ(kind, data, i); 8285 fixed = 0; 8286 if (ch > 127) { 8287 if (Py_UNICODE_ISSPACE(ch)) 8288 fixed = ' '; 8289 else { 8290 const int decimal = Py_UNICODE_TODECIMAL(ch); 8291 if (decimal >= 0) 8292 fixed = '0' + decimal; 8293 } 8294 if (fixed != 0) { 8295 if (fixed > maxchar) 8296 maxchar = fixed; 8297 PyUnicode_WRITE(kind, data, i, fixed); 8298 } 8299 else if (ch > maxchar) 8300 maxchar = ch; 8301 } 8302 else if (ch > maxchar) 8303 maxchar = ch; 8304 } 8305 8306 return maxchar; 8307} 8308 8309PyObject * 8310_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode) 8311{ 8312 if (!PyUnicode_Check(unicode)) { 8313 PyErr_BadInternalCall(); 8314 return NULL; 8315 } 8316 if (PyUnicode_READY(unicode) == -1) 8317 return NULL; 8318 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) { 8319 /* If the string is already ASCII, just return the same string */ 8320 Py_INCREF(unicode); 8321 return unicode; 8322 } 8323 return fixup(unicode, fix_decimal_and_space_to_ascii); 8324} 8325 8326PyObject * 8327PyUnicode_TransformDecimalToASCII(Py_UNICODE *s, 8328 Py_ssize_t length) 8329{ 8330 PyObject *result; 8331 Py_UNICODE *p; /* write pointer into result */ 8332 Py_ssize_t i; 8333 /* Copy to a new string */ 8334 result = (PyObject *)_PyUnicode_New(length); 8335 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(result), s, length); 8336 if (result == NULL) 8337 return result; 8338 p = PyUnicode_AS_UNICODE(result); 8339 /* Iterate over code points */ 8340 for (i = 0; i < length; i++) { 8341 Py_UNICODE ch =s[i]; 8342 if (ch > 127) { 8343 int decimal = Py_UNICODE_TODECIMAL(ch); 8344 if (decimal >= 0) 8345 p[i] = '0' + decimal; 8346 } 8347 } 8348#ifndef DONT_MAKE_RESULT_READY 8349 if (_PyUnicode_READY_REPLACE(&result)) { 8350 Py_DECREF(result); 8351 
return NULL; 8352 } 8353#endif 8354 assert(_PyUnicode_CheckConsistency(result, 1)); 8355 return result; 8356} 8357/* --- Decimal Encoder ---------------------------------------------------- */ 8358 8359int 8360PyUnicode_EncodeDecimal(Py_UNICODE *s, 8361 Py_ssize_t length, 8362 char *output, 8363 const char *errors) 8364{ 8365 Py_UNICODE *p, *end; 8366 PyObject *errorHandler = NULL; 8367 PyObject *exc = NULL; 8368 const char *encoding = "decimal"; 8369 const char *reason = "invalid decimal Unicode string"; 8370 /* the following variable is used for caching string comparisons 8371 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */ 8372 int known_errorHandler = -1; 8373 8374 if (output == NULL) { 8375 PyErr_BadArgument(); 8376 return -1; 8377 } 8378 8379 p = s; 8380 end = s + length; 8381 while (p < end) { 8382 register Py_UNICODE ch = *p; 8383 int decimal; 8384 PyObject *repunicode; 8385 Py_ssize_t repsize; 8386 Py_ssize_t newpos; 8387 Py_UNICODE *uni2; 8388 Py_UNICODE *collstart; 8389 Py_UNICODE *collend; 8390 8391 if (Py_UNICODE_ISSPACE(ch)) { 8392 *output++ = ' '; 8393 ++p; 8394 continue; 8395 } 8396 decimal = Py_UNICODE_TODECIMAL(ch); 8397 if (decimal >= 0) { 8398 *output++ = '0' + decimal; 8399 ++p; 8400 continue; 8401 } 8402 if (0 < ch && ch < 256) { 8403 *output++ = (char)ch; 8404 ++p; 8405 continue; 8406 } 8407 /* All other characters are considered unencodable */ 8408 collstart = p; 8409 collend = p+1; 8410 while (collend < end) { 8411 if ((0 < *collend && *collend < 256) || 8412 !Py_UNICODE_ISSPACE(*collend) || 8413 Py_UNICODE_TODECIMAL(*collend)) 8414 break; 8415 } 8416 /* cache callback name lookup 8417 * (if not done yet, i.e. 
it's the first error) */ 8418 if (known_errorHandler==-1) { 8419 if ((errors==NULL) || (!strcmp(errors, "strict"))) 8420 known_errorHandler = 1; 8421 else if (!strcmp(errors, "replace")) 8422 known_errorHandler = 2; 8423 else if (!strcmp(errors, "ignore")) 8424 known_errorHandler = 3; 8425 else if (!strcmp(errors, "xmlcharrefreplace")) 8426 known_errorHandler = 4; 8427 else 8428 known_errorHandler = 0; 8429 } 8430 switch (known_errorHandler) { 8431 case 1: /* strict */ 8432 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason); 8433 goto onError; 8434 case 2: /* replace */ 8435 for (p = collstart; p < collend; ++p) 8436 *output++ = '?'; 8437 /* fall through */ 8438 case 3: /* ignore */ 8439 p = collend; 8440 break; 8441 case 4: /* xmlcharrefreplace */ 8442 /* generate replacement (temporarily (mis)uses p) */ 8443 for (p = collstart; p < collend; ++p) 8444 output += sprintf(output, "&#%d;", (int)*p); 8445 p = collend; 8446 break; 8447 default: 8448 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler, 8449 encoding, reason, s, length, &exc, 8450 collstart-s, collend-s, &newpos); 8451 if (repunicode == NULL) 8452 goto onError; 8453 if (!PyUnicode_Check(repunicode)) { 8454 /* Byte results not supported, since they have no decimal property. 
*/ 8455 PyErr_SetString(PyExc_TypeError, "error handler should return unicode"); 8456 Py_DECREF(repunicode); 8457 goto onError; 8458 } 8459 /* generate replacement */ 8460 repsize = PyUnicode_GET_SIZE(repunicode); 8461 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) { 8462 Py_UNICODE ch = *uni2; 8463 if (Py_UNICODE_ISSPACE(ch)) 8464 *output++ = ' '; 8465 else { 8466 decimal = Py_UNICODE_TODECIMAL(ch); 8467 if (decimal >= 0) 8468 *output++ = '0' + decimal; 8469 else if (0 < ch && ch < 256) 8470 *output++ = (char)ch; 8471 else { 8472 Py_DECREF(repunicode); 8473 raise_encode_exception(&exc, encoding, 8474 s, length, collstart-s, collend-s, reason); 8475 goto onError; 8476 } 8477 } 8478 } 8479 p = s + newpos; 8480 Py_DECREF(repunicode); 8481 } 8482 } 8483 /* 0-terminate the output string */ 8484 *output++ = '\0'; 8485 Py_XDECREF(exc); 8486 Py_XDECREF(errorHandler); 8487 return 0; 8488 8489 onError: 8490 Py_XDECREF(exc); 8491 Py_XDECREF(errorHandler); 8492 return -1; 8493} 8494 8495/* --- Helpers ------------------------------------------------------------ */ 8496 8497#include "stringlib/asciilib.h" 8498#include "stringlib/fastsearch.h" 8499#include "stringlib/partition.h" 8500#include "stringlib/split.h" 8501#include "stringlib/count.h" 8502#include "stringlib/find.h" 8503#include "stringlib/localeutil.h" 8504#include "stringlib/undef.h" 8505 8506#include "stringlib/ucs1lib.h" 8507#include "stringlib/fastsearch.h" 8508#include "stringlib/partition.h" 8509#include "stringlib/split.h" 8510#include "stringlib/count.h" 8511#include "stringlib/find.h" 8512#include "stringlib/localeutil.h" 8513#include "stringlib/undef.h" 8514 8515#include "stringlib/ucs2lib.h" 8516#include "stringlib/fastsearch.h" 8517#include "stringlib/partition.h" 8518#include "stringlib/split.h" 8519#include "stringlib/count.h" 8520#include "stringlib/find.h" 8521#include "stringlib/localeutil.h" 8522#include "stringlib/undef.h" 8523 8524#include "stringlib/ucs4lib.h" 8525#include 
"stringlib/fastsearch.h" 8526#include "stringlib/partition.h" 8527#include "stringlib/split.h" 8528#include "stringlib/count.h" 8529#include "stringlib/find.h" 8530#include "stringlib/localeutil.h" 8531#include "stringlib/undef.h" 8532 8533static Py_ssize_t 8534any_find_slice(int direction, PyObject* s1, PyObject* s2, 8535 Py_ssize_t start, 8536 Py_ssize_t end) 8537{ 8538 int kind1, kind2, kind; 8539 void *buf1, *buf2; 8540 Py_ssize_t len1, len2, result; 8541 8542 kind1 = PyUnicode_KIND(s1); 8543 kind2 = PyUnicode_KIND(s2); 8544 kind = kind1 > kind2 ? kind1 : kind2; 8545 buf1 = PyUnicode_DATA(s1); 8546 buf2 = PyUnicode_DATA(s2); 8547 if (kind1 != kind) 8548 buf1 = _PyUnicode_AsKind(s1, kind); 8549 if (!buf1) 8550 return -2; 8551 if (kind2 != kind) 8552 buf2 = _PyUnicode_AsKind(s2, kind); 8553 if (!buf2) { 8554 if (kind1 != kind) PyMem_Free(buf1); 8555 return -2; 8556 } 8557 len1 = PyUnicode_GET_LENGTH(s1); 8558 len2 = PyUnicode_GET_LENGTH(s2); 8559 8560 if (direction > 0) { 8561 switch(kind) { 8562 case PyUnicode_1BYTE_KIND: 8563 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2)) 8564 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end); 8565 else 8566 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end); 8567 break; 8568 case PyUnicode_2BYTE_KIND: 8569 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end); 8570 break; 8571 case PyUnicode_4BYTE_KIND: 8572 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end); 8573 break; 8574 default: 8575 assert(0); result = -2; 8576 } 8577 } 8578 else { 8579 switch(kind) { 8580 case PyUnicode_1BYTE_KIND: 8581 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2)) 8582 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end); 8583 else 8584 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end); 8585 break; 8586 case PyUnicode_2BYTE_KIND: 8587 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end); 8588 break; 8589 case PyUnicode_4BYTE_KIND: 8590 result = 
ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end); 8591 break; 8592 default: 8593 assert(0); result = -2; 8594 } 8595 } 8596 8597 if (kind1 != kind) 8598 PyMem_Free(buf1); 8599 if (kind2 != kind) 8600 PyMem_Free(buf2); 8601 8602 return result; 8603} 8604 8605Py_ssize_t 8606_PyUnicode_InsertThousandsGrouping(PyObject *unicode, int kind, void *data, 8607 Py_ssize_t n_buffer, 8608 void *digits, Py_ssize_t n_digits, 8609 Py_ssize_t min_width, 8610 const char *grouping, 8611 const char *thousands_sep) 8612{ 8613 switch(kind) { 8614 case PyUnicode_1BYTE_KIND: 8615 if (unicode != NULL && PyUnicode_IS_ASCII(unicode)) 8616 return _PyUnicode_ascii_InsertThousandsGrouping( 8617 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits, 8618 min_width, grouping, thousands_sep); 8619 else 8620 return _PyUnicode_ucs1_InsertThousandsGrouping( 8621 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits, 8622 min_width, grouping, thousands_sep); 8623 case PyUnicode_2BYTE_KIND: 8624 return _PyUnicode_ucs2_InsertThousandsGrouping( 8625 (Py_UCS2*)data, n_buffer, (Py_UCS2*)digits, n_digits, 8626 min_width, grouping, thousands_sep); 8627 case PyUnicode_4BYTE_KIND: 8628 return _PyUnicode_ucs4_InsertThousandsGrouping( 8629 (Py_UCS4*)data, n_buffer, (Py_UCS4*)digits, n_digits, 8630 min_width, grouping, thousands_sep); 8631 } 8632 assert(0); 8633 return -1; 8634} 8635 8636 8637#include "stringlib/unicodedefs.h" 8638#include "stringlib/fastsearch.h" 8639 8640#include "stringlib/count.h" 8641#include "stringlib/find.h" 8642 8643/* helper macro to fixup start/end slice values */ 8644#define ADJUST_INDICES(start, end, len) \ 8645 if (end > len) \ 8646 end = len; \ 8647 else if (end < 0) { \ 8648 end += len; \ 8649 if (end < 0) \ 8650 end = 0; \ 8651 } \ 8652 if (start < 0) { \ 8653 start += len; \ 8654 if (start < 0) \ 8655 start = 0; \ 8656 } 8657 8658Py_ssize_t 8659PyUnicode_Count(PyObject *str, 8660 PyObject *substr, 8661 Py_ssize_t start, 8662 Py_ssize_t end) 8663{ 8664 Py_ssize_t result; 
8665 PyUnicodeObject* str_obj; 8666 PyUnicodeObject* sub_obj; 8667 int kind1, kind2, kind; 8668 void *buf1 = NULL, *buf2 = NULL; 8669 Py_ssize_t len1, len2; 8670 8671 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str); 8672 if (!str_obj || PyUnicode_READY(str_obj) == -1) 8673 return -1; 8674 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr); 8675 if (!sub_obj || PyUnicode_READY(sub_obj) == -1) { 8676 Py_DECREF(str_obj); 8677 return -1; 8678 } 8679 8680 kind1 = PyUnicode_KIND(str_obj); 8681 kind2 = PyUnicode_KIND(sub_obj); 8682 kind = kind1 > kind2 ? kind1 : kind2; 8683 buf1 = PyUnicode_DATA(str_obj); 8684 if (kind1 != kind) 8685 buf1 = _PyUnicode_AsKind((PyObject*)str_obj, kind); 8686 if (!buf1) 8687 goto onError; 8688 buf2 = PyUnicode_DATA(sub_obj); 8689 if (kind2 != kind) 8690 buf2 = _PyUnicode_AsKind((PyObject*)sub_obj, kind); 8691 if (!buf2) 8692 goto onError; 8693 len1 = PyUnicode_GET_LENGTH(str_obj); 8694 len2 = PyUnicode_GET_LENGTH(sub_obj); 8695 8696 ADJUST_INDICES(start, end, len1); 8697 switch(kind) { 8698 case PyUnicode_1BYTE_KIND: 8699 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sub_obj)) 8700 result = asciilib_count( 8701 ((Py_UCS1*)buf1) + start, end - start, 8702 buf2, len2, PY_SSIZE_T_MAX 8703 ); 8704 else 8705 result = ucs1lib_count( 8706 ((Py_UCS1*)buf1) + start, end - start, 8707 buf2, len2, PY_SSIZE_T_MAX 8708 ); 8709 break; 8710 case PyUnicode_2BYTE_KIND: 8711 result = ucs2lib_count( 8712 ((Py_UCS2*)buf1) + start, end - start, 8713 buf2, len2, PY_SSIZE_T_MAX 8714 ); 8715 break; 8716 case PyUnicode_4BYTE_KIND: 8717 result = ucs4lib_count( 8718 ((Py_UCS4*)buf1) + start, end - start, 8719 buf2, len2, PY_SSIZE_T_MAX 8720 ); 8721 break; 8722 default: 8723 assert(0); result = 0; 8724 } 8725 8726 Py_DECREF(sub_obj); 8727 Py_DECREF(str_obj); 8728 8729 if (kind1 != kind) 8730 PyMem_Free(buf1); 8731 if (kind2 != kind) 8732 PyMem_Free(buf2); 8733 8734 return result; 8735 onError: 8736 Py_DECREF(sub_obj); 8737 Py_DECREF(str_obj); 8738 
if (kind1 != kind && buf1) 8739 PyMem_Free(buf1); 8740 if (kind2 != kind && buf2) 8741 PyMem_Free(buf2); 8742 return -1; 8743} 8744 8745Py_ssize_t 8746PyUnicode_Find(PyObject *str, 8747 PyObject *sub, 8748 Py_ssize_t start, 8749 Py_ssize_t end, 8750 int direction) 8751{ 8752 Py_ssize_t result; 8753 8754 str = PyUnicode_FromObject(str); 8755 if (!str || PyUnicode_READY(str) == -1) 8756 return -2; 8757 sub = PyUnicode_FromObject(sub); 8758 if (!sub || PyUnicode_READY(sub) == -1) { 8759 Py_DECREF(str); 8760 return -2; 8761 } 8762 8763 result = any_find_slice(direction, 8764 str, sub, start, end 8765 ); 8766 8767 Py_DECREF(str); 8768 Py_DECREF(sub); 8769 8770 return result; 8771} 8772 8773Py_ssize_t 8774PyUnicode_FindChar(PyObject *str, Py_UCS4 ch, 8775 Py_ssize_t start, Py_ssize_t end, 8776 int direction) 8777{ 8778 char *result; 8779 int kind; 8780 if (PyUnicode_READY(str) == -1) 8781 return -2; 8782 if (start < 0 || end < 0) { 8783 PyErr_SetString(PyExc_IndexError, "string index out of range"); 8784 return -2; 8785 } 8786 if (end > PyUnicode_GET_LENGTH(str)) 8787 end = PyUnicode_GET_LENGTH(str); 8788 kind = PyUnicode_KIND(str); 8789 result = findchar(PyUnicode_1BYTE_DATA(str) 8790 + kind*start, 8791 kind, 8792 end-start, ch, direction); 8793 if (!result) 8794 return -1; 8795 return (result-(char*)PyUnicode_DATA(str)) >> (kind-1); 8796} 8797 8798static int 8799tailmatch(PyUnicodeObject *self, 8800 PyUnicodeObject *substring, 8801 Py_ssize_t start, 8802 Py_ssize_t end, 8803 int direction) 8804{ 8805 int kind_self; 8806 int kind_sub; 8807 void *data_self; 8808 void *data_sub; 8809 Py_ssize_t offset; 8810 Py_ssize_t i; 8811 Py_ssize_t end_sub; 8812 8813 if (PyUnicode_READY(self) == -1 || 8814 PyUnicode_READY(substring) == -1) 8815 return 0; 8816 8817 if (PyUnicode_GET_LENGTH(substring) == 0) 8818 return 1; 8819 8820 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self)); 8821 end -= PyUnicode_GET_LENGTH(substring); 8822 if (end < start) 8823 return 0; 8824 8825 
kind_self = PyUnicode_KIND(self); 8826 data_self = PyUnicode_DATA(self); 8827 kind_sub = PyUnicode_KIND(substring); 8828 data_sub = PyUnicode_DATA(substring); 8829 end_sub = PyUnicode_GET_LENGTH(substring) - 1; 8830 8831 if (direction > 0) 8832 offset = end; 8833 else 8834 offset = start; 8835 8836 if (PyUnicode_READ(kind_self, data_self, offset) == 8837 PyUnicode_READ(kind_sub, data_sub, 0) && 8838 PyUnicode_READ(kind_self, data_self, offset + end_sub) == 8839 PyUnicode_READ(kind_sub, data_sub, end_sub)) { 8840 /* If both are of the same kind, memcmp is sufficient */ 8841 if (kind_self == kind_sub) { 8842 return ! memcmp((char *)data_self + 8843 (offset * PyUnicode_KIND(substring)), 8844 data_sub, 8845 PyUnicode_GET_LENGTH(substring) * 8846 PyUnicode_KIND(substring)); 8847 } 8848 /* otherwise we have to compare each character by first accesing it */ 8849 else { 8850 /* We do not need to compare 0 and len(substring)-1 because 8851 the if statement above ensured already that they are equal 8852 when we end up here. 
*/ 8853 // TODO: honor direction and do a forward or backwards search 8854 for (i = 1; i < end_sub; ++i) { 8855 if (PyUnicode_READ(kind_self, data_self, offset + i) != 8856 PyUnicode_READ(kind_sub, data_sub, i)) 8857 return 0; 8858 } 8859 return 1; 8860 } 8861 } 8862 8863 return 0; 8864} 8865 8866Py_ssize_t 8867PyUnicode_Tailmatch(PyObject *str, 8868 PyObject *substr, 8869 Py_ssize_t start, 8870 Py_ssize_t end, 8871 int direction) 8872{ 8873 Py_ssize_t result; 8874 8875 str = PyUnicode_FromObject(str); 8876 if (str == NULL) 8877 return -1; 8878 substr = PyUnicode_FromObject(substr); 8879 if (substr == NULL) { 8880 Py_DECREF(str); 8881 return -1; 8882 } 8883 8884 result = tailmatch((PyUnicodeObject *)str, 8885 (PyUnicodeObject *)substr, 8886 start, end, direction); 8887 Py_DECREF(str); 8888 Py_DECREF(substr); 8889 return result; 8890} 8891 8892/* Apply fixfct filter to the Unicode object self and return a 8893 reference to the modified object */ 8894 8895static PyObject * 8896fixup(PyObject *self, 8897 Py_UCS4 (*fixfct)(PyObject *s)) 8898{ 8899 PyObject *u; 8900 Py_UCS4 maxchar_old, maxchar_new = 0; 8901 8902 if (PyUnicode_READY(self) == -1) 8903 return NULL; 8904 maxchar_old = PyUnicode_MAX_CHAR_VALUE(self); 8905 u = PyUnicode_New(PyUnicode_GET_LENGTH(self), 8906 maxchar_old); 8907 if (u == NULL) 8908 return NULL; 8909 8910 Py_MEMCPY(PyUnicode_1BYTE_DATA(u), PyUnicode_1BYTE_DATA(self), 8911 PyUnicode_GET_LENGTH(u) * PyUnicode_KIND(u)); 8912 8913 /* fix functions return the new maximum character in a string, 8914 if the kind of the resulting unicode object does not change, 8915 everything is fine. Otherwise we need to change the string kind 8916 and re-run the fix function. */ 8917 maxchar_new = fixfct(u); 8918 if (maxchar_new == 0) 8919 /* do nothing, keep maxchar_new at 0 which means no changes. 
*/; 8920 else if (maxchar_new <= 127) 8921 maxchar_new = 127; 8922 else if (maxchar_new <= 255) 8923 maxchar_new = 255; 8924 else if (maxchar_new <= 65535) 8925 maxchar_new = 65535; 8926 else 8927 maxchar_new = 1114111; /* 0x10ffff */ 8928 8929 if (!maxchar_new && PyUnicode_CheckExact(self)) { 8930 /* fixfct should return TRUE if it modified the buffer. If 8931 FALSE, return a reference to the original buffer instead 8932 (to save space, not time) */ 8933 Py_INCREF(self); 8934 Py_DECREF(u); 8935 return (PyObject*) self; 8936 } 8937 else if (maxchar_new == maxchar_old) { 8938 return u; 8939 } 8940 else { 8941 /* In case the maximum character changed, we need to 8942 convert the string to the new category. */ 8943 PyObject *v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new); 8944 if (v == NULL) { 8945 Py_DECREF(u); 8946 return NULL; 8947 } 8948 if (maxchar_new > maxchar_old) { 8949 /* If the maxchar increased so that the kind changed, not all 8950 characters are representable anymore and we need to fix the 8951 string again. This only happens in very few cases. */ 8952 copy_characters(v, 0, self, 0, PyUnicode_GET_LENGTH(self)); 8953 maxchar_old = fixfct(v); 8954 assert(maxchar_old > 0 && maxchar_old <= maxchar_new); 8955 } 8956 else { 8957 copy_characters(v, 0, u, 0, PyUnicode_GET_LENGTH(self)); 8958 } 8959 8960 Py_DECREF(u); 8961 assert(_PyUnicode_CheckConsistency(v, 1)); 8962 return v; 8963 } 8964} 8965 8966static Py_UCS4 8967fixupper(PyObject *self) 8968{ 8969 /* No need to call PyUnicode_READY(self) because this function is only 8970 called as a callback from fixup() which does it already. 
*/ 8971 const Py_ssize_t len = PyUnicode_GET_LENGTH(self); 8972 const int kind = PyUnicode_KIND(self); 8973 void *data = PyUnicode_DATA(self); 8974 int touched = 0; 8975 Py_UCS4 maxchar = 0; 8976 Py_ssize_t i; 8977 8978 for (i = 0; i < len; ++i) { 8979 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); 8980 const Py_UCS4 up = Py_UNICODE_TOUPPER(ch); 8981 if (up != ch) { 8982 if (up > maxchar) 8983 maxchar = up; 8984 PyUnicode_WRITE(kind, data, i, up); 8985 touched = 1; 8986 } 8987 else if (ch > maxchar) 8988 maxchar = ch; 8989 } 8990 8991 if (touched) 8992 return maxchar; 8993 else 8994 return 0; 8995} 8996 8997static Py_UCS4 8998fixlower(PyObject *self) 8999{ 9000 /* No need to call PyUnicode_READY(self) because fixup() which does it. */ 9001 const Py_ssize_t len = PyUnicode_GET_LENGTH(self); 9002 const int kind = PyUnicode_KIND(self); 9003 void *data = PyUnicode_DATA(self); 9004 int touched = 0; 9005 Py_UCS4 maxchar = 0; 9006 Py_ssize_t i; 9007 9008 for(i = 0; i < len; ++i) { 9009 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); 9010 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch); 9011 if (lo != ch) { 9012 if (lo > maxchar) 9013 maxchar = lo; 9014 PyUnicode_WRITE(kind, data, i, lo); 9015 touched = 1; 9016 } 9017 else if (ch > maxchar) 9018 maxchar = ch; 9019 } 9020 9021 if (touched) 9022 return maxchar; 9023 else 9024 return 0; 9025} 9026 9027static Py_UCS4 9028fixswapcase(PyObject *self) 9029{ 9030 /* No need to call PyUnicode_READY(self) because fixup() which does it. 
*/ 9031 const Py_ssize_t len = PyUnicode_GET_LENGTH(self); 9032 const int kind = PyUnicode_KIND(self); 9033 void *data = PyUnicode_DATA(self); 9034 int touched = 0; 9035 Py_UCS4 maxchar = 0; 9036 Py_ssize_t i; 9037 9038 for(i = 0; i < len; ++i) { 9039 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); 9040 Py_UCS4 nu = 0; 9041 9042 if (Py_UNICODE_ISUPPER(ch)) 9043 nu = Py_UNICODE_TOLOWER(ch); 9044 else if (Py_UNICODE_ISLOWER(ch)) 9045 nu = Py_UNICODE_TOUPPER(ch); 9046 9047 if (nu != 0) { 9048 if (nu > maxchar) 9049 maxchar = nu; 9050 PyUnicode_WRITE(kind, data, i, nu); 9051 touched = 1; 9052 } 9053 else if (ch > maxchar) 9054 maxchar = ch; 9055 } 9056 9057 if (touched) 9058 return maxchar; 9059 else 9060 return 0; 9061} 9062 9063static Py_UCS4 9064fixcapitalize(PyObject *self) 9065{ 9066 /* No need to call PyUnicode_READY(self) because fixup() which does it. */ 9067 const Py_ssize_t len = PyUnicode_GET_LENGTH(self); 9068 const int kind = PyUnicode_KIND(self); 9069 void *data = PyUnicode_DATA(self); 9070 int touched = 0; 9071 Py_UCS4 maxchar = 0; 9072 Py_ssize_t i = 0; 9073 Py_UCS4 ch; 9074 9075 if (len == 0) 9076 return 0; 9077 9078 ch = PyUnicode_READ(kind, data, i); 9079 if (!Py_UNICODE_ISUPPER(ch)) { 9080 maxchar = Py_UNICODE_TOUPPER(ch); 9081 PyUnicode_WRITE(kind, data, i, maxchar); 9082 touched = 1; 9083 } 9084 ++i; 9085 for(; i < len; ++i) { 9086 ch = PyUnicode_READ(kind, data, i); 9087 if (!Py_UNICODE_ISLOWER(ch)) { 9088 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch); 9089 if (lo > maxchar) 9090 maxchar = lo; 9091 PyUnicode_WRITE(kind, data, i, lo); 9092 touched = 1; 9093 } 9094 else if (ch > maxchar) 9095 maxchar = ch; 9096 } 9097 9098 if (touched) 9099 return maxchar; 9100 else 9101 return 0; 9102} 9103 9104static Py_UCS4 9105fixtitle(PyObject *self) 9106{ 9107 /* No need to call PyUnicode_READY(self) because fixup() which does it. 
*/ 9108 const Py_ssize_t len = PyUnicode_GET_LENGTH(self); 9109 const int kind = PyUnicode_KIND(self); 9110 void *data = PyUnicode_DATA(self); 9111 Py_UCS4 maxchar = 0; 9112 Py_ssize_t i = 0; 9113 int previous_is_cased; 9114 9115 /* Shortcut for single character strings */ 9116 if (len == 1) { 9117 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); 9118 const Py_UCS4 ti = Py_UNICODE_TOTITLE(ch); 9119 if (ti != ch) { 9120 PyUnicode_WRITE(kind, data, i, ti); 9121 return ti; 9122 } 9123 else 9124 return 0; 9125 } 9126 previous_is_cased = 0; 9127 for(; i < len; ++i) { 9128 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); 9129 Py_UCS4 nu; 9130 9131 if (previous_is_cased) 9132 nu = Py_UNICODE_TOLOWER(ch); 9133 else 9134 nu = Py_UNICODE_TOTITLE(ch); 9135 9136 if (nu > maxchar) 9137 maxchar = nu; 9138 PyUnicode_WRITE(kind, data, i, nu); 9139 9140 if (Py_UNICODE_ISLOWER(ch) || 9141 Py_UNICODE_ISUPPER(ch) || 9142 Py_UNICODE_ISTITLE(ch)) 9143 previous_is_cased = 1; 9144 else 9145 previous_is_cased = 0; 9146 } 9147 return maxchar; 9148} 9149 9150PyObject * 9151PyUnicode_Join(PyObject *separator, PyObject *seq) 9152{ 9153 PyObject *sep = NULL; 9154 Py_ssize_t seplen; 9155 PyObject *res = NULL; /* the result */ 9156 PyObject *fseq; /* PySequence_Fast(seq) */ 9157 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */ 9158 PyObject **items; 9159 PyObject *item; 9160 Py_ssize_t sz, i, res_offset; 9161 Py_UCS4 maxchar; 9162 Py_UCS4 item_maxchar; 9163 int use_memcpy; 9164 unsigned char *res_data = NULL, *sep_data = NULL; 9165 PyObject *last_obj; 9166 unsigned int kind = 0; 9167 9168 fseq = PySequence_Fast(seq, ""); 9169 if (fseq == NULL) { 9170 return NULL; 9171 } 9172 9173 /* NOTE: the following code can't call back into Python code, 9174 * so we are sure that fseq won't be mutated. 9175 */ 9176 9177 seqlen = PySequence_Fast_GET_SIZE(fseq); 9178 /* If empty sequence, return u"". 
*/ 9179 if (seqlen == 0) { 9180 Py_DECREF(fseq); 9181 Py_INCREF(unicode_empty); 9182 res = unicode_empty; 9183 return res; 9184 } 9185 9186 /* If singleton sequence with an exact Unicode, return that. */ 9187 last_obj = NULL; 9188 items = PySequence_Fast_ITEMS(fseq); 9189 if (seqlen == 1) { 9190 if (PyUnicode_CheckExact(items[0])) { 9191 res = items[0]; 9192 Py_INCREF(res); 9193 Py_DECREF(fseq); 9194 return res; 9195 } 9196 seplen = 0; 9197 maxchar = 0; 9198 } 9199 else { 9200 /* Set up sep and seplen */ 9201 if (separator == NULL) { 9202 /* fall back to a blank space separator */ 9203 sep = PyUnicode_FromOrdinal(' '); 9204 if (!sep) 9205 goto onError; 9206 seplen = 1; 9207 maxchar = 32; 9208 } 9209 else { 9210 if (!PyUnicode_Check(separator)) { 9211 PyErr_Format(PyExc_TypeError, 9212 "separator: expected str instance," 9213 " %.80s found", 9214 Py_TYPE(separator)->tp_name); 9215 goto onError; 9216 } 9217 if (PyUnicode_READY(separator)) 9218 goto onError; 9219 sep = separator; 9220 seplen = PyUnicode_GET_LENGTH(separator); 9221 maxchar = PyUnicode_MAX_CHAR_VALUE(separator); 9222 /* inc refcount to keep this code path symmetric with the 9223 above case of a blank separator */ 9224 Py_INCREF(sep); 9225 } 9226 last_obj = sep; 9227 } 9228 9229 /* There are at least two things to join, or else we have a subclass 9230 * of str in the sequence. 9231 * Do a pre-pass to figure out the total amount of space we'll 9232 * need (sz), and see whether all argument are strings. 
9233 */ 9234 sz = 0; 9235#ifdef Py_DEBUG 9236 use_memcpy = 0; 9237#else 9238 use_memcpy = 1; 9239#endif 9240 for (i = 0; i < seqlen; i++) { 9241 const Py_ssize_t old_sz = sz; 9242 item = items[i]; 9243 if (!PyUnicode_Check(item)) { 9244 PyErr_Format(PyExc_TypeError, 9245 "sequence item %zd: expected str instance," 9246 " %.80s found", 9247 i, Py_TYPE(item)->tp_name); 9248 goto onError; 9249 } 9250 if (PyUnicode_READY(item) == -1) 9251 goto onError; 9252 sz += PyUnicode_GET_LENGTH(item); 9253 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item); 9254 maxchar = Py_MAX(maxchar, item_maxchar); 9255 if (i != 0) 9256 sz += seplen; 9257 if (sz < old_sz || sz > PY_SSIZE_T_MAX) { 9258 PyErr_SetString(PyExc_OverflowError, 9259 "join() result is too long for a Python string"); 9260 goto onError; 9261 } 9262 if (use_memcpy && last_obj != NULL) { 9263 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item)) 9264 use_memcpy = 0; 9265 } 9266 last_obj = item; 9267 } 9268 9269 res = PyUnicode_New(sz, maxchar); 9270 if (res == NULL) 9271 goto onError; 9272 9273 /* Catenate everything. */ 9274#ifdef Py_DEBUG 9275 use_memcpy = 0; 9276#else 9277 if (use_memcpy) { 9278 res_data = PyUnicode_1BYTE_DATA(res); 9279 kind = PyUnicode_KIND(res); 9280 if (seplen != 0) 9281 sep_data = PyUnicode_1BYTE_DATA(sep); 9282 } 9283#endif 9284 for (i = 0, res_offset = 0; i < seqlen; ++i) { 9285 Py_ssize_t itemlen; 9286 item = items[i]; 9287 /* Copy item, and maybe the separator. 
*/ 9288 if (i && seplen != 0) { 9289 if (use_memcpy) { 9290 Py_MEMCPY(res_data, 9291 sep_data, 9292 kind * seplen); 9293 res_data += kind * seplen; 9294 } 9295 else { 9296 copy_characters(res, res_offset, sep, 0, seplen); 9297 res_offset += seplen; 9298 } 9299 } 9300 itemlen = PyUnicode_GET_LENGTH(item); 9301 if (itemlen != 0) { 9302 if (use_memcpy) { 9303 Py_MEMCPY(res_data, 9304 PyUnicode_DATA(item), 9305 kind * itemlen); 9306 res_data += kind * itemlen; 9307 } 9308 else { 9309 copy_characters(res, res_offset, item, 0, itemlen); 9310 res_offset += itemlen; 9311 } 9312 } 9313 } 9314 if (use_memcpy) 9315 assert(res_data == PyUnicode_1BYTE_DATA(res) 9316 + kind * PyUnicode_GET_LENGTH(res)); 9317 else 9318 assert(res_offset == PyUnicode_GET_LENGTH(res)); 9319 9320 Py_DECREF(fseq); 9321 Py_XDECREF(sep); 9322 assert(_PyUnicode_CheckConsistency(res, 1)); 9323 return res; 9324 9325 onError: 9326 Py_DECREF(fseq); 9327 Py_XDECREF(sep); 9328 Py_XDECREF(res); 9329 return NULL; 9330} 9331 9332#define FILL(kind, data, value, start, length) \ 9333 do { \ 9334 Py_ssize_t i_ = 0; \ 9335 assert(kind != PyUnicode_WCHAR_KIND); \ 9336 switch ((kind)) { \ 9337 case PyUnicode_1BYTE_KIND: { \ 9338 unsigned char * to_ = (unsigned char *)((data)) + (start); \ 9339 memset(to_, (unsigned char)value, length); \ 9340 break; \ 9341 } \ 9342 case PyUnicode_2BYTE_KIND: { \ 9343 Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \ 9344 for (; i_ < (length); ++i_, ++to_) *to_ = (value); \ 9345 break; \ 9346 } \ 9347 default: { \ 9348 Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \ 9349 for (; i_ < (length); ++i_, ++to_) *to_ = (value); \ 9350 break; \ 9351 } \ 9352 } \ 9353 } while (0) 9354 9355static PyObject * 9356pad(PyObject *self, 9357 Py_ssize_t left, 9358 Py_ssize_t right, 9359 Py_UCS4 fill) 9360{ 9361 PyObject *u; 9362 Py_UCS4 maxchar; 9363 int kind; 9364 void *data; 9365 9366 if (left < 0) 9367 left = 0; 9368 if (right < 0) 9369 right = 0; 9370 9371 if (left == 0 && right == 0 && 
PyUnicode_CheckExact(self)) { 9372 Py_INCREF(self); 9373 return self; 9374 } 9375 9376 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) || 9377 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) { 9378 PyErr_SetString(PyExc_OverflowError, "padded string is too long"); 9379 return NULL; 9380 } 9381 maxchar = PyUnicode_MAX_CHAR_VALUE(self); 9382 if (fill > maxchar) 9383 maxchar = fill; 9384 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar); 9385 if (!u) 9386 return NULL; 9387 9388 kind = PyUnicode_KIND(u); 9389 data = PyUnicode_DATA(u); 9390 if (left) 9391 FILL(kind, data, fill, 0, left); 9392 if (right) 9393 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right); 9394 copy_characters(u, left, self, 0, _PyUnicode_LENGTH(self)); 9395 assert(_PyUnicode_CheckConsistency(u, 1)); 9396 return u; 9397} 9398#undef FILL 9399 9400PyObject * 9401PyUnicode_Splitlines(PyObject *string, int keepends) 9402{ 9403 PyObject *list; 9404 9405 string = PyUnicode_FromObject(string); 9406 if (string == NULL || PyUnicode_READY(string) == -1) 9407 return NULL; 9408 9409 switch(PyUnicode_KIND(string)) { 9410 case PyUnicode_1BYTE_KIND: 9411 if (PyUnicode_IS_ASCII(string)) 9412 list = asciilib_splitlines( 9413 (PyObject*) string, PyUnicode_1BYTE_DATA(string), 9414 PyUnicode_GET_LENGTH(string), keepends); 9415 else 9416 list = ucs1lib_splitlines( 9417 (PyObject*) string, PyUnicode_1BYTE_DATA(string), 9418 PyUnicode_GET_LENGTH(string), keepends); 9419 break; 9420 case PyUnicode_2BYTE_KIND: 9421 list = ucs2lib_splitlines( 9422 (PyObject*) string, PyUnicode_2BYTE_DATA(string), 9423 PyUnicode_GET_LENGTH(string), keepends); 9424 break; 9425 case PyUnicode_4BYTE_KIND: 9426 list = ucs4lib_splitlines( 9427 (PyObject*) string, PyUnicode_4BYTE_DATA(string), 9428 PyUnicode_GET_LENGTH(string), keepends); 9429 break; 9430 default: 9431 assert(0); 9432 list = 0; 9433 } 9434 Py_DECREF(string); 9435 return list; 9436} 9437 9438static PyObject * 9439split(PyObject *self, 9440 
PyObject *substring, 9441 Py_ssize_t maxcount) 9442{ 9443 int kind1, kind2, kind; 9444 void *buf1, *buf2; 9445 Py_ssize_t len1, len2; 9446 PyObject* out; 9447 9448 if (maxcount < 0) 9449 maxcount = PY_SSIZE_T_MAX; 9450 9451 if (PyUnicode_READY(self) == -1) 9452 return NULL; 9453 9454 if (substring == NULL) 9455 switch(PyUnicode_KIND(self)) { 9456 case PyUnicode_1BYTE_KIND: 9457 if (PyUnicode_IS_ASCII(self)) 9458 return asciilib_split_whitespace( 9459 (PyObject*) self, PyUnicode_1BYTE_DATA(self), 9460 PyUnicode_GET_LENGTH(self), maxcount 9461 ); 9462 else 9463 return ucs1lib_split_whitespace( 9464 (PyObject*) self, PyUnicode_1BYTE_DATA(self), 9465 PyUnicode_GET_LENGTH(self), maxcount 9466 ); 9467 case PyUnicode_2BYTE_KIND: 9468 return ucs2lib_split_whitespace( 9469 (PyObject*) self, PyUnicode_2BYTE_DATA(self), 9470 PyUnicode_GET_LENGTH(self), maxcount 9471 ); 9472 case PyUnicode_4BYTE_KIND: 9473 return ucs4lib_split_whitespace( 9474 (PyObject*) self, PyUnicode_4BYTE_DATA(self), 9475 PyUnicode_GET_LENGTH(self), maxcount 9476 ); 9477 default: 9478 assert(0); 9479 return NULL; 9480 } 9481 9482 if (PyUnicode_READY(substring) == -1) 9483 return NULL; 9484 9485 kind1 = PyUnicode_KIND(self); 9486 kind2 = PyUnicode_KIND(substring); 9487 kind = kind1 > kind2 ? 
kind1 : kind2; 9488 buf1 = PyUnicode_DATA(self); 9489 buf2 = PyUnicode_DATA(substring); 9490 if (kind1 != kind) 9491 buf1 = _PyUnicode_AsKind((PyObject*)self, kind); 9492 if (!buf1) 9493 return NULL; 9494 if (kind2 != kind) 9495 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind); 9496 if (!buf2) { 9497 if (kind1 != kind) PyMem_Free(buf1); 9498 return NULL; 9499 } 9500 len1 = PyUnicode_GET_LENGTH(self); 9501 len2 = PyUnicode_GET_LENGTH(substring); 9502 9503 switch(kind) { 9504 case PyUnicode_1BYTE_KIND: 9505 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring)) 9506 out = asciilib_split( 9507 (PyObject*) self, buf1, len1, buf2, len2, maxcount); 9508 else 9509 out = ucs1lib_split( 9510 (PyObject*) self, buf1, len1, buf2, len2, maxcount); 9511 break; 9512 case PyUnicode_2BYTE_KIND: 9513 out = ucs2lib_split( 9514 (PyObject*) self, buf1, len1, buf2, len2, maxcount); 9515 break; 9516 case PyUnicode_4BYTE_KIND: 9517 out = ucs4lib_split( 9518 (PyObject*) self, buf1, len1, buf2, len2, maxcount); 9519 break; 9520 default: 9521 out = NULL; 9522 } 9523 if (kind1 != kind) 9524 PyMem_Free(buf1); 9525 if (kind2 != kind) 9526 PyMem_Free(buf2); 9527 return out; 9528} 9529 9530static PyObject * 9531rsplit(PyObject *self, 9532 PyObject *substring, 9533 Py_ssize_t maxcount) 9534{ 9535 int kind1, kind2, kind; 9536 void *buf1, *buf2; 9537 Py_ssize_t len1, len2; 9538 PyObject* out; 9539 9540 if (maxcount < 0) 9541 maxcount = PY_SSIZE_T_MAX; 9542 9543 if (PyUnicode_READY(self) == -1) 9544 return NULL; 9545 9546 if (substring == NULL) 9547 switch(PyUnicode_KIND(self)) { 9548 case PyUnicode_1BYTE_KIND: 9549 if (PyUnicode_IS_ASCII(self)) 9550 return asciilib_rsplit_whitespace( 9551 (PyObject*) self, PyUnicode_1BYTE_DATA(self), 9552 PyUnicode_GET_LENGTH(self), maxcount 9553 ); 9554 else 9555 return ucs1lib_rsplit_whitespace( 9556 (PyObject*) self, PyUnicode_1BYTE_DATA(self), 9557 PyUnicode_GET_LENGTH(self), maxcount 9558 ); 9559 case PyUnicode_2BYTE_KIND: 9560 return 
ucs2lib_rsplit_whitespace( 9561 (PyObject*) self, PyUnicode_2BYTE_DATA(self), 9562 PyUnicode_GET_LENGTH(self), maxcount 9563 ); 9564 case PyUnicode_4BYTE_KIND: 9565 return ucs4lib_rsplit_whitespace( 9566 (PyObject*) self, PyUnicode_4BYTE_DATA(self), 9567 PyUnicode_GET_LENGTH(self), maxcount 9568 ); 9569 default: 9570 assert(0); 9571 return NULL; 9572 } 9573 9574 if (PyUnicode_READY(substring) == -1) 9575 return NULL; 9576 9577 kind1 = PyUnicode_KIND(self); 9578 kind2 = PyUnicode_KIND(substring); 9579 kind = kind1 > kind2 ? kind1 : kind2; 9580 buf1 = PyUnicode_DATA(self); 9581 buf2 = PyUnicode_DATA(substring); 9582 if (kind1 != kind) 9583 buf1 = _PyUnicode_AsKind((PyObject*)self, kind); 9584 if (!buf1) 9585 return NULL; 9586 if (kind2 != kind) 9587 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind); 9588 if (!buf2) { 9589 if (kind1 != kind) PyMem_Free(buf1); 9590 return NULL; 9591 } 9592 len1 = PyUnicode_GET_LENGTH(self); 9593 len2 = PyUnicode_GET_LENGTH(substring); 9594 9595 switch(kind) { 9596 case PyUnicode_1BYTE_KIND: 9597 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring)) 9598 out = asciilib_rsplit( 9599 (PyObject*) self, buf1, len1, buf2, len2, maxcount); 9600 else 9601 out = ucs1lib_rsplit( 9602 (PyObject*) self, buf1, len1, buf2, len2, maxcount); 9603 break; 9604 case PyUnicode_2BYTE_KIND: 9605 out = ucs2lib_rsplit( 9606 (PyObject*) self, buf1, len1, buf2, len2, maxcount); 9607 break; 9608 case PyUnicode_4BYTE_KIND: 9609 out = ucs4lib_rsplit( 9610 (PyObject*) self, buf1, len1, buf2, len2, maxcount); 9611 break; 9612 default: 9613 out = NULL; 9614 } 9615 if (kind1 != kind) 9616 PyMem_Free(buf1); 9617 if (kind2 != kind) 9618 PyMem_Free(buf2); 9619 return out; 9620} 9621 9622static Py_ssize_t 9623anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1, 9624 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset) 9625{ 9626 switch(kind) { 9627 case PyUnicode_1BYTE_KIND: 9628 if (PyUnicode_IS_ASCII(str1) && 
PyUnicode_IS_ASCII(str2)) 9629 return asciilib_find(buf1, len1, buf2, len2, offset); 9630 else 9631 return ucs1lib_find(buf1, len1, buf2, len2, offset); 9632 case PyUnicode_2BYTE_KIND: 9633 return ucs2lib_find(buf1, len1, buf2, len2, offset); 9634 case PyUnicode_4BYTE_KIND: 9635 return ucs4lib_find(buf1, len1, buf2, len2, offset); 9636 } 9637 assert(0); 9638 return -1; 9639} 9640 9641static Py_ssize_t 9642anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen, 9643 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount) 9644{ 9645 switch(kind) { 9646 case PyUnicode_1BYTE_KIND: 9647 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1)) 9648 return asciilib_count(sbuf, slen, buf1, len1, maxcount); 9649 else 9650 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount); 9651 case PyUnicode_2BYTE_KIND: 9652 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount); 9653 case PyUnicode_4BYTE_KIND: 9654 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount); 9655 } 9656 assert(0); 9657 return 0; 9658} 9659 9660static PyObject * 9661replace(PyObject *self, PyObject *str1, 9662 PyObject *str2, Py_ssize_t maxcount) 9663{ 9664 PyObject *u; 9665 char *sbuf = PyUnicode_DATA(self); 9666 char *buf1 = PyUnicode_DATA(str1); 9667 char *buf2 = PyUnicode_DATA(str2); 9668 int srelease = 0, release1 = 0, release2 = 0; 9669 int skind = PyUnicode_KIND(self); 9670 int kind1 = PyUnicode_KIND(str1); 9671 int kind2 = PyUnicode_KIND(str2); 9672 Py_ssize_t slen = PyUnicode_GET_LENGTH(self); 9673 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1); 9674 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2); 9675 9676 if (maxcount < 0) 9677 maxcount = PY_SSIZE_T_MAX; 9678 else if (maxcount == 0 || slen == 0) 9679 goto nothing; 9680 9681 if (str1 == str2) 9682 goto nothing; 9683 if (skind < kind1) 9684 /* substring too wide to be present */ 9685 goto nothing; 9686 9687 if (len1 == len2) { 9688 Py_ssize_t i; 9689 /* same length */ 9690 if (len1 == 0) 9691 goto nothing; 9692 if (len1 == 1) { 
9693 /* replace characters */ 9694 Py_UCS4 u1, u2, maxchar; 9695 int mayshrink, rkind; 9696 u1 = PyUnicode_READ_CHAR(str1, 0); 9697 if (!findchar(sbuf, PyUnicode_KIND(self), 9698 slen, u1, 1)) 9699 goto nothing; 9700 u2 = PyUnicode_READ_CHAR(str2, 0); 9701 maxchar = PyUnicode_MAX_CHAR_VALUE(self); 9702 /* Replacing u1 with u2 may cause a maxchar reduction in the 9703 result string. */ 9704 if (u2 > maxchar) { 9705 maxchar = u2; 9706 mayshrink = 0; 9707 } 9708 else 9709 mayshrink = maxchar > 127; 9710 u = PyUnicode_New(slen, maxchar); 9711 if (!u) 9712 goto error; 9713 copy_characters(u, 0, self, 0, slen); 9714 rkind = PyUnicode_KIND(u); 9715 for (i = 0; i < PyUnicode_GET_LENGTH(u); i++) 9716 if (PyUnicode_READ(rkind, PyUnicode_DATA(u), i) == u1) { 9717 if (--maxcount < 0) 9718 break; 9719 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), i, u2); 9720 } 9721 if (mayshrink) { 9722 unicode_adjust_maxchar(&u); 9723 if (u == NULL) 9724 goto error; 9725 } 9726 } else { 9727 int rkind = skind; 9728 char *res; 9729 PyObject *rstr; 9730 Py_UCS4 maxchar; 9731 9732 if (kind1 < rkind) { 9733 /* widen substring */ 9734 buf1 = _PyUnicode_AsKind(str1, rkind); 9735 if (!buf1) goto error; 9736 release1 = 1; 9737 } 9738 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0); 9739 if (i < 0) 9740 goto nothing; 9741 if (rkind > kind2) { 9742 /* widen replacement */ 9743 buf2 = _PyUnicode_AsKind(str2, rkind); 9744 if (!buf2) goto error; 9745 release2 = 1; 9746 } 9747 else if (rkind < kind2) { 9748 /* widen self and buf1 */ 9749 rkind = kind2; 9750 if (release1) PyMem_Free(buf1); 9751 sbuf = _PyUnicode_AsKind(self, rkind); 9752 if (!sbuf) goto error; 9753 srelease = 1; 9754 buf1 = _PyUnicode_AsKind(str1, rkind); 9755 if (!buf1) goto error; 9756 release1 = 1; 9757 } 9758 maxchar = PyUnicode_MAX_CHAR_VALUE(self); 9759 maxchar = Py_MAX(maxchar, PyUnicode_MAX_CHAR_VALUE(str2)); 9760 rstr = PyUnicode_New(slen, maxchar); 9761 if (!rstr) 9762 goto error; 9763 res = PyUnicode_DATA(rstr); 9764 
9765 memcpy(res, sbuf, rkind * slen); 9766 /* change everything in-place, starting with this one */ 9767 memcpy(res + rkind * i, 9768 buf2, 9769 rkind * len2); 9770 i += len1; 9771 9772 while ( --maxcount > 0) { 9773 i = anylib_find(rkind, self, 9774 sbuf+rkind*i, slen-i, 9775 str1, buf1, len1, i); 9776 if (i == -1) 9777 break; 9778 memcpy(res + rkind * i, 9779 buf2, 9780 rkind * len2); 9781 i += len1; 9782 } 9783 9784 u = rstr; 9785 unicode_adjust_maxchar(&u); 9786 if (!u) 9787 goto error; 9788 } 9789 } else { 9790 9791 Py_ssize_t n, i, j, ires; 9792 Py_ssize_t product, new_size; 9793 int rkind = skind; 9794 PyObject *rstr; 9795 char *res; 9796 Py_UCS4 maxchar; 9797 9798 if (kind1 < rkind) { 9799 buf1 = _PyUnicode_AsKind(str1, rkind); 9800 if (!buf1) goto error; 9801 release1 = 1; 9802 } 9803 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount); 9804 if (n == 0) 9805 goto nothing; 9806 if (kind2 < rkind) { 9807 buf2 = _PyUnicode_AsKind(str2, rkind); 9808 if (!buf2) goto error; 9809 release2 = 1; 9810 } 9811 else if (kind2 > rkind) { 9812 rkind = kind2; 9813 sbuf = _PyUnicode_AsKind(self, rkind); 9814 if (!sbuf) goto error; 9815 srelease = 1; 9816 if (release1) PyMem_Free(buf1); 9817 buf1 = _PyUnicode_AsKind(str1, rkind); 9818 if (!buf1) goto error; 9819 release1 = 1; 9820 } 9821 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) - 9822 PyUnicode_GET_LENGTH(str1))); */ 9823 product = n * (len2-len1); 9824 if ((product / (len2-len1)) != n) { 9825 PyErr_SetString(PyExc_OverflowError, 9826 "replace string is too long"); 9827 goto error; 9828 } 9829 new_size = slen + product; 9830 if (new_size < 0 || new_size > (PY_SSIZE_T_MAX >> (rkind-1))) { 9831 PyErr_SetString(PyExc_OverflowError, 9832 "replace string is too long"); 9833 goto error; 9834 } 9835 maxchar = PyUnicode_MAX_CHAR_VALUE(self); 9836 maxchar = Py_MAX(maxchar, PyUnicode_MAX_CHAR_VALUE(str2)); 9837 rstr = PyUnicode_New(new_size, maxchar); 9838 if (!rstr) 9839 goto 
error; 9840 res = PyUnicode_DATA(rstr); 9841 ires = i = 0; 9842 if (len1 > 0) { 9843 while (n-- > 0) { 9844 /* look for next match */ 9845 j = anylib_find(rkind, self, 9846 sbuf + rkind * i, slen-i, 9847 str1, buf1, len1, i); 9848 if (j == -1) 9849 break; 9850 else if (j > i) { 9851 /* copy unchanged part [i:j] */ 9852 memcpy(res + rkind * ires, 9853 sbuf + rkind * i, 9854 rkind * (j-i)); 9855 ires += j - i; 9856 } 9857 /* copy substitution string */ 9858 if (len2 > 0) { 9859 memcpy(res + rkind * ires, 9860 buf2, 9861 rkind * len2); 9862 ires += len2; 9863 } 9864 i = j + len1; 9865 } 9866 if (i < slen) 9867 /* copy tail [i:] */ 9868 memcpy(res + rkind * ires, 9869 sbuf + rkind * i, 9870 rkind * (slen-i)); 9871 } else { 9872 /* interleave */ 9873 while (n > 0) { 9874 memcpy(res + rkind * ires, 9875 buf2, 9876 rkind * len2); 9877 ires += len2; 9878 if (--n <= 0) 9879 break; 9880 memcpy(res + rkind * ires, 9881 sbuf + rkind * i, 9882 rkind); 9883 ires++; 9884 i++; 9885 } 9886 memcpy(res + rkind * ires, 9887 sbuf + rkind * i, 9888 rkind * (slen-i)); 9889 } 9890 u = rstr; 9891 unicode_adjust_maxchar(&u); 9892 if (u == NULL) 9893 goto error; 9894 } 9895 if (srelease) 9896 PyMem_FREE(sbuf); 9897 if (release1) 9898 PyMem_FREE(buf1); 9899 if (release2) 9900 PyMem_FREE(buf2); 9901 assert(_PyUnicode_CheckConsistency(u, 1)); 9902 return u; 9903 9904 nothing: 9905 /* nothing to replace; return original string (when possible) */ 9906 if (srelease) 9907 PyMem_FREE(sbuf); 9908 if (release1) 9909 PyMem_FREE(buf1); 9910 if (release2) 9911 PyMem_FREE(buf2); 9912 if (PyUnicode_CheckExact(self)) { 9913 Py_INCREF(self); 9914 return (PyObject *) self; 9915 } 9916 return PyUnicode_Copy(self); 9917 error: 9918 if (srelease && sbuf) 9919 PyMem_FREE(sbuf); 9920 if (release1 && buf1) 9921 PyMem_FREE(buf1); 9922 if (release2 && buf2) 9923 PyMem_FREE(buf2); 9924 return NULL; 9925} 9926 9927/* --- Unicode Object Methods --------------------------------------------- */ 9928 
9929PyDoc_STRVAR(title__doc__, 9930 "S.title() -> str\n\ 9931\n\ 9932Return a titlecased version of S, i.e. words start with title case\n\ 9933characters, all remaining cased characters have lower case."); 9934 9935static PyObject* 9936unicode_title(PyObject *self) 9937{ 9938 return fixup(self, fixtitle); 9939} 9940 9941PyDoc_STRVAR(capitalize__doc__, 9942 "S.capitalize() -> str\n\ 9943\n\ 9944Return a capitalized version of S, i.e. make the first character\n\ 9945have upper case and the rest lower case."); 9946 9947static PyObject* 9948unicode_capitalize(PyObject *self) 9949{ 9950 return fixup(self, fixcapitalize); 9951} 9952 9953#if 0 9954PyDoc_STRVAR(capwords__doc__, 9955 "S.capwords() -> str\n\ 9956\n\ 9957Apply .capitalize() to all words in S and return the result with\n\ 9958normalized whitespace (all whitespace strings are replaced by ' ')."); 9959 9960static PyObject* 9961unicode_capwords(PyUnicodeObject *self) 9962{ 9963 PyObject *list; 9964 PyObject *item; 9965 Py_ssize_t i; 9966 9967 /* Split into words */ 9968 list = split(self, NULL, -1); 9969 if (!list) 9970 return NULL; 9971 9972 /* Capitalize each word */ 9973 for (i = 0; i < PyList_GET_SIZE(list); i++) { 9974 item = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, i), 9975 fixcapitalize); 9976 if (item == NULL) 9977 goto onError; 9978 Py_DECREF(PyList_GET_ITEM(list, i)); 9979 PyList_SET_ITEM(list, i, item); 9980 } 9981 9982 /* Join the words to form a new string */ 9983 item = PyUnicode_Join(NULL, list); 9984 9985 onError: 9986 Py_DECREF(list); 9987 return (PyObject *)item; 9988} 9989#endif 9990 9991/* Argument converter. 
Coerces to a single unicode character */ 9992 9993static int 9994convert_uc(PyObject *obj, void *addr) 9995{ 9996 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr; 9997 PyObject *uniobj; 9998 9999 uniobj = PyUnicode_FromObject(obj); 10000 if (uniobj == NULL) { 10001 PyErr_SetString(PyExc_TypeError, 10002 "The fill character cannot be converted to Unicode"); 10003 return 0; 10004 } 10005 if (PyUnicode_GET_LENGTH(uniobj) != 1) { 10006 PyErr_SetString(PyExc_TypeError, 10007 "The fill character must be exactly one character long"); 10008 Py_DECREF(uniobj); 10009 return 0; 10010 } 10011 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0); 10012 Py_DECREF(uniobj); 10013 return 1; 10014} 10015 10016PyDoc_STRVAR(center__doc__, 10017 "S.center(width[, fillchar]) -> str\n\ 10018\n\ 10019Return S centered in a string of length width. Padding is\n\ 10020done using the specified fill character (default is a space)"); 10021 10022static PyObject * 10023unicode_center(PyObject *self, PyObject *args) 10024{ 10025 Py_ssize_t marg, left; 10026 Py_ssize_t width; 10027 Py_UCS4 fillchar = ' '; 10028 10029 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar)) 10030 return NULL; 10031 10032 if (PyUnicode_READY(self) == -1) 10033 return NULL; 10034 10035 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) { 10036 Py_INCREF(self); 10037 return (PyObject*) self; 10038 } 10039 10040 marg = width - _PyUnicode_LENGTH(self); 10041 left = marg / 2 + (marg & width & 1); 10042 10043 return pad(self, left, marg - left, fillchar); 10044} 10045 10046#if 0 10047 10048/* This code should go into some future Unicode collation support 10049 module. The basic comparison should compare ordinals on a naive 10050 basis (this is what Java does and thus Jython too). 
*/ 10051 10052/* speedy UTF-16 code point order comparison */ 10053/* gleaned from: */ 10054/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */ 10055 10056static short utf16Fixup[32] = 10057{ 10058 0, 0, 0, 0, 0, 0, 0, 0, 10059 0, 0, 0, 0, 0, 0, 0, 0, 10060 0, 0, 0, 0, 0, 0, 0, 0, 10061 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800 10062}; 10063 10064static int 10065unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2) 10066{ 10067 Py_ssize_t len1, len2; 10068 10069 Py_UNICODE *s1 = str1->str; 10070 Py_UNICODE *s2 = str2->str; 10071 10072 len1 = str1->_base._base.length; 10073 len2 = str2->_base._base.length; 10074 10075 while (len1 > 0 && len2 > 0) { 10076 Py_UNICODE c1, c2; 10077 10078 c1 = *s1++; 10079 c2 = *s2++; 10080 10081 if (c1 > (1<<11) * 26) 10082 c1 += utf16Fixup[c1>>11]; 10083 if (c2 > (1<<11) * 26) 10084 c2 += utf16Fixup[c2>>11]; 10085 /* now c1 and c2 are in UTF-32-compatible order */ 10086 10087 if (c1 != c2) 10088 return (c1 < c2) ? -1 : 1; 10089 10090 len1--; len2--; 10091 } 10092 10093 return (len1 < len2) ? -1 : (len1 != len2); 10094} 10095 10096#else 10097 10098/* This function assumes that str1 and str2 are readied by the caller. */ 10099 10100static int 10101unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2) 10102{ 10103 int kind1, kind2; 10104 void *data1, *data2; 10105 Py_ssize_t len1, len2, i; 10106 10107 kind1 = PyUnicode_KIND(str1); 10108 kind2 = PyUnicode_KIND(str2); 10109 data1 = PyUnicode_DATA(str1); 10110 data2 = PyUnicode_DATA(str2); 10111 len1 = PyUnicode_GET_LENGTH(str1); 10112 len2 = PyUnicode_GET_LENGTH(str2); 10113 10114 for (i = 0; i < len1 && i < len2; ++i) { 10115 Py_UCS4 c1, c2; 10116 c1 = PyUnicode_READ(kind1, data1, i); 10117 c2 = PyUnicode_READ(kind2, data2, i); 10118 10119 if (c1 != c2) 10120 return (c1 < c2) ? -1 : 1; 10121 } 10122 10123 return (len1 < len2) ? 
-1 : (len1 != len2); 10124} 10125 10126#endif 10127 10128int 10129PyUnicode_Compare(PyObject *left, PyObject *right) 10130{ 10131 if (PyUnicode_Check(left) && PyUnicode_Check(right)) { 10132 if (PyUnicode_READY(left) == -1 || 10133 PyUnicode_READY(right) == -1) 10134 return -1; 10135 return unicode_compare((PyUnicodeObject *)left, 10136 (PyUnicodeObject *)right); 10137 } 10138 PyErr_Format(PyExc_TypeError, 10139 "Can't compare %.100s and %.100s", 10140 left->ob_type->tp_name, 10141 right->ob_type->tp_name); 10142 return -1; 10143} 10144 10145int 10146PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str) 10147{ 10148 Py_ssize_t i; 10149 int kind; 10150 void *data; 10151 Py_UCS4 chr; 10152 10153 assert(_PyUnicode_CHECK(uni)); 10154 if (PyUnicode_READY(uni) == -1) 10155 return -1; 10156 kind = PyUnicode_KIND(uni); 10157 data = PyUnicode_DATA(uni); 10158 /* Compare Unicode string and source character set string */ 10159 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++) 10160 if (chr != str[i]) 10161 return (chr < (unsigned char)(str[i])) ? -1 : 1; 10162 /* This check keeps Python strings that end in '\0' from comparing equal 10163 to C strings identical up to that point. */ 10164 if (PyUnicode_GET_LENGTH(uni) != i || chr) 10165 return 1; /* uni is longer */ 10166 if (str[i]) 10167 return -1; /* str is longer */ 10168 return 0; 10169} 10170 10171 10172#define TEST_COND(cond) \ 10173 ((cond) ? 
Py_True : Py_False) 10174 10175PyObject * 10176PyUnicode_RichCompare(PyObject *left, PyObject *right, int op) 10177{ 10178 int result; 10179 10180 if (PyUnicode_Check(left) && PyUnicode_Check(right)) { 10181 PyObject *v; 10182 if (PyUnicode_READY(left) == -1 || 10183 PyUnicode_READY(right) == -1) 10184 return NULL; 10185 if (PyUnicode_GET_LENGTH(left) != PyUnicode_GET_LENGTH(right) || 10186 PyUnicode_KIND(left) != PyUnicode_KIND(right)) { 10187 if (op == Py_EQ) { 10188 Py_INCREF(Py_False); 10189 return Py_False; 10190 } 10191 if (op == Py_NE) { 10192 Py_INCREF(Py_True); 10193 return Py_True; 10194 } 10195 } 10196 if (left == right) 10197 result = 0; 10198 else 10199 result = unicode_compare((PyUnicodeObject *)left, 10200 (PyUnicodeObject *)right); 10201 10202 /* Convert the return value to a Boolean */ 10203 switch (op) { 10204 case Py_EQ: 10205 v = TEST_COND(result == 0); 10206 break; 10207 case Py_NE: 10208 v = TEST_COND(result != 0); 10209 break; 10210 case Py_LE: 10211 v = TEST_COND(result <= 0); 10212 break; 10213 case Py_GE: 10214 v = TEST_COND(result >= 0); 10215 break; 10216 case Py_LT: 10217 v = TEST_COND(result == -1); 10218 break; 10219 case Py_GT: 10220 v = TEST_COND(result == 1); 10221 break; 10222 default: 10223 PyErr_BadArgument(); 10224 return NULL; 10225 } 10226 Py_INCREF(v); 10227 return v; 10228 } 10229 10230 Py_RETURN_NOTIMPLEMENTED; 10231} 10232 10233int 10234PyUnicode_Contains(PyObject *container, PyObject *element) 10235{ 10236 PyObject *str, *sub; 10237 int kind1, kind2, kind; 10238 void *buf1, *buf2; 10239 Py_ssize_t len1, len2; 10240 int result; 10241 10242 /* Coerce the two arguments */ 10243 sub = PyUnicode_FromObject(element); 10244 if (!sub) { 10245 PyErr_Format(PyExc_TypeError, 10246 "'in <string>' requires string as left operand, not %s", 10247 element->ob_type->tp_name); 10248 return -1; 10249 } 10250 if (PyUnicode_READY(sub) == -1) 10251 return -1; 10252 10253 str = PyUnicode_FromObject(container); 10254 if (!str || 
PyUnicode_READY(str) == -1) { 10255 Py_DECREF(sub); 10256 return -1; 10257 } 10258 10259 kind1 = PyUnicode_KIND(str); 10260 kind2 = PyUnicode_KIND(sub); 10261 kind = kind1 > kind2 ? kind1 : kind2; 10262 buf1 = PyUnicode_DATA(str); 10263 buf2 = PyUnicode_DATA(sub); 10264 if (kind1 != kind) 10265 buf1 = _PyUnicode_AsKind((PyObject*)str, kind); 10266 if (!buf1) { 10267 Py_DECREF(sub); 10268 return -1; 10269 } 10270 if (kind2 != kind) 10271 buf2 = _PyUnicode_AsKind((PyObject*)sub, kind); 10272 if (!buf2) { 10273 Py_DECREF(sub); 10274 if (kind1 != kind) PyMem_Free(buf1); 10275 return -1; 10276 } 10277 len1 = PyUnicode_GET_LENGTH(str); 10278 len2 = PyUnicode_GET_LENGTH(sub); 10279 10280 switch(kind) { 10281 case PyUnicode_1BYTE_KIND: 10282 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1; 10283 break; 10284 case PyUnicode_2BYTE_KIND: 10285 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1; 10286 break; 10287 case PyUnicode_4BYTE_KIND: 10288 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1; 10289 break; 10290 default: 10291 result = -1; 10292 assert(0); 10293 } 10294 10295 Py_DECREF(str); 10296 Py_DECREF(sub); 10297 10298 if (kind1 != kind) 10299 PyMem_Free(buf1); 10300 if (kind2 != kind) 10301 PyMem_Free(buf2); 10302 10303 return result; 10304} 10305 10306/* Concat to string or Unicode object giving a new Unicode object. 
*/ 10307 10308PyObject * 10309PyUnicode_Concat(PyObject *left, PyObject *right) 10310{ 10311 PyObject *u = NULL, *v = NULL, *w; 10312 Py_UCS4 maxchar; 10313 10314 /* Coerce the two arguments */ 10315 u = PyUnicode_FromObject(left); 10316 if (u == NULL) 10317 goto onError; 10318 v = PyUnicode_FromObject(right); 10319 if (v == NULL) 10320 goto onError; 10321 10322 /* Shortcuts */ 10323 if (v == unicode_empty) { 10324 Py_DECREF(v); 10325 return u; 10326 } 10327 if (u == unicode_empty) { 10328 Py_DECREF(u); 10329 return v; 10330 } 10331 10332 maxchar = PyUnicode_MAX_CHAR_VALUE(u); 10333 maxchar = Py_MAX(maxchar, PyUnicode_MAX_CHAR_VALUE(v)); 10334 10335 /* Concat the two Unicode strings */ 10336 w = PyUnicode_New( 10337 PyUnicode_GET_LENGTH(u) + PyUnicode_GET_LENGTH(v), 10338 maxchar); 10339 if (w == NULL) 10340 goto onError; 10341 copy_characters(w, 0, u, 0, PyUnicode_GET_LENGTH(u)); 10342 copy_characters(w, PyUnicode_GET_LENGTH(u), v, 0, PyUnicode_GET_LENGTH(v)); 10343 Py_DECREF(u); 10344 Py_DECREF(v); 10345 assert(_PyUnicode_CheckConsistency(w, 1)); 10346 return w; 10347 10348 onError: 10349 Py_XDECREF(u); 10350 Py_XDECREF(v); 10351 return NULL; 10352} 10353 10354static void 10355unicode_append_inplace(PyObject **p_left, PyObject *right) 10356{ 10357 Py_ssize_t left_len, right_len, new_len; 10358 10359 assert(PyUnicode_IS_READY(*p_left)); 10360 assert(PyUnicode_IS_READY(right)); 10361 10362 left_len = PyUnicode_GET_LENGTH(*p_left); 10363 right_len = PyUnicode_GET_LENGTH(right); 10364 if (left_len > PY_SSIZE_T_MAX - right_len) { 10365 PyErr_SetString(PyExc_OverflowError, 10366 "strings are too large to concat"); 10367 goto error; 10368 } 10369 new_len = left_len + right_len; 10370 10371 /* Now we own the last reference to 'left', so we can resize it 10372 * in-place. 10373 */ 10374 if (unicode_resize(p_left, new_len) != 0) { 10375 /* XXX if _PyUnicode_Resize() fails, 'left' has been 10376 * deallocated so it cannot be put back into 10377 * 'variable'. 
The MemoryError is raised when there 10378 * is no value in 'variable', which might (very 10379 * remotely) be a cause of incompatibilities. 10380 */ 10381 goto error; 10382 } 10383 /* copy 'right' into the newly allocated area of 'left' */ 10384 copy_characters(*p_left, left_len, right, 0, right_len); 10385 _PyUnicode_DIRTY(*p_left); 10386 return; 10387 10388error: 10389 Py_DECREF(*p_left); 10390 *p_left = NULL; 10391} 10392 10393void 10394PyUnicode_Append(PyObject **p_left, PyObject *right) 10395{ 10396 PyObject *left, *res; 10397 10398 if (p_left == NULL) { 10399 if (!PyErr_Occurred()) 10400 PyErr_BadInternalCall(); 10401 return; 10402 } 10403 left = *p_left; 10404 if (right == NULL || !PyUnicode_Check(left)) { 10405 if (!PyErr_Occurred()) 10406 PyErr_BadInternalCall(); 10407 goto error; 10408 } 10409 10410 if (PyUnicode_READY(left)) 10411 goto error; 10412 if (PyUnicode_READY(right)) 10413 goto error; 10414 10415 if (PyUnicode_CheckExact(left) && left != unicode_empty 10416 && PyUnicode_CheckExact(right) && right != unicode_empty 10417 && unicode_resizable(left) 10418 && (_PyUnicode_KIND(right) <= _PyUnicode_KIND(left) 10419 || _PyUnicode_WSTR(left) != NULL)) 10420 { 10421 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires 10422 to change the structure size, but characters are stored just after 10423 the structure, and so it requires to move all characters which is 10424 not so different than duplicating the string. 
*/ 10425 if (!(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right))) 10426 { 10427 unicode_append_inplace(p_left, right); 10428 if (p_left != NULL) 10429 assert(_PyUnicode_CheckConsistency(*p_left, 1)); 10430 return; 10431 } 10432 } 10433 10434 res = PyUnicode_Concat(left, right); 10435 if (res == NULL) 10436 goto error; 10437 Py_DECREF(left); 10438 *p_left = res; 10439 return; 10440 10441error: 10442 Py_DECREF(*p_left); 10443 *p_left = NULL; 10444} 10445 10446void 10447PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right) 10448{ 10449 PyUnicode_Append(pleft, right); 10450 Py_XDECREF(right); 10451} 10452 10453PyDoc_STRVAR(count__doc__, 10454 "S.count(sub[, start[, end]]) -> int\n\ 10455\n\ 10456Return the number of non-overlapping occurrences of substring sub in\n\ 10457string S[start:end]. Optional arguments start and end are\n\ 10458interpreted as in slice notation."); 10459 10460static PyObject * 10461unicode_count(PyUnicodeObject *self, PyObject *args) 10462{ 10463 PyUnicodeObject *substring; 10464 Py_ssize_t start = 0; 10465 Py_ssize_t end = PY_SSIZE_T_MAX; 10466 PyObject *result; 10467 int kind1, kind2, kind; 10468 void *buf1, *buf2; 10469 Py_ssize_t len1, len2, iresult; 10470 10471 if (!stringlib_parse_args_finds_unicode("count", args, &substring, 10472 &start, &end)) 10473 return NULL; 10474 10475 kind1 = PyUnicode_KIND(self); 10476 kind2 = PyUnicode_KIND(substring); 10477 kind = kind1 > kind2 ? 
kind1 : kind2; 10478 buf1 = PyUnicode_DATA(self); 10479 buf2 = PyUnicode_DATA(substring); 10480 if (kind1 != kind) 10481 buf1 = _PyUnicode_AsKind((PyObject*)self, kind); 10482 if (!buf1) { 10483 Py_DECREF(substring); 10484 return NULL; 10485 } 10486 if (kind2 != kind) 10487 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind); 10488 if (!buf2) { 10489 Py_DECREF(substring); 10490 if (kind1 != kind) PyMem_Free(buf1); 10491 return NULL; 10492 } 10493 len1 = PyUnicode_GET_LENGTH(self); 10494 len2 = PyUnicode_GET_LENGTH(substring); 10495 10496 ADJUST_INDICES(start, end, len1); 10497 switch(kind) { 10498 case PyUnicode_1BYTE_KIND: 10499 iresult = ucs1lib_count( 10500 ((Py_UCS1*)buf1) + start, end - start, 10501 buf2, len2, PY_SSIZE_T_MAX 10502 ); 10503 break; 10504 case PyUnicode_2BYTE_KIND: 10505 iresult = ucs2lib_count( 10506 ((Py_UCS2*)buf1) + start, end - start, 10507 buf2, len2, PY_SSIZE_T_MAX 10508 ); 10509 break; 10510 case PyUnicode_4BYTE_KIND: 10511 iresult = ucs4lib_count( 10512 ((Py_UCS4*)buf1) + start, end - start, 10513 buf2, len2, PY_SSIZE_T_MAX 10514 ); 10515 break; 10516 default: 10517 assert(0); iresult = 0; 10518 } 10519 10520 result = PyLong_FromSsize_t(iresult); 10521 10522 if (kind1 != kind) 10523 PyMem_Free(buf1); 10524 if (kind2 != kind) 10525 PyMem_Free(buf2); 10526 10527 Py_DECREF(substring); 10528 10529 return result; 10530} 10531 10532PyDoc_STRVAR(encode__doc__, 10533 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\ 10534\n\ 10535Encode S using the codec registered for encoding. Default encoding\n\ 10536is 'utf-8'. errors may be given to set a different error\n\ 10537handling scheme. Default is 'strict' meaning that encoding errors raise\n\ 10538a UnicodeEncodeError. 
Other possible values are 'ignore', 'replace' and\n\ 10539'xmlcharrefreplace' as well as any other name registered with\n\ 10540codecs.register_error that can handle UnicodeEncodeErrors."); 10541 10542static PyObject * 10543unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs) 10544{ 10545 static char *kwlist[] = {"encoding", "errors", 0}; 10546 char *encoding = NULL; 10547 char *errors = NULL; 10548 10549 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode", 10550 kwlist, &encoding, &errors)) 10551 return NULL; 10552 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors); 10553} 10554 10555PyDoc_STRVAR(expandtabs__doc__, 10556 "S.expandtabs([tabsize]) -> str\n\ 10557\n\ 10558Return a copy of S where all tab characters are expanded using spaces.\n\ 10559If tabsize is not given, a tab size of 8 characters is assumed."); 10560 10561static PyObject* 10562unicode_expandtabs(PyUnicodeObject *self, PyObject *args) 10563{ 10564 Py_ssize_t i, j, line_pos, src_len, incr; 10565 Py_UCS4 ch; 10566 PyObject *u; 10567 void *src_data, *dest_data; 10568 int tabsize = 8; 10569 int kind; 10570 int found; 10571 10572 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize)) 10573 return NULL; 10574 10575 if (PyUnicode_READY(self) == -1) 10576 return NULL; 10577 10578 /* First pass: determine size of output string */ 10579 src_len = PyUnicode_GET_LENGTH(self); 10580 i = j = line_pos = 0; 10581 kind = PyUnicode_KIND(self); 10582 src_data = PyUnicode_DATA(self); 10583 found = 0; 10584 for (; i < src_len; i++) { 10585 ch = PyUnicode_READ(kind, src_data, i); 10586 if (ch == '\t') { 10587 found = 1; 10588 if (tabsize > 0) { 10589 incr = tabsize - (line_pos % tabsize); /* cannot overflow */ 10590 if (j > PY_SSIZE_T_MAX - incr) 10591 goto overflow; 10592 line_pos += incr; 10593 j += incr; 10594 } 10595 } 10596 else { 10597 if (j > PY_SSIZE_T_MAX - 1) 10598 goto overflow; 10599 line_pos++; 10600 j++; 10601 if (ch == '\n' || ch == '\r') 10602 line_pos = 0; 
10603 } 10604 } 10605 if (!found && PyUnicode_CheckExact(self)) { 10606 Py_INCREF((PyObject *) self); 10607 return (PyObject *) self; 10608 } 10609 10610 /* Second pass: create output string and fill it */ 10611 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self)); 10612 if (!u) 10613 return NULL; 10614 dest_data = PyUnicode_DATA(u); 10615 10616 i = j = line_pos = 0; 10617 10618 for (; i < src_len; i++) { 10619 ch = PyUnicode_READ(kind, src_data, i); 10620 if (ch == '\t') { 10621 if (tabsize > 0) { 10622 incr = tabsize - (line_pos % tabsize); 10623 line_pos += incr; 10624 while (incr--) { 10625 PyUnicode_WRITE(kind, dest_data, j, ' '); 10626 j++; 10627 } 10628 } 10629 } 10630 else { 10631 line_pos++; 10632 PyUnicode_WRITE(kind, dest_data, j, ch); 10633 j++; 10634 if (ch == '\n' || ch == '\r') 10635 line_pos = 0; 10636 } 10637 } 10638 assert (j == PyUnicode_GET_LENGTH(u)); 10639#ifndef DONT_MAKE_RESULT_READY 10640 if (_PyUnicode_READY_REPLACE(&u)) { 10641 Py_DECREF(u); 10642 return NULL; 10643 } 10644#endif 10645 assert(_PyUnicode_CheckConsistency(u, 1)); 10646 return (PyObject*) u; 10647 10648 overflow: 10649 PyErr_SetString(PyExc_OverflowError, "new string is too long"); 10650 return NULL; 10651} 10652 10653PyDoc_STRVAR(find__doc__, 10654 "S.find(sub[, start[, end]]) -> int\n\ 10655\n\ 10656Return the lowest index in S where substring sub is found,\n\ 10657such that sub is contained within S[start:end]. 
Optional\n\ 10658arguments start and end are interpreted as in slice notation.\n\ 10659\n\ 10660Return -1 on failure."); 10661 10662static PyObject * 10663unicode_find(PyObject *self, PyObject *args) 10664{ 10665 PyUnicodeObject *substring; 10666 Py_ssize_t start; 10667 Py_ssize_t end; 10668 Py_ssize_t result; 10669 10670 if (!stringlib_parse_args_finds_unicode("find", args, &substring, 10671 &start, &end)) 10672 return NULL; 10673 10674 if (PyUnicode_READY(self) == -1) 10675 return NULL; 10676 if (PyUnicode_READY(substring) == -1) 10677 return NULL; 10678 10679 result = any_find_slice(1, 10680 self, (PyObject*)substring, start, end 10681 ); 10682 10683 Py_DECREF(substring); 10684 10685 if (result == -2) 10686 return NULL; 10687 10688 return PyLong_FromSsize_t(result); 10689} 10690 10691static PyObject * 10692unicode_getitem(PyObject *self, Py_ssize_t index) 10693{ 10694 Py_UCS4 ch = PyUnicode_ReadChar(self, index); 10695 if (ch == (Py_UCS4)-1) 10696 return NULL; 10697 return PyUnicode_FromOrdinal(ch); 10698} 10699 10700/* Believe it or not, this produces the same value for ASCII strings 10701 as bytes_hash(). */ 10702static Py_hash_t 10703unicode_hash(PyUnicodeObject *self) 10704{ 10705 Py_ssize_t len; 10706 Py_uhash_t x; 10707 10708 if (_PyUnicode_HASH(self) != -1) 10709 return _PyUnicode_HASH(self); 10710 if (PyUnicode_READY(self) == -1) 10711 return -1; 10712 len = PyUnicode_GET_LENGTH(self); 10713 10714 /* The hash function as a macro, gets expanded three times below. 
*/ 10715#define HASH(P) \ 10716 x = (Py_uhash_t)*P << 7; \ 10717 while (--len >= 0) \ 10718 x = (1000003*x) ^ (Py_uhash_t)*P++; 10719 10720 switch (PyUnicode_KIND(self)) { 10721 case PyUnicode_1BYTE_KIND: { 10722 const unsigned char *c = PyUnicode_1BYTE_DATA(self); 10723 HASH(c); 10724 break; 10725 } 10726 case PyUnicode_2BYTE_KIND: { 10727 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self); 10728 HASH(s); 10729 break; 10730 } 10731 default: { 10732 Py_UCS4 *l; 10733 assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND && 10734 "Impossible switch case in unicode_hash"); 10735 l = PyUnicode_4BYTE_DATA(self); 10736 HASH(l); 10737 break; 10738 } 10739 } 10740 x ^= (Py_uhash_t)PyUnicode_GET_LENGTH(self); 10741 10742 if (x == -1) 10743 x = -2; 10744 _PyUnicode_HASH(self) = x; 10745 return x; 10746} 10747#undef HASH 10748 10749PyDoc_STRVAR(index__doc__, 10750 "S.index(sub[, start[, end]]) -> int\n\ 10751\n\ 10752Like S.find() but raise ValueError when the substring is not found."); 10753 10754static PyObject * 10755unicode_index(PyObject *self, PyObject *args) 10756{ 10757 Py_ssize_t result; 10758 PyUnicodeObject *substring; 10759 Py_ssize_t start; 10760 Py_ssize_t end; 10761 10762 if (!stringlib_parse_args_finds_unicode("index", args, &substring, 10763 &start, &end)) 10764 return NULL; 10765 10766 if (PyUnicode_READY(self) == -1) 10767 return NULL; 10768 if (PyUnicode_READY(substring) == -1) 10769 return NULL; 10770 10771 result = any_find_slice(1, 10772 self, (PyObject*)substring, start, end 10773 ); 10774 10775 Py_DECREF(substring); 10776 10777 if (result == -2) 10778 return NULL; 10779 10780 if (result < 0) { 10781 PyErr_SetString(PyExc_ValueError, "substring not found"); 10782 return NULL; 10783 } 10784 10785 return PyLong_FromSsize_t(result); 10786} 10787 10788PyDoc_STRVAR(islower__doc__, 10789 "S.islower() -> bool\n\ 10790\n\ 10791Return True if all cased characters in S are lowercase and there is\n\ 10792at least one cased character in S, False otherwise."); 10793 
10794static PyObject* 10795unicode_islower(PyUnicodeObject *self) 10796{ 10797 Py_ssize_t i, length; 10798 int kind; 10799 void *data; 10800 int cased; 10801 10802 if (PyUnicode_READY(self) == -1) 10803 return NULL; 10804 length = PyUnicode_GET_LENGTH(self); 10805 kind = PyUnicode_KIND(self); 10806 data = PyUnicode_DATA(self); 10807 10808 /* Shortcut for single character strings */ 10809 if (length == 1) 10810 return PyBool_FromLong( 10811 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0))); 10812 10813 /* Special case for empty strings */ 10814 if (length == 0) 10815 return PyBool_FromLong(0); 10816 10817 cased = 0; 10818 for (i = 0; i < length; i++) { 10819 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); 10820 10821 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) 10822 return PyBool_FromLong(0); 10823 else if (!cased && Py_UNICODE_ISLOWER(ch)) 10824 cased = 1; 10825 } 10826 return PyBool_FromLong(cased); 10827} 10828 10829PyDoc_STRVAR(isupper__doc__, 10830 "S.isupper() -> bool\n\ 10831\n\ 10832Return True if all cased characters in S are uppercase and there is\n\ 10833at least one cased character in S, False otherwise."); 10834 10835static PyObject* 10836unicode_isupper(PyUnicodeObject *self) 10837{ 10838 Py_ssize_t i, length; 10839 int kind; 10840 void *data; 10841 int cased; 10842 10843 if (PyUnicode_READY(self) == -1) 10844 return NULL; 10845 length = PyUnicode_GET_LENGTH(self); 10846 kind = PyUnicode_KIND(self); 10847 data = PyUnicode_DATA(self); 10848 10849 /* Shortcut for single character strings */ 10850 if (length == 1) 10851 return PyBool_FromLong( 10852 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0); 10853 10854 /* Special case for empty strings */ 10855 if (length == 0) 10856 return PyBool_FromLong(0); 10857 10858 cased = 0; 10859 for (i = 0; i < length; i++) { 10860 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); 10861 10862 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch)) 10863 return PyBool_FromLong(0); 10864 else if (!cased 
&& Py_UNICODE_ISUPPER(ch)) 10865 cased = 1; 10866 } 10867 return PyBool_FromLong(cased); 10868} 10869 10870PyDoc_STRVAR(istitle__doc__, 10871 "S.istitle() -> bool\n\ 10872\n\ 10873Return True if S is a titlecased string and there is at least one\n\ 10874character in S, i.e. upper- and titlecase characters may only\n\ 10875follow uncased characters and lowercase characters only cased ones.\n\ 10876Return False otherwise."); 10877 10878static PyObject* 10879unicode_istitle(PyUnicodeObject *self) 10880{ 10881 Py_ssize_t i, length; 10882 int kind; 10883 void *data; 10884 int cased, previous_is_cased; 10885 10886 if (PyUnicode_READY(self) == -1) 10887 return NULL; 10888 length = PyUnicode_GET_LENGTH(self); 10889 kind = PyUnicode_KIND(self); 10890 data = PyUnicode_DATA(self); 10891 10892 /* Shortcut for single character strings */ 10893 if (length == 1) { 10894 Py_UCS4 ch = PyUnicode_READ(kind, data, 0); 10895 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) || 10896 (Py_UNICODE_ISUPPER(ch) != 0)); 10897 } 10898 10899 /* Special case for empty strings */ 10900 if (length == 0) 10901 return PyBool_FromLong(0); 10902 10903 cased = 0; 10904 previous_is_cased = 0; 10905 for (i = 0; i < length; i++) { 10906 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); 10907 10908 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) { 10909 if (previous_is_cased) 10910 return PyBool_FromLong(0); 10911 previous_is_cased = 1; 10912 cased = 1; 10913 } 10914 else if (Py_UNICODE_ISLOWER(ch)) { 10915 if (!previous_is_cased) 10916 return PyBool_FromLong(0); 10917 previous_is_cased = 1; 10918 cased = 1; 10919 } 10920 else 10921 previous_is_cased = 0; 10922 } 10923 return PyBool_FromLong(cased); 10924} 10925 10926PyDoc_STRVAR(isspace__doc__, 10927 "S.isspace() -> bool\n\ 10928\n\ 10929Return True if all characters in S are whitespace\n\ 10930and there is at least one character in S, False otherwise."); 10931 10932static PyObject* 10933unicode_isspace(PyUnicodeObject *self) 10934{ 10935 
Py_ssize_t i, length; 10936 int kind; 10937 void *data; 10938 10939 if (PyUnicode_READY(self) == -1) 10940 return NULL; 10941 length = PyUnicode_GET_LENGTH(self); 10942 kind = PyUnicode_KIND(self); 10943 data = PyUnicode_DATA(self); 10944 10945 /* Shortcut for single character strings */ 10946 if (length == 1) 10947 return PyBool_FromLong( 10948 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0))); 10949 10950 /* Special case for empty strings */ 10951 if (length == 0) 10952 return PyBool_FromLong(0); 10953 10954 for (i = 0; i < length; i++) { 10955 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); 10956 if (!Py_UNICODE_ISSPACE(ch)) 10957 return PyBool_FromLong(0); 10958 } 10959 return PyBool_FromLong(1); 10960} 10961 10962PyDoc_STRVAR(isalpha__doc__, 10963 "S.isalpha() -> bool\n\ 10964\n\ 10965Return True if all characters in S are alphabetic\n\ 10966and there is at least one character in S, False otherwise."); 10967 10968static PyObject* 10969unicode_isalpha(PyUnicodeObject *self) 10970{ 10971 Py_ssize_t i, length; 10972 int kind; 10973 void *data; 10974 10975 if (PyUnicode_READY(self) == -1) 10976 return NULL; 10977 length = PyUnicode_GET_LENGTH(self); 10978 kind = PyUnicode_KIND(self); 10979 data = PyUnicode_DATA(self); 10980 10981 /* Shortcut for single character strings */ 10982 if (length == 1) 10983 return PyBool_FromLong( 10984 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0))); 10985 10986 /* Special case for empty strings */ 10987 if (length == 0) 10988 return PyBool_FromLong(0); 10989 10990 for (i = 0; i < length; i++) { 10991 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i))) 10992 return PyBool_FromLong(0); 10993 } 10994 return PyBool_FromLong(1); 10995} 10996 10997PyDoc_STRVAR(isalnum__doc__, 10998 "S.isalnum() -> bool\n\ 10999\n\ 11000Return True if all characters in S are alphanumeric\n\ 11001and there is at least one character in S, False otherwise."); 11002 11003static PyObject* 11004unicode_isalnum(PyUnicodeObject *self) 11005{ 11006 int 
kind; 11007 void *data; 11008 Py_ssize_t len, i; 11009 11010 if (PyUnicode_READY(self) == -1) 11011 return NULL; 11012 11013 kind = PyUnicode_KIND(self); 11014 data = PyUnicode_DATA(self); 11015 len = PyUnicode_GET_LENGTH(self); 11016 11017 /* Shortcut for single character strings */ 11018 if (len == 1) { 11019 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0); 11020 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch)); 11021 } 11022 11023 /* Special case for empty strings */ 11024 if (len == 0) 11025 return PyBool_FromLong(0); 11026 11027 for (i = 0; i < len; i++) { 11028 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); 11029 if (!Py_UNICODE_ISALNUM(ch)) 11030 return PyBool_FromLong(0); 11031 } 11032 return PyBool_FromLong(1); 11033} 11034 11035PyDoc_STRVAR(isdecimal__doc__, 11036 "S.isdecimal() -> bool\n\ 11037\n\ 11038Return True if there are only decimal characters in S,\n\ 11039False otherwise."); 11040 11041static PyObject* 11042unicode_isdecimal(PyUnicodeObject *self) 11043{ 11044 Py_ssize_t i, length; 11045 int kind; 11046 void *data; 11047 11048 if (PyUnicode_READY(self) == -1) 11049 return NULL; 11050 length = PyUnicode_GET_LENGTH(self); 11051 kind = PyUnicode_KIND(self); 11052 data = PyUnicode_DATA(self); 11053 11054 /* Shortcut for single character strings */ 11055 if (length == 1) 11056 return PyBool_FromLong( 11057 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0))); 11058 11059 /* Special case for empty strings */ 11060 if (length == 0) 11061 return PyBool_FromLong(0); 11062 11063 for (i = 0; i < length; i++) { 11064 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i))) 11065 return PyBool_FromLong(0); 11066 } 11067 return PyBool_FromLong(1); 11068} 11069 11070PyDoc_STRVAR(isdigit__doc__, 11071 "S.isdigit() -> bool\n\ 11072\n\ 11073Return True if all characters in S are digits\n\ 11074and there is at least one character in S, False otherwise."); 11075 11076static PyObject* 11077unicode_isdigit(PyUnicodeObject *self) 11078{ 11079 Py_ssize_t i, length; 
11080 int kind; 11081 void *data; 11082 11083 if (PyUnicode_READY(self) == -1) 11084 return NULL; 11085 length = PyUnicode_GET_LENGTH(self); 11086 kind = PyUnicode_KIND(self); 11087 data = PyUnicode_DATA(self); 11088 11089 /* Shortcut for single character strings */ 11090 if (length == 1) { 11091 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0); 11092 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch)); 11093 } 11094 11095 /* Special case for empty strings */ 11096 if (length == 0) 11097 return PyBool_FromLong(0); 11098 11099 for (i = 0; i < length; i++) { 11100 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i))) 11101 return PyBool_FromLong(0); 11102 } 11103 return PyBool_FromLong(1); 11104} 11105 11106PyDoc_STRVAR(isnumeric__doc__, 11107 "S.isnumeric() -> bool\n\ 11108\n\ 11109Return True if there are only numeric characters in S,\n\ 11110False otherwise."); 11111 11112static PyObject* 11113unicode_isnumeric(PyUnicodeObject *self) 11114{ 11115 Py_ssize_t i, length; 11116 int kind; 11117 void *data; 11118 11119 if (PyUnicode_READY(self) == -1) 11120 return NULL; 11121 length = PyUnicode_GET_LENGTH(self); 11122 kind = PyUnicode_KIND(self); 11123 data = PyUnicode_DATA(self); 11124 11125 /* Shortcut for single character strings */ 11126 if (length == 1) 11127 return PyBool_FromLong( 11128 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0))); 11129 11130 /* Special case for empty strings */ 11131 if (length == 0) 11132 return PyBool_FromLong(0); 11133 11134 for (i = 0; i < length; i++) { 11135 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i))) 11136 return PyBool_FromLong(0); 11137 } 11138 return PyBool_FromLong(1); 11139} 11140 11141int 11142PyUnicode_IsIdentifier(PyObject *self) 11143{ 11144 int kind; 11145 void *data; 11146 Py_ssize_t i; 11147 Py_UCS4 first; 11148 11149 if (PyUnicode_READY(self) == -1) { 11150 Py_FatalError("identifier not ready"); 11151 return 0; 11152 } 11153 11154 /* Special case for empty strings */ 11155 if (PyUnicode_GET_LENGTH(self) == 
0) 11156 return 0; 11157 kind = PyUnicode_KIND(self); 11158 data = PyUnicode_DATA(self); 11159 11160 /* PEP 3131 says that the first character must be in 11161 XID_Start and subsequent characters in XID_Continue, 11162 and for the ASCII range, the 2.x rules apply (i.e 11163 start with letters and underscore, continue with 11164 letters, digits, underscore). However, given the current 11165 definition of XID_Start and XID_Continue, it is sufficient 11166 to check just for these, except that _ must be allowed 11167 as starting an identifier. */ 11168 first = PyUnicode_READ(kind, data, 0); 11169 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */) 11170 return 0; 11171 11172 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++) 11173 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i))) 11174 return 0; 11175 return 1; 11176} 11177 11178PyDoc_STRVAR(isidentifier__doc__, 11179 "S.isidentifier() -> bool\n\ 11180\n\ 11181Return True if S is a valid identifier according\n\ 11182to the language definition."); 11183 11184static PyObject* 11185unicode_isidentifier(PyObject *self) 11186{ 11187 return PyBool_FromLong(PyUnicode_IsIdentifier(self)); 11188} 11189 11190PyDoc_STRVAR(isprintable__doc__, 11191 "S.isprintable() -> bool\n\ 11192\n\ 11193Return True if all characters in S are considered\n\ 11194printable in repr() or S is empty, False otherwise."); 11195 11196static PyObject* 11197unicode_isprintable(PyObject *self) 11198{ 11199 Py_ssize_t i, length; 11200 int kind; 11201 void *data; 11202 11203 if (PyUnicode_READY(self) == -1) 11204 return NULL; 11205 length = PyUnicode_GET_LENGTH(self); 11206 kind = PyUnicode_KIND(self); 11207 data = PyUnicode_DATA(self); 11208 11209 /* Shortcut for single character strings */ 11210 if (length == 1) 11211 return PyBool_FromLong( 11212 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0))); 11213 11214 for (i = 0; i < length; i++) { 11215 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) { 11216 
Py_RETURN_FALSE; 11217 } 11218 } 11219 Py_RETURN_TRUE; 11220} 11221 11222PyDoc_STRVAR(join__doc__, 11223 "S.join(iterable) -> str\n\ 11224\n\ 11225Return a string which is the concatenation of the strings in the\n\ 11226iterable. The separator between elements is S."); 11227 11228static PyObject* 11229unicode_join(PyObject *self, PyObject *data) 11230{ 11231 return PyUnicode_Join(self, data); 11232} 11233 11234static Py_ssize_t 11235unicode_length(PyUnicodeObject *self) 11236{ 11237 if (PyUnicode_READY(self) == -1) 11238 return -1; 11239 return PyUnicode_GET_LENGTH(self); 11240} 11241 11242PyDoc_STRVAR(ljust__doc__, 11243 "S.ljust(width[, fillchar]) -> str\n\ 11244\n\ 11245Return S left-justified in a Unicode string of length width. Padding is\n\ 11246done using the specified fill character (default is a space)."); 11247 11248static PyObject * 11249unicode_ljust(PyObject *self, PyObject *args) 11250{ 11251 Py_ssize_t width; 11252 Py_UCS4 fillchar = ' '; 11253 11254 if (PyUnicode_READY(self) == -1) 11255 return NULL; 11256 11257 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar)) 11258 return NULL; 11259 11260 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) { 11261 Py_INCREF(self); 11262 return (PyObject*) self; 11263 } 11264 11265 return (PyObject*) pad(self, 0, width - _PyUnicode_LENGTH(self), fillchar); 11266} 11267 11268PyDoc_STRVAR(lower__doc__, 11269 "S.lower() -> str\n\ 11270\n\ 11271Return a copy of the string S converted to lowercase."); 11272 11273static PyObject* 11274unicode_lower(PyObject *self) 11275{ 11276 return fixup(self, fixlower); 11277} 11278 11279#define LEFTSTRIP 0 11280#define RIGHTSTRIP 1 11281#define BOTHSTRIP 2 11282 11283/* Arrays indexed by above */ 11284static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"}; 11285 11286#define STRIPNAME(i) (stripformat[i]+3) 11287 11288/* externally visible for str.strip(unicode) */ 11289PyObject * 11290_PyUnicode_XStrip(PyUnicodeObject *self, 
int striptype, PyObject *sepobj) 11291{ 11292 void *data; 11293 int kind; 11294 Py_ssize_t i, j, len; 11295 BLOOM_MASK sepmask; 11296 11297 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1) 11298 return NULL; 11299 11300 kind = PyUnicode_KIND(self); 11301 data = PyUnicode_DATA(self); 11302 len = PyUnicode_GET_LENGTH(self); 11303 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj), 11304 PyUnicode_DATA(sepobj), 11305 PyUnicode_GET_LENGTH(sepobj)); 11306 11307 i = 0; 11308 if (striptype != RIGHTSTRIP) { 11309 while (i < len && 11310 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) { 11311 i++; 11312 } 11313 } 11314 11315 j = len; 11316 if (striptype != LEFTSTRIP) { 11317 do { 11318 j--; 11319 } while (j >= i && 11320 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj)); 11321 j++; 11322 } 11323 11324 return PyUnicode_Substring((PyObject*)self, i, j); 11325} 11326 11327PyObject* 11328PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end) 11329{ 11330 unsigned char *data; 11331 int kind; 11332 Py_ssize_t length; 11333 11334 if (PyUnicode_READY(self) == -1) 11335 return NULL; 11336 11337 end = Py_MIN(end, PyUnicode_GET_LENGTH(self)); 11338 11339 if (start == 0 && end == PyUnicode_GET_LENGTH(self)) 11340 { 11341 if (PyUnicode_CheckExact(self)) { 11342 Py_INCREF(self); 11343 return self; 11344 } 11345 else 11346 return PyUnicode_Copy(self); 11347 } 11348 11349 length = end - start; 11350 if (length == 1) 11351 return unicode_getitem(self, start); 11352 11353 if (start < 0 || end < 0) { 11354 PyErr_SetString(PyExc_IndexError, "string index out of range"); 11355 return NULL; 11356 } 11357 11358 if (PyUnicode_IS_ASCII(self)) { 11359 kind = PyUnicode_KIND(self); 11360 data = PyUnicode_1BYTE_DATA(self); 11361 return unicode_fromascii(data + start, length); 11362 } 11363 else { 11364 kind = PyUnicode_KIND(self); 11365 data = PyUnicode_1BYTE_DATA(self); 11366 return PyUnicode_FromKindAndData(kind, 11367 data + kind * start, 11368 
length); 11369 } 11370} 11371 11372static PyObject * 11373do_strip(PyUnicodeObject *self, int striptype) 11374{ 11375 int kind; 11376 void *data; 11377 Py_ssize_t len, i, j; 11378 11379 if (PyUnicode_READY(self) == -1) 11380 return NULL; 11381 11382 kind = PyUnicode_KIND(self); 11383 data = PyUnicode_DATA(self); 11384 len = PyUnicode_GET_LENGTH(self); 11385 11386 i = 0; 11387 if (striptype != RIGHTSTRIP) { 11388 while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) { 11389 i++; 11390 } 11391 } 11392 11393 j = len; 11394 if (striptype != LEFTSTRIP) { 11395 do { 11396 j--; 11397 } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j))); 11398 j++; 11399 } 11400 11401 return PyUnicode_Substring((PyObject*)self, i, j); 11402} 11403 11404 11405static PyObject * 11406do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args) 11407{ 11408 PyObject *sep = NULL; 11409 11410 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep)) 11411 return NULL; 11412 11413 if (sep != NULL && sep != Py_None) { 11414 if (PyUnicode_Check(sep)) 11415 return _PyUnicode_XStrip(self, striptype, sep); 11416 else { 11417 PyErr_Format(PyExc_TypeError, 11418 "%s arg must be None or str", 11419 STRIPNAME(striptype)); 11420 return NULL; 11421 } 11422 } 11423 11424 return do_strip(self, striptype); 11425} 11426 11427 11428PyDoc_STRVAR(strip__doc__, 11429 "S.strip([chars]) -> str\n\ 11430\n\ 11431Return a copy of the string S with leading and trailing\n\ 11432whitespace removed.\n\ 11433If chars is given and not None, remove characters in chars instead."); 11434 11435static PyObject * 11436unicode_strip(PyUnicodeObject *self, PyObject *args) 11437{ 11438 if (PyTuple_GET_SIZE(args) == 0) 11439 return do_strip(self, BOTHSTRIP); /* Common case */ 11440 else 11441 return do_argstrip(self, BOTHSTRIP, args); 11442} 11443 11444 11445PyDoc_STRVAR(lstrip__doc__, 11446 "S.lstrip([chars]) -> str\n\ 11447\n\ 11448Return a copy of the string S with leading whitespace 
removed.\n\ 11449If chars is given and not None, remove characters in chars instead."); 11450 11451static PyObject * 11452unicode_lstrip(PyUnicodeObject *self, PyObject *args) 11453{ 11454 if (PyTuple_GET_SIZE(args) == 0) 11455 return do_strip(self, LEFTSTRIP); /* Common case */ 11456 else 11457 return do_argstrip(self, LEFTSTRIP, args); 11458} 11459 11460 11461PyDoc_STRVAR(rstrip__doc__, 11462 "S.rstrip([chars]) -> str\n\ 11463\n\ 11464Return a copy of the string S with trailing whitespace removed.\n\ 11465If chars is given and not None, remove characters in chars instead."); 11466 11467static PyObject * 11468unicode_rstrip(PyUnicodeObject *self, PyObject *args) 11469{ 11470 if (PyTuple_GET_SIZE(args) == 0) 11471 return do_strip(self, RIGHTSTRIP); /* Common case */ 11472 else 11473 return do_argstrip(self, RIGHTSTRIP, args); 11474} 11475 11476 11477static PyObject* 11478unicode_repeat(PyUnicodeObject *str, Py_ssize_t len) 11479{ 11480 PyUnicodeObject *u; 11481 Py_ssize_t nchars, n; 11482 11483 if (len < 1) { 11484 Py_INCREF(unicode_empty); 11485 return unicode_empty; 11486 } 11487 11488 if (len == 1 && PyUnicode_CheckExact(str)) { 11489 /* no repeat, return original string */ 11490 Py_INCREF(str); 11491 return (PyObject*) str; 11492 } 11493 11494 if (PyUnicode_READY(str) == -1) 11495 return NULL; 11496 11497 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) { 11498 PyErr_SetString(PyExc_OverflowError, 11499 "repeated string is too long"); 11500 return NULL; 11501 } 11502 nchars = len * PyUnicode_GET_LENGTH(str); 11503 11504 u = (PyUnicodeObject *)PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str)); 11505 if (!u) 11506 return NULL; 11507 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str)); 11508 11509 if (PyUnicode_GET_LENGTH(str) == 1) { 11510 const int kind = PyUnicode_KIND(str); 11511 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0); 11512 void *to = PyUnicode_DATA(u); 11513 if (kind == PyUnicode_1BYTE_KIND) 11514 memset(to, (unsigned 
char)fill_char, len); 11515 else { 11516 for (n = 0; n < len; ++n) 11517 PyUnicode_WRITE(kind, to, n, fill_char); 11518 } 11519 } 11520 else { 11521 /* number of characters copied this far */ 11522 Py_ssize_t done = PyUnicode_GET_LENGTH(str); 11523 const Py_ssize_t char_size = PyUnicode_KIND(str); 11524 char *to = (char *) PyUnicode_DATA(u); 11525 Py_MEMCPY(to, PyUnicode_DATA(str), 11526 PyUnicode_GET_LENGTH(str) * char_size); 11527 while (done < nchars) { 11528 n = (done <= nchars-done) ? done : nchars-done; 11529 Py_MEMCPY(to + (done * char_size), to, n * char_size); 11530 done += n; 11531 } 11532 } 11533 11534 assert(_PyUnicode_CheckConsistency(u, 1)); 11535 return (PyObject*) u; 11536} 11537 11538PyObject * 11539PyUnicode_Replace(PyObject *obj, 11540 PyObject *subobj, 11541 PyObject *replobj, 11542 Py_ssize_t maxcount) 11543{ 11544 PyObject *self; 11545 PyObject *str1; 11546 PyObject *str2; 11547 PyObject *result; 11548 11549 self = PyUnicode_FromObject(obj); 11550 if (self == NULL || PyUnicode_READY(self) == -1) 11551 return NULL; 11552 str1 = PyUnicode_FromObject(subobj); 11553 if (str1 == NULL || PyUnicode_READY(str1) == -1) { 11554 Py_DECREF(self); 11555 return NULL; 11556 } 11557 str2 = PyUnicode_FromObject(replobj); 11558 if (str2 == NULL || PyUnicode_READY(str2)) { 11559 Py_DECREF(self); 11560 Py_DECREF(str1); 11561 return NULL; 11562 } 11563 result = replace(self, str1, str2, maxcount); 11564 Py_DECREF(self); 11565 Py_DECREF(str1); 11566 Py_DECREF(str2); 11567 return result; 11568} 11569 11570PyDoc_STRVAR(replace__doc__, 11571 "S.replace(old, new[, count]) -> str\n\ 11572\n\ 11573Return a copy of S with all occurrences of substring\n\ 11574old replaced by new. 
If the optional argument count is\n\ 11575given, only the first count occurrences are replaced."); 11576 11577static PyObject* 11578unicode_replace(PyObject *self, PyObject *args) 11579{ 11580 PyObject *str1; 11581 PyObject *str2; 11582 Py_ssize_t maxcount = -1; 11583 PyObject *result; 11584 11585 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount)) 11586 return NULL; 11587 if (!PyUnicode_READY(self) == -1) 11588 return NULL; 11589 str1 = PyUnicode_FromObject(str1); 11590 if (str1 == NULL || PyUnicode_READY(str1) == -1) 11591 return NULL; 11592 str2 = PyUnicode_FromObject(str2); 11593 if (str2 == NULL || PyUnicode_READY(str2) == -1) { 11594 Py_DECREF(str1); 11595 return NULL; 11596 } 11597 11598 result = replace(self, str1, str2, maxcount); 11599 11600 Py_DECREF(str1); 11601 Py_DECREF(str2); 11602 return result; 11603} 11604 11605static PyObject * 11606unicode_repr(PyObject *unicode) 11607{ 11608 PyObject *repr; 11609 Py_ssize_t isize; 11610 Py_ssize_t osize, squote, dquote, i, o; 11611 Py_UCS4 max, quote; 11612 int ikind, okind; 11613 void *idata, *odata; 11614 11615 if (PyUnicode_READY(unicode) == -1) 11616 return NULL; 11617 11618 isize = PyUnicode_GET_LENGTH(unicode); 11619 idata = PyUnicode_DATA(unicode); 11620 11621 /* Compute length of output, quote characters, and 11622 maximum character */ 11623 osize = 2; /* quotes */ 11624 max = 127; 11625 squote = dquote = 0; 11626 ikind = PyUnicode_KIND(unicode); 11627 for (i = 0; i < isize; i++) { 11628 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i); 11629 switch (ch) { 11630 case '\'': squote++; osize++; break; 11631 case '"': dquote++; osize++; break; 11632 case '\\': case '\t': case '\r': case '\n': 11633 osize += 2; break; 11634 default: 11635 /* Fast-path ASCII */ 11636 if (ch < ' ' || ch == 0x7f) 11637 osize += 4; /* \xHH */ 11638 else if (ch < 0x7f) 11639 osize++; 11640 else if (Py_UNICODE_ISPRINTABLE(ch)) { 11641 osize++; 11642 max = ch > max ? 
ch : max; 11643 } 11644 else if (ch < 0x100) 11645 osize += 4; /* \xHH */ 11646 else if (ch < 0x10000) 11647 osize += 6; /* \uHHHH */ 11648 else 11649 osize += 10; /* \uHHHHHHHH */ 11650 } 11651 } 11652 11653 quote = '\''; 11654 if (squote) { 11655 if (dquote) 11656 /* Both squote and dquote present. Use squote, 11657 and escape them */ 11658 osize += squote; 11659 else 11660 quote = '"'; 11661 } 11662 11663 repr = PyUnicode_New(osize, max); 11664 if (repr == NULL) 11665 return NULL; 11666 okind = PyUnicode_KIND(repr); 11667 odata = PyUnicode_DATA(repr); 11668 11669 PyUnicode_WRITE(okind, odata, 0, quote); 11670 PyUnicode_WRITE(okind, odata, osize-1, quote); 11671 11672 for (i = 0, o = 1; i < isize; i++) { 11673 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i); 11674 11675 /* Escape quotes and backslashes */ 11676 if ((ch == quote) || (ch == '\\')) { 11677 PyUnicode_WRITE(okind, odata, o++, '\\'); 11678 PyUnicode_WRITE(okind, odata, o++, ch); 11679 continue; 11680 } 11681 11682 /* Map special whitespace to '\t', \n', '\r' */ 11683 if (ch == '\t') { 11684 PyUnicode_WRITE(okind, odata, o++, '\\'); 11685 PyUnicode_WRITE(okind, odata, o++, 't'); 11686 } 11687 else if (ch == '\n') { 11688 PyUnicode_WRITE(okind, odata, o++, '\\'); 11689 PyUnicode_WRITE(okind, odata, o++, 'n'); 11690 } 11691 else if (ch == '\r') { 11692 PyUnicode_WRITE(okind, odata, o++, '\\'); 11693 PyUnicode_WRITE(okind, odata, o++, 'r'); 11694 } 11695 11696 /* Map non-printable US ASCII to '\xhh' */ 11697 else if (ch < ' ' || ch == 0x7F) { 11698 PyUnicode_WRITE(okind, odata, o++, '\\'); 11699 PyUnicode_WRITE(okind, odata, o++, 'x'); 11700 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0x000F]); 11701 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0x000F]); 11702 } 11703 11704 /* Copy ASCII characters as-is */ 11705 else if (ch < 0x7F) { 11706 PyUnicode_WRITE(okind, odata, o++, ch); 11707 } 11708 11709 /* Non-ASCII characters */ 11710 else { 11711 /* Map Unicode whitespace and control 
characters 11712 (categories Z* and C* except ASCII space) 11713 */ 11714 if (!Py_UNICODE_ISPRINTABLE(ch)) { 11715 /* Map 8-bit characters to '\xhh' */ 11716 if (ch <= 0xff) { 11717 PyUnicode_WRITE(okind, odata, o++, '\\'); 11718 PyUnicode_WRITE(okind, odata, o++, 'x'); 11719 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0x000F]); 11720 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0x000F]); 11721 } 11722 /* Map 21-bit characters to '\U00xxxxxx' */ 11723 else if (ch >= 0x10000) { 11724 PyUnicode_WRITE(okind, odata, o++, '\\'); 11725 PyUnicode_WRITE(okind, odata, o++, 'U'); 11726 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 28) & 0xF]); 11727 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 24) & 0xF]); 11728 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 20) & 0xF]); 11729 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 16) & 0xF]); 11730 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 12) & 0xF]); 11731 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 8) & 0xF]); 11732 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0xF]); 11733 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0xF]); 11734 } 11735 /* Map 16-bit characters to '\uxxxx' */ 11736 else { 11737 PyUnicode_WRITE(okind, odata, o++, '\\'); 11738 PyUnicode_WRITE(okind, odata, o++, 'u'); 11739 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 12) & 0xF]); 11740 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 8) & 0xF]); 11741 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0xF]); 11742 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0xF]); 11743 } 11744 } 11745 /* Copy characters as-is */ 11746 else { 11747 PyUnicode_WRITE(okind, odata, o++, ch); 11748 } 11749 } 11750 } 11751 /* Closing quote already added at the beginning */ 11752 assert(_PyUnicode_CheckConsistency(repr, 1)); 11753 return repr; 11754} 11755 11756PyDoc_STRVAR(rfind__doc__, 11757 "S.rfind(sub[, start[, end]]) -> int\n\ 11758\n\ 11759Return the highest index in S where 
substring sub is found,\n\ 11760such that sub is contained within S[start:end]. Optional\n\ 11761arguments start and end are interpreted as in slice notation.\n\ 11762\n\ 11763Return -1 on failure."); 11764 11765static PyObject * 11766unicode_rfind(PyObject *self, PyObject *args) 11767{ 11768 PyUnicodeObject *substring; 11769 Py_ssize_t start; 11770 Py_ssize_t end; 11771 Py_ssize_t result; 11772 11773 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring, 11774 &start, &end)) 11775 return NULL; 11776 11777 if (PyUnicode_READY(self) == -1) 11778 return NULL; 11779 if (PyUnicode_READY(substring) == -1) 11780 return NULL; 11781 11782 result = any_find_slice(-1, 11783 self, (PyObject*)substring, start, end 11784 ); 11785 11786 Py_DECREF(substring); 11787 11788 if (result == -2) 11789 return NULL; 11790 11791 return PyLong_FromSsize_t(result); 11792} 11793 11794PyDoc_STRVAR(rindex__doc__, 11795 "S.rindex(sub[, start[, end]]) -> int\n\ 11796\n\ 11797Like S.rfind() but raise ValueError when the substring is not found."); 11798 11799static PyObject * 11800unicode_rindex(PyObject *self, PyObject *args) 11801{ 11802 PyUnicodeObject *substring; 11803 Py_ssize_t start; 11804 Py_ssize_t end; 11805 Py_ssize_t result; 11806 11807 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring, 11808 &start, &end)) 11809 return NULL; 11810 11811 if (PyUnicode_READY(self) == -1) 11812 return NULL; 11813 if (PyUnicode_READY(substring) == -1) 11814 return NULL; 11815 11816 result = any_find_slice(-1, 11817 self, (PyObject*)substring, start, end 11818 ); 11819 11820 Py_DECREF(substring); 11821 11822 if (result == -2) 11823 return NULL; 11824 11825 if (result < 0) { 11826 PyErr_SetString(PyExc_ValueError, "substring not found"); 11827 return NULL; 11828 } 11829 11830 return PyLong_FromSsize_t(result); 11831} 11832 11833PyDoc_STRVAR(rjust__doc__, 11834 "S.rjust(width[, fillchar]) -> str\n\ 11835\n\ 11836Return S right-justified in a string of length width. 
Padding is\n\ 11837done using the specified fill character (default is a space)."); 11838 11839static PyObject * 11840unicode_rjust(PyObject *self, PyObject *args) 11841{ 11842 Py_ssize_t width; 11843 Py_UCS4 fillchar = ' '; 11844 11845 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar)) 11846 return NULL; 11847 11848 if (PyUnicode_READY(self) == -1) 11849 return NULL; 11850 11851 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) { 11852 Py_INCREF(self); 11853 return (PyObject*) self; 11854 } 11855 11856 return (PyObject*) pad(self, width - _PyUnicode_LENGTH(self), 0, fillchar); 11857} 11858 11859PyObject * 11860PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit) 11861{ 11862 PyObject *result; 11863 11864 s = PyUnicode_FromObject(s); 11865 if (s == NULL) 11866 return NULL; 11867 if (sep != NULL) { 11868 sep = PyUnicode_FromObject(sep); 11869 if (sep == NULL) { 11870 Py_DECREF(s); 11871 return NULL; 11872 } 11873 } 11874 11875 result = split(s, sep, maxsplit); 11876 11877 Py_DECREF(s); 11878 Py_XDECREF(sep); 11879 return result; 11880} 11881 11882PyDoc_STRVAR(split__doc__, 11883 "S.split([sep[, maxsplit]]) -> list of strings\n\ 11884\n\ 11885Return a list of the words in S, using sep as the\n\ 11886delimiter string. If maxsplit is given, at most maxsplit\n\ 11887splits are done. 
If sep is not specified or is None, any\n\ 11888whitespace string is a separator and empty strings are\n\ 11889removed from the result."); 11890 11891static PyObject* 11892unicode_split(PyObject *self, PyObject *args) 11893{ 11894 PyObject *substring = Py_None; 11895 Py_ssize_t maxcount = -1; 11896 11897 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount)) 11898 return NULL; 11899 11900 if (substring == Py_None) 11901 return split(self, NULL, maxcount); 11902 else if (PyUnicode_Check(substring)) 11903 return split(self, substring, maxcount); 11904 else 11905 return PyUnicode_Split((PyObject *)self, substring, maxcount); 11906} 11907 11908PyObject * 11909PyUnicode_Partition(PyObject *str_in, PyObject *sep_in) 11910{ 11911 PyObject* str_obj; 11912 PyObject* sep_obj; 11913 PyObject* out; 11914 int kind1, kind2, kind; 11915 void *buf1 = NULL, *buf2 = NULL; 11916 Py_ssize_t len1, len2; 11917 11918 str_obj = PyUnicode_FromObject(str_in); 11919 if (!str_obj || PyUnicode_READY(str_obj) == -1) 11920 return NULL; 11921 sep_obj = PyUnicode_FromObject(sep_in); 11922 if (!sep_obj || PyUnicode_READY(sep_obj) == -1) { 11923 Py_DECREF(str_obj); 11924 return NULL; 11925 } 11926 11927 kind1 = PyUnicode_KIND(str_obj); 11928 kind2 = PyUnicode_KIND(sep_obj); 11929 kind = Py_MAX(kind1, kind2); 11930 buf1 = PyUnicode_DATA(str_obj); 11931 if (kind1 != kind) 11932 buf1 = _PyUnicode_AsKind(str_obj, kind); 11933 if (!buf1) 11934 goto onError; 11935 buf2 = PyUnicode_DATA(sep_obj); 11936 if (kind2 != kind) 11937 buf2 = _PyUnicode_AsKind(sep_obj, kind); 11938 if (!buf2) 11939 goto onError; 11940 len1 = PyUnicode_GET_LENGTH(str_obj); 11941 len2 = PyUnicode_GET_LENGTH(sep_obj); 11942 11943 switch(PyUnicode_KIND(str_obj)) { 11944 case PyUnicode_1BYTE_KIND: 11945 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj)) 11946 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2); 11947 else 11948 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2); 11949 
break; 11950 case PyUnicode_2BYTE_KIND: 11951 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2); 11952 break; 11953 case PyUnicode_4BYTE_KIND: 11954 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2); 11955 break; 11956 default: 11957 assert(0); 11958 out = 0; 11959 } 11960 11961 Py_DECREF(sep_obj); 11962 Py_DECREF(str_obj); 11963 if (kind1 != kind) 11964 PyMem_Free(buf1); 11965 if (kind2 != kind) 11966 PyMem_Free(buf2); 11967 11968 return out; 11969 onError: 11970 Py_DECREF(sep_obj); 11971 Py_DECREF(str_obj); 11972 if (kind1 != kind && buf1) 11973 PyMem_Free(buf1); 11974 if (kind2 != kind && buf2) 11975 PyMem_Free(buf2); 11976 return NULL; 11977} 11978 11979 11980PyObject * 11981PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in) 11982{ 11983 PyObject* str_obj; 11984 PyObject* sep_obj; 11985 PyObject* out; 11986 int kind1, kind2, kind; 11987 void *buf1 = NULL, *buf2 = NULL; 11988 Py_ssize_t len1, len2; 11989 11990 str_obj = PyUnicode_FromObject(str_in); 11991 if (!str_obj) 11992 return NULL; 11993 sep_obj = PyUnicode_FromObject(sep_in); 11994 if (!sep_obj) { 11995 Py_DECREF(str_obj); 11996 return NULL; 11997 } 11998 11999 kind1 = PyUnicode_KIND(str_in); 12000 kind2 = PyUnicode_KIND(sep_obj); 12001 kind = Py_MAX(kind1, kind2); 12002 buf1 = PyUnicode_DATA(str_in); 12003 if (kind1 != kind) 12004 buf1 = _PyUnicode_AsKind(str_in, kind); 12005 if (!buf1) 12006 goto onError; 12007 buf2 = PyUnicode_DATA(sep_obj); 12008 if (kind2 != kind) 12009 buf2 = _PyUnicode_AsKind(sep_obj, kind); 12010 if (!buf2) 12011 goto onError; 12012 len1 = PyUnicode_GET_LENGTH(str_obj); 12013 len2 = PyUnicode_GET_LENGTH(sep_obj); 12014 12015 switch(PyUnicode_KIND(str_in)) { 12016 case PyUnicode_1BYTE_KIND: 12017 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj)) 12018 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2); 12019 else 12020 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2); 12021 break; 12022 case 
PyUnicode_2BYTE_KIND: 12023 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2); 12024 break; 12025 case PyUnicode_4BYTE_KIND: 12026 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2); 12027 break; 12028 default: 12029 assert(0); 12030 out = 0; 12031 } 12032 12033 Py_DECREF(sep_obj); 12034 Py_DECREF(str_obj); 12035 if (kind1 != kind) 12036 PyMem_Free(buf1); 12037 if (kind2 != kind) 12038 PyMem_Free(buf2); 12039 12040 return out; 12041 onError: 12042 Py_DECREF(sep_obj); 12043 Py_DECREF(str_obj); 12044 if (kind1 != kind && buf1) 12045 PyMem_Free(buf1); 12046 if (kind2 != kind && buf2) 12047 PyMem_Free(buf2); 12048 return NULL; 12049} 12050 12051PyDoc_STRVAR(partition__doc__, 12052 "S.partition(sep) -> (head, sep, tail)\n\ 12053\n\ 12054Search for the separator sep in S, and return the part before it,\n\ 12055the separator itself, and the part after it. If the separator is not\n\ 12056found, return S and two empty strings."); 12057 12058static PyObject* 12059unicode_partition(PyObject *self, PyObject *separator) 12060{ 12061 return PyUnicode_Partition(self, separator); 12062} 12063 12064PyDoc_STRVAR(rpartition__doc__, 12065 "S.rpartition(sep) -> (head, sep, tail)\n\ 12066\n\ 12067Search for the separator sep in S, starting at the end of S, and return\n\ 12068the part before it, the separator itself, and the part after it. 
If the\n\ 12069separator is not found, return two empty strings and S."); 12070 12071static PyObject* 12072unicode_rpartition(PyObject *self, PyObject *separator) 12073{ 12074 return PyUnicode_RPartition(self, separator); 12075} 12076 12077PyObject * 12078PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit) 12079{ 12080 PyObject *result; 12081 12082 s = PyUnicode_FromObject(s); 12083 if (s == NULL) 12084 return NULL; 12085 if (sep != NULL) { 12086 sep = PyUnicode_FromObject(sep); 12087 if (sep == NULL) { 12088 Py_DECREF(s); 12089 return NULL; 12090 } 12091 } 12092 12093 result = rsplit(s, sep, maxsplit); 12094 12095 Py_DECREF(s); 12096 Py_XDECREF(sep); 12097 return result; 12098} 12099 12100PyDoc_STRVAR(rsplit__doc__, 12101 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\ 12102\n\ 12103Return a list of the words in S, using sep as the\n\ 12104delimiter string, starting at the end of the string and\n\ 12105working to the front. If maxsplit is given, at most maxsplit\n\ 12106splits are done. 
If sep is not specified, any whitespace string\n\ 12107is a separator."); 12108 12109static PyObject* 12110unicode_rsplit(PyObject *self, PyObject *args) 12111{ 12112 PyObject *substring = Py_None; 12113 Py_ssize_t maxcount = -1; 12114 12115 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount)) 12116 return NULL; 12117 12118 if (substring == Py_None) 12119 return rsplit(self, NULL, maxcount); 12120 else if (PyUnicode_Check(substring)) 12121 return rsplit(self, substring, maxcount); 12122 else 12123 return PyUnicode_RSplit(self, substring, maxcount); 12124} 12125 12126PyDoc_STRVAR(splitlines__doc__, 12127 "S.splitlines([keepends]) -> list of strings\n\ 12128\n\ 12129Return a list of the lines in S, breaking at line boundaries.\n\ 12130Line breaks are not included in the resulting list unless keepends\n\ 12131is given and true."); 12132 12133static PyObject* 12134unicode_splitlines(PyUnicodeObject *self, PyObject *args, PyObject *kwds) 12135{ 12136 static char *kwlist[] = {"keepends", 0}; 12137 int keepends = 0; 12138 12139 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines", 12140 kwlist, &keepends)) 12141 return NULL; 12142 12143 return PyUnicode_Splitlines((PyObject *)self, keepends); 12144} 12145 12146static 12147PyObject *unicode_str(PyObject *self) 12148{ 12149 if (PyUnicode_CheckExact(self)) { 12150 Py_INCREF(self); 12151 return self; 12152 } else 12153 /* Subtype -- return genuine unicode string with the same value. 
*/ 12154 return PyUnicode_Copy(self); 12155} 12156 12157PyDoc_STRVAR(swapcase__doc__, 12158 "S.swapcase() -> str\n\ 12159\n\ 12160Return a copy of S with uppercase characters converted to lowercase\n\ 12161and vice versa."); 12162 12163static PyObject* 12164unicode_swapcase(PyObject *self) 12165{ 12166 return fixup(self, fixswapcase); 12167} 12168 12169PyDoc_STRVAR(maketrans__doc__, 12170 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\ 12171\n\ 12172Return a translation table usable for str.translate().\n\ 12173If there is only one argument, it must be a dictionary mapping Unicode\n\ 12174ordinals (integers) or characters to Unicode ordinals, strings or None.\n\ 12175Character keys will be then converted to ordinals.\n\ 12176If there are two arguments, they must be strings of equal length, and\n\ 12177in the resulting dictionary, each character in x will be mapped to the\n\ 12178character at the same position in y. If there is a third argument, it\n\ 12179must be a string, whose characters will be mapped to None in the result."); 12180 12181static PyObject* 12182unicode_maketrans(PyUnicodeObject *null, PyObject *args) 12183{ 12184 PyObject *x, *y = NULL, *z = NULL; 12185 PyObject *new = NULL, *key, *value; 12186 Py_ssize_t i = 0; 12187 int res; 12188 12189 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z)) 12190 return NULL; 12191 new = PyDict_New(); 12192 if (!new) 12193 return NULL; 12194 if (y != NULL) { 12195 int x_kind, y_kind, z_kind; 12196 void *x_data, *y_data, *z_data; 12197 12198 /* x must be a string too, of equal length */ 12199 if (!PyUnicode_Check(x)) { 12200 PyErr_SetString(PyExc_TypeError, "first maketrans argument must " 12201 "be a string if there is a second argument"); 12202 goto err; 12203 } 12204 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) { 12205 PyErr_SetString(PyExc_ValueError, "the first two maketrans " 12206 "arguments must have equal length"); 12207 goto err; 12208 } 12209 /* create entries for translating 
chars in x to those in y */ 12210 x_kind = PyUnicode_KIND(x); 12211 y_kind = PyUnicode_KIND(y); 12212 x_data = PyUnicode_DATA(x); 12213 y_data = PyUnicode_DATA(y); 12214 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) { 12215 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i)); 12216 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i)); 12217 if (!key || !value) 12218 goto err; 12219 res = PyDict_SetItem(new, key, value); 12220 Py_DECREF(key); 12221 Py_DECREF(value); 12222 if (res < 0) 12223 goto err; 12224 } 12225 /* create entries for deleting chars in z */ 12226 if (z != NULL) { 12227 z_kind = PyUnicode_KIND(z); 12228 z_data = PyUnicode_DATA(z); 12229 for (i = 0; i < PyUnicode_GET_SIZE(z); i++) { 12230 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i)); 12231 if (!key) 12232 goto err; 12233 res = PyDict_SetItem(new, key, Py_None); 12234 Py_DECREF(key); 12235 if (res < 0) 12236 goto err; 12237 } 12238 } 12239 } else { 12240 int kind; 12241 void *data; 12242 12243 /* x must be a dict */ 12244 if (!PyDict_CheckExact(x)) { 12245 PyErr_SetString(PyExc_TypeError, "if you give only one argument " 12246 "to maketrans it must be a dict"); 12247 goto err; 12248 } 12249 /* copy entries into the new dict, converting string keys to int keys */ 12250 while (PyDict_Next(x, &i, &key, &value)) { 12251 if (PyUnicode_Check(key)) { 12252 /* convert string keys to integer keys */ 12253 PyObject *newkey; 12254 if (PyUnicode_GET_SIZE(key) != 1) { 12255 PyErr_SetString(PyExc_ValueError, "string keys in translate " 12256 "table must be of length 1"); 12257 goto err; 12258 } 12259 kind = PyUnicode_KIND(key); 12260 data = PyUnicode_DATA(key); 12261 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0)); 12262 if (!newkey) 12263 goto err; 12264 res = PyDict_SetItem(new, newkey, value); 12265 Py_DECREF(newkey); 12266 if (res < 0) 12267 goto err; 12268 } else if (PyLong_Check(key)) { 12269 /* just keep integer keys */ 12270 if (PyDict_SetItem(new, key, value) < 0) 12271 goto 
err; 12272 } else { 12273 PyErr_SetString(PyExc_TypeError, "keys in translate table must " 12274 "be strings or integers"); 12275 goto err; 12276 } 12277 } 12278 } 12279 return new; 12280 err: 12281 Py_DECREF(new); 12282 return NULL; 12283} 12284 12285PyDoc_STRVAR(translate__doc__, 12286 "S.translate(table) -> str\n\ 12287\n\ 12288Return a copy of the string S, where all characters have been mapped\n\ 12289through the given translation table, which must be a mapping of\n\ 12290Unicode ordinals to Unicode ordinals, strings, or None.\n\ 12291Unmapped characters are left untouched. Characters mapped to None\n\ 12292are deleted."); 12293 12294static PyObject* 12295unicode_translate(PyObject *self, PyObject *table) 12296{ 12297 return _PyUnicode_TranslateCharmap(self, table, "ignore"); 12298} 12299 12300PyDoc_STRVAR(upper__doc__, 12301 "S.upper() -> str\n\ 12302\n\ 12303Return a copy of S converted to uppercase."); 12304 12305static PyObject* 12306unicode_upper(PyObject *self) 12307{ 12308 return fixup(self, fixupper); 12309} 12310 12311PyDoc_STRVAR(zfill__doc__, 12312 "S.zfill(width) -> str\n\ 12313\n\ 12314Pad a numeric string S with zeros on the left, to fill a field\n\ 12315of the specified width. 
The string S is never truncated."); 12316 12317static PyObject * 12318unicode_zfill(PyObject *self, PyObject *args) 12319{ 12320 Py_ssize_t fill; 12321 PyObject *u; 12322 Py_ssize_t width; 12323 int kind; 12324 void *data; 12325 Py_UCS4 chr; 12326 12327 if (PyUnicode_READY(self) == -1) 12328 return NULL; 12329 12330 if (!PyArg_ParseTuple(args, "n:zfill", &width)) 12331 return NULL; 12332 12333 if (PyUnicode_GET_LENGTH(self) >= width) { 12334 if (PyUnicode_CheckExact(self)) { 12335 Py_INCREF(self); 12336 return (PyObject*) self; 12337 } 12338 else 12339 return PyUnicode_Copy((PyObject*)self); 12340 } 12341 12342 fill = width - _PyUnicode_LENGTH(self); 12343 12344 u = pad(self, fill, 0, '0'); 12345 12346 if (u == NULL) 12347 return NULL; 12348 12349 kind = PyUnicode_KIND(u); 12350 data = PyUnicode_DATA(u); 12351 chr = PyUnicode_READ(kind, data, fill); 12352 12353 if (chr == '+' || chr == '-') { 12354 /* move sign to beginning of string */ 12355 PyUnicode_WRITE(kind, data, 0, chr); 12356 PyUnicode_WRITE(kind, data, fill, '0'); 12357 } 12358 12359 assert(_PyUnicode_CheckConsistency(u, 1)); 12360 return (PyObject*) u; 12361} 12362 12363#if 0 12364static PyObject * 12365unicode__decimal2ascii(PyObject *self) 12366{ 12367 return PyUnicode_TransformDecimalAndSpaceToASCII(self); 12368} 12369#endif 12370 12371PyDoc_STRVAR(startswith__doc__, 12372 "S.startswith(prefix[, start[, end]]) -> bool\n\ 12373\n\ 12374Return True if S starts with the specified prefix, False otherwise.\n\ 12375With optional start, test S beginning at that position.\n\ 12376With optional end, stop comparing S at that position.\n\ 12377prefix can also be a tuple of strings to try."); 12378 12379static PyObject * 12380unicode_startswith(PyUnicodeObject *self, 12381 PyObject *args) 12382{ 12383 PyObject *subobj; 12384 PyUnicodeObject *substring; 12385 Py_ssize_t start = 0; 12386 Py_ssize_t end = PY_SSIZE_T_MAX; 12387 int result; 12388 12389 if (!stringlib_parse_args_finds("startswith", args, &subobj, 
&start, &end)) 12390 return NULL; 12391 if (PyTuple_Check(subobj)) { 12392 Py_ssize_t i; 12393 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) { 12394 substring = (PyUnicodeObject *)PyUnicode_FromObject( 12395 PyTuple_GET_ITEM(subobj, i)); 12396 if (substring == NULL) 12397 return NULL; 12398 result = tailmatch(self, substring, start, end, -1); 12399 Py_DECREF(substring); 12400 if (result) { 12401 Py_RETURN_TRUE; 12402 } 12403 } 12404 /* nothing matched */ 12405 Py_RETURN_FALSE; 12406 } 12407 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj); 12408 if (substring == NULL) { 12409 if (PyErr_ExceptionMatches(PyExc_TypeError)) 12410 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or " 12411 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name); 12412 return NULL; 12413 } 12414 result = tailmatch(self, substring, start, end, -1); 12415 Py_DECREF(substring); 12416 return PyBool_FromLong(result); 12417} 12418 12419 12420PyDoc_STRVAR(endswith__doc__, 12421 "S.endswith(suffix[, start[, end]]) -> bool\n\ 12422\n\ 12423Return True if S ends with the specified suffix, False otherwise.\n\ 12424With optional start, test S beginning at that position.\n\ 12425With optional end, stop comparing S at that position.\n\ 12426suffix can also be a tuple of strings to try."); 12427 12428static PyObject * 12429unicode_endswith(PyUnicodeObject *self, 12430 PyObject *args) 12431{ 12432 PyObject *subobj; 12433 PyUnicodeObject *substring; 12434 Py_ssize_t start = 0; 12435 Py_ssize_t end = PY_SSIZE_T_MAX; 12436 int result; 12437 12438 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end)) 12439 return NULL; 12440 if (PyTuple_Check(subobj)) { 12441 Py_ssize_t i; 12442 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) { 12443 substring = (PyUnicodeObject *)PyUnicode_FromObject( 12444 PyTuple_GET_ITEM(subobj, i)); 12445 if (substring == NULL) 12446 return NULL; 12447 result = tailmatch(self, substring, start, end, +1); 12448 Py_DECREF(substring); 12449 if 
(result) { 12450 Py_RETURN_TRUE; 12451 } 12452 } 12453 Py_RETURN_FALSE; 12454 } 12455 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj); 12456 if (substring == NULL) { 12457 if (PyErr_ExceptionMatches(PyExc_TypeError)) 12458 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or " 12459 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name); 12460 return NULL; 12461 } 12462 result = tailmatch(self, substring, start, end, +1); 12463 Py_DECREF(substring); 12464 return PyBool_FromLong(result); 12465} 12466 12467#include "stringlib/unicode_format.h" 12468 12469PyDoc_STRVAR(format__doc__, 12470 "S.format(*args, **kwargs) -> str\n\ 12471\n\ 12472Return a formatted version of S, using substitutions from args and kwargs.\n\ 12473The substitutions are identified by braces ('{' and '}')."); 12474 12475PyDoc_STRVAR(format_map__doc__, 12476 "S.format_map(mapping) -> str\n\ 12477\n\ 12478Return a formatted version of S, using substitutions from mapping.\n\ 12479The substitutions are identified by braces ('{' and '}')."); 12480 12481static PyObject * 12482unicode__format__(PyObject* self, PyObject* args) 12483{ 12484 PyObject *format_spec, *out; 12485 12486 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec)) 12487 return NULL; 12488 12489 out = _PyUnicode_FormatAdvanced(self, format_spec, 0, 12490 PyUnicode_GET_LENGTH(format_spec)); 12491 return out; 12492} 12493 12494PyDoc_STRVAR(p_format__doc__, 12495 "S.__format__(format_spec) -> str\n\ 12496\n\ 12497Return a formatted version of S as described by format_spec."); 12498 12499static PyObject * 12500unicode__sizeof__(PyUnicodeObject *v) 12501{ 12502 Py_ssize_t size; 12503 12504 /* If it's a compact object, account for base structure + 12505 character data. 
*/ 12506 if (PyUnicode_IS_COMPACT_ASCII(v)) 12507 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1; 12508 else if (PyUnicode_IS_COMPACT(v)) 12509 size = sizeof(PyCompactUnicodeObject) + 12510 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_KIND(v); 12511 else { 12512 /* If it is a two-block object, account for base object, and 12513 for character block if present. */ 12514 size = sizeof(PyUnicodeObject); 12515 if (_PyUnicode_DATA_ANY(v)) 12516 size += (PyUnicode_GET_LENGTH(v) + 1) * 12517 PyUnicode_KIND(v); 12518 } 12519 /* If the wstr pointer is present, account for it unless it is shared 12520 with the data pointer. Check if the data is not shared. */ 12521 if (_PyUnicode_HAS_WSTR_MEMORY(v)) 12522 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t); 12523 if (_PyUnicode_HAS_UTF8_MEMORY(v)) 12524 size += PyUnicode_UTF8_LENGTH(v) + 1; 12525 12526 return PyLong_FromSsize_t(size); 12527} 12528 12529PyDoc_STRVAR(sizeof__doc__, 12530 "S.__sizeof__() -> size of S in memory, in bytes"); 12531 12532static PyObject * 12533unicode_getnewargs(PyObject *v) 12534{ 12535 PyObject *copy = PyUnicode_Copy(v); 12536 if (!copy) 12537 return NULL; 12538 return Py_BuildValue("(N)", copy); 12539} 12540 12541static PyMethodDef unicode_methods[] = { 12542 12543 /* Order is according to common usage: often used methods should 12544 appear first, since lookup is done sequentially. 
*/

    {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
    {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
    {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
    {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
    {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
    {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
    {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
    {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
    {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
    {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
    {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
    {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
    {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
    {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
    {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
    {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
    {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
    {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
    {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
    {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
    {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
    {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
    {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
    {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
    {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
    {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
    {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
    {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
    {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
    {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
    {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
    {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
    {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
    {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
    {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
    {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
    {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
    {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
    {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
    {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
    {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
    {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
    {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
    {"maketrans", (PyCFunction) unicode_maketrans,
     METH_VARARGS | METH_STATIC, maketrans__doc__},
    {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
#if 0
    {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
#endif

#if 0
    /* These methods are just used for debugging the implementation. */
    {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
#endif

    {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
    {NULL, NULL}
};

/* nb_remainder slot: `v % w`.  Formats only when the left operand is a
   str; otherwise NotImplemented is returned. */
static PyObject *
unicode_mod(PyObject *v, PyObject *w)
{
    if (PyUnicode_Check(v))
        return PyUnicode_Format(v, w);
    Py_RETURN_NOTIMPLEMENTED;
}

static PyNumberMethods unicode_as_number = {
    0,              /*nb_add*/
    0,              /*nb_subtract*/
    0,              /*nb_multiply*/
    unicode_mod,            /*nb_remainder*/
};

static PySequenceMethods unicode_as_sequence = {
    (lenfunc) unicode_length,       /* sq_length */
    PyUnicode_Concat,           /* sq_concat */
    (ssizeargfunc) unicode_repeat,  /* sq_repeat */
    (ssizeargfunc) unicode_getitem,     /* sq_item */
    0,                  /* sq_slice */
    0,                  /* sq_ass_item */
    0,                  /* sq_ass_slice */
    PyUnicode_Contains,         /* sq_contains */
};

/* mp_subscript slot: `self[item]`.

   - index objects: a negative index is offset by the length and the
     lookup is delegated to unicode_getitem();
   - slices: empty result, whole-string shortcut (exact str only) and
     step == 1 (PyUnicode_Substring) are special-cased; any other step is
     copied character by character into a new string;
   - anything else raises TypeError. */
static PyObject*
unicode_subscript(PyUnicodeObject* self, PyObject* item)
{
    Py_ssize_t start, stop, step, slicelength, cur, i;
    PyObject *result;
    void *src_data, *dest_data;
    int src_kind, dest_kind;
    Py_UCS4 ch, max_char, kind_limit;

    if (PyUnicode_READY(self) == -1)
        return NULL;

    if (PyIndex_Check(item)) {
        Py_ssize_t index = PyNumber_AsSsize_t(item, PyExc_IndexError);
        if (index == -1 && PyErr_Occurred())
            return NULL;
        if (index < 0)
            index += PyUnicode_GET_LENGTH(self);
        return unicode_getitem((PyObject*)self, index);
    }

    if (!PySlice_Check(item)) {
        PyErr_SetString(PyExc_TypeError, "string indices must be integers");
        return NULL;
    }

    if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
                             &start, &stop, &step, &slicelength) < 0) {
        return NULL;
    }

    if (slicelength <= 0)
        return PyUnicode_New(0, 0);

    if (start == 0 && step == 1 &&
        slicelength == PyUnicode_GET_LENGTH(self) &&
        PyUnicode_CheckExact(self)) {
        Py_INCREF(self);
        return (PyObject *)self;
    }

    if (step == 1) {
        return PyUnicode_Substring((PyObject*)self,
                                   start, start + slicelength);
    }

    /* General case */
    src_kind = PyUnicode_KIND(self);
    src_data = PyUnicode_DATA(self);
    kind_limit = kind_maxchar_limit(src_kind);

    /* First pass: find the largest selected character so the result can
       be allocated for it; stop early once kind_limit is reached. */
    max_char = 0;
    for (cur = start, i = 0; i < slicelength; cur += step, i++) {
        ch = PyUnicode_READ(src_kind, src_data, cur);
        if (ch > max_char) {
            max_char = ch;
            if (max_char >= kind_limit)
                break;
        }
    }

    result = PyUnicode_New(slicelength, max_char);
    if (result == NULL)
        return NULL;
    dest_kind = PyUnicode_KIND(result);
    dest_data = PyUnicode_DATA(result);

    /* Second pass: copy the selected characters. */
    for (cur = start, i = 0; i < slicelength; cur += step, i++) {
        ch = PyUnicode_READ(src_kind, src_data, cur);
        PyUnicode_WRITE(dest_kind, dest_data, i, ch);
    }
    assert(_PyUnicode_CheckConsistency(result, 1));
    return result;
}

static PyMappingMethods unicode_as_mapping = {
    (lenfunc)unicode_length,        /* mp_length */
    (binaryfunc)unicode_subscript,  /* mp_subscript */
    (objobjargproc)0,           /* mp_ass_subscript */
};


/* Helpers for PyUnicode_Format() */

/* Fetch the next format argument and advance *p_argidx.

   With a negative `arglen`, `args` itself is returned; otherwise the item
   at the current index is fetched with PyTuple_GetItem().  Once the index
   has reached `arglen`, TypeError is raised and NULL returned. */
static PyObject *
getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
{
    Py_ssize_t argidx = *p_argidx;

    if (argidx >= arglen) {
        PyErr_SetString(PyExc_TypeError,
                        "not enough arguments for format string");
        return NULL;
    }

    (*p_argidx)++;
    if (arglen < 0)
        return args;
    return PyTuple_GetItem(args, argidx);
}

/* Returns a new reference to a PyUnicode object, or NULL on failure.
*/ 12724 12725static PyObject * 12726formatfloat(PyObject *v, int flags, int prec, int type) 12727{ 12728 char *p; 12729 PyObject *result; 12730 double x; 12731 12732 x = PyFloat_AsDouble(v); 12733 if (x == -1.0 && PyErr_Occurred()) 12734 return NULL; 12735 12736 if (prec < 0) 12737 prec = 6; 12738 12739 p = PyOS_double_to_string(x, type, prec, 12740 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL); 12741 if (p == NULL) 12742 return NULL; 12743 result = PyUnicode_DecodeASCII(p, strlen(p), NULL); 12744 PyMem_Free(p); 12745 return result; 12746} 12747 12748static PyObject* 12749formatlong(PyObject *val, int flags, int prec, int type) 12750{ 12751 char *buf; 12752 int len; 12753 PyObject *str; /* temporary string object. */ 12754 PyObject *result; 12755 12756 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len); 12757 if (!str) 12758 return NULL; 12759 result = PyUnicode_DecodeASCII(buf, len, NULL); 12760 Py_DECREF(str); 12761 return result; 12762} 12763 12764static Py_UCS4 12765formatchar(PyObject *v) 12766{ 12767 /* presume that the buffer is at least 3 characters long */ 12768 if (PyUnicode_Check(v)) { 12769 if (PyUnicode_GET_LENGTH(v) == 1) { 12770 return PyUnicode_READ_CHAR(v, 0); 12771 } 12772 goto onError; 12773 } 12774 else { 12775 /* Integer input truncated to a character */ 12776 long x; 12777 x = PyLong_AsLong(v); 12778 if (x == -1 && PyErr_Occurred()) 12779 goto onError; 12780 12781 if (x < 0 || x > 0x10ffff) { 12782 PyErr_SetString(PyExc_OverflowError, 12783 "%c arg not in range(0x110000)"); 12784 return (Py_UCS4) -1; 12785 } 12786 12787 return (Py_UCS4) x; 12788 } 12789 12790 onError: 12791 PyErr_SetString(PyExc_TypeError, 12792 "%c requires int or char"); 12793 return (Py_UCS4) -1; 12794} 12795 12796static int 12797repeat_accumulate(_PyAccu *acc, PyObject *obj, Py_ssize_t count) 12798{ 12799 int r; 12800 assert(count > 0); 12801 assert(PyUnicode_Check(obj)); 12802 if (count > 5) { 12803 PyObject *repeated = unicode_repeat((PyUnicodeObject *) obj, 
count); 12804 if (repeated == NULL) 12805 return -1; 12806 r = _PyAccu_Accumulate(acc, repeated); 12807 Py_DECREF(repeated); 12808 return r; 12809 } 12810 else { 12811 do { 12812 if (_PyAccu_Accumulate(acc, obj)) 12813 return -1; 12814 } while (--count); 12815 return 0; 12816 } 12817} 12818 12819PyObject * 12820PyUnicode_Format(PyObject *format, PyObject *args) 12821{ 12822 void *fmt; 12823 int fmtkind; 12824 PyObject *result; 12825 int kind; 12826 int r; 12827 Py_ssize_t fmtcnt, fmtpos, arglen, argidx; 12828 int args_owned = 0; 12829 PyObject *dict = NULL; 12830 PyObject *temp = NULL; 12831 PyObject *second = NULL; 12832 PyUnicodeObject *uformat; 12833 _PyAccu acc; 12834 static PyObject *plus, *minus, *blank, *zero, *percent; 12835 12836 if (!plus && !(plus = get_latin1_char('+'))) 12837 return NULL; 12838 if (!minus && !(minus = get_latin1_char('-'))) 12839 return NULL; 12840 if (!blank && !(blank = get_latin1_char(' '))) 12841 return NULL; 12842 if (!zero && !(zero = get_latin1_char('0'))) 12843 return NULL; 12844 if (!percent && !(percent = get_latin1_char('%'))) 12845 return NULL; 12846 12847 if (format == NULL || args == NULL) { 12848 PyErr_BadInternalCall(); 12849 return NULL; 12850 } 12851 uformat = (PyUnicodeObject*)PyUnicode_FromObject(format); 12852 if (uformat == NULL || PyUnicode_READY(uformat) == -1) 12853 return NULL; 12854 if (_PyAccu_Init(&acc)) 12855 goto onError; 12856 fmt = PyUnicode_DATA(uformat); 12857 fmtkind = PyUnicode_KIND(uformat); 12858 fmtcnt = PyUnicode_GET_LENGTH(uformat); 12859 fmtpos = 0; 12860 12861 if (PyTuple_Check(args)) { 12862 arglen = PyTuple_Size(args); 12863 argidx = 0; 12864 } 12865 else { 12866 arglen = -1; 12867 argidx = -2; 12868 } 12869 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) && 12870 !PyUnicode_Check(args)) 12871 dict = args; 12872 12873 while (--fmtcnt >= 0) { 12874 if (PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') { 12875 PyObject *nonfmt; 12876 Py_ssize_t nonfmtpos; 12877 nonfmtpos = fmtpos++; 12878 
while (fmtcnt >= 0 && 12879 PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') { 12880 fmtpos++; 12881 fmtcnt--; 12882 } 12883 nonfmt = PyUnicode_Substring((PyObject *) uformat, nonfmtpos, fmtpos); 12884 if (nonfmt == NULL) 12885 goto onError; 12886 r = _PyAccu_Accumulate(&acc, nonfmt); 12887 Py_DECREF(nonfmt); 12888 if (r) 12889 goto onError; 12890 } 12891 else { 12892 /* Got a format specifier */ 12893 int flags = 0; 12894 Py_ssize_t width = -1; 12895 int prec = -1; 12896 Py_UCS4 c = '\0'; 12897 Py_UCS4 fill, sign; 12898 int isnumok; 12899 PyObject *v = NULL; 12900 void *pbuf = NULL; 12901 Py_ssize_t pindex, len; 12902 PyObject *signobj = NULL, *fillobj = NULL; 12903 12904 fmtpos++; 12905 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(') { 12906 Py_ssize_t keystart; 12907 Py_ssize_t keylen; 12908 PyObject *key; 12909 int pcount = 1; 12910 12911 if (dict == NULL) { 12912 PyErr_SetString(PyExc_TypeError, 12913 "format requires a mapping"); 12914 goto onError; 12915 } 12916 ++fmtpos; 12917 --fmtcnt; 12918 keystart = fmtpos; 12919 /* Skip over balanced parentheses */ 12920 while (pcount > 0 && --fmtcnt >= 0) { 12921 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == ')') 12922 --pcount; 12923 else if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(') 12924 ++pcount; 12925 fmtpos++; 12926 } 12927 keylen = fmtpos - keystart - 1; 12928 if (fmtcnt < 0 || pcount > 0) { 12929 PyErr_SetString(PyExc_ValueError, 12930 "incomplete format key"); 12931 goto onError; 12932 } 12933 key = PyUnicode_Substring((PyObject*)uformat, 12934 keystart, keystart + keylen); 12935 if (key == NULL) 12936 goto onError; 12937 if (args_owned) { 12938 Py_DECREF(args); 12939 args_owned = 0; 12940 } 12941 args = PyObject_GetItem(dict, key); 12942 Py_DECREF(key); 12943 if (args == NULL) { 12944 goto onError; 12945 } 12946 args_owned = 1; 12947 arglen = -1; 12948 argidx = -2; 12949 } 12950 while (--fmtcnt >= 0) { 12951 switch (c = PyUnicode_READ(fmtkind, fmt, fmtpos++)) { 12952 case '-': flags |= F_LJUST; continue; 12953 
case '+': flags |= F_SIGN; continue; 12954 case ' ': flags |= F_BLANK; continue; 12955 case '#': flags |= F_ALT; continue; 12956 case '0': flags |= F_ZERO; continue; 12957 } 12958 break; 12959 } 12960 if (c == '*') { 12961 v = getnextarg(args, arglen, &argidx); 12962 if (v == NULL) 12963 goto onError; 12964 if (!PyLong_Check(v)) { 12965 PyErr_SetString(PyExc_TypeError, 12966 "* wants int"); 12967 goto onError; 12968 } 12969 width = PyLong_AsLong(v); 12970 if (width == -1 && PyErr_Occurred()) 12971 goto onError; 12972 if (width < 0) { 12973 flags |= F_LJUST; 12974 width = -width; 12975 } 12976 if (--fmtcnt >= 0) 12977 c = PyUnicode_READ(fmtkind, fmt, fmtpos++); 12978 } 12979 else if (c >= '0' && c <= '9') { 12980 width = c - '0'; 12981 while (--fmtcnt >= 0) { 12982 c = PyUnicode_READ(fmtkind, fmt, fmtpos++); 12983 if (c < '0' || c > '9') 12984 break; 12985 if ((width*10) / 10 != width) { 12986 PyErr_SetString(PyExc_ValueError, 12987 "width too big"); 12988 goto onError; 12989 } 12990 width = width*10 + (c - '0'); 12991 } 12992 } 12993 if (c == '.') { 12994 prec = 0; 12995 if (--fmtcnt >= 0) 12996 c = PyUnicode_READ(fmtkind, fmt, fmtpos++); 12997 if (c == '*') { 12998 v = getnextarg(args, arglen, &argidx); 12999 if (v == NULL) 13000 goto onError; 13001 if (!PyLong_Check(v)) { 13002 PyErr_SetString(PyExc_TypeError, 13003 "* wants int"); 13004 goto onError; 13005 } 13006 prec = PyLong_AsLong(v); 13007 if (prec == -1 && PyErr_Occurred()) 13008 goto onError; 13009 if (prec < 0) 13010 prec = 0; 13011 if (--fmtcnt >= 0) 13012 c = PyUnicode_READ(fmtkind, fmt, fmtpos++); 13013 } 13014 else if (c >= '0' && c <= '9') { 13015 prec = c - '0'; 13016 while (--fmtcnt >= 0) { 13017 c = PyUnicode_READ(fmtkind, fmt, fmtpos++); 13018 if (c < '0' || c > '9') 13019 break; 13020 if ((prec*10) / 10 != prec) { 13021 PyErr_SetString(PyExc_ValueError, 13022 "prec too big"); 13023 goto onError; 13024 } 13025 prec = prec*10 + (c - '0'); 13026 } 13027 } 13028 } /* prec */ 13029 if (fmtcnt >= 0) 
{ 13030 if (c == 'h' || c == 'l' || c == 'L') { 13031 if (--fmtcnt >= 0) 13032 c = PyUnicode_READ(fmtkind, fmt, fmtpos++); 13033 } 13034 } 13035 if (fmtcnt < 0) { 13036 PyErr_SetString(PyExc_ValueError, 13037 "incomplete format"); 13038 goto onError; 13039 } 13040 if (c != '%') { 13041 v = getnextarg(args, arglen, &argidx); 13042 if (v == NULL) 13043 goto onError; 13044 } 13045 sign = 0; 13046 fill = ' '; 13047 fillobj = blank; 13048 switch (c) { 13049 13050 case '%': 13051 _PyAccu_Accumulate(&acc, percent); 13052 continue; 13053 13054 case 's': 13055 case 'r': 13056 case 'a': 13057 if (PyUnicode_CheckExact(v) && c == 's') { 13058 temp = v; 13059 Py_INCREF(temp); 13060 } 13061 else { 13062 if (c == 's') 13063 temp = PyObject_Str(v); 13064 else if (c == 'r') 13065 temp = PyObject_Repr(v); 13066 else 13067 temp = PyObject_ASCII(v); 13068 if (temp == NULL) 13069 goto onError; 13070 if (PyUnicode_Check(temp)) 13071 /* nothing to do */; 13072 else { 13073 Py_DECREF(temp); 13074 PyErr_SetString(PyExc_TypeError, 13075 "%s argument has non-string str()"); 13076 goto onError; 13077 } 13078 } 13079 if (PyUnicode_READY(temp) == -1) { 13080 Py_CLEAR(temp); 13081 goto onError; 13082 } 13083 pbuf = PyUnicode_DATA(temp); 13084 kind = PyUnicode_KIND(temp); 13085 len = PyUnicode_GET_LENGTH(temp); 13086 if (prec >= 0 && len > prec) 13087 len = prec; 13088 break; 13089 13090 case 'i': 13091 case 'd': 13092 case 'u': 13093 case 'o': 13094 case 'x': 13095 case 'X': 13096 isnumok = 0; 13097 if (PyNumber_Check(v)) { 13098 PyObject *iobj=NULL; 13099 13100 if (PyLong_Check(v)) { 13101 iobj = v; 13102 Py_INCREF(iobj); 13103 } 13104 else { 13105 iobj = PyNumber_Long(v); 13106 } 13107 if (iobj!=NULL) { 13108 if (PyLong_Check(iobj)) { 13109 isnumok = 1; 13110 temp = formatlong(iobj, flags, prec, (c == 'i'? 
'd': c)); 13111 Py_DECREF(iobj); 13112 if (!temp) 13113 goto onError; 13114 if (PyUnicode_READY(temp) == -1) { 13115 Py_CLEAR(temp); 13116 goto onError; 13117 } 13118 pbuf = PyUnicode_DATA(temp); 13119 kind = PyUnicode_KIND(temp); 13120 len = PyUnicode_GET_LENGTH(temp); 13121 sign = 1; 13122 } 13123 else { 13124 Py_DECREF(iobj); 13125 } 13126 } 13127 } 13128 if (!isnumok) { 13129 PyErr_Format(PyExc_TypeError, 13130 "%%%c format: a number is required, " 13131 "not %.200s", (char)c, Py_TYPE(v)->tp_name); 13132 goto onError; 13133 } 13134 if (flags & F_ZERO) { 13135 fill = '0'; 13136 fillobj = zero; 13137 } 13138 break; 13139 13140 case 'e': 13141 case 'E': 13142 case 'f': 13143 case 'F': 13144 case 'g': 13145 case 'G': 13146 temp = formatfloat(v, flags, prec, c); 13147 if (!temp) 13148 goto onError; 13149 if (PyUnicode_READY(temp) == -1) { 13150 Py_CLEAR(temp); 13151 goto onError; 13152 } 13153 pbuf = PyUnicode_DATA(temp); 13154 kind = PyUnicode_KIND(temp); 13155 len = PyUnicode_GET_LENGTH(temp); 13156 sign = 1; 13157 if (flags & F_ZERO) { 13158 fill = '0'; 13159 fillobj = zero; 13160 } 13161 break; 13162 13163 case 'c': 13164 { 13165 Py_UCS4 ch = formatchar(v); 13166 if (ch == (Py_UCS4) -1) 13167 goto onError; 13168 temp = _PyUnicode_FromUCS4(&ch, 1); 13169 if (temp == NULL) 13170 goto onError; 13171 pbuf = PyUnicode_DATA(temp); 13172 kind = PyUnicode_KIND(temp); 13173 len = PyUnicode_GET_LENGTH(temp); 13174 break; 13175 } 13176 13177 default: 13178 PyErr_Format(PyExc_ValueError, 13179 "unsupported format character '%c' (0x%x) " 13180 "at index %zd", 13181 (31<=c && c<=126) ? (char)c : '?', 13182 (int)c, 13183 fmtpos - 1); 13184 goto onError; 13185 } 13186 /* pbuf is initialized here. 
*/ 13187 pindex = 0; 13188 if (sign) { 13189 if (PyUnicode_READ(kind, pbuf, pindex) == '-') { 13190 signobj = minus; 13191 len--; 13192 pindex++; 13193 } 13194 else if (PyUnicode_READ(kind, pbuf, pindex) == '+') { 13195 signobj = plus; 13196 len--; 13197 pindex++; 13198 } 13199 else if (flags & F_SIGN) 13200 signobj = plus; 13201 else if (flags & F_BLANK) 13202 signobj = blank; 13203 else 13204 sign = 0; 13205 } 13206 if (width < len) 13207 width = len; 13208 if (sign) { 13209 if (fill != ' ') { 13210 assert(signobj != NULL); 13211 if (_PyAccu_Accumulate(&acc, signobj)) 13212 goto onError; 13213 } 13214 if (width > len) 13215 width--; 13216 } 13217 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) { 13218 assert(PyUnicode_READ(kind, pbuf, pindex) == '0'); 13219 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == c); 13220 if (fill != ' ') { 13221 second = get_latin1_char( 13222 PyUnicode_READ(kind, pbuf, pindex + 1)); 13223 pindex += 2; 13224 if (second == NULL || 13225 _PyAccu_Accumulate(&acc, zero) || 13226 _PyAccu_Accumulate(&acc, second)) 13227 goto onError; 13228 Py_CLEAR(second); 13229 } 13230 width -= 2; 13231 if (width < 0) 13232 width = 0; 13233 len -= 2; 13234 } 13235 if (width > len && !(flags & F_LJUST)) { 13236 assert(fillobj != NULL); 13237 if (repeat_accumulate(&acc, fillobj, width - len)) 13238 goto onError; 13239 width = len; 13240 } 13241 if (fill == ' ') { 13242 if (sign) { 13243 assert(signobj != NULL); 13244 if (_PyAccu_Accumulate(&acc, signobj)) 13245 goto onError; 13246 } 13247 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) { 13248 assert(PyUnicode_READ(kind, pbuf, pindex) == '0'); 13249 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c); 13250 second = get_latin1_char( 13251 PyUnicode_READ(kind, pbuf, pindex + 1)); 13252 pindex += 2; 13253 if (second == NULL || 13254 _PyAccu_Accumulate(&acc, zero) || 13255 _PyAccu_Accumulate(&acc, second)) 13256 goto onError; 13257 Py_CLEAR(second); 13258 } 13259 } 13260 /* Copy all 
characters, preserving len */ 13261 if (temp != NULL) { 13262 assert(pbuf == PyUnicode_DATA(temp)); 13263 v = PyUnicode_Substring(temp, pindex, pindex + len); 13264 } 13265 else { 13266 const char *p = (const char *) pbuf; 13267 assert(pbuf != NULL); 13268 p += kind * pindex; 13269 v = PyUnicode_FromKindAndData(kind, p, len); 13270 } 13271 if (v == NULL) 13272 goto onError; 13273 r = _PyAccu_Accumulate(&acc, v); 13274 Py_DECREF(v); 13275 if (r) 13276 goto onError; 13277 if (width > len && repeat_accumulate(&acc, blank, width - len)) 13278 goto onError; 13279 if (dict && (argidx < arglen) && c != '%') { 13280 PyErr_SetString(PyExc_TypeError, 13281 "not all arguments converted during string formatting"); 13282 goto onError; 13283 } 13284 Py_CLEAR(temp); 13285 } /* '%' */ 13286 } /* until end */ 13287 if (argidx < arglen && !dict) { 13288 PyErr_SetString(PyExc_TypeError, 13289 "not all arguments converted during string formatting"); 13290 goto onError; 13291 } 13292 13293 result = _PyAccu_Finish(&acc); 13294 if (args_owned) { 13295 Py_DECREF(args); 13296 } 13297 Py_DECREF(uformat); 13298 Py_XDECREF(temp); 13299 Py_XDECREF(second); 13300 return (PyObject *)result; 13301 13302 onError: 13303 Py_DECREF(uformat); 13304 Py_XDECREF(temp); 13305 Py_XDECREF(second); 13306 _PyAccu_Destroy(&acc); 13307 if (args_owned) { 13308 Py_DECREF(args); 13309 } 13310 return NULL; 13311} 13312 13313static PyObject * 13314unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds); 13315 13316static PyObject * 13317unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds) 13318{ 13319 PyObject *x = NULL; 13320 static char *kwlist[] = {"object", "encoding", "errors", 0}; 13321 char *encoding = NULL; 13322 char *errors = NULL; 13323 13324 if (type != &PyUnicode_Type) 13325 return unicode_subtype_new(type, args, kwds); 13326 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str", 13327 kwlist, &x, &encoding, &errors)) 13328 return NULL; 13329 if (x == NULL) 13330 return 
(PyObject *)PyUnicode_New(0, 0); 13331 if (encoding == NULL && errors == NULL) 13332 return PyObject_Str(x); 13333 else 13334 return PyUnicode_FromEncodedObject(x, encoding, errors); 13335} 13336 13337static PyObject * 13338unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds) 13339{ 13340 PyUnicodeObject *unicode, *self; 13341 Py_ssize_t length, char_size; 13342 int share_wstr, share_utf8; 13343 unsigned int kind; 13344 void *data; 13345 13346 assert(PyType_IsSubtype(type, &PyUnicode_Type)); 13347 13348 unicode = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds); 13349 if (unicode == NULL) 13350 return NULL; 13351 assert(_PyUnicode_CHECK(unicode)); 13352 if (PyUnicode_READY(unicode)) 13353 return NULL; 13354 13355 self = (PyUnicodeObject *) type->tp_alloc(type, 0); 13356 if (self == NULL) { 13357 Py_DECREF(unicode); 13358 return NULL; 13359 } 13360 kind = PyUnicode_KIND(unicode); 13361 length = PyUnicode_GET_LENGTH(unicode); 13362 13363 _PyUnicode_LENGTH(self) = length; 13364#ifdef Py_DEBUG 13365 _PyUnicode_HASH(self) = -1; 13366#else 13367 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode); 13368#endif 13369 _PyUnicode_STATE(self).interned = 0; 13370 _PyUnicode_STATE(self).kind = kind; 13371 _PyUnicode_STATE(self).compact = 0; 13372 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii; 13373 _PyUnicode_STATE(self).ready = 1; 13374 _PyUnicode_WSTR(self) = NULL; 13375 _PyUnicode_UTF8_LENGTH(self) = 0; 13376 _PyUnicode_UTF8(self) = NULL; 13377 _PyUnicode_WSTR_LENGTH(self) = 0; 13378 _PyUnicode_DATA_ANY(self) = NULL; 13379 13380 share_utf8 = 0; 13381 share_wstr = 0; 13382 if (kind == PyUnicode_1BYTE_KIND) { 13383 char_size = 1; 13384 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128) 13385 share_utf8 = 1; 13386 } 13387 else if (kind == PyUnicode_2BYTE_KIND) { 13388 char_size = 2; 13389 if (sizeof(wchar_t) == 2) 13390 share_wstr = 1; 13391 } 13392 else { 13393 assert(kind == PyUnicode_4BYTE_KIND); 13394 char_size = 4; 13395 if 
(sizeof(wchar_t) == 4) 13396 share_wstr = 1; 13397 } 13398 13399 /* Ensure we won't overflow the length. */ 13400 if (length > (PY_SSIZE_T_MAX / char_size - 1)) { 13401 PyErr_NoMemory(); 13402 goto onError; 13403 } 13404 data = PyObject_MALLOC((length + 1) * char_size); 13405 if (data == NULL) { 13406 PyErr_NoMemory(); 13407 goto onError; 13408 } 13409 13410 _PyUnicode_DATA_ANY(self) = data; 13411 if (share_utf8) { 13412 _PyUnicode_UTF8_LENGTH(self) = length; 13413 _PyUnicode_UTF8(self) = data; 13414 } 13415 if (share_wstr) { 13416 _PyUnicode_WSTR_LENGTH(self) = length; 13417 _PyUnicode_WSTR(self) = (wchar_t *)data; 13418 } 13419 13420 Py_MEMCPY(data, PyUnicode_DATA(unicode), 13421 kind * (length + 1)); 13422 Py_DECREF(unicode); 13423 assert(_PyUnicode_CheckConsistency(self, 1)); 13424#ifdef Py_DEBUG 13425 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode); 13426#endif 13427 return (PyObject *)self; 13428 13429onError: 13430 Py_DECREF(unicode); 13431 Py_DECREF(self); 13432 return NULL; 13433} 13434 13435PyDoc_STRVAR(unicode_doc, 13436 "str(string[, encoding[, errors]]) -> str\n\ 13437\n\ 13438Create a new string object from the given encoded string.\n\ 13439encoding defaults to the current default string encoding.\n\ 13440errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'."); 13441 13442static PyObject *unicode_iter(PyObject *seq); 13443 13444PyTypeObject PyUnicode_Type = { 13445 PyVarObject_HEAD_INIT(&PyType_Type, 0) 13446 "str", /* tp_name */ 13447 sizeof(PyUnicodeObject), /* tp_size */ 13448 0, /* tp_itemsize */ 13449 /* Slots */ 13450 (destructor)unicode_dealloc, /* tp_dealloc */ 13451 0, /* tp_print */ 13452 0, /* tp_getattr */ 13453 0, /* tp_setattr */ 13454 0, /* tp_reserved */ 13455 unicode_repr, /* tp_repr */ 13456 &unicode_as_number, /* tp_as_number */ 13457 &unicode_as_sequence, /* tp_as_sequence */ 13458 &unicode_as_mapping, /* tp_as_mapping */ 13459 (hashfunc) unicode_hash, /* tp_hash*/ 13460 0, /* tp_call*/ 13461 (reprfunc) 
unicode_str, /* tp_str */ 13462 PyObject_GenericGetAttr, /* tp_getattro */ 13463 0, /* tp_setattro */ 13464 0, /* tp_as_buffer */ 13465 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE | 13466 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */ 13467 unicode_doc, /* tp_doc */ 13468 0, /* tp_traverse */ 13469 0, /* tp_clear */ 13470 PyUnicode_RichCompare, /* tp_richcompare */ 13471 0, /* tp_weaklistoffset */ 13472 unicode_iter, /* tp_iter */ 13473 0, /* tp_iternext */ 13474 unicode_methods, /* tp_methods */ 13475 0, /* tp_members */ 13476 0, /* tp_getset */ 13477 &PyBaseObject_Type, /* tp_base */ 13478 0, /* tp_dict */ 13479 0, /* tp_descr_get */ 13480 0, /* tp_descr_set */ 13481 0, /* tp_dictoffset */ 13482 0, /* tp_init */ 13483 0, /* tp_alloc */ 13484 unicode_new, /* tp_new */ 13485 PyObject_Del, /* tp_free */ 13486}; 13487 13488/* Initialize the Unicode implementation */ 13489 13490void _PyUnicode_Init(void) 13491{ 13492 int i; 13493 13494 /* XXX - move this array to unicodectype.c ? */ 13495 Py_UCS2 linebreak[] = { 13496 0x000A, /* LINE FEED */ 13497 0x000D, /* CARRIAGE RETURN */ 13498 0x001C, /* FILE SEPARATOR */ 13499 0x001D, /* GROUP SEPARATOR */ 13500 0x001E, /* RECORD SEPARATOR */ 13501 0x0085, /* NEXT LINE */ 13502 0x2028, /* LINE SEPARATOR */ 13503 0x2029, /* PARAGRAPH SEPARATOR */ 13504 }; 13505 13506 /* Init the implementation */ 13507 unicode_empty = PyUnicode_New(0, 0); 13508 assert(_PyUnicode_CheckConsistency(unicode_empty, 1)); 13509 if (!unicode_empty) 13510 Py_FatalError("Can't create empty string"); 13511 13512 for (i = 0; i < 256; i++) 13513 unicode_latin1[i] = NULL; 13514 if (PyType_Ready(&PyUnicode_Type) < 0) 13515 Py_FatalError("Can't initialize 'unicode'"); 13516 13517 /* initialize the linebreak bloom filter */ 13518 bloom_linebreak = make_bloom_mask( 13519 PyUnicode_2BYTE_KIND, linebreak, 13520 Py_ARRAY_LENGTH(linebreak)); 13521 13522 PyType_Ready(&EncodingMapType); 13523} 13524 13525/* Finalize the Unicode implementation */ 13526 13527int 
13528PyUnicode_ClearFreeList(void) 13529{ 13530 return 0; 13531} 13532 13533void 13534_PyUnicode_Fini(void) 13535{ 13536 int i; 13537 13538 Py_XDECREF(unicode_empty); 13539 unicode_empty = NULL; 13540 13541 for (i = 0; i < 256; i++) { 13542 if (unicode_latin1[i]) { 13543 Py_DECREF(unicode_latin1[i]); 13544 unicode_latin1[i] = NULL; 13545 } 13546 } 13547 _PyUnicode_ClearStaticStrings(); 13548 (void)PyUnicode_ClearFreeList(); 13549} 13550 13551void 13552PyUnicode_InternInPlace(PyObject **p) 13553{ 13554 register PyUnicodeObject *s = (PyUnicodeObject *)(*p); 13555 PyObject *t; 13556#ifdef Py_DEBUG 13557 assert(s != NULL); 13558 assert(_PyUnicode_CHECK(s)); 13559#else 13560 if (s == NULL || !PyUnicode_Check(s)) 13561 return; 13562#endif 13563 /* If it's a subclass, we don't really know what putting 13564 it in the interned dict might do. */ 13565 if (!PyUnicode_CheckExact(s)) 13566 return; 13567 if (PyUnicode_CHECK_INTERNED(s)) 13568 return; 13569 if (_PyUnicode_READY_REPLACE(p)) { 13570 assert(0 && "_PyUnicode_READY_REPLACE fail in PyUnicode_InternInPlace"); 13571 return; 13572 } 13573 s = (PyUnicodeObject *)(*p); 13574 if (interned == NULL) { 13575 interned = PyDict_New(); 13576 if (interned == NULL) { 13577 PyErr_Clear(); /* Don't leave an exception */ 13578 return; 13579 } 13580 } 13581 /* It might be that the GetItem call fails even 13582 though the key is present in the dictionary, 13583 namely when this happens during a stack overflow. 
*/ 13584 Py_ALLOW_RECURSION 13585 t = PyDict_GetItem(interned, (PyObject *)s); 13586 Py_END_ALLOW_RECURSION 13587 13588 if (t) { 13589 Py_INCREF(t); 13590 Py_DECREF(*p); 13591 *p = t; 13592 return; 13593 } 13594 13595 PyThreadState_GET()->recursion_critical = 1; 13596 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) { 13597 PyErr_Clear(); 13598 PyThreadState_GET()->recursion_critical = 0; 13599 return; 13600 } 13601 PyThreadState_GET()->recursion_critical = 0; 13602 /* The two references in interned are not counted by refcnt. 13603 The deallocator will take care of this */ 13604 Py_REFCNT(s) -= 2; 13605 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL; 13606} 13607 13608void 13609PyUnicode_InternImmortal(PyObject **p) 13610{ 13611 PyUnicodeObject *u = (PyUnicodeObject *)*p; 13612 13613 PyUnicode_InternInPlace(p); 13614 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) { 13615 _PyUnicode_STATE(u).interned = SSTATE_INTERNED_IMMORTAL; 13616 Py_INCREF(*p); 13617 } 13618} 13619 13620PyObject * 13621PyUnicode_InternFromString(const char *cp) 13622{ 13623 PyObject *s = PyUnicode_FromString(cp); 13624 if (s == NULL) 13625 return NULL; 13626 PyUnicode_InternInPlace(&s); 13627 return s; 13628} 13629 13630void 13631_Py_ReleaseInternedUnicodeStrings(void) 13632{ 13633 PyObject *keys; 13634 PyUnicodeObject *s; 13635 Py_ssize_t i, n; 13636 Py_ssize_t immortal_size = 0, mortal_size = 0; 13637 13638 if (interned == NULL || !PyDict_Check(interned)) 13639 return; 13640 keys = PyDict_Keys(interned); 13641 if (keys == NULL || !PyList_Check(keys)) { 13642 PyErr_Clear(); 13643 return; 13644 } 13645 13646 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak 13647 detector, interned unicode strings are not forcibly deallocated; 13648 rather, we give them their stolen references back, and then clear 13649 and DECREF the interned dict. 
*/ 13650 13651 n = PyList_GET_SIZE(keys); 13652 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n", 13653 n); 13654 for (i = 0; i < n; i++) { 13655 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i); 13656 if (PyUnicode_READY(s) == -1) { 13657 assert(0 && "could not ready string"); 13658 fprintf(stderr, "could not ready string\n"); 13659 } 13660 switch (PyUnicode_CHECK_INTERNED(s)) { 13661 case SSTATE_NOT_INTERNED: 13662 /* XXX Shouldn't happen */ 13663 break; 13664 case SSTATE_INTERNED_IMMORTAL: 13665 Py_REFCNT(s) += 1; 13666 immortal_size += PyUnicode_GET_LENGTH(s); 13667 break; 13668 case SSTATE_INTERNED_MORTAL: 13669 Py_REFCNT(s) += 2; 13670 mortal_size += PyUnicode_GET_LENGTH(s); 13671 break; 13672 default: 13673 Py_FatalError("Inconsistent interned string state."); 13674 } 13675 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED; 13676 } 13677 fprintf(stderr, "total size of all interned strings: " 13678 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d " 13679 "mortal/immortal\n", mortal_size, immortal_size); 13680 Py_DECREF(keys); 13681 PyDict_Clear(interned); 13682 Py_DECREF(interned); 13683 interned = NULL; 13684} 13685 13686 13687/********************* Unicode Iterator **************************/ 13688 13689typedef struct { 13690 PyObject_HEAD 13691 Py_ssize_t it_index; 13692 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */ 13693} unicodeiterobject; 13694 13695static void 13696unicodeiter_dealloc(unicodeiterobject *it) 13697{ 13698 _PyObject_GC_UNTRACK(it); 13699 Py_XDECREF(it->it_seq); 13700 PyObject_GC_Del(it); 13701} 13702 13703static int 13704unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg) 13705{ 13706 Py_VISIT(it->it_seq); 13707 return 0; 13708} 13709 13710static PyObject * 13711unicodeiter_next(unicodeiterobject *it) 13712{ 13713 PyUnicodeObject *seq; 13714 PyObject *item; 13715 13716 assert(it != NULL); 13717 seq = it->it_seq; 13718 if (seq == NULL) 13719 return NULL; 13720 
assert(_PyUnicode_CHECK(seq)); 13721 13722 if (it->it_index < PyUnicode_GET_LENGTH(seq)) { 13723 int kind = PyUnicode_KIND(seq); 13724 void *data = PyUnicode_DATA(seq); 13725 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index); 13726 item = PyUnicode_FromOrdinal(chr); 13727 if (item != NULL) 13728 ++it->it_index; 13729 return item; 13730 } 13731 13732 Py_DECREF(seq); 13733 it->it_seq = NULL; 13734 return NULL; 13735} 13736 13737static PyObject * 13738unicodeiter_len(unicodeiterobject *it) 13739{ 13740 Py_ssize_t len = 0; 13741 if (it->it_seq) 13742 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index; 13743 return PyLong_FromSsize_t(len); 13744} 13745 13746PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it))."); 13747 13748static PyMethodDef unicodeiter_methods[] = { 13749 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS, 13750 length_hint_doc}, 13751 {NULL, NULL} /* sentinel */ 13752}; 13753 13754PyTypeObject PyUnicodeIter_Type = { 13755 PyVarObject_HEAD_INIT(&PyType_Type, 0) 13756 "str_iterator", /* tp_name */ 13757 sizeof(unicodeiterobject), /* tp_basicsize */ 13758 0, /* tp_itemsize */ 13759 /* methods */ 13760 (destructor)unicodeiter_dealloc, /* tp_dealloc */ 13761 0, /* tp_print */ 13762 0, /* tp_getattr */ 13763 0, /* tp_setattr */ 13764 0, /* tp_reserved */ 13765 0, /* tp_repr */ 13766 0, /* tp_as_number */ 13767 0, /* tp_as_sequence */ 13768 0, /* tp_as_mapping */ 13769 0, /* tp_hash */ 13770 0, /* tp_call */ 13771 0, /* tp_str */ 13772 PyObject_GenericGetAttr, /* tp_getattro */ 13773 0, /* tp_setattro */ 13774 0, /* tp_as_buffer */ 13775 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */ 13776 0, /* tp_doc */ 13777 (traverseproc)unicodeiter_traverse, /* tp_traverse */ 13778 0, /* tp_clear */ 13779 0, /* tp_richcompare */ 13780 0, /* tp_weaklistoffset */ 13781 PyObject_SelfIter, /* tp_iter */ 13782 (iternextfunc)unicodeiter_next, /* tp_iternext */ 13783 unicodeiter_methods, /* tp_methods */ 13784 0, 
13785}; 13786 13787static PyObject * 13788unicode_iter(PyObject *seq) 13789{ 13790 unicodeiterobject *it; 13791 13792 if (!PyUnicode_Check(seq)) { 13793 PyErr_BadInternalCall(); 13794 return NULL; 13795 } 13796 if (PyUnicode_READY(seq) == -1) 13797 return NULL; 13798 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type); 13799 if (it == NULL) 13800 return NULL; 13801 it->it_index = 0; 13802 Py_INCREF(seq); 13803 it->it_seq = (PyUnicodeObject *)seq; 13804 _PyObject_GC_TRACK(it); 13805 return (PyObject *)it; 13806} 13807 13808#define UNIOP(x) Py_UNICODE_##x 13809#define UNIOP_t Py_UNICODE 13810#include "uniops.h" 13811#undef UNIOP 13812#undef UNIOP_t 13813#define UNIOP(x) Py_UCS4_##x 13814#define UNIOP_t Py_UCS4 13815#include "uniops.h" 13816#undef UNIOP 13817#undef UNIOP_t 13818 13819Py_UNICODE* 13820PyUnicode_AsUnicodeCopy(PyObject *object) 13821{ 13822 PyUnicodeObject *unicode = (PyUnicodeObject *)object; 13823 Py_UNICODE *copy; 13824 Py_ssize_t size; 13825 13826 if (!PyUnicode_Check(unicode)) { 13827 PyErr_BadArgument(); 13828 return NULL; 13829 } 13830 /* Ensure we won't overflow the size. */ 13831 if (PyUnicode_GET_SIZE(unicode) > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) { 13832 PyErr_NoMemory(); 13833 return NULL; 13834 } 13835 size = PyUnicode_GET_SIZE(unicode) + 1; /* copy the nul character */ 13836 size *= sizeof(Py_UNICODE); 13837 copy = PyMem_Malloc(size); 13838 if (copy == NULL) { 13839 PyErr_NoMemory(); 13840 return NULL; 13841 } 13842 memcpy(copy, PyUnicode_AS_UNICODE(unicode), size); 13843 return copy; 13844} 13845 13846/* A _string module, to export formatter_parser and formatter_field_name_split 13847 to the string.Formatter class implemented in Python. 
*/ 13848 13849static PyMethodDef _string_methods[] = { 13850 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split, 13851 METH_O, PyDoc_STR("split the argument as a field name")}, 13852 {"formatter_parser", (PyCFunction) formatter_parser, 13853 METH_O, PyDoc_STR("parse the argument as a format string")}, 13854 {NULL, NULL} 13855}; 13856 13857static struct PyModuleDef _string_module = { 13858 PyModuleDef_HEAD_INIT, 13859 "_string", 13860 PyDoc_STR("string helper module"), 13861 0, 13862 _string_methods, 13863 NULL, 13864 NULL, 13865 NULL, 13866 NULL 13867}; 13868 13869PyMODINIT_FUNC 13870PyInit__string(void) 13871{ 13872 return PyModule_Create(&_string_module); 13873} 13874 13875 13876#ifdef __cplusplus 13877} 13878#endif 13879