unicodeobject.c revision 7597addbd4f56e6a3a8a595db404824c5f825c3a
1/* 2 3Unicode implementation based on original code by Fredrik Lundh, 4modified by Marc-Andre Lemburg <mal@lemburg.com>. 5 6Major speed upgrades to the method implementations at the Reykjavik 7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke. 8 9Copyright (c) Corporation for National Research Initiatives. 10 11-------------------------------------------------------------------- 12The original string type implementation is: 13 14 Copyright (c) 1999 by Secret Labs AB 15 Copyright (c) 1999 by Fredrik Lundh 16 17By obtaining, using, and/or copying this software and/or its 18associated documentation, you agree that you have read, understood, 19and will comply with the following terms and conditions: 20 21Permission to use, copy, modify, and distribute this software and its 22associated documentation for any purpose and without fee is hereby 23granted, provided that the above copyright notice appears in all 24copies, and that both that copyright notice and this permission notice 25appear in supporting documentation, and that the name of Secret Labs 26AB or the author not be used in advertising or publicity pertaining to 27distribution of the software without specific, written prior 28permission. 29 30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO 31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND 32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR 33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT 36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 37-------------------------------------------------------------------- 38 39*/ 40 41#define PY_SSIZE_T_CLEAN 42#include "Python.h" 43#include "ucnhash.h" 44 45#ifdef MS_WINDOWS 46#include <windows.h> 47#endif 48 49/* Limit for the Unicode object free list */ 50 51#define PyUnicode_MAXFREELIST 1024 52 53/* Limit for the Unicode object free list stay alive optimization. 54 55 The implementation will keep allocated Unicode memory intact for 56 all objects on the free list having a size less than this 57 limit. This reduces malloc() overhead for small Unicode objects. 58 59 At worst this will result in PyUnicode_MAXFREELIST * 60 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT + 61 malloc()-overhead) bytes of unused garbage. 62 63 Setting the limit to 0 effectively turns the feature off. 64 65 Note: This is an experimental feature ! If you get core dumps when 66 using Unicode objects, turn this feature off. 67 68*/ 69 70#define KEEPALIVE_SIZE_LIMIT 9 71 72/* Endianness switches; defaults to little endian */ 73 74#ifdef WORDS_BIGENDIAN 75# define BYTEORDER_IS_BIG_ENDIAN 76#else 77# define BYTEORDER_IS_LITTLE_ENDIAN 78#endif 79 80/* --- Globals ------------------------------------------------------------ 81 82 The globals are initialized by the _PyUnicode_Init() API and should 83 not be used before calling that API. 84 85*/ 86 87 88#ifdef __cplusplus 89extern "C" { 90#endif 91 92#ifdef Py_DEBUG 93# define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op) 94#else 95# define _PyUnicode_CHECK(op) PyUnicode_Check(op) 96#endif 97 98#define _PyUnicode_UTF8(op) \ 99 (((PyCompactUnicodeObject*)(op))->utf8) 100#define PyUnicode_UTF8(op) \ 101 (assert(_PyUnicode_CHECK(op)), \ 102 assert(PyUnicode_IS_READY(op)), \ 103 PyUnicode_IS_COMPACT_ASCII(op) ? \ 104 ((char*)((PyASCIIObject*)(op) + 1)) : \ 105 _PyUnicode_UTF8(op)) 106#define _PyUnicode_UTF8_LENGTH(op) \ 107 (((PyCompactUnicodeObject*)(op))->utf8_length) 108#define PyUnicode_UTF8_LENGTH(op) \ 109 (assert(_PyUnicode_CHECK(op)), \ 110 assert(PyUnicode_IS_READY(op)), \ 111 PyUnicode_IS_COMPACT_ASCII(op) ? \ 112 ((PyASCIIObject*)(op))->length : \ 113 _PyUnicode_UTF8_LENGTH(op)) 114#define _PyUnicode_WSTR(op) \ 115 (((PyASCIIObject*)(op))->wstr) 116#define _PyUnicode_WSTR_LENGTH(op) \ 117 (((PyCompactUnicodeObject*)(op))->wstr_length) 118#define _PyUnicode_LENGTH(op) \ 119 (((PyASCIIObject *)(op))->length) 120#define _PyUnicode_STATE(op) \ 121 (((PyASCIIObject *)(op))->state) 122#define _PyUnicode_HASH(op) \ 123 (((PyASCIIObject *)(op))->hash) 124#define _PyUnicode_KIND(op) \ 125 (assert(_PyUnicode_CHECK(op)), \ 126 ((PyASCIIObject *)(op))->state.kind) 127#define _PyUnicode_GET_LENGTH(op) \ 128 (assert(_PyUnicode_CHECK(op)), \ 129 ((PyASCIIObject *)(op))->length) 130#define _PyUnicode_DATA_ANY(op) \ 131 (((PyUnicodeObject*)(op))->data.any) 132 133#undef PyUnicode_READY 134#define PyUnicode_READY(op) \ 135 (assert(_PyUnicode_CHECK(op)), \ 136 (PyUnicode_IS_READY(op) ? \ 137 0 : \ 138 _PyUnicode_Ready((PyObject *)(op)))) 139 140#define _PyUnicode_READY_REPLACE(p_obj) \ 141 (assert(_PyUnicode_CHECK(*p_obj)), \ 142 (PyUnicode_IS_READY(*p_obj) ? \ 143 0 : _PyUnicode_ReadyReplace((PyObject **)(p_obj)))) 144 145#define _PyUnicode_SHARE_UTF8(op) \ 146 (assert(_PyUnicode_CHECK(op)), \ 147 assert(!PyUnicode_IS_COMPACT_ASCII(op)), \ 148 (_PyUnicode_UTF8(op) == PyUnicode_DATA(op))) 149#define _PyUnicode_SHARE_WSTR(op) \ 150 (assert(_PyUnicode_CHECK(op)), \ 151 (_PyUnicode_WSTR(unicode) == PyUnicode_DATA(op))) 152 153/* true if the Unicode object has an allocated UTF-8 memory block 154 (not shared with other data) */ 155#define _PyUnicode_HAS_UTF8_MEMORY(op) \ 156 (assert(_PyUnicode_CHECK(op)), \ 157 (!PyUnicode_IS_COMPACT_ASCII(op) \ 158 && _PyUnicode_UTF8(op) \ 159 && _PyUnicode_UTF8(op) != PyUnicode_DATA(op))) 160 161/* true if the Unicode object has an allocated wstr memory block 162 (not shared with other data) */ 163#define _PyUnicode_HAS_WSTR_MEMORY(op) \ 164 (assert(_PyUnicode_CHECK(op)), \ 165 (_PyUnicode_WSTR(op) && \ 166 (!PyUnicode_IS_READY(op) || \ 167 _PyUnicode_WSTR(op) != PyUnicode_DATA(op)))) 168 169/* Generic helper macro to convert characters of different types. 170 from_type and to_type have to be valid type names, begin and end 171 are pointers to the source characters which should be of type 172 "from_type *". to is a pointer of type "to_type *" and points to the 173 buffer where the result characters are written to. */ 174#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \ 175 do { \ 176 const from_type *iter_; to_type *to_; \ 177 for (iter_ = (begin), to_ = (to_type *)(to); \ 178 iter_ < (end); \ 179 ++iter_, ++to_) { \ 180 *to_ = (to_type)*iter_; \ 181 } \ 182 } while (0) 183 184/* The Unicode string has been modified: reset the hash */ 185#define _PyUnicode_DIRTY(op) do { _PyUnicode_HASH(op) = -1; } while (0) 186 187/* This dictionary holds all interned unicode strings. Note that references 188 to strings in this dictionary are *not* counted in the string's ob_refcnt. 189 When the interned string reaches a refcnt of 0 the string deallocation 190 function will delete the reference from this dictionary. 191 192 Another way to look at this is that to say that the actual reference 193 count of a string is: s->ob_refcnt + (s->state ? 2 : 0) 194*/ 195static PyObject *interned; 196 197/* The empty Unicode object is shared to improve performance. */ 198static PyObject *unicode_empty; 199 200/* Single character Unicode strings in the Latin-1 range are being 201 shared as well. */ 202static PyObject *unicode_latin1[256]; 203 204/* Fast detection of the most frequent whitespace characters */ 205const unsigned char _Py_ascii_whitespace[] = { 206 0, 0, 0, 0, 0, 0, 0, 0, 207/* case 0x0009: * CHARACTER TABULATION */ 208/* case 0x000A: * LINE FEED */ 209/* case 0x000B: * LINE TABULATION */ 210/* case 0x000C: * FORM FEED */ 211/* case 0x000D: * CARRIAGE RETURN */ 212 0, 1, 1, 1, 1, 1, 0, 0, 213 0, 0, 0, 0, 0, 0, 0, 0, 214/* case 0x001C: * FILE SEPARATOR */ 215/* case 0x001D: * GROUP SEPARATOR */ 216/* case 0x001E: * RECORD SEPARATOR */ 217/* case 0x001F: * UNIT SEPARATOR */ 218 0, 0, 0, 0, 1, 1, 1, 1, 219/* case 0x0020: * SPACE */ 220 1, 0, 0, 0, 0, 0, 0, 0, 221 0, 0, 0, 0, 0, 0, 0, 0, 222 0, 0, 0, 0, 0, 0, 0, 0, 223 0, 0, 0, 0, 0, 0, 0, 0, 224 225 0, 0, 0, 0, 0, 0, 0, 0, 226 0, 0, 0, 0, 0, 0, 0, 0, 227 0, 0, 0, 0, 0, 0, 0, 0, 228 0, 0, 0, 0, 0, 0, 0, 0, 229 0, 0, 0, 0, 0, 0, 0, 0, 230 0, 0, 0, 0, 0, 0, 0, 0, 231 0, 0, 0, 0, 0, 0, 0, 0, 232 0, 0, 0, 0, 0, 0, 0, 0 233}; 234 235/* forward */ 236static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length); 237static PyObject* get_latin1_char(unsigned char ch); 238 239static PyObject * 240unicode_encode_call_errorhandler(const char *errors, 241 PyObject **errorHandler,const char *encoding, const char *reason, 242 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject, 243 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos); 244 245static void 246raise_encode_exception(PyObject **exceptionObject, 247 const char *encoding, 248 const Py_UNICODE *unicode, Py_ssize_t size, 249 Py_ssize_t startpos, Py_ssize_t endpos, 250 const char *reason); 251 252/* Same for linebreaks */ 253static unsigned char ascii_linebreak[] = { 254 0, 0, 0, 0, 0, 0, 0, 0, 255/* 0x000A, * LINE FEED */ 256/* 0x000B, * LINE TABULATION */ 257/* 0x000C, * FORM FEED */ 258/* 0x000D, * CARRIAGE RETURN */ 259 0, 0, 1, 1, 1, 1, 0, 0, 260 0, 0, 0, 0, 0, 0, 0, 0, 261/* 0x001C, * FILE SEPARATOR */ 262/* 0x001D, * GROUP SEPARATOR */ 263/* 0x001E, * RECORD SEPARATOR */ 264 0, 0, 0, 0, 1, 1, 1, 0, 265 0, 0, 0, 0, 0, 0, 0, 0, 266 0, 0, 0, 0, 0, 0, 0, 0, 267 0, 0, 0, 0, 0, 0, 0, 0, 268 0, 0, 0, 0, 0, 0, 0, 0, 269 270 0, 0, 0, 0, 0, 0, 0, 0, 271 0, 0, 0, 0, 0, 0, 0, 0, 272 0, 0, 0, 0, 0, 0, 0, 0, 273 0, 0, 0, 0, 0, 0, 0, 0, 274 0, 0, 0, 0, 0, 0, 0, 0, 275 0, 0, 0, 0, 0, 0, 0, 0, 276 0, 0, 0, 0, 0, 0, 0, 0, 277 0, 0, 0, 0, 0, 0, 0, 0 278}; 279 280/* The max unicode value is always 0x10FFFF while using the PEP-393 API. 281 This function is kept for backward compatibility with the old API. */ 282Py_UNICODE 283PyUnicode_GetMax(void) 284{ 285#ifdef Py_UNICODE_WIDE 286 return 0x10FFFF; 287#else 288 /* This is actually an illegal character, so it should 289 not be passed to unichr. */ 290 return 0xFFFF; 291#endif 292} 293 294#ifdef Py_DEBUG 295static int 296_PyUnicode_CheckConsistency(void *op) 297{ 298 PyASCIIObject *ascii; 299 unsigned int kind; 300 301 assert(PyUnicode_Check(op)); 302 303 ascii = (PyASCIIObject *)op; 304 kind = ascii->state.kind; 305 306 if (ascii->state.ascii == 1 && ascii->state.compact == 1) { 307 assert(kind == PyUnicode_1BYTE_KIND); 308 assert(ascii->state.ready == 1); 309 } 310 else { 311 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op; 312 void *data; 313 314 if (ascii->state.compact == 1) { 315 data = compact + 1; 316 assert(kind == PyUnicode_1BYTE_KIND 317 || kind == PyUnicode_2BYTE_KIND 318 || kind == PyUnicode_4BYTE_KIND); 319 assert(ascii->state.ascii == 0); 320 assert(ascii->state.ready == 1); 321 assert (compact->utf8 != data); 322 } else { 323 PyUnicodeObject *unicode = (PyUnicodeObject *)op; 324 325 data = unicode->data.any; 326 if (kind == PyUnicode_WCHAR_KIND) { 327 assert(ascii->state.compact == 0); 328 assert(ascii->state.ascii == 0); 329 assert(ascii->state.ready == 0); 330 assert(ascii->wstr != NULL); 331 assert(data == NULL); 332 assert(compact->utf8 == NULL); 333 assert(ascii->state.interned == SSTATE_NOT_INTERNED); 334 } 335 else { 336 assert(kind == PyUnicode_1BYTE_KIND 337 || kind == PyUnicode_2BYTE_KIND 338 || kind == PyUnicode_4BYTE_KIND); 339 assert(ascii->state.compact == 0); 340 assert(ascii->state.ready == 1); 341 assert(data != NULL); 342 if (ascii->state.ascii) { 343 assert (compact->utf8 == data); 344 assert (compact->utf8_length == ascii->length); 345 } 346 else 347 assert (compact->utf8 != data); 348 } 349 } 350 if (kind != PyUnicode_WCHAR_KIND) { 351 if ( 352#if SIZEOF_WCHAR_T == 2 353 kind == PyUnicode_2BYTE_KIND 354#else 355 kind == PyUnicode_4BYTE_KIND 356#endif 357 ) 358 { 359 assert(ascii->wstr == data); 360 assert(compact->wstr_length == ascii->length); 361 } else 362 assert(ascii->wstr != data); 363 } 364 365 if (compact->utf8 == NULL) 366 assert(compact->utf8_length == 0); 367 if (ascii->wstr == NULL) 368 assert(compact->wstr_length == 0); 369 } 370 return 1; 371} 372#else 373static int 374_PyUnicode_CheckConsistency(void *op) 375{ 376 return 1; 377} 378#endif 379 380/* --- Bloom Filters ----------------------------------------------------- */ 381 382/* stuff to implement simple "bloom filters" for Unicode characters. 383 to keep things simple, we use a single bitmask, using the least 5 384 bits from each unicode characters as the bit index. */ 385 386/* the linebreak mask is set up by Unicode_Init below */ 387 388#if LONG_BIT >= 128 389#define BLOOM_WIDTH 128 390#elif LONG_BIT >= 64 391#define BLOOM_WIDTH 64 392#elif LONG_BIT >= 32 393#define BLOOM_WIDTH 32 394#else 395#error "LONG_BIT is smaller than 32" 396#endif 397 398#define BLOOM_MASK unsigned long 399 400static BLOOM_MASK bloom_linebreak; 401 402#define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1))))) 403#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1))))) 404 405#define BLOOM_LINEBREAK(ch) \ 406 ((ch) < 128U ? ascii_linebreak[(ch)] : \ 407 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch))) 408 409Py_LOCAL_INLINE(BLOOM_MASK) 410make_bloom_mask(int kind, void* ptr, Py_ssize_t len) 411{ 412 /* calculate simple bloom-style bitmask for a given unicode string */ 413 414 BLOOM_MASK mask; 415 Py_ssize_t i; 416 417 mask = 0; 418 for (i = 0; i < len; i++) 419 BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i)); 420 421 return mask; 422} 423 424#define BLOOM_MEMBER(mask, chr, str) \ 425 (BLOOM(mask, chr) \ 426 && (PyUnicode_FindChar(str, chr, 0, PyUnicode_GET_LENGTH(str), 1) >= 0)) 427 428/* --- Unicode Object ----------------------------------------------------- */ 429 430static PyObject * 431fixup(PyUnicodeObject *self, Py_UCS4 (*fixfct)(PyUnicodeObject *s)); 432 433Py_LOCAL_INLINE(char *) findchar(void *s, int kind, 434 Py_ssize_t size, Py_UCS4 ch, 435 int direction) 436{ 437 /* like wcschr, but doesn't stop at NULL characters */ 438 Py_ssize_t i; 439 if (direction == 1) { 440 for(i = 0; i < size; i++) 441 if (PyUnicode_READ(kind, s, i) == ch) 442 return (char*)s + PyUnicode_KIND_SIZE(kind, i); 443 } 444 else { 445 for(i = size-1; i >= 0; i--) 446 if (PyUnicode_READ(kind, s, i) == ch) 447 return (char*)s + PyUnicode_KIND_SIZE(kind, i); 448 } 449 return NULL; 450} 451 452static PyObject* 453resize_compact(PyObject *unicode, Py_ssize_t length) 454{ 455 Py_ssize_t char_size; 456 Py_ssize_t struct_size; 457 Py_ssize_t new_size; 458 int share_wstr; 459 460 assert(PyUnicode_IS_READY(unicode)); 461 char_size = PyUnicode_CHARACTER_SIZE(unicode); 462 if (PyUnicode_IS_COMPACT_ASCII(unicode)) 463 struct_size = sizeof(PyASCIIObject); 464 else 465 struct_size = sizeof(PyCompactUnicodeObject); 466 share_wstr = _PyUnicode_SHARE_WSTR(unicode); 467 468 _Py_DEC_REFTOTAL; 469 _Py_ForgetReference(unicode); 470 471 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) { 472 PyErr_NoMemory(); 473 return NULL; 474 } 475 new_size = (struct_size + (length + 1) * char_size); 476 477 unicode = (PyObject *)PyObject_REALLOC((char *)unicode, new_size); 478 if (unicode == NULL) { 479 PyObject_Del(unicode); 480 PyErr_NoMemory(); 481 return NULL; 482 } 483 _Py_NewReference(unicode); 484 _PyUnicode_LENGTH(unicode) = length; 485 if (share_wstr) { 486 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode); 487 if (!PyUnicode_IS_COMPACT_ASCII(unicode)) 488 _PyUnicode_WSTR_LENGTH(unicode) = length; 489 } 490 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode), 491 length, 0); 492 return unicode; 493} 494 495static int 496resize_inplace(PyUnicodeObject *unicode, Py_ssize_t length) 497{ 498 wchar_t *wstr; 499 assert(!PyUnicode_IS_COMPACT(unicode)); 500 assert(Py_REFCNT(unicode) == 1); 501 502 _PyUnicode_DIRTY(unicode); 503 504 if (PyUnicode_IS_READY(unicode)) { 505 Py_ssize_t char_size; 506 Py_ssize_t new_size; 507 int share_wstr, share_utf8; 508 void *data; 509 510 data = _PyUnicode_DATA_ANY(unicode); 511 assert(data != NULL); 512 char_size = PyUnicode_CHARACTER_SIZE(unicode); 513 share_wstr = _PyUnicode_SHARE_WSTR(unicode); 514 share_utf8 = _PyUnicode_SHARE_UTF8(unicode); 515 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode)) 516 { 517 PyObject_DEL(_PyUnicode_UTF8(unicode)); 518 _PyUnicode_UTF8(unicode) = NULL; 519 _PyUnicode_UTF8_LENGTH(unicode) = 0; 520 } 521 522 if (length > (PY_SSIZE_T_MAX / char_size - 1)) { 523 PyErr_NoMemory(); 524 return -1; 525 } 526 new_size = (length + 1) * char_size; 527 528 data = (PyObject *)PyObject_REALLOC(data, new_size); 529 if (data == NULL) { 530 PyErr_NoMemory(); 531 return -1; 532 } 533 _PyUnicode_DATA_ANY(unicode) = data; 534 if (share_wstr) { 535 _PyUnicode_WSTR(unicode) = data; 536 _PyUnicode_WSTR_LENGTH(unicode) = length; 537 } 538 if (share_utf8) { 539 _PyUnicode_UTF8(unicode) = data; 540 _PyUnicode_UTF8_LENGTH(unicode) = length; 541 } 542 _PyUnicode_LENGTH(unicode) = length; 543 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0); 544 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) { 545 _PyUnicode_CheckConsistency(unicode); 546 return 0; 547 } 548 } 549 assert(_PyUnicode_WSTR(unicode) != NULL); 550 551 /* check for integer overflow */ 552 if (length > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) { 553 PyErr_NoMemory(); 554 return -1; 555 } 556 wstr = _PyUnicode_WSTR(unicode); 557 wstr = PyObject_REALLOC(wstr, sizeof(wchar_t) * (length + 1)); 558 if (!wstr) { 559 PyErr_NoMemory(); 560 return -1; 561 } 562 _PyUnicode_WSTR(unicode) = wstr; 563 _PyUnicode_WSTR(unicode)[length] = 0; 564 _PyUnicode_WSTR_LENGTH(unicode) = length; 565 _PyUnicode_CheckConsistency(unicode); 566 return 0; 567} 568 569static PyObject* 570resize_copy(PyObject *unicode, Py_ssize_t length) 571{ 572 Py_ssize_t copy_length; 573 if (PyUnicode_IS_COMPACT(unicode)) { 574 PyObject *copy; 575 assert(PyUnicode_IS_READY(unicode)); 576 577 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode)); 578 if (copy == NULL) 579 return NULL; 580 581 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode)); 582 if (PyUnicode_CopyCharacters(copy, 0, 583 unicode, 0, 584 copy_length) < 0) 585 { 586 Py_DECREF(copy); 587 return NULL; 588 } 589 return copy; 590 } 591 else { 592 PyUnicodeObject *w; 593 assert(_PyUnicode_WSTR(unicode) != NULL); 594 assert(_PyUnicode_DATA_ANY(unicode) == NULL); 595 w = _PyUnicode_New(length); 596 if (w == NULL) 597 return NULL; 598 copy_length = _PyUnicode_WSTR_LENGTH(unicode); 599 copy_length = Py_MIN(copy_length, length); 600 Py_UNICODE_COPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode), 601 copy_length); 602 return (PyObject*)w; 603 } 604} 605 606/* We allocate one more byte to make sure the string is 607 Ux0000 terminated; some code (e.g. new_identifier) 608 relies on that. 609 610 XXX This allocator could further be enhanced by assuring that the 611 free list never reduces its size below 1. 612 613*/ 614 615#ifdef Py_DEBUG 616int unicode_old_new_calls = 0; 617#endif 618 619static PyUnicodeObject * 620_PyUnicode_New(Py_ssize_t length) 621{ 622 register PyUnicodeObject *unicode; 623 size_t new_size; 624 625 /* Optimization for empty strings */ 626 if (length == 0 && unicode_empty != NULL) { 627 Py_INCREF(unicode_empty); 628 return (PyUnicodeObject*)unicode_empty; 629 } 630 631 /* Ensure we won't overflow the size. */ 632 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) { 633 return (PyUnicodeObject *)PyErr_NoMemory(); 634 } 635 if (length < 0) { 636 PyErr_SetString(PyExc_SystemError, 637 "Negative size passed to _PyUnicode_New"); 638 return NULL; 639 } 640 641#ifdef Py_DEBUG 642 ++unicode_old_new_calls; 643#endif 644 645 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type); 646 if (unicode == NULL) 647 return NULL; 648 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1); 649 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size); 650 if (!_PyUnicode_WSTR(unicode)) { 651 PyErr_NoMemory(); 652 goto onError; 653 } 654 655 /* Initialize the first element to guard against cases where 656 * the caller fails before initializing str -- unicode_resize() 657 * reads str[0], and the Keep-Alive optimization can keep memory 658 * allocated for str alive across a call to unicode_dealloc(unicode). 659 * We don't want unicode_resize to read uninitialized memory in 660 * that case. 661 */ 662 _PyUnicode_WSTR(unicode)[0] = 0; 663 _PyUnicode_WSTR(unicode)[length] = 0; 664 _PyUnicode_WSTR_LENGTH(unicode) = length; 665 _PyUnicode_HASH(unicode) = -1; 666 _PyUnicode_STATE(unicode).interned = 0; 667 _PyUnicode_STATE(unicode).kind = 0; 668 _PyUnicode_STATE(unicode).compact = 0; 669 _PyUnicode_STATE(unicode).ready = 0; 670 _PyUnicode_STATE(unicode).ascii = 0; 671 _PyUnicode_DATA_ANY(unicode) = NULL; 672 _PyUnicode_LENGTH(unicode) = 0; 673 _PyUnicode_UTF8(unicode) = NULL; 674 _PyUnicode_UTF8_LENGTH(unicode) = 0; 675 return unicode; 676 677 onError: 678 /* XXX UNREF/NEWREF interface should be more symmetrical */ 679 _Py_DEC_REFTOTAL; 680 _Py_ForgetReference((PyObject *)unicode); 681 PyObject_Del(unicode); 682 return NULL; 683} 684 685static const char* 686unicode_kind_name(PyObject *unicode) 687{ 688 /* don't check consistency: unicode_kind_name() is called from 689 _PyUnicode_Dump() */ 690 if (!PyUnicode_IS_COMPACT(unicode)) 691 { 692 if (!PyUnicode_IS_READY(unicode)) 693 return "wstr"; 694 switch(PyUnicode_KIND(unicode)) 695 { 696 case PyUnicode_1BYTE_KIND: 697 if (PyUnicode_IS_ASCII(unicode)) 698 return "legacy ascii"; 699 else 700 return "legacy latin1"; 701 case PyUnicode_2BYTE_KIND: 702 return "legacy UCS2"; 703 case PyUnicode_4BYTE_KIND: 704 return "legacy UCS4"; 705 default: 706 return "<legacy invalid kind>"; 707 } 708 } 709 assert(PyUnicode_IS_READY(unicode)); 710 switch(PyUnicode_KIND(unicode)) 711 { 712 case PyUnicode_1BYTE_KIND: 713 if (PyUnicode_IS_ASCII(unicode)) 714 return "ascii"; 715 else 716 return "latin1"; 717 case PyUnicode_2BYTE_KIND: 718 return "UCS2"; 719 case PyUnicode_4BYTE_KIND: 720 return "UCS4"; 721 default: 722 return "<invalid compact kind>"; 723 } 724} 725 726#ifdef Py_DEBUG 727int unicode_new_new_calls = 0; 728 729/* Functions wrapping macros for use in debugger */ 730char *_PyUnicode_utf8(void *unicode){ 731 return PyUnicode_UTF8(unicode); 732} 733 734void *_PyUnicode_compact_data(void *unicode) { 735 return _PyUnicode_COMPACT_DATA(unicode); 736} 737void *_PyUnicode_data(void *unicode){ 738 printf("obj %p\n", unicode); 739 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode)); 740 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode)); 741 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1))); 742 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1))); 743 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode)); 744 return PyUnicode_DATA(unicode); 745} 746 747void 748_PyUnicode_Dump(PyObject *op) 749{ 750 PyASCIIObject *ascii = (PyASCIIObject *)op; 751 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op; 752 PyUnicodeObject *unicode = (PyUnicodeObject *)op; 753 void *data; 754 printf("%s: len=%zu, ",unicode_kind_name(op), ascii->length); 755 if (ascii->state.compact) 756 data = (compact + 1); 757 else 758 data = unicode->data.any; 759 if (ascii->wstr == data) 760 printf("shared "); 761 printf("wstr=%p", ascii->wstr); 762 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) { 763 printf(" (%zu), ", compact->wstr_length); 764 if (!ascii->state.compact && compact->utf8 == unicode->data.any) 765 printf("shared "); 766 printf("utf8=%p (%zu)", compact->utf8, compact->utf8_length); 767 } 768 printf(", data=%p\n", data); 769} 770#endif 771 772PyObject * 773PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar) 774{ 775 PyObject *obj; 776 PyCompactUnicodeObject *unicode; 777 void *data; 778 int kind_state; 779 int is_sharing, is_ascii; 780 Py_ssize_t char_size; 781 Py_ssize_t struct_size; 782 783 /* Optimization for empty strings */ 784 if (size == 0 && unicode_empty != NULL) { 785 Py_INCREF(unicode_empty); 786 return unicode_empty; 787 } 788 789#ifdef Py_DEBUG 790 ++unicode_new_new_calls; 791#endif 792 793 is_ascii = 0; 794 is_sharing = 0; 795 struct_size = sizeof(PyCompactUnicodeObject); 796 if (maxchar < 128) { 797 kind_state = PyUnicode_1BYTE_KIND; 798 char_size = 1; 799 is_ascii = 1; 800 struct_size = sizeof(PyASCIIObject); 801 } 802 else if (maxchar < 256) { 803 kind_state = PyUnicode_1BYTE_KIND; 804 char_size = 1; 805 } 806 else if (maxchar < 65536) { 807 kind_state = PyUnicode_2BYTE_KIND; 808 char_size = 2; 809 if (sizeof(wchar_t) == 2) 810 is_sharing = 1; 811 } 812 else { 813 kind_state = PyUnicode_4BYTE_KIND; 814 char_size = 4; 815 if (sizeof(wchar_t) == 4) 816 is_sharing = 1; 817 } 818 819 /* Ensure we won't overflow the size. */ 820 if (size < 0) { 821 PyErr_SetString(PyExc_SystemError, 822 "Negative size passed to PyUnicode_New"); 823 return NULL; 824 } 825 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) 826 return PyErr_NoMemory(); 827 828 /* Duplicated allocation code from _PyObject_New() instead of a call to 829 * PyObject_New() so we are able to allocate space for the object and 830 * it's data buffer. 831 */ 832 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size); 833 if (obj == NULL) 834 return PyErr_NoMemory(); 835 obj = PyObject_INIT(obj, &PyUnicode_Type); 836 if (obj == NULL) 837 return NULL; 838 839 unicode = (PyCompactUnicodeObject *)obj; 840 if (is_ascii) 841 data = ((PyASCIIObject*)obj) + 1; 842 else 843 data = unicode + 1; 844 _PyUnicode_LENGTH(unicode) = size; 845 _PyUnicode_HASH(unicode) = -1; 846 _PyUnicode_STATE(unicode).interned = 0; 847 _PyUnicode_STATE(unicode).kind = kind_state; 848 _PyUnicode_STATE(unicode).compact = 1; 849 _PyUnicode_STATE(unicode).ready = 1; 850 _PyUnicode_STATE(unicode).ascii = is_ascii; 851 if (is_ascii) { 852 ((char*)data)[size] = 0; 853 _PyUnicode_WSTR(unicode) = NULL; 854 } 855 else if (kind_state == PyUnicode_1BYTE_KIND) { 856 ((char*)data)[size] = 0; 857 _PyUnicode_WSTR(unicode) = NULL; 858 _PyUnicode_WSTR_LENGTH(unicode) = 0; 859 unicode->utf8 = NULL; 860 unicode->utf8_length = 0; 861 } 862 else { 863 unicode->utf8 = NULL; 864 unicode->utf8_length = 0; 865 if (kind_state == PyUnicode_2BYTE_KIND) 866 ((Py_UCS2*)data)[size] = 0; 867 else /* kind_state == PyUnicode_4BYTE_KIND */ 868 ((Py_UCS4*)data)[size] = 0; 869 if (is_sharing) { 870 _PyUnicode_WSTR_LENGTH(unicode) = size; 871 _PyUnicode_WSTR(unicode) = (wchar_t *)data; 872 } 873 else { 874 _PyUnicode_WSTR_LENGTH(unicode) = 0; 875 _PyUnicode_WSTR(unicode) = NULL; 876 } 877 } 878 return obj; 879} 880 881#if SIZEOF_WCHAR_T == 2 882/* Helper function to convert a 16-bits wchar_t representation to UCS4, this 883 will decode surrogate pairs, the other conversions are implemented as macros 884 for efficiency. 885 886 This function assumes that unicode can hold one more code point than wstr 887 characters for a terminating null character. */ 888static void 889unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end, 890 PyUnicodeObject *unicode) 891{ 892 const wchar_t *iter; 893 Py_UCS4 *ucs4_out; 894 895 assert(unicode != NULL); 896 assert(_PyUnicode_CHECK(unicode)); 897 assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND); 898 ucs4_out = PyUnicode_4BYTE_DATA(unicode); 899 900 for (iter = begin; iter < end; ) { 901 assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) + 902 _PyUnicode_GET_LENGTH(unicode))); 903 if (*iter >= 0xD800 && *iter <= 0xDBFF 904 && (iter+1) < end && iter[1] >= 0xDC00 && iter[1] <= 0xDFFF) 905 { 906 *ucs4_out++ = (((iter[0] & 0x3FF)<<10) | (iter[1] & 0x3FF)) + 0x10000; 907 iter += 2; 908 } 909 else { 910 *ucs4_out++ = *iter; 911 iter++; 912 } 913 } 914 assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) + 915 _PyUnicode_GET_LENGTH(unicode))); 916 917} 918#endif 919 920static int 921_PyUnicode_Dirty(PyObject *unicode) 922{ 923 assert(_PyUnicode_CHECK(unicode)); 924 if (Py_REFCNT(unicode) != 1) { 925 PyErr_SetString(PyExc_SystemError, 926 "Cannot modify a string having more than 1 reference"); 927 return -1; 928 } 929 _PyUnicode_DIRTY(unicode); 930 return 0; 931} 932 933Py_ssize_t 934PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start, 935 PyObject *from, Py_ssize_t from_start, 936 Py_ssize_t how_many) 937{ 938 unsigned int from_kind, to_kind; 939 void *from_data, *to_data; 940 941 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) { 942 PyErr_BadInternalCall(); 943 return -1; 944 } 945 946 if (PyUnicode_READY(from)) 947 return -1; 948 if (PyUnicode_READY(to)) 949 return -1; 950 951 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many); 952 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) { 953 PyErr_Format(PyExc_SystemError, 954 "Cannot write %zi characters at %zi " 955 "in a string of %zi characters", 956 how_many, to_start, PyUnicode_GET_LENGTH(to)); 957 return -1; 958 } 959 if (how_many == 0) 960 return 0; 961 962 if (_PyUnicode_Dirty(to)) 963 return -1; 964 965 from_kind = PyUnicode_KIND(from); 966 from_data = PyUnicode_DATA(from); 967 to_kind = PyUnicode_KIND(to); 968 to_data = PyUnicode_DATA(to); 969 970 if (from_kind == to_kind 971 /* deny latin1 => ascii */ 972 && !(!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))) 973 { 974 Py_MEMCPY((char*)to_data 975 + PyUnicode_KIND_SIZE(to_kind, to_start), 976 (char*)from_data 977 + PyUnicode_KIND_SIZE(from_kind, from_start), 978 PyUnicode_KIND_SIZE(to_kind, how_many)); 979 } 980 else if (from_kind == PyUnicode_1BYTE_KIND 981 && to_kind == PyUnicode_2BYTE_KIND) 982 { 983 _PyUnicode_CONVERT_BYTES( 984 Py_UCS1, Py_UCS2, 985 PyUnicode_1BYTE_DATA(from) + from_start, 986 PyUnicode_1BYTE_DATA(from) + from_start + how_many, 987 PyUnicode_2BYTE_DATA(to) + to_start 988 ); 989 } 990 else if (from_kind == PyUnicode_1BYTE_KIND 991 && to_kind == PyUnicode_4BYTE_KIND) 992 { 993 _PyUnicode_CONVERT_BYTES( 994 Py_UCS1, Py_UCS4, 995 PyUnicode_1BYTE_DATA(from) + from_start, 996 PyUnicode_1BYTE_DATA(from) + from_start + how_many, 997 PyUnicode_4BYTE_DATA(to) + to_start 998 ); 999 } 1000 else if (from_kind == PyUnicode_2BYTE_KIND 1001 && to_kind == PyUnicode_4BYTE_KIND) 1002 { 1003 _PyUnicode_CONVERT_BYTES( 1004 Py_UCS2, Py_UCS4, 1005 PyUnicode_2BYTE_DATA(from) + from_start, 1006 PyUnicode_2BYTE_DATA(from) + from_start + how_many, 1007 PyUnicode_4BYTE_DATA(to) + to_start 1008 ); 1009 } 1010 else { 1011 int invalid_kinds; 1012 1013 /* check if max_char(from substring) <= max_char(to) */ 1014 if (from_kind > to_kind 1015 /* latin1 => ascii */ 1016 || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))) 1017 { 1018 /* slow path to check for character overflow */ 1019 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to); 1020 Py_UCS4 ch, maxchar; 1021 Py_ssize_t i; 1022 1023 maxchar = 0; 1024 invalid_kinds = 0; 1025 for (i=0; i < how_many; i++) { 1026 ch = PyUnicode_READ(from_kind, from_data, from_start + i); 1027 if (ch > maxchar) { 1028 maxchar = ch; 1029 if (maxchar > to_maxchar) { 1030 invalid_kinds = 1; 1031 break; 1032 } 1033 } 1034 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch); 1035 } 1036 } 1037 else 1038 invalid_kinds = 1; 1039 if (invalid_kinds) { 1040 PyErr_Format(PyExc_SystemError, 1041 "Cannot copy %s characters " 1042 "into a string of %s characters", 1043 unicode_kind_name(from), 1044 unicode_kind_name(to)); 1045 return -1; 1046 } 1047 } 1048 return how_many; 1049} 1050 1051/* Find the maximum code point and count the number of surrogate pairs so a 1052 correct string length can be computed before converting a string to UCS4. 1053 This function counts single surrogates as a character and not as a pair. 1054 1055 Return 0 on success, or -1 on error. */ 1056static int 1057find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end, 1058 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates) 1059{ 1060 const wchar_t *iter; 1061 1062 assert(num_surrogates != NULL && maxchar != NULL); 1063 *num_surrogates = 0; 1064 *maxchar = 0; 1065 1066 for (iter = begin; iter < end; ) { 1067 if (*iter > *maxchar) { 1068 *maxchar = *iter; 1069#if SIZEOF_WCHAR_T != 2 1070 if (*maxchar >= 0x10000) 1071 return 0; 1072#endif 1073 } 1074#if SIZEOF_WCHAR_T == 2 1075 if (*iter >= 0xD800 && *iter <= 0xDBFF 1076 && (iter+1) < end && iter[1] >= 0xDC00 && iter[1] <= 0xDFFF) 1077 { 1078 Py_UCS4 surrogate_val; 1079 surrogate_val = (((iter[0] & 0x3FF)<<10) 1080 | (iter[1] & 0x3FF)) + 0x10000; 1081 ++(*num_surrogates); 1082 if (surrogate_val > *maxchar) 1083 *maxchar = surrogate_val; 1084 iter += 2; 1085 } 1086 else 1087 iter++; 1088#else 1089 iter++; 1090#endif 1091 } 1092 return 0; 1093} 1094 1095#ifdef Py_DEBUG 1096int unicode_ready_calls = 0; 1097#endif 1098 1099static int 1100unicode_ready(PyObject **p_obj, int replace) 1101{ 1102 PyUnicodeObject *unicode; 1103 wchar_t *end; 1104 Py_UCS4 maxchar = 0; 1105 Py_ssize_t num_surrogates; 1106#if SIZEOF_WCHAR_T == 2 1107 Py_ssize_t length_wo_surrogates; 1108#endif 1109 1110 assert(p_obj != NULL); 1111 unicode = (PyUnicodeObject *)*p_obj; 1112 1113 /* _PyUnicode_Ready() is only intended for old-style API usage where 1114 strings were created using _PyObject_New() and where no canonical 1115 representation (the str field) has been set yet aka strings 1116 which are not yet ready. */ 1117 assert(_PyUnicode_CHECK(unicode)); 1118 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND); 1119 assert(_PyUnicode_WSTR(unicode) != NULL); 1120 assert(_PyUnicode_DATA_ANY(unicode) == NULL); 1121 assert(_PyUnicode_UTF8(unicode) == NULL); 1122 /* Actually, it should neither be interned nor be anything else: */ 1123 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED); 1124 1125#ifdef Py_DEBUG 1126 ++unicode_ready_calls; 1127#endif 1128 1129#ifdef Py_DEBUG 1130 assert(!replace || Py_REFCNT(unicode) == 1); 1131#else 1132 if (replace && Py_REFCNT(unicode) != 1) 1133 replace = 0; 1134#endif 1135 if (replace) { 1136 Py_ssize_t len = _PyUnicode_WSTR_LENGTH(unicode); 1137 wchar_t *wstr = _PyUnicode_WSTR(unicode); 1138 /* Optimization for empty strings */ 1139 if (len == 0) { 1140 Py_INCREF(unicode_empty); 1141 Py_DECREF(*p_obj); 1142 *p_obj = unicode_empty; 1143 return 0; 1144 } 1145 if (len == 1 && wstr[0] < 256) { 1146 PyObject *latin1_char = get_latin1_char((unsigned char)wstr[0]); 1147 if (latin1_char == NULL) 1148 return -1; 1149 Py_DECREF(*p_obj); 1150 *p_obj = latin1_char; 1151 return 0; 1152 } 1153 } 1154 1155 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode); 1156 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end, 1157 &maxchar, &num_surrogates) == -1) 1158 return -1; 1159 1160 if (maxchar < 256) { 1161 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1); 1162 if (!_PyUnicode_DATA_ANY(unicode)) { 1163 PyErr_NoMemory(); 1164 return -1; 1165 } 1166 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char, 1167 _PyUnicode_WSTR(unicode), end, 1168 PyUnicode_1BYTE_DATA(unicode)); 1169 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0'; 1170 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode); 1171 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND; 1172 if (maxchar < 128) { 1173 _PyUnicode_STATE(unicode).ascii = 1; 1174 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode); 1175 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode); 1176 } 1177 else { 1178 _PyUnicode_STATE(unicode).ascii = 0; 1179 _PyUnicode_UTF8(unicode) = NULL; 1180 _PyUnicode_UTF8_LENGTH(unicode) = 0; 1181 } 1182 PyObject_FREE(_PyUnicode_WSTR(unicode)); 1183 _PyUnicode_WSTR(unicode) = NULL; 1184 _PyUnicode_WSTR_LENGTH(unicode) = 0; 1185 } 1186 /* In this case we might have to convert down from 4-byte native 1187 wchar_t to 2-byte unicode. */ 1188 else if (maxchar < 65536) { 1189 assert(num_surrogates == 0 && 1190 "FindMaxCharAndNumSurrogatePairs() messed up"); 1191 1192#if SIZEOF_WCHAR_T == 2 1193 /* We can share representations and are done. */ 1194 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode); 1195 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0'; 1196 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode); 1197 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND; 1198 _PyUnicode_UTF8(unicode) = NULL; 1199 _PyUnicode_UTF8_LENGTH(unicode) = 0; 1200#else 1201 /* sizeof(wchar_t) == 4 */ 1202 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC( 1203 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1)); 1204 if (!_PyUnicode_DATA_ANY(unicode)) { 1205 PyErr_NoMemory(); 1206 return -1; 1207 } 1208 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2, 1209 _PyUnicode_WSTR(unicode), end, 1210 PyUnicode_2BYTE_DATA(unicode)); 1211 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0'; 1212 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode); 1213 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND; 1214 _PyUnicode_UTF8(unicode) = NULL; 1215 _PyUnicode_UTF8_LENGTH(unicode) = 0; 1216 PyObject_FREE(_PyUnicode_WSTR(unicode)); 1217 _PyUnicode_WSTR(unicode) = NULL; 1218 _PyUnicode_WSTR_LENGTH(unicode) = 0; 1219#endif 1220 } 1221 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */ 1222 else { 1223#if SIZEOF_WCHAR_T == 2 1224 /* in case the native representation is 2-bytes, we need to allocate a 1225 new normalized 4-byte version. */ 1226 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates; 1227 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1)); 1228 if (!_PyUnicode_DATA_ANY(unicode)) { 1229 PyErr_NoMemory(); 1230 return -1; 1231 } 1232 _PyUnicode_LENGTH(unicode) = length_wo_surrogates; 1233 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND; 1234 _PyUnicode_UTF8(unicode) = NULL; 1235 _PyUnicode_UTF8_LENGTH(unicode) = 0; 1236 /* unicode_convert_wchar_to_ucs4() requires a ready string */ 1237 _PyUnicode_STATE(unicode).ready = 1; 1238 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode); 1239 PyObject_FREE(_PyUnicode_WSTR(unicode)); 1240 _PyUnicode_WSTR(unicode) = NULL; 1241 _PyUnicode_WSTR_LENGTH(unicode) = 0; 1242#else 1243 assert(num_surrogates == 0); 1244 1245 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode); 1246 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode); 1247 _PyUnicode_UTF8(unicode) = NULL; 1248 _PyUnicode_UTF8_LENGTH(unicode) = 0; 1249 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND; 1250#endif 1251 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0'; 1252 } 1253 _PyUnicode_STATE(unicode).ready = 1; 1254 return 0; 1255} 1256 1257int 1258_PyUnicode_ReadyReplace(PyObject **op) 1259{ 1260 return unicode_ready(op, 1); 1261} 1262 1263int 1264_PyUnicode_Ready(PyObject *op) 1265{ 1266 return unicode_ready(&op, 0); 1267} 1268 1269static void 1270unicode_dealloc(register PyUnicodeObject *unicode) 1271{ 1272 switch (PyUnicode_CHECK_INTERNED(unicode)) { 1273 case SSTATE_NOT_INTERNED: 1274 break; 1275 1276 case SSTATE_INTERNED_MORTAL: 1277 /* revive dead object temporarily for DelItem */ 1278 Py_REFCNT(unicode) = 3; 1279 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0) 1280 Py_FatalError( 1281 "deletion of interned string failed"); 1282 break; 1283 1284 case SSTATE_INTERNED_IMMORTAL: 1285 Py_FatalError("Immortal interned string died."); 1286 1287 default: 1288 Py_FatalError("Inconsistent interned string state."); 1289 } 1290 1291 if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) 1292 PyObject_DEL(_PyUnicode_WSTR(unicode)); 1293 if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) 1294 PyObject_DEL(_PyUnicode_UTF8(unicode)); 1295 1296 if (PyUnicode_IS_COMPACT(unicode)) { 1297 Py_TYPE(unicode)->tp_free((PyObject *)unicode); 1298 } 1299 else { 1300 if (_PyUnicode_DATA_ANY(unicode)) 1301 PyObject_DEL(_PyUnicode_DATA_ANY(unicode)); 1302 Py_TYPE(unicode)->tp_free((PyObject *)unicode); 1303 } 1304} 1305 1306static int 1307unicode_resizable(PyObject *unicode) 1308{ 1309 if (Py_REFCNT(unicode) != 1) 1310 return 0; 1311 if (PyUnicode_CHECK_INTERNED(unicode)) 1312 return 0; 1313 assert(unicode != unicode_empty); 1314#ifdef Py_DEBUG 1315 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND 1316 && PyUnicode_GET_LENGTH(unicode) == 1) 1317 { 1318 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0); 1319 if (ch < 256 && unicode_latin1[ch] == unicode) 1320 return 0; 1321 } 1322#endif 1323 return 1; 1324} 1325 1326static int 1327unicode_resize(PyObject **p_unicode, Py_ssize_t length) 1328{ 1329 PyObject *unicode; 1330 Py_ssize_t old_length; 1331 1332 assert(p_unicode != NULL); 1333 unicode = *p_unicode; 1334 1335 assert(unicode != NULL); 1336 assert(PyUnicode_Check(unicode)); 1337 assert(0 <= length); 1338 1339 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND) 1340 old_length = PyUnicode_WSTR_LENGTH(unicode); 1341 else 1342 old_length = PyUnicode_GET_LENGTH(unicode); 1343 if (old_length == length) 1344 return 0; 1345 1346 if (!unicode_resizable(unicode)) { 1347 PyObject *copy = resize_copy(unicode, length); 1348 if (copy == NULL) 1349 return -1; 1350 Py_DECREF(*p_unicode); 1351 *p_unicode = copy; 1352 return 0; 1353 } 1354 1355 if (PyUnicode_IS_COMPACT(unicode)) { 1356 *p_unicode = resize_compact(unicode, length); 1357 if (*p_unicode == NULL) 1358 return -1; 1359 _PyUnicode_CheckConsistency(*p_unicode); 1360 return 0; 1361 } 1362 return resize_inplace((PyUnicodeObject*)unicode, length); 1363} 1364 1365int 1366PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length) 1367{ 1368 PyObject *unicode; 1369 if (p_unicode == NULL) { 1370 PyErr_BadInternalCall(); 1371 return -1; 1372 } 1373 unicode = *p_unicode; 1374 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0 1375 || _PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) 1376 { 1377 PyErr_BadInternalCall(); 1378 return -1; 1379 } 1380 return unicode_resize(p_unicode, length); 1381} 1382 1383static PyObject* 1384get_latin1_char(unsigned char ch) 1385{ 1386 PyObject *unicode = unicode_latin1[ch]; 1387 if (!unicode) { 1388 unicode = PyUnicode_New(1, ch); 1389 if (!unicode) 1390 return NULL; 1391 PyUnicode_1BYTE_DATA(unicode)[0] = ch; 1392 unicode_latin1[ch] = unicode; 1393 } 1394 Py_INCREF(unicode); 1395 return unicode; 1396} 1397 1398PyObject * 1399PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size) 1400{ 1401 PyUnicodeObject *unicode; 1402 Py_UCS4 maxchar = 0; 1403 Py_ssize_t num_surrogates; 1404 1405 if (u == NULL) 1406 return (PyObject*)_PyUnicode_New(size); 1407 1408 /* If the Unicode data is known at construction time, we can apply 1409 some optimizations which share commonly used objects. */ 1410 1411 /* Optimization for empty strings */ 1412 if (size == 0 && unicode_empty != NULL) { 1413 Py_INCREF(unicode_empty); 1414 return unicode_empty; 1415 } 1416 1417 /* Single character Unicode objects in the Latin-1 range are 1418 shared when using this constructor */ 1419 if (size == 1 && *u < 256) 1420 return get_latin1_char((unsigned char)*u); 1421 1422 /* If not empty and not single character, copy the Unicode data 1423 into the new object */ 1424 if (find_maxchar_surrogates(u, u + size, 1425 &maxchar, &num_surrogates) == -1) 1426 return NULL; 1427 1428 unicode = (PyUnicodeObject *) PyUnicode_New(size - num_surrogates, 1429 maxchar); 1430 if (!unicode) 1431 return NULL; 1432 1433 switch (PyUnicode_KIND(unicode)) { 1434 case PyUnicode_1BYTE_KIND: 1435 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char, 1436 u, u + size, PyUnicode_1BYTE_DATA(unicode)); 1437 break; 1438 case PyUnicode_2BYTE_KIND: 1439#if Py_UNICODE_SIZE == 2 1440 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2); 1441#else 1442 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2, 1443 u, u + size, PyUnicode_2BYTE_DATA(unicode)); 1444#endif 1445 break; 1446 case PyUnicode_4BYTE_KIND: 1447#if SIZEOF_WCHAR_T == 2 1448 /* This is the only case which has to process surrogates, thus 1449 a simple copy loop is not enough and we need a function. */ 1450 unicode_convert_wchar_to_ucs4(u, u + size, unicode); 1451#else 1452 assert(num_surrogates == 0); 1453 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4); 1454#endif 1455 break; 1456 default: 1457 assert(0 && "Impossible state"); 1458 } 1459 1460 return (PyObject *)unicode; 1461} 1462 1463PyObject * 1464PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size) 1465{ 1466 PyUnicodeObject *unicode; 1467 1468 if (size < 0) { 1469 PyErr_SetString(PyExc_SystemError, 1470 "Negative size passed to PyUnicode_FromStringAndSize"); 1471 return NULL; 1472 } 1473 1474 /* If the Unicode data is known at construction time, we can apply 1475 some optimizations which share commonly used objects. 1476 Also, this means the input must be UTF-8, so fall back to the 1477 UTF-8 decoder at the end. */ 1478 if (u != NULL) { 1479 1480 /* Optimization for empty strings */ 1481 if (size == 0 && unicode_empty != NULL) { 1482 Py_INCREF(unicode_empty); 1483 return unicode_empty; 1484 } 1485 1486 /* Single characters are shared when using this constructor. 1487 Restrict to ASCII, since the input must be UTF-8. */ 1488 if (size == 1 && Py_CHARMASK(*u) < 128) 1489 return get_latin1_char(Py_CHARMASK(*u)); 1490 1491 return PyUnicode_DecodeUTF8(u, size, NULL); 1492 } 1493 1494 unicode = _PyUnicode_New(size); 1495 if (!unicode) 1496 return NULL; 1497 1498 return (PyObject *)unicode; 1499} 1500 1501PyObject * 1502PyUnicode_FromString(const char *u) 1503{ 1504 size_t size = strlen(u); 1505 if (size > PY_SSIZE_T_MAX) { 1506 PyErr_SetString(PyExc_OverflowError, "input too long"); 1507 return NULL; 1508 } 1509 1510 return PyUnicode_FromStringAndSize(u, size); 1511} 1512 1513static PyObject* 1514unicode_fromascii(const unsigned char* u, Py_ssize_t size) 1515{ 1516 PyObject *res = PyUnicode_New(size, 127); 1517 if (!res) 1518 return NULL; 1519 memcpy(PyUnicode_1BYTE_DATA(res), u, size); 1520 return res; 1521} 1522 1523static Py_UCS4 1524kind_maxchar_limit(unsigned int kind) 1525{ 1526 switch(kind) { 1527 case PyUnicode_1BYTE_KIND: 1528 return 0x80; 1529 case PyUnicode_2BYTE_KIND: 1530 return 0x100; 1531 case PyUnicode_4BYTE_KIND: 1532 return 0x10000; 1533 default: 1534 assert(0 && "invalid kind"); 1535 return 0x10ffff; 1536 } 1537} 1538 1539static PyObject* 1540_PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size) 1541{ 1542 PyObject *res; 1543 unsigned char max_char = 127; 1544 Py_ssize_t i; 1545 1546 assert(size >= 0); 1547 for (i = 0; i < size; i++) { 1548 if (u[i] & 0x80) { 1549 max_char = 255; 1550 break; 1551 } 1552 } 1553 res = PyUnicode_New(size, max_char); 1554 if (!res) 1555 return NULL; 1556 memcpy(PyUnicode_1BYTE_DATA(res), u, size); 1557 return res; 1558} 1559 1560static PyObject* 1561_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size) 1562{ 1563 PyObject *res; 1564 Py_UCS2 max_char = 0; 1565 Py_ssize_t i; 1566 1567 assert(size >= 0); 1568 for (i = 0; i < size; i++) { 1569 if (u[i] > max_char) { 1570 max_char = u[i]; 1571 if (max_char >= 256) 1572 break; 1573 } 1574 } 1575 res = PyUnicode_New(size, max_char); 1576 if (!res) 1577 return NULL; 1578 if (max_char >= 256) 1579 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size); 1580 else 1581 for (i = 0; i < size; i++) 1582 PyUnicode_1BYTE_DATA(res)[i] = (Py_UCS1)u[i]; 1583 return res; 1584} 1585 1586static PyObject* 1587_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size) 1588{ 1589 PyObject *res; 1590 Py_UCS4 max_char = 0; 1591 Py_ssize_t i; 1592 1593 assert(size >= 0); 1594 for (i = 0; i < size; i++) { 1595 if (u[i] > max_char) { 1596 max_char = u[i]; 1597 if (max_char >= 0x10000) 1598 break; 1599 } 1600 } 1601 res = PyUnicode_New(size, max_char); 1602 if (!res) 1603 return NULL; 1604 if (max_char >= 0x10000) 1605 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size); 1606 else { 1607 int kind = PyUnicode_KIND(res); 1608 void *data = PyUnicode_DATA(res); 1609 for (i = 0; i < size; i++) 1610 PyUnicode_WRITE(kind, data, i, u[i]); 1611 } 1612 return res; 1613} 1614 1615PyObject* 1616PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size) 1617{ 1618 switch(kind) { 1619 case PyUnicode_1BYTE_KIND: 1620 return _PyUnicode_FromUCS1(buffer, size); 1621 case PyUnicode_2BYTE_KIND: 1622 return _PyUnicode_FromUCS2(buffer, size); 1623 case PyUnicode_4BYTE_KIND: 1624 return _PyUnicode_FromUCS4(buffer, size); 1625 default: 1626 assert(0 && "invalid kind"); 1627 PyErr_SetString(PyExc_SystemError, "invalid kind"); 1628 return NULL; 1629 } 1630} 1631 1632PyObject* 1633PyUnicode_Copy(PyObject *unicode) 1634{ 1635 Py_ssize_t size; 1636 PyObject *copy; 1637 void *data; 1638 1639 if (!PyUnicode_Check(unicode)) { 1640 PyErr_BadInternalCall(); 1641 return NULL; 1642 } 1643 if (PyUnicode_READY(unicode)) 1644 return NULL; 1645 1646 size = PyUnicode_GET_LENGTH(unicode); 1647 copy = PyUnicode_New(size, PyUnicode_MAX_CHAR_VALUE(unicode)); 1648 if (!copy) 1649 return NULL; 1650 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode)); 1651 1652 data = PyUnicode_DATA(unicode); 1653 switch (PyUnicode_KIND(unicode)) 1654 { 1655 case PyUnicode_1BYTE_KIND: 1656 memcpy(PyUnicode_1BYTE_DATA(copy), data, size); 1657 break; 1658 case PyUnicode_2BYTE_KIND: 1659 memcpy(PyUnicode_2BYTE_DATA(copy), data, sizeof(Py_UCS2) * size); 1660 break; 1661 case PyUnicode_4BYTE_KIND: 1662 memcpy(PyUnicode_4BYTE_DATA(copy), data, sizeof(Py_UCS4) * size); 1663 break; 1664 default: 1665 assert(0); 1666 break; 1667 } 1668 return copy; 1669} 1670 1671 1672/* Widen Unicode objects to larger buffers. Don't write terminating null 1673 character. Return NULL on error. */ 1674 1675void* 1676_PyUnicode_AsKind(PyObject *s, unsigned int kind) 1677{ 1678 Py_ssize_t len; 1679 void *result; 1680 unsigned int skind; 1681 1682 if (PyUnicode_READY(s)) 1683 return NULL; 1684 1685 len = PyUnicode_GET_LENGTH(s); 1686 skind = PyUnicode_KIND(s); 1687 if (skind >= kind) { 1688 PyErr_SetString(PyExc_SystemError, "invalid widening attempt"); 1689 return NULL; 1690 } 1691 switch(kind) { 1692 case PyUnicode_2BYTE_KIND: 1693 result = PyMem_Malloc(len * sizeof(Py_UCS2)); 1694 if (!result) 1695 return PyErr_NoMemory(); 1696 assert(skind == PyUnicode_1BYTE_KIND); 1697 _PyUnicode_CONVERT_BYTES( 1698 Py_UCS1, Py_UCS2, 1699 PyUnicode_1BYTE_DATA(s), 1700 PyUnicode_1BYTE_DATA(s) + len, 1701 result); 1702 return result; 1703 case PyUnicode_4BYTE_KIND: 1704 result = PyMem_Malloc(len * sizeof(Py_UCS4)); 1705 if (!result) 1706 return PyErr_NoMemory(); 1707 if (skind == PyUnicode_2BYTE_KIND) { 1708 _PyUnicode_CONVERT_BYTES( 1709 Py_UCS2, Py_UCS4, 1710 PyUnicode_2BYTE_DATA(s), 1711 PyUnicode_2BYTE_DATA(s) + len, 1712 result); 1713 } 1714 else { 1715 assert(skind == PyUnicode_1BYTE_KIND); 1716 _PyUnicode_CONVERT_BYTES( 1717 Py_UCS1, Py_UCS4, 1718 PyUnicode_1BYTE_DATA(s), 1719 PyUnicode_1BYTE_DATA(s) + len, 1720 result); 1721 } 1722 return result; 1723 default: 1724 break; 1725 } 1726 PyErr_SetString(PyExc_SystemError, "invalid kind"); 1727 return NULL; 1728} 1729 1730static Py_UCS4* 1731as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize, 1732 int copy_null) 1733{ 1734 int kind; 1735 void *data; 1736 Py_ssize_t len, targetlen; 1737 if (PyUnicode_READY(string) == -1) 1738 return NULL; 1739 kind = PyUnicode_KIND(string); 1740 data = PyUnicode_DATA(string); 1741 len = PyUnicode_GET_LENGTH(string); 1742 targetlen = len; 1743 if (copy_null) 1744 targetlen++; 1745 if (!target) { 1746 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) { 1747 PyErr_NoMemory(); 1748 return NULL; 1749 } 1750 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4)); 1751 if (!target) { 1752 PyErr_NoMemory(); 1753 return NULL; 1754 } 1755 } 1756 else { 1757 if (targetsize < targetlen) { 1758 PyErr_Format(PyExc_SystemError, 1759 "string is longer than the buffer"); 1760 if (copy_null && 0 < targetsize) 1761 target[0] = 0; 1762 return NULL; 1763 } 1764 } 1765 if (kind != PyUnicode_4BYTE_KIND) { 1766 Py_ssize_t i; 1767 for (i = 0; i < len; i++) 1768 target[i] = PyUnicode_READ(kind, data, i); 1769 } 1770 else 1771 Py_MEMCPY(target, data, len * sizeof(Py_UCS4)); 1772 if (copy_null) 1773 target[len] = 0; 1774 return target; 1775} 1776 1777Py_UCS4* 1778PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize, 1779 int copy_null) 1780{ 1781 if (target == NULL || targetsize < 1) { 1782 PyErr_BadInternalCall(); 1783 return NULL; 1784 } 1785 return as_ucs4(string, target, targetsize, copy_null); 1786} 1787 1788Py_UCS4* 1789PyUnicode_AsUCS4Copy(PyObject *string) 1790{ 1791 return as_ucs4(string, NULL, 0, 1); 1792} 1793 1794#ifdef HAVE_WCHAR_H 1795 1796PyObject * 1797PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size) 1798{ 1799 if (w == NULL) { 1800 if (size == 0) 1801 return PyUnicode_New(0, 0); 1802 PyErr_BadInternalCall(); 1803 return NULL; 1804 } 1805 1806 if (size == -1) { 1807 size = wcslen(w); 1808 } 1809 1810 return PyUnicode_FromUnicode(w, size); 1811} 1812 1813#endif /* HAVE_WCHAR_H */ 1814 1815static void 1816makefmt(char *fmt, int longflag, int longlongflag, int size_tflag, 1817 int zeropad, int width, int precision, char c) 1818{ 1819 *fmt++ = '%'; 1820 if (width) { 1821 if (zeropad) 1822 *fmt++ = '0'; 1823 fmt += sprintf(fmt, "%d", width); 1824 } 1825 if (precision) 1826 fmt += sprintf(fmt, ".%d", precision); 1827 if (longflag) 1828 *fmt++ = 'l'; 1829 else if (longlongflag) { 1830 /* longlongflag should only ever be nonzero on machines with 1831 HAVE_LONG_LONG defined */ 1832#ifdef HAVE_LONG_LONG 1833 char *f = PY_FORMAT_LONG_LONG; 1834 while (*f) 1835 *fmt++ = *f++; 1836#else 1837 /* we shouldn't ever get here */ 1838 assert(0); 1839 *fmt++ = 'l'; 1840#endif 1841 } 1842 else if (size_tflag) { 1843 char *f = PY_FORMAT_SIZE_T; 1844 while (*f) 1845 *fmt++ = *f++; 1846 } 1847 *fmt++ = c; 1848 *fmt = '\0'; 1849} 1850 1851/* helper for PyUnicode_FromFormatV() */ 1852 1853static const char* 1854parse_format_flags(const char *f, 1855 int *p_width, int *p_precision, 1856 int *p_longflag, int *p_longlongflag, int *p_size_tflag) 1857{ 1858 int width, precision, longflag, longlongflag, size_tflag; 1859 1860 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */ 1861 f++; 1862 width = 0; 1863 while (Py_ISDIGIT((unsigned)*f)) 1864 width = (width*10) + *f++ - '0'; 1865 precision = 0; 1866 if (*f == '.') { 1867 f++; 1868 while (Py_ISDIGIT((unsigned)*f)) 1869 precision = (precision*10) + *f++ - '0'; 1870 if (*f == '%') { 1871 /* "%.3%s" => f points to "3" */ 1872 f--; 1873 } 1874 } 1875 if (*f == '\0') { 1876 /* bogus format "%.1" => go backward, f points to "1" */ 1877 f--; 1878 } 1879 if (p_width != NULL) 1880 *p_width = width; 1881 if (p_precision != NULL) 1882 *p_precision = precision; 1883 1884 /* Handle %ld, %lu, %lld and %llu. */ 1885 longflag = 0; 1886 longlongflag = 0; 1887 size_tflag = 0; 1888 1889 if (*f == 'l') { 1890 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') { 1891 longflag = 1; 1892 ++f; 1893 } 1894#ifdef HAVE_LONG_LONG 1895 else if (f[1] == 'l' && 1896 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) { 1897 longlongflag = 1; 1898 f += 2; 1899 } 1900#endif 1901 } 1902 /* handle the size_t flag. */ 1903 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) { 1904 size_tflag = 1; 1905 ++f; 1906 } 1907 if (p_longflag != NULL) 1908 *p_longflag = longflag; 1909 if (p_longlongflag != NULL) 1910 *p_longlongflag = longlongflag; 1911 if (p_size_tflag != NULL) 1912 *p_size_tflag = size_tflag; 1913 return f; 1914} 1915 1916/* maximum number of characters required for output of %ld. 21 characters 1917 allows for 64-bit integers (in decimal) and an optional sign. */ 1918#define MAX_LONG_CHARS 21 1919/* maximum number of characters required for output of %lld. 1920 We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits, 1921 plus 1 for the sign. 53/22 is an upper bound for log10(256). */ 1922#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22) 1923 1924PyObject * 1925PyUnicode_FromFormatV(const char *format, va_list vargs) 1926{ 1927 va_list count; 1928 Py_ssize_t callcount = 0; 1929 PyObject **callresults = NULL; 1930 PyObject **callresult = NULL; 1931 Py_ssize_t n = 0; 1932 int width = 0; 1933 int precision = 0; 1934 int zeropad; 1935 const char* f; 1936 PyUnicodeObject *string; 1937 /* used by sprintf */ 1938 char fmt[61]; /* should be enough for %0width.precisionlld */ 1939 Py_UCS4 maxchar = 127; /* result is ASCII by default */ 1940 Py_UCS4 argmaxchar; 1941 Py_ssize_t numbersize = 0; 1942 char *numberresults = NULL; 1943 char *numberresult = NULL; 1944 Py_ssize_t i; 1945 int kind; 1946 void *data; 1947 1948 Py_VA_COPY(count, vargs); 1949 /* step 1: count the number of %S/%R/%A/%s format specifications 1950 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/ 1951 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the 1952 * result in an array) 1953 * also estimate a upper bound for all the number formats in the string, 1954 * numbers will be formatted in step 3 and be kept in a '\0'-separated 1955 * buffer before putting everything together. */ 1956 for (f = format; *f; f++) { 1957 if (*f == '%') { 1958 int longlongflag; 1959 /* skip width or width.precision (eg. "1.2" of "%1.2f") */ 1960 f = parse_format_flags(f, &width, NULL, NULL, &longlongflag, NULL); 1961 if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V') 1962 ++callcount; 1963 1964 else if (*f == 'd' || *f=='u' || *f=='i' || *f=='x' || *f=='p') { 1965#ifdef HAVE_LONG_LONG 1966 if (longlongflag) { 1967 if (width < MAX_LONG_LONG_CHARS) 1968 width = MAX_LONG_LONG_CHARS; 1969 } 1970 else 1971#endif 1972 /* MAX_LONG_CHARS is enough to hold a 64-bit integer, 1973 including sign. Decimal takes the most space. This 1974 isn't enough for octal. If a width is specified we 1975 need more (which we allocate later). */ 1976 if (width < MAX_LONG_CHARS) 1977 width = MAX_LONG_CHARS; 1978 1979 /* account for the size + '\0' to separate numbers 1980 inside of the numberresults buffer */ 1981 numbersize += (width + 1); 1982 } 1983 } 1984 else if ((unsigned char)*f > 127) { 1985 PyErr_Format(PyExc_ValueError, 1986 "PyUnicode_FromFormatV() expects an ASCII-encoded format " 1987 "string, got a non-ASCII byte: 0x%02x", 1988 (unsigned char)*f); 1989 return NULL; 1990 } 1991 } 1992 /* step 2: allocate memory for the results of 1993 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */ 1994 if (callcount) { 1995 callresults = PyObject_Malloc(sizeof(PyObject *) * callcount); 1996 if (!callresults) { 1997 PyErr_NoMemory(); 1998 return NULL; 1999 } 2000 callresult = callresults; 2001 } 2002 /* step 2.5: allocate memory for the results of formating numbers */ 2003 if (numbersize) { 2004 numberresults = PyObject_Malloc(numbersize); 2005 if (!numberresults) { 2006 PyErr_NoMemory(); 2007 goto fail; 2008 } 2009 numberresult = numberresults; 2010 } 2011 2012 /* step 3: format numbers and figure out how large a buffer we need */ 2013 for (f = format; *f; f++) { 2014 if (*f == '%') { 2015 const char* p; 2016 int longflag; 2017 int longlongflag; 2018 int size_tflag; 2019 int numprinted; 2020 2021 p = f; 2022 zeropad = (f[1] == '0'); 2023 f = parse_format_flags(f, &width, &precision, 2024 &longflag, &longlongflag, &size_tflag); 2025 switch (*f) { 2026 case 'c': 2027 { 2028 Py_UCS4 ordinal = va_arg(count, int); 2029 maxchar = Py_MAX(maxchar, ordinal); 2030 n++; 2031 break; 2032 } 2033 case '%': 2034 n++; 2035 break; 2036 case 'i': 2037 case 'd': 2038 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad, 2039 width, precision, *f); 2040 if (longflag) 2041 numprinted = sprintf(numberresult, fmt, 2042 va_arg(count, long)); 2043#ifdef HAVE_LONG_LONG 2044 else if (longlongflag) 2045 numprinted = sprintf(numberresult, fmt, 2046 va_arg(count, PY_LONG_LONG)); 2047#endif 2048 else if (size_tflag) 2049 numprinted = sprintf(numberresult, fmt, 2050 va_arg(count, Py_ssize_t)); 2051 else 2052 numprinted = sprintf(numberresult, fmt, 2053 va_arg(count, int)); 2054 n += numprinted; 2055 /* advance by +1 to skip over the '\0' */ 2056 numberresult += (numprinted + 1); 2057 assert(*(numberresult - 1) == '\0'); 2058 assert(*(numberresult - 2) != '\0'); 2059 assert(numprinted >= 0); 2060 assert(numberresult <= numberresults + numbersize); 2061 break; 2062 case 'u': 2063 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad, 2064 width, precision, 'u'); 2065 if (longflag) 2066 numprinted = sprintf(numberresult, fmt, 2067 va_arg(count, unsigned long)); 2068#ifdef HAVE_LONG_LONG 2069 else if (longlongflag) 2070 numprinted = sprintf(numberresult, fmt, 2071 va_arg(count, unsigned PY_LONG_LONG)); 2072#endif 2073 else if (size_tflag) 2074 numprinted = sprintf(numberresult, fmt, 2075 va_arg(count, size_t)); 2076 else 2077 numprinted = sprintf(numberresult, fmt, 2078 va_arg(count, unsigned int)); 2079 n += numprinted; 2080 numberresult += (numprinted + 1); 2081 assert(*(numberresult - 1) == '\0'); 2082 assert(*(numberresult - 2) != '\0'); 2083 assert(numprinted >= 0); 2084 assert(numberresult <= numberresults + numbersize); 2085 break; 2086 case 'x': 2087 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x'); 2088 numprinted = sprintf(numberresult, fmt, va_arg(count, int)); 2089 n += numprinted; 2090 numberresult += (numprinted + 1); 2091 assert(*(numberresult - 1) == '\0'); 2092 assert(*(numberresult - 2) != '\0'); 2093 assert(numprinted >= 0); 2094 assert(numberresult <= numberresults + numbersize); 2095 break; 2096 case 'p': 2097 numprinted = sprintf(numberresult, "%p", va_arg(count, void*)); 2098 /* %p is ill-defined: ensure leading 0x. */ 2099 if (numberresult[1] == 'X') 2100 numberresult[1] = 'x'; 2101 else if (numberresult[1] != 'x') { 2102 memmove(numberresult + 2, numberresult, 2103 strlen(numberresult) + 1); 2104 numberresult[0] = '0'; 2105 numberresult[1] = 'x'; 2106 numprinted += 2; 2107 } 2108 n += numprinted; 2109 numberresult += (numprinted + 1); 2110 assert(*(numberresult - 1) == '\0'); 2111 assert(*(numberresult - 2) != '\0'); 2112 assert(numprinted >= 0); 2113 assert(numberresult <= numberresults + numbersize); 2114 break; 2115 case 's': 2116 { 2117 /* UTF-8 */ 2118 const char *s = va_arg(count, const char*); 2119 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace"); 2120 if (!str) 2121 goto fail; 2122 /* since PyUnicode_DecodeUTF8 returns already flexible 2123 unicode objects, there is no need to call ready on them */ 2124 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str); 2125 maxchar = Py_MAX(maxchar, argmaxchar); 2126 n += PyUnicode_GET_LENGTH(str); 2127 /* Remember the str and switch to the next slot */ 2128 *callresult++ = str; 2129 break; 2130 } 2131 case 'U': 2132 { 2133 PyObject *obj = va_arg(count, PyObject *); 2134 assert(obj && _PyUnicode_CHECK(obj)); 2135 if (PyUnicode_READY(obj) == -1) 2136 goto fail; 2137 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj); 2138 maxchar = Py_MAX(maxchar, argmaxchar); 2139 n += PyUnicode_GET_LENGTH(obj); 2140 break; 2141 } 2142 case 'V': 2143 { 2144 PyObject *obj = va_arg(count, PyObject *); 2145 const char *str = va_arg(count, const char *); 2146 PyObject *str_obj; 2147 assert(obj || str); 2148 assert(!obj || _PyUnicode_CHECK(obj)); 2149 if (obj) { 2150 if (PyUnicode_READY(obj) == -1) 2151 goto fail; 2152 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj); 2153 maxchar = Py_MAX(maxchar, argmaxchar); 2154 n += PyUnicode_GET_LENGTH(obj); 2155 *callresult++ = NULL; 2156 } 2157 else { 2158 str_obj = PyUnicode_DecodeUTF8(str, strlen(str), "replace"); 2159 if (!str_obj) 2160 goto fail; 2161 if (PyUnicode_READY(str_obj)) { 2162 Py_DECREF(str_obj); 2163 goto fail; 2164 } 2165 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str_obj); 2166 maxchar = Py_MAX(maxchar, argmaxchar); 2167 n += PyUnicode_GET_LENGTH(str_obj); 2168 *callresult++ = str_obj; 2169 } 2170 break; 2171 } 2172 case 'S': 2173 { 2174 PyObject *obj = va_arg(count, PyObject *); 2175 PyObject *str; 2176 assert(obj); 2177 str = PyObject_Str(obj); 2178 if (!str || PyUnicode_READY(str) == -1) 2179 goto fail; 2180 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str); 2181 maxchar = Py_MAX(maxchar, argmaxchar); 2182 n += PyUnicode_GET_LENGTH(str); 2183 /* Remember the str and switch to the next slot */ 2184 *callresult++ = str; 2185 break; 2186 } 2187 case 'R': 2188 { 2189 PyObject *obj = va_arg(count, PyObject *); 2190 PyObject *repr; 2191 assert(obj); 2192 repr = PyObject_Repr(obj); 2193 if (!repr || PyUnicode_READY(repr) == -1) 2194 goto fail; 2195 argmaxchar = PyUnicode_MAX_CHAR_VALUE(repr); 2196 maxchar = Py_MAX(maxchar, argmaxchar); 2197 n += PyUnicode_GET_LENGTH(repr); 2198 /* Remember the repr and switch to the next slot */ 2199 *callresult++ = repr; 2200 break; 2201 } 2202 case 'A': 2203 { 2204 PyObject *obj = va_arg(count, PyObject *); 2205 PyObject *ascii; 2206 assert(obj); 2207 ascii = PyObject_ASCII(obj); 2208 if (!ascii || PyUnicode_READY(ascii) == -1) 2209 goto fail; 2210 argmaxchar = PyUnicode_MAX_CHAR_VALUE(ascii); 2211 maxchar = Py_MAX(maxchar, argmaxchar); 2212 n += PyUnicode_GET_LENGTH(ascii); 2213 /* Remember the repr and switch to the next slot */ 2214 *callresult++ = ascii; 2215 break; 2216 } 2217 default: 2218 /* if we stumble upon an unknown 2219 formatting code, copy the rest of 2220 the format string to the output 2221 string. (we cannot just skip the 2222 code, since there's no way to know 2223 what's in the argument list) */ 2224 n += strlen(p); 2225 goto expand; 2226 } 2227 } else 2228 n++; 2229 } 2230 expand: 2231 /* step 4: fill the buffer */ 2232 /* Since we've analyzed how much space we need, 2233 we don't have to resize the string. 2234 There can be no errors beyond this point. */ 2235 string = (PyUnicodeObject *)PyUnicode_New(n, maxchar); 2236 if (!string) 2237 goto fail; 2238 kind = PyUnicode_KIND(string); 2239 data = PyUnicode_DATA(string); 2240 callresult = callresults; 2241 numberresult = numberresults; 2242 2243 for (i = 0, f = format; *f; f++) { 2244 if (*f == '%') { 2245 const char* p; 2246 2247 p = f; 2248 f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL); 2249 /* checking for == because the last argument could be a empty 2250 string, which causes i to point to end, the assert at the end of 2251 the loop */ 2252 assert(i <= PyUnicode_GET_LENGTH(string)); 2253 2254 switch (*f) { 2255 case 'c': 2256 { 2257 const int ordinal = va_arg(vargs, int); 2258 PyUnicode_WRITE(kind, data, i++, ordinal); 2259 break; 2260 } 2261 case 'i': 2262 case 'd': 2263 case 'u': 2264 case 'x': 2265 case 'p': 2266 /* unused, since we already have the result */ 2267 if (*f == 'p') 2268 (void) va_arg(vargs, void *); 2269 else 2270 (void) va_arg(vargs, int); 2271 /* extract the result from numberresults and append. */ 2272 for (; *numberresult; ++i, ++numberresult) 2273 PyUnicode_WRITE(kind, data, i, *numberresult); 2274 /* skip over the separating '\0' */ 2275 assert(*numberresult == '\0'); 2276 numberresult++; 2277 assert(numberresult <= numberresults + numbersize); 2278 break; 2279 case 's': 2280 { 2281 /* unused, since we already have the result */ 2282 Py_ssize_t size; 2283 (void) va_arg(vargs, char *); 2284 size = PyUnicode_GET_LENGTH(*callresult); 2285 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string)); 2286 if (PyUnicode_CopyCharacters((PyObject*)string, i, 2287 *callresult, 0, 2288 size) < 0) 2289 goto fail; 2290 i += size; 2291 /* We're done with the unicode()/repr() => forget it */ 2292 Py_DECREF(*callresult); 2293 /* switch to next unicode()/repr() result */ 2294 ++callresult; 2295 break; 2296 } 2297 case 'U': 2298 { 2299 PyObject *obj = va_arg(vargs, PyObject *); 2300 Py_ssize_t size; 2301 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string)); 2302 size = PyUnicode_GET_LENGTH(obj); 2303 if (PyUnicode_CopyCharacters((PyObject*)string, i, 2304 obj, 0, 2305 size) < 0) 2306 goto fail; 2307 i += size; 2308 break; 2309 } 2310 case 'V': 2311 { 2312 Py_ssize_t size; 2313 PyObject *obj = va_arg(vargs, PyObject *); 2314 va_arg(vargs, const char *); 2315 if (obj) { 2316 size = PyUnicode_GET_LENGTH(obj); 2317 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string)); 2318 if (PyUnicode_CopyCharacters((PyObject*)string, i, 2319 obj, 0, 2320 size) < 0) 2321 goto fail; 2322 i += size; 2323 } else { 2324 size = PyUnicode_GET_LENGTH(*callresult); 2325 assert(PyUnicode_KIND(*callresult) <= 2326 PyUnicode_KIND(string)); 2327 if (PyUnicode_CopyCharacters((PyObject*)string, i, 2328 *callresult, 2329 0, size) < 0) 2330 goto fail; 2331 i += size; 2332 Py_DECREF(*callresult); 2333 } 2334 ++callresult; 2335 break; 2336 } 2337 case 'S': 2338 case 'R': 2339 case 'A': 2340 { 2341 /* unused, since we already have the result */ 2342 (void) va_arg(vargs, PyObject *); 2343 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string)); 2344 if (PyUnicode_CopyCharacters((PyObject*)string, i, 2345 *callresult, 0, 2346 PyUnicode_GET_LENGTH(*callresult)) < 0) 2347 goto fail; 2348 i += PyUnicode_GET_LENGTH(*callresult); 2349 /* We're done with the unicode()/repr() => forget it */ 2350 Py_DECREF(*callresult); 2351 /* switch to next unicode()/repr() result */ 2352 ++callresult; 2353 break; 2354 } 2355 case '%': 2356 PyUnicode_WRITE(kind, data, i++, '%'); 2357 break; 2358 default: 2359 for (; *p; ++p, ++i) 2360 PyUnicode_WRITE(kind, data, i, *p); 2361 assert(i == PyUnicode_GET_LENGTH(string)); 2362 goto end; 2363 } 2364 } 2365 else { 2366 assert(i < PyUnicode_GET_LENGTH(string)); 2367 PyUnicode_WRITE(kind, data, i++, *f); 2368 } 2369 } 2370 assert(i == PyUnicode_GET_LENGTH(string)); 2371 2372 end: 2373 if (callresults) 2374 PyObject_Free(callresults); 2375 if (numberresults) 2376 PyObject_Free(numberresults); 2377 return (PyObject *)string; 2378 fail: 2379 if (callresults) { 2380 PyObject **callresult2 = callresults; 2381 while (callresult2 < callresult) { 2382 Py_XDECREF(*callresult2); 2383 ++callresult2; 2384 } 2385 PyObject_Free(callresults); 2386 } 2387 if (numberresults) 2388 PyObject_Free(numberresults); 2389 return NULL; 2390} 2391 2392PyObject * 2393PyUnicode_FromFormat(const char *format, ...) 2394{ 2395 PyObject* ret; 2396 va_list vargs; 2397 2398#ifdef HAVE_STDARG_PROTOTYPES 2399 va_start(vargs, format); 2400#else 2401 va_start(vargs); 2402#endif 2403 ret = PyUnicode_FromFormatV(format, vargs); 2404 va_end(vargs); 2405 return ret; 2406} 2407 2408#ifdef HAVE_WCHAR_H 2409 2410/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString(): 2411 convert a Unicode object to a wide character string. 2412 2413 - If w is NULL: return the number of wide characters (including the null 2414 character) required to convert the unicode object. Ignore size argument. 2415 2416 - Otherwise: return the number of wide characters (excluding the null 2417 character) written into w. Write at most size wide characters (including 2418 the null character). */ 2419static Py_ssize_t 2420unicode_aswidechar(PyUnicodeObject *unicode, 2421 wchar_t *w, 2422 Py_ssize_t size) 2423{ 2424 Py_ssize_t res; 2425 const wchar_t *wstr; 2426 2427 wstr = PyUnicode_AsUnicodeAndSize((PyObject *)unicode, &res); 2428 if (wstr == NULL) 2429 return -1; 2430 2431 if (w != NULL) { 2432 if (size > res) 2433 size = res + 1; 2434 else 2435 res = size; 2436 Py_MEMCPY(w, wstr, size * sizeof(wchar_t)); 2437 return res; 2438 } 2439 else 2440 return res + 1; 2441} 2442 2443Py_ssize_t 2444PyUnicode_AsWideChar(PyObject *unicode, 2445 wchar_t *w, 2446 Py_ssize_t size) 2447{ 2448 if (unicode == NULL) { 2449 PyErr_BadInternalCall(); 2450 return -1; 2451 } 2452 return unicode_aswidechar((PyUnicodeObject*)unicode, w, size); 2453} 2454 2455wchar_t* 2456PyUnicode_AsWideCharString(PyObject *unicode, 2457 Py_ssize_t *size) 2458{ 2459 wchar_t* buffer; 2460 Py_ssize_t buflen; 2461 2462 if (unicode == NULL) { 2463 PyErr_BadInternalCall(); 2464 return NULL; 2465 } 2466 2467 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, NULL, 0); 2468 if (buflen == -1) 2469 return NULL; 2470 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) { 2471 PyErr_NoMemory(); 2472 return NULL; 2473 } 2474 2475 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t)); 2476 if (buffer == NULL) { 2477 PyErr_NoMemory(); 2478 return NULL; 2479 } 2480 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, buffer, buflen); 2481 if (buflen == -1) 2482 return NULL; 2483 if (size != NULL) 2484 *size = buflen; 2485 return buffer; 2486} 2487 2488#endif /* HAVE_WCHAR_H */ 2489 2490PyObject * 2491PyUnicode_FromOrdinal(int ordinal) 2492{ 2493 PyObject *v; 2494 if (ordinal < 0 || ordinal > 0x10ffff) { 2495 PyErr_SetString(PyExc_ValueError, 2496 "chr() arg not in range(0x110000)"); 2497 return NULL; 2498 } 2499 2500 if (ordinal < 256) 2501 return get_latin1_char(ordinal); 2502 2503 v = PyUnicode_New(1, ordinal); 2504 if (v == NULL) 2505 return NULL; 2506 PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal); 2507 return v; 2508} 2509 2510PyObject * 2511PyUnicode_FromObject(register PyObject *obj) 2512{ 2513 /* XXX Perhaps we should make this API an alias of 2514 PyObject_Str() instead ?! */ 2515 if (PyUnicode_CheckExact(obj)) { 2516 if (PyUnicode_READY(obj)) 2517 return NULL; 2518 Py_INCREF(obj); 2519 return obj; 2520 } 2521 if (PyUnicode_Check(obj)) { 2522 /* For a Unicode subtype that's not a Unicode object, 2523 return a true Unicode object with the same data. */ 2524 return PyUnicode_Copy(obj); 2525 } 2526 PyErr_Format(PyExc_TypeError, 2527 "Can't convert '%.100s' object to str implicitly", 2528 Py_TYPE(obj)->tp_name); 2529 return NULL; 2530} 2531 2532PyObject * 2533PyUnicode_FromEncodedObject(register PyObject *obj, 2534 const char *encoding, 2535 const char *errors) 2536{ 2537 Py_buffer buffer; 2538 PyObject *v; 2539 2540 if (obj == NULL) { 2541 PyErr_BadInternalCall(); 2542 return NULL; 2543 } 2544 2545 /* Decoding bytes objects is the most common case and should be fast */ 2546 if (PyBytes_Check(obj)) { 2547 if (PyBytes_GET_SIZE(obj) == 0) { 2548 Py_INCREF(unicode_empty); 2549 v = unicode_empty; 2550 } 2551 else { 2552 v = PyUnicode_Decode( 2553 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj), 2554 encoding, errors); 2555 } 2556 return v; 2557 } 2558 2559 if (PyUnicode_Check(obj)) { 2560 PyErr_SetString(PyExc_TypeError, 2561 "decoding str is not supported"); 2562 return NULL; 2563 } 2564 2565 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */ 2566 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) { 2567 PyErr_Format(PyExc_TypeError, 2568 "coercing to str: need bytes, bytearray " 2569 "or buffer-like object, %.80s found", 2570 Py_TYPE(obj)->tp_name); 2571 return NULL; 2572 } 2573 2574 if (buffer.len == 0) { 2575 Py_INCREF(unicode_empty); 2576 v = unicode_empty; 2577 } 2578 else 2579 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors); 2580 2581 PyBuffer_Release(&buffer); 2582 return v; 2583} 2584 2585/* Convert encoding to lower case and replace '_' with '-' in order to 2586 catch e.g. UTF_8. Return 0 on error (encoding is longer than lower_len-1), 2587 1 on success. */ 2588static int 2589normalize_encoding(const char *encoding, 2590 char *lower, 2591 size_t lower_len) 2592{ 2593 const char *e; 2594 char *l; 2595 char *l_end; 2596 2597 e = encoding; 2598 l = lower; 2599 l_end = &lower[lower_len - 1]; 2600 while (*e) { 2601 if (l == l_end) 2602 return 0; 2603 if (Py_ISUPPER(*e)) { 2604 *l++ = Py_TOLOWER(*e++); 2605 } 2606 else if (*e == '_') { 2607 *l++ = '-'; 2608 e++; 2609 } 2610 else { 2611 *l++ = *e++; 2612 } 2613 } 2614 *l = '\0'; 2615 return 1; 2616} 2617 2618PyObject * 2619PyUnicode_Decode(const char *s, 2620 Py_ssize_t size, 2621 const char *encoding, 2622 const char *errors) 2623{ 2624 PyObject *buffer = NULL, *unicode; 2625 Py_buffer info; 2626 char lower[11]; /* Enough for any encoding shortcut */ 2627 2628 if (encoding == NULL) 2629 return PyUnicode_DecodeUTF8(s, size, errors); 2630 2631 /* Shortcuts for common default encodings */ 2632 if (normalize_encoding(encoding, lower, sizeof(lower))) { 2633 if ((strcmp(lower, "utf-8") == 0) || 2634 (strcmp(lower, "utf8") == 0)) 2635 return PyUnicode_DecodeUTF8(s, size, errors); 2636 else if ((strcmp(lower, "latin-1") == 0) || 2637 (strcmp(lower, "latin1") == 0) || 2638 (strcmp(lower, "iso-8859-1") == 0)) 2639 return PyUnicode_DecodeLatin1(s, size, errors); 2640#ifdef HAVE_MBCS 2641 else if (strcmp(lower, "mbcs") == 0) 2642 return PyUnicode_DecodeMBCS(s, size, errors); 2643#endif 2644 else if (strcmp(lower, "ascii") == 0) 2645 return PyUnicode_DecodeASCII(s, size, errors); 2646 else if (strcmp(lower, "utf-16") == 0) 2647 return PyUnicode_DecodeUTF16(s, size, errors, 0); 2648 else if (strcmp(lower, "utf-32") == 0) 2649 return PyUnicode_DecodeUTF32(s, size, errors, 0); 2650 } 2651 2652 /* Decode via the codec registry */ 2653 buffer = NULL; 2654 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0) 2655 goto onError; 2656 buffer = PyMemoryView_FromBuffer(&info); 2657 if (buffer == NULL) 2658 goto onError; 2659 unicode = PyCodec_Decode(buffer, encoding, errors); 2660 if (unicode == NULL) 2661 goto onError; 2662 if (!PyUnicode_Check(unicode)) { 2663 PyErr_Format(PyExc_TypeError, 2664 "decoder did not return a str object (type=%.400s)", 2665 Py_TYPE(unicode)->tp_name); 2666 Py_DECREF(unicode); 2667 goto onError; 2668 } 2669 Py_DECREF(buffer); 2670#ifndef DONT_MAKE_RESULT_READY 2671 if (_PyUnicode_READY_REPLACE(&unicode)) { 2672 Py_DECREF(unicode); 2673 return NULL; 2674 } 2675#endif 2676 return unicode; 2677 2678 onError: 2679 Py_XDECREF(buffer); 2680 return NULL; 2681} 2682 2683PyObject * 2684PyUnicode_AsDecodedObject(PyObject *unicode, 2685 const char *encoding, 2686 const char *errors) 2687{ 2688 PyObject *v; 2689 2690 if (!PyUnicode_Check(unicode)) { 2691 PyErr_BadArgument(); 2692 goto onError; 2693 } 2694 2695 if (encoding == NULL) 2696 encoding = PyUnicode_GetDefaultEncoding(); 2697 2698 /* Decode via the codec registry */ 2699 v = PyCodec_Decode(unicode, encoding, errors); 2700 if (v == NULL) 2701 goto onError; 2702 return v; 2703 2704 onError: 2705 return NULL; 2706} 2707 2708PyObject * 2709PyUnicode_AsDecodedUnicode(PyObject *unicode, 2710 const char *encoding, 2711 const char *errors) 2712{ 2713 PyObject *v; 2714 2715 if (!PyUnicode_Check(unicode)) { 2716 PyErr_BadArgument(); 2717 goto onError; 2718 } 2719 2720 if (encoding == NULL) 2721 encoding = PyUnicode_GetDefaultEncoding(); 2722 2723 /* Decode via the codec registry */ 2724 v = PyCodec_Decode(unicode, encoding, errors); 2725 if (v == NULL) 2726 goto onError; 2727 if (!PyUnicode_Check(v)) { 2728 PyErr_Format(PyExc_TypeError, 2729 "decoder did not return a str object (type=%.400s)", 2730 Py_TYPE(v)->tp_name); 2731 Py_DECREF(v); 2732 goto onError; 2733 } 2734 return v; 2735 2736 onError: 2737 return NULL; 2738} 2739 2740PyObject * 2741PyUnicode_Encode(const Py_UNICODE *s, 2742 Py_ssize_t size, 2743 const char *encoding, 2744 const char *errors) 2745{ 2746 PyObject *v, *unicode; 2747 2748 unicode = PyUnicode_FromUnicode(s, size); 2749 if (unicode == NULL) 2750 return NULL; 2751 v = PyUnicode_AsEncodedString(unicode, encoding, errors); 2752 Py_DECREF(unicode); 2753 return v; 2754} 2755 2756PyObject * 2757PyUnicode_AsEncodedObject(PyObject *unicode, 2758 const char *encoding, 2759 const char *errors) 2760{ 2761 PyObject *v; 2762 2763 if (!PyUnicode_Check(unicode)) { 2764 PyErr_BadArgument(); 2765 goto onError; 2766 } 2767 2768 if (encoding == NULL) 2769 encoding = PyUnicode_GetDefaultEncoding(); 2770 2771 /* Encode via the codec registry */ 2772 v = PyCodec_Encode(unicode, encoding, errors); 2773 if (v == NULL) 2774 goto onError; 2775 return v; 2776 2777 onError: 2778 return NULL; 2779} 2780 2781PyObject * 2782PyUnicode_EncodeFSDefault(PyObject *unicode) 2783{ 2784#ifdef HAVE_MBCS 2785 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode), 2786 PyUnicode_GET_SIZE(unicode), 2787 NULL); 2788#elif defined(__APPLE__) 2789 return _PyUnicode_AsUTF8String(unicode, "surrogateescape"); 2790#else 2791 PyInterpreterState *interp = PyThreadState_GET()->interp; 2792 /* Bootstrap check: if the filesystem codec is implemented in Python, we 2793 cannot use it to encode and decode filenames before it is loaded. Load 2794 the Python codec requires to encode at least its own filename. Use the C 2795 version of the locale codec until the codec registry is initialized and 2796 the Python codec is loaded. 2797 2798 Py_FileSystemDefaultEncoding is shared between all interpreters, we 2799 cannot only rely on it: check also interp->fscodec_initialized for 2800 subinterpreters. */ 2801 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) { 2802 return PyUnicode_AsEncodedString(unicode, 2803 Py_FileSystemDefaultEncoding, 2804 "surrogateescape"); 2805 } 2806 else { 2807 /* locale encoding with surrogateescape */ 2808 wchar_t *wchar; 2809 char *bytes; 2810 PyObject *bytes_obj; 2811 size_t error_pos; 2812 2813 wchar = PyUnicode_AsWideCharString(unicode, NULL); 2814 if (wchar == NULL) 2815 return NULL; 2816 bytes = _Py_wchar2char(wchar, &error_pos); 2817 if (bytes == NULL) { 2818 if (error_pos != (size_t)-1) { 2819 char *errmsg = strerror(errno); 2820 PyObject *exc = NULL; 2821 if (errmsg == NULL) 2822 errmsg = "Py_wchar2char() failed"; 2823 raise_encode_exception(&exc, 2824 "filesystemencoding", 2825 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode), 2826 error_pos, error_pos+1, 2827 errmsg); 2828 Py_XDECREF(exc); 2829 } 2830 else 2831 PyErr_NoMemory(); 2832 PyMem_Free(wchar); 2833 return NULL; 2834 } 2835 PyMem_Free(wchar); 2836 2837 bytes_obj = PyBytes_FromString(bytes); 2838 PyMem_Free(bytes); 2839 return bytes_obj; 2840 } 2841#endif 2842} 2843 2844PyObject * 2845PyUnicode_AsEncodedString(PyObject *unicode, 2846 const char *encoding, 2847 const char *errors) 2848{ 2849 PyObject *v; 2850 char lower[11]; /* Enough for any encoding shortcut */ 2851 2852 if (!PyUnicode_Check(unicode)) { 2853 PyErr_BadArgument(); 2854 return NULL; 2855 } 2856 2857 if (encoding == NULL) { 2858 if (errors == NULL || strcmp(errors, "strict") == 0) 2859 return _PyUnicode_AsUTF8String(unicode, NULL); 2860 else 2861 return _PyUnicode_AsUTF8String(unicode, errors); 2862 } 2863 2864 /* Shortcuts for common default encodings */ 2865 if (normalize_encoding(encoding, lower, sizeof(lower))) { 2866 if ((strcmp(lower, "utf-8") == 0) || 2867 (strcmp(lower, "utf8") == 0)) 2868 { 2869 if (errors == NULL || strcmp(errors, "strict") == 0) 2870 return _PyUnicode_AsUTF8String(unicode, NULL); 2871 else 2872 return _PyUnicode_AsUTF8String(unicode, errors); 2873 } 2874 else if ((strcmp(lower, "latin-1") == 0) || 2875 (strcmp(lower, "latin1") == 0) || 2876 (strcmp(lower, "iso-8859-1") == 0)) 2877 return _PyUnicode_AsLatin1String(unicode, errors); 2878#ifdef HAVE_MBCS 2879 else if (strcmp(lower, "mbcs") == 0) 2880 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode), 2881 PyUnicode_GET_SIZE(unicode), 2882 errors); 2883#endif 2884 else if (strcmp(lower, "ascii") == 0) 2885 return _PyUnicode_AsASCIIString(unicode, errors); 2886 } 2887 2888 /* Encode via the codec registry */ 2889 v = PyCodec_Encode(unicode, encoding, errors); 2890 if (v == NULL) 2891 return NULL; 2892 2893 /* The normal path */ 2894 if (PyBytes_Check(v)) 2895 return v; 2896 2897 /* If the codec returns a buffer, raise a warning and convert to bytes */ 2898 if (PyByteArray_Check(v)) { 2899 int error; 2900 PyObject *b; 2901 2902 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1, 2903 "encoder %s returned bytearray instead of bytes", 2904 encoding); 2905 if (error) { 2906 Py_DECREF(v); 2907 return NULL; 2908 } 2909 2910 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v)); 2911 Py_DECREF(v); 2912 return b; 2913 } 2914 2915 PyErr_Format(PyExc_TypeError, 2916 "encoder did not return a bytes object (type=%.400s)", 2917 Py_TYPE(v)->tp_name); 2918 Py_DECREF(v); 2919 return NULL; 2920} 2921 2922PyObject * 2923PyUnicode_AsEncodedUnicode(PyObject *unicode, 2924 const char *encoding, 2925 const char *errors) 2926{ 2927 PyObject *v; 2928 2929 if (!PyUnicode_Check(unicode)) { 2930 PyErr_BadArgument(); 2931 goto onError; 2932 } 2933 2934 if (encoding == NULL) 2935 encoding = PyUnicode_GetDefaultEncoding(); 2936 2937 /* Encode via the codec registry */ 2938 v = PyCodec_Encode(unicode, encoding, errors); 2939 if (v == NULL) 2940 goto onError; 2941 if (!PyUnicode_Check(v)) { 2942 PyErr_Format(PyExc_TypeError, 2943 "encoder did not return an str object (type=%.400s)", 2944 Py_TYPE(v)->tp_name); 2945 Py_DECREF(v); 2946 goto onError; 2947 } 2948 return v; 2949 2950 onError: 2951 return NULL; 2952} 2953 2954PyObject* 2955PyUnicode_DecodeFSDefault(const char *s) { 2956 Py_ssize_t size = (Py_ssize_t)strlen(s); 2957 return PyUnicode_DecodeFSDefaultAndSize(s, size); 2958} 2959 2960PyObject* 2961PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size) 2962{ 2963#ifdef HAVE_MBCS 2964 return PyUnicode_DecodeMBCS(s, size, NULL); 2965#elif defined(__APPLE__) 2966 return PyUnicode_DecodeUTF8(s, size, "surrogateescape"); 2967#else 2968 PyInterpreterState *interp = PyThreadState_GET()->interp; 2969 /* Bootstrap check: if the filesystem codec is implemented in Python, we 2970 cannot use it to encode and decode filenames before it is loaded. Load 2971 the Python codec requires to encode at least its own filename. Use the C 2972 version of the locale codec until the codec registry is initialized and 2973 the Python codec is loaded. 2974 2975 Py_FileSystemDefaultEncoding is shared between all interpreters, we 2976 cannot only rely on it: check also interp->fscodec_initialized for 2977 subinterpreters. */ 2978 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) { 2979 return PyUnicode_Decode(s, size, 2980 Py_FileSystemDefaultEncoding, 2981 "surrogateescape"); 2982 } 2983 else { 2984 /* locale encoding with surrogateescape */ 2985 wchar_t *wchar; 2986 PyObject *unicode; 2987 size_t len; 2988 2989 if (s[size] != '\0' || size != strlen(s)) { 2990 PyErr_SetString(PyExc_TypeError, "embedded NUL character"); 2991 return NULL; 2992 } 2993 2994 wchar = _Py_char2wchar(s, &len); 2995 if (wchar == NULL) 2996 return PyErr_NoMemory(); 2997 2998 unicode = PyUnicode_FromWideChar(wchar, len); 2999 PyMem_Free(wchar); 3000 return unicode; 3001 } 3002#endif 3003} 3004 3005 3006int 3007PyUnicode_FSConverter(PyObject* arg, void* addr) 3008{ 3009 PyObject *output = NULL; 3010 Py_ssize_t size; 3011 void *data; 3012 if (arg == NULL) { 3013 Py_DECREF(*(PyObject**)addr); 3014 return 1; 3015 } 3016 if (PyBytes_Check(arg)) { 3017 output = arg; 3018 Py_INCREF(output); 3019 } 3020 else { 3021 arg = PyUnicode_FromObject(arg); 3022 if (!arg) 3023 return 0; 3024 output = PyUnicode_EncodeFSDefault(arg); 3025 Py_DECREF(arg); 3026 if (!output) 3027 return 0; 3028 if (!PyBytes_Check(output)) { 3029 Py_DECREF(output); 3030 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes"); 3031 return 0; 3032 } 3033 } 3034 size = PyBytes_GET_SIZE(output); 3035 data = PyBytes_AS_STRING(output); 3036 if (size != strlen(data)) { 3037 PyErr_SetString(PyExc_TypeError, "embedded NUL character"); 3038 Py_DECREF(output); 3039 return 0; 3040 } 3041 *(PyObject**)addr = output; 3042 return Py_CLEANUP_SUPPORTED; 3043} 3044 3045 3046int 3047PyUnicode_FSDecoder(PyObject* arg, void* addr) 3048{ 3049 PyObject *output = NULL; 3050 if (arg == NULL) { 3051 Py_DECREF(*(PyObject**)addr); 3052 return 1; 3053 } 3054 if (PyUnicode_Check(arg)) { 3055 if (PyUnicode_READY(arg)) 3056 return 0; 3057 output = arg; 3058 Py_INCREF(output); 3059 } 3060 else { 3061 arg = PyBytes_FromObject(arg); 3062 if (!arg) 3063 return 0; 3064 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg), 3065 PyBytes_GET_SIZE(arg)); 3066 Py_DECREF(arg); 3067 if (!output) 3068 return 0; 3069 if (!PyUnicode_Check(output)) { 3070 Py_DECREF(output); 3071 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode"); 3072 return 0; 3073 } 3074 } 3075 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output), 3076 PyUnicode_GET_LENGTH(output), 0, 1)) { 3077 PyErr_SetString(PyExc_TypeError, "embedded NUL character"); 3078 Py_DECREF(output); 3079 return 0; 3080 } 3081 *(PyObject**)addr = output; 3082 return Py_CLEANUP_SUPPORTED; 3083} 3084 3085 3086char* 3087PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize) 3088{ 3089 PyObject *bytes; 3090 PyUnicodeObject *u = (PyUnicodeObject *)unicode; 3091 3092 if (!PyUnicode_Check(unicode)) { 3093 PyErr_BadArgument(); 3094 return NULL; 3095 } 3096 if (PyUnicode_READY(u) == -1) 3097 return NULL; 3098 3099 if (PyUnicode_UTF8(unicode) == NULL) { 3100 assert(!PyUnicode_IS_COMPACT_ASCII(unicode)); 3101 bytes = _PyUnicode_AsUTF8String(unicode, "strict"); 3102 if (bytes == NULL) 3103 return NULL; 3104 _PyUnicode_UTF8(u) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1); 3105 if (_PyUnicode_UTF8(u) == NULL) { 3106 Py_DECREF(bytes); 3107 return NULL; 3108 } 3109 _PyUnicode_UTF8_LENGTH(u) = PyBytes_GET_SIZE(bytes); 3110 Py_MEMCPY(_PyUnicode_UTF8(u), PyBytes_AS_STRING(bytes), _PyUnicode_UTF8_LENGTH(u) + 1); 3111 Py_DECREF(bytes); 3112 } 3113 3114 if (psize) 3115 *psize = PyUnicode_UTF8_LENGTH(unicode); 3116 return PyUnicode_UTF8(unicode); 3117} 3118 3119char* 3120PyUnicode_AsUTF8(PyObject *unicode) 3121{ 3122 return PyUnicode_AsUTF8AndSize(unicode, NULL); 3123} 3124 3125#ifdef Py_DEBUG 3126int unicode_as_unicode_calls = 0; 3127#endif 3128 3129 3130Py_UNICODE * 3131PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size) 3132{ 3133 PyUnicodeObject *u; 3134 const unsigned char *one_byte; 3135#if SIZEOF_WCHAR_T == 4 3136 const Py_UCS2 *two_bytes; 3137#else 3138 const Py_UCS4 *four_bytes; 3139 const Py_UCS4 *ucs4_end; 3140 Py_ssize_t num_surrogates; 3141#endif 3142 wchar_t *w; 3143 wchar_t *wchar_end; 3144 3145 if (!PyUnicode_Check(unicode)) { 3146 PyErr_BadArgument(); 3147 return NULL; 3148 } 3149 u = (PyUnicodeObject*)unicode; 3150 if (_PyUnicode_WSTR(u) == NULL) { 3151 /* Non-ASCII compact unicode object */ 3152 assert(_PyUnicode_KIND(u) != 0); 3153 assert(PyUnicode_IS_READY(u)); 3154 3155#ifdef Py_DEBUG 3156 ++unicode_as_unicode_calls; 3157#endif 3158 3159 if (PyUnicode_KIND(u) == PyUnicode_4BYTE_KIND) { 3160#if SIZEOF_WCHAR_T == 2 3161 four_bytes = PyUnicode_4BYTE_DATA(u); 3162 ucs4_end = four_bytes + _PyUnicode_LENGTH(u); 3163 num_surrogates = 0; 3164 3165 for (; four_bytes < ucs4_end; ++four_bytes) { 3166 if (*four_bytes > 0xFFFF) 3167 ++num_surrogates; 3168 } 3169 3170 _PyUnicode_WSTR(u) = (wchar_t *) PyObject_MALLOC( 3171 sizeof(wchar_t) * (_PyUnicode_LENGTH(u) + 1 + num_surrogates)); 3172 if (!_PyUnicode_WSTR(u)) { 3173 PyErr_NoMemory(); 3174 return NULL; 3175 } 3176 _PyUnicode_WSTR_LENGTH(u) = _PyUnicode_LENGTH(u) + num_surrogates; 3177 3178 w = _PyUnicode_WSTR(u); 3179 wchar_end = w + _PyUnicode_WSTR_LENGTH(u); 3180 four_bytes = PyUnicode_4BYTE_DATA(u); 3181 for (; four_bytes < ucs4_end; ++four_bytes, ++w) { 3182 if (*four_bytes > 0xFFFF) { 3183 /* encode surrogate pair in this case */ 3184 *w++ = 0xD800 | ((*four_bytes - 0x10000) >> 10); 3185 *w = 0xDC00 | ((*four_bytes - 0x10000) & 0x3FF); 3186 } 3187 else 3188 *w = *four_bytes; 3189 3190 if (w > wchar_end) { 3191 assert(0 && "Miscalculated string end"); 3192 } 3193 } 3194 *w = 0; 3195#else 3196 /* sizeof(wchar_t) == 4 */ 3197 Py_FatalError("Impossible unicode object state, wstr and str " 3198 "should share memory already."); 3199 return NULL; 3200#endif 3201 } 3202 else { 3203 _PyUnicode_WSTR(u) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) * 3204 (_PyUnicode_LENGTH(u) + 1)); 3205 if (!_PyUnicode_WSTR(u)) { 3206 PyErr_NoMemory(); 3207 return NULL; 3208 } 3209 if (!PyUnicode_IS_COMPACT_ASCII(u)) 3210 _PyUnicode_WSTR_LENGTH(u) = _PyUnicode_LENGTH(u); 3211 w = _PyUnicode_WSTR(u); 3212 wchar_end = w + _PyUnicode_LENGTH(u); 3213 3214 if (PyUnicode_KIND(u) == PyUnicode_1BYTE_KIND) { 3215 one_byte = PyUnicode_1BYTE_DATA(u); 3216 for (; w < wchar_end; ++one_byte, ++w) 3217 *w = *one_byte; 3218 /* null-terminate the wstr */ 3219 *w = 0; 3220 } 3221 else if (PyUnicode_KIND(u) == PyUnicode_2BYTE_KIND) { 3222#if SIZEOF_WCHAR_T == 4 3223 two_bytes = PyUnicode_2BYTE_DATA(u); 3224 for (; w < wchar_end; ++two_bytes, ++w) 3225 *w = *two_bytes; 3226 /* null-terminate the wstr */ 3227 *w = 0; 3228#else 3229 /* sizeof(wchar_t) == 2 */ 3230 PyObject_FREE(_PyUnicode_WSTR(u)); 3231 _PyUnicode_WSTR(u) = NULL; 3232 Py_FatalError("Impossible unicode object state, wstr " 3233 "and str should share memory already."); 3234 return NULL; 3235#endif 3236 } 3237 else { 3238 assert(0 && "This should never happen."); 3239 } 3240 } 3241 } 3242 if (size != NULL) 3243 *size = PyUnicode_WSTR_LENGTH(u); 3244 return _PyUnicode_WSTR(u); 3245} 3246 3247Py_UNICODE * 3248PyUnicode_AsUnicode(PyObject *unicode) 3249{ 3250 return PyUnicode_AsUnicodeAndSize(unicode, NULL); 3251} 3252 3253 3254Py_ssize_t 3255PyUnicode_GetSize(PyObject *unicode) 3256{ 3257 if (!PyUnicode_Check(unicode)) { 3258 PyErr_BadArgument(); 3259 goto onError; 3260 } 3261 return PyUnicode_GET_SIZE(unicode); 3262 3263 onError: 3264 return -1; 3265} 3266 3267Py_ssize_t 3268PyUnicode_GetLength(PyObject *unicode) 3269{ 3270 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) { 3271 PyErr_BadArgument(); 3272 return -1; 3273 } 3274 3275 return PyUnicode_GET_LENGTH(unicode); 3276} 3277 3278Py_UCS4 3279PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index) 3280{ 3281 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) { 3282 PyErr_BadArgument(); 3283 return (Py_UCS4)-1; 3284 } 3285 if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) { 3286 PyErr_SetString(PyExc_IndexError, "string index out of range"); 3287 return (Py_UCS4)-1; 3288 } 3289 return PyUnicode_READ_CHAR(unicode, index); 3290} 3291 3292int 3293PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch) 3294{ 3295 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) { 3296 PyErr_BadArgument(); 3297 return -1; 3298 } 3299 if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) { 3300 PyErr_SetString(PyExc_IndexError, "string index out of range"); 3301 return -1; 3302 } 3303 if (_PyUnicode_Dirty(unicode)) 3304 return -1; 3305 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode), 3306 index, ch); 3307 return 0; 3308} 3309 3310const char * 3311PyUnicode_GetDefaultEncoding(void) 3312{ 3313 return "utf-8"; 3314} 3315 3316/* create or adjust a UnicodeDecodeError */ 3317static void 3318make_decode_exception(PyObject **exceptionObject, 3319 const char *encoding, 3320 const char *input, Py_ssize_t length, 3321 Py_ssize_t startpos, Py_ssize_t endpos, 3322 const char *reason) 3323{ 3324 if (*exceptionObject == NULL) { 3325 *exceptionObject = PyUnicodeDecodeError_Create( 3326 encoding, input, length, startpos, endpos, reason); 3327 } 3328 else { 3329 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos)) 3330 goto onError; 3331 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos)) 3332 goto onError; 3333 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason)) 3334 goto onError; 3335 } 3336 return; 3337 3338onError: 3339 Py_DECREF(*exceptionObject); 3340 *exceptionObject = NULL; 3341} 3342 3343/* error handling callback helper: 3344 build arguments, call the callback and check the arguments, 3345 if no exception occurred, copy the replacement to the output 3346 and adjust various state variables. 3347 return 0 on success, -1 on error 3348*/ 3349 3350static int 3351unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler, 3352 const char *encoding, const char *reason, 3353 const char **input, const char **inend, Py_ssize_t *startinpos, 3354 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr, 3355 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr) 3356{ 3357 static char *argparse = "O!n;decoding error handler must return (str, int) tuple"; 3358 3359 PyObject *restuple = NULL; 3360 PyObject *repunicode = NULL; 3361 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output); 3362 Py_ssize_t insize; 3363 Py_ssize_t requiredsize; 3364 Py_ssize_t newpos; 3365 const Py_UNICODE *repptr; 3366 PyObject *inputobj = NULL; 3367 Py_ssize_t repsize; 3368 int res = -1; 3369 3370 if (*errorHandler == NULL) { 3371 *errorHandler = PyCodec_LookupError(errors); 3372 if (*errorHandler == NULL) 3373 goto onError; 3374 } 3375 3376 make_decode_exception(exceptionObject, 3377 encoding, 3378 *input, *inend - *input, 3379 *startinpos, *endinpos, 3380 reason); 3381 if (*exceptionObject == NULL) 3382 goto onError; 3383 3384 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL); 3385 if (restuple == NULL) 3386 goto onError; 3387 if (!PyTuple_Check(restuple)) { 3388 PyErr_SetString(PyExc_TypeError, &argparse[4]); 3389 goto onError; 3390 } 3391 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos)) 3392 goto onError; 3393 3394 /* Copy back the bytes variables, which might have been modified by the 3395 callback */ 3396 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject); 3397 if (!inputobj) 3398 goto onError; 3399 if (!PyBytes_Check(inputobj)) { 3400 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes"); 3401 } 3402 *input = PyBytes_AS_STRING(inputobj); 3403 insize = PyBytes_GET_SIZE(inputobj); 3404 *inend = *input + insize; 3405 /* we can DECREF safely, as the exception has another reference, 3406 so the object won't go away. */ 3407 Py_DECREF(inputobj); 3408 3409 if (newpos<0) 3410 newpos = insize+newpos; 3411 if (newpos<0 || newpos>insize) { 3412 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos); 3413 goto onError; 3414 } 3415 3416 /* need more space? (at least enough for what we 3417 have+the replacement+the rest of the string (starting 3418 at the new input position), so we won't have to check space 3419 when there are no errors in the rest of the string) */ 3420 repptr = PyUnicode_AS_UNICODE(repunicode); 3421 repsize = PyUnicode_GET_SIZE(repunicode); 3422 requiredsize = *outpos + repsize + insize-newpos; 3423 if (requiredsize > outsize) { 3424 if (requiredsize<2*outsize) 3425 requiredsize = 2*outsize; 3426 if (PyUnicode_Resize((PyObject**)output, requiredsize) < 0) 3427 goto onError; 3428 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos; 3429 } 3430 *endinpos = newpos; 3431 *inptr = *input + newpos; 3432 Py_UNICODE_COPY(*outptr, repptr, repsize); 3433 *outptr += repsize; 3434 *outpos += repsize; 3435 3436 /* we made it! */ 3437 res = 0; 3438 3439 onError: 3440 Py_XDECREF(restuple); 3441 return res; 3442} 3443 3444/* --- UTF-7 Codec -------------------------------------------------------- */ 3445 3446/* See RFC2152 for details. We encode conservatively and decode liberally. */ 3447 3448/* Three simple macros defining base-64. */ 3449 3450/* Is c a base-64 character? */ 3451 3452#define IS_BASE64(c) \ 3453 (((c) >= 'A' && (c) <= 'Z') || \ 3454 ((c) >= 'a' && (c) <= 'z') || \ 3455 ((c) >= '0' && (c) <= '9') || \ 3456 (c) == '+' || (c) == '/') 3457 3458/* given that c is a base-64 character, what is its base-64 value? */ 3459 3460#define FROM_BASE64(c) \ 3461 (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' : \ 3462 ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 : \ 3463 ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 : \ 3464 (c) == '+' ? 62 : 63) 3465 3466/* What is the base-64 character of the bottom 6 bits of n? */ 3467 3468#define TO_BASE64(n) \ 3469 ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f]) 3470 3471/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be 3472 * decoded as itself. We are permissive on decoding; the only ASCII 3473 * byte not decoding to itself is the + which begins a base64 3474 * string. */ 3475 3476#define DECODE_DIRECT(c) \ 3477 ((c) <= 127 && (c) != '+') 3478 3479/* The UTF-7 encoder treats ASCII characters differently according to 3480 * whether they are Set D, Set O, Whitespace, or special (i.e. none of 3481 * the above). See RFC2152. This array identifies these different 3482 * sets: 3483 * 0 : "Set D" 3484 * alphanumeric and '(),-./:? 3485 * 1 : "Set O" 3486 * !"#$%&*;<=>@[]^_`{|} 3487 * 2 : "whitespace" 3488 * ht nl cr sp 3489 * 3 : special (must be base64 encoded) 3490 * everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127) 3491 */ 3492 3493static 3494char utf7_category[128] = { 3495/* nul soh stx etx eot enq ack bel bs ht nl vt np cr so si */ 3496 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3, 3497/* dle dc1 dc2 dc3 dc4 nak syn etb can em sub esc fs gs rs us */ 3498 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3499/* sp ! " # $ % & ' ( ) * + , - . / */ 3500 2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 0, 3501/* 0 1 2 3 4 5 6 7 8 9 : ; < = > ? */ 3502 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 3503/* @ A B C D E F G H I J K L M N O */ 3504 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3505/* P Q R S T U V W X Y Z [ \ ] ^ _ */ 3506 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1, 3507/* ` a b c d e f g h i j k l m n o */ 3508 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3509/* p q r s t u v w x y z { | } ~ del */ 3510 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3, 3511}; 3512 3513/* ENCODE_DIRECT: this character should be encoded as itself. The 3514 * answer depends on whether we are encoding set O as itself, and also 3515 * on whether we are encoding whitespace as itself. RFC2152 makes it 3516 * clear that the answers to these questions vary between 3517 * applications, so this code needs to be flexible. */ 3518 3519#define ENCODE_DIRECT(c, directO, directWS) \ 3520 ((c) < 128 && (c) > 0 && \ 3521 ((utf7_category[(c)] == 0) || \ 3522 (directWS && (utf7_category[(c)] == 2)) || \ 3523 (directO && (utf7_category[(c)] == 1)))) 3524 3525PyObject * 3526PyUnicode_DecodeUTF7(const char *s, 3527 Py_ssize_t size, 3528 const char *errors) 3529{ 3530 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL); 3531} 3532 3533/* The decoder. The only state we preserve is our read position, 3534 * i.e. how many characters we have consumed. So if we end in the 3535 * middle of a shift sequence we have to back off the read position 3536 * and the output to the beginning of the sequence, otherwise we lose 3537 * all the shift state (seen bits, number of bits seen, high 3538 * surrogate). */ 3539 3540PyObject * 3541PyUnicode_DecodeUTF7Stateful(const char *s, 3542 Py_ssize_t size, 3543 const char *errors, 3544 Py_ssize_t *consumed) 3545{ 3546 const char *starts = s; 3547 Py_ssize_t startinpos; 3548 Py_ssize_t endinpos; 3549 Py_ssize_t outpos; 3550 const char *e; 3551 PyUnicodeObject *unicode; 3552 Py_UNICODE *p; 3553 const char *errmsg = ""; 3554 int inShift = 0; 3555 Py_UNICODE *shiftOutStart; 3556 unsigned int base64bits = 0; 3557 unsigned long base64buffer = 0; 3558 Py_UNICODE surrogate = 0; 3559 PyObject *errorHandler = NULL; 3560 PyObject *exc = NULL; 3561 3562 unicode = _PyUnicode_New(size); 3563 if (!unicode) 3564 return NULL; 3565 if (size == 0) { 3566 if (consumed) 3567 *consumed = 0; 3568 return (PyObject *)unicode; 3569 } 3570 3571 p = PyUnicode_AS_UNICODE(unicode); 3572 shiftOutStart = p; 3573 e = s + size; 3574 3575 while (s < e) { 3576 Py_UNICODE ch; 3577 restart: 3578 ch = (unsigned char) *s; 3579 3580 if (inShift) { /* in a base-64 section */ 3581 if (IS_BASE64(ch)) { /* consume a base-64 character */ 3582 base64buffer = (base64buffer << 6) | FROM_BASE64(ch); 3583 base64bits += 6; 3584 s++; 3585 if (base64bits >= 16) { 3586 /* we have enough bits for a UTF-16 value */ 3587 Py_UNICODE outCh = (Py_UNICODE) 3588 (base64buffer >> (base64bits-16)); 3589 base64bits -= 16; 3590 base64buffer &= (1 << base64bits) - 1; /* clear high bits */ 3591 if (surrogate) { 3592 /* expecting a second surrogate */ 3593 if (outCh >= 0xDC00 && outCh <= 0xDFFF) { 3594#ifdef Py_UNICODE_WIDE 3595 *p++ = (((surrogate & 0x3FF)<<10) 3596 | (outCh & 0x3FF)) + 0x10000; 3597#else 3598 *p++ = surrogate; 3599 *p++ = outCh; 3600#endif 3601 surrogate = 0; 3602 } 3603 else { 3604 surrogate = 0; 3605 errmsg = "second surrogate missing"; 3606 goto utf7Error; 3607 } 3608 } 3609 else if (outCh >= 0xD800 && outCh <= 0xDBFF) { 3610 /* first surrogate */ 3611 surrogate = outCh; 3612 } 3613 else if (outCh >= 0xDC00 && outCh <= 0xDFFF) { 3614 errmsg = "unexpected second surrogate"; 3615 goto utf7Error; 3616 } 3617 else { 3618 *p++ = outCh; 3619 } 3620 } 3621 } 3622 else { /* now leaving a base-64 section */ 3623 inShift = 0; 3624 s++; 3625 if (surrogate) { 3626 errmsg = "second surrogate missing at end of shift sequence"; 3627 goto utf7Error; 3628 } 3629 if (base64bits > 0) { /* left-over bits */ 3630 if (base64bits >= 6) { 3631 /* We've seen at least one base-64 character */ 3632 errmsg = "partial character in shift sequence"; 3633 goto utf7Error; 3634 } 3635 else { 3636 /* Some bits remain; they should be zero */ 3637 if (base64buffer != 0) { 3638 errmsg = "non-zero padding bits in shift sequence"; 3639 goto utf7Error; 3640 } 3641 } 3642 } 3643 if (ch != '-') { 3644 /* '-' is absorbed; other terminating 3645 characters are preserved */ 3646 *p++ = ch; 3647 } 3648 } 3649 } 3650 else if ( ch == '+' ) { 3651 startinpos = s-starts; 3652 s++; /* consume '+' */ 3653 if (s < e && *s == '-') { /* '+-' encodes '+' */ 3654 s++; 3655 *p++ = '+'; 3656 } 3657 else { /* begin base64-encoded section */ 3658 inShift = 1; 3659 shiftOutStart = p; 3660 base64bits = 0; 3661 } 3662 } 3663 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */ 3664 *p++ = ch; 3665 s++; 3666 } 3667 else { 3668 startinpos = s-starts; 3669 s++; 3670 errmsg = "unexpected special character"; 3671 goto utf7Error; 3672 } 3673 continue; 3674utf7Error: 3675 outpos = p-PyUnicode_AS_UNICODE(unicode); 3676 endinpos = s-starts; 3677 if (unicode_decode_call_errorhandler( 3678 errors, &errorHandler, 3679 "utf7", errmsg, 3680 &starts, &e, &startinpos, &endinpos, &exc, &s, 3681 &unicode, &outpos, &p)) 3682 goto onError; 3683 } 3684 3685 /* end of string */ 3686 3687 if (inShift && !consumed) { /* in shift sequence, no more to follow */ 3688 /* if we're in an inconsistent state, that's an error */ 3689 if (surrogate || 3690 (base64bits >= 6) || 3691 (base64bits > 0 && base64buffer != 0)) { 3692 outpos = p-PyUnicode_AS_UNICODE(unicode); 3693 endinpos = size; 3694 if (unicode_decode_call_errorhandler( 3695 errors, &errorHandler, 3696 "utf7", "unterminated shift sequence", 3697 &starts, &e, &startinpos, &endinpos, &exc, &s, 3698 &unicode, &outpos, &p)) 3699 goto onError; 3700 if (s < e) 3701 goto restart; 3702 } 3703 } 3704 3705 /* return state */ 3706 if (consumed) { 3707 if (inShift) { 3708 p = shiftOutStart; /* back off output */ 3709 *consumed = startinpos; 3710 } 3711 else { 3712 *consumed = s-starts; 3713 } 3714 } 3715 3716 if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0) 3717 goto onError; 3718 3719 Py_XDECREF(errorHandler); 3720 Py_XDECREF(exc); 3721#ifndef DONT_MAKE_RESULT_READY 3722 if (_PyUnicode_READY_REPLACE(&unicode)) { 3723 Py_DECREF(unicode); 3724 return NULL; 3725 } 3726#endif 3727 return (PyObject *)unicode; 3728 3729 onError: 3730 Py_XDECREF(errorHandler); 3731 Py_XDECREF(exc); 3732 Py_DECREF(unicode); 3733 return NULL; 3734} 3735 3736 3737PyObject * 3738PyUnicode_EncodeUTF7(const Py_UNICODE *s, 3739 Py_ssize_t size, 3740 int base64SetO, 3741 int base64WhiteSpace, 3742 const char *errors) 3743{ 3744 PyObject *v; 3745 /* It might be possible to tighten this worst case */ 3746 Py_ssize_t allocated = 8 * size; 3747 int inShift = 0; 3748 Py_ssize_t i = 0; 3749 unsigned int base64bits = 0; 3750 unsigned long base64buffer = 0; 3751 char * out; 3752 char * start; 3753 3754 if (size == 0) 3755 return PyBytes_FromStringAndSize(NULL, 0); 3756 3757 if (allocated / 8 != size) 3758 return PyErr_NoMemory(); 3759 3760 v = PyBytes_FromStringAndSize(NULL, allocated); 3761 if (v == NULL) 3762 return NULL; 3763 3764 start = out = PyBytes_AS_STRING(v); 3765 for (;i < size; ++i) { 3766 Py_UNICODE ch = s[i]; 3767 3768 if (inShift) { 3769 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) { 3770 /* shifting out */ 3771 if (base64bits) { /* output remaining bits */ 3772 *out++ = TO_BASE64(base64buffer << (6-base64bits)); 3773 base64buffer = 0; 3774 base64bits = 0; 3775 } 3776 inShift = 0; 3777 /* Characters not in the BASE64 set implicitly unshift the sequence 3778 so no '-' is required, except if the character is itself a '-' */ 3779 if (IS_BASE64(ch) || ch == '-') { 3780 *out++ = '-'; 3781 } 3782 *out++ = (char) ch; 3783 } 3784 else { 3785 goto encode_char; 3786 } 3787 } 3788 else { /* not in a shift sequence */ 3789 if (ch == '+') { 3790 *out++ = '+'; 3791 *out++ = '-'; 3792 } 3793 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) { 3794 *out++ = (char) ch; 3795 } 3796 else { 3797 *out++ = '+'; 3798 inShift = 1; 3799 goto encode_char; 3800 } 3801 } 3802 continue; 3803encode_char: 3804#ifdef Py_UNICODE_WIDE 3805 if (ch >= 0x10000) { 3806 /* code first surrogate */ 3807 base64bits += 16; 3808 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10); 3809 while (base64bits >= 6) { 3810 *out++ = TO_BASE64(base64buffer >> (base64bits-6)); 3811 base64bits -= 6; 3812 } 3813 /* prepare second surrogate */ 3814 ch = 0xDC00 | ((ch-0x10000) & 0x3FF); 3815 } 3816#endif 3817 base64bits += 16; 3818 base64buffer = (base64buffer << 16) | ch; 3819 while (base64bits >= 6) { 3820 *out++ = TO_BASE64(base64buffer >> (base64bits-6)); 3821 base64bits -= 6; 3822 } 3823 } 3824 if (base64bits) 3825 *out++= TO_BASE64(base64buffer << (6-base64bits) ); 3826 if (inShift) 3827 *out++ = '-'; 3828 if (_PyBytes_Resize(&v, out - start) < 0) 3829 return NULL; 3830 return v; 3831} 3832 3833#undef IS_BASE64 3834#undef FROM_BASE64 3835#undef TO_BASE64 3836#undef DECODE_DIRECT 3837#undef ENCODE_DIRECT 3838 3839/* --- UTF-8 Codec -------------------------------------------------------- */ 3840 3841static 3842char utf8_code_length[256] = { 3843 /* Map UTF-8 encoded prefix byte to sequence length. Zero means 3844 illegal prefix. See RFC 3629 for details */ 3845 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */ 3846 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3847 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3848 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3849 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3850 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3851 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3852 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */ 3853 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */ 3854 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3855 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3856 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */ 3857 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */ 3858 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */ 3859 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */ 3860 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 /* F0-F4 + F5-FF */ 3861}; 3862 3863PyObject * 3864PyUnicode_DecodeUTF8(const char *s, 3865 Py_ssize_t size, 3866 const char *errors) 3867{ 3868 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL); 3869} 3870 3871/* Mask to check or force alignment of a pointer to C 'long' boundaries */ 3872#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1) 3873 3874/* Mask to quickly check whether a C 'long' contains a 3875 non-ASCII, UTF8-encoded char. */ 3876#if (SIZEOF_LONG == 8) 3877# define ASCII_CHAR_MASK 0x8080808080808080L 3878#elif (SIZEOF_LONG == 4) 3879# define ASCII_CHAR_MASK 0x80808080L 3880#else 3881# error C 'long' size should be either 4 or 8! 3882#endif 3883 3884/* Scans a UTF-8 string and returns the maximum character to be expected, 3885 the size of the decoded unicode string and if any major errors were 3886 encountered. 3887 3888 This function does check basic UTF-8 sanity, it does however NOT CHECK 3889 if the string contains surrogates, and if all continuation bytes are 3890 within the correct ranges, these checks are performed in 3891 PyUnicode_DecodeUTF8Stateful. 3892 3893 If it sets has_errors to 1, it means the value of unicode_size and max_char 3894 will be bogus and you should not rely on useful information in them. 3895 */ 3896static Py_UCS4 3897utf8_max_char_size_and_has_errors(const char *s, Py_ssize_t string_size, 3898 Py_ssize_t *unicode_size, Py_ssize_t* consumed, 3899 int *has_errors) 3900{ 3901 Py_ssize_t n; 3902 Py_ssize_t char_count = 0; 3903 Py_UCS4 max_char = 127, new_max; 3904 Py_UCS4 upper_bound; 3905 const unsigned char *p = (const unsigned char *)s; 3906 const unsigned char *end = p + string_size; 3907 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK); 3908 int err = 0; 3909 3910 for (; p < end && !err; ++p, ++char_count) { 3911 /* Only check value if it's not a ASCII char... */ 3912 if (*p < 0x80) { 3913 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for 3914 an explanation. */ 3915 if (!((size_t) p & LONG_PTR_MASK)) { 3916 /* Help register allocation */ 3917 register const unsigned char *_p = p; 3918 while (_p < aligned_end) { 3919 unsigned long value = *(unsigned long *) _p; 3920 if (value & ASCII_CHAR_MASK) 3921 break; 3922 _p += SIZEOF_LONG; 3923 char_count += SIZEOF_LONG; 3924 } 3925 p = _p; 3926 if (p == end) 3927 break; 3928 } 3929 } 3930 if (*p >= 0x80) { 3931 n = utf8_code_length[*p]; 3932 new_max = max_char; 3933 switch (n) { 3934 /* invalid start byte */ 3935 case 0: 3936 err = 1; 3937 break; 3938 case 2: 3939 /* Code points between 0x00FF and 0x07FF inclusive. 3940 Approximate the upper bound of the code point, 3941 if this flips over 255 we can be sure it will be more 3942 than 255 and the string will need 2 bytes per code coint, 3943 if it stays under or equal to 255, we can be sure 1 byte 3944 is enough. 3945 ((*p & 0b00011111) << 6) | 0b00111111 */ 3946 upper_bound = ((*p & 0x1F) << 6) | 0x3F; 3947 if (max_char < upper_bound) 3948 new_max = upper_bound; 3949 /* Ensure we track at least that we left ASCII space. */ 3950 if (new_max < 128) 3951 new_max = 128; 3952 break; 3953 case 3: 3954 /* Between 0x0FFF and 0xFFFF inclusive, so values are 3955 always > 255 and <= 65535 and will always need 2 bytes. */ 3956 if (max_char < 65535) 3957 new_max = 65535; 3958 break; 3959 case 4: 3960 /* Code point will be above 0xFFFF for sure in this case. */ 3961 new_max = 65537; 3962 break; 3963 /* Internal error, this should be caught by the first if */ 3964 case 1: 3965 default: 3966 assert(0 && "Impossible case in utf8_max_char_and_size"); 3967 err = 1; 3968 } 3969 /* Instead of number of overall bytes for this code point, 3970 n contains the number of following bytes: */ 3971 --n; 3972 /* Check if the follow up chars are all valid continuation bytes */ 3973 if (n >= 1) { 3974 const unsigned char *cont; 3975 if ((p + n) >= end) { 3976 if (consumed == 0) 3977 /* incomplete data, non-incremental decoding */ 3978 err = 1; 3979 break; 3980 } 3981 for (cont = p + 1; cont < (p + n); ++cont) { 3982 if ((*cont & 0xc0) != 0x80) { 3983 err = 1; 3984 break; 3985 } 3986 } 3987 p += n; 3988 } 3989 else 3990 err = 1; 3991 max_char = new_max; 3992 } 3993 } 3994 3995 if (unicode_size) 3996 *unicode_size = char_count; 3997 if (has_errors) 3998 *has_errors = err; 3999 return max_char; 4000} 4001 4002/* Similar to PyUnicode_WRITE but can also write into wstr field 4003 of the legacy unicode representation */ 4004#define WRITE_FLEXIBLE_OR_WSTR(kind, buf, index, value) \ 4005 do { \ 4006 const int k_ = (kind); \ 4007 if (k_ == PyUnicode_WCHAR_KIND) \ 4008 ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value); \ 4009 else if (k_ == PyUnicode_1BYTE_KIND) \ 4010 ((unsigned char *)(buf))[(index)] = (unsigned char)(value); \ 4011 else if (k_ == PyUnicode_2BYTE_KIND) \ 4012 ((Py_UCS2 *)(buf))[(index)] = (Py_UCS2)(value); \ 4013 else \ 4014 ((Py_UCS4 *)(buf))[(index)] = (Py_UCS4)(value); \ 4015 } while (0) 4016 4017PyObject * 4018PyUnicode_DecodeUTF8Stateful(const char *s, 4019 Py_ssize_t size, 4020 const char *errors, 4021 Py_ssize_t *consumed) 4022{ 4023 const char *starts = s; 4024 int n; 4025 int k; 4026 Py_ssize_t startinpos; 4027 Py_ssize_t endinpos; 4028 const char *e, *aligned_end; 4029 PyUnicodeObject *unicode; 4030 const char *errmsg = ""; 4031 PyObject *errorHandler = NULL; 4032 PyObject *exc = NULL; 4033 Py_UCS4 maxchar = 0; 4034 Py_ssize_t unicode_size; 4035 Py_ssize_t i; 4036 int kind; 4037 void *data; 4038 int has_errors; 4039 Py_UNICODE *error_outptr; 4040#if SIZEOF_WCHAR_T == 2 4041 Py_ssize_t wchar_offset = 0; 4042#endif 4043 4044 if (size == 0) { 4045 if (consumed) 4046 *consumed = 0; 4047 return (PyObject *)PyUnicode_New(0, 0); 4048 } 4049 maxchar = utf8_max_char_size_and_has_errors(s, size, &unicode_size, 4050 consumed, &has_errors); 4051 if (has_errors) { 4052 unicode = _PyUnicode_New(size); 4053 if (!unicode) 4054 return NULL; 4055 kind = PyUnicode_WCHAR_KIND; 4056 data = PyUnicode_AS_UNICODE(unicode); 4057 assert(data != NULL); 4058 } 4059 else { 4060 unicode = (PyUnicodeObject *)PyUnicode_New(unicode_size, maxchar); 4061 if (!unicode) 4062 return NULL; 4063 /* When the string is ASCII only, just use memcpy and return. 4064 unicode_size may be != size if there is an incomplete UTF-8 4065 sequence at the end of the ASCII block. */ 4066 if (maxchar < 128 && size == unicode_size) { 4067 Py_MEMCPY(PyUnicode_1BYTE_DATA(unicode), s, unicode_size); 4068 return (PyObject *)unicode; 4069 } 4070 kind = PyUnicode_KIND(unicode); 4071 data = PyUnicode_DATA(unicode); 4072 } 4073 /* Unpack UTF-8 encoded data */ 4074 i = 0; 4075 e = s + size; 4076 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK); 4077 4078 while (s < e) { 4079 Py_UCS4 ch = (unsigned char)*s; 4080 4081 if (ch < 0x80) { 4082 /* Fast path for runs of ASCII characters. Given that common UTF-8 4083 input will consist of an overwhelming majority of ASCII 4084 characters, we try to optimize for this case by checking 4085 as many characters as a C 'long' can contain. 4086 First, check if we can do an aligned read, as most CPUs have 4087 a penalty for unaligned reads. 4088 */ 4089 if (!((size_t) s & LONG_PTR_MASK)) { 4090 /* Help register allocation */ 4091 register const char *_s = s; 4092 register Py_ssize_t _i = i; 4093 while (_s < aligned_end) { 4094 /* Read a whole long at a time (either 4 or 8 bytes), 4095 and do a fast unrolled copy if it only contains ASCII 4096 characters. */ 4097 unsigned long value = *(unsigned long *) _s; 4098 if (value & ASCII_CHAR_MASK) 4099 break; 4100 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+0, _s[0]); 4101 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+1, _s[1]); 4102 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+2, _s[2]); 4103 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+3, _s[3]); 4104#if (SIZEOF_LONG == 8) 4105 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+4, _s[4]); 4106 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+5, _s[5]); 4107 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+6, _s[6]); 4108 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+7, _s[7]); 4109#endif 4110 _s += SIZEOF_LONG; 4111 _i += SIZEOF_LONG; 4112 } 4113 s = _s; 4114 i = _i; 4115 if (s == e) 4116 break; 4117 ch = (unsigned char)*s; 4118 } 4119 } 4120 4121 if (ch < 0x80) { 4122 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch); 4123 s++; 4124 continue; 4125 } 4126 4127 n = utf8_code_length[ch]; 4128 4129 if (s + n > e) { 4130 if (consumed) 4131 break; 4132 else { 4133 errmsg = "unexpected end of data"; 4134 startinpos = s-starts; 4135 endinpos = startinpos+1; 4136 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++) 4137 endinpos++; 4138 goto utf8Error; 4139 } 4140 } 4141 4142 switch (n) { 4143 4144 case 0: 4145 errmsg = "invalid start byte"; 4146 startinpos = s-starts; 4147 endinpos = startinpos+1; 4148 goto utf8Error; 4149 4150 case 1: 4151 errmsg = "internal error"; 4152 startinpos = s-starts; 4153 endinpos = startinpos+1; 4154 goto utf8Error; 4155 4156 case 2: 4157 if ((s[1] & 0xc0) != 0x80) { 4158 errmsg = "invalid continuation byte"; 4159 startinpos = s-starts; 4160 endinpos = startinpos + 1; 4161 goto utf8Error; 4162 } 4163 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f); 4164 assert ((ch > 0x007F) && (ch <= 0x07FF)); 4165 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch); 4166 break; 4167 4168 case 3: 4169 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf 4170 will result in surrogates in range d800-dfff. Surrogates are 4171 not valid UTF-8 so they are rejected. 4172 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf 4173 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */ 4174 if ((s[1] & 0xc0) != 0x80 || 4175 (s[2] & 0xc0) != 0x80 || 4176 ((unsigned char)s[0] == 0xE0 && 4177 (unsigned char)s[1] < 0xA0) || 4178 ((unsigned char)s[0] == 0xED && 4179 (unsigned char)s[1] > 0x9F)) { 4180 errmsg = "invalid continuation byte"; 4181 startinpos = s-starts; 4182 endinpos = startinpos + 1; 4183 4184 /* if s[1] first two bits are 1 and 0, then the invalid 4185 continuation byte is s[2], so increment endinpos by 1, 4186 if not, s[1] is invalid and endinpos doesn't need to 4187 be incremented. */ 4188 if ((s[1] & 0xC0) == 0x80) 4189 endinpos++; 4190 goto utf8Error; 4191 } 4192 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f); 4193 assert ((ch > 0x07FF) && (ch <= 0xFFFF)); 4194 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch); 4195 break; 4196 4197 case 4: 4198 if ((s[1] & 0xc0) != 0x80 || 4199 (s[2] & 0xc0) != 0x80 || 4200 (s[3] & 0xc0) != 0x80 || 4201 ((unsigned char)s[0] == 0xF0 && 4202 (unsigned char)s[1] < 0x90) || 4203 ((unsigned char)s[0] == 0xF4 && 4204 (unsigned char)s[1] > 0x8F)) { 4205 errmsg = "invalid continuation byte"; 4206 startinpos = s-starts; 4207 endinpos = startinpos + 1; 4208 if ((s[1] & 0xC0) == 0x80) { 4209 endinpos++; 4210 if ((s[2] & 0xC0) == 0x80) 4211 endinpos++; 4212 } 4213 goto utf8Error; 4214 } 4215 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) + 4216 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f); 4217 assert ((ch > 0xFFFF) && (ch <= 0x10ffff)); 4218 4219 /* If the string is flexible or we have native UCS-4, write 4220 directly.. */ 4221 if (sizeof(Py_UNICODE) > 2 || kind != PyUnicode_WCHAR_KIND) 4222 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch); 4223 4224 else { 4225 /* compute and append the two surrogates: */ 4226 4227 /* translate from 10000..10FFFF to 0..FFFF */ 4228 ch -= 0x10000; 4229 4230 /* high surrogate = top 10 bits added to D800 */ 4231 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, 4232 (Py_UNICODE)(0xD800 + (ch >> 10))); 4233 4234 /* low surrogate = bottom 10 bits added to DC00 */ 4235 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, 4236 (Py_UNICODE)(0xDC00 + (ch & 0x03FF))); 4237 } 4238#if SIZEOF_WCHAR_T == 2 4239 wchar_offset++; 4240#endif 4241 break; 4242 } 4243 s += n; 4244 continue; 4245 4246 utf8Error: 4247 /* If this is not yet a resizable string, make it one.. */ 4248 if (kind != PyUnicode_WCHAR_KIND) { 4249 const Py_UNICODE *u; 4250 PyUnicodeObject *new_unicode = _PyUnicode_New(size); 4251 if (!new_unicode) 4252 goto onError; 4253 u = PyUnicode_AsUnicode((PyObject *)unicode); 4254 if (!u) 4255 goto onError; 4256#if SIZEOF_WCHAR_T == 2 4257 i += wchar_offset; 4258#endif 4259 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(new_unicode), u, i); 4260 Py_DECREF(unicode); 4261 unicode = new_unicode; 4262 kind = 0; 4263 data = PyUnicode_AS_UNICODE(new_unicode); 4264 assert(data != NULL); 4265 } 4266 error_outptr = PyUnicode_AS_UNICODE(unicode) + i; 4267 if (unicode_decode_call_errorhandler( 4268 errors, &errorHandler, 4269 "utf8", errmsg, 4270 &starts, &e, &startinpos, &endinpos, &exc, &s, 4271 &unicode, &i, &error_outptr)) 4272 goto onError; 4273 /* Update data because unicode_decode_call_errorhandler might have 4274 re-created or resized the unicode object. */ 4275 data = PyUnicode_AS_UNICODE(unicode); 4276 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK); 4277 } 4278 /* Ensure the unicode_size calculation above was correct: */ 4279 assert(kind == PyUnicode_WCHAR_KIND || i == unicode_size); 4280 4281 if (consumed) 4282 *consumed = s-starts; 4283 4284 /* Adjust length and ready string when it contained errors and 4285 is of the old resizable kind. */ 4286 if (kind == PyUnicode_WCHAR_KIND) { 4287 if (PyUnicode_Resize((PyObject**)&unicode, i) < 0) 4288 goto onError; 4289 } 4290 4291 Py_XDECREF(errorHandler); 4292 Py_XDECREF(exc); 4293#ifndef DONT_MAKE_RESULT_READY 4294 if (_PyUnicode_READY_REPLACE(&unicode)) { 4295 Py_DECREF(unicode); 4296 return NULL; 4297 } 4298#endif 4299 return (PyObject *)unicode; 4300 4301 onError: 4302 Py_XDECREF(errorHandler); 4303 Py_XDECREF(exc); 4304 Py_DECREF(unicode); 4305 return NULL; 4306} 4307 4308#undef WRITE_FLEXIBLE_OR_WSTR 4309 4310#ifdef __APPLE__ 4311 4312/* Simplified UTF-8 decoder using surrogateescape error handler, 4313 used to decode the command line arguments on Mac OS X. */ 4314 4315wchar_t* 4316_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size) 4317{ 4318 int n; 4319 const char *e; 4320 wchar_t *unicode, *p; 4321 4322 /* Note: size will always be longer than the resulting Unicode 4323 character count */ 4324 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) { 4325 PyErr_NoMemory(); 4326 return NULL; 4327 } 4328 unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t)); 4329 if (!unicode) 4330 return NULL; 4331 4332 /* Unpack UTF-8 encoded data */ 4333 p = unicode; 4334 e = s + size; 4335 while (s < e) { 4336 Py_UCS4 ch = (unsigned char)*s; 4337 4338 if (ch < 0x80) { 4339 *p++ = (wchar_t)ch; 4340 s++; 4341 continue; 4342 } 4343 4344 n = utf8_code_length[ch]; 4345 if (s + n > e) { 4346 goto surrogateescape; 4347 } 4348 4349 switch (n) { 4350 case 0: 4351 case 1: 4352 goto surrogateescape; 4353 4354 case 2: 4355 if ((s[1] & 0xc0) != 0x80) 4356 goto surrogateescape; 4357 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f); 4358 assert ((ch > 0x007F) && (ch <= 0x07FF)); 4359 *p++ = (wchar_t)ch; 4360 break; 4361 4362 case 3: 4363 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf 4364 will result in surrogates in range d800-dfff. Surrogates are 4365 not valid UTF-8 so they are rejected. 4366 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf 4367 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */ 4368 if ((s[1] & 0xc0) != 0x80 || 4369 (s[2] & 0xc0) != 0x80 || 4370 ((unsigned char)s[0] == 0xE0 && 4371 (unsigned char)s[1] < 0xA0) || 4372 ((unsigned char)s[0] == 0xED && 4373 (unsigned char)s[1] > 0x9F)) { 4374 4375 goto surrogateescape; 4376 } 4377 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f); 4378 assert ((ch > 0x07FF) && (ch <= 0xFFFF)); 4379 *p++ = (wchar_t)ch; 4380 break; 4381 4382 case 4: 4383 if ((s[1] & 0xc0) != 0x80 || 4384 (s[2] & 0xc0) != 0x80 || 4385 (s[3] & 0xc0) != 0x80 || 4386 ((unsigned char)s[0] == 0xF0 && 4387 (unsigned char)s[1] < 0x90) || 4388 ((unsigned char)s[0] == 0xF4 && 4389 (unsigned char)s[1] > 0x8F)) { 4390 goto surrogateescape; 4391 } 4392 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) + 4393 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f); 4394 assert ((ch > 0xFFFF) && (ch <= 0x10ffff)); 4395 4396#if SIZEOF_WCHAR_T == 4 4397 *p++ = (wchar_t)ch; 4398#else 4399 /* compute and append the two surrogates: */ 4400 4401 /* translate from 10000..10FFFF to 0..FFFF */ 4402 ch -= 0x10000; 4403 4404 /* high surrogate = top 10 bits added to D800 */ 4405 *p++ = (wchar_t)(0xD800 + (ch >> 10)); 4406 4407 /* low surrogate = bottom 10 bits added to DC00 */ 4408 *p++ = (wchar_t)(0xDC00 + (ch & 0x03FF)); 4409#endif 4410 break; 4411 } 4412 s += n; 4413 continue; 4414 4415 surrogateescape: 4416 *p++ = 0xDC00 + ch; 4417 s++; 4418 } 4419 *p = L'\0'; 4420 return unicode; 4421} 4422 4423#endif /* __APPLE__ */ 4424 4425/* Primary internal function which creates utf8 encoded bytes objects. 4426 4427 Allocation strategy: if the string is short, convert into a stack buffer 4428 and allocate exactly as much space needed at the end. Else allocate the 4429 maximum possible needed (4 result bytes per Unicode character), and return 4430 the excess memory at the end. 4431*/ 4432PyObject * 4433_PyUnicode_AsUTF8String(PyObject *obj, const char *errors) 4434{ 4435#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */ 4436 4437 Py_ssize_t i; /* index into s of next input byte */ 4438 PyObject *result; /* result string object */ 4439 char *p; /* next free byte in output buffer */ 4440 Py_ssize_t nallocated; /* number of result bytes allocated */ 4441 Py_ssize_t nneeded; /* number of result bytes needed */ 4442 char stackbuf[MAX_SHORT_UNICHARS * 4]; 4443 PyObject *errorHandler = NULL; 4444 PyObject *exc = NULL; 4445 int kind; 4446 void *data; 4447 Py_ssize_t size; 4448 PyUnicodeObject *unicode = (PyUnicodeObject *)obj; 4449#if SIZEOF_WCHAR_T == 2 4450 Py_ssize_t wchar_offset = 0; 4451#endif 4452 4453 if (!PyUnicode_Check(unicode)) { 4454 PyErr_BadArgument(); 4455 return NULL; 4456 } 4457 4458 if (PyUnicode_READY(unicode) == -1) 4459 return NULL; 4460 4461 if (PyUnicode_UTF8(unicode)) 4462 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode), 4463 PyUnicode_UTF8_LENGTH(unicode)); 4464 4465 kind = PyUnicode_KIND(unicode); 4466 data = PyUnicode_DATA(unicode); 4467 size = PyUnicode_GET_LENGTH(unicode); 4468 4469 assert(size >= 0); 4470 4471 if (size <= MAX_SHORT_UNICHARS) { 4472 /* Write into the stack buffer; nallocated can't overflow. 4473 * At the end, we'll allocate exactly as much heap space as it 4474 * turns out we need. 4475 */ 4476 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int); 4477 result = NULL; /* will allocate after we're done */ 4478 p = stackbuf; 4479 } 4480 else { 4481 /* Overallocate on the heap, and give the excess back at the end. */ 4482 nallocated = size * 4; 4483 if (nallocated / 4 != size) /* overflow! */ 4484 return PyErr_NoMemory(); 4485 result = PyBytes_FromStringAndSize(NULL, nallocated); 4486 if (result == NULL) 4487 return NULL; 4488 p = PyBytes_AS_STRING(result); 4489 } 4490 4491 for (i = 0; i < size;) { 4492 Py_UCS4 ch = PyUnicode_READ(kind, data, i++); 4493 4494 if (ch < 0x80) 4495 /* Encode ASCII */ 4496 *p++ = (char) ch; 4497 4498 else if (ch < 0x0800) { 4499 /* Encode Latin-1 */ 4500 *p++ = (char)(0xc0 | (ch >> 6)); 4501 *p++ = (char)(0x80 | (ch & 0x3f)); 4502 } else if (0xD800 <= ch && ch <= 0xDFFF) { 4503 Py_ssize_t newpos; 4504 PyObject *rep; 4505 Py_ssize_t repsize, k, startpos; 4506 startpos = i-1; 4507#if SIZEOF_WCHAR_T == 2 4508 startpos += wchar_offset; 4509#endif 4510 rep = unicode_encode_call_errorhandler( 4511 errors, &errorHandler, "utf-8", "surrogates not allowed", 4512 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode), 4513 &exc, startpos, startpos+1, &newpos); 4514 if (!rep) 4515 goto error; 4516 4517 if (PyBytes_Check(rep)) 4518 repsize = PyBytes_GET_SIZE(rep); 4519 else 4520 repsize = PyUnicode_GET_SIZE(rep); 4521 4522 if (repsize > 4) { 4523 Py_ssize_t offset; 4524 4525 if (result == NULL) 4526 offset = p - stackbuf; 4527 else 4528 offset = p - PyBytes_AS_STRING(result); 4529 4530 if (nallocated > PY_SSIZE_T_MAX - repsize + 4) { 4531 /* integer overflow */ 4532 PyErr_NoMemory(); 4533 goto error; 4534 } 4535 nallocated += repsize - 4; 4536 if (result != NULL) { 4537 if (_PyBytes_Resize(&result, nallocated) < 0) 4538 goto error; 4539 } else { 4540 result = PyBytes_FromStringAndSize(NULL, nallocated); 4541 if (result == NULL) 4542 goto error; 4543 Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset); 4544 } 4545 p = PyBytes_AS_STRING(result) + offset; 4546 } 4547 4548 if (PyBytes_Check(rep)) { 4549 char *prep = PyBytes_AS_STRING(rep); 4550 for(k = repsize; k > 0; k--) 4551 *p++ = *prep++; 4552 } else /* rep is unicode */ { 4553 const Py_UNICODE *prep = PyUnicode_AS_UNICODE(rep); 4554 Py_UNICODE c; 4555 4556 for(k=0; k<repsize; k++) { 4557 c = prep[k]; 4558 if (0x80 <= c) { 4559 raise_encode_exception(&exc, "utf-8", 4560 PyUnicode_AS_UNICODE(unicode), 4561 size, i-1, i, 4562 "surrogates not allowed"); 4563 goto error; 4564 } 4565 *p++ = (char)prep[k]; 4566 } 4567 } 4568 Py_DECREF(rep); 4569 } else if (ch < 0x10000) { 4570 *p++ = (char)(0xe0 | (ch >> 12)); 4571 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); 4572 *p++ = (char)(0x80 | (ch & 0x3f)); 4573 } else /* ch >= 0x10000 */ { 4574 /* Encode UCS4 Unicode ordinals */ 4575 *p++ = (char)(0xf0 | (ch >> 18)); 4576 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f)); 4577 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); 4578 *p++ = (char)(0x80 | (ch & 0x3f)); 4579#if SIZEOF_WCHAR_T == 2 4580 wchar_offset++; 4581#endif 4582 } 4583 } 4584 4585 if (result == NULL) { 4586 /* This was stack allocated. */ 4587 nneeded = p - stackbuf; 4588 assert(nneeded <= nallocated); 4589 result = PyBytes_FromStringAndSize(stackbuf, nneeded); 4590 } 4591 else { 4592 /* Cut back to size actually needed. */ 4593 nneeded = p - PyBytes_AS_STRING(result); 4594 assert(nneeded <= nallocated); 4595 _PyBytes_Resize(&result, nneeded); 4596 } 4597 4598 Py_XDECREF(errorHandler); 4599 Py_XDECREF(exc); 4600 return result; 4601 error: 4602 Py_XDECREF(errorHandler); 4603 Py_XDECREF(exc); 4604 Py_XDECREF(result); 4605 return NULL; 4606 4607#undef MAX_SHORT_UNICHARS 4608} 4609 4610PyObject * 4611PyUnicode_EncodeUTF8(const Py_UNICODE *s, 4612 Py_ssize_t size, 4613 const char *errors) 4614{ 4615 PyObject *v, *unicode; 4616 4617 unicode = PyUnicode_FromUnicode(s, size); 4618 if (unicode == NULL) 4619 return NULL; 4620 v = _PyUnicode_AsUTF8String(unicode, errors); 4621 Py_DECREF(unicode); 4622 return v; 4623} 4624 4625PyObject * 4626PyUnicode_AsUTF8String(PyObject *unicode) 4627{ 4628 return _PyUnicode_AsUTF8String(unicode, NULL); 4629} 4630 4631/* --- UTF-32 Codec ------------------------------------------------------- */ 4632 4633PyObject * 4634PyUnicode_DecodeUTF32(const char *s, 4635 Py_ssize_t size, 4636 const char *errors, 4637 int *byteorder) 4638{ 4639 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL); 4640} 4641 4642PyObject * 4643PyUnicode_DecodeUTF32Stateful(const char *s, 4644 Py_ssize_t size, 4645 const char *errors, 4646 int *byteorder, 4647 Py_ssize_t *consumed) 4648{ 4649 const char *starts = s; 4650 Py_ssize_t startinpos; 4651 Py_ssize_t endinpos; 4652 Py_ssize_t outpos; 4653 PyUnicodeObject *unicode; 4654 Py_UNICODE *p; 4655#ifndef Py_UNICODE_WIDE 4656 int pairs = 0; 4657 const unsigned char *qq; 4658#else 4659 const int pairs = 0; 4660#endif 4661 const unsigned char *q, *e; 4662 int bo = 0; /* assume native ordering by default */ 4663 const char *errmsg = ""; 4664 /* Offsets from q for retrieving bytes in the right order. */ 4665#ifdef BYTEORDER_IS_LITTLE_ENDIAN 4666 int iorder[] = {0, 1, 2, 3}; 4667#else 4668 int iorder[] = {3, 2, 1, 0}; 4669#endif 4670 PyObject *errorHandler = NULL; 4671 PyObject *exc = NULL; 4672 4673 q = (unsigned char *)s; 4674 e = q + size; 4675 4676 if (byteorder) 4677 bo = *byteorder; 4678 4679 /* Check for BOM marks (U+FEFF) in the input and adjust current 4680 byte order setting accordingly. In native mode, the leading BOM 4681 mark is skipped, in all other modes, it is copied to the output 4682 stream as-is (giving a ZWNBSP character). */ 4683 if (bo == 0) { 4684 if (size >= 4) { 4685 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) | 4686 (q[iorder[1]] << 8) | q[iorder[0]]; 4687#ifdef BYTEORDER_IS_LITTLE_ENDIAN 4688 if (bom == 0x0000FEFF) { 4689 q += 4; 4690 bo = -1; 4691 } 4692 else if (bom == 0xFFFE0000) { 4693 q += 4; 4694 bo = 1; 4695 } 4696#else 4697 if (bom == 0x0000FEFF) { 4698 q += 4; 4699 bo = 1; 4700 } 4701 else if (bom == 0xFFFE0000) { 4702 q += 4; 4703 bo = -1; 4704 } 4705#endif 4706 } 4707 } 4708 4709 if (bo == -1) { 4710 /* force LE */ 4711 iorder[0] = 0; 4712 iorder[1] = 1; 4713 iorder[2] = 2; 4714 iorder[3] = 3; 4715 } 4716 else if (bo == 1) { 4717 /* force BE */ 4718 iorder[0] = 3; 4719 iorder[1] = 2; 4720 iorder[2] = 1; 4721 iorder[3] = 0; 4722 } 4723 4724 /* On narrow builds we split characters outside the BMP into two 4725 codepoints => count how much extra space we need. */ 4726#ifndef Py_UNICODE_WIDE 4727 for (qq = q; qq < e; qq += 4) 4728 if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0) 4729 pairs++; 4730#endif 4731 4732 /* This might be one to much, because of a BOM */ 4733 unicode = _PyUnicode_New((size+3)/4+pairs); 4734 if (!unicode) 4735 return NULL; 4736 if (size == 0) 4737 return (PyObject *)unicode; 4738 4739 /* Unpack UTF-32 encoded data */ 4740 p = PyUnicode_AS_UNICODE(unicode); 4741 4742 while (q < e) { 4743 Py_UCS4 ch; 4744 /* remaining bytes at the end? (size should be divisible by 4) */ 4745 if (e-q<4) { 4746 if (consumed) 4747 break; 4748 errmsg = "truncated data"; 4749 startinpos = ((const char *)q)-starts; 4750 endinpos = ((const char *)e)-starts; 4751 goto utf32Error; 4752 /* The remaining input chars are ignored if the callback 4753 chooses to skip the input */ 4754 } 4755 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) | 4756 (q[iorder[1]] << 8) | q[iorder[0]]; 4757 4758 if (ch >= 0x110000) 4759 { 4760 errmsg = "codepoint not in range(0x110000)"; 4761 startinpos = ((const char *)q)-starts; 4762 endinpos = startinpos+4; 4763 goto utf32Error; 4764 } 4765#ifndef Py_UNICODE_WIDE 4766 if (ch >= 0x10000) 4767 { 4768 *p++ = 0xD800 | ((ch-0x10000) >> 10); 4769 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF); 4770 } 4771 else 4772#endif 4773 *p++ = ch; 4774 q += 4; 4775 continue; 4776 utf32Error: 4777 outpos = p-PyUnicode_AS_UNICODE(unicode); 4778 if (unicode_decode_call_errorhandler( 4779 errors, &errorHandler, 4780 "utf32", errmsg, 4781 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q, 4782 &unicode, &outpos, &p)) 4783 goto onError; 4784 } 4785 4786 if (byteorder) 4787 *byteorder = bo; 4788 4789 if (consumed) 4790 *consumed = (const char *)q-starts; 4791 4792 /* Adjust length */ 4793 if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0) 4794 goto onError; 4795 4796 Py_XDECREF(errorHandler); 4797 Py_XDECREF(exc); 4798#ifndef DONT_MAKE_RESULT_READY 4799 if (_PyUnicode_READY_REPLACE(&unicode)) { 4800 Py_DECREF(unicode); 4801 return NULL; 4802 } 4803#endif 4804 return (PyObject *)unicode; 4805 4806 onError: 4807 Py_DECREF(unicode); 4808 Py_XDECREF(errorHandler); 4809 Py_XDECREF(exc); 4810 return NULL; 4811} 4812 4813PyObject * 4814PyUnicode_EncodeUTF32(const Py_UNICODE *s, 4815 Py_ssize_t size, 4816 const char *errors, 4817 int byteorder) 4818{ 4819 PyObject *v; 4820 unsigned char *p; 4821 Py_ssize_t nsize, bytesize; 4822#ifndef Py_UNICODE_WIDE 4823 Py_ssize_t i, pairs; 4824#else 4825 const int pairs = 0; 4826#endif 4827 /* Offsets from p for storing byte pairs in the right order. */ 4828#ifdef BYTEORDER_IS_LITTLE_ENDIAN 4829 int iorder[] = {0, 1, 2, 3}; 4830#else 4831 int iorder[] = {3, 2, 1, 0}; 4832#endif 4833 4834#define STORECHAR(CH) \ 4835 do { \ 4836 p[iorder[3]] = ((CH) >> 24) & 0xff; \ 4837 p[iorder[2]] = ((CH) >> 16) & 0xff; \ 4838 p[iorder[1]] = ((CH) >> 8) & 0xff; \ 4839 p[iorder[0]] = (CH) & 0xff; \ 4840 p += 4; \ 4841 } while(0) 4842 4843 /* In narrow builds we can output surrogate pairs as one codepoint, 4844 so we need less space. */ 4845#ifndef Py_UNICODE_WIDE 4846 for (i = pairs = 0; i < size-1; i++) 4847 if (0xD800 <= s[i] && s[i] <= 0xDBFF && 4848 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF) 4849 pairs++; 4850#endif 4851 nsize = (size - pairs + (byteorder == 0)); 4852 bytesize = nsize * 4; 4853 if (bytesize / 4 != nsize) 4854 return PyErr_NoMemory(); 4855 v = PyBytes_FromStringAndSize(NULL, bytesize); 4856 if (v == NULL) 4857 return NULL; 4858 4859 p = (unsigned char *)PyBytes_AS_STRING(v); 4860 if (byteorder == 0) 4861 STORECHAR(0xFEFF); 4862 if (size == 0) 4863 goto done; 4864 4865 if (byteorder == -1) { 4866 /* force LE */ 4867 iorder[0] = 0; 4868 iorder[1] = 1; 4869 iorder[2] = 2; 4870 iorder[3] = 3; 4871 } 4872 else if (byteorder == 1) { 4873 /* force BE */ 4874 iorder[0] = 3; 4875 iorder[1] = 2; 4876 iorder[2] = 1; 4877 iorder[3] = 0; 4878 } 4879 4880 while (size-- > 0) { 4881 Py_UCS4 ch = *s++; 4882#ifndef Py_UNICODE_WIDE 4883 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) { 4884 Py_UCS4 ch2 = *s; 4885 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) { 4886 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000; 4887 s++; 4888 size--; 4889 } 4890 } 4891#endif 4892 STORECHAR(ch); 4893 } 4894 4895 done: 4896 return v; 4897#undef STORECHAR 4898} 4899 4900PyObject * 4901PyUnicode_AsUTF32String(PyObject *unicode) 4902{ 4903 if (!PyUnicode_Check(unicode)) { 4904 PyErr_BadArgument(); 4905 return NULL; 4906 } 4907 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode), 4908 PyUnicode_GET_SIZE(unicode), 4909 NULL, 4910 0); 4911} 4912 4913/* --- UTF-16 Codec ------------------------------------------------------- */ 4914 4915PyObject * 4916PyUnicode_DecodeUTF16(const char *s, 4917 Py_ssize_t size, 4918 const char *errors, 4919 int *byteorder) 4920{ 4921 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL); 4922} 4923 4924/* Two masks for fast checking of whether a C 'long' may contain 4925 UTF16-encoded surrogate characters. This is an efficient heuristic, 4926 assuming that non-surrogate characters with a code point >= 0x8000 are 4927 rare in most input. 4928 FAST_CHAR_MASK is used when the input is in native byte ordering, 4929 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering. 4930*/ 4931#if (SIZEOF_LONG == 8) 4932# define FAST_CHAR_MASK 0x8000800080008000L 4933# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L 4934#elif (SIZEOF_LONG == 4) 4935# define FAST_CHAR_MASK 0x80008000L 4936# define SWAPPED_FAST_CHAR_MASK 0x00800080L 4937#else 4938# error C 'long' size should be either 4 or 8! 4939#endif 4940 4941PyObject * 4942PyUnicode_DecodeUTF16Stateful(const char *s, 4943 Py_ssize_t size, 4944 const char *errors, 4945 int *byteorder, 4946 Py_ssize_t *consumed) 4947{ 4948 const char *starts = s; 4949 Py_ssize_t startinpos; 4950 Py_ssize_t endinpos; 4951 Py_ssize_t outpos; 4952 PyUnicodeObject *unicode; 4953 Py_UNICODE *p; 4954 const unsigned char *q, *e, *aligned_end; 4955 int bo = 0; /* assume native ordering by default */ 4956 int native_ordering = 0; 4957 const char *errmsg = ""; 4958 /* Offsets from q for retrieving byte pairs in the right order. */ 4959#ifdef BYTEORDER_IS_LITTLE_ENDIAN 4960 int ihi = 1, ilo = 0; 4961#else 4962 int ihi = 0, ilo = 1; 4963#endif 4964 PyObject *errorHandler = NULL; 4965 PyObject *exc = NULL; 4966 4967 /* Note: size will always be longer than the resulting Unicode 4968 character count */ 4969 unicode = _PyUnicode_New(size); 4970 if (!unicode) 4971 return NULL; 4972 if (size == 0) 4973 return (PyObject *)unicode; 4974 4975 /* Unpack UTF-16 encoded data */ 4976 p = PyUnicode_AS_UNICODE(unicode); 4977 q = (unsigned char *)s; 4978 e = q + size - 1; 4979 4980 if (byteorder) 4981 bo = *byteorder; 4982 4983 /* Check for BOM marks (U+FEFF) in the input and adjust current 4984 byte order setting accordingly. In native mode, the leading BOM 4985 mark is skipped, in all other modes, it is copied to the output 4986 stream as-is (giving a ZWNBSP character). */ 4987 if (bo == 0) { 4988 if (size >= 2) { 4989 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo]; 4990#ifdef BYTEORDER_IS_LITTLE_ENDIAN 4991 if (bom == 0xFEFF) { 4992 q += 2; 4993 bo = -1; 4994 } 4995 else if (bom == 0xFFFE) { 4996 q += 2; 4997 bo = 1; 4998 } 4999#else 5000 if (bom == 0xFEFF) { 5001 q += 2; 5002 bo = 1; 5003 } 5004 else if (bom == 0xFFFE) { 5005 q += 2; 5006 bo = -1; 5007 } 5008#endif 5009 } 5010 } 5011 5012 if (bo == -1) { 5013 /* force LE */ 5014 ihi = 1; 5015 ilo = 0; 5016 } 5017 else if (bo == 1) { 5018 /* force BE */ 5019 ihi = 0; 5020 ilo = 1; 5021 } 5022#ifdef BYTEORDER_IS_LITTLE_ENDIAN 5023 native_ordering = ilo < ihi; 5024#else 5025 native_ordering = ilo > ihi; 5026#endif 5027 5028 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK); 5029 while (q < e) { 5030 Py_UNICODE ch; 5031 /* First check for possible aligned read of a C 'long'. Unaligned 5032 reads are more expensive, better to defer to another iteration. */ 5033 if (!((size_t) q & LONG_PTR_MASK)) { 5034 /* Fast path for runs of non-surrogate chars. */ 5035 register const unsigned char *_q = q; 5036 Py_UNICODE *_p = p; 5037 if (native_ordering) { 5038 /* Native ordering is simple: as long as the input cannot 5039 possibly contain a surrogate char, do an unrolled copy 5040 of several 16-bit code points to the target object. 5041 The non-surrogate check is done on several input bytes 5042 at a time (as many as a C 'long' can contain). */ 5043 while (_q < aligned_end) { 5044 unsigned long data = * (unsigned long *) _q; 5045 if (data & FAST_CHAR_MASK) 5046 break; 5047 _p[0] = ((unsigned short *) _q)[0]; 5048 _p[1] = ((unsigned short *) _q)[1]; 5049#if (SIZEOF_LONG == 8) 5050 _p[2] = ((unsigned short *) _q)[2]; 5051 _p[3] = ((unsigned short *) _q)[3]; 5052#endif 5053 _q += SIZEOF_LONG; 5054 _p += SIZEOF_LONG / 2; 5055 } 5056 } 5057 else { 5058 /* Byteswapped ordering is similar, but we must decompose 5059 the copy bytewise, and take care of zero'ing out the 5060 upper bytes if the target object is in 32-bit units 5061 (that is, in UCS-4 builds). */ 5062 while (_q < aligned_end) { 5063 unsigned long data = * (unsigned long *) _q; 5064 if (data & SWAPPED_FAST_CHAR_MASK) 5065 break; 5066 /* Zero upper bytes in UCS-4 builds */ 5067#if (Py_UNICODE_SIZE > 2) 5068 _p[0] = 0; 5069 _p[1] = 0; 5070#if (SIZEOF_LONG == 8) 5071 _p[2] = 0; 5072 _p[3] = 0; 5073#endif 5074#endif 5075 /* Issue #4916; UCS-4 builds on big endian machines must 5076 fill the two last bytes of each 4-byte unit. */ 5077#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2) 5078# define OFF 2 5079#else 5080# define OFF 0 5081#endif 5082 ((unsigned char *) _p)[OFF + 1] = _q[0]; 5083 ((unsigned char *) _p)[OFF + 0] = _q[1]; 5084 ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2]; 5085 ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3]; 5086#if (SIZEOF_LONG == 8) 5087 ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4]; 5088 ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5]; 5089 ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6]; 5090 ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7]; 5091#endif 5092#undef OFF 5093 _q += SIZEOF_LONG; 5094 _p += SIZEOF_LONG / 2; 5095 } 5096 } 5097 p = _p; 5098 q = _q; 5099 if (q >= e) 5100 break; 5101 } 5102 ch = (q[ihi] << 8) | q[ilo]; 5103 5104 q += 2; 5105 5106 if (ch < 0xD800 || ch > 0xDFFF) { 5107 *p++ = ch; 5108 continue; 5109 } 5110 5111 /* UTF-16 code pair: */ 5112 if (q > e) { 5113 errmsg = "unexpected end of data"; 5114 startinpos = (((const char *)q) - 2) - starts; 5115 endinpos = ((const char *)e) + 1 - starts; 5116 goto utf16Error; 5117 } 5118 if (0xD800 <= ch && ch <= 0xDBFF) { 5119 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo]; 5120 q += 2; 5121 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) { 5122#ifndef Py_UNICODE_WIDE 5123 *p++ = ch; 5124 *p++ = ch2; 5125#else 5126 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000; 5127#endif 5128 continue; 5129 } 5130 else { 5131 errmsg = "illegal UTF-16 surrogate"; 5132 startinpos = (((const char *)q)-4)-starts; 5133 endinpos = startinpos+2; 5134 goto utf16Error; 5135 } 5136 5137 } 5138 errmsg = "illegal encoding"; 5139 startinpos = (((const char *)q)-2)-starts; 5140 endinpos = startinpos+2; 5141 /* Fall through to report the error */ 5142 5143 utf16Error: 5144 outpos = p - PyUnicode_AS_UNICODE(unicode); 5145 if (unicode_decode_call_errorhandler( 5146 errors, 5147 &errorHandler, 5148 "utf16", errmsg, 5149 &starts, 5150 (const char **)&e, 5151 &startinpos, 5152 &endinpos, 5153 &exc, 5154 (const char **)&q, 5155 &unicode, 5156 &outpos, 5157 &p)) 5158 goto onError; 5159 } 5160 /* remaining byte at the end? (size should be even) */ 5161 if (e == q) { 5162 if (!consumed) { 5163 errmsg = "truncated data"; 5164 startinpos = ((const char *)q) - starts; 5165 endinpos = ((const char *)e) + 1 - starts; 5166 outpos = p - PyUnicode_AS_UNICODE(unicode); 5167 if (unicode_decode_call_errorhandler( 5168 errors, 5169 &errorHandler, 5170 "utf16", errmsg, 5171 &starts, 5172 (const char **)&e, 5173 &startinpos, 5174 &endinpos, 5175 &exc, 5176 (const char **)&q, 5177 &unicode, 5178 &outpos, 5179 &p)) 5180 goto onError; 5181 /* The remaining input chars are ignored if the callback 5182 chooses to skip the input */ 5183 } 5184 } 5185 5186 if (byteorder) 5187 *byteorder = bo; 5188 5189 if (consumed) 5190 *consumed = (const char *)q-starts; 5191 5192 /* Adjust length */ 5193 if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0) 5194 goto onError; 5195 5196 Py_XDECREF(errorHandler); 5197 Py_XDECREF(exc); 5198#ifndef DONT_MAKE_RESULT_READY 5199 if (_PyUnicode_READY_REPLACE(&unicode)) { 5200 Py_DECREF(unicode); 5201 return NULL; 5202 } 5203#endif 5204 return (PyObject *)unicode; 5205 5206 onError: 5207 Py_DECREF(unicode); 5208 Py_XDECREF(errorHandler); 5209 Py_XDECREF(exc); 5210 return NULL; 5211} 5212 5213#undef FAST_CHAR_MASK 5214#undef SWAPPED_FAST_CHAR_MASK 5215 5216PyObject * 5217PyUnicode_EncodeUTF16(const Py_UNICODE *s, 5218 Py_ssize_t size, 5219 const char *errors, 5220 int byteorder) 5221{ 5222 PyObject *v; 5223 unsigned char *p; 5224 Py_ssize_t nsize, bytesize; 5225#ifdef Py_UNICODE_WIDE 5226 Py_ssize_t i, pairs; 5227#else 5228 const int pairs = 0; 5229#endif 5230 /* Offsets from p for storing byte pairs in the right order. */ 5231#ifdef BYTEORDER_IS_LITTLE_ENDIAN 5232 int ihi = 1, ilo = 0; 5233#else 5234 int ihi = 0, ilo = 1; 5235#endif 5236 5237#define STORECHAR(CH) \ 5238 do { \ 5239 p[ihi] = ((CH) >> 8) & 0xff; \ 5240 p[ilo] = (CH) & 0xff; \ 5241 p += 2; \ 5242 } while(0) 5243 5244#ifdef Py_UNICODE_WIDE 5245 for (i = pairs = 0; i < size; i++) 5246 if (s[i] >= 0x10000) 5247 pairs++; 5248#endif 5249 /* 2 * (size + pairs + (byteorder == 0)) */ 5250 if (size > PY_SSIZE_T_MAX || 5251 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0)) 5252 return PyErr_NoMemory(); 5253 nsize = size + pairs + (byteorder == 0); 5254 bytesize = nsize * 2; 5255 if (bytesize / 2 != nsize) 5256 return PyErr_NoMemory(); 5257 v = PyBytes_FromStringAndSize(NULL, bytesize); 5258 if (v == NULL) 5259 return NULL; 5260 5261 p = (unsigned char *)PyBytes_AS_STRING(v); 5262 if (byteorder == 0) 5263 STORECHAR(0xFEFF); 5264 if (size == 0) 5265 goto done; 5266 5267 if (byteorder == -1) { 5268 /* force LE */ 5269 ihi = 1; 5270 ilo = 0; 5271 } 5272 else if (byteorder == 1) { 5273 /* force BE */ 5274 ihi = 0; 5275 ilo = 1; 5276 } 5277 5278 while (size-- > 0) { 5279 Py_UNICODE ch = *s++; 5280 Py_UNICODE ch2 = 0; 5281#ifdef Py_UNICODE_WIDE 5282 if (ch >= 0x10000) { 5283 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF); 5284 ch = 0xD800 | ((ch-0x10000) >> 10); 5285 } 5286#endif 5287 STORECHAR(ch); 5288 if (ch2) 5289 STORECHAR(ch2); 5290 } 5291 5292 done: 5293 return v; 5294#undef STORECHAR 5295} 5296 5297PyObject * 5298PyUnicode_AsUTF16String(PyObject *unicode) 5299{ 5300 if (!PyUnicode_Check(unicode)) { 5301 PyErr_BadArgument(); 5302 return NULL; 5303 } 5304 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode), 5305 PyUnicode_GET_SIZE(unicode), 5306 NULL, 5307 0); 5308} 5309 5310/* --- Unicode Escape Codec ----------------------------------------------- */ 5311 5312/* Helper function for PyUnicode_DecodeUnicodeEscape, determines 5313 if all the escapes in the string make it still a valid ASCII string. 5314 Returns -1 if any escapes were found which cause the string to 5315 pop out of ASCII range. Otherwise returns the length of the 5316 required buffer to hold the string. 5317 */ 5318Py_ssize_t 5319length_of_escaped_ascii_string(const char *s, Py_ssize_t size) 5320{ 5321 const unsigned char *p = (const unsigned char *)s; 5322 const unsigned char *end = p + size; 5323 Py_ssize_t length = 0; 5324 5325 if (size < 0) 5326 return -1; 5327 5328 for (; p < end; ++p) { 5329 if (*p > 127) { 5330 /* Non-ASCII */ 5331 return -1; 5332 } 5333 else if (*p != '\\') { 5334 /* Normal character */ 5335 ++length; 5336 } 5337 else { 5338 /* Backslash-escape, check next char */ 5339 ++p; 5340 /* Escape sequence reaches till end of string or 5341 non-ASCII follow-up. */ 5342 if (p >= end || *p > 127) 5343 return -1; 5344 switch (*p) { 5345 case '\n': 5346 /* backslash + \n result in zero characters */ 5347 break; 5348 case '\\': case '\'': case '\"': 5349 case 'b': case 'f': case 't': 5350 case 'n': case 'r': case 'v': case 'a': 5351 ++length; 5352 break; 5353 case '0': case '1': case '2': case '3': 5354 case '4': case '5': case '6': case '7': 5355 case 'x': case 'u': case 'U': case 'N': 5356 /* these do not guarantee ASCII characters */ 5357 return -1; 5358 default: 5359 /* count the backslash + the other character */ 5360 length += 2; 5361 } 5362 } 5363 } 5364 return length; 5365} 5366 5367/* Similar to PyUnicode_WRITE but either write into wstr field 5368 or treat string as ASCII. */ 5369#define WRITE_ASCII_OR_WSTR(kind, buf, index, value) \ 5370 do { \ 5371 if ((kind) != PyUnicode_WCHAR_KIND) \ 5372 ((unsigned char *)(buf))[(index)] = (unsigned char)(value); \ 5373 else \ 5374 ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value); \ 5375 } while (0) 5376 5377#define WRITE_WSTR(buf, index, value) \ 5378 assert(kind == PyUnicode_WCHAR_KIND), \ 5379 ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value) 5380 5381 5382static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL; 5383 5384PyObject * 5385PyUnicode_DecodeUnicodeEscape(const char *s, 5386 Py_ssize_t size, 5387 const char *errors) 5388{ 5389 const char *starts = s; 5390 Py_ssize_t startinpos; 5391 Py_ssize_t endinpos; 5392 int j; 5393 PyUnicodeObject *v; 5394 Py_UNICODE *p; 5395 const char *end; 5396 char* message; 5397 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */ 5398 PyObject *errorHandler = NULL; 5399 PyObject *exc = NULL; 5400 Py_ssize_t ascii_length; 5401 Py_ssize_t i; 5402 int kind; 5403 void *data; 5404 5405 ascii_length = length_of_escaped_ascii_string(s, size); 5406 5407 /* After length_of_escaped_ascii_string() there are two alternatives, 5408 either the string is pure ASCII with named escapes like \n, etc. 5409 and we determined it's exact size (common case) 5410 or it contains \x, \u, ... escape sequences. then we create a 5411 legacy wchar string and resize it at the end of this function. */ 5412 if (ascii_length >= 0) { 5413 v = (PyUnicodeObject *)PyUnicode_New(ascii_length, 127); 5414 if (!v) 5415 goto onError; 5416 assert(PyUnicode_KIND(v) == PyUnicode_1BYTE_KIND); 5417 kind = PyUnicode_1BYTE_KIND; 5418 data = PyUnicode_DATA(v); 5419 } 5420 else { 5421 /* Escaped strings will always be longer than the resulting 5422 Unicode string, so we start with size here and then reduce the 5423 length after conversion to the true value. 5424 (but if the error callback returns a long replacement string 5425 we'll have to allocate more space) */ 5426 v = _PyUnicode_New(size); 5427 if (!v) 5428 goto onError; 5429 kind = PyUnicode_WCHAR_KIND; 5430 data = PyUnicode_AS_UNICODE(v); 5431 } 5432 5433 if (size == 0) 5434 return (PyObject *)v; 5435 i = 0; 5436 end = s + size; 5437 5438 while (s < end) { 5439 unsigned char c; 5440 Py_UNICODE x; 5441 int digits; 5442 5443 if (kind == PyUnicode_WCHAR_KIND) { 5444 assert(i < _PyUnicode_WSTR_LENGTH(v)); 5445 } 5446 else { 5447 /* The only case in which i == ascii_length is a backslash 5448 followed by a newline. */ 5449 assert(i <= ascii_length); 5450 } 5451 5452 /* Non-escape characters are interpreted as Unicode ordinals */ 5453 if (*s != '\\') { 5454 WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char) *s++); 5455 continue; 5456 } 5457 5458 startinpos = s-starts; 5459 /* \ - Escapes */ 5460 s++; 5461 c = *s++; 5462 if (s > end) 5463 c = '\0'; /* Invalid after \ */ 5464 5465 if (kind == PyUnicode_WCHAR_KIND) { 5466 assert(i < _PyUnicode_WSTR_LENGTH(v)); 5467 } 5468 else { 5469 /* The only case in which i == ascii_length is a backslash 5470 followed by a newline. */ 5471 assert(i < ascii_length || (i == ascii_length && c == '\n')); 5472 } 5473 5474 switch (c) { 5475 5476 /* \x escapes */ 5477 case '\n': break; 5478 case '\\': WRITE_ASCII_OR_WSTR(kind, data, i++, '\\'); break; 5479 case '\'': WRITE_ASCII_OR_WSTR(kind, data, i++, '\''); break; 5480 case '\"': WRITE_ASCII_OR_WSTR(kind, data, i++, '\"'); break; 5481 case 'b': WRITE_ASCII_OR_WSTR(kind, data, i++, '\b'); break; 5482 /* FF */ 5483 case 'f': WRITE_ASCII_OR_WSTR(kind, data, i++, '\014'); break; 5484 case 't': WRITE_ASCII_OR_WSTR(kind, data, i++, '\t'); break; 5485 case 'n': WRITE_ASCII_OR_WSTR(kind, data, i++, '\n'); break; 5486 case 'r': WRITE_ASCII_OR_WSTR(kind, data, i++, '\r'); break; 5487 /* VT */ 5488 case 'v': WRITE_ASCII_OR_WSTR(kind, data, i++, '\013'); break; 5489 /* BEL, not classic C */ 5490 case 'a': WRITE_ASCII_OR_WSTR(kind, data, i++, '\007'); break; 5491 5492 /* \OOO (octal) escapes */ 5493 case '0': case '1': case '2': case '3': 5494 case '4': case '5': case '6': case '7': 5495 x = s[-1] - '0'; 5496 if (s < end && '0' <= *s && *s <= '7') { 5497 x = (x<<3) + *s++ - '0'; 5498 if (s < end && '0' <= *s && *s <= '7') 5499 x = (x<<3) + *s++ - '0'; 5500 } 5501 WRITE_WSTR(data, i++, x); 5502 break; 5503 5504 /* hex escapes */ 5505 /* \xXX */ 5506 case 'x': 5507 digits = 2; 5508 message = "truncated \\xXX escape"; 5509 goto hexescape; 5510 5511 /* \uXXXX */ 5512 case 'u': 5513 digits = 4; 5514 message = "truncated \\uXXXX escape"; 5515 goto hexescape; 5516 5517 /* \UXXXXXXXX */ 5518 case 'U': 5519 digits = 8; 5520 message = "truncated \\UXXXXXXXX escape"; 5521 hexescape: 5522 chr = 0; 5523 p = PyUnicode_AS_UNICODE(v) + i; 5524 if (s+digits>end) { 5525 endinpos = size; 5526 if (unicode_decode_call_errorhandler( 5527 errors, &errorHandler, 5528 "unicodeescape", "end of string in escape sequence", 5529 &starts, &end, &startinpos, &endinpos, &exc, &s, 5530 &v, &i, &p)) 5531 goto onError; 5532 data = PyUnicode_AS_UNICODE(v); 5533 goto nextByte; 5534 } 5535 for (j = 0; j < digits; ++j) { 5536 c = (unsigned char) s[j]; 5537 if (!Py_ISXDIGIT(c)) { 5538 endinpos = (s+j+1)-starts; 5539 p = PyUnicode_AS_UNICODE(v) + i; 5540 if (unicode_decode_call_errorhandler( 5541 errors, &errorHandler, 5542 "unicodeescape", message, 5543 &starts, &end, &startinpos, &endinpos, &exc, &s, 5544 &v, &i, &p)) 5545 goto onError; 5546 data = PyUnicode_AS_UNICODE(v); 5547 goto nextByte; 5548 } 5549 chr = (chr<<4) & ~0xF; 5550 if (c >= '0' && c <= '9') 5551 chr += c - '0'; 5552 else if (c >= 'a' && c <= 'f') 5553 chr += 10 + c - 'a'; 5554 else 5555 chr += 10 + c - 'A'; 5556 } 5557 s += j; 5558 if (chr == 0xffffffff && PyErr_Occurred()) 5559 /* _decoding_error will have already written into the 5560 target buffer. */ 5561 break; 5562 store: 5563 /* when we get here, chr is a 32-bit unicode character */ 5564 if (chr <= 0xffff) 5565 /* UCS-2 character */ 5566 WRITE_WSTR(data, i++, chr); 5567 else if (chr <= 0x10ffff) { 5568 /* UCS-4 character. Either store directly, or as 5569 surrogate pair. */ 5570#ifdef Py_UNICODE_WIDE 5571 WRITE_WSTR(data, i++, chr); 5572#else 5573 chr -= 0x10000L; 5574 WRITE_WSTR(data, i++, 0xD800 + (Py_UNICODE) (chr >> 10)); 5575 WRITE_WSTR(data, i++, 0xDC00 + (Py_UNICODE) (chr & 0x03FF)); 5576#endif 5577 } else { 5578 endinpos = s-starts; 5579 p = PyUnicode_AS_UNICODE(v) + i; 5580 if (unicode_decode_call_errorhandler( 5581 errors, &errorHandler, 5582 "unicodeescape", "illegal Unicode character", 5583 &starts, &end, &startinpos, &endinpos, &exc, &s, 5584 &v, &i, &p)) 5585 goto onError; 5586 data = PyUnicode_AS_UNICODE(v); 5587 } 5588 break; 5589 5590 /* \N{name} */ 5591 case 'N': 5592 message = "malformed \\N character escape"; 5593 if (ucnhash_CAPI == NULL) { 5594 /* load the unicode data module */ 5595 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import( 5596 PyUnicodeData_CAPSULE_NAME, 1); 5597 if (ucnhash_CAPI == NULL) 5598 goto ucnhashError; 5599 } 5600 if (*s == '{') { 5601 const char *start = s+1; 5602 /* look for the closing brace */ 5603 while (*s != '}' && s < end) 5604 s++; 5605 if (s > start && s < end && *s == '}') { 5606 /* found a name. look it up in the unicode database */ 5607 message = "unknown Unicode character name"; 5608 s++; 5609 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), 5610 &chr)) 5611 goto store; 5612 } 5613 } 5614 endinpos = s-starts; 5615 p = PyUnicode_AS_UNICODE(v) + i; 5616 if (unicode_decode_call_errorhandler( 5617 errors, &errorHandler, 5618 "unicodeescape", message, 5619 &starts, &end, &startinpos, &endinpos, &exc, &s, 5620 &v, &i, &p)) 5621 goto onError; 5622 data = PyUnicode_AS_UNICODE(v); 5623 break; 5624 5625 default: 5626 if (s > end) { 5627 assert(kind == PyUnicode_WCHAR_KIND); 5628 message = "\\ at end of string"; 5629 s--; 5630 endinpos = s-starts; 5631 p = PyUnicode_AS_UNICODE(v) + i; 5632 if (unicode_decode_call_errorhandler( 5633 errors, &errorHandler, 5634 "unicodeescape", message, 5635 &starts, &end, &startinpos, &endinpos, &exc, &s, 5636 &v, &i, &p)) 5637 goto onError; 5638 data = PyUnicode_AS_UNICODE(v); 5639 } 5640 else { 5641 WRITE_ASCII_OR_WSTR(kind, data, i++, '\\'); 5642 WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char)s[-1]); 5643 } 5644 break; 5645 } 5646 nextByte: 5647 ; 5648 } 5649 /* Ensure the length prediction worked in case of ASCII strings */ 5650 assert(kind == PyUnicode_WCHAR_KIND || i == ascii_length); 5651 5652 if (kind == PyUnicode_WCHAR_KIND) 5653 { 5654 if (PyUnicode_Resize((PyObject**)&v, i) < 0) 5655 goto onError; 5656 } 5657 Py_XDECREF(errorHandler); 5658 Py_XDECREF(exc); 5659#ifndef DONT_MAKE_RESULT_READY 5660 if (_PyUnicode_READY_REPLACE(&v)) { 5661 Py_DECREF(v); 5662 return NULL; 5663 } 5664#endif 5665 return (PyObject *)v; 5666 5667 ucnhashError: 5668 PyErr_SetString( 5669 PyExc_UnicodeError, 5670 "\\N escapes not supported (can't load unicodedata module)" 5671 ); 5672 Py_XDECREF(v); 5673 Py_XDECREF(errorHandler); 5674 Py_XDECREF(exc); 5675 return NULL; 5676 5677 onError: 5678 Py_XDECREF(v); 5679 Py_XDECREF(errorHandler); 5680 Py_XDECREF(exc); 5681 return NULL; 5682} 5683 5684#undef WRITE_ASCII_OR_WSTR 5685#undef WRITE_WSTR 5686 5687/* Return a Unicode-Escape string version of the Unicode object. 5688 5689 If quotes is true, the string is enclosed in u"" or u'' quotes as 5690 appropriate. 5691 5692*/ 5693 5694static const char *hexdigits = "0123456789abcdef"; 5695 5696PyObject * 5697PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s, 5698 Py_ssize_t size) 5699{ 5700 PyObject *repr; 5701 char *p; 5702 5703#ifdef Py_UNICODE_WIDE 5704 const Py_ssize_t expandsize = 10; 5705#else 5706 const Py_ssize_t expandsize = 6; 5707#endif 5708 5709 /* XXX(nnorwitz): rather than over-allocating, it would be 5710 better to choose a different scheme. Perhaps scan the 5711 first N-chars of the string and allocate based on that size. 5712 */ 5713 /* Initial allocation is based on the longest-possible unichr 5714 escape. 5715 5716 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source 5717 unichr, so in this case it's the longest unichr escape. In 5718 narrow (UTF-16) builds this is five chars per source unichr 5719 since there are two unichrs in the surrogate pair, so in narrow 5720 (UTF-16) builds it's not the longest unichr escape. 5721 5722 In wide or narrow builds '\uxxxx' is 6 chars per source unichr, 5723 so in the narrow (UTF-16) build case it's the longest unichr 5724 escape. 5725 */ 5726 5727 if (size == 0) 5728 return PyBytes_FromStringAndSize(NULL, 0); 5729 5730 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize) 5731 return PyErr_NoMemory(); 5732 5733 repr = PyBytes_FromStringAndSize(NULL, 5734 2 5735 + expandsize*size 5736 + 1); 5737 if (repr == NULL) 5738 return NULL; 5739 5740 p = PyBytes_AS_STRING(repr); 5741 5742 while (size-- > 0) { 5743 Py_UNICODE ch = *s++; 5744 5745 /* Escape backslashes */ 5746 if (ch == '\\') { 5747 *p++ = '\\'; 5748 *p++ = (char) ch; 5749 continue; 5750 } 5751 5752#ifdef Py_UNICODE_WIDE 5753 /* Map 21-bit characters to '\U00xxxxxx' */ 5754 else if (ch >= 0x10000) { 5755 *p++ = '\\'; 5756 *p++ = 'U'; 5757 *p++ = hexdigits[(ch >> 28) & 0x0000000F]; 5758 *p++ = hexdigits[(ch >> 24) & 0x0000000F]; 5759 *p++ = hexdigits[(ch >> 20) & 0x0000000F]; 5760 *p++ = hexdigits[(ch >> 16) & 0x0000000F]; 5761 *p++ = hexdigits[(ch >> 12) & 0x0000000F]; 5762 *p++ = hexdigits[(ch >> 8) & 0x0000000F]; 5763 *p++ = hexdigits[(ch >> 4) & 0x0000000F]; 5764 *p++ = hexdigits[ch & 0x0000000F]; 5765 continue; 5766 } 5767#else 5768 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */ 5769 else if (ch >= 0xD800 && ch < 0xDC00) { 5770 Py_UNICODE ch2; 5771 Py_UCS4 ucs; 5772 5773 ch2 = *s++; 5774 size--; 5775 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) { 5776 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000; 5777 *p++ = '\\'; 5778 *p++ = 'U'; 5779 *p++ = hexdigits[(ucs >> 28) & 0x0000000F]; 5780 *p++ = hexdigits[(ucs >> 24) & 0x0000000F]; 5781 *p++ = hexdigits[(ucs >> 20) & 0x0000000F]; 5782 *p++ = hexdigits[(ucs >> 16) & 0x0000000F]; 5783 *p++ = hexdigits[(ucs >> 12) & 0x0000000F]; 5784 *p++ = hexdigits[(ucs >> 8) & 0x0000000F]; 5785 *p++ = hexdigits[(ucs >> 4) & 0x0000000F]; 5786 *p++ = hexdigits[ucs & 0x0000000F]; 5787 continue; 5788 } 5789 /* Fall through: isolated surrogates are copied as-is */ 5790 s--; 5791 size++; 5792 } 5793#endif 5794 5795 /* Map 16-bit characters to '\uxxxx' */ 5796 if (ch >= 256) { 5797 *p++ = '\\'; 5798 *p++ = 'u'; 5799 *p++ = hexdigits[(ch >> 12) & 0x000F]; 5800 *p++ = hexdigits[(ch >> 8) & 0x000F]; 5801 *p++ = hexdigits[(ch >> 4) & 0x000F]; 5802 *p++ = hexdigits[ch & 0x000F]; 5803 } 5804 5805 /* Map special whitespace to '\t', \n', '\r' */ 5806 else if (ch == '\t') { 5807 *p++ = '\\'; 5808 *p++ = 't'; 5809 } 5810 else if (ch == '\n') { 5811 *p++ = '\\'; 5812 *p++ = 'n'; 5813 } 5814 else if (ch == '\r') { 5815 *p++ = '\\'; 5816 *p++ = 'r'; 5817 } 5818 5819 /* Map non-printable US ASCII to '\xhh' */ 5820 else if (ch < ' ' || ch >= 0x7F) { 5821 *p++ = '\\'; 5822 *p++ = 'x'; 5823 *p++ = hexdigits[(ch >> 4) & 0x000F]; 5824 *p++ = hexdigits[ch & 0x000F]; 5825 } 5826 5827 /* Copy everything else as-is */ 5828 else 5829 *p++ = (char) ch; 5830 } 5831 5832 assert(p - PyBytes_AS_STRING(repr) > 0); 5833 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) 5834 return NULL; 5835 return repr; 5836} 5837 5838PyObject * 5839PyUnicode_AsUnicodeEscapeString(PyObject *unicode) 5840{ 5841 PyObject *s; 5842 if (!PyUnicode_Check(unicode)) { 5843 PyErr_BadArgument(); 5844 return NULL; 5845 } 5846 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode), 5847 PyUnicode_GET_SIZE(unicode)); 5848 return s; 5849} 5850 5851/* --- Raw Unicode Escape Codec ------------------------------------------- */ 5852 5853PyObject * 5854PyUnicode_DecodeRawUnicodeEscape(const char *s, 5855 Py_ssize_t size, 5856 const char *errors) 5857{ 5858 const char *starts = s; 5859 Py_ssize_t startinpos; 5860 Py_ssize_t endinpos; 5861 Py_ssize_t outpos; 5862 PyUnicodeObject *v; 5863 Py_UNICODE *p; 5864 const char *end; 5865 const char *bs; 5866 PyObject *errorHandler = NULL; 5867 PyObject *exc = NULL; 5868 5869 /* Escaped strings will always be longer than the resulting 5870 Unicode string, so we start with size here and then reduce the 5871 length after conversion to the true value. (But decoding error 5872 handler might have to resize the string) */ 5873 v = _PyUnicode_New(size); 5874 if (v == NULL) 5875 goto onError; 5876 if (size == 0) 5877 return (PyObject *)v; 5878 p = PyUnicode_AS_UNICODE(v); 5879 end = s + size; 5880 while (s < end) { 5881 unsigned char c; 5882 Py_UCS4 x; 5883 int i; 5884 int count; 5885 5886 /* Non-escape characters are interpreted as Unicode ordinals */ 5887 if (*s != '\\') { 5888 *p++ = (unsigned char)*s++; 5889 continue; 5890 } 5891 startinpos = s-starts; 5892 5893 /* \u-escapes are only interpreted iff the number of leading 5894 backslashes if odd */ 5895 bs = s; 5896 for (;s < end;) { 5897 if (*s != '\\') 5898 break; 5899 *p++ = (unsigned char)*s++; 5900 } 5901 if (((s - bs) & 1) == 0 || 5902 s >= end || 5903 (*s != 'u' && *s != 'U')) { 5904 continue; 5905 } 5906 p--; 5907 count = *s=='u' ? 4 : 8; 5908 s++; 5909 5910 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */ 5911 outpos = p-PyUnicode_AS_UNICODE(v); 5912 for (x = 0, i = 0; i < count; ++i, ++s) { 5913 c = (unsigned char)*s; 5914 if (!Py_ISXDIGIT(c)) { 5915 endinpos = s-starts; 5916 if (unicode_decode_call_errorhandler( 5917 errors, &errorHandler, 5918 "rawunicodeescape", "truncated \\uXXXX", 5919 &starts, &end, &startinpos, &endinpos, &exc, &s, 5920 &v, &outpos, &p)) 5921 goto onError; 5922 goto nextByte; 5923 } 5924 x = (x<<4) & ~0xF; 5925 if (c >= '0' && c <= '9') 5926 x += c - '0'; 5927 else if (c >= 'a' && c <= 'f') 5928 x += 10 + c - 'a'; 5929 else 5930 x += 10 + c - 'A'; 5931 } 5932 if (x <= 0xffff) 5933 /* UCS-2 character */ 5934 *p++ = (Py_UNICODE) x; 5935 else if (x <= 0x10ffff) { 5936 /* UCS-4 character. Either store directly, or as 5937 surrogate pair. */ 5938#ifdef Py_UNICODE_WIDE 5939 *p++ = (Py_UNICODE) x; 5940#else 5941 x -= 0x10000L; 5942 *p++ = 0xD800 + (Py_UNICODE) (x >> 10); 5943 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF); 5944#endif 5945 } else { 5946 endinpos = s-starts; 5947 outpos = p-PyUnicode_AS_UNICODE(v); 5948 if (unicode_decode_call_errorhandler( 5949 errors, &errorHandler, 5950 "rawunicodeescape", "\\Uxxxxxxxx out of range", 5951 &starts, &end, &startinpos, &endinpos, &exc, &s, 5952 &v, &outpos, &p)) 5953 goto onError; 5954 } 5955 nextByte: 5956 ; 5957 } 5958 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0) 5959 goto onError; 5960 Py_XDECREF(errorHandler); 5961 Py_XDECREF(exc); 5962#ifndef DONT_MAKE_RESULT_READY 5963 if (_PyUnicode_READY_REPLACE(&v)) { 5964 Py_DECREF(v); 5965 return NULL; 5966 } 5967#endif 5968 return (PyObject *)v; 5969 5970 onError: 5971 Py_XDECREF(v); 5972 Py_XDECREF(errorHandler); 5973 Py_XDECREF(exc); 5974 return NULL; 5975} 5976 5977PyObject * 5978PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s, 5979 Py_ssize_t size) 5980{ 5981 PyObject *repr; 5982 char *p; 5983 char *q; 5984 5985#ifdef Py_UNICODE_WIDE 5986 const Py_ssize_t expandsize = 10; 5987#else 5988 const Py_ssize_t expandsize = 6; 5989#endif 5990 5991 if (size > PY_SSIZE_T_MAX / expandsize) 5992 return PyErr_NoMemory(); 5993 5994 repr = PyBytes_FromStringAndSize(NULL, expandsize * size); 5995 if (repr == NULL) 5996 return NULL; 5997 if (size == 0) 5998 return repr; 5999 6000 p = q = PyBytes_AS_STRING(repr); 6001 while (size-- > 0) { 6002 Py_UNICODE ch = *s++; 6003#ifdef Py_UNICODE_WIDE 6004 /* Map 32-bit characters to '\Uxxxxxxxx' */ 6005 if (ch >= 0x10000) { 6006 *p++ = '\\'; 6007 *p++ = 'U'; 6008 *p++ = hexdigits[(ch >> 28) & 0xf]; 6009 *p++ = hexdigits[(ch >> 24) & 0xf]; 6010 *p++ = hexdigits[(ch >> 20) & 0xf]; 6011 *p++ = hexdigits[(ch >> 16) & 0xf]; 6012 *p++ = hexdigits[(ch >> 12) & 0xf]; 6013 *p++ = hexdigits[(ch >> 8) & 0xf]; 6014 *p++ = hexdigits[(ch >> 4) & 0xf]; 6015 *p++ = hexdigits[ch & 15]; 6016 } 6017 else 6018#else 6019 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */ 6020 if (ch >= 0xD800 && ch < 0xDC00) { 6021 Py_UNICODE ch2; 6022 Py_UCS4 ucs; 6023 6024 ch2 = *s++; 6025 size--; 6026 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) { 6027 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000; 6028 *p++ = '\\'; 6029 *p++ = 'U'; 6030 *p++ = hexdigits[(ucs >> 28) & 0xf]; 6031 *p++ = hexdigits[(ucs >> 24) & 0xf]; 6032 *p++ = hexdigits[(ucs >> 20) & 0xf]; 6033 *p++ = hexdigits[(ucs >> 16) & 0xf]; 6034 *p++ = hexdigits[(ucs >> 12) & 0xf]; 6035 *p++ = hexdigits[(ucs >> 8) & 0xf]; 6036 *p++ = hexdigits[(ucs >> 4) & 0xf]; 6037 *p++ = hexdigits[ucs & 0xf]; 6038 continue; 6039 } 6040 /* Fall through: isolated surrogates are copied as-is */ 6041 s--; 6042 size++; 6043 } 6044#endif 6045 /* Map 16-bit characters to '\uxxxx' */ 6046 if (ch >= 256) { 6047 *p++ = '\\'; 6048 *p++ = 'u'; 6049 *p++ = hexdigits[(ch >> 12) & 0xf]; 6050 *p++ = hexdigits[(ch >> 8) & 0xf]; 6051 *p++ = hexdigits[(ch >> 4) & 0xf]; 6052 *p++ = hexdigits[ch & 15]; 6053 } 6054 /* Copy everything else as-is */ 6055 else 6056 *p++ = (char) ch; 6057 } 6058 size = p - q; 6059 6060 assert(size > 0); 6061 if (_PyBytes_Resize(&repr, size) < 0) 6062 return NULL; 6063 return repr; 6064} 6065 6066PyObject * 6067PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode) 6068{ 6069 PyObject *s; 6070 if (!PyUnicode_Check(unicode)) { 6071 PyErr_BadArgument(); 6072 return NULL; 6073 } 6074 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode), 6075 PyUnicode_GET_SIZE(unicode)); 6076 6077 return s; 6078} 6079 6080/* --- Unicode Internal Codec ------------------------------------------- */ 6081 6082PyObject * 6083_PyUnicode_DecodeUnicodeInternal(const char *s, 6084 Py_ssize_t size, 6085 const char *errors) 6086{ 6087 const char *starts = s; 6088 Py_ssize_t startinpos; 6089 Py_ssize_t endinpos; 6090 Py_ssize_t outpos; 6091 PyUnicodeObject *v; 6092 Py_UNICODE *p; 6093 const char *end; 6094 const char *reason; 6095 PyObject *errorHandler = NULL; 6096 PyObject *exc = NULL; 6097 6098#ifdef Py_UNICODE_WIDE 6099 Py_UNICODE unimax = PyUnicode_GetMax(); 6100#endif 6101 6102 /* XXX overflow detection missing */ 6103 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE); 6104 if (v == NULL) 6105 goto onError; 6106 /* Intentionally PyUnicode_GET_SIZE instead of PyUnicode_GET_LENGTH 6107 as string was created with the old API. */ 6108 if (PyUnicode_GET_SIZE(v) == 0) 6109 return (PyObject *)v; 6110 p = PyUnicode_AS_UNICODE(v); 6111 end = s + size; 6112 6113 while (s < end) { 6114 memcpy(p, s, sizeof(Py_UNICODE)); 6115 /* We have to sanity check the raw data, otherwise doom looms for 6116 some malformed UCS-4 data. */ 6117 if ( 6118#ifdef Py_UNICODE_WIDE 6119 *p > unimax || *p < 0 || 6120#endif 6121 end-s < Py_UNICODE_SIZE 6122 ) 6123 { 6124 startinpos = s - starts; 6125 if (end-s < Py_UNICODE_SIZE) { 6126 endinpos = end-starts; 6127 reason = "truncated input"; 6128 } 6129 else { 6130 endinpos = s - starts + Py_UNICODE_SIZE; 6131 reason = "illegal code point (> 0x10FFFF)"; 6132 } 6133 outpos = p - PyUnicode_AS_UNICODE(v); 6134 if (unicode_decode_call_errorhandler( 6135 errors, &errorHandler, 6136 "unicode_internal", reason, 6137 &starts, &end, &startinpos, &endinpos, &exc, &s, 6138 &v, &outpos, &p)) { 6139 goto onError; 6140 } 6141 } 6142 else { 6143 p++; 6144 s += Py_UNICODE_SIZE; 6145 } 6146 } 6147 6148 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0) 6149 goto onError; 6150 Py_XDECREF(errorHandler); 6151 Py_XDECREF(exc); 6152#ifndef DONT_MAKE_RESULT_READY 6153 if (_PyUnicode_READY_REPLACE(&v)) { 6154 Py_DECREF(v); 6155 return NULL; 6156 } 6157#endif 6158 return (PyObject *)v; 6159 6160 onError: 6161 Py_XDECREF(v); 6162 Py_XDECREF(errorHandler); 6163 Py_XDECREF(exc); 6164 return NULL; 6165} 6166 6167/* --- Latin-1 Codec ------------------------------------------------------ */ 6168 6169PyObject * 6170PyUnicode_DecodeLatin1(const char *s, 6171 Py_ssize_t size, 6172 const char *errors) 6173{ 6174 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */ 6175 return _PyUnicode_FromUCS1((unsigned char*)s, size); 6176} 6177 6178/* create or adjust a UnicodeEncodeError */ 6179static void 6180make_encode_exception(PyObject **exceptionObject, 6181 const char *encoding, 6182 const Py_UNICODE *unicode, Py_ssize_t size, 6183 Py_ssize_t startpos, Py_ssize_t endpos, 6184 const char *reason) 6185{ 6186 if (*exceptionObject == NULL) { 6187 *exceptionObject = PyUnicodeEncodeError_Create( 6188 encoding, unicode, size, startpos, endpos, reason); 6189 } 6190 else { 6191 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos)) 6192 goto onError; 6193 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos)) 6194 goto onError; 6195 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason)) 6196 goto onError; 6197 return; 6198 onError: 6199 Py_DECREF(*exceptionObject); 6200 *exceptionObject = NULL; 6201 } 6202} 6203 6204/* raises a UnicodeEncodeError */ 6205static void 6206raise_encode_exception(PyObject **exceptionObject, 6207 const char *encoding, 6208 const Py_UNICODE *unicode, Py_ssize_t size, 6209 Py_ssize_t startpos, Py_ssize_t endpos, 6210 const char *reason) 6211{ 6212 make_encode_exception(exceptionObject, 6213 encoding, unicode, size, startpos, endpos, reason); 6214 if (*exceptionObject != NULL) 6215 PyCodec_StrictErrors(*exceptionObject); 6216} 6217 6218/* error handling callback helper: 6219 build arguments, call the callback and check the arguments, 6220 put the result into newpos and return the replacement string, which 6221 has to be freed by the caller */ 6222static PyObject * 6223unicode_encode_call_errorhandler(const char *errors, 6224 PyObject **errorHandler, 6225 const char *encoding, const char *reason, 6226 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject, 6227 Py_ssize_t startpos, Py_ssize_t endpos, 6228 Py_ssize_t *newpos) 6229{ 6230 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple"; 6231 6232 PyObject *restuple; 6233 PyObject *resunicode; 6234 6235 if (*errorHandler == NULL) { 6236 *errorHandler = PyCodec_LookupError(errors); 6237 if (*errorHandler == NULL) 6238 return NULL; 6239 } 6240 6241 make_encode_exception(exceptionObject, 6242 encoding, unicode, size, startpos, endpos, reason); 6243 if (*exceptionObject == NULL) 6244 return NULL; 6245 6246 restuple = PyObject_CallFunctionObjArgs( 6247 *errorHandler, *exceptionObject, NULL); 6248 if (restuple == NULL) 6249 return NULL; 6250 if (!PyTuple_Check(restuple)) { 6251 PyErr_SetString(PyExc_TypeError, &argparse[3]); 6252 Py_DECREF(restuple); 6253 return NULL; 6254 } 6255 if (!PyArg_ParseTuple(restuple, argparse, 6256 &resunicode, newpos)) { 6257 Py_DECREF(restuple); 6258 return NULL; 6259 } 6260 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) { 6261 PyErr_SetString(PyExc_TypeError, &argparse[3]); 6262 Py_DECREF(restuple); 6263 return NULL; 6264 } 6265 if (*newpos<0) 6266 *newpos = size+*newpos; 6267 if (*newpos<0 || *newpos>size) { 6268 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos); 6269 Py_DECREF(restuple); 6270 return NULL; 6271 } 6272 Py_INCREF(resunicode); 6273 Py_DECREF(restuple); 6274 return resunicode; 6275} 6276 6277static PyObject * 6278unicode_encode_ucs1(const Py_UNICODE *p, 6279 Py_ssize_t size, 6280 const char *errors, 6281 int limit) 6282{ 6283 /* output object */ 6284 PyObject *res; 6285 /* pointers to the beginning and end+1 of input */ 6286 const Py_UNICODE *startp = p; 6287 const Py_UNICODE *endp = p + size; 6288 /* pointer to the beginning of the unencodable characters */ 6289 /* const Py_UNICODE *badp = NULL; */ 6290 /* pointer into the output */ 6291 char *str; 6292 /* current output position */ 6293 Py_ssize_t ressize; 6294 const char *encoding = (limit == 256) ? "latin-1" : "ascii"; 6295 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)"; 6296 PyObject *errorHandler = NULL; 6297 PyObject *exc = NULL; 6298 /* the following variable is used for caching string comparisons 6299 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */ 6300 int known_errorHandler = -1; 6301 6302 /* allocate enough for a simple encoding without 6303 replacements, if we need more, we'll resize */ 6304 if (size == 0) 6305 return PyBytes_FromStringAndSize(NULL, 0); 6306 res = PyBytes_FromStringAndSize(NULL, size); 6307 if (res == NULL) 6308 return NULL; 6309 str = PyBytes_AS_STRING(res); 6310 ressize = size; 6311 6312 while (p<endp) { 6313 Py_UNICODE c = *p; 6314 6315 /* can we encode this? */ 6316 if (c<limit) { 6317 /* no overflow check, because we know that the space is enough */ 6318 *str++ = (char)c; 6319 ++p; 6320 } 6321 else { 6322 Py_ssize_t unicodepos = p-startp; 6323 Py_ssize_t requiredsize; 6324 PyObject *repunicode; 6325 Py_ssize_t repsize; 6326 Py_ssize_t newpos; 6327 Py_ssize_t respos; 6328 Py_UNICODE *uni2; 6329 /* startpos for collecting unencodable chars */ 6330 const Py_UNICODE *collstart = p; 6331 const Py_UNICODE *collend = p; 6332 /* find all unecodable characters */ 6333 while ((collend < endp) && ((*collend)>=limit)) 6334 ++collend; 6335 /* cache callback name lookup (if not done yet, i.e. it's the first error) */ 6336 if (known_errorHandler==-1) { 6337 if ((errors==NULL) || (!strcmp(errors, "strict"))) 6338 known_errorHandler = 1; 6339 else if (!strcmp(errors, "replace")) 6340 known_errorHandler = 2; 6341 else if (!strcmp(errors, "ignore")) 6342 known_errorHandler = 3; 6343 else if (!strcmp(errors, "xmlcharrefreplace")) 6344 known_errorHandler = 4; 6345 else 6346 known_errorHandler = 0; 6347 } 6348 switch (known_errorHandler) { 6349 case 1: /* strict */ 6350 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason); 6351 goto onError; 6352 case 2: /* replace */ 6353 while (collstart++<collend) 6354 *str++ = '?'; /* fall through */ 6355 case 3: /* ignore */ 6356 p = collend; 6357 break; 6358 case 4: /* xmlcharrefreplace */ 6359 respos = str - PyBytes_AS_STRING(res); 6360 /* determine replacement size (temporarily (mis)uses p) */ 6361 for (p = collstart, repsize = 0; p < collend; ++p) { 6362 if (*p<10) 6363 repsize += 2+1+1; 6364 else if (*p<100) 6365 repsize += 2+2+1; 6366 else if (*p<1000) 6367 repsize += 2+3+1; 6368 else if (*p<10000) 6369 repsize += 2+4+1; 6370#ifndef Py_UNICODE_WIDE 6371 else 6372 repsize += 2+5+1; 6373#else 6374 else if (*p<100000) 6375 repsize += 2+5+1; 6376 else if (*p<1000000) 6377 repsize += 2+6+1; 6378 else 6379 repsize += 2+7+1; 6380#endif 6381 } 6382 requiredsize = respos+repsize+(endp-collend); 6383 if (requiredsize > ressize) { 6384 if (requiredsize<2*ressize) 6385 requiredsize = 2*ressize; 6386 if (_PyBytes_Resize(&res, requiredsize)) 6387 goto onError; 6388 str = PyBytes_AS_STRING(res) + respos; 6389 ressize = requiredsize; 6390 } 6391 /* generate replacement (temporarily (mis)uses p) */ 6392 for (p = collstart; p < collend; ++p) { 6393 str += sprintf(str, "&#%d;", (int)*p); 6394 } 6395 p = collend; 6396 break; 6397 default: 6398 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler, 6399 encoding, reason, startp, size, &exc, 6400 collstart-startp, collend-startp, &newpos); 6401 if (repunicode == NULL) 6402 goto onError; 6403 if (PyBytes_Check(repunicode)) { 6404 /* Directly copy bytes result to output. */ 6405 repsize = PyBytes_Size(repunicode); 6406 if (repsize > 1) { 6407 /* Make room for all additional bytes. */ 6408 respos = str - PyBytes_AS_STRING(res); 6409 if (_PyBytes_Resize(&res, ressize+repsize-1)) { 6410 Py_DECREF(repunicode); 6411 goto onError; 6412 } 6413 str = PyBytes_AS_STRING(res) + respos; 6414 ressize += repsize-1; 6415 } 6416 memcpy(str, PyBytes_AsString(repunicode), repsize); 6417 str += repsize; 6418 p = startp + newpos; 6419 Py_DECREF(repunicode); 6420 break; 6421 } 6422 /* need more space? (at least enough for what we 6423 have+the replacement+the rest of the string, so 6424 we won't have to check space for encodable characters) */ 6425 respos = str - PyBytes_AS_STRING(res); 6426 repsize = PyUnicode_GET_SIZE(repunicode); 6427 requiredsize = respos+repsize+(endp-collend); 6428 if (requiredsize > ressize) { 6429 if (requiredsize<2*ressize) 6430 requiredsize = 2*ressize; 6431 if (_PyBytes_Resize(&res, requiredsize)) { 6432 Py_DECREF(repunicode); 6433 goto onError; 6434 } 6435 str = PyBytes_AS_STRING(res) + respos; 6436 ressize = requiredsize; 6437 } 6438 /* check if there is anything unencodable in the replacement 6439 and copy it to the output */ 6440 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) { 6441 c = *uni2; 6442 if (c >= limit) { 6443 raise_encode_exception(&exc, encoding, startp, size, 6444 unicodepos, unicodepos+1, reason); 6445 Py_DECREF(repunicode); 6446 goto onError; 6447 } 6448 *str = (char)c; 6449 } 6450 p = startp + newpos; 6451 Py_DECREF(repunicode); 6452 } 6453 } 6454 } 6455 /* Resize if we allocated to much */ 6456 size = str - PyBytes_AS_STRING(res); 6457 if (size < ressize) { /* If this falls res will be NULL */ 6458 assert(size >= 0); 6459 if (_PyBytes_Resize(&res, size) < 0) 6460 goto onError; 6461 } 6462 6463 Py_XDECREF(errorHandler); 6464 Py_XDECREF(exc); 6465 return res; 6466 6467 onError: 6468 Py_XDECREF(res); 6469 Py_XDECREF(errorHandler); 6470 Py_XDECREF(exc); 6471 return NULL; 6472} 6473 6474PyObject * 6475PyUnicode_EncodeLatin1(const Py_UNICODE *p, 6476 Py_ssize_t size, 6477 const char *errors) 6478{ 6479 return unicode_encode_ucs1(p, size, errors, 256); 6480} 6481 6482PyObject * 6483_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors) 6484{ 6485 if (!PyUnicode_Check(unicode)) { 6486 PyErr_BadArgument(); 6487 return NULL; 6488 } 6489 if (PyUnicode_READY(unicode) == -1) 6490 return NULL; 6491 /* Fast path: if it is a one-byte string, construct 6492 bytes object directly. */ 6493 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) 6494 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode), 6495 PyUnicode_GET_LENGTH(unicode)); 6496 /* Non-Latin-1 characters present. Defer to above function to 6497 raise the exception. */ 6498 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode), 6499 PyUnicode_GET_SIZE(unicode), 6500 errors); 6501} 6502 6503PyObject* 6504PyUnicode_AsLatin1String(PyObject *unicode) 6505{ 6506 return _PyUnicode_AsLatin1String(unicode, NULL); 6507} 6508 6509/* --- 7-bit ASCII Codec -------------------------------------------------- */ 6510 6511PyObject * 6512PyUnicode_DecodeASCII(const char *s, 6513 Py_ssize_t size, 6514 const char *errors) 6515{ 6516 const char *starts = s; 6517 PyUnicodeObject *v; 6518 Py_UNICODE *u; 6519 Py_ssize_t startinpos; 6520 Py_ssize_t endinpos; 6521 Py_ssize_t outpos; 6522 const char *e; 6523 int has_error; 6524 const unsigned char *p = (const unsigned char *)s; 6525 const unsigned char *end = p + size; 6526 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK); 6527 PyObject *errorHandler = NULL; 6528 PyObject *exc = NULL; 6529 6530 /* ASCII is equivalent to the first 128 ordinals in Unicode. */ 6531 if (size == 1 && (unsigned char)s[0] < 128) 6532 return get_latin1_char((unsigned char)s[0]); 6533 6534 has_error = 0; 6535 while (p < end && !has_error) { 6536 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for 6537 an explanation. */ 6538 if (!((size_t) p & LONG_PTR_MASK)) { 6539 /* Help register allocation */ 6540 register const unsigned char *_p = p; 6541 while (_p < aligned_end) { 6542 unsigned long value = *(unsigned long *) _p; 6543 if (value & ASCII_CHAR_MASK) { 6544 has_error = 1; 6545 break; 6546 } 6547 _p += SIZEOF_LONG; 6548 } 6549 if (_p == end) 6550 break; 6551 if (has_error) 6552 break; 6553 p = _p; 6554 } 6555 if (*p & 0x80) { 6556 has_error = 1; 6557 break; 6558 } 6559 else { 6560 ++p; 6561 } 6562 } 6563 if (!has_error) 6564 return unicode_fromascii((const unsigned char *)s, size); 6565 6566 v = _PyUnicode_New(size); 6567 if (v == NULL) 6568 goto onError; 6569 if (size == 0) 6570 return (PyObject *)v; 6571 u = PyUnicode_AS_UNICODE(v); 6572 e = s + size; 6573 while (s < e) { 6574 register unsigned char c = (unsigned char)*s; 6575 if (c < 128) { 6576 *u++ = c; 6577 ++s; 6578 } 6579 else { 6580 startinpos = s-starts; 6581 endinpos = startinpos + 1; 6582 outpos = u - (Py_UNICODE *)PyUnicode_AS_UNICODE(v); 6583 if (unicode_decode_call_errorhandler( 6584 errors, &errorHandler, 6585 "ascii", "ordinal not in range(128)", 6586 &starts, &e, &startinpos, &endinpos, &exc, &s, 6587 &v, &outpos, &u)) 6588 goto onError; 6589 } 6590 } 6591 if (u - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v)) 6592 if (PyUnicode_Resize((PyObject**)&v, u - PyUnicode_AS_UNICODE(v)) < 0) 6593 goto onError; 6594 Py_XDECREF(errorHandler); 6595 Py_XDECREF(exc); 6596#ifndef DONT_MAKE_RESULT_READY 6597 if (_PyUnicode_READY_REPLACE(&v)) { 6598 Py_DECREF(v); 6599 return NULL; 6600 } 6601#endif 6602 return (PyObject *)v; 6603 6604 onError: 6605 Py_XDECREF(v); 6606 Py_XDECREF(errorHandler); 6607 Py_XDECREF(exc); 6608 return NULL; 6609} 6610 6611PyObject * 6612PyUnicode_EncodeASCII(const Py_UNICODE *p, 6613 Py_ssize_t size, 6614 const char *errors) 6615{ 6616 return unicode_encode_ucs1(p, size, errors, 128); 6617} 6618 6619PyObject * 6620_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors) 6621{ 6622 if (!PyUnicode_Check(unicode)) { 6623 PyErr_BadArgument(); 6624 return NULL; 6625 } 6626 if (PyUnicode_READY(unicode) == -1) 6627 return NULL; 6628 /* Fast path: if it is an ASCII-only string, construct bytes object 6629 directly. Else defer to above function to raise the exception. */ 6630 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128) 6631 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode), 6632 PyUnicode_GET_LENGTH(unicode)); 6633 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode), 6634 PyUnicode_GET_SIZE(unicode), 6635 errors); 6636} 6637 6638PyObject * 6639PyUnicode_AsASCIIString(PyObject *unicode) 6640{ 6641 return _PyUnicode_AsASCIIString(unicode, NULL); 6642} 6643 6644#ifdef HAVE_MBCS 6645 6646/* --- MBCS codecs for Windows -------------------------------------------- */ 6647 6648#if SIZEOF_INT < SIZEOF_SIZE_T 6649#define NEED_RETRY 6650#endif 6651 6652/* XXX This code is limited to "true" double-byte encodings, as 6653 a) it assumes an incomplete character consists of a single byte, and 6654 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte 6655 encodings, see IsDBCSLeadByteEx documentation. */ 6656 6657static int 6658is_dbcs_lead_byte(const char *s, int offset) 6659{ 6660 const char *curr = s + offset; 6661 6662 if (IsDBCSLeadByte(*curr)) { 6663 const char *prev = CharPrev(s, curr); 6664 return (prev == curr) || !IsDBCSLeadByte(*prev) || (curr - prev == 2); 6665 } 6666 return 0; 6667} 6668 6669/* 6670 * Decode MBCS string into unicode object. If 'final' is set, converts 6671 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise. 6672 */ 6673static int 6674decode_mbcs(PyUnicodeObject **v, 6675 const char *s, /* MBCS string */ 6676 int size, /* sizeof MBCS string */ 6677 int final, 6678 const char *errors) 6679{ 6680 Py_UNICODE *p; 6681 Py_ssize_t n; 6682 DWORD usize; 6683 DWORD flags; 6684 6685 assert(size >= 0); 6686 6687 /* check and handle 'errors' arg */ 6688 if (errors==NULL || strcmp(errors, "strict")==0) 6689 flags = MB_ERR_INVALID_CHARS; 6690 else if (strcmp(errors, "ignore")==0) 6691 flags = 0; 6692 else { 6693 PyErr_Format(PyExc_ValueError, 6694 "mbcs encoding does not support errors='%s'", 6695 errors); 6696 return -1; 6697 } 6698 6699 /* Skip trailing lead-byte unless 'final' is set */ 6700 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1)) 6701 --size; 6702 6703 /* First get the size of the result */ 6704 if (size > 0) { 6705 usize = MultiByteToWideChar(CP_ACP, flags, s, size, NULL, 0); 6706 if (usize==0) 6707 goto mbcs_decode_error; 6708 } else 6709 usize = 0; 6710 6711 if (*v == NULL) { 6712 /* Create unicode object */ 6713 *v = _PyUnicode_New(usize); 6714 if (*v == NULL) 6715 return -1; 6716 n = 0; 6717 } 6718 else { 6719 /* Extend unicode object */ 6720 n = PyUnicode_GET_SIZE(*v); 6721 if (PyUnicode_Resize((PyObject**)v, n + usize) < 0) 6722 return -1; 6723 } 6724 6725 /* Do the conversion */ 6726 if (usize > 0) { 6727 p = PyUnicode_AS_UNICODE(*v) + n; 6728 if (0 == MultiByteToWideChar(CP_ACP, flags, s, size, p, usize)) { 6729 goto mbcs_decode_error; 6730 } 6731 } 6732 return size; 6733 6734mbcs_decode_error: 6735 /* If the last error was ERROR_NO_UNICODE_TRANSLATION, then 6736 we raise a UnicodeDecodeError - else it is a 'generic' 6737 windows error 6738 */ 6739 if (GetLastError()==ERROR_NO_UNICODE_TRANSLATION) { 6740 /* Ideally, we should get reason from FormatMessage - this 6741 is the Windows 2000 English version of the message 6742 */ 6743 PyObject *exc = NULL; 6744 const char *reason = "No mapping for the Unicode character exists " 6745 "in the target multi-byte code page."; 6746 make_decode_exception(&exc, "mbcs", s, size, 0, 0, reason); 6747 if (exc != NULL) { 6748 PyCodec_StrictErrors(exc); 6749 Py_DECREF(exc); 6750 } 6751 } else { 6752 PyErr_SetFromWindowsErrWithFilename(0, NULL); 6753 } 6754 return -1; 6755} 6756 6757PyObject * 6758PyUnicode_DecodeMBCSStateful(const char *s, 6759 Py_ssize_t size, 6760 const char *errors, 6761 Py_ssize_t *consumed) 6762{ 6763 PyUnicodeObject *v = NULL; 6764 int done; 6765 6766 if (consumed) 6767 *consumed = 0; 6768 6769#ifdef NEED_RETRY 6770 retry: 6771 if (size > INT_MAX) 6772 done = decode_mbcs(&v, s, INT_MAX, 0, errors); 6773 else 6774#endif 6775 done = decode_mbcs(&v, s, (int)size, !consumed, errors); 6776 6777 if (done < 0) { 6778 Py_XDECREF(v); 6779 return NULL; 6780 } 6781 6782 if (consumed) 6783 *consumed += done; 6784 6785#ifdef NEED_RETRY 6786 if (size > INT_MAX) { 6787 s += done; 6788 size -= done; 6789 goto retry; 6790 } 6791#endif 6792#ifndef DONT_MAKE_RESULT_READY 6793 if (_PyUnicode_READY_REPLACE(&v)) { 6794 Py_DECREF(v); 6795 return NULL; 6796 } 6797#endif 6798 return (PyObject *)v; 6799} 6800 6801PyObject * 6802PyUnicode_DecodeMBCS(const char *s, 6803 Py_ssize_t size, 6804 const char *errors) 6805{ 6806 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL); 6807} 6808 6809/* 6810 * Convert unicode into string object (MBCS). 6811 * Returns 0 if succeed, -1 otherwise. 6812 */ 6813static int 6814encode_mbcs(PyObject **repr, 6815 const Py_UNICODE *p, /* unicode */ 6816 int size, /* size of unicode */ 6817 const char* errors) 6818{ 6819 BOOL usedDefaultChar = FALSE; 6820 BOOL *pusedDefaultChar; 6821 int mbcssize; 6822 Py_ssize_t n; 6823 PyObject *exc = NULL; 6824 DWORD flags; 6825 6826 assert(size >= 0); 6827 6828 /* check and handle 'errors' arg */ 6829 if (errors==NULL || strcmp(errors, "strict")==0) { 6830 flags = WC_NO_BEST_FIT_CHARS; 6831 pusedDefaultChar = &usedDefaultChar; 6832 } else if (strcmp(errors, "replace")==0) { 6833 flags = 0; 6834 pusedDefaultChar = NULL; 6835 } else { 6836 PyErr_Format(PyExc_ValueError, 6837 "mbcs encoding does not support errors='%s'", 6838 errors); 6839 return -1; 6840 } 6841 6842 /* First get the size of the result */ 6843 if (size > 0) { 6844 mbcssize = WideCharToMultiByte(CP_ACP, flags, p, size, NULL, 0, 6845 NULL, pusedDefaultChar); 6846 if (mbcssize == 0) { 6847 PyErr_SetFromWindowsErrWithFilename(0, NULL); 6848 return -1; 6849 } 6850 /* If we used a default char, then we failed! */ 6851 if (pusedDefaultChar && *pusedDefaultChar) 6852 goto mbcs_encode_error; 6853 } else { 6854 mbcssize = 0; 6855 } 6856 6857 if (*repr == NULL) { 6858 /* Create string object */ 6859 *repr = PyBytes_FromStringAndSize(NULL, mbcssize); 6860 if (*repr == NULL) 6861 return -1; 6862 n = 0; 6863 } 6864 else { 6865 /* Extend string object */ 6866 n = PyBytes_Size(*repr); 6867 if (_PyBytes_Resize(repr, n + mbcssize) < 0) 6868 return -1; 6869 } 6870 6871 /* Do the conversion */ 6872 if (size > 0) { 6873 char *s = PyBytes_AS_STRING(*repr) + n; 6874 if (0 == WideCharToMultiByte(CP_ACP, flags, p, size, s, mbcssize, 6875 NULL, pusedDefaultChar)) { 6876 PyErr_SetFromWindowsErrWithFilename(0, NULL); 6877 return -1; 6878 } 6879 if (pusedDefaultChar && *pusedDefaultChar) 6880 goto mbcs_encode_error; 6881 } 6882 return 0; 6883 6884mbcs_encode_error: 6885 raise_encode_exception(&exc, "mbcs", p, size, 0, 0, "invalid character"); 6886 Py_XDECREF(exc); 6887 return -1; 6888} 6889 6890PyObject * 6891PyUnicode_EncodeMBCS(const Py_UNICODE *p, 6892 Py_ssize_t size, 6893 const char *errors) 6894{ 6895 PyObject *repr = NULL; 6896 int ret; 6897 6898#ifdef NEED_RETRY 6899 retry: 6900 if (size > INT_MAX) 6901 ret = encode_mbcs(&repr, p, INT_MAX, errors); 6902 else 6903#endif 6904 ret = encode_mbcs(&repr, p, (int)size, errors); 6905 6906 if (ret < 0) { 6907 Py_XDECREF(repr); 6908 return NULL; 6909 } 6910 6911#ifdef NEED_RETRY 6912 if (size > INT_MAX) { 6913 p += INT_MAX; 6914 size -= INT_MAX; 6915 goto retry; 6916 } 6917#endif 6918 6919 return repr; 6920} 6921 6922PyObject * 6923PyUnicode_AsMBCSString(PyObject *unicode) 6924{ 6925 if (!PyUnicode_Check(unicode)) { 6926 PyErr_BadArgument(); 6927 return NULL; 6928 } 6929 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode), 6930 PyUnicode_GET_SIZE(unicode), 6931 NULL); 6932} 6933 6934#undef NEED_RETRY 6935 6936#endif /* HAVE_MBCS */ 6937 6938/* --- Character Mapping Codec -------------------------------------------- */ 6939 6940PyObject * 6941PyUnicode_DecodeCharmap(const char *s, 6942 Py_ssize_t size, 6943 PyObject *mapping, 6944 const char *errors) 6945{ 6946 const char *starts = s; 6947 Py_ssize_t startinpos; 6948 Py_ssize_t endinpos; 6949 Py_ssize_t outpos; 6950 const char *e; 6951 PyUnicodeObject *v; 6952 Py_UNICODE *p; 6953 Py_ssize_t extrachars = 0; 6954 PyObject *errorHandler = NULL; 6955 PyObject *exc = NULL; 6956 Py_UNICODE *mapstring = NULL; 6957 Py_ssize_t maplen = 0; 6958 6959 /* Default to Latin-1 */ 6960 if (mapping == NULL) 6961 return PyUnicode_DecodeLatin1(s, size, errors); 6962 6963 v = _PyUnicode_New(size); 6964 if (v == NULL) 6965 goto onError; 6966 if (size == 0) 6967 return (PyObject *)v; 6968 p = PyUnicode_AS_UNICODE(v); 6969 e = s + size; 6970 if (PyUnicode_CheckExact(mapping)) { 6971 mapstring = PyUnicode_AS_UNICODE(mapping); 6972 maplen = PyUnicode_GET_SIZE(mapping); 6973 while (s < e) { 6974 unsigned char ch = *s; 6975 Py_UNICODE x = 0xfffe; /* illegal value */ 6976 6977 if (ch < maplen) 6978 x = mapstring[ch]; 6979 6980 if (x == 0xfffe) { 6981 /* undefined mapping */ 6982 outpos = p-PyUnicode_AS_UNICODE(v); 6983 startinpos = s-starts; 6984 endinpos = startinpos+1; 6985 if (unicode_decode_call_errorhandler( 6986 errors, &errorHandler, 6987 "charmap", "character maps to <undefined>", 6988 &starts, &e, &startinpos, &endinpos, &exc, &s, 6989 &v, &outpos, &p)) { 6990 goto onError; 6991 } 6992 continue; 6993 } 6994 *p++ = x; 6995 ++s; 6996 } 6997 } 6998 else { 6999 while (s < e) { 7000 unsigned char ch = *s; 7001 PyObject *w, *x; 7002 7003 /* Get mapping (char ordinal -> integer, Unicode char or None) */ 7004 w = PyLong_FromLong((long)ch); 7005 if (w == NULL) 7006 goto onError; 7007 x = PyObject_GetItem(mapping, w); 7008 Py_DECREF(w); 7009 if (x == NULL) { 7010 if (PyErr_ExceptionMatches(PyExc_LookupError)) { 7011 /* No mapping found means: mapping is undefined. */ 7012 PyErr_Clear(); 7013 x = Py_None; 7014 Py_INCREF(x); 7015 } else 7016 goto onError; 7017 } 7018 7019 /* Apply mapping */ 7020 if (PyLong_Check(x)) { 7021 long value = PyLong_AS_LONG(x); 7022 if (value < 0 || value > 65535) { 7023 PyErr_SetString(PyExc_TypeError, 7024 "character mapping must be in range(65536)"); 7025 Py_DECREF(x); 7026 goto onError; 7027 } 7028 *p++ = (Py_UNICODE)value; 7029 } 7030 else if (x == Py_None) { 7031 /* undefined mapping */ 7032 outpos = p-PyUnicode_AS_UNICODE(v); 7033 startinpos = s-starts; 7034 endinpos = startinpos+1; 7035 if (unicode_decode_call_errorhandler( 7036 errors, &errorHandler, 7037 "charmap", "character maps to <undefined>", 7038 &starts, &e, &startinpos, &endinpos, &exc, &s, 7039 &v, &outpos, &p)) { 7040 Py_DECREF(x); 7041 goto onError; 7042 } 7043 Py_DECREF(x); 7044 continue; 7045 } 7046 else if (PyUnicode_Check(x)) { 7047 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x); 7048 7049 if (targetsize == 1) 7050 /* 1-1 mapping */ 7051 *p++ = *PyUnicode_AS_UNICODE(x); 7052 7053 else if (targetsize > 1) { 7054 /* 1-n mapping */ 7055 if (targetsize > extrachars) { 7056 /* resize first */ 7057 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v); 7058 Py_ssize_t needed = (targetsize - extrachars) + \ 7059 (targetsize << 2); 7060 extrachars += needed; 7061 /* XXX overflow detection missing */ 7062 if (PyUnicode_Resize((PyObject**)&v, 7063 PyUnicode_GET_SIZE(v) + needed) < 0) { 7064 Py_DECREF(x); 7065 goto onError; 7066 } 7067 p = PyUnicode_AS_UNICODE(v) + oldpos; 7068 } 7069 Py_UNICODE_COPY(p, 7070 PyUnicode_AS_UNICODE(x), 7071 targetsize); 7072 p += targetsize; 7073 extrachars -= targetsize; 7074 } 7075 /* 1-0 mapping: skip the character */ 7076 } 7077 else { 7078 /* wrong return value */ 7079 PyErr_SetString(PyExc_TypeError, 7080 "character mapping must return integer, None or str"); 7081 Py_DECREF(x); 7082 goto onError; 7083 } 7084 Py_DECREF(x); 7085 ++s; 7086 } 7087 } 7088 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v)) 7089 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0) 7090 goto onError; 7091 Py_XDECREF(errorHandler); 7092 Py_XDECREF(exc); 7093#ifndef DONT_MAKE_RESULT_READY 7094 if (_PyUnicode_READY_REPLACE(&v)) { 7095 Py_DECREF(v); 7096 return NULL; 7097 } 7098#endif 7099 return (PyObject *)v; 7100 7101 onError: 7102 Py_XDECREF(errorHandler); 7103 Py_XDECREF(exc); 7104 Py_XDECREF(v); 7105 return NULL; 7106} 7107 7108/* Charmap encoding: the lookup table */ 7109 7110struct encoding_map { 7111 PyObject_HEAD 7112 unsigned char level1[32]; 7113 int count2, count3; 7114 unsigned char level23[1]; 7115}; 7116 7117static PyObject* 7118encoding_map_size(PyObject *obj, PyObject* args) 7119{ 7120 struct encoding_map *map = (struct encoding_map*)obj; 7121 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 + 7122 128*map->count3); 7123} 7124 7125static PyMethodDef encoding_map_methods[] = { 7126 {"size", encoding_map_size, METH_NOARGS, 7127 PyDoc_STR("Return the size (in bytes) of this object") }, 7128 { 0 } 7129}; 7130 7131static void 7132encoding_map_dealloc(PyObject* o) 7133{ 7134 PyObject_FREE(o); 7135} 7136 7137static PyTypeObject EncodingMapType = { 7138 PyVarObject_HEAD_INIT(NULL, 0) 7139 "EncodingMap", /*tp_name*/ 7140 sizeof(struct encoding_map), /*tp_basicsize*/ 7141 0, /*tp_itemsize*/ 7142 /* methods */ 7143 encoding_map_dealloc, /*tp_dealloc*/ 7144 0, /*tp_print*/ 7145 0, /*tp_getattr*/ 7146 0, /*tp_setattr*/ 7147 0, /*tp_reserved*/ 7148 0, /*tp_repr*/ 7149 0, /*tp_as_number*/ 7150 0, /*tp_as_sequence*/ 7151 0, /*tp_as_mapping*/ 7152 0, /*tp_hash*/ 7153 0, /*tp_call*/ 7154 0, /*tp_str*/ 7155 0, /*tp_getattro*/ 7156 0, /*tp_setattro*/ 7157 0, /*tp_as_buffer*/ 7158 Py_TPFLAGS_DEFAULT, /*tp_flags*/ 7159 0, /*tp_doc*/ 7160 0, /*tp_traverse*/ 7161 0, /*tp_clear*/ 7162 0, /*tp_richcompare*/ 7163 0, /*tp_weaklistoffset*/ 7164 0, /*tp_iter*/ 7165 0, /*tp_iternext*/ 7166 encoding_map_methods, /*tp_methods*/ 7167 0, /*tp_members*/ 7168 0, /*tp_getset*/ 7169 0, /*tp_base*/ 7170 0, /*tp_dict*/ 7171 0, /*tp_descr_get*/ 7172 0, /*tp_descr_set*/ 7173 0, /*tp_dictoffset*/ 7174 0, /*tp_init*/ 7175 0, /*tp_alloc*/ 7176 0, /*tp_new*/ 7177 0, /*tp_free*/ 7178 0, /*tp_is_gc*/ 7179}; 7180 7181PyObject* 7182PyUnicode_BuildEncodingMap(PyObject* string) 7183{ 7184 PyObject *result; 7185 struct encoding_map *mresult; 7186 int i; 7187 int need_dict = 0; 7188 unsigned char level1[32]; 7189 unsigned char level2[512]; 7190 unsigned char *mlevel1, *mlevel2, *mlevel3; 7191 int count2 = 0, count3 = 0; 7192 int kind; 7193 void *data; 7194 Py_UCS4 ch; 7195 7196 if (!PyUnicode_Check(string) || PyUnicode_GET_LENGTH(string) != 256) { 7197 PyErr_BadArgument(); 7198 return NULL; 7199 } 7200 kind = PyUnicode_KIND(string); 7201 data = PyUnicode_DATA(string); 7202 memset(level1, 0xFF, sizeof level1); 7203 memset(level2, 0xFF, sizeof level2); 7204 7205 /* If there isn't a one-to-one mapping of NULL to \0, 7206 or if there are non-BMP characters, we need to use 7207 a mapping dictionary. */ 7208 if (PyUnicode_READ(kind, data, 0) != 0) 7209 need_dict = 1; 7210 for (i = 1; i < 256; i++) { 7211 int l1, l2; 7212 ch = PyUnicode_READ(kind, data, i); 7213 if (ch == 0 || ch > 0xFFFF) { 7214 need_dict = 1; 7215 break; 7216 } 7217 if (ch == 0xFFFE) 7218 /* unmapped character */ 7219 continue; 7220 l1 = ch >> 11; 7221 l2 = ch >> 7; 7222 if (level1[l1] == 0xFF) 7223 level1[l1] = count2++; 7224 if (level2[l2] == 0xFF) 7225 level2[l2] = count3++; 7226 } 7227 7228 if (count2 >= 0xFF || count3 >= 0xFF) 7229 need_dict = 1; 7230 7231 if (need_dict) { 7232 PyObject *result = PyDict_New(); 7233 PyObject *key, *value; 7234 if (!result) 7235 return NULL; 7236 for (i = 0; i < 256; i++) { 7237 key = PyLong_FromLong(PyUnicode_READ(kind, data, i)); 7238 value = PyLong_FromLong(i); 7239 if (!key || !value) 7240 goto failed1; 7241 if (PyDict_SetItem(result, key, value) == -1) 7242 goto failed1; 7243 Py_DECREF(key); 7244 Py_DECREF(value); 7245 } 7246 return result; 7247 failed1: 7248 Py_XDECREF(key); 7249 Py_XDECREF(value); 7250 Py_DECREF(result); 7251 return NULL; 7252 } 7253 7254 /* Create a three-level trie */ 7255 result = PyObject_MALLOC(sizeof(struct encoding_map) + 7256 16*count2 + 128*count3 - 1); 7257 if (!result) 7258 return PyErr_NoMemory(); 7259 PyObject_Init(result, &EncodingMapType); 7260 mresult = (struct encoding_map*)result; 7261 mresult->count2 = count2; 7262 mresult->count3 = count3; 7263 mlevel1 = mresult->level1; 7264 mlevel2 = mresult->level23; 7265 mlevel3 = mresult->level23 + 16*count2; 7266 memcpy(mlevel1, level1, 32); 7267 memset(mlevel2, 0xFF, 16*count2); 7268 memset(mlevel3, 0, 128*count3); 7269 count3 = 0; 7270 for (i = 1; i < 256; i++) { 7271 int o1, o2, o3, i2, i3; 7272 if (PyUnicode_READ(kind, data, i) == 0xFFFE) 7273 /* unmapped character */ 7274 continue; 7275 o1 = PyUnicode_READ(kind, data, i)>>11; 7276 o2 = (PyUnicode_READ(kind, data, i)>>7) & 0xF; 7277 i2 = 16*mlevel1[o1] + o2; 7278 if (mlevel2[i2] == 0xFF) 7279 mlevel2[i2] = count3++; 7280 o3 = PyUnicode_READ(kind, data, i) & 0x7F; 7281 i3 = 128*mlevel2[i2] + o3; 7282 mlevel3[i3] = i; 7283 } 7284 return result; 7285} 7286 7287static int 7288encoding_map_lookup(Py_UNICODE c, PyObject *mapping) 7289{ 7290 struct encoding_map *map = (struct encoding_map*)mapping; 7291 int l1 = c>>11; 7292 int l2 = (c>>7) & 0xF; 7293 int l3 = c & 0x7F; 7294 int i; 7295 7296#ifdef Py_UNICODE_WIDE 7297 if (c > 0xFFFF) { 7298 return -1; 7299 } 7300#endif 7301 if (c == 0) 7302 return 0; 7303 /* level 1*/ 7304 i = map->level1[l1]; 7305 if (i == 0xFF) { 7306 return -1; 7307 } 7308 /* level 2*/ 7309 i = map->level23[16*i+l2]; 7310 if (i == 0xFF) { 7311 return -1; 7312 } 7313 /* level 3 */ 7314 i = map->level23[16*map->count2 + 128*i + l3]; 7315 if (i == 0) { 7316 return -1; 7317 } 7318 return i; 7319} 7320 7321/* Lookup the character ch in the mapping. If the character 7322 can't be found, Py_None is returned (or NULL, if another 7323 error occurred). */ 7324static PyObject * 7325charmapencode_lookup(Py_UNICODE c, PyObject *mapping) 7326{ 7327 PyObject *w = PyLong_FromLong((long)c); 7328 PyObject *x; 7329 7330 if (w == NULL) 7331 return NULL; 7332 x = PyObject_GetItem(mapping, w); 7333 Py_DECREF(w); 7334 if (x == NULL) { 7335 if (PyErr_ExceptionMatches(PyExc_LookupError)) { 7336 /* No mapping found means: mapping is undefined. */ 7337 PyErr_Clear(); 7338 x = Py_None; 7339 Py_INCREF(x); 7340 return x; 7341 } else 7342 return NULL; 7343 } 7344 else if (x == Py_None) 7345 return x; 7346 else if (PyLong_Check(x)) { 7347 long value = PyLong_AS_LONG(x); 7348 if (value < 0 || value > 255) { 7349 PyErr_SetString(PyExc_TypeError, 7350 "character mapping must be in range(256)"); 7351 Py_DECREF(x); 7352 return NULL; 7353 } 7354 return x; 7355 } 7356 else if (PyBytes_Check(x)) 7357 return x; 7358 else { 7359 /* wrong return value */ 7360 PyErr_Format(PyExc_TypeError, 7361 "character mapping must return integer, bytes or None, not %.400s", 7362 x->ob_type->tp_name); 7363 Py_DECREF(x); 7364 return NULL; 7365 } 7366} 7367 7368static int 7369charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize) 7370{ 7371 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj); 7372 /* exponentially overallocate to minimize reallocations */ 7373 if (requiredsize < 2*outsize) 7374 requiredsize = 2*outsize; 7375 if (_PyBytes_Resize(outobj, requiredsize)) 7376 return -1; 7377 return 0; 7378} 7379 7380typedef enum charmapencode_result { 7381 enc_SUCCESS, enc_FAILED, enc_EXCEPTION 7382} charmapencode_result; 7383/* lookup the character, put the result in the output string and adjust 7384 various state variables. Resize the output bytes object if not enough 7385 space is available. Return a new reference to the object that 7386 was put in the output buffer, or Py_None, if the mapping was undefined 7387 (in which case no character was written) or NULL, if a 7388 reallocation error occurred. The caller must decref the result */ 7389static charmapencode_result 7390charmapencode_output(Py_UNICODE c, PyObject *mapping, 7391 PyObject **outobj, Py_ssize_t *outpos) 7392{ 7393 PyObject *rep; 7394 char *outstart; 7395 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj); 7396 7397 if (Py_TYPE(mapping) == &EncodingMapType) { 7398 int res = encoding_map_lookup(c, mapping); 7399 Py_ssize_t requiredsize = *outpos+1; 7400 if (res == -1) 7401 return enc_FAILED; 7402 if (outsize<requiredsize) 7403 if (charmapencode_resize(outobj, outpos, requiredsize)) 7404 return enc_EXCEPTION; 7405 outstart = PyBytes_AS_STRING(*outobj); 7406 outstart[(*outpos)++] = (char)res; 7407 return enc_SUCCESS; 7408 } 7409 7410 rep = charmapencode_lookup(c, mapping); 7411 if (rep==NULL) 7412 return enc_EXCEPTION; 7413 else if (rep==Py_None) { 7414 Py_DECREF(rep); 7415 return enc_FAILED; 7416 } else { 7417 if (PyLong_Check(rep)) { 7418 Py_ssize_t requiredsize = *outpos+1; 7419 if (outsize<requiredsize) 7420 if (charmapencode_resize(outobj, outpos, requiredsize)) { 7421 Py_DECREF(rep); 7422 return enc_EXCEPTION; 7423 } 7424 outstart = PyBytes_AS_STRING(*outobj); 7425 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep); 7426 } 7427 else { 7428 const char *repchars = PyBytes_AS_STRING(rep); 7429 Py_ssize_t repsize = PyBytes_GET_SIZE(rep); 7430 Py_ssize_t requiredsize = *outpos+repsize; 7431 if (outsize<requiredsize) 7432 if (charmapencode_resize(outobj, outpos, requiredsize)) { 7433 Py_DECREF(rep); 7434 return enc_EXCEPTION; 7435 } 7436 outstart = PyBytes_AS_STRING(*outobj); 7437 memcpy(outstart + *outpos, repchars, repsize); 7438 *outpos += repsize; 7439 } 7440 } 7441 Py_DECREF(rep); 7442 return enc_SUCCESS; 7443} 7444 7445/* handle an error in PyUnicode_EncodeCharmap 7446 Return 0 on success, -1 on error */ 7447static int 7448charmap_encoding_error( 7449 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping, 7450 PyObject **exceptionObject, 7451 int *known_errorHandler, PyObject **errorHandler, const char *errors, 7452 PyObject **res, Py_ssize_t *respos) 7453{ 7454 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */ 7455 Py_ssize_t repsize; 7456 Py_ssize_t newpos; 7457 Py_UNICODE *uni2; 7458 /* startpos for collecting unencodable chars */ 7459 Py_ssize_t collstartpos = *inpos; 7460 Py_ssize_t collendpos = *inpos+1; 7461 Py_ssize_t collpos; 7462 char *encoding = "charmap"; 7463 char *reason = "character maps to <undefined>"; 7464 charmapencode_result x; 7465 7466 /* find all unencodable characters */ 7467 while (collendpos < size) { 7468 PyObject *rep; 7469 if (Py_TYPE(mapping) == &EncodingMapType) { 7470 int res = encoding_map_lookup(p[collendpos], mapping); 7471 if (res != -1) 7472 break; 7473 ++collendpos; 7474 continue; 7475 } 7476 7477 rep = charmapencode_lookup(p[collendpos], mapping); 7478 if (rep==NULL) 7479 return -1; 7480 else if (rep!=Py_None) { 7481 Py_DECREF(rep); 7482 break; 7483 } 7484 Py_DECREF(rep); 7485 ++collendpos; 7486 } 7487 /* cache callback name lookup 7488 * (if not done yet, i.e. it's the first error) */ 7489 if (*known_errorHandler==-1) { 7490 if ((errors==NULL) || (!strcmp(errors, "strict"))) 7491 *known_errorHandler = 1; 7492 else if (!strcmp(errors, "replace")) 7493 *known_errorHandler = 2; 7494 else if (!strcmp(errors, "ignore")) 7495 *known_errorHandler = 3; 7496 else if (!strcmp(errors, "xmlcharrefreplace")) 7497 *known_errorHandler = 4; 7498 else 7499 *known_errorHandler = 0; 7500 } 7501 switch (*known_errorHandler) { 7502 case 1: /* strict */ 7503 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason); 7504 return -1; 7505 case 2: /* replace */ 7506 for (collpos = collstartpos; collpos<collendpos; ++collpos) { 7507 x = charmapencode_output('?', mapping, res, respos); 7508 if (x==enc_EXCEPTION) { 7509 return -1; 7510 } 7511 else if (x==enc_FAILED) { 7512 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason); 7513 return -1; 7514 } 7515 } 7516 /* fall through */ 7517 case 3: /* ignore */ 7518 *inpos = collendpos; 7519 break; 7520 case 4: /* xmlcharrefreplace */ 7521 /* generate replacement (temporarily (mis)uses p) */ 7522 for (collpos = collstartpos; collpos < collendpos; ++collpos) { 7523 char buffer[2+29+1+1]; 7524 char *cp; 7525 sprintf(buffer, "&#%d;", (int)p[collpos]); 7526 for (cp = buffer; *cp; ++cp) { 7527 x = charmapencode_output(*cp, mapping, res, respos); 7528 if (x==enc_EXCEPTION) 7529 return -1; 7530 else if (x==enc_FAILED) { 7531 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason); 7532 return -1; 7533 } 7534 } 7535 } 7536 *inpos = collendpos; 7537 break; 7538 default: 7539 repunicode = unicode_encode_call_errorhandler(errors, errorHandler, 7540 encoding, reason, p, size, exceptionObject, 7541 collstartpos, collendpos, &newpos); 7542 if (repunicode == NULL) 7543 return -1; 7544 if (PyBytes_Check(repunicode)) { 7545 /* Directly copy bytes result to output. */ 7546 Py_ssize_t outsize = PyBytes_Size(*res); 7547 Py_ssize_t requiredsize; 7548 repsize = PyBytes_Size(repunicode); 7549 requiredsize = *respos + repsize; 7550 if (requiredsize > outsize) 7551 /* Make room for all additional bytes. */ 7552 if (charmapencode_resize(res, respos, requiredsize)) { 7553 Py_DECREF(repunicode); 7554 return -1; 7555 } 7556 memcpy(PyBytes_AsString(*res) + *respos, 7557 PyBytes_AsString(repunicode), repsize); 7558 *respos += repsize; 7559 *inpos = newpos; 7560 Py_DECREF(repunicode); 7561 break; 7562 } 7563 /* generate replacement */ 7564 repsize = PyUnicode_GET_SIZE(repunicode); 7565 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) { 7566 x = charmapencode_output(*uni2, mapping, res, respos); 7567 if (x==enc_EXCEPTION) { 7568 return -1; 7569 } 7570 else if (x==enc_FAILED) { 7571 Py_DECREF(repunicode); 7572 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason); 7573 return -1; 7574 } 7575 } 7576 *inpos = newpos; 7577 Py_DECREF(repunicode); 7578 } 7579 return 0; 7580} 7581 7582PyObject * 7583PyUnicode_EncodeCharmap(const Py_UNICODE *p, 7584 Py_ssize_t size, 7585 PyObject *mapping, 7586 const char *errors) 7587{ 7588 /* output object */ 7589 PyObject *res = NULL; 7590 /* current input position */ 7591 Py_ssize_t inpos = 0; 7592 /* current output position */ 7593 Py_ssize_t respos = 0; 7594 PyObject *errorHandler = NULL; 7595 PyObject *exc = NULL; 7596 /* the following variable is used for caching string comparisons 7597 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 7598 * 3=ignore, 4=xmlcharrefreplace */ 7599 int known_errorHandler = -1; 7600 7601 /* Default to Latin-1 */ 7602 if (mapping == NULL) 7603 return PyUnicode_EncodeLatin1(p, size, errors); 7604 7605 /* allocate enough for a simple encoding without 7606 replacements, if we need more, we'll resize */ 7607 res = PyBytes_FromStringAndSize(NULL, size); 7608 if (res == NULL) 7609 goto onError; 7610 if (size == 0) 7611 return res; 7612 7613 while (inpos<size) { 7614 /* try to encode it */ 7615 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos); 7616 if (x==enc_EXCEPTION) /* error */ 7617 goto onError; 7618 if (x==enc_FAILED) { /* unencodable character */ 7619 if (charmap_encoding_error(p, size, &inpos, mapping, 7620 &exc, 7621 &known_errorHandler, &errorHandler, errors, 7622 &res, &respos)) { 7623 goto onError; 7624 } 7625 } 7626 else 7627 /* done with this character => adjust input position */ 7628 ++inpos; 7629 } 7630 7631 /* Resize if we allocated to much */ 7632 if (respos<PyBytes_GET_SIZE(res)) 7633 if (_PyBytes_Resize(&res, respos) < 0) 7634 goto onError; 7635 7636 Py_XDECREF(exc); 7637 Py_XDECREF(errorHandler); 7638 return res; 7639 7640 onError: 7641 Py_XDECREF(res); 7642 Py_XDECREF(exc); 7643 Py_XDECREF(errorHandler); 7644 return NULL; 7645} 7646 7647PyObject * 7648PyUnicode_AsCharmapString(PyObject *unicode, 7649 PyObject *mapping) 7650{ 7651 if (!PyUnicode_Check(unicode) || mapping == NULL) { 7652 PyErr_BadArgument(); 7653 return NULL; 7654 } 7655 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode), 7656 PyUnicode_GET_SIZE(unicode), 7657 mapping, 7658 NULL); 7659} 7660 7661/* create or adjust a UnicodeTranslateError */ 7662static void 7663make_translate_exception(PyObject **exceptionObject, 7664 PyObject *unicode, 7665 Py_ssize_t startpos, Py_ssize_t endpos, 7666 const char *reason) 7667{ 7668 if (*exceptionObject == NULL) { 7669 *exceptionObject = _PyUnicodeTranslateError_Create( 7670 unicode, startpos, endpos, reason); 7671 } 7672 else { 7673 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos)) 7674 goto onError; 7675 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos)) 7676 goto onError; 7677 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason)) 7678 goto onError; 7679 return; 7680 onError: 7681 Py_DECREF(*exceptionObject); 7682 *exceptionObject = NULL; 7683 } 7684} 7685 7686/* raises a UnicodeTranslateError */ 7687static void 7688raise_translate_exception(PyObject **exceptionObject, 7689 PyObject *unicode, 7690 Py_ssize_t startpos, Py_ssize_t endpos, 7691 const char *reason) 7692{ 7693 make_translate_exception(exceptionObject, 7694 unicode, startpos, endpos, reason); 7695 if (*exceptionObject != NULL) 7696 PyCodec_StrictErrors(*exceptionObject); 7697} 7698 7699/* error handling callback helper: 7700 build arguments, call the callback and check the arguments, 7701 put the result into newpos and return the replacement string, which 7702 has to be freed by the caller */ 7703static PyObject * 7704unicode_translate_call_errorhandler(const char *errors, 7705 PyObject **errorHandler, 7706 const char *reason, 7707 PyObject *unicode, PyObject **exceptionObject, 7708 Py_ssize_t startpos, Py_ssize_t endpos, 7709 Py_ssize_t *newpos) 7710{ 7711 static char *argparse = "O!n;translating error handler must return (str, int) tuple"; 7712 7713 Py_ssize_t i_newpos; 7714 PyObject *restuple; 7715 PyObject *resunicode; 7716 7717 if (*errorHandler == NULL) { 7718 *errorHandler = PyCodec_LookupError(errors); 7719 if (*errorHandler == NULL) 7720 return NULL; 7721 } 7722 7723 make_translate_exception(exceptionObject, 7724 unicode, startpos, endpos, reason); 7725 if (*exceptionObject == NULL) 7726 return NULL; 7727 7728 restuple = PyObject_CallFunctionObjArgs( 7729 *errorHandler, *exceptionObject, NULL); 7730 if (restuple == NULL) 7731 return NULL; 7732 if (!PyTuple_Check(restuple)) { 7733 PyErr_SetString(PyExc_TypeError, &argparse[4]); 7734 Py_DECREF(restuple); 7735 return NULL; 7736 } 7737 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, 7738 &resunicode, &i_newpos)) { 7739 Py_DECREF(restuple); 7740 return NULL; 7741 } 7742 if (i_newpos<0) 7743 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos; 7744 else 7745 *newpos = i_newpos; 7746 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) { 7747 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos); 7748 Py_DECREF(restuple); 7749 return NULL; 7750 } 7751 Py_INCREF(resunicode); 7752 Py_DECREF(restuple); 7753 return resunicode; 7754} 7755 7756/* Lookup the character ch in the mapping and put the result in result, 7757 which must be decrefed by the caller. 7758 Return 0 on success, -1 on error */ 7759static int 7760charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result) 7761{ 7762 PyObject *w = PyLong_FromLong((long)c); 7763 PyObject *x; 7764 7765 if (w == NULL) 7766 return -1; 7767 x = PyObject_GetItem(mapping, w); 7768 Py_DECREF(w); 7769 if (x == NULL) { 7770 if (PyErr_ExceptionMatches(PyExc_LookupError)) { 7771 /* No mapping found means: use 1:1 mapping. */ 7772 PyErr_Clear(); 7773 *result = NULL; 7774 return 0; 7775 } else 7776 return -1; 7777 } 7778 else if (x == Py_None) { 7779 *result = x; 7780 return 0; 7781 } 7782 else if (PyLong_Check(x)) { 7783 long value = PyLong_AS_LONG(x); 7784 long max = PyUnicode_GetMax(); 7785 if (value < 0 || value > max) { 7786 PyErr_Format(PyExc_TypeError, 7787 "character mapping must be in range(0x%x)", max+1); 7788 Py_DECREF(x); 7789 return -1; 7790 } 7791 *result = x; 7792 return 0; 7793 } 7794 else if (PyUnicode_Check(x)) { 7795 *result = x; 7796 return 0; 7797 } 7798 else { 7799 /* wrong return value */ 7800 PyErr_SetString(PyExc_TypeError, 7801 "character mapping must return integer, None or str"); 7802 Py_DECREF(x); 7803 return -1; 7804 } 7805} 7806/* ensure that *outobj is at least requiredsize characters long, 7807 if not reallocate and adjust various state variables. 7808 Return 0 on success, -1 on error */ 7809static int 7810charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize, 7811 Py_ssize_t requiredsize) 7812{ 7813 Py_ssize_t oldsize = *psize; 7814 if (requiredsize > oldsize) { 7815 /* exponentially overallocate to minimize reallocations */ 7816 if (requiredsize < 2 * oldsize) 7817 requiredsize = 2 * oldsize; 7818 *outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4)); 7819 if (*outobj == 0) 7820 return -1; 7821 *psize = requiredsize; 7822 } 7823 return 0; 7824} 7825/* lookup the character, put the result in the output string and adjust 7826 various state variables. Return a new reference to the object that 7827 was put in the output buffer in *result, or Py_None, if the mapping was 7828 undefined (in which case no character was written). 7829 The called must decref result. 7830 Return 0 on success, -1 on error. */ 7831static int 7832charmaptranslate_output(PyObject *input, Py_ssize_t ipos, 7833 PyObject *mapping, Py_UCS4 **output, 7834 Py_ssize_t *osize, Py_ssize_t *opos, 7835 PyObject **res) 7836{ 7837 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos); 7838 if (charmaptranslate_lookup(curinp, mapping, res)) 7839 return -1; 7840 if (*res==NULL) { 7841 /* not found => default to 1:1 mapping */ 7842 (*output)[(*opos)++] = curinp; 7843 } 7844 else if (*res==Py_None) 7845 ; 7846 else if (PyLong_Check(*res)) { 7847 /* no overflow check, because we know that the space is enough */ 7848 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res); 7849 } 7850 else if (PyUnicode_Check(*res)) { 7851 Py_ssize_t repsize; 7852 if (PyUnicode_READY(*res) == -1) 7853 return -1; 7854 repsize = PyUnicode_GET_LENGTH(*res); 7855 if (repsize==1) { 7856 /* no overflow check, because we know that the space is enough */ 7857 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0); 7858 } 7859 else if (repsize!=0) { 7860 /* more than one character */ 7861 Py_ssize_t requiredsize = *opos + 7862 (PyUnicode_GET_LENGTH(input) - ipos) + 7863 repsize - 1; 7864 Py_ssize_t i; 7865 if (charmaptranslate_makespace(output, osize, requiredsize)) 7866 return -1; 7867 for(i = 0; i < repsize; i++) 7868 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i); 7869 } 7870 } 7871 else 7872 return -1; 7873 return 0; 7874} 7875 7876PyObject * 7877_PyUnicode_TranslateCharmap(PyObject *input, 7878 PyObject *mapping, 7879 const char *errors) 7880{ 7881 /* input object */ 7882 char *idata; 7883 Py_ssize_t size, i; 7884 int kind; 7885 /* output buffer */ 7886 Py_UCS4 *output = NULL; 7887 Py_ssize_t osize; 7888 PyObject *res; 7889 /* current output position */ 7890 Py_ssize_t opos; 7891 char *reason = "character maps to <undefined>"; 7892 PyObject *errorHandler = NULL; 7893 PyObject *exc = NULL; 7894 /* the following variable is used for caching string comparisons 7895 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 7896 * 3=ignore, 4=xmlcharrefreplace */ 7897 int known_errorHandler = -1; 7898 7899 if (mapping == NULL) { 7900 PyErr_BadArgument(); 7901 return NULL; 7902 } 7903 7904 if (PyUnicode_READY(input) == -1) 7905 return NULL; 7906 idata = (char*)PyUnicode_DATA(input); 7907 kind = PyUnicode_KIND(input); 7908 size = PyUnicode_GET_LENGTH(input); 7909 i = 0; 7910 7911 if (size == 0) { 7912 Py_INCREF(input); 7913 return input; 7914 } 7915 7916 /* allocate enough for a simple 1:1 translation without 7917 replacements, if we need more, we'll resize */ 7918 osize = size; 7919 output = PyMem_Malloc(osize * sizeof(Py_UCS4)); 7920 opos = 0; 7921 if (output == NULL) { 7922 PyErr_NoMemory(); 7923 goto onError; 7924 } 7925 7926 while (i<size) { 7927 /* try to encode it */ 7928 PyObject *x = NULL; 7929 if (charmaptranslate_output(input, i, mapping, 7930 &output, &osize, &opos, &x)) { 7931 Py_XDECREF(x); 7932 goto onError; 7933 } 7934 Py_XDECREF(x); 7935 if (x!=Py_None) /* it worked => adjust input pointer */ 7936 ++i; 7937 else { /* untranslatable character */ 7938 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */ 7939 Py_ssize_t repsize; 7940 Py_ssize_t newpos; 7941 Py_ssize_t uni2; 7942 /* startpos for collecting untranslatable chars */ 7943 Py_ssize_t collstart = i; 7944 Py_ssize_t collend = i+1; 7945 Py_ssize_t coll; 7946 7947 /* find all untranslatable characters */ 7948 while (collend < size) { 7949 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x)) 7950 goto onError; 7951 Py_XDECREF(x); 7952 if (x!=Py_None) 7953 break; 7954 ++collend; 7955 } 7956 /* cache callback name lookup 7957 * (if not done yet, i.e. it's the first error) */ 7958 if (known_errorHandler==-1) { 7959 if ((errors==NULL) || (!strcmp(errors, "strict"))) 7960 known_errorHandler = 1; 7961 else if (!strcmp(errors, "replace")) 7962 known_errorHandler = 2; 7963 else if (!strcmp(errors, "ignore")) 7964 known_errorHandler = 3; 7965 else if (!strcmp(errors, "xmlcharrefreplace")) 7966 known_errorHandler = 4; 7967 else 7968 known_errorHandler = 0; 7969 } 7970 switch (known_errorHandler) { 7971 case 1: /* strict */ 7972 raise_translate_exception(&exc, input, collstart, 7973 collend, reason); 7974 goto onError; 7975 case 2: /* replace */ 7976 /* No need to check for space, this is a 1:1 replacement */ 7977 for (coll = collstart; coll<collend; coll++) 7978 output[opos++] = '?'; 7979 /* fall through */ 7980 case 3: /* ignore */ 7981 i = collend; 7982 break; 7983 case 4: /* xmlcharrefreplace */ 7984 /* generate replacement (temporarily (mis)uses i) */ 7985 for (i = collstart; i < collend; ++i) { 7986 char buffer[2+29+1+1]; 7987 char *cp; 7988 sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i)); 7989 if (charmaptranslate_makespace(&output, &osize, 7990 opos+strlen(buffer)+(size-collend))) 7991 goto onError; 7992 for (cp = buffer; *cp; ++cp) 7993 output[opos++] = *cp; 7994 } 7995 i = collend; 7996 break; 7997 default: 7998 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler, 7999 reason, input, &exc, 8000 collstart, collend, &newpos); 8001 if (repunicode == NULL || _PyUnicode_READY_REPLACE(&repunicode)) 8002 goto onError; 8003 /* generate replacement */ 8004 repsize = PyUnicode_GET_LENGTH(repunicode); 8005 if (charmaptranslate_makespace(&output, &osize, 8006 opos+repsize+(size-collend))) { 8007 Py_DECREF(repunicode); 8008 goto onError; 8009 } 8010 for (uni2 = 0; repsize-->0; ++uni2) 8011 output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2); 8012 i = newpos; 8013 Py_DECREF(repunicode); 8014 } 8015 } 8016 } 8017 res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos); 8018 if (!res) 8019 goto onError; 8020 PyMem_Free(output); 8021 Py_XDECREF(exc); 8022 Py_XDECREF(errorHandler); 8023 return res; 8024 8025 onError: 8026 PyMem_Free(output); 8027 Py_XDECREF(exc); 8028 Py_XDECREF(errorHandler); 8029 return NULL; 8030} 8031 8032/* Deprecated. Use PyUnicode_Translate instead. */ 8033PyObject * 8034PyUnicode_TranslateCharmap(const Py_UNICODE *p, 8035 Py_ssize_t size, 8036 PyObject *mapping, 8037 const char *errors) 8038{ 8039 PyObject *unicode = PyUnicode_FromUnicode(p, size); 8040 if (!unicode) 8041 return NULL; 8042 return _PyUnicode_TranslateCharmap(unicode, mapping, errors); 8043} 8044 8045PyObject * 8046PyUnicode_Translate(PyObject *str, 8047 PyObject *mapping, 8048 const char *errors) 8049{ 8050 PyObject *result; 8051 8052 str = PyUnicode_FromObject(str); 8053 if (str == NULL) 8054 goto onError; 8055 result = _PyUnicode_TranslateCharmap(str, mapping, errors); 8056 Py_DECREF(str); 8057 return result; 8058 8059 onError: 8060 Py_XDECREF(str); 8061 return NULL; 8062} 8063 8064static Py_UCS4 8065fix_decimal_and_space_to_ascii(PyUnicodeObject *self) 8066{ 8067 /* No need to call PyUnicode_READY(self) because this function is only 8068 called as a callback from fixup() which does it already. */ 8069 const Py_ssize_t len = PyUnicode_GET_LENGTH(self); 8070 const int kind = PyUnicode_KIND(self); 8071 void *data = PyUnicode_DATA(self); 8072 Py_UCS4 maxchar = 0, ch, fixed; 8073 Py_ssize_t i; 8074 8075 for (i = 0; i < len; ++i) { 8076 ch = PyUnicode_READ(kind, data, i); 8077 fixed = 0; 8078 if (ch > 127) { 8079 if (Py_UNICODE_ISSPACE(ch)) 8080 fixed = ' '; 8081 else { 8082 const int decimal = Py_UNICODE_TODECIMAL(ch); 8083 if (decimal >= 0) 8084 fixed = '0' + decimal; 8085 } 8086 if (fixed != 0) { 8087 if (fixed > maxchar) 8088 maxchar = fixed; 8089 PyUnicode_WRITE(kind, data, i, fixed); 8090 } 8091 else if (ch > maxchar) 8092 maxchar = ch; 8093 } 8094 else if (ch > maxchar) 8095 maxchar = ch; 8096 } 8097 8098 return maxchar; 8099} 8100 8101PyObject * 8102_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode) 8103{ 8104 if (!PyUnicode_Check(unicode)) { 8105 PyErr_BadInternalCall(); 8106 return NULL; 8107 } 8108 if (PyUnicode_READY(unicode) == -1) 8109 return NULL; 8110 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) { 8111 /* If the string is already ASCII, just return the same string */ 8112 Py_INCREF(unicode); 8113 return unicode; 8114 } 8115 return fixup((PyUnicodeObject *)unicode, fix_decimal_and_space_to_ascii); 8116} 8117 8118PyObject * 8119PyUnicode_TransformDecimalToASCII(Py_UNICODE *s, 8120 Py_ssize_t length) 8121{ 8122 PyObject *result; 8123 Py_UNICODE *p; /* write pointer into result */ 8124 Py_ssize_t i; 8125 /* Copy to a new string */ 8126 result = (PyObject *)_PyUnicode_New(length); 8127 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(result), s, length); 8128 if (result == NULL) 8129 return result; 8130 p = PyUnicode_AS_UNICODE(result); 8131 /* Iterate over code points */ 8132 for (i = 0; i < length; i++) { 8133 Py_UNICODE ch =s[i]; 8134 if (ch > 127) { 8135 int decimal = Py_UNICODE_TODECIMAL(ch); 8136 if (decimal >= 0) 8137 p[i] = '0' + decimal; 8138 } 8139 } 8140#ifndef DONT_MAKE_RESULT_READY 8141 if (_PyUnicode_READY_REPLACE(&result)) { 8142 Py_DECREF(result); 8143 return NULL; 8144 } 8145#endif 8146 return result; 8147} 8148/* --- Decimal Encoder ---------------------------------------------------- */ 8149 8150int 8151PyUnicode_EncodeDecimal(Py_UNICODE *s, 8152 Py_ssize_t length, 8153 char *output, 8154 const char *errors) 8155{ 8156 Py_UNICODE *p, *end; 8157 PyObject *errorHandler = NULL; 8158 PyObject *exc = NULL; 8159 const char *encoding = "decimal"; 8160 const char *reason = "invalid decimal Unicode string"; 8161 /* the following variable is used for caching string comparisons 8162 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */ 8163 int known_errorHandler = -1; 8164 8165 if (output == NULL) { 8166 PyErr_BadArgument(); 8167 return -1; 8168 } 8169 8170 p = s; 8171 end = s + length; 8172 while (p < end) { 8173 register Py_UNICODE ch = *p; 8174 int decimal; 8175 PyObject *repunicode; 8176 Py_ssize_t repsize; 8177 Py_ssize_t newpos; 8178 Py_UNICODE *uni2; 8179 Py_UNICODE *collstart; 8180 Py_UNICODE *collend; 8181 8182 if (Py_UNICODE_ISSPACE(ch)) { 8183 *output++ = ' '; 8184 ++p; 8185 continue; 8186 } 8187 decimal = Py_UNICODE_TODECIMAL(ch); 8188 if (decimal >= 0) { 8189 *output++ = '0' + decimal; 8190 ++p; 8191 continue; 8192 } 8193 if (0 < ch && ch < 256) { 8194 *output++ = (char)ch; 8195 ++p; 8196 continue; 8197 } 8198 /* All other characters are considered unencodable */ 8199 collstart = p; 8200 collend = p+1; 8201 while (collend < end) { 8202 if ((0 < *collend && *collend < 256) || 8203 !Py_UNICODE_ISSPACE(*collend) || 8204 Py_UNICODE_TODECIMAL(*collend)) 8205 break; 8206 } 8207 /* cache callback name lookup 8208 * (if not done yet, i.e. it's the first error) */ 8209 if (known_errorHandler==-1) { 8210 if ((errors==NULL) || (!strcmp(errors, "strict"))) 8211 known_errorHandler = 1; 8212 else if (!strcmp(errors, "replace")) 8213 known_errorHandler = 2; 8214 else if (!strcmp(errors, "ignore")) 8215 known_errorHandler = 3; 8216 else if (!strcmp(errors, "xmlcharrefreplace")) 8217 known_errorHandler = 4; 8218 else 8219 known_errorHandler = 0; 8220 } 8221 switch (known_errorHandler) { 8222 case 1: /* strict */ 8223 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason); 8224 goto onError; 8225 case 2: /* replace */ 8226 for (p = collstart; p < collend; ++p) 8227 *output++ = '?'; 8228 /* fall through */ 8229 case 3: /* ignore */ 8230 p = collend; 8231 break; 8232 case 4: /* xmlcharrefreplace */ 8233 /* generate replacement (temporarily (mis)uses p) */ 8234 for (p = collstart; p < collend; ++p) 8235 output += sprintf(output, "&#%d;", (int)*p); 8236 p = collend; 8237 break; 8238 default: 8239 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler, 8240 encoding, reason, s, length, &exc, 8241 collstart-s, collend-s, &newpos); 8242 if (repunicode == NULL) 8243 goto onError; 8244 if (!PyUnicode_Check(repunicode)) { 8245 /* Byte results not supported, since they have no decimal property. */ 8246 PyErr_SetString(PyExc_TypeError, "error handler should return unicode"); 8247 Py_DECREF(repunicode); 8248 goto onError; 8249 } 8250 /* generate replacement */ 8251 repsize = PyUnicode_GET_SIZE(repunicode); 8252 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) { 8253 Py_UNICODE ch = *uni2; 8254 if (Py_UNICODE_ISSPACE(ch)) 8255 *output++ = ' '; 8256 else { 8257 decimal = Py_UNICODE_TODECIMAL(ch); 8258 if (decimal >= 0) 8259 *output++ = '0' + decimal; 8260 else if (0 < ch && ch < 256) 8261 *output++ = (char)ch; 8262 else { 8263 Py_DECREF(repunicode); 8264 raise_encode_exception(&exc, encoding, 8265 s, length, collstart-s, collend-s, reason); 8266 goto onError; 8267 } 8268 } 8269 } 8270 p = s + newpos; 8271 Py_DECREF(repunicode); 8272 } 8273 } 8274 /* 0-terminate the output string */ 8275 *output++ = '\0'; 8276 Py_XDECREF(exc); 8277 Py_XDECREF(errorHandler); 8278 return 0; 8279 8280 onError: 8281 Py_XDECREF(exc); 8282 Py_XDECREF(errorHandler); 8283 return -1; 8284} 8285 8286/* --- Helpers ------------------------------------------------------------ */ 8287 8288#include "stringlib/ucs1lib.h" 8289#include "stringlib/fastsearch.h" 8290#include "stringlib/partition.h" 8291#include "stringlib/split.h" 8292#include "stringlib/count.h" 8293#include "stringlib/find.h" 8294#include "stringlib/localeutil.h" 8295#include "stringlib/undef.h" 8296 8297#include "stringlib/ucs2lib.h" 8298#include "stringlib/fastsearch.h" 8299#include "stringlib/partition.h" 8300#include "stringlib/split.h" 8301#include "stringlib/count.h" 8302#include "stringlib/find.h" 8303#include "stringlib/localeutil.h" 8304#include "stringlib/undef.h" 8305 8306#include "stringlib/ucs4lib.h" 8307#include "stringlib/fastsearch.h" 8308#include "stringlib/partition.h" 8309#include "stringlib/split.h" 8310#include "stringlib/count.h" 8311#include "stringlib/find.h" 8312#include "stringlib/localeutil.h" 8313#include "stringlib/undef.h" 8314 8315static Py_ssize_t 8316any_find_slice(Py_ssize_t Py_LOCAL_CALLBACK(ucs1)(const Py_UCS1*, Py_ssize_t, 8317 const Py_UCS1*, Py_ssize_t, 8318 Py_ssize_t, Py_ssize_t), 8319 Py_ssize_t Py_LOCAL_CALLBACK(ucs2)(const Py_UCS2*, Py_ssize_t, 8320 const Py_UCS2*, Py_ssize_t, 8321 Py_ssize_t, Py_ssize_t), 8322 Py_ssize_t Py_LOCAL_CALLBACK(ucs4)(const Py_UCS4*, Py_ssize_t, 8323 const Py_UCS4*, Py_ssize_t, 8324 Py_ssize_t, Py_ssize_t), 8325 PyObject* s1, PyObject* s2, 8326 Py_ssize_t start, 8327 Py_ssize_t end) 8328{ 8329 int kind1, kind2, kind; 8330 void *buf1, *buf2; 8331 Py_ssize_t len1, len2, result; 8332 8333 kind1 = PyUnicode_KIND(s1); 8334 kind2 = PyUnicode_KIND(s2); 8335 kind = kind1 > kind2 ? kind1 : kind2; 8336 buf1 = PyUnicode_DATA(s1); 8337 buf2 = PyUnicode_DATA(s2); 8338 if (kind1 != kind) 8339 buf1 = _PyUnicode_AsKind(s1, kind); 8340 if (!buf1) 8341 return -2; 8342 if (kind2 != kind) 8343 buf2 = _PyUnicode_AsKind(s2, kind); 8344 if (!buf2) { 8345 if (kind1 != kind) PyMem_Free(buf1); 8346 return -2; 8347 } 8348 len1 = PyUnicode_GET_LENGTH(s1); 8349 len2 = PyUnicode_GET_LENGTH(s2); 8350 8351 switch(kind) { 8352 case PyUnicode_1BYTE_KIND: 8353 result = ucs1(buf1, len1, buf2, len2, start, end); 8354 break; 8355 case PyUnicode_2BYTE_KIND: 8356 result = ucs2(buf1, len1, buf2, len2, start, end); 8357 break; 8358 case PyUnicode_4BYTE_KIND: 8359 result = ucs4(buf1, len1, buf2, len2, start, end); 8360 break; 8361 default: 8362 assert(0); result = -2; 8363 } 8364 8365 if (kind1 != kind) 8366 PyMem_Free(buf1); 8367 if (kind2 != kind) 8368 PyMem_Free(buf2); 8369 8370 return result; 8371} 8372 8373Py_ssize_t 8374_PyUnicode_InsertThousandsGrouping(int kind, void *data, 8375 Py_ssize_t n_buffer, 8376 void *digits, Py_ssize_t n_digits, 8377 Py_ssize_t min_width, 8378 const char *grouping, 8379 const char *thousands_sep) 8380{ 8381 switch(kind) { 8382 case PyUnicode_1BYTE_KIND: 8383 return _PyUnicode_ucs1_InsertThousandsGrouping( 8384 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits, 8385 min_width, grouping, thousands_sep); 8386 case PyUnicode_2BYTE_KIND: 8387 return _PyUnicode_ucs2_InsertThousandsGrouping( 8388 (Py_UCS2*)data, n_buffer, (Py_UCS2*)digits, n_digits, 8389 min_width, grouping, thousands_sep); 8390 case PyUnicode_4BYTE_KIND: 8391 return _PyUnicode_ucs4_InsertThousandsGrouping( 8392 (Py_UCS4*)data, n_buffer, (Py_UCS4*)digits, n_digits, 8393 min_width, grouping, thousands_sep); 8394 } 8395 assert(0); 8396 return -1; 8397} 8398 8399 8400#include "stringlib/unicodedefs.h" 8401#include "stringlib/fastsearch.h" 8402 8403#include "stringlib/count.h" 8404#include "stringlib/find.h" 8405 8406/* helper macro to fixup start/end slice values */ 8407#define ADJUST_INDICES(start, end, len) \ 8408 if (end > len) \ 8409 end = len; \ 8410 else if (end < 0) { \ 8411 end += len; \ 8412 if (end < 0) \ 8413 end = 0; \ 8414 } \ 8415 if (start < 0) { \ 8416 start += len; \ 8417 if (start < 0) \ 8418 start = 0; \ 8419 } 8420 8421Py_ssize_t 8422PyUnicode_Count(PyObject *str, 8423 PyObject *substr, 8424 Py_ssize_t start, 8425 Py_ssize_t end) 8426{ 8427 Py_ssize_t result; 8428 PyUnicodeObject* str_obj; 8429 PyUnicodeObject* sub_obj; 8430 int kind1, kind2, kind; 8431 void *buf1 = NULL, *buf2 = NULL; 8432 Py_ssize_t len1, len2; 8433 8434 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str); 8435 if (!str_obj || PyUnicode_READY(str_obj) == -1) 8436 return -1; 8437 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr); 8438 if (!sub_obj || PyUnicode_READY(sub_obj) == -1) { 8439 Py_DECREF(str_obj); 8440 return -1; 8441 } 8442 8443 kind1 = PyUnicode_KIND(str_obj); 8444 kind2 = PyUnicode_KIND(sub_obj); 8445 kind = kind1 > kind2 ? kind1 : kind2; 8446 buf1 = PyUnicode_DATA(str_obj); 8447 if (kind1 != kind) 8448 buf1 = _PyUnicode_AsKind((PyObject*)str_obj, kind); 8449 if (!buf1) 8450 goto onError; 8451 buf2 = PyUnicode_DATA(sub_obj); 8452 if (kind2 != kind) 8453 buf2 = _PyUnicode_AsKind((PyObject*)sub_obj, kind); 8454 if (!buf2) 8455 goto onError; 8456 len1 = PyUnicode_GET_LENGTH(str_obj); 8457 len2 = PyUnicode_GET_LENGTH(sub_obj); 8458 8459 ADJUST_INDICES(start, end, len1); 8460 switch(kind) { 8461 case PyUnicode_1BYTE_KIND: 8462 result = ucs1lib_count( 8463 ((Py_UCS1*)buf1) + start, end - start, 8464 buf2, len2, PY_SSIZE_T_MAX 8465 ); 8466 break; 8467 case PyUnicode_2BYTE_KIND: 8468 result = ucs2lib_count( 8469 ((Py_UCS2*)buf1) + start, end - start, 8470 buf2, len2, PY_SSIZE_T_MAX 8471 ); 8472 break; 8473 case PyUnicode_4BYTE_KIND: 8474 result = ucs4lib_count( 8475 ((Py_UCS4*)buf1) + start, end - start, 8476 buf2, len2, PY_SSIZE_T_MAX 8477 ); 8478 break; 8479 default: 8480 assert(0); result = 0; 8481 } 8482 8483 Py_DECREF(sub_obj); 8484 Py_DECREF(str_obj); 8485 8486 if (kind1 != kind) 8487 PyMem_Free(buf1); 8488 if (kind2 != kind) 8489 PyMem_Free(buf2); 8490 8491 return result; 8492 onError: 8493 Py_DECREF(sub_obj); 8494 Py_DECREF(str_obj); 8495 if (kind1 != kind && buf1) 8496 PyMem_Free(buf1); 8497 if (kind2 != kind && buf2) 8498 PyMem_Free(buf2); 8499 return -1; 8500} 8501 8502Py_ssize_t 8503PyUnicode_Find(PyObject *str, 8504 PyObject *sub, 8505 Py_ssize_t start, 8506 Py_ssize_t end, 8507 int direction) 8508{ 8509 Py_ssize_t result; 8510 8511 str = PyUnicode_FromObject(str); 8512 if (!str || PyUnicode_READY(str) == -1) 8513 return -2; 8514 sub = PyUnicode_FromObject(sub); 8515 if (!sub || PyUnicode_READY(sub) == -1) { 8516 Py_DECREF(str); 8517 return -2; 8518 } 8519 8520 if (direction > 0) 8521 result = any_find_slice( 8522 ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice, 8523 str, sub, start, end 8524 ); 8525 else 8526 result = any_find_slice( 8527 ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice, 8528 str, sub, start, end 8529 ); 8530 8531 Py_DECREF(str); 8532 Py_DECREF(sub); 8533 8534 return result; 8535} 8536 8537Py_ssize_t 8538PyUnicode_FindChar(PyObject *str, Py_UCS4 ch, 8539 Py_ssize_t start, Py_ssize_t end, 8540 int direction) 8541{ 8542 char *result; 8543 int kind; 8544 if (PyUnicode_READY(str) == -1) 8545 return -2; 8546 if (start < 0 || end < 0) { 8547 PyErr_SetString(PyExc_IndexError, "string index out of range"); 8548 return -2; 8549 } 8550 if (end > PyUnicode_GET_LENGTH(str)) 8551 end = PyUnicode_GET_LENGTH(str); 8552 kind = PyUnicode_KIND(str); 8553 result = findchar(PyUnicode_1BYTE_DATA(str) 8554 + PyUnicode_KIND_SIZE(kind, start), 8555 kind, 8556 end-start, ch, direction); 8557 if (!result) 8558 return -1; 8559 return (result-(char*)PyUnicode_DATA(str)) >> (kind-1); 8560} 8561 8562static int 8563tailmatch(PyUnicodeObject *self, 8564 PyUnicodeObject *substring, 8565 Py_ssize_t start, 8566 Py_ssize_t end, 8567 int direction) 8568{ 8569 int kind_self; 8570 int kind_sub; 8571 void *data_self; 8572 void *data_sub; 8573 Py_ssize_t offset; 8574 Py_ssize_t i; 8575 Py_ssize_t end_sub; 8576 8577 if (PyUnicode_READY(self) == -1 || 8578 PyUnicode_READY(substring) == -1) 8579 return 0; 8580 8581 if (PyUnicode_GET_LENGTH(substring) == 0) 8582 return 1; 8583 8584 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self)); 8585 end -= PyUnicode_GET_LENGTH(substring); 8586 if (end < start) 8587 return 0; 8588 8589 kind_self = PyUnicode_KIND(self); 8590 data_self = PyUnicode_DATA(self); 8591 kind_sub = PyUnicode_KIND(substring); 8592 data_sub = PyUnicode_DATA(substring); 8593 end_sub = PyUnicode_GET_LENGTH(substring) - 1; 8594 8595 if (direction > 0) 8596 offset = end; 8597 else 8598 offset = start; 8599 8600 if (PyUnicode_READ(kind_self, data_self, offset) == 8601 PyUnicode_READ(kind_sub, data_sub, 0) && 8602 PyUnicode_READ(kind_self, data_self, offset + end_sub) == 8603 PyUnicode_READ(kind_sub, data_sub, end_sub)) { 8604 /* If both are of the same kind, memcmp is sufficient */ 8605 if (kind_self == kind_sub) { 8606 return ! memcmp((char *)data_self + 8607 (offset * PyUnicode_CHARACTER_SIZE(substring)), 8608 data_sub, 8609 PyUnicode_GET_LENGTH(substring) * 8610 PyUnicode_CHARACTER_SIZE(substring)); 8611 } 8612 /* otherwise we have to compare each character by first accesing it */ 8613 else { 8614 /* We do not need to compare 0 and len(substring)-1 because 8615 the if statement above ensured already that they are equal 8616 when we end up here. */ 8617 // TODO: honor direction and do a forward or backwards search 8618 for (i = 1; i < end_sub; ++i) { 8619 if (PyUnicode_READ(kind_self, data_self, offset + i) != 8620 PyUnicode_READ(kind_sub, data_sub, i)) 8621 return 0; 8622 } 8623 return 1; 8624 } 8625 } 8626 8627 return 0; 8628} 8629 8630Py_ssize_t 8631PyUnicode_Tailmatch(PyObject *str, 8632 PyObject *substr, 8633 Py_ssize_t start, 8634 Py_ssize_t end, 8635 int direction) 8636{ 8637 Py_ssize_t result; 8638 8639 str = PyUnicode_FromObject(str); 8640 if (str == NULL) 8641 return -1; 8642 substr = PyUnicode_FromObject(substr); 8643 if (substr == NULL) { 8644 Py_DECREF(str); 8645 return -1; 8646 } 8647 8648 result = tailmatch((PyUnicodeObject *)str, 8649 (PyUnicodeObject *)substr, 8650 start, end, direction); 8651 Py_DECREF(str); 8652 Py_DECREF(substr); 8653 return result; 8654} 8655 8656/* Apply fixfct filter to the Unicode object self and return a 8657 reference to the modified object */ 8658 8659static PyObject * 8660fixup(PyUnicodeObject *self, 8661 Py_UCS4 (*fixfct)(PyUnicodeObject *s)) 8662{ 8663 PyObject *u; 8664 Py_UCS4 maxchar_old, maxchar_new = 0; 8665 8666 if (PyUnicode_READY(self) == -1) 8667 return NULL; 8668 maxchar_old = PyUnicode_MAX_CHAR_VALUE(self); 8669 u = PyUnicode_New(PyUnicode_GET_LENGTH(self), 8670 maxchar_old); 8671 if (u == NULL) 8672 return NULL; 8673 8674 Py_MEMCPY(PyUnicode_1BYTE_DATA(u), PyUnicode_1BYTE_DATA(self), 8675 PyUnicode_GET_LENGTH(u) * PyUnicode_CHARACTER_SIZE(u)); 8676 8677 /* fix functions return the new maximum character in a string, 8678 if the kind of the resulting unicode object does not change, 8679 everything is fine. Otherwise we need to change the string kind 8680 and re-run the fix function. */ 8681 maxchar_new = fixfct((PyUnicodeObject*)u); 8682 if (maxchar_new == 0) 8683 /* do nothing, keep maxchar_new at 0 which means no changes. */; 8684 else if (maxchar_new <= 127) 8685 maxchar_new = 127; 8686 else if (maxchar_new <= 255) 8687 maxchar_new = 255; 8688 else if (maxchar_new <= 65535) 8689 maxchar_new = 65535; 8690 else 8691 maxchar_new = 1114111; /* 0x10ffff */ 8692 8693 if (!maxchar_new && PyUnicode_CheckExact(self)) { 8694 /* fixfct should return TRUE if it modified the buffer. If 8695 FALSE, return a reference to the original buffer instead 8696 (to save space, not time) */ 8697 Py_INCREF(self); 8698 Py_DECREF(u); 8699 return (PyObject*) self; 8700 } 8701 else if (maxchar_new == maxchar_old) { 8702 return u; 8703 } 8704 else { 8705 /* In case the maximum character changed, we need to 8706 convert the string to the new category. */ 8707 PyObject *v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new); 8708 if (v == NULL) { 8709 Py_DECREF(u); 8710 return NULL; 8711 } 8712 if (maxchar_new > maxchar_old) { 8713 /* If the maxchar increased so that the kind changed, not all 8714 characters are representable anymore and we need to fix the 8715 string again. This only happens in very few cases. */ 8716 if (PyUnicode_CopyCharacters(v, 0, 8717 (PyObject*)self, 0, 8718 PyUnicode_GET_LENGTH(self)) < 0) 8719 { 8720 Py_DECREF(u); 8721 return NULL; 8722 } 8723 maxchar_old = fixfct((PyUnicodeObject*)v); 8724 assert(maxchar_old > 0 && maxchar_old <= maxchar_new); 8725 } 8726 else { 8727 if (PyUnicode_CopyCharacters(v, 0, 8728 u, 0, 8729 PyUnicode_GET_LENGTH(self)) < 0) 8730 { 8731 Py_DECREF(u); 8732 return NULL; 8733 } 8734 } 8735 8736 Py_DECREF(u); 8737 return v; 8738 } 8739} 8740 8741static Py_UCS4 8742fixupper(PyUnicodeObject *self) 8743{ 8744 /* No need to call PyUnicode_READY(self) because this function is only 8745 called as a callback from fixup() which does it already. */ 8746 const Py_ssize_t len = PyUnicode_GET_LENGTH(self); 8747 const int kind = PyUnicode_KIND(self); 8748 void *data = PyUnicode_DATA(self); 8749 int touched = 0; 8750 Py_UCS4 maxchar = 0; 8751 Py_ssize_t i; 8752 8753 for (i = 0; i < len; ++i) { 8754 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); 8755 const Py_UCS4 up = Py_UNICODE_TOUPPER(ch); 8756 if (up != ch) { 8757 if (up > maxchar) 8758 maxchar = up; 8759 PyUnicode_WRITE(kind, data, i, up); 8760 touched = 1; 8761 } 8762 else if (ch > maxchar) 8763 maxchar = ch; 8764 } 8765 8766 if (touched) 8767 return maxchar; 8768 else 8769 return 0; 8770} 8771 8772static Py_UCS4 8773fixlower(PyUnicodeObject *self) 8774{ 8775 /* No need to call PyUnicode_READY(self) because fixup() which does it. */ 8776 const Py_ssize_t len = PyUnicode_GET_LENGTH(self); 8777 const int kind = PyUnicode_KIND(self); 8778 void *data = PyUnicode_DATA(self); 8779 int touched = 0; 8780 Py_UCS4 maxchar = 0; 8781 Py_ssize_t i; 8782 8783 for(i = 0; i < len; ++i) { 8784 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); 8785 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch); 8786 if (lo != ch) { 8787 if (lo > maxchar) 8788 maxchar = lo; 8789 PyUnicode_WRITE(kind, data, i, lo); 8790 touched = 1; 8791 } 8792 else if (ch > maxchar) 8793 maxchar = ch; 8794 } 8795 8796 if (touched) 8797 return maxchar; 8798 else 8799 return 0; 8800} 8801 8802static Py_UCS4 8803fixswapcase(PyUnicodeObject *self) 8804{ 8805 /* No need to call PyUnicode_READY(self) because fixup() which does it. */ 8806 const Py_ssize_t len = PyUnicode_GET_LENGTH(self); 8807 const int kind = PyUnicode_KIND(self); 8808 void *data = PyUnicode_DATA(self); 8809 int touched = 0; 8810 Py_UCS4 maxchar = 0; 8811 Py_ssize_t i; 8812 8813 for(i = 0; i < len; ++i) { 8814 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); 8815 Py_UCS4 nu = 0; 8816 8817 if (Py_UNICODE_ISUPPER(ch)) 8818 nu = Py_UNICODE_TOLOWER(ch); 8819 else if (Py_UNICODE_ISLOWER(ch)) 8820 nu = Py_UNICODE_TOUPPER(ch); 8821 8822 if (nu != 0) { 8823 if (nu > maxchar) 8824 maxchar = nu; 8825 PyUnicode_WRITE(kind, data, i, nu); 8826 touched = 1; 8827 } 8828 else if (ch > maxchar) 8829 maxchar = ch; 8830 } 8831 8832 if (touched) 8833 return maxchar; 8834 else 8835 return 0; 8836} 8837 8838static Py_UCS4 8839fixcapitalize(PyUnicodeObject *self) 8840{ 8841 /* No need to call PyUnicode_READY(self) because fixup() which does it. */ 8842 const Py_ssize_t len = PyUnicode_GET_LENGTH(self); 8843 const int kind = PyUnicode_KIND(self); 8844 void *data = PyUnicode_DATA(self); 8845 int touched = 0; 8846 Py_UCS4 maxchar = 0; 8847 Py_ssize_t i = 0; 8848 Py_UCS4 ch; 8849 8850 if (len == 0) 8851 return 0; 8852 8853 ch = PyUnicode_READ(kind, data, i); 8854 if (!Py_UNICODE_ISUPPER(ch)) { 8855 maxchar = Py_UNICODE_TOUPPER(ch); 8856 PyUnicode_WRITE(kind, data, i, maxchar); 8857 touched = 1; 8858 } 8859 ++i; 8860 for(; i < len; ++i) { 8861 ch = PyUnicode_READ(kind, data, i); 8862 if (!Py_UNICODE_ISLOWER(ch)) { 8863 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch); 8864 if (lo > maxchar) 8865 maxchar = lo; 8866 PyUnicode_WRITE(kind, data, i, lo); 8867 touched = 1; 8868 } 8869 else if (ch > maxchar) 8870 maxchar = ch; 8871 } 8872 8873 if (touched) 8874 return maxchar; 8875 else 8876 return 0; 8877} 8878 8879static Py_UCS4 8880fixtitle(PyUnicodeObject *self) 8881{ 8882 /* No need to call PyUnicode_READY(self) because fixup() which does it. */ 8883 const Py_ssize_t len = PyUnicode_GET_LENGTH(self); 8884 const int kind = PyUnicode_KIND(self); 8885 void *data = PyUnicode_DATA(self); 8886 Py_UCS4 maxchar = 0; 8887 Py_ssize_t i = 0; 8888 int previous_is_cased; 8889 8890 /* Shortcut for single character strings */ 8891 if (len == 1) { 8892 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); 8893 const Py_UCS4 ti = Py_UNICODE_TOTITLE(ch); 8894 if (ti != ch) { 8895 PyUnicode_WRITE(kind, data, i, ti); 8896 return ti; 8897 } 8898 else 8899 return 0; 8900 } 8901 previous_is_cased = 0; 8902 for(; i < len; ++i) { 8903 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); 8904 Py_UCS4 nu; 8905 8906 if (previous_is_cased) 8907 nu = Py_UNICODE_TOLOWER(ch); 8908 else 8909 nu = Py_UNICODE_TOTITLE(ch); 8910 8911 if (nu > maxchar) 8912 maxchar = nu; 8913 PyUnicode_WRITE(kind, data, i, nu); 8914 8915 if (Py_UNICODE_ISLOWER(ch) || 8916 Py_UNICODE_ISUPPER(ch) || 8917 Py_UNICODE_ISTITLE(ch)) 8918 previous_is_cased = 1; 8919 else 8920 previous_is_cased = 0; 8921 } 8922 return maxchar; 8923} 8924 8925PyObject * 8926PyUnicode_Join(PyObject *separator, PyObject *seq) 8927{ 8928 PyObject *sep = NULL; 8929 Py_ssize_t seplen = 1; 8930 PyObject *res = NULL; /* the result */ 8931 PyObject *fseq; /* PySequence_Fast(seq) */ 8932 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */ 8933 PyObject **items; 8934 PyObject *item; 8935 Py_ssize_t sz, i, res_offset; 8936 Py_UCS4 maxchar = 0; 8937 Py_UCS4 item_maxchar; 8938 8939 fseq = PySequence_Fast(seq, ""); 8940 if (fseq == NULL) { 8941 return NULL; 8942 } 8943 8944 /* NOTE: the following code can't call back into Python code, 8945 * so we are sure that fseq won't be mutated. 8946 */ 8947 8948 seqlen = PySequence_Fast_GET_SIZE(fseq); 8949 /* If empty sequence, return u"". */ 8950 if (seqlen == 0) { 8951 res = PyUnicode_New(0, 0); 8952 goto Done; 8953 } 8954 items = PySequence_Fast_ITEMS(fseq); 8955 /* If singleton sequence with an exact Unicode, return that. */ 8956 if (seqlen == 1) { 8957 item = items[0]; 8958 if (PyUnicode_CheckExact(item)) { 8959 Py_INCREF(item); 8960 res = item; 8961 goto Done; 8962 } 8963 } 8964 else { 8965 /* Set up sep and seplen */ 8966 if (separator == NULL) { 8967 /* fall back to a blank space separator */ 8968 sep = PyUnicode_FromOrdinal(' '); 8969 if (!sep) 8970 goto onError; 8971 } 8972 else { 8973 if (!PyUnicode_Check(separator)) { 8974 PyErr_Format(PyExc_TypeError, 8975 "separator: expected str instance," 8976 " %.80s found", 8977 Py_TYPE(separator)->tp_name); 8978 goto onError; 8979 } 8980 if (PyUnicode_READY(separator)) 8981 goto onError; 8982 sep = separator; 8983 seplen = PyUnicode_GET_LENGTH(separator); 8984 maxchar = PyUnicode_MAX_CHAR_VALUE(separator); 8985 /* inc refcount to keep this code path symmetric with the 8986 above case of a blank separator */ 8987 Py_INCREF(sep); 8988 } 8989 } 8990 8991 /* There are at least two things to join, or else we have a subclass 8992 * of str in the sequence. 8993 * Do a pre-pass to figure out the total amount of space we'll 8994 * need (sz), and see whether all argument are strings. 8995 */ 8996 sz = 0; 8997 for (i = 0; i < seqlen; i++) { 8998 const Py_ssize_t old_sz = sz; 8999 item = items[i]; 9000 if (!PyUnicode_Check(item)) { 9001 PyErr_Format(PyExc_TypeError, 9002 "sequence item %zd: expected str instance," 9003 " %.80s found", 9004 i, Py_TYPE(item)->tp_name); 9005 goto onError; 9006 } 9007 if (PyUnicode_READY(item) == -1) 9008 goto onError; 9009 sz += PyUnicode_GET_LENGTH(item); 9010 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item); 9011 if (item_maxchar > maxchar) 9012 maxchar = item_maxchar; 9013 if (i != 0) 9014 sz += seplen; 9015 if (sz < old_sz || sz > PY_SSIZE_T_MAX) { 9016 PyErr_SetString(PyExc_OverflowError, 9017 "join() result is too long for a Python string"); 9018 goto onError; 9019 } 9020 } 9021 9022 res = PyUnicode_New(sz, maxchar); 9023 if (res == NULL) 9024 goto onError; 9025 9026 /* Catenate everything. */ 9027 for (i = 0, res_offset = 0; i < seqlen; ++i) { 9028 Py_ssize_t itemlen, copied; 9029 item = items[i]; 9030 /* Copy item, and maybe the separator. */ 9031 if (i && seplen != 0) { 9032 copied = PyUnicode_CopyCharacters(res, res_offset, 9033 sep, 0, seplen); 9034 if (copied < 0) 9035 goto onError; 9036#ifdef Py_DEBUG 9037 res_offset += copied; 9038#else 9039 res_offset += seplen; 9040#endif 9041 } 9042 itemlen = PyUnicode_GET_LENGTH(item); 9043 if (itemlen != 0) { 9044 copied = PyUnicode_CopyCharacters(res, res_offset, 9045 item, 0, itemlen); 9046 if (copied < 0) 9047 goto onError; 9048#ifdef Py_DEBUG 9049 res_offset += copied; 9050#else 9051 res_offset += itemlen; 9052#endif 9053 } 9054 } 9055 assert(res_offset == PyUnicode_GET_LENGTH(res)); 9056 9057 Done: 9058 Py_DECREF(fseq); 9059 Py_XDECREF(sep); 9060 return res; 9061 9062 onError: 9063 Py_DECREF(fseq); 9064 Py_XDECREF(sep); 9065 Py_XDECREF(res); 9066 return NULL; 9067} 9068 9069#define FILL(kind, data, value, start, length) \ 9070 do { \ 9071 Py_ssize_t i_ = 0; \ 9072 assert(kind != PyUnicode_WCHAR_KIND); \ 9073 switch ((kind)) { \ 9074 case PyUnicode_1BYTE_KIND: { \ 9075 unsigned char * to_ = (unsigned char *)((data)) + (start); \ 9076 memset(to_, (unsigned char)value, length); \ 9077 break; \ 9078 } \ 9079 case PyUnicode_2BYTE_KIND: { \ 9080 Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \ 9081 for (; i_ < (length); ++i_, ++to_) *to_ = (value); \ 9082 break; \ 9083 } \ 9084 default: { \ 9085 Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \ 9086 for (; i_ < (length); ++i_, ++to_) *to_ = (value); \ 9087 break; \ 9088 } \ 9089 } \ 9090 } while (0) 9091 9092static PyUnicodeObject * 9093pad(PyUnicodeObject *self, 9094 Py_ssize_t left, 9095 Py_ssize_t right, 9096 Py_UCS4 fill) 9097{ 9098 PyObject *u; 9099 Py_UCS4 maxchar; 9100 int kind; 9101 void *data; 9102 9103 if (left < 0) 9104 left = 0; 9105 if (right < 0) 9106 right = 0; 9107 9108 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) { 9109 Py_INCREF(self); 9110 return self; 9111 } 9112 9113 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) || 9114 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) { 9115 PyErr_SetString(PyExc_OverflowError, "padded string is too long"); 9116 return NULL; 9117 } 9118 maxchar = PyUnicode_MAX_CHAR_VALUE(self); 9119 if (fill > maxchar) 9120 maxchar = fill; 9121 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar); 9122 if (!u) 9123 return NULL; 9124 9125 kind = PyUnicode_KIND(u); 9126 data = PyUnicode_DATA(u); 9127 if (left) 9128 FILL(kind, data, fill, 0, left); 9129 if (right) 9130 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right); 9131 if (PyUnicode_CopyCharacters(u, left, 9132 (PyObject*)self, 0, 9133 _PyUnicode_LENGTH(self)) < 0) 9134 { 9135 Py_DECREF(u); 9136 return NULL; 9137 } 9138 9139 return (PyUnicodeObject*)u; 9140} 9141#undef FILL 9142 9143PyObject * 9144PyUnicode_Splitlines(PyObject *string, int keepends) 9145{ 9146 PyObject *list; 9147 9148 string = PyUnicode_FromObject(string); 9149 if (string == NULL || PyUnicode_READY(string) == -1) 9150 return NULL; 9151 9152 switch(PyUnicode_KIND(string)) { 9153 case PyUnicode_1BYTE_KIND: 9154 list = ucs1lib_splitlines( 9155 (PyObject*) string, PyUnicode_1BYTE_DATA(string), 9156 PyUnicode_GET_LENGTH(string), keepends); 9157 break; 9158 case PyUnicode_2BYTE_KIND: 9159 list = ucs2lib_splitlines( 9160 (PyObject*) string, PyUnicode_2BYTE_DATA(string), 9161 PyUnicode_GET_LENGTH(string), keepends); 9162 break; 9163 case PyUnicode_4BYTE_KIND: 9164 list = ucs4lib_splitlines( 9165 (PyObject*) string, PyUnicode_4BYTE_DATA(string), 9166 PyUnicode_GET_LENGTH(string), keepends); 9167 break; 9168 default: 9169 assert(0); 9170 list = 0; 9171 } 9172 Py_DECREF(string); 9173 return list; 9174} 9175 9176static PyObject * 9177split(PyUnicodeObject *self, 9178 PyUnicodeObject *substring, 9179 Py_ssize_t maxcount) 9180{ 9181 int kind1, kind2, kind; 9182 void *buf1, *buf2; 9183 Py_ssize_t len1, len2; 9184 PyObject* out; 9185 9186 if (maxcount < 0) 9187 maxcount = PY_SSIZE_T_MAX; 9188 9189 if (PyUnicode_READY(self) == -1) 9190 return NULL; 9191 9192 if (substring == NULL) 9193 switch(PyUnicode_KIND(self)) { 9194 case PyUnicode_1BYTE_KIND: 9195 return ucs1lib_split_whitespace( 9196 (PyObject*) self, PyUnicode_1BYTE_DATA(self), 9197 PyUnicode_GET_LENGTH(self), maxcount 9198 ); 9199 case PyUnicode_2BYTE_KIND: 9200 return ucs2lib_split_whitespace( 9201 (PyObject*) self, PyUnicode_2BYTE_DATA(self), 9202 PyUnicode_GET_LENGTH(self), maxcount 9203 ); 9204 case PyUnicode_4BYTE_KIND: 9205 return ucs4lib_split_whitespace( 9206 (PyObject*) self, PyUnicode_4BYTE_DATA(self), 9207 PyUnicode_GET_LENGTH(self), maxcount 9208 ); 9209 default: 9210 assert(0); 9211 return NULL; 9212 } 9213 9214 if (PyUnicode_READY(substring) == -1) 9215 return NULL; 9216 9217 kind1 = PyUnicode_KIND(self); 9218 kind2 = PyUnicode_KIND(substring); 9219 kind = kind1 > kind2 ? kind1 : kind2; 9220 buf1 = PyUnicode_DATA(self); 9221 buf2 = PyUnicode_DATA(substring); 9222 if (kind1 != kind) 9223 buf1 = _PyUnicode_AsKind((PyObject*)self, kind); 9224 if (!buf1) 9225 return NULL; 9226 if (kind2 != kind) 9227 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind); 9228 if (!buf2) { 9229 if (kind1 != kind) PyMem_Free(buf1); 9230 return NULL; 9231 } 9232 len1 = PyUnicode_GET_LENGTH(self); 9233 len2 = PyUnicode_GET_LENGTH(substring); 9234 9235 switch(kind) { 9236 case PyUnicode_1BYTE_KIND: 9237 out = ucs1lib_split( 9238 (PyObject*) self, buf1, len1, buf2, len2, maxcount); 9239 break; 9240 case PyUnicode_2BYTE_KIND: 9241 out = ucs2lib_split( 9242 (PyObject*) self, buf1, len1, buf2, len2, maxcount); 9243 break; 9244 case PyUnicode_4BYTE_KIND: 9245 out = ucs4lib_split( 9246 (PyObject*) self, buf1, len1, buf2, len2, maxcount); 9247 break; 9248 default: 9249 out = NULL; 9250 } 9251 if (kind1 != kind) 9252 PyMem_Free(buf1); 9253 if (kind2 != kind) 9254 PyMem_Free(buf2); 9255 return out; 9256} 9257 9258static PyObject * 9259rsplit(PyUnicodeObject *self, 9260 PyUnicodeObject *substring, 9261 Py_ssize_t maxcount) 9262{ 9263 int kind1, kind2, kind; 9264 void *buf1, *buf2; 9265 Py_ssize_t len1, len2; 9266 PyObject* out; 9267 9268 if (maxcount < 0) 9269 maxcount = PY_SSIZE_T_MAX; 9270 9271 if (PyUnicode_READY(self) == -1) 9272 return NULL; 9273 9274 if (substring == NULL) 9275 switch(PyUnicode_KIND(self)) { 9276 case PyUnicode_1BYTE_KIND: 9277 return ucs1lib_rsplit_whitespace( 9278 (PyObject*) self, PyUnicode_1BYTE_DATA(self), 9279 PyUnicode_GET_LENGTH(self), maxcount 9280 ); 9281 case PyUnicode_2BYTE_KIND: 9282 return ucs2lib_rsplit_whitespace( 9283 (PyObject*) self, PyUnicode_2BYTE_DATA(self), 9284 PyUnicode_GET_LENGTH(self), maxcount 9285 ); 9286 case PyUnicode_4BYTE_KIND: 9287 return ucs4lib_rsplit_whitespace( 9288 (PyObject*) self, PyUnicode_4BYTE_DATA(self), 9289 PyUnicode_GET_LENGTH(self), maxcount 9290 ); 9291 default: 9292 assert(0); 9293 return NULL; 9294 } 9295 9296 if (PyUnicode_READY(substring) == -1) 9297 return NULL; 9298 9299 kind1 = PyUnicode_KIND(self); 9300 kind2 = PyUnicode_KIND(substring); 9301 kind = kind1 > kind2 ? kind1 : kind2; 9302 buf1 = PyUnicode_DATA(self); 9303 buf2 = PyUnicode_DATA(substring); 9304 if (kind1 != kind) 9305 buf1 = _PyUnicode_AsKind((PyObject*)self, kind); 9306 if (!buf1) 9307 return NULL; 9308 if (kind2 != kind) 9309 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind); 9310 if (!buf2) { 9311 if (kind1 != kind) PyMem_Free(buf1); 9312 return NULL; 9313 } 9314 len1 = PyUnicode_GET_LENGTH(self); 9315 len2 = PyUnicode_GET_LENGTH(substring); 9316 9317 switch(kind) { 9318 case PyUnicode_1BYTE_KIND: 9319 out = ucs1lib_rsplit( 9320 (PyObject*) self, buf1, len1, buf2, len2, maxcount); 9321 break; 9322 case PyUnicode_2BYTE_KIND: 9323 out = ucs2lib_rsplit( 9324 (PyObject*) self, buf1, len1, buf2, len2, maxcount); 9325 break; 9326 case PyUnicode_4BYTE_KIND: 9327 out = ucs4lib_rsplit( 9328 (PyObject*) self, buf1, len1, buf2, len2, maxcount); 9329 break; 9330 default: 9331 out = NULL; 9332 } 9333 if (kind1 != kind) 9334 PyMem_Free(buf1); 9335 if (kind2 != kind) 9336 PyMem_Free(buf2); 9337 return out; 9338} 9339 9340static Py_ssize_t 9341anylib_find(int kind, void *buf1, Py_ssize_t len1, 9342 void *buf2, Py_ssize_t len2, Py_ssize_t offset) 9343{ 9344 switch(kind) { 9345 case PyUnicode_1BYTE_KIND: 9346 return ucs1lib_find(buf1, len1, buf2, len2, offset); 9347 case PyUnicode_2BYTE_KIND: 9348 return ucs2lib_find(buf1, len1, buf2, len2, offset); 9349 case PyUnicode_4BYTE_KIND: 9350 return ucs4lib_find(buf1, len1, buf2, len2, offset); 9351 } 9352 assert(0); 9353 return -1; 9354} 9355 9356static Py_ssize_t 9357anylib_count(int kind, void* sbuf, Py_ssize_t slen, 9358 void *buf1, Py_ssize_t len1, Py_ssize_t maxcount) 9359{ 9360 switch(kind) { 9361 case PyUnicode_1BYTE_KIND: 9362 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount); 9363 case PyUnicode_2BYTE_KIND: 9364 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount); 9365 case PyUnicode_4BYTE_KIND: 9366 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount); 9367 } 9368 assert(0); 9369 return 0; 9370} 9371 9372static PyObject * 9373replace(PyObject *self, PyObject *str1, 9374 PyObject *str2, Py_ssize_t maxcount) 9375{ 9376 PyObject *u; 9377 char *sbuf = PyUnicode_DATA(self); 9378 char *buf1 = PyUnicode_DATA(str1); 9379 char *buf2 = PyUnicode_DATA(str2); 9380 int srelease = 0, release1 = 0, release2 = 0; 9381 int skind = PyUnicode_KIND(self); 9382 int kind1 = PyUnicode_KIND(str1); 9383 int kind2 = PyUnicode_KIND(str2); 9384 Py_ssize_t slen = PyUnicode_GET_LENGTH(self); 9385 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1); 9386 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2); 9387 9388 if (maxcount < 0) 9389 maxcount = PY_SSIZE_T_MAX; 9390 else if (maxcount == 0 || slen == 0) 9391 goto nothing; 9392 9393 if (skind < kind1) 9394 /* substring too wide to be present */ 9395 goto nothing; 9396 9397 if (len1 == len2) { 9398 Py_ssize_t i; 9399 /* same length */ 9400 if (len1 == 0) 9401 goto nothing; 9402 if (len1 == 1) { 9403 /* replace characters */ 9404 Py_UCS4 u1, u2, maxchar; 9405 int mayshrink, rkind; 9406 u1 = PyUnicode_READ_CHAR(str1, 0); 9407 if (!findchar(sbuf, PyUnicode_KIND(self), 9408 slen, u1, 1)) 9409 goto nothing; 9410 u2 = PyUnicode_READ_CHAR(str2, 0); 9411 maxchar = PyUnicode_MAX_CHAR_VALUE(self); 9412 /* Replacing u1 with u2 may cause a maxchar reduction in the 9413 result string. */ 9414 if (u2 > maxchar) { 9415 maxchar = u2; 9416 mayshrink = 0; 9417 } 9418 else 9419 mayshrink = maxchar > 127; 9420 u = PyUnicode_New(slen, maxchar); 9421 if (!u) 9422 goto error; 9423 if (PyUnicode_CopyCharacters(u, 0, 9424 (PyObject*)self, 0, slen) < 0) 9425 { 9426 Py_DECREF(u); 9427 return NULL; 9428 } 9429 rkind = PyUnicode_KIND(u); 9430 for (i = 0; i < PyUnicode_GET_LENGTH(u); i++) 9431 if (PyUnicode_READ(rkind, PyUnicode_DATA(u), i) == u1) { 9432 if (--maxcount < 0) 9433 break; 9434 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), i, u2); 9435 } 9436 if (mayshrink) { 9437 PyObject *tmp = u; 9438 u = PyUnicode_FromKindAndData(rkind, PyUnicode_DATA(tmp), 9439 PyUnicode_GET_LENGTH(tmp)); 9440 Py_DECREF(tmp); 9441 } 9442 } else { 9443 int rkind = skind; 9444 char *res; 9445 if (kind1 < rkind) { 9446 /* widen substring */ 9447 buf1 = _PyUnicode_AsKind(str1, rkind); 9448 if (!buf1) goto error; 9449 release1 = 1; 9450 } 9451 i = anylib_find(rkind, sbuf, slen, buf1, len1, 0); 9452 if (i < 0) 9453 goto nothing; 9454 if (rkind > kind2) { 9455 /* widen replacement */ 9456 buf2 = _PyUnicode_AsKind(str2, rkind); 9457 if (!buf2) goto error; 9458 release2 = 1; 9459 } 9460 else if (rkind < kind2) { 9461 /* widen self and buf1 */ 9462 rkind = kind2; 9463 if (release1) PyMem_Free(buf1); 9464 sbuf = _PyUnicode_AsKind(self, rkind); 9465 if (!sbuf) goto error; 9466 srelease = 1; 9467 buf1 = _PyUnicode_AsKind(str1, rkind); 9468 if (!buf1) goto error; 9469 release1 = 1; 9470 } 9471 res = PyMem_Malloc(PyUnicode_KIND_SIZE(rkind, slen)); 9472 if (!res) { 9473 PyErr_NoMemory(); 9474 goto error; 9475 } 9476 memcpy(res, sbuf, PyUnicode_KIND_SIZE(rkind, slen)); 9477 /* change everything in-place, starting with this one */ 9478 memcpy(res + PyUnicode_KIND_SIZE(rkind, i), 9479 buf2, 9480 PyUnicode_KIND_SIZE(rkind, len2)); 9481 i += len1; 9482 9483 while ( --maxcount > 0) { 9484 i = anylib_find(rkind, sbuf+PyUnicode_KIND_SIZE(rkind, i), 9485 slen-i, 9486 buf1, len1, i); 9487 if (i == -1) 9488 break; 9489 memcpy(res + PyUnicode_KIND_SIZE(rkind, i), 9490 buf2, 9491 PyUnicode_KIND_SIZE(rkind, len2)); 9492 i += len1; 9493 } 9494 9495 u = PyUnicode_FromKindAndData(rkind, res, slen); 9496 PyMem_Free(res); 9497 if (!u) goto error; 9498 } 9499 } else { 9500 9501 Py_ssize_t n, i, j, ires; 9502 Py_ssize_t product, new_size; 9503 int rkind = skind; 9504 char *res; 9505 9506 if (kind1 < rkind) { 9507 buf1 = _PyUnicode_AsKind(str1, rkind); 9508 if (!buf1) goto error; 9509 release1 = 1; 9510 } 9511 n = anylib_count(rkind, sbuf, slen, buf1, len1, maxcount); 9512 if (n == 0) 9513 goto nothing; 9514 if (kind2 < rkind) { 9515 buf2 = _PyUnicode_AsKind(str2, rkind); 9516 if (!buf2) goto error; 9517 release2 = 1; 9518 } 9519 else if (kind2 > rkind) { 9520 rkind = kind2; 9521 sbuf = _PyUnicode_AsKind(self, rkind); 9522 if (!sbuf) goto error; 9523 srelease = 1; 9524 if (release1) PyMem_Free(buf1); 9525 buf1 = _PyUnicode_AsKind(str1, rkind); 9526 if (!buf1) goto error; 9527 release1 = 1; 9528 } 9529 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) - 9530 PyUnicode_GET_LENGTH(str1))); */ 9531 product = n * (len2-len1); 9532 if ((product / (len2-len1)) != n) { 9533 PyErr_SetString(PyExc_OverflowError, 9534 "replace string is too long"); 9535 goto error; 9536 } 9537 new_size = slen + product; 9538 if (new_size < 0 || new_size > (PY_SSIZE_T_MAX >> (rkind-1))) { 9539 PyErr_SetString(PyExc_OverflowError, 9540 "replace string is too long"); 9541 goto error; 9542 } 9543 res = PyMem_Malloc(PyUnicode_KIND_SIZE(rkind, new_size)); 9544 if (!res) 9545 goto error; 9546 ires = i = 0; 9547 if (len1 > 0) { 9548 while (n-- > 0) { 9549 /* look for next match */ 9550 j = anylib_find(rkind, 9551 sbuf + PyUnicode_KIND_SIZE(rkind, i), 9552 slen-i, buf1, len1, i); 9553 if (j == -1) 9554 break; 9555 else if (j > i) { 9556 /* copy unchanged part [i:j] */ 9557 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires), 9558 sbuf + PyUnicode_KIND_SIZE(rkind, i), 9559 PyUnicode_KIND_SIZE(rkind, j-i)); 9560 ires += j - i; 9561 } 9562 /* copy substitution string */ 9563 if (len2 > 0) { 9564 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires), 9565 buf2, 9566 PyUnicode_KIND_SIZE(rkind, len2)); 9567 ires += len2; 9568 } 9569 i = j + len1; 9570 } 9571 if (i < slen) 9572 /* copy tail [i:] */ 9573 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires), 9574 sbuf + PyUnicode_KIND_SIZE(rkind, i), 9575 PyUnicode_KIND_SIZE(rkind, slen-i)); 9576 } else { 9577 /* interleave */ 9578 while (n > 0) { 9579 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires), 9580 buf2, 9581 PyUnicode_KIND_SIZE(rkind, len2)); 9582 ires += len2; 9583 if (--n <= 0) 9584 break; 9585 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires), 9586 sbuf + PyUnicode_KIND_SIZE(rkind, i), 9587 PyUnicode_KIND_SIZE(rkind, 1)); 9588 ires++; 9589 i++; 9590 } 9591 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires), 9592 sbuf + PyUnicode_KIND_SIZE(rkind, i), 9593 PyUnicode_KIND_SIZE(rkind, slen-i)); 9594 } 9595 u = PyUnicode_FromKindAndData(rkind, res, new_size); 9596 PyMem_Free(res); 9597 } 9598 if (srelease) 9599 PyMem_FREE(sbuf); 9600 if (release1) 9601 PyMem_FREE(buf1); 9602 if (release2) 9603 PyMem_FREE(buf2); 9604 return u; 9605 9606 nothing: 9607 /* nothing to replace; return original string (when possible) */ 9608 if (srelease) 9609 PyMem_FREE(sbuf); 9610 if (release1) 9611 PyMem_FREE(buf1); 9612 if (release2) 9613 PyMem_FREE(buf2); 9614 if (PyUnicode_CheckExact(self)) { 9615 Py_INCREF(self); 9616 return (PyObject *) self; 9617 } 9618 return PyUnicode_Copy(self); 9619 error: 9620 if (srelease && sbuf) 9621 PyMem_FREE(sbuf); 9622 if (release1 && buf1) 9623 PyMem_FREE(buf1); 9624 if (release2 && buf2) 9625 PyMem_FREE(buf2); 9626 return NULL; 9627} 9628 9629/* --- Unicode Object Methods --------------------------------------------- */ 9630 9631PyDoc_STRVAR(title__doc__, 9632 "S.title() -> str\n\ 9633\n\ 9634Return a titlecased version of S, i.e. words start with title case\n\ 9635characters, all remaining cased characters have lower case."); 9636 9637static PyObject* 9638unicode_title(PyUnicodeObject *self) 9639{ 9640 return fixup(self, fixtitle); 9641} 9642 9643PyDoc_STRVAR(capitalize__doc__, 9644 "S.capitalize() -> str\n\ 9645\n\ 9646Return a capitalized version of S, i.e. make the first character\n\ 9647have upper case and the rest lower case."); 9648 9649static PyObject* 9650unicode_capitalize(PyUnicodeObject *self) 9651{ 9652 return fixup(self, fixcapitalize); 9653} 9654 9655#if 0 9656PyDoc_STRVAR(capwords__doc__, 9657 "S.capwords() -> str\n\ 9658\n\ 9659Apply .capitalize() to all words in S and return the result with\n\ 9660normalized whitespace (all whitespace strings are replaced by ' ')."); 9661 9662static PyObject* 9663unicode_capwords(PyUnicodeObject *self) 9664{ 9665 PyObject *list; 9666 PyObject *item; 9667 Py_ssize_t i; 9668 9669 /* Split into words */ 9670 list = split(self, NULL, -1); 9671 if (!list) 9672 return NULL; 9673 9674 /* Capitalize each word */ 9675 for (i = 0; i < PyList_GET_SIZE(list); i++) { 9676 item = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, i), 9677 fixcapitalize); 9678 if (item == NULL) 9679 goto onError; 9680 Py_DECREF(PyList_GET_ITEM(list, i)); 9681 PyList_SET_ITEM(list, i, item); 9682 } 9683 9684 /* Join the words to form a new string */ 9685 item = PyUnicode_Join(NULL, list); 9686 9687 onError: 9688 Py_DECREF(list); 9689 return (PyObject *)item; 9690} 9691#endif 9692 9693/* Argument converter. Coerces to a single unicode character */ 9694 9695static int 9696convert_uc(PyObject *obj, void *addr) 9697{ 9698 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr; 9699 PyObject *uniobj; 9700 9701 uniobj = PyUnicode_FromObject(obj); 9702 if (uniobj == NULL) { 9703 PyErr_SetString(PyExc_TypeError, 9704 "The fill character cannot be converted to Unicode"); 9705 return 0; 9706 } 9707 if (PyUnicode_GET_LENGTH(uniobj) != 1) { 9708 PyErr_SetString(PyExc_TypeError, 9709 "The fill character must be exactly one character long"); 9710 Py_DECREF(uniobj); 9711 return 0; 9712 } 9713 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0); 9714 Py_DECREF(uniobj); 9715 return 1; 9716} 9717 9718PyDoc_STRVAR(center__doc__, 9719 "S.center(width[, fillchar]) -> str\n\ 9720\n\ 9721Return S centered in a string of length width. Padding is\n\ 9722done using the specified fill character (default is a space)"); 9723 9724static PyObject * 9725unicode_center(PyUnicodeObject *self, PyObject *args) 9726{ 9727 Py_ssize_t marg, left; 9728 Py_ssize_t width; 9729 Py_UCS4 fillchar = ' '; 9730 9731 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar)) 9732 return NULL; 9733 9734 if (PyUnicode_READY(self) == -1) 9735 return NULL; 9736 9737 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) { 9738 Py_INCREF(self); 9739 return (PyObject*) self; 9740 } 9741 9742 marg = width - _PyUnicode_LENGTH(self); 9743 left = marg / 2 + (marg & width & 1); 9744 9745 return (PyObject*) pad(self, left, marg - left, fillchar); 9746} 9747 9748#if 0 9749 9750/* This code should go into some future Unicode collation support 9751 module. The basic comparison should compare ordinals on a naive 9752 basis (this is what Java does and thus Jython too). */ 9753 9754/* speedy UTF-16 code point order comparison */ 9755/* gleaned from: */ 9756/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */ 9757 9758static short utf16Fixup[32] = 9759{ 9760 0, 0, 0, 0, 0, 0, 0, 0, 9761 0, 0, 0, 0, 0, 0, 0, 0, 9762 0, 0, 0, 0, 0, 0, 0, 0, 9763 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800 9764}; 9765 9766static int 9767unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2) 9768{ 9769 Py_ssize_t len1, len2; 9770 9771 Py_UNICODE *s1 = str1->str; 9772 Py_UNICODE *s2 = str2->str; 9773 9774 len1 = str1->_base._base.length; 9775 len2 = str2->_base._base.length; 9776 9777 while (len1 > 0 && len2 > 0) { 9778 Py_UNICODE c1, c2; 9779 9780 c1 = *s1++; 9781 c2 = *s2++; 9782 9783 if (c1 > (1<<11) * 26) 9784 c1 += utf16Fixup[c1>>11]; 9785 if (c2 > (1<<11) * 26) 9786 c2 += utf16Fixup[c2>>11]; 9787 /* now c1 and c2 are in UTF-32-compatible order */ 9788 9789 if (c1 != c2) 9790 return (c1 < c2) ? -1 : 1; 9791 9792 len1--; len2--; 9793 } 9794 9795 return (len1 < len2) ? -1 : (len1 != len2); 9796} 9797 9798#else 9799 9800/* This function assumes that str1 and str2 are readied by the caller. */ 9801 9802static int 9803unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2) 9804{ 9805 int kind1, kind2; 9806 void *data1, *data2; 9807 Py_ssize_t len1, len2, i; 9808 9809 kind1 = PyUnicode_KIND(str1); 9810 kind2 = PyUnicode_KIND(str2); 9811 data1 = PyUnicode_DATA(str1); 9812 data2 = PyUnicode_DATA(str2); 9813 len1 = PyUnicode_GET_LENGTH(str1); 9814 len2 = PyUnicode_GET_LENGTH(str2); 9815 9816 for (i = 0; i < len1 && i < len2; ++i) { 9817 Py_UCS4 c1, c2; 9818 c1 = PyUnicode_READ(kind1, data1, i); 9819 c2 = PyUnicode_READ(kind2, data2, i); 9820 9821 if (c1 != c2) 9822 return (c1 < c2) ? -1 : 1; 9823 } 9824 9825 return (len1 < len2) ? -1 : (len1 != len2); 9826} 9827 9828#endif 9829 9830int 9831PyUnicode_Compare(PyObject *left, PyObject *right) 9832{ 9833 if (PyUnicode_Check(left) && PyUnicode_Check(right)) { 9834 if (PyUnicode_READY(left) == -1 || 9835 PyUnicode_READY(right) == -1) 9836 return -1; 9837 return unicode_compare((PyUnicodeObject *)left, 9838 (PyUnicodeObject *)right); 9839 } 9840 PyErr_Format(PyExc_TypeError, 9841 "Can't compare %.100s and %.100s", 9842 left->ob_type->tp_name, 9843 right->ob_type->tp_name); 9844 return -1; 9845} 9846 9847int 9848PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str) 9849{ 9850 Py_ssize_t i; 9851 int kind; 9852 void *data; 9853 Py_UCS4 chr; 9854 9855 assert(_PyUnicode_CHECK(uni)); 9856 if (PyUnicode_READY(uni) == -1) 9857 return -1; 9858 kind = PyUnicode_KIND(uni); 9859 data = PyUnicode_DATA(uni); 9860 /* Compare Unicode string and source character set string */ 9861 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++) 9862 if (chr != str[i]) 9863 return (chr < (unsigned char)(str[i])) ? -1 : 1; 9864 /* This check keeps Python strings that end in '\0' from comparing equal 9865 to C strings identical up to that point. */ 9866 if (PyUnicode_GET_LENGTH(uni) != i || chr) 9867 return 1; /* uni is longer */ 9868 if (str[i]) 9869 return -1; /* str is longer */ 9870 return 0; 9871} 9872 9873 9874#define TEST_COND(cond) \ 9875 ((cond) ? Py_True : Py_False) 9876 9877PyObject * 9878PyUnicode_RichCompare(PyObject *left, PyObject *right, int op) 9879{ 9880 int result; 9881 9882 if (PyUnicode_Check(left) && PyUnicode_Check(right)) { 9883 PyObject *v; 9884 if (PyUnicode_READY(left) == -1 || 9885 PyUnicode_READY(right) == -1) 9886 return NULL; 9887 if (PyUnicode_GET_LENGTH(left) != PyUnicode_GET_LENGTH(right) || 9888 PyUnicode_KIND(left) != PyUnicode_KIND(right)) { 9889 if (op == Py_EQ) { 9890 Py_INCREF(Py_False); 9891 return Py_False; 9892 } 9893 if (op == Py_NE) { 9894 Py_INCREF(Py_True); 9895 return Py_True; 9896 } 9897 } 9898 if (left == right) 9899 result = 0; 9900 else 9901 result = unicode_compare((PyUnicodeObject *)left, 9902 (PyUnicodeObject *)right); 9903 9904 /* Convert the return value to a Boolean */ 9905 switch (op) { 9906 case Py_EQ: 9907 v = TEST_COND(result == 0); 9908 break; 9909 case Py_NE: 9910 v = TEST_COND(result != 0); 9911 break; 9912 case Py_LE: 9913 v = TEST_COND(result <= 0); 9914 break; 9915 case Py_GE: 9916 v = TEST_COND(result >= 0); 9917 break; 9918 case Py_LT: 9919 v = TEST_COND(result == -1); 9920 break; 9921 case Py_GT: 9922 v = TEST_COND(result == 1); 9923 break; 9924 default: 9925 PyErr_BadArgument(); 9926 return NULL; 9927 } 9928 Py_INCREF(v); 9929 return v; 9930 } 9931 9932 Py_RETURN_NOTIMPLEMENTED; 9933} 9934 9935int 9936PyUnicode_Contains(PyObject *container, PyObject *element) 9937{ 9938 PyObject *str, *sub; 9939 int kind1, kind2, kind; 9940 void *buf1, *buf2; 9941 Py_ssize_t len1, len2; 9942 int result; 9943 9944 /* Coerce the two arguments */ 9945 sub = PyUnicode_FromObject(element); 9946 if (!sub) { 9947 PyErr_Format(PyExc_TypeError, 9948 "'in <string>' requires string as left operand, not %s", 9949 element->ob_type->tp_name); 9950 return -1; 9951 } 9952 if (PyUnicode_READY(sub) == -1) 9953 return -1; 9954 9955 str = PyUnicode_FromObject(container); 9956 if (!str || PyUnicode_READY(str) == -1) { 9957 Py_DECREF(sub); 9958 return -1; 9959 } 9960 9961 kind1 = PyUnicode_KIND(str); 9962 kind2 = PyUnicode_KIND(sub); 9963 kind = kind1 > kind2 ? kind1 : kind2; 9964 buf1 = PyUnicode_DATA(str); 9965 buf2 = PyUnicode_DATA(sub); 9966 if (kind1 != kind) 9967 buf1 = _PyUnicode_AsKind((PyObject*)str, kind); 9968 if (!buf1) { 9969 Py_DECREF(sub); 9970 return -1; 9971 } 9972 if (kind2 != kind) 9973 buf2 = _PyUnicode_AsKind((PyObject*)sub, kind); 9974 if (!buf2) { 9975 Py_DECREF(sub); 9976 if (kind1 != kind) PyMem_Free(buf1); 9977 return -1; 9978 } 9979 len1 = PyUnicode_GET_LENGTH(str); 9980 len2 = PyUnicode_GET_LENGTH(sub); 9981 9982 switch(kind) { 9983 case PyUnicode_1BYTE_KIND: 9984 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1; 9985 break; 9986 case PyUnicode_2BYTE_KIND: 9987 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1; 9988 break; 9989 case PyUnicode_4BYTE_KIND: 9990 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1; 9991 break; 9992 default: 9993 result = -1; 9994 assert(0); 9995 } 9996 9997 Py_DECREF(str); 9998 Py_DECREF(sub); 9999 10000 if (kind1 != kind) 10001 PyMem_Free(buf1); 10002 if (kind2 != kind) 10003 PyMem_Free(buf2); 10004 10005 return result; 10006} 10007 10008/* Concat to string or Unicode object giving a new Unicode object. */ 10009 10010PyObject * 10011PyUnicode_Concat(PyObject *left, PyObject *right) 10012{ 10013 PyObject *u = NULL, *v = NULL, *w; 10014 Py_UCS4 maxchar; 10015 10016 /* Coerce the two arguments */ 10017 u = PyUnicode_FromObject(left); 10018 if (u == NULL) 10019 goto onError; 10020 v = PyUnicode_FromObject(right); 10021 if (v == NULL) 10022 goto onError; 10023 10024 /* Shortcuts */ 10025 if (v == unicode_empty) { 10026 Py_DECREF(v); 10027 return u; 10028 } 10029 if (u == unicode_empty) { 10030 Py_DECREF(u); 10031 return v; 10032 } 10033 10034 maxchar = PyUnicode_MAX_CHAR_VALUE(u); 10035 maxchar = Py_MAX(maxchar, PyUnicode_MAX_CHAR_VALUE(v)); 10036 10037 /* Concat the two Unicode strings */ 10038 w = PyUnicode_New( 10039 PyUnicode_GET_LENGTH(u) + PyUnicode_GET_LENGTH(v), 10040 maxchar); 10041 if (w == NULL) 10042 goto onError; 10043 if (PyUnicode_CopyCharacters(w, 0, u, 0, PyUnicode_GET_LENGTH(u)) < 0) 10044 goto onError; 10045 if (PyUnicode_CopyCharacters(w, PyUnicode_GET_LENGTH(u), 10046 v, 0, 10047 PyUnicode_GET_LENGTH(v)) < 0) 10048 goto onError; 10049 Py_DECREF(u); 10050 Py_DECREF(v); 10051 return w; 10052 10053 onError: 10054 Py_XDECREF(u); 10055 Py_XDECREF(v); 10056 return NULL; 10057} 10058 10059static void 10060unicode_append_inplace(PyObject **p_left, PyObject *right) 10061{ 10062 Py_ssize_t left_len, right_len, new_len; 10063#ifdef Py_DEBUG 10064 Py_ssize_t copied; 10065#endif 10066 10067 assert(PyUnicode_IS_READY(*p_left)); 10068 assert(PyUnicode_IS_READY(right)); 10069 10070 left_len = PyUnicode_GET_LENGTH(*p_left); 10071 right_len = PyUnicode_GET_LENGTH(right); 10072 if (left_len > PY_SSIZE_T_MAX - right_len) { 10073 PyErr_SetString(PyExc_OverflowError, 10074 "strings are too large to concat"); 10075 goto error; 10076 } 10077 new_len = left_len + right_len; 10078 10079 /* Now we own the last reference to 'left', so we can resize it 10080 * in-place. 10081 */ 10082 if (unicode_resize(p_left, new_len) != 0) { 10083 /* XXX if _PyUnicode_Resize() fails, 'left' has been 10084 * deallocated so it cannot be put back into 10085 * 'variable'. The MemoryError is raised when there 10086 * is no value in 'variable', which might (very 10087 * remotely) be a cause of incompatibilities. 10088 */ 10089 goto error; 10090 } 10091 /* copy 'right' into the newly allocated area of 'left' */ 10092#ifdef Py_DEBUG 10093 copied = PyUnicode_CopyCharacters(*p_left, left_len, 10094 right, 0, 10095 right_len); 10096 assert(0 <= copied); 10097#else 10098 PyUnicode_CopyCharacters(*p_left, left_len, right, 0, right_len); 10099#endif 10100 return; 10101 10102error: 10103 Py_DECREF(*p_left); 10104 *p_left = NULL; 10105} 10106 10107void 10108PyUnicode_Append(PyObject **p_left, PyObject *right) 10109{ 10110 PyObject *left, *res; 10111 10112 if (p_left == NULL) { 10113 if (!PyErr_Occurred()) 10114 PyErr_BadInternalCall(); 10115 return; 10116 } 10117 left = *p_left; 10118 if (right == NULL || !PyUnicode_Check(left)) { 10119 if (!PyErr_Occurred()) 10120 PyErr_BadInternalCall(); 10121 goto error; 10122 } 10123 10124 if (PyUnicode_READY(left)) 10125 goto error; 10126 if (PyUnicode_READY(right)) 10127 goto error; 10128 10129 if (PyUnicode_CheckExact(left) && left != unicode_empty 10130 && PyUnicode_CheckExact(right) && right != unicode_empty 10131 && unicode_resizable(left) 10132 && (_PyUnicode_KIND(right) <= _PyUnicode_KIND(left) 10133 || _PyUnicode_WSTR(left) != NULL)) 10134 { 10135 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires 10136 to change the structure size, but characters are stored just after 10137 the structure, and so it requires to move all characters which is 10138 not so different than duplicating the string. */ 10139 if (!(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right))) 10140 { 10141 unicode_append_inplace(p_left, right); 10142 return; 10143 } 10144 } 10145 10146 res = PyUnicode_Concat(left, right); 10147 if (res == NULL) 10148 goto error; 10149 Py_DECREF(left); 10150 *p_left = res; 10151 return; 10152 10153error: 10154 Py_DECREF(*p_left); 10155 *p_left = NULL; 10156} 10157 10158void 10159PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right) 10160{ 10161 PyUnicode_Append(pleft, right); 10162 Py_XDECREF(right); 10163} 10164 10165PyDoc_STRVAR(count__doc__, 10166 "S.count(sub[, start[, end]]) -> int\n\ 10167\n\ 10168Return the number of non-overlapping occurrences of substring sub in\n\ 10169string S[start:end]. Optional arguments start and end are\n\ 10170interpreted as in slice notation."); 10171 10172static PyObject * 10173unicode_count(PyUnicodeObject *self, PyObject *args) 10174{ 10175 PyUnicodeObject *substring; 10176 Py_ssize_t start = 0; 10177 Py_ssize_t end = PY_SSIZE_T_MAX; 10178 PyObject *result; 10179 int kind1, kind2, kind; 10180 void *buf1, *buf2; 10181 Py_ssize_t len1, len2, iresult; 10182 10183 if (!stringlib_parse_args_finds_unicode("count", args, &substring, 10184 &start, &end)) 10185 return NULL; 10186 10187 kind1 = PyUnicode_KIND(self); 10188 kind2 = PyUnicode_KIND(substring); 10189 kind = kind1 > kind2 ? kind1 : kind2; 10190 buf1 = PyUnicode_DATA(self); 10191 buf2 = PyUnicode_DATA(substring); 10192 if (kind1 != kind) 10193 buf1 = _PyUnicode_AsKind((PyObject*)self, kind); 10194 if (!buf1) { 10195 Py_DECREF(substring); 10196 return NULL; 10197 } 10198 if (kind2 != kind) 10199 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind); 10200 if (!buf2) { 10201 Py_DECREF(substring); 10202 if (kind1 != kind) PyMem_Free(buf1); 10203 return NULL; 10204 } 10205 len1 = PyUnicode_GET_LENGTH(self); 10206 len2 = PyUnicode_GET_LENGTH(substring); 10207 10208 ADJUST_INDICES(start, end, len1); 10209 switch(kind) { 10210 case PyUnicode_1BYTE_KIND: 10211 iresult = ucs1lib_count( 10212 ((Py_UCS1*)buf1) + start, end - start, 10213 buf2, len2, PY_SSIZE_T_MAX 10214 ); 10215 break; 10216 case PyUnicode_2BYTE_KIND: 10217 iresult = ucs2lib_count( 10218 ((Py_UCS2*)buf1) + start, end - start, 10219 buf2, len2, PY_SSIZE_T_MAX 10220 ); 10221 break; 10222 case PyUnicode_4BYTE_KIND: 10223 iresult = ucs4lib_count( 10224 ((Py_UCS4*)buf1) + start, end - start, 10225 buf2, len2, PY_SSIZE_T_MAX 10226 ); 10227 break; 10228 default: 10229 assert(0); iresult = 0; 10230 } 10231 10232 result = PyLong_FromSsize_t(iresult); 10233 10234 if (kind1 != kind) 10235 PyMem_Free(buf1); 10236 if (kind2 != kind) 10237 PyMem_Free(buf2); 10238 10239 Py_DECREF(substring); 10240 10241 return result; 10242} 10243 10244PyDoc_STRVAR(encode__doc__, 10245 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\ 10246\n\ 10247Encode S using the codec registered for encoding. Default encoding\n\ 10248is 'utf-8'. errors may be given to set a different error\n\ 10249handling scheme. Default is 'strict' meaning that encoding errors raise\n\ 10250a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\ 10251'xmlcharrefreplace' as well as any other name registered with\n\ 10252codecs.register_error that can handle UnicodeEncodeErrors."); 10253 10254static PyObject * 10255unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs) 10256{ 10257 static char *kwlist[] = {"encoding", "errors", 0}; 10258 char *encoding = NULL; 10259 char *errors = NULL; 10260 10261 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode", 10262 kwlist, &encoding, &errors)) 10263 return NULL; 10264 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors); 10265} 10266 10267PyDoc_STRVAR(expandtabs__doc__, 10268 "S.expandtabs([tabsize]) -> str\n\ 10269\n\ 10270Return a copy of S where all tab characters are expanded using spaces.\n\ 10271If tabsize is not given, a tab size of 8 characters is assumed."); 10272 10273static PyObject* 10274unicode_expandtabs(PyUnicodeObject *self, PyObject *args) 10275{ 10276 Py_ssize_t i, j, line_pos, src_len, incr; 10277 Py_UCS4 ch; 10278 PyObject *u; 10279 void *src_data, *dest_data; 10280 int tabsize = 8; 10281 int kind; 10282 int found; 10283 10284 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize)) 10285 return NULL; 10286 10287 if (PyUnicode_READY(self) == -1) 10288 return NULL; 10289 10290 /* First pass: determine size of output string */ 10291 src_len = PyUnicode_GET_LENGTH(self); 10292 i = j = line_pos = 0; 10293 kind = PyUnicode_KIND(self); 10294 src_data = PyUnicode_DATA(self); 10295 found = 0; 10296 for (; i < src_len; i++) { 10297 ch = PyUnicode_READ(kind, src_data, i); 10298 if (ch == '\t') { 10299 found = 1; 10300 if (tabsize > 0) { 10301 incr = tabsize - (line_pos % tabsize); /* cannot overflow */ 10302 if (j > PY_SSIZE_T_MAX - incr) 10303 goto overflow; 10304 line_pos += incr; 10305 j += incr; 10306 } 10307 } 10308 else { 10309 if (j > PY_SSIZE_T_MAX - 1) 10310 goto overflow; 10311 line_pos++; 10312 j++; 10313 if (ch == '\n' || ch == '\r') 10314 line_pos = 0; 10315 } 10316 } 10317 if (!found && PyUnicode_CheckExact(self)) { 10318 Py_INCREF((PyObject *) self); 10319 return (PyObject *) self; 10320 } 10321 10322 /* Second pass: create output string and fill it */ 10323 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self)); 10324 if (!u) 10325 return NULL; 10326 dest_data = PyUnicode_DATA(u); 10327 10328 i = j = line_pos = 0; 10329 10330 for (; i < src_len; i++) { 10331 ch = PyUnicode_READ(kind, src_data, i); 10332 if (ch == '\t') { 10333 if (tabsize > 0) { 10334 incr = tabsize - (line_pos % tabsize); 10335 line_pos += incr; 10336 while (incr--) { 10337 PyUnicode_WRITE(kind, dest_data, j, ' '); 10338 j++; 10339 } 10340 } 10341 } 10342 else { 10343 line_pos++; 10344 PyUnicode_WRITE(kind, dest_data, j, ch); 10345 j++; 10346 if (ch == '\n' || ch == '\r') 10347 line_pos = 0; 10348 } 10349 } 10350 assert (j == PyUnicode_GET_LENGTH(u)); 10351#ifndef DONT_MAKE_RESULT_READY 10352 if (_PyUnicode_READY_REPLACE(&u)) { 10353 Py_DECREF(u); 10354 return NULL; 10355 } 10356#endif 10357 return (PyObject*) u; 10358 10359 overflow: 10360 PyErr_SetString(PyExc_OverflowError, "new string is too long"); 10361 return NULL; 10362} 10363 10364PyDoc_STRVAR(find__doc__, 10365 "S.find(sub[, start[, end]]) -> int\n\ 10366\n\ 10367Return the lowest index in S where substring sub is found,\n\ 10368such that sub is contained within S[start:end]. Optional\n\ 10369arguments start and end are interpreted as in slice notation.\n\ 10370\n\ 10371Return -1 on failure."); 10372 10373static PyObject * 10374unicode_find(PyObject *self, PyObject *args) 10375{ 10376 PyUnicodeObject *substring; 10377 Py_ssize_t start; 10378 Py_ssize_t end; 10379 Py_ssize_t result; 10380 10381 if (!stringlib_parse_args_finds_unicode("find", args, &substring, 10382 &start, &end)) 10383 return NULL; 10384 10385 if (PyUnicode_READY(self) == -1) 10386 return NULL; 10387 if (PyUnicode_READY(substring) == -1) 10388 return NULL; 10389 10390 result = any_find_slice( 10391 ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice, 10392 self, (PyObject*)substring, start, end 10393 ); 10394 10395 Py_DECREF(substring); 10396 10397 if (result == -2) 10398 return NULL; 10399 10400 return PyLong_FromSsize_t(result); 10401} 10402 10403static PyObject * 10404unicode_getitem(PyObject *self, Py_ssize_t index) 10405{ 10406 Py_UCS4 ch = PyUnicode_ReadChar(self, index); 10407 if (ch == (Py_UCS4)-1) 10408 return NULL; 10409 return PyUnicode_FromOrdinal(ch); 10410} 10411 10412/* Believe it or not, this produces the same value for ASCII strings 10413 as bytes_hash(). */ 10414static Py_hash_t 10415unicode_hash(PyUnicodeObject *self) 10416{ 10417 Py_ssize_t len; 10418 Py_uhash_t x; 10419 10420 if (_PyUnicode_HASH(self) != -1) 10421 return _PyUnicode_HASH(self); 10422 if (PyUnicode_READY(self) == -1) 10423 return -1; 10424 len = PyUnicode_GET_LENGTH(self); 10425 10426 /* The hash function as a macro, gets expanded three times below. */ 10427#define HASH(P) \ 10428 x = (Py_uhash_t)*P << 7; \ 10429 while (--len >= 0) \ 10430 x = (1000003*x) ^ (Py_uhash_t)*P++; 10431 10432 switch (PyUnicode_KIND(self)) { 10433 case PyUnicode_1BYTE_KIND: { 10434 const unsigned char *c = PyUnicode_1BYTE_DATA(self); 10435 HASH(c); 10436 break; 10437 } 10438 case PyUnicode_2BYTE_KIND: { 10439 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self); 10440 HASH(s); 10441 break; 10442 } 10443 default: { 10444 Py_UCS4 *l; 10445 assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND && 10446 "Impossible switch case in unicode_hash"); 10447 l = PyUnicode_4BYTE_DATA(self); 10448 HASH(l); 10449 break; 10450 } 10451 } 10452 x ^= (Py_uhash_t)PyUnicode_GET_LENGTH(self); 10453 10454 if (x == -1) 10455 x = -2; 10456 _PyUnicode_HASH(self) = x; 10457 return x; 10458} 10459#undef HASH 10460 10461PyDoc_STRVAR(index__doc__, 10462 "S.index(sub[, start[, end]]) -> int\n\ 10463\n\ 10464Like S.find() but raise ValueError when the substring is not found."); 10465 10466static PyObject * 10467unicode_index(PyObject *self, PyObject *args) 10468{ 10469 Py_ssize_t result; 10470 PyUnicodeObject *substring; 10471 Py_ssize_t start; 10472 Py_ssize_t end; 10473 10474 if (!stringlib_parse_args_finds_unicode("index", args, &substring, 10475 &start, &end)) 10476 return NULL; 10477 10478 if (PyUnicode_READY(self) == -1) 10479 return NULL; 10480 if (PyUnicode_READY(substring) == -1) 10481 return NULL; 10482 10483 result = any_find_slice( 10484 ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice, 10485 self, (PyObject*)substring, start, end 10486 ); 10487 10488 Py_DECREF(substring); 10489 10490 if (result == -2) 10491 return NULL; 10492 10493 if (result < 0) { 10494 PyErr_SetString(PyExc_ValueError, "substring not found"); 10495 return NULL; 10496 } 10497 10498 return PyLong_FromSsize_t(result); 10499} 10500 10501PyDoc_STRVAR(islower__doc__, 10502 "S.islower() -> bool\n\ 10503\n\ 10504Return True if all cased characters in S are lowercase and there is\n\ 10505at least one cased character in S, False otherwise."); 10506 10507static PyObject* 10508unicode_islower(PyUnicodeObject *self) 10509{ 10510 Py_ssize_t i, length; 10511 int kind; 10512 void *data; 10513 int cased; 10514 10515 if (PyUnicode_READY(self) == -1) 10516 return NULL; 10517 length = PyUnicode_GET_LENGTH(self); 10518 kind = PyUnicode_KIND(self); 10519 data = PyUnicode_DATA(self); 10520 10521 /* Shortcut for single character strings */ 10522 if (length == 1) 10523 return PyBool_FromLong( 10524 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0))); 10525 10526 /* Special case for empty strings */ 10527 if (length == 0) 10528 return PyBool_FromLong(0); 10529 10530 cased = 0; 10531 for (i = 0; i < length; i++) { 10532 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); 10533 10534 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) 10535 return PyBool_FromLong(0); 10536 else if (!cased && Py_UNICODE_ISLOWER(ch)) 10537 cased = 1; 10538 } 10539 return PyBool_FromLong(cased); 10540} 10541 10542PyDoc_STRVAR(isupper__doc__, 10543 "S.isupper() -> bool\n\ 10544\n\ 10545Return True if all cased characters in S are uppercase and there is\n\ 10546at least one cased character in S, False otherwise."); 10547 10548static PyObject* 10549unicode_isupper(PyUnicodeObject *self) 10550{ 10551 Py_ssize_t i, length; 10552 int kind; 10553 void *data; 10554 int cased; 10555 10556 if (PyUnicode_READY(self) == -1) 10557 return NULL; 10558 length = PyUnicode_GET_LENGTH(self); 10559 kind = PyUnicode_KIND(self); 10560 data = PyUnicode_DATA(self); 10561 10562 /* Shortcut for single character strings */ 10563 if (length == 1) 10564 return PyBool_FromLong( 10565 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0); 10566 10567 /* Special case for empty strings */ 10568 if (length == 0) 10569 return PyBool_FromLong(0); 10570 10571 cased = 0; 10572 for (i = 0; i < length; i++) { 10573 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); 10574 10575 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch)) 10576 return PyBool_FromLong(0); 10577 else if (!cased && Py_UNICODE_ISUPPER(ch)) 10578 cased = 1; 10579 } 10580 return PyBool_FromLong(cased); 10581} 10582 10583PyDoc_STRVAR(istitle__doc__, 10584 "S.istitle() -> bool\n\ 10585\n\ 10586Return True if S is a titlecased string and there is at least one\n\ 10587character in S, i.e. upper- and titlecase characters may only\n\ 10588follow uncased characters and lowercase characters only cased ones.\n\ 10589Return False otherwise."); 10590 10591static PyObject* 10592unicode_istitle(PyUnicodeObject *self) 10593{ 10594 Py_ssize_t i, length; 10595 int kind; 10596 void *data; 10597 int cased, previous_is_cased; 10598 10599 if (PyUnicode_READY(self) == -1) 10600 return NULL; 10601 length = PyUnicode_GET_LENGTH(self); 10602 kind = PyUnicode_KIND(self); 10603 data = PyUnicode_DATA(self); 10604 10605 /* Shortcut for single character strings */ 10606 if (length == 1) { 10607 Py_UCS4 ch = PyUnicode_READ(kind, data, 0); 10608 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) || 10609 (Py_UNICODE_ISUPPER(ch) != 0)); 10610 } 10611 10612 /* Special case for empty strings */ 10613 if (length == 0) 10614 return PyBool_FromLong(0); 10615 10616 cased = 0; 10617 previous_is_cased = 0; 10618 for (i = 0; i < length; i++) { 10619 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); 10620 10621 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) { 10622 if (previous_is_cased) 10623 return PyBool_FromLong(0); 10624 previous_is_cased = 1; 10625 cased = 1; 10626 } 10627 else if (Py_UNICODE_ISLOWER(ch)) { 10628 if (!previous_is_cased) 10629 return PyBool_FromLong(0); 10630 previous_is_cased = 1; 10631 cased = 1; 10632 } 10633 else 10634 previous_is_cased = 0; 10635 } 10636 return PyBool_FromLong(cased); 10637} 10638 10639PyDoc_STRVAR(isspace__doc__, 10640 "S.isspace() -> bool\n\ 10641\n\ 10642Return True if all characters in S are whitespace\n\ 10643and there is at least one character in S, False otherwise."); 10644 10645static PyObject* 10646unicode_isspace(PyUnicodeObject *self) 10647{ 10648 Py_ssize_t i, length; 10649 int kind; 10650 void *data; 10651 10652 if (PyUnicode_READY(self) == -1) 10653 return NULL; 10654 length = PyUnicode_GET_LENGTH(self); 10655 kind = PyUnicode_KIND(self); 10656 data = PyUnicode_DATA(self); 10657 10658 /* Shortcut for single character strings */ 10659 if (length == 1) 10660 return PyBool_FromLong( 10661 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0))); 10662 10663 /* Special case for empty strings */ 10664 if (length == 0) 10665 return PyBool_FromLong(0); 10666 10667 for (i = 0; i < length; i++) { 10668 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); 10669 if (!Py_UNICODE_ISSPACE(ch)) 10670 return PyBool_FromLong(0); 10671 } 10672 return PyBool_FromLong(1); 10673} 10674 10675PyDoc_STRVAR(isalpha__doc__, 10676 "S.isalpha() -> bool\n\ 10677\n\ 10678Return True if all characters in S are alphabetic\n\ 10679and there is at least one character in S, False otherwise."); 10680 10681static PyObject* 10682unicode_isalpha(PyUnicodeObject *self) 10683{ 10684 Py_ssize_t i, length; 10685 int kind; 10686 void *data; 10687 10688 if (PyUnicode_READY(self) == -1) 10689 return NULL; 10690 length = PyUnicode_GET_LENGTH(self); 10691 kind = PyUnicode_KIND(self); 10692 data = PyUnicode_DATA(self); 10693 10694 /* Shortcut for single character strings */ 10695 if (length == 1) 10696 return PyBool_FromLong( 10697 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0))); 10698 10699 /* Special case for empty strings */ 10700 if (length == 0) 10701 return PyBool_FromLong(0); 10702 10703 for (i = 0; i < length; i++) { 10704 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i))) 10705 return PyBool_FromLong(0); 10706 } 10707 return PyBool_FromLong(1); 10708} 10709 10710PyDoc_STRVAR(isalnum__doc__, 10711 "S.isalnum() -> bool\n\ 10712\n\ 10713Return True if all characters in S are alphanumeric\n\ 10714and there is at least one character in S, False otherwise."); 10715 10716static PyObject* 10717unicode_isalnum(PyUnicodeObject *self) 10718{ 10719 int kind; 10720 void *data; 10721 Py_ssize_t len, i; 10722 10723 if (PyUnicode_READY(self) == -1) 10724 return NULL; 10725 10726 kind = PyUnicode_KIND(self); 10727 data = PyUnicode_DATA(self); 10728 len = PyUnicode_GET_LENGTH(self); 10729 10730 /* Shortcut for single character strings */ 10731 if (len == 1) { 10732 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0); 10733 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch)); 10734 } 10735 10736 /* Special case for empty strings */ 10737 if (len == 0) 10738 return PyBool_FromLong(0); 10739 10740 for (i = 0; i < len; i++) { 10741 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); 10742 if (!Py_UNICODE_ISALNUM(ch)) 10743 return PyBool_FromLong(0); 10744 } 10745 return PyBool_FromLong(1); 10746} 10747 10748PyDoc_STRVAR(isdecimal__doc__, 10749 "S.isdecimal() -> bool\n\ 10750\n\ 10751Return True if there are only decimal characters in S,\n\ 10752False otherwise."); 10753 10754static PyObject* 10755unicode_isdecimal(PyUnicodeObject *self) 10756{ 10757 Py_ssize_t i, length; 10758 int kind; 10759 void *data; 10760 10761 if (PyUnicode_READY(self) == -1) 10762 return NULL; 10763 length = PyUnicode_GET_LENGTH(self); 10764 kind = PyUnicode_KIND(self); 10765 data = PyUnicode_DATA(self); 10766 10767 /* Shortcut for single character strings */ 10768 if (length == 1) 10769 return PyBool_FromLong( 10770 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0))); 10771 10772 /* Special case for empty strings */ 10773 if (length == 0) 10774 return PyBool_FromLong(0); 10775 10776 for (i = 0; i < length; i++) { 10777 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i))) 10778 return PyBool_FromLong(0); 10779 } 10780 return PyBool_FromLong(1); 10781} 10782 10783PyDoc_STRVAR(isdigit__doc__, 10784 "S.isdigit() -> bool\n\ 10785\n\ 10786Return True if all characters in S are digits\n\ 10787and there is at least one character in S, False otherwise."); 10788 10789static PyObject* 10790unicode_isdigit(PyUnicodeObject *self) 10791{ 10792 Py_ssize_t i, length; 10793 int kind; 10794 void *data; 10795 10796 if (PyUnicode_READY(self) == -1) 10797 return NULL; 10798 length = PyUnicode_GET_LENGTH(self); 10799 kind = PyUnicode_KIND(self); 10800 data = PyUnicode_DATA(self); 10801 10802 /* Shortcut for single character strings */ 10803 if (length == 1) { 10804 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0); 10805 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch)); 10806 } 10807 10808 /* Special case for empty strings */ 10809 if (length == 0) 10810 return PyBool_FromLong(0); 10811 10812 for (i = 0; i < length; i++) { 10813 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i))) 10814 return PyBool_FromLong(0); 10815 } 10816 return PyBool_FromLong(1); 10817} 10818 10819PyDoc_STRVAR(isnumeric__doc__, 10820 "S.isnumeric() -> bool\n\ 10821\n\ 10822Return True if there are only numeric characters in S,\n\ 10823False otherwise."); 10824 10825static PyObject* 10826unicode_isnumeric(PyUnicodeObject *self) 10827{ 10828 Py_ssize_t i, length; 10829 int kind; 10830 void *data; 10831 10832 if (PyUnicode_READY(self) == -1) 10833 return NULL; 10834 length = PyUnicode_GET_LENGTH(self); 10835 kind = PyUnicode_KIND(self); 10836 data = PyUnicode_DATA(self); 10837 10838 /* Shortcut for single character strings */ 10839 if (length == 1) 10840 return PyBool_FromLong( 10841 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0))); 10842 10843 /* Special case for empty strings */ 10844 if (length == 0) 10845 return PyBool_FromLong(0); 10846 10847 for (i = 0; i < length; i++) { 10848 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i))) 10849 return PyBool_FromLong(0); 10850 } 10851 return PyBool_FromLong(1); 10852} 10853 10854int 10855PyUnicode_IsIdentifier(PyObject *self) 10856{ 10857 int kind; 10858 void *data; 10859 Py_ssize_t i; 10860 Py_UCS4 first; 10861 10862 if (PyUnicode_READY(self) == -1) { 10863 Py_FatalError("identifier not ready"); 10864 return 0; 10865 } 10866 10867 /* Special case for empty strings */ 10868 if (PyUnicode_GET_LENGTH(self) == 0) 10869 return 0; 10870 kind = PyUnicode_KIND(self); 10871 data = PyUnicode_DATA(self); 10872 10873 /* PEP 3131 says that the first character must be in 10874 XID_Start and subsequent characters in XID_Continue, 10875 and for the ASCII range, the 2.x rules apply (i.e 10876 start with letters and underscore, continue with 10877 letters, digits, underscore). However, given the current 10878 definition of XID_Start and XID_Continue, it is sufficient 10879 to check just for these, except that _ must be allowed 10880 as starting an identifier. */ 10881 first = PyUnicode_READ(kind, data, 0); 10882 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */) 10883 return 0; 10884 10885 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++) 10886 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i))) 10887 return 0; 10888 return 1; 10889} 10890 10891PyDoc_STRVAR(isidentifier__doc__, 10892 "S.isidentifier() -> bool\n\ 10893\n\ 10894Return True if S is a valid identifier according\n\ 10895to the language definition."); 10896 10897static PyObject* 10898unicode_isidentifier(PyObject *self) 10899{ 10900 return PyBool_FromLong(PyUnicode_IsIdentifier(self)); 10901} 10902 10903PyDoc_STRVAR(isprintable__doc__, 10904 "S.isprintable() -> bool\n\ 10905\n\ 10906Return True if all characters in S are considered\n\ 10907printable in repr() or S is empty, False otherwise."); 10908 10909static PyObject* 10910unicode_isprintable(PyObject *self) 10911{ 10912 Py_ssize_t i, length; 10913 int kind; 10914 void *data; 10915 10916 if (PyUnicode_READY(self) == -1) 10917 return NULL; 10918 length = PyUnicode_GET_LENGTH(self); 10919 kind = PyUnicode_KIND(self); 10920 data = PyUnicode_DATA(self); 10921 10922 /* Shortcut for single character strings */ 10923 if (length == 1) 10924 return PyBool_FromLong( 10925 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0))); 10926 10927 for (i = 0; i < length; i++) { 10928 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) { 10929 Py_RETURN_FALSE; 10930 } 10931 } 10932 Py_RETURN_TRUE; 10933} 10934 10935PyDoc_STRVAR(join__doc__, 10936 "S.join(iterable) -> str\n\ 10937\n\ 10938Return a string which is the concatenation of the strings in the\n\ 10939iterable. The separator between elements is S."); 10940 10941static PyObject* 10942unicode_join(PyObject *self, PyObject *data) 10943{ 10944 return PyUnicode_Join(self, data); 10945} 10946 10947static Py_ssize_t 10948unicode_length(PyUnicodeObject *self) 10949{ 10950 if (PyUnicode_READY(self) == -1) 10951 return -1; 10952 return PyUnicode_GET_LENGTH(self); 10953} 10954 10955PyDoc_STRVAR(ljust__doc__, 10956 "S.ljust(width[, fillchar]) -> str\n\ 10957\n\ 10958Return S left-justified in a Unicode string of length width. Padding is\n\ 10959done using the specified fill character (default is a space)."); 10960 10961static PyObject * 10962unicode_ljust(PyUnicodeObject *self, PyObject *args) 10963{ 10964 Py_ssize_t width; 10965 Py_UCS4 fillchar = ' '; 10966 10967 if (PyUnicode_READY(self) == -1) 10968 return NULL; 10969 10970 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar)) 10971 return NULL; 10972 10973 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) { 10974 Py_INCREF(self); 10975 return (PyObject*) self; 10976 } 10977 10978 return (PyObject*) pad(self, 0, width - _PyUnicode_LENGTH(self), fillchar); 10979} 10980 10981PyDoc_STRVAR(lower__doc__, 10982 "S.lower() -> str\n\ 10983\n\ 10984Return a copy of the string S converted to lowercase."); 10985 10986static PyObject* 10987unicode_lower(PyUnicodeObject *self) 10988{ 10989 return fixup(self, fixlower); 10990} 10991 10992#define LEFTSTRIP 0 10993#define RIGHTSTRIP 1 10994#define BOTHSTRIP 2 10995 10996/* Arrays indexed by above */ 10997static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"}; 10998 10999#define STRIPNAME(i) (stripformat[i]+3) 11000 11001/* externally visible for str.strip(unicode) */ 11002PyObject * 11003_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj) 11004{ 11005 void *data; 11006 int kind; 11007 Py_ssize_t i, j, len; 11008 BLOOM_MASK sepmask; 11009 11010 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1) 11011 return NULL; 11012 11013 kind = PyUnicode_KIND(self); 11014 data = PyUnicode_DATA(self); 11015 len = PyUnicode_GET_LENGTH(self); 11016 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj), 11017 PyUnicode_DATA(sepobj), 11018 PyUnicode_GET_LENGTH(sepobj)); 11019 11020 i = 0; 11021 if (striptype != RIGHTSTRIP) { 11022 while (i < len && 11023 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) { 11024 i++; 11025 } 11026 } 11027 11028 j = len; 11029 if (striptype != LEFTSTRIP) { 11030 do { 11031 j--; 11032 } while (j >= i && 11033 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj)); 11034 j++; 11035 } 11036 11037 return PyUnicode_Substring((PyObject*)self, i, j); 11038} 11039 11040PyObject* 11041PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end) 11042{ 11043 unsigned char *data; 11044 int kind; 11045 Py_ssize_t length; 11046 11047 if (PyUnicode_READY(self) == -1) 11048 return NULL; 11049 11050 end = Py_MIN(end, PyUnicode_GET_LENGTH(self)); 11051 11052 if (start == 0 && end == PyUnicode_GET_LENGTH(self)) 11053 { 11054 if (PyUnicode_CheckExact(self)) { 11055 Py_INCREF(self); 11056 return self; 11057 } 11058 else 11059 return PyUnicode_Copy(self); 11060 } 11061 11062 length = end - start; 11063 if (length == 1) 11064 return unicode_getitem(self, start); 11065 11066 if (start < 0 || end < 0) { 11067 PyErr_SetString(PyExc_IndexError, "string index out of range"); 11068 return NULL; 11069 } 11070 11071 if (PyUnicode_IS_ASCII(self)) { 11072 kind = PyUnicode_KIND(self); 11073 data = PyUnicode_1BYTE_DATA(self); 11074 return unicode_fromascii(data + start, length); 11075 } 11076 else { 11077 kind = PyUnicode_KIND(self); 11078 data = PyUnicode_1BYTE_DATA(self); 11079 return PyUnicode_FromKindAndData(kind, 11080 data + PyUnicode_KIND_SIZE(kind, start), 11081 length); 11082 } 11083} 11084 11085static PyObject * 11086do_strip(PyUnicodeObject *self, int striptype) 11087{ 11088 int kind; 11089 void *data; 11090 Py_ssize_t len, i, j; 11091 11092 if (PyUnicode_READY(self) == -1) 11093 return NULL; 11094 11095 kind = PyUnicode_KIND(self); 11096 data = PyUnicode_DATA(self); 11097 len = PyUnicode_GET_LENGTH(self); 11098 11099 i = 0; 11100 if (striptype != RIGHTSTRIP) { 11101 while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) { 11102 i++; 11103 } 11104 } 11105 11106 j = len; 11107 if (striptype != LEFTSTRIP) { 11108 do { 11109 j--; 11110 } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j))); 11111 j++; 11112 } 11113 11114 return PyUnicode_Substring((PyObject*)self, i, j); 11115} 11116 11117 11118static PyObject * 11119do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args) 11120{ 11121 PyObject *sep = NULL; 11122 11123 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep)) 11124 return NULL; 11125 11126 if (sep != NULL && sep != Py_None) { 11127 if (PyUnicode_Check(sep)) 11128 return _PyUnicode_XStrip(self, striptype, sep); 11129 else { 11130 PyErr_Format(PyExc_TypeError, 11131 "%s arg must be None or str", 11132 STRIPNAME(striptype)); 11133 return NULL; 11134 } 11135 } 11136 11137 return do_strip(self, striptype); 11138} 11139 11140 11141PyDoc_STRVAR(strip__doc__, 11142 "S.strip([chars]) -> str\n\ 11143\n\ 11144Return a copy of the string S with leading and trailing\n\ 11145whitespace removed.\n\ 11146If chars is given and not None, remove characters in chars instead."); 11147 11148static PyObject * 11149unicode_strip(PyUnicodeObject *self, PyObject *args) 11150{ 11151 if (PyTuple_GET_SIZE(args) == 0) 11152 return do_strip(self, BOTHSTRIP); /* Common case */ 11153 else 11154 return do_argstrip(self, BOTHSTRIP, args); 11155} 11156 11157 11158PyDoc_STRVAR(lstrip__doc__, 11159 "S.lstrip([chars]) -> str\n\ 11160\n\ 11161Return a copy of the string S with leading whitespace removed.\n\ 11162If chars is given and not None, remove characters in chars instead."); 11163 11164static PyObject * 11165unicode_lstrip(PyUnicodeObject *self, PyObject *args) 11166{ 11167 if (PyTuple_GET_SIZE(args) == 0) 11168 return do_strip(self, LEFTSTRIP); /* Common case */ 11169 else 11170 return do_argstrip(self, LEFTSTRIP, args); 11171} 11172 11173 11174PyDoc_STRVAR(rstrip__doc__, 11175 "S.rstrip([chars]) -> str\n\ 11176\n\ 11177Return a copy of the string S with trailing whitespace removed.\n\ 11178If chars is given and not None, remove characters in chars instead."); 11179 11180static PyObject * 11181unicode_rstrip(PyUnicodeObject *self, PyObject *args) 11182{ 11183 if (PyTuple_GET_SIZE(args) == 0) 11184 return do_strip(self, RIGHTSTRIP); /* Common case */ 11185 else 11186 return do_argstrip(self, RIGHTSTRIP, args); 11187} 11188 11189 11190static PyObject* 11191unicode_repeat(PyUnicodeObject *str, Py_ssize_t len) 11192{ 11193 PyUnicodeObject *u; 11194 Py_ssize_t nchars, n; 11195 11196 if (len < 1) { 11197 Py_INCREF(unicode_empty); 11198 return unicode_empty; 11199 } 11200 11201 if (len == 1 && PyUnicode_CheckExact(str)) { 11202 /* no repeat, return original string */ 11203 Py_INCREF(str); 11204 return (PyObject*) str; 11205 } 11206 11207 if (PyUnicode_READY(str) == -1) 11208 return NULL; 11209 11210 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) { 11211 PyErr_SetString(PyExc_OverflowError, 11212 "repeated string is too long"); 11213 return NULL; 11214 } 11215 nchars = len * PyUnicode_GET_LENGTH(str); 11216 11217 u = (PyUnicodeObject *)PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str)); 11218 if (!u) 11219 return NULL; 11220 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str)); 11221 11222 if (PyUnicode_GET_LENGTH(str) == 1) { 11223 const int kind = PyUnicode_KIND(str); 11224 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0); 11225 void *to = PyUnicode_DATA(u); 11226 if (kind == PyUnicode_1BYTE_KIND) 11227 memset(to, (unsigned char)fill_char, len); 11228 else { 11229 for (n = 0; n < len; ++n) 11230 PyUnicode_WRITE(kind, to, n, fill_char); 11231 } 11232 } 11233 else { 11234 /* number of characters copied this far */ 11235 Py_ssize_t done = PyUnicode_GET_LENGTH(str); 11236 const Py_ssize_t char_size = PyUnicode_CHARACTER_SIZE(str); 11237 char *to = (char *) PyUnicode_DATA(u); 11238 Py_MEMCPY(to, PyUnicode_DATA(str), 11239 PyUnicode_GET_LENGTH(str) * char_size); 11240 while (done < nchars) { 11241 n = (done <= nchars-done) ? done : nchars-done; 11242 Py_MEMCPY(to + (done * char_size), to, n * char_size); 11243 done += n; 11244 } 11245 } 11246 11247 return (PyObject*) u; 11248} 11249 11250PyObject * 11251PyUnicode_Replace(PyObject *obj, 11252 PyObject *subobj, 11253 PyObject *replobj, 11254 Py_ssize_t maxcount) 11255{ 11256 PyObject *self; 11257 PyObject *str1; 11258 PyObject *str2; 11259 PyObject *result; 11260 11261 self = PyUnicode_FromObject(obj); 11262 if (self == NULL || PyUnicode_READY(self) == -1) 11263 return NULL; 11264 str1 = PyUnicode_FromObject(subobj); 11265 if (str1 == NULL || PyUnicode_READY(str1) == -1) { 11266 Py_DECREF(self); 11267 return NULL; 11268 } 11269 str2 = PyUnicode_FromObject(replobj); 11270 if (str2 == NULL || PyUnicode_READY(str2)) { 11271 Py_DECREF(self); 11272 Py_DECREF(str1); 11273 return NULL; 11274 } 11275 result = replace(self, str1, str2, maxcount); 11276 Py_DECREF(self); 11277 Py_DECREF(str1); 11278 Py_DECREF(str2); 11279 return result; 11280} 11281 11282PyDoc_STRVAR(replace__doc__, 11283 "S.replace(old, new[, count]) -> str\n\ 11284\n\ 11285Return a copy of S with all occurrences of substring\n\ 11286old replaced by new. If the optional argument count is\n\ 11287given, only the first count occurrences are replaced."); 11288 11289static PyObject* 11290unicode_replace(PyObject *self, PyObject *args) 11291{ 11292 PyObject *str1; 11293 PyObject *str2; 11294 Py_ssize_t maxcount = -1; 11295 PyObject *result; 11296 11297 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount)) 11298 return NULL; 11299 if (!PyUnicode_READY(self) == -1) 11300 return NULL; 11301 str1 = PyUnicode_FromObject(str1); 11302 if (str1 == NULL || PyUnicode_READY(str1) == -1) 11303 return NULL; 11304 str2 = PyUnicode_FromObject(str2); 11305 if (str2 == NULL || PyUnicode_READY(str2) == -1) { 11306 Py_DECREF(str1); 11307 return NULL; 11308 } 11309 11310 result = replace(self, str1, str2, maxcount); 11311 11312 Py_DECREF(str1); 11313 Py_DECREF(str2); 11314 return result; 11315} 11316 11317static PyObject * 11318unicode_repr(PyObject *unicode) 11319{ 11320 PyObject *repr; 11321 Py_ssize_t isize; 11322 Py_ssize_t osize, squote, dquote, i, o; 11323 Py_UCS4 max, quote; 11324 int ikind, okind; 11325 void *idata, *odata; 11326 11327 if (PyUnicode_READY(unicode) == -1) 11328 return NULL; 11329 11330 isize = PyUnicode_GET_LENGTH(unicode); 11331 idata = PyUnicode_DATA(unicode); 11332 11333 /* Compute length of output, quote characters, and 11334 maximum character */ 11335 osize = 2; /* quotes */ 11336 max = 127; 11337 squote = dquote = 0; 11338 ikind = PyUnicode_KIND(unicode); 11339 for (i = 0; i < isize; i++) { 11340 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i); 11341 switch (ch) { 11342 case '\'': squote++; osize++; break; 11343 case '"': dquote++; osize++; break; 11344 case '\\': case '\t': case '\r': case '\n': 11345 osize += 2; break; 11346 default: 11347 /* Fast-path ASCII */ 11348 if (ch < ' ' || ch == 0x7f) 11349 osize += 4; /* \xHH */ 11350 else if (ch < 0x7f) 11351 osize++; 11352 else if (Py_UNICODE_ISPRINTABLE(ch)) { 11353 osize++; 11354 max = ch > max ? ch : max; 11355 } 11356 else if (ch < 0x100) 11357 osize += 4; /* \xHH */ 11358 else if (ch < 0x10000) 11359 osize += 6; /* \uHHHH */ 11360 else 11361 osize += 10; /* \uHHHHHHHH */ 11362 } 11363 } 11364 11365 quote = '\''; 11366 if (squote) { 11367 if (dquote) 11368 /* Both squote and dquote present. Use squote, 11369 and escape them */ 11370 osize += squote; 11371 else 11372 quote = '"'; 11373 } 11374 11375 repr = PyUnicode_New(osize, max); 11376 if (repr == NULL) 11377 return NULL; 11378 okind = PyUnicode_KIND(repr); 11379 odata = PyUnicode_DATA(repr); 11380 11381 PyUnicode_WRITE(okind, odata, 0, quote); 11382 PyUnicode_WRITE(okind, odata, osize-1, quote); 11383 11384 for (i = 0, o = 1; i < isize; i++) { 11385 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i); 11386 11387 /* Escape quotes and backslashes */ 11388 if ((ch == quote) || (ch == '\\')) { 11389 PyUnicode_WRITE(okind, odata, o++, '\\'); 11390 PyUnicode_WRITE(okind, odata, o++, ch); 11391 continue; 11392 } 11393 11394 /* Map special whitespace to '\t', \n', '\r' */ 11395 if (ch == '\t') { 11396 PyUnicode_WRITE(okind, odata, o++, '\\'); 11397 PyUnicode_WRITE(okind, odata, o++, 't'); 11398 } 11399 else if (ch == '\n') { 11400 PyUnicode_WRITE(okind, odata, o++, '\\'); 11401 PyUnicode_WRITE(okind, odata, o++, 'n'); 11402 } 11403 else if (ch == '\r') { 11404 PyUnicode_WRITE(okind, odata, o++, '\\'); 11405 PyUnicode_WRITE(okind, odata, o++, 'r'); 11406 } 11407 11408 /* Map non-printable US ASCII to '\xhh' */ 11409 else if (ch < ' ' || ch == 0x7F) { 11410 PyUnicode_WRITE(okind, odata, o++, '\\'); 11411 PyUnicode_WRITE(okind, odata, o++, 'x'); 11412 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0x000F]); 11413 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0x000F]); 11414 } 11415 11416 /* Copy ASCII characters as-is */ 11417 else if (ch < 0x7F) { 11418 PyUnicode_WRITE(okind, odata, o++, ch); 11419 } 11420 11421 /* Non-ASCII characters */ 11422 else { 11423 /* Map Unicode whitespace and control characters 11424 (categories Z* and C* except ASCII space) 11425 */ 11426 if (!Py_UNICODE_ISPRINTABLE(ch)) { 11427 /* Map 8-bit characters to '\xhh' */ 11428 if (ch <= 0xff) { 11429 PyUnicode_WRITE(okind, odata, o++, '\\'); 11430 PyUnicode_WRITE(okind, odata, o++, 'x'); 11431 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0x000F]); 11432 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0x000F]); 11433 } 11434 /* Map 21-bit characters to '\U00xxxxxx' */ 11435 else if (ch >= 0x10000) { 11436 PyUnicode_WRITE(okind, odata, o++, '\\'); 11437 PyUnicode_WRITE(okind, odata, o++, 'U'); 11438 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 28) & 0xF]); 11439 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 24) & 0xF]); 11440 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 20) & 0xF]); 11441 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 16) & 0xF]); 11442 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 12) & 0xF]); 11443 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 8) & 0xF]); 11444 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0xF]); 11445 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0xF]); 11446 } 11447 /* Map 16-bit characters to '\uxxxx' */ 11448 else { 11449 PyUnicode_WRITE(okind, odata, o++, '\\'); 11450 PyUnicode_WRITE(okind, odata, o++, 'u'); 11451 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 12) & 0xF]); 11452 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 8) & 0xF]); 11453 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0xF]); 11454 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0xF]); 11455 } 11456 } 11457 /* Copy characters as-is */ 11458 else { 11459 PyUnicode_WRITE(okind, odata, o++, ch); 11460 } 11461 } 11462 } 11463 /* Closing quote already added at the beginning */ 11464 return repr; 11465} 11466 11467PyDoc_STRVAR(rfind__doc__, 11468 "S.rfind(sub[, start[, end]]) -> int\n\ 11469\n\ 11470Return the highest index in S where substring sub is found,\n\ 11471such that sub is contained within S[start:end]. Optional\n\ 11472arguments start and end are interpreted as in slice notation.\n\ 11473\n\ 11474Return -1 on failure."); 11475 11476static PyObject * 11477unicode_rfind(PyObject *self, PyObject *args) 11478{ 11479 PyUnicodeObject *substring; 11480 Py_ssize_t start; 11481 Py_ssize_t end; 11482 Py_ssize_t result; 11483 11484 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring, 11485 &start, &end)) 11486 return NULL; 11487 11488 if (PyUnicode_READY(self) == -1) 11489 return NULL; 11490 if (PyUnicode_READY(substring) == -1) 11491 return NULL; 11492 11493 result = any_find_slice( 11494 ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice, 11495 self, (PyObject*)substring, start, end 11496 ); 11497 11498 Py_DECREF(substring); 11499 11500 if (result == -2) 11501 return NULL; 11502 11503 return PyLong_FromSsize_t(result); 11504} 11505 11506PyDoc_STRVAR(rindex__doc__, 11507 "S.rindex(sub[, start[, end]]) -> int\n\ 11508\n\ 11509Like S.rfind() but raise ValueError when the substring is not found."); 11510 11511static PyObject * 11512unicode_rindex(PyObject *self, PyObject *args) 11513{ 11514 PyUnicodeObject *substring; 11515 Py_ssize_t start; 11516 Py_ssize_t end; 11517 Py_ssize_t result; 11518 11519 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring, 11520 &start, &end)) 11521 return NULL; 11522 11523 if (PyUnicode_READY(self) == -1) 11524 return NULL; 11525 if (PyUnicode_READY(substring) == -1) 11526 return NULL; 11527 11528 result = any_find_slice( 11529 ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice, 11530 self, (PyObject*)substring, start, end 11531 ); 11532 11533 Py_DECREF(substring); 11534 11535 if (result == -2) 11536 return NULL; 11537 11538 if (result < 0) { 11539 PyErr_SetString(PyExc_ValueError, "substring not found"); 11540 return NULL; 11541 } 11542 11543 return PyLong_FromSsize_t(result); 11544} 11545 11546PyDoc_STRVAR(rjust__doc__, 11547 "S.rjust(width[, fillchar]) -> str\n\ 11548\n\ 11549Return S right-justified in a string of length width. Padding is\n\ 11550done using the specified fill character (default is a space)."); 11551 11552static PyObject * 11553unicode_rjust(PyUnicodeObject *self, PyObject *args) 11554{ 11555 Py_ssize_t width; 11556 Py_UCS4 fillchar = ' '; 11557 11558 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar)) 11559 return NULL; 11560 11561 if (PyUnicode_READY(self) == -1) 11562 return NULL; 11563 11564 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) { 11565 Py_INCREF(self); 11566 return (PyObject*) self; 11567 } 11568 11569 return (PyObject*) pad(self, width - _PyUnicode_LENGTH(self), 0, fillchar); 11570} 11571 11572PyObject * 11573PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit) 11574{ 11575 PyObject *result; 11576 11577 s = PyUnicode_FromObject(s); 11578 if (s == NULL) 11579 return NULL; 11580 if (sep != NULL) { 11581 sep = PyUnicode_FromObject(sep); 11582 if (sep == NULL) { 11583 Py_DECREF(s); 11584 return NULL; 11585 } 11586 } 11587 11588 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit); 11589 11590 Py_DECREF(s); 11591 Py_XDECREF(sep); 11592 return result; 11593} 11594 11595PyDoc_STRVAR(split__doc__, 11596 "S.split([sep[, maxsplit]]) -> list of strings\n\ 11597\n\ 11598Return a list of the words in S, using sep as the\n\ 11599delimiter string. If maxsplit is given, at most maxsplit\n\ 11600splits are done. If sep is not specified or is None, any\n\ 11601whitespace string is a separator and empty strings are\n\ 11602removed from the result."); 11603 11604static PyObject* 11605unicode_split(PyUnicodeObject *self, PyObject *args) 11606{ 11607 PyObject *substring = Py_None; 11608 Py_ssize_t maxcount = -1; 11609 11610 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount)) 11611 return NULL; 11612 11613 if (substring == Py_None) 11614 return split(self, NULL, maxcount); 11615 else if (PyUnicode_Check(substring)) 11616 return split(self, (PyUnicodeObject *)substring, maxcount); 11617 else 11618 return PyUnicode_Split((PyObject *)self, substring, maxcount); 11619} 11620 11621PyObject * 11622PyUnicode_Partition(PyObject *str_in, PyObject *sep_in) 11623{ 11624 PyObject* str_obj; 11625 PyObject* sep_obj; 11626 PyObject* out; 11627 int kind1, kind2, kind; 11628 void *buf1 = NULL, *buf2 = NULL; 11629 Py_ssize_t len1, len2; 11630 11631 str_obj = PyUnicode_FromObject(str_in); 11632 if (!str_obj || PyUnicode_READY(str_obj) == -1) 11633 return NULL; 11634 sep_obj = PyUnicode_FromObject(sep_in); 11635 if (!sep_obj || PyUnicode_READY(sep_obj) == -1) { 11636 Py_DECREF(str_obj); 11637 return NULL; 11638 } 11639 11640 kind1 = PyUnicode_KIND(str_in); 11641 kind2 = PyUnicode_KIND(sep_obj); 11642 kind = kind1 > kind2 ? kind1 : kind2; 11643 buf1 = PyUnicode_DATA(str_in); 11644 if (kind1 != kind) 11645 buf1 = _PyUnicode_AsKind(str_in, kind); 11646 if (!buf1) 11647 goto onError; 11648 buf2 = PyUnicode_DATA(sep_obj); 11649 if (kind2 != kind) 11650 buf2 = _PyUnicode_AsKind(sep_obj, kind); 11651 if (!buf2) 11652 goto onError; 11653 len1 = PyUnicode_GET_LENGTH(str_obj); 11654 len2 = PyUnicode_GET_LENGTH(sep_obj); 11655 11656 switch(PyUnicode_KIND(str_in)) { 11657 case PyUnicode_1BYTE_KIND: 11658 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2); 11659 break; 11660 case PyUnicode_2BYTE_KIND: 11661 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2); 11662 break; 11663 case PyUnicode_4BYTE_KIND: 11664 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2); 11665 break; 11666 default: 11667 assert(0); 11668 out = 0; 11669 } 11670 11671 Py_DECREF(sep_obj); 11672 Py_DECREF(str_obj); 11673 if (kind1 != kind) 11674 PyMem_Free(buf1); 11675 if (kind2 != kind) 11676 PyMem_Free(buf2); 11677 11678 return out; 11679 onError: 11680 Py_DECREF(sep_obj); 11681 Py_DECREF(str_obj); 11682 if (kind1 != kind && buf1) 11683 PyMem_Free(buf1); 11684 if (kind2 != kind && buf2) 11685 PyMem_Free(buf2); 11686 return NULL; 11687} 11688 11689 11690PyObject * 11691PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in) 11692{ 11693 PyObject* str_obj; 11694 PyObject* sep_obj; 11695 PyObject* out; 11696 int kind1, kind2, kind; 11697 void *buf1 = NULL, *buf2 = NULL; 11698 Py_ssize_t len1, len2; 11699 11700 str_obj = PyUnicode_FromObject(str_in); 11701 if (!str_obj) 11702 return NULL; 11703 sep_obj = PyUnicode_FromObject(sep_in); 11704 if (!sep_obj) { 11705 Py_DECREF(str_obj); 11706 return NULL; 11707 } 11708 11709 kind1 = PyUnicode_KIND(str_in); 11710 kind2 = PyUnicode_KIND(sep_obj); 11711 kind = Py_MAX(kind1, kind2); 11712 buf1 = PyUnicode_DATA(str_in); 11713 if (kind1 != kind) 11714 buf1 = _PyUnicode_AsKind(str_in, kind); 11715 if (!buf1) 11716 goto onError; 11717 buf2 = PyUnicode_DATA(sep_obj); 11718 if (kind2 != kind) 11719 buf2 = _PyUnicode_AsKind(sep_obj, kind); 11720 if (!buf2) 11721 goto onError; 11722 len1 = PyUnicode_GET_LENGTH(str_obj); 11723 len2 = PyUnicode_GET_LENGTH(sep_obj); 11724 11725 switch(PyUnicode_KIND(str_in)) { 11726 case PyUnicode_1BYTE_KIND: 11727 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2); 11728 break; 11729 case PyUnicode_2BYTE_KIND: 11730 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2); 11731 break; 11732 case PyUnicode_4BYTE_KIND: 11733 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2); 11734 break; 11735 default: 11736 assert(0); 11737 out = 0; 11738 } 11739 11740 Py_DECREF(sep_obj); 11741 Py_DECREF(str_obj); 11742 if (kind1 != kind) 11743 PyMem_Free(buf1); 11744 if (kind2 != kind) 11745 PyMem_Free(buf2); 11746 11747 return out; 11748 onError: 11749 Py_DECREF(sep_obj); 11750 Py_DECREF(str_obj); 11751 if (kind1 != kind && buf1) 11752 PyMem_Free(buf1); 11753 if (kind2 != kind && buf2) 11754 PyMem_Free(buf2); 11755 return NULL; 11756} 11757 11758PyDoc_STRVAR(partition__doc__, 11759 "S.partition(sep) -> (head, sep, tail)\n\ 11760\n\ 11761Search for the separator sep in S, and return the part before it,\n\ 11762the separator itself, and the part after it. If the separator is not\n\ 11763found, return S and two empty strings."); 11764 11765static PyObject* 11766unicode_partition(PyUnicodeObject *self, PyObject *separator) 11767{ 11768 return PyUnicode_Partition((PyObject *)self, separator); 11769} 11770 11771PyDoc_STRVAR(rpartition__doc__, 11772 "S.rpartition(sep) -> (head, sep, tail)\n\ 11773\n\ 11774Search for the separator sep in S, starting at the end of S, and return\n\ 11775the part before it, the separator itself, and the part after it. If the\n\ 11776separator is not found, return two empty strings and S."); 11777 11778static PyObject* 11779unicode_rpartition(PyUnicodeObject *self, PyObject *separator) 11780{ 11781 return PyUnicode_RPartition((PyObject *)self, separator); 11782} 11783 11784PyObject * 11785PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit) 11786{ 11787 PyObject *result; 11788 11789 s = PyUnicode_FromObject(s); 11790 if (s == NULL) 11791 return NULL; 11792 if (sep != NULL) { 11793 sep = PyUnicode_FromObject(sep); 11794 if (sep == NULL) { 11795 Py_DECREF(s); 11796 return NULL; 11797 } 11798 } 11799 11800 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit); 11801 11802 Py_DECREF(s); 11803 Py_XDECREF(sep); 11804 return result; 11805} 11806 11807PyDoc_STRVAR(rsplit__doc__, 11808 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\ 11809\n\ 11810Return a list of the words in S, using sep as the\n\ 11811delimiter string, starting at the end of the string and\n\ 11812working to the front. If maxsplit is given, at most maxsplit\n\ 11813splits are done. If sep is not specified, any whitespace string\n\ 11814is a separator."); 11815 11816static PyObject* 11817unicode_rsplit(PyUnicodeObject *self, PyObject *args) 11818{ 11819 PyObject *substring = Py_None; 11820 Py_ssize_t maxcount = -1; 11821 11822 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount)) 11823 return NULL; 11824 11825 if (substring == Py_None) 11826 return rsplit(self, NULL, maxcount); 11827 else if (PyUnicode_Check(substring)) 11828 return rsplit(self, (PyUnicodeObject *)substring, maxcount); 11829 else 11830 return PyUnicode_RSplit((PyObject *)self, substring, maxcount); 11831} 11832 11833PyDoc_STRVAR(splitlines__doc__, 11834 "S.splitlines([keepends]) -> list of strings\n\ 11835\n\ 11836Return a list of the lines in S, breaking at line boundaries.\n\ 11837Line breaks are not included in the resulting list unless keepends\n\ 11838is given and true."); 11839 11840static PyObject* 11841unicode_splitlines(PyUnicodeObject *self, PyObject *args, PyObject *kwds) 11842{ 11843 static char *kwlist[] = {"keepends", 0}; 11844 int keepends = 0; 11845 11846 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines", 11847 kwlist, &keepends)) 11848 return NULL; 11849 11850 return PyUnicode_Splitlines((PyObject *)self, keepends); 11851} 11852 11853static 11854PyObject *unicode_str(PyObject *self) 11855{ 11856 if (PyUnicode_CheckExact(self)) { 11857 Py_INCREF(self); 11858 return self; 11859 } else 11860 /* Subtype -- return genuine unicode string with the same value. */ 11861 return PyUnicode_Copy(self); 11862} 11863 11864PyDoc_STRVAR(swapcase__doc__, 11865 "S.swapcase() -> str\n\ 11866\n\ 11867Return a copy of S with uppercase characters converted to lowercase\n\ 11868and vice versa."); 11869 11870static PyObject* 11871unicode_swapcase(PyUnicodeObject *self) 11872{ 11873 return fixup(self, fixswapcase); 11874} 11875 11876PyDoc_STRVAR(maketrans__doc__, 11877 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\ 11878\n\ 11879Return a translation table usable for str.translate().\n\ 11880If there is only one argument, it must be a dictionary mapping Unicode\n\ 11881ordinals (integers) or characters to Unicode ordinals, strings or None.\n\ 11882Character keys will be then converted to ordinals.\n\ 11883If there are two arguments, they must be strings of equal length, and\n\ 11884in the resulting dictionary, each character in x will be mapped to the\n\ 11885character at the same position in y. If there is a third argument, it\n\ 11886must be a string, whose characters will be mapped to None in the result."); 11887 11888static PyObject* 11889unicode_maketrans(PyUnicodeObject *null, PyObject *args) 11890{ 11891 PyObject *x, *y = NULL, *z = NULL; 11892 PyObject *new = NULL, *key, *value; 11893 Py_ssize_t i = 0; 11894 int res; 11895 11896 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z)) 11897 return NULL; 11898 new = PyDict_New(); 11899 if (!new) 11900 return NULL; 11901 if (y != NULL) { 11902 int x_kind, y_kind, z_kind; 11903 void *x_data, *y_data, *z_data; 11904 11905 /* x must be a string too, of equal length */ 11906 if (!PyUnicode_Check(x)) { 11907 PyErr_SetString(PyExc_TypeError, "first maketrans argument must " 11908 "be a string if there is a second argument"); 11909 goto err; 11910 } 11911 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) { 11912 PyErr_SetString(PyExc_ValueError, "the first two maketrans " 11913 "arguments must have equal length"); 11914 goto err; 11915 } 11916 /* create entries for translating chars in x to those in y */ 11917 x_kind = PyUnicode_KIND(x); 11918 y_kind = PyUnicode_KIND(y); 11919 x_data = PyUnicode_DATA(x); 11920 y_data = PyUnicode_DATA(y); 11921 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) { 11922 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i)); 11923 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i)); 11924 if (!key || !value) 11925 goto err; 11926 res = PyDict_SetItem(new, key, value); 11927 Py_DECREF(key); 11928 Py_DECREF(value); 11929 if (res < 0) 11930 goto err; 11931 } 11932 /* create entries for deleting chars in z */ 11933 if (z != NULL) { 11934 z_kind = PyUnicode_KIND(z); 11935 z_data = PyUnicode_DATA(z); 11936 for (i = 0; i < PyUnicode_GET_SIZE(z); i++) { 11937 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i)); 11938 if (!key) 11939 goto err; 11940 res = PyDict_SetItem(new, key, Py_None); 11941 Py_DECREF(key); 11942 if (res < 0) 11943 goto err; 11944 } 11945 } 11946 } else { 11947 int kind; 11948 void *data; 11949 11950 /* x must be a dict */ 11951 if (!PyDict_CheckExact(x)) { 11952 PyErr_SetString(PyExc_TypeError, "if you give only one argument " 11953 "to maketrans it must be a dict"); 11954 goto err; 11955 } 11956 /* copy entries into the new dict, converting string keys to int keys */ 11957 while (PyDict_Next(x, &i, &key, &value)) { 11958 if (PyUnicode_Check(key)) { 11959 /* convert string keys to integer keys */ 11960 PyObject *newkey; 11961 if (PyUnicode_GET_SIZE(key) != 1) { 11962 PyErr_SetString(PyExc_ValueError, "string keys in translate " 11963 "table must be of length 1"); 11964 goto err; 11965 } 11966 kind = PyUnicode_KIND(key); 11967 data = PyUnicode_DATA(key); 11968 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0)); 11969 if (!newkey) 11970 goto err; 11971 res = PyDict_SetItem(new, newkey, value); 11972 Py_DECREF(newkey); 11973 if (res < 0) 11974 goto err; 11975 } else if (PyLong_Check(key)) { 11976 /* just keep integer keys */ 11977 if (PyDict_SetItem(new, key, value) < 0) 11978 goto err; 11979 } else { 11980 PyErr_SetString(PyExc_TypeError, "keys in translate table must " 11981 "be strings or integers"); 11982 goto err; 11983 } 11984 } 11985 } 11986 return new; 11987 err: 11988 Py_DECREF(new); 11989 return NULL; 11990} 11991 11992PyDoc_STRVAR(translate__doc__, 11993 "S.translate(table) -> str\n\ 11994\n\ 11995Return a copy of the string S, where all characters have been mapped\n\ 11996through the given translation table, which must be a mapping of\n\ 11997Unicode ordinals to Unicode ordinals, strings, or None.\n\ 11998Unmapped characters are left untouched. Characters mapped to None\n\ 11999are deleted."); 12000 12001static PyObject* 12002unicode_translate(PyObject *self, PyObject *table) 12003{ 12004 return _PyUnicode_TranslateCharmap(self, table, "ignore"); 12005} 12006 12007PyDoc_STRVAR(upper__doc__, 12008 "S.upper() -> str\n\ 12009\n\ 12010Return a copy of S converted to uppercase."); 12011 12012static PyObject* 12013unicode_upper(PyUnicodeObject *self) 12014{ 12015 return fixup(self, fixupper); 12016} 12017 12018PyDoc_STRVAR(zfill__doc__, 12019 "S.zfill(width) -> str\n\ 12020\n\ 12021Pad a numeric string S with zeros on the left, to fill a field\n\ 12022of the specified width. The string S is never truncated."); 12023 12024static PyObject * 12025unicode_zfill(PyUnicodeObject *self, PyObject *args) 12026{ 12027 Py_ssize_t fill; 12028 PyUnicodeObject *u; 12029 Py_ssize_t width; 12030 int kind; 12031 void *data; 12032 Py_UCS4 chr; 12033 12034 if (PyUnicode_READY(self) == -1) 12035 return NULL; 12036 12037 if (!PyArg_ParseTuple(args, "n:zfill", &width)) 12038 return NULL; 12039 12040 if (PyUnicode_GET_LENGTH(self) >= width) { 12041 if (PyUnicode_CheckExact(self)) { 12042 Py_INCREF(self); 12043 return (PyObject*) self; 12044 } 12045 else 12046 return PyUnicode_Copy((PyObject*)self); 12047 } 12048 12049 fill = width - _PyUnicode_LENGTH(self); 12050 12051 u = pad(self, fill, 0, '0'); 12052 12053 if (u == NULL) 12054 return NULL; 12055 12056 kind = PyUnicode_KIND(u); 12057 data = PyUnicode_DATA(u); 12058 chr = PyUnicode_READ(kind, data, fill); 12059 12060 if (chr == '+' || chr == '-') { 12061 /* move sign to beginning of string */ 12062 PyUnicode_WRITE(kind, data, 0, chr); 12063 PyUnicode_WRITE(kind, data, fill, '0'); 12064 } 12065 12066 return (PyObject*) u; 12067} 12068 12069#if 0 12070static PyObject * 12071unicode__decimal2ascii(PyObject *self) 12072{ 12073 return PyUnicode_TransformDecimalAndSpaceToASCII(self); 12074} 12075#endif 12076 12077PyDoc_STRVAR(startswith__doc__, 12078 "S.startswith(prefix[, start[, end]]) -> bool\n\ 12079\n\ 12080Return True if S starts with the specified prefix, False otherwise.\n\ 12081With optional start, test S beginning at that position.\n\ 12082With optional end, stop comparing S at that position.\n\ 12083prefix can also be a tuple of strings to try."); 12084 12085static PyObject * 12086unicode_startswith(PyUnicodeObject *self, 12087 PyObject *args) 12088{ 12089 PyObject *subobj; 12090 PyUnicodeObject *substring; 12091 Py_ssize_t start = 0; 12092 Py_ssize_t end = PY_SSIZE_T_MAX; 12093 int result; 12094 12095 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end)) 12096 return NULL; 12097 if (PyTuple_Check(subobj)) { 12098 Py_ssize_t i; 12099 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) { 12100 substring = (PyUnicodeObject *)PyUnicode_FromObject( 12101 PyTuple_GET_ITEM(subobj, i)); 12102 if (substring == NULL) 12103 return NULL; 12104 result = tailmatch(self, substring, start, end, -1); 12105 Py_DECREF(substring); 12106 if (result) { 12107 Py_RETURN_TRUE; 12108 } 12109 } 12110 /* nothing matched */ 12111 Py_RETURN_FALSE; 12112 } 12113 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj); 12114 if (substring == NULL) { 12115 if (PyErr_ExceptionMatches(PyExc_TypeError)) 12116 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or " 12117 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name); 12118 return NULL; 12119 } 12120 result = tailmatch(self, substring, start, end, -1); 12121 Py_DECREF(substring); 12122 return PyBool_FromLong(result); 12123} 12124 12125 12126PyDoc_STRVAR(endswith__doc__, 12127 "S.endswith(suffix[, start[, end]]) -> bool\n\ 12128\n\ 12129Return True if S ends with the specified suffix, False otherwise.\n\ 12130With optional start, test S beginning at that position.\n\ 12131With optional end, stop comparing S at that position.\n\ 12132suffix can also be a tuple of strings to try."); 12133 12134static PyObject * 12135unicode_endswith(PyUnicodeObject *self, 12136 PyObject *args) 12137{ 12138 PyObject *subobj; 12139 PyUnicodeObject *substring; 12140 Py_ssize_t start = 0; 12141 Py_ssize_t end = PY_SSIZE_T_MAX; 12142 int result; 12143 12144 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end)) 12145 return NULL; 12146 if (PyTuple_Check(subobj)) { 12147 Py_ssize_t i; 12148 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) { 12149 substring = (PyUnicodeObject *)PyUnicode_FromObject( 12150 PyTuple_GET_ITEM(subobj, i)); 12151 if (substring == NULL) 12152 return NULL; 12153 result = tailmatch(self, substring, start, end, +1); 12154 Py_DECREF(substring); 12155 if (result) { 12156 Py_RETURN_TRUE; 12157 } 12158 } 12159 Py_RETURN_FALSE; 12160 } 12161 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj); 12162 if (substring == NULL) { 12163 if (PyErr_ExceptionMatches(PyExc_TypeError)) 12164 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or " 12165 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name); 12166 return NULL; 12167 } 12168 result = tailmatch(self, substring, start, end, +1); 12169 Py_DECREF(substring); 12170 return PyBool_FromLong(result); 12171} 12172 12173#include "stringlib/unicode_format.h" 12174 12175PyDoc_STRVAR(format__doc__, 12176 "S.format(*args, **kwargs) -> str\n\ 12177\n\ 12178Return a formatted version of S, using substitutions from args and kwargs.\n\ 12179The substitutions are identified by braces ('{' and '}')."); 12180 12181PyDoc_STRVAR(format_map__doc__, 12182 "S.format_map(mapping) -> str\n\ 12183\n\ 12184Return a formatted version of S, using substitutions from mapping.\n\ 12185The substitutions are identified by braces ('{' and '}')."); 12186 12187static PyObject * 12188unicode__format__(PyObject* self, PyObject* args) 12189{ 12190 PyObject *format_spec; 12191 12192 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec)) 12193 return NULL; 12194 12195 return _PyUnicode_FormatAdvanced(self, format_spec, 0, 12196 PyUnicode_GET_LENGTH(format_spec)); 12197} 12198 12199PyDoc_STRVAR(p_format__doc__, 12200 "S.__format__(format_spec) -> str\n\ 12201\n\ 12202Return a formatted version of S as described by format_spec."); 12203 12204static PyObject * 12205unicode__sizeof__(PyUnicodeObject *v) 12206{ 12207 Py_ssize_t size; 12208 12209 /* If it's a compact object, account for base structure + 12210 character data. */ 12211 if (PyUnicode_IS_COMPACT_ASCII(v)) 12212 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1; 12213 else if (PyUnicode_IS_COMPACT(v)) 12214 size = sizeof(PyCompactUnicodeObject) + 12215 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_CHARACTER_SIZE(v); 12216 else { 12217 /* If it is a two-block object, account for base object, and 12218 for character block if present. */ 12219 size = sizeof(PyUnicodeObject); 12220 if (_PyUnicode_DATA_ANY(v)) 12221 size += (PyUnicode_GET_LENGTH(v) + 1) * 12222 PyUnicode_CHARACTER_SIZE(v); 12223 } 12224 /* If the wstr pointer is present, account for it unless it is shared 12225 with the data pointer. Check if the data is not shared. */ 12226 if (_PyUnicode_HAS_WSTR_MEMORY(v)) 12227 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t); 12228 if (_PyUnicode_HAS_UTF8_MEMORY(v)) 12229 size += PyUnicode_UTF8_LENGTH(v) + 1; 12230 12231 return PyLong_FromSsize_t(size); 12232} 12233 12234PyDoc_STRVAR(sizeof__doc__, 12235 "S.__sizeof__() -> size of S in memory, in bytes"); 12236 12237static PyObject * 12238unicode_getnewargs(PyObject *v) 12239{ 12240 PyObject *copy = PyUnicode_Copy(v); 12241 if (!copy) 12242 return NULL; 12243 return Py_BuildValue("(N)", copy); 12244} 12245 12246static PyMethodDef unicode_methods[] = { 12247 12248 /* Order is according to common usage: often used methods should 12249 appear first, since lookup is done sequentially. */ 12250 12251 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__}, 12252 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__}, 12253 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__}, 12254 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__}, 12255 {"join", (PyCFunction) unicode_join, METH_O, join__doc__}, 12256 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__}, 12257 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__}, 12258 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__}, 12259 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__}, 12260 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__}, 12261 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__}, 12262 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__}, 12263 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__}, 12264 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__}, 12265 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__}, 12266 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__}, 12267 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__}, 12268 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__}, 12269 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__}, 12270 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__}, 12271 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__}, 12272 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__}, 12273 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__}, 12274 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__}, 12275 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__}, 12276 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__}, 12277 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__}, 12278 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__}, 12279 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__}, 12280 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__}, 12281 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__}, 12282 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__}, 12283 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__}, 12284 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__}, 12285 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__}, 12286 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__}, 12287 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__}, 12288 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__}, 12289 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__}, 12290 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__}, 12291 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__}, 12292 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__}, 12293 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__}, 12294 {"maketrans", (PyCFunction) unicode_maketrans, 12295 METH_VARARGS | METH_STATIC, maketrans__doc__}, 12296 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__}, 12297#if 0 12298 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__}, 12299#endif 12300 12301#if 0 12302 /* These methods are just used for debugging the implementation. */ 12303 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS}, 12304#endif 12305 12306 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS}, 12307 {NULL, NULL} 12308}; 12309 12310static PyObject * 12311unicode_mod(PyObject *v, PyObject *w) 12312{ 12313 if (!PyUnicode_Check(v)) 12314 Py_RETURN_NOTIMPLEMENTED; 12315 return PyUnicode_Format(v, w); 12316} 12317 12318static PyNumberMethods unicode_as_number = { 12319 0, /*nb_add*/ 12320 0, /*nb_subtract*/ 12321 0, /*nb_multiply*/ 12322 unicode_mod, /*nb_remainder*/ 12323}; 12324 12325static PySequenceMethods unicode_as_sequence = { 12326 (lenfunc) unicode_length, /* sq_length */ 12327 PyUnicode_Concat, /* sq_concat */ 12328 (ssizeargfunc) unicode_repeat, /* sq_repeat */ 12329 (ssizeargfunc) unicode_getitem, /* sq_item */ 12330 0, /* sq_slice */ 12331 0, /* sq_ass_item */ 12332 0, /* sq_ass_slice */ 12333 PyUnicode_Contains, /* sq_contains */ 12334}; 12335 12336static PyObject* 12337unicode_subscript(PyUnicodeObject* self, PyObject* item) 12338{ 12339 if (PyUnicode_READY(self) == -1) 12340 return NULL; 12341 12342 if (PyIndex_Check(item)) { 12343 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError); 12344 if (i == -1 && PyErr_Occurred()) 12345 return NULL; 12346 if (i < 0) 12347 i += PyUnicode_GET_LENGTH(self); 12348 return unicode_getitem((PyObject*)self, i); 12349 } else if (PySlice_Check(item)) { 12350 Py_ssize_t start, stop, step, slicelength, cur, i; 12351 PyObject *result; 12352 void *src_data, *dest_data; 12353 int src_kind, dest_kind; 12354 Py_UCS4 ch, max_char, kind_limit; 12355 12356 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self), 12357 &start, &stop, &step, &slicelength) < 0) { 12358 return NULL; 12359 } 12360 12361 if (slicelength <= 0) { 12362 return PyUnicode_New(0, 0); 12363 } else if (start == 0 && step == 1 && 12364 slicelength == PyUnicode_GET_LENGTH(self) && 12365 PyUnicode_CheckExact(self)) { 12366 Py_INCREF(self); 12367 return (PyObject *)self; 12368 } else if (step == 1) { 12369 return PyUnicode_Substring((PyObject*)self, 12370 start, start + slicelength); 12371 } 12372 /* General case */ 12373 max_char = 0; 12374 src_kind = PyUnicode_KIND(self); 12375 kind_limit = kind_maxchar_limit(src_kind); 12376 src_data = PyUnicode_DATA(self); 12377 for (cur = start, i = 0; i < slicelength; cur += step, i++) { 12378 ch = PyUnicode_READ(src_kind, src_data, cur); 12379 if (ch > max_char) { 12380 max_char = ch; 12381 if (max_char >= kind_limit) 12382 break; 12383 } 12384 } 12385 result = PyUnicode_New(slicelength, max_char); 12386 if (result == NULL) 12387 return NULL; 12388 dest_kind = PyUnicode_KIND(result); 12389 dest_data = PyUnicode_DATA(result); 12390 12391 for (cur = start, i = 0; i < slicelength; cur += step, i++) { 12392 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur); 12393 PyUnicode_WRITE(dest_kind, dest_data, i, ch); 12394 } 12395 return result; 12396 } else { 12397 PyErr_SetString(PyExc_TypeError, "string indices must be integers"); 12398 return NULL; 12399 } 12400} 12401 12402static PyMappingMethods unicode_as_mapping = { 12403 (lenfunc)unicode_length, /* mp_length */ 12404 (binaryfunc)unicode_subscript, /* mp_subscript */ 12405 (objobjargproc)0, /* mp_ass_subscript */ 12406}; 12407 12408 12409/* Helpers for PyUnicode_Format() */ 12410 12411static PyObject * 12412getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx) 12413{ 12414 Py_ssize_t argidx = *p_argidx; 12415 if (argidx < arglen) { 12416 (*p_argidx)++; 12417 if (arglen < 0) 12418 return args; 12419 else 12420 return PyTuple_GetItem(args, argidx); 12421 } 12422 PyErr_SetString(PyExc_TypeError, 12423 "not enough arguments for format string"); 12424 return NULL; 12425} 12426 12427/* Returns a new reference to a PyUnicode object, or NULL on failure. */ 12428 12429static PyObject * 12430formatfloat(PyObject *v, int flags, int prec, int type) 12431{ 12432 char *p; 12433 PyObject *result; 12434 double x; 12435 12436 x = PyFloat_AsDouble(v); 12437 if (x == -1.0 && PyErr_Occurred()) 12438 return NULL; 12439 12440 if (prec < 0) 12441 prec = 6; 12442 12443 p = PyOS_double_to_string(x, type, prec, 12444 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL); 12445 if (p == NULL) 12446 return NULL; 12447 result = PyUnicode_DecodeASCII(p, strlen(p), NULL); 12448 PyMem_Free(p); 12449 return result; 12450} 12451 12452static PyObject* 12453formatlong(PyObject *val, int flags, int prec, int type) 12454{ 12455 char *buf; 12456 int len; 12457 PyObject *str; /* temporary string object. */ 12458 PyObject *result; 12459 12460 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len); 12461 if (!str) 12462 return NULL; 12463 result = PyUnicode_DecodeASCII(buf, len, NULL); 12464 Py_DECREF(str); 12465 return result; 12466} 12467 12468static int 12469formatchar(Py_UCS4 *buf, 12470 size_t buflen, 12471 PyObject *v) 12472{ 12473 /* presume that the buffer is at least 3 characters long */ 12474 if (PyUnicode_Check(v)) { 12475 if (PyUnicode_GET_LENGTH(v) == 1) { 12476 buf[0] = PyUnicode_READ_CHAR(v, 0); 12477 buf[1] = '\0'; 12478 return 1; 12479 } 12480 goto onError; 12481 } 12482 else { 12483 /* Integer input truncated to a character */ 12484 long x; 12485 x = PyLong_AsLong(v); 12486 if (x == -1 && PyErr_Occurred()) 12487 goto onError; 12488 12489 if (x < 0 || x > 0x10ffff) { 12490 PyErr_SetString(PyExc_OverflowError, 12491 "%c arg not in range(0x110000)"); 12492 return -1; 12493 } 12494 12495 buf[0] = (Py_UCS4) x; 12496 buf[1] = '\0'; 12497 return 1; 12498 } 12499 12500 onError: 12501 PyErr_SetString(PyExc_TypeError, 12502 "%c requires int or char"); 12503 return -1; 12504} 12505 12506/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...) 12507 FORMATBUFLEN is the length of the buffer in which chars are formatted. 12508*/ 12509#define FORMATBUFLEN (size_t)10 12510 12511PyObject * 12512PyUnicode_Format(PyObject *format, PyObject *args) 12513{ 12514 void *fmt; 12515 int fmtkind; 12516 PyObject *result; 12517 Py_UCS4 *res, *res0; 12518 Py_UCS4 max; 12519 int kind; 12520 Py_ssize_t fmtcnt, fmtpos, rescnt, reslen, arglen, argidx; 12521 int args_owned = 0; 12522 PyObject *dict = NULL; 12523 PyUnicodeObject *uformat; 12524 12525 if (format == NULL || args == NULL) { 12526 PyErr_BadInternalCall(); 12527 return NULL; 12528 } 12529 uformat = (PyUnicodeObject*)PyUnicode_FromObject(format); 12530 if (uformat == NULL || PyUnicode_READY(uformat) == -1) 12531 return NULL; 12532 fmt = PyUnicode_DATA(uformat); 12533 fmtkind = PyUnicode_KIND(uformat); 12534 fmtcnt = PyUnicode_GET_LENGTH(uformat); 12535 fmtpos = 0; 12536 12537 reslen = rescnt = fmtcnt + 100; 12538 res = res0 = PyMem_Malloc(reslen * sizeof(Py_UCS4)); 12539 if (res0 == NULL) { 12540 PyErr_NoMemory(); 12541 goto onError; 12542 } 12543 12544 if (PyTuple_Check(args)) { 12545 arglen = PyTuple_Size(args); 12546 argidx = 0; 12547 } 12548 else { 12549 arglen = -1; 12550 argidx = -2; 12551 } 12552 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) && 12553 !PyUnicode_Check(args)) 12554 dict = args; 12555 12556 while (--fmtcnt >= 0) { 12557 if (PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') { 12558 if (--rescnt < 0) { 12559 rescnt = fmtcnt + 100; 12560 reslen += rescnt; 12561 res0 = PyMem_Realloc(res0, reslen*sizeof(Py_UCS4)); 12562 if (res0 == NULL){ 12563 PyErr_NoMemory(); 12564 goto onError; 12565 } 12566 res = res0 + reslen - rescnt; 12567 --rescnt; 12568 } 12569 *res++ = PyUnicode_READ(fmtkind, fmt, fmtpos++); 12570 } 12571 else { 12572 /* Got a format specifier */ 12573 int flags = 0; 12574 Py_ssize_t width = -1; 12575 int prec = -1; 12576 Py_UCS4 c = '\0'; 12577 Py_UCS4 fill; 12578 int isnumok; 12579 PyObject *v = NULL; 12580 PyObject *temp = NULL; 12581 void *pbuf; 12582 Py_ssize_t pindex; 12583 Py_UNICODE sign; 12584 Py_ssize_t len, len1; 12585 Py_UCS4 formatbuf[FORMATBUFLEN]; /* For formatchar() */ 12586 12587 fmtpos++; 12588 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(') { 12589 Py_ssize_t keystart; 12590 Py_ssize_t keylen; 12591 PyObject *key; 12592 int pcount = 1; 12593 12594 if (dict == NULL) { 12595 PyErr_SetString(PyExc_TypeError, 12596 "format requires a mapping"); 12597 goto onError; 12598 } 12599 ++fmtpos; 12600 --fmtcnt; 12601 keystart = fmtpos; 12602 /* Skip over balanced parentheses */ 12603 while (pcount > 0 && --fmtcnt >= 0) { 12604 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == ')') 12605 --pcount; 12606 else if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(') 12607 ++pcount; 12608 fmtpos++; 12609 } 12610 keylen = fmtpos - keystart - 1; 12611 if (fmtcnt < 0 || pcount > 0) { 12612 PyErr_SetString(PyExc_ValueError, 12613 "incomplete format key"); 12614 goto onError; 12615 } 12616 key = PyUnicode_Substring((PyObject*)uformat, 12617 keystart, keystart + keylen); 12618 if (key == NULL) 12619 goto onError; 12620 if (args_owned) { 12621 Py_DECREF(args); 12622 args_owned = 0; 12623 } 12624 args = PyObject_GetItem(dict, key); 12625 Py_DECREF(key); 12626 if (args == NULL) { 12627 goto onError; 12628 } 12629 args_owned = 1; 12630 arglen = -1; 12631 argidx = -2; 12632 } 12633 while (--fmtcnt >= 0) { 12634 switch (c = PyUnicode_READ(fmtkind, fmt, fmtpos++)) { 12635 case '-': flags |= F_LJUST; continue; 12636 case '+': flags |= F_SIGN; continue; 12637 case ' ': flags |= F_BLANK; continue; 12638 case '#': flags |= F_ALT; continue; 12639 case '0': flags |= F_ZERO; continue; 12640 } 12641 break; 12642 } 12643 if (c == '*') { 12644 v = getnextarg(args, arglen, &argidx); 12645 if (v == NULL) 12646 goto onError; 12647 if (!PyLong_Check(v)) { 12648 PyErr_SetString(PyExc_TypeError, 12649 "* wants int"); 12650 goto onError; 12651 } 12652 width = PyLong_AsLong(v); 12653 if (width == -1 && PyErr_Occurred()) 12654 goto onError; 12655 if (width < 0) { 12656 flags |= F_LJUST; 12657 width = -width; 12658 } 12659 if (--fmtcnt >= 0) 12660 c = PyUnicode_READ(fmtkind, fmt, fmtpos++); 12661 } 12662 else if (c >= '0' && c <= '9') { 12663 width = c - '0'; 12664 while (--fmtcnt >= 0) { 12665 c = PyUnicode_READ(fmtkind, fmt, fmtpos++); 12666 if (c < '0' || c > '9') 12667 break; 12668 if ((width*10) / 10 != width) { 12669 PyErr_SetString(PyExc_ValueError, 12670 "width too big"); 12671 goto onError; 12672 } 12673 width = width*10 + (c - '0'); 12674 } 12675 } 12676 if (c == '.') { 12677 prec = 0; 12678 if (--fmtcnt >= 0) 12679 c = PyUnicode_READ(fmtkind, fmt, fmtpos++); 12680 if (c == '*') { 12681 v = getnextarg(args, arglen, &argidx); 12682 if (v == NULL) 12683 goto onError; 12684 if (!PyLong_Check(v)) { 12685 PyErr_SetString(PyExc_TypeError, 12686 "* wants int"); 12687 goto onError; 12688 } 12689 prec = PyLong_AsLong(v); 12690 if (prec == -1 && PyErr_Occurred()) 12691 goto onError; 12692 if (prec < 0) 12693 prec = 0; 12694 if (--fmtcnt >= 0) 12695 c = PyUnicode_READ(fmtkind, fmt, fmtpos++); 12696 } 12697 else if (c >= '0' && c <= '9') { 12698 prec = c - '0'; 12699 while (--fmtcnt >= 0) { 12700 c = PyUnicode_READ(fmtkind, fmt, fmtpos++); 12701 if (c < '0' || c > '9') 12702 break; 12703 if ((prec*10) / 10 != prec) { 12704 PyErr_SetString(PyExc_ValueError, 12705 "prec too big"); 12706 goto onError; 12707 } 12708 prec = prec*10 + (c - '0'); 12709 } 12710 } 12711 } /* prec */ 12712 if (fmtcnt >= 0) { 12713 if (c == 'h' || c == 'l' || c == 'L') { 12714 if (--fmtcnt >= 0) 12715 c = PyUnicode_READ(fmtkind, fmt, fmtpos++); 12716 } 12717 } 12718 if (fmtcnt < 0) { 12719 PyErr_SetString(PyExc_ValueError, 12720 "incomplete format"); 12721 goto onError; 12722 } 12723 if (c != '%') { 12724 v = getnextarg(args, arglen, &argidx); 12725 if (v == NULL) 12726 goto onError; 12727 } 12728 sign = 0; 12729 fill = ' '; 12730 switch (c) { 12731 12732 case '%': 12733 pbuf = formatbuf; 12734 kind = PyUnicode_4BYTE_KIND; 12735 /* presume that buffer length is at least 1 */ 12736 PyUnicode_WRITE(kind, pbuf, 0, '%'); 12737 len = 1; 12738 break; 12739 12740 case 's': 12741 case 'r': 12742 case 'a': 12743 if (PyUnicode_CheckExact(v) && c == 's') { 12744 temp = v; 12745 Py_INCREF(temp); 12746 } 12747 else { 12748 if (c == 's') 12749 temp = PyObject_Str(v); 12750 else if (c == 'r') 12751 temp = PyObject_Repr(v); 12752 else 12753 temp = PyObject_ASCII(v); 12754 if (temp == NULL) 12755 goto onError; 12756 if (PyUnicode_Check(temp)) 12757 /* nothing to do */; 12758 else { 12759 Py_DECREF(temp); 12760 PyErr_SetString(PyExc_TypeError, 12761 "%s argument has non-string str()"); 12762 goto onError; 12763 } 12764 } 12765 if (PyUnicode_READY(temp) == -1) { 12766 Py_CLEAR(temp); 12767 goto onError; 12768 } 12769 pbuf = PyUnicode_DATA(temp); 12770 kind = PyUnicode_KIND(temp); 12771 len = PyUnicode_GET_LENGTH(temp); 12772 if (prec >= 0 && len > prec) 12773 len = prec; 12774 break; 12775 12776 case 'i': 12777 case 'd': 12778 case 'u': 12779 case 'o': 12780 case 'x': 12781 case 'X': 12782 isnumok = 0; 12783 if (PyNumber_Check(v)) { 12784 PyObject *iobj=NULL; 12785 12786 if (PyLong_Check(v)) { 12787 iobj = v; 12788 Py_INCREF(iobj); 12789 } 12790 else { 12791 iobj = PyNumber_Long(v); 12792 } 12793 if (iobj!=NULL) { 12794 if (PyLong_Check(iobj)) { 12795 isnumok = 1; 12796 temp = formatlong(iobj, flags, prec, (c == 'i'? 'd': c)); 12797 Py_DECREF(iobj); 12798 if (!temp) 12799 goto onError; 12800 if (PyUnicode_READY(temp) == -1) { 12801 Py_CLEAR(temp); 12802 goto onError; 12803 } 12804 pbuf = PyUnicode_DATA(temp); 12805 kind = PyUnicode_KIND(temp); 12806 len = PyUnicode_GET_LENGTH(temp); 12807 sign = 1; 12808 } 12809 else { 12810 Py_DECREF(iobj); 12811 } 12812 } 12813 } 12814 if (!isnumok) { 12815 PyErr_Format(PyExc_TypeError, 12816 "%%%c format: a number is required, " 12817 "not %.200s", (char)c, Py_TYPE(v)->tp_name); 12818 goto onError; 12819 } 12820 if (flags & F_ZERO) 12821 fill = '0'; 12822 break; 12823 12824 case 'e': 12825 case 'E': 12826 case 'f': 12827 case 'F': 12828 case 'g': 12829 case 'G': 12830 temp = formatfloat(v, flags, prec, c); 12831 if (!temp) 12832 goto onError; 12833 if (PyUnicode_READY(temp) == -1) { 12834 Py_CLEAR(temp); 12835 goto onError; 12836 } 12837 pbuf = PyUnicode_DATA(temp); 12838 kind = PyUnicode_KIND(temp); 12839 len = PyUnicode_GET_LENGTH(temp); 12840 sign = 1; 12841 if (flags & F_ZERO) 12842 fill = '0'; 12843 break; 12844 12845 case 'c': 12846 pbuf = formatbuf; 12847 kind = PyUnicode_4BYTE_KIND; 12848 len = formatchar(pbuf, Py_ARRAY_LENGTH(formatbuf), v); 12849 if (len < 0) 12850 goto onError; 12851 break; 12852 12853 default: 12854 PyErr_Format(PyExc_ValueError, 12855 "unsupported format character '%c' (0x%x) " 12856 "at index %zd", 12857 (31<=c && c<=126) ? (char)c : '?', 12858 (int)c, 12859 fmtpos - 1); 12860 goto onError; 12861 } 12862 /* pbuf is initialized here. */ 12863 pindex = 0; 12864 if (sign) { 12865 if (PyUnicode_READ(kind, pbuf, pindex) == '-' || 12866 PyUnicode_READ(kind, pbuf, pindex) == '+') { 12867 sign = PyUnicode_READ(kind, pbuf, pindex++); 12868 len--; 12869 } 12870 else if (flags & F_SIGN) 12871 sign = '+'; 12872 else if (flags & F_BLANK) 12873 sign = ' '; 12874 else 12875 sign = 0; 12876 } 12877 if (width < len) 12878 width = len; 12879 if (rescnt - (sign != 0) < width) { 12880 reslen -= rescnt; 12881 rescnt = width + fmtcnt + 100; 12882 reslen += rescnt; 12883 if (reslen < 0) { 12884 Py_XDECREF(temp); 12885 PyErr_NoMemory(); 12886 goto onError; 12887 } 12888 res0 = PyMem_Realloc(res0, reslen*sizeof(Py_UCS4)); 12889 if (res0 == 0) { 12890 PyErr_NoMemory(); 12891 Py_XDECREF(temp); 12892 goto onError; 12893 } 12894 res = res0 + reslen - rescnt; 12895 } 12896 if (sign) { 12897 if (fill != ' ') 12898 *res++ = sign; 12899 rescnt--; 12900 if (width > len) 12901 width--; 12902 } 12903 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) { 12904 assert(PyUnicode_READ(kind, pbuf, pindex) == '0'); 12905 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c); 12906 if (fill != ' ') { 12907 *res++ = PyUnicode_READ(kind, pbuf, pindex++); 12908 *res++ = PyUnicode_READ(kind, pbuf, pindex++); 12909 } 12910 rescnt -= 2; 12911 width -= 2; 12912 if (width < 0) 12913 width = 0; 12914 len -= 2; 12915 } 12916 if (width > len && !(flags & F_LJUST)) { 12917 do { 12918 --rescnt; 12919 *res++ = fill; 12920 } while (--width > len); 12921 } 12922 if (fill == ' ') { 12923 if (sign) 12924 *res++ = sign; 12925 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) { 12926 assert(PyUnicode_READ(kind, pbuf, pindex) == '0'); 12927 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c); 12928 *res++ = PyUnicode_READ(kind, pbuf, pindex++); 12929 *res++ = PyUnicode_READ(kind, pbuf, pindex++); 12930 } 12931 } 12932 /* Copy all characters, preserving len */ 12933 len1 = len; 12934 while (len1--) { 12935 *res++ = PyUnicode_READ(kind, pbuf, pindex++); 12936 rescnt--; 12937 } 12938 while (--width >= len) { 12939 --rescnt; 12940 *res++ = ' '; 12941 } 12942 if (dict && (argidx < arglen) && c != '%') { 12943 PyErr_SetString(PyExc_TypeError, 12944 "not all arguments converted during string formatting"); 12945 Py_XDECREF(temp); 12946 goto onError; 12947 } 12948 Py_XDECREF(temp); 12949 } /* '%' */ 12950 } /* until end */ 12951 if (argidx < arglen && !dict) { 12952 PyErr_SetString(PyExc_TypeError, 12953 "not all arguments converted during string formatting"); 12954 goto onError; 12955 } 12956 12957 12958 for (max=0, res = res0; res < res0+reslen-rescnt; res++) 12959 if (*res > max) 12960 max = *res; 12961 result = PyUnicode_New(reslen - rescnt, max); 12962 if (!result) 12963 goto onError; 12964 kind = PyUnicode_KIND(result); 12965 for (res = res0; res < res0+reslen-rescnt; res++) 12966 PyUnicode_WRITE(kind, PyUnicode_DATA(result), res-res0, *res); 12967 PyMem_Free(res0); 12968 if (args_owned) { 12969 Py_DECREF(args); 12970 } 12971 Py_DECREF(uformat); 12972 return (PyObject *)result; 12973 12974 onError: 12975 PyMem_Free(res0); 12976 Py_DECREF(uformat); 12977 if (args_owned) { 12978 Py_DECREF(args); 12979 } 12980 return NULL; 12981} 12982 12983static PyObject * 12984unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds); 12985 12986static PyObject * 12987unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds) 12988{ 12989 PyObject *x = NULL; 12990 static char *kwlist[] = {"object", "encoding", "errors", 0}; 12991 char *encoding = NULL; 12992 char *errors = NULL; 12993 12994 if (type != &PyUnicode_Type) 12995 return unicode_subtype_new(type, args, kwds); 12996 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str", 12997 kwlist, &x, &encoding, &errors)) 12998 return NULL; 12999 if (x == NULL) 13000 return (PyObject *)PyUnicode_New(0, 0); 13001 if (encoding == NULL && errors == NULL) 13002 return PyObject_Str(x); 13003 else 13004 return PyUnicode_FromEncodedObject(x, encoding, errors); 13005} 13006 13007static PyObject * 13008unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds) 13009{ 13010 PyUnicodeObject *unicode, *self; 13011 Py_ssize_t length, char_size; 13012 int share_wstr, share_utf8; 13013 unsigned int kind; 13014 void *data; 13015 13016 assert(PyType_IsSubtype(type, &PyUnicode_Type)); 13017 13018 unicode = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds); 13019 if (unicode == NULL) 13020 return NULL; 13021 assert(_PyUnicode_CHECK(unicode)); 13022 if (PyUnicode_READY(unicode)) 13023 return NULL; 13024 13025 self = (PyUnicodeObject *) type->tp_alloc(type, 0); 13026 if (self == NULL) { 13027 Py_DECREF(unicode); 13028 return NULL; 13029 } 13030 kind = PyUnicode_KIND(unicode); 13031 length = PyUnicode_GET_LENGTH(unicode); 13032 13033 _PyUnicode_LENGTH(self) = length; 13034 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode); 13035 _PyUnicode_STATE(self).interned = 0; 13036 _PyUnicode_STATE(self).kind = kind; 13037 _PyUnicode_STATE(self).compact = 0; 13038 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii; 13039 _PyUnicode_STATE(self).ready = 1; 13040 _PyUnicode_WSTR(self) = NULL; 13041 _PyUnicode_UTF8_LENGTH(self) = 0; 13042 _PyUnicode_UTF8(self) = NULL; 13043 _PyUnicode_WSTR_LENGTH(self) = 0; 13044 _PyUnicode_DATA_ANY(self) = NULL; 13045 13046 share_utf8 = 0; 13047 share_wstr = 0; 13048 if (kind == PyUnicode_1BYTE_KIND) { 13049 char_size = 1; 13050 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128) 13051 share_utf8 = 1; 13052 } 13053 else if (kind == PyUnicode_2BYTE_KIND) { 13054 char_size = 2; 13055 if (sizeof(wchar_t) == 2) 13056 share_wstr = 1; 13057 } 13058 else { 13059 assert(kind == PyUnicode_4BYTE_KIND); 13060 char_size = 4; 13061 if (sizeof(wchar_t) == 4) 13062 share_wstr = 1; 13063 } 13064 13065 /* Ensure we won't overflow the length. */ 13066 if (length > (PY_SSIZE_T_MAX / char_size - 1)) { 13067 PyErr_NoMemory(); 13068 goto onError; 13069 } 13070 data = PyObject_MALLOC((length + 1) * char_size); 13071 if (data == NULL) { 13072 PyErr_NoMemory(); 13073 goto onError; 13074 } 13075 13076 _PyUnicode_DATA_ANY(self) = data; 13077 if (share_utf8) { 13078 _PyUnicode_UTF8_LENGTH(self) = length; 13079 _PyUnicode_UTF8(self) = data; 13080 } 13081 if (share_wstr) { 13082 _PyUnicode_WSTR_LENGTH(self) = length; 13083 _PyUnicode_WSTR(self) = (wchar_t *)data; 13084 } 13085 13086 Py_MEMCPY(data, PyUnicode_DATA(unicode), 13087 PyUnicode_KIND_SIZE(kind, length + 1)); 13088 Py_DECREF(unicode); 13089 return (PyObject *)self; 13090 13091onError: 13092 Py_DECREF(unicode); 13093 Py_DECREF(self); 13094 return NULL; 13095} 13096 13097PyDoc_STRVAR(unicode_doc, 13098 "str(string[, encoding[, errors]]) -> str\n\ 13099\n\ 13100Create a new string object from the given encoded string.\n\ 13101encoding defaults to the current default string encoding.\n\ 13102errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'."); 13103 13104static PyObject *unicode_iter(PyObject *seq); 13105 13106PyTypeObject PyUnicode_Type = { 13107 PyVarObject_HEAD_INIT(&PyType_Type, 0) 13108 "str", /* tp_name */ 13109 sizeof(PyUnicodeObject), /* tp_size */ 13110 0, /* tp_itemsize */ 13111 /* Slots */ 13112 (destructor)unicode_dealloc, /* tp_dealloc */ 13113 0, /* tp_print */ 13114 0, /* tp_getattr */ 13115 0, /* tp_setattr */ 13116 0, /* tp_reserved */ 13117 unicode_repr, /* tp_repr */ 13118 &unicode_as_number, /* tp_as_number */ 13119 &unicode_as_sequence, /* tp_as_sequence */ 13120 &unicode_as_mapping, /* tp_as_mapping */ 13121 (hashfunc) unicode_hash, /* tp_hash*/ 13122 0, /* tp_call*/ 13123 (reprfunc) unicode_str, /* tp_str */ 13124 PyObject_GenericGetAttr, /* tp_getattro */ 13125 0, /* tp_setattro */ 13126 0, /* tp_as_buffer */ 13127 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE | 13128 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */ 13129 unicode_doc, /* tp_doc */ 13130 0, /* tp_traverse */ 13131 0, /* tp_clear */ 13132 PyUnicode_RichCompare, /* tp_richcompare */ 13133 0, /* tp_weaklistoffset */ 13134 unicode_iter, /* tp_iter */ 13135 0, /* tp_iternext */ 13136 unicode_methods, /* tp_methods */ 13137 0, /* tp_members */ 13138 0, /* tp_getset */ 13139 &PyBaseObject_Type, /* tp_base */ 13140 0, /* tp_dict */ 13141 0, /* tp_descr_get */ 13142 0, /* tp_descr_set */ 13143 0, /* tp_dictoffset */ 13144 0, /* tp_init */ 13145 0, /* tp_alloc */ 13146 unicode_new, /* tp_new */ 13147 PyObject_Del, /* tp_free */ 13148}; 13149 13150/* Initialize the Unicode implementation */ 13151 13152void _PyUnicode_Init(void) 13153{ 13154 int i; 13155 13156 /* XXX - move this array to unicodectype.c ? */ 13157 Py_UCS2 linebreak[] = { 13158 0x000A, /* LINE FEED */ 13159 0x000D, /* CARRIAGE RETURN */ 13160 0x001C, /* FILE SEPARATOR */ 13161 0x001D, /* GROUP SEPARATOR */ 13162 0x001E, /* RECORD SEPARATOR */ 13163 0x0085, /* NEXT LINE */ 13164 0x2028, /* LINE SEPARATOR */ 13165 0x2029, /* PARAGRAPH SEPARATOR */ 13166 }; 13167 13168 /* Init the implementation */ 13169 unicode_empty = PyUnicode_New(0, 0); 13170 if (!unicode_empty) 13171 Py_FatalError("Can't create empty string"); 13172 13173 for (i = 0; i < 256; i++) 13174 unicode_latin1[i] = NULL; 13175 if (PyType_Ready(&PyUnicode_Type) < 0) 13176 Py_FatalError("Can't initialize 'unicode'"); 13177 13178 /* initialize the linebreak bloom filter */ 13179 bloom_linebreak = make_bloom_mask( 13180 PyUnicode_2BYTE_KIND, linebreak, 13181 Py_ARRAY_LENGTH(linebreak)); 13182 13183 PyType_Ready(&EncodingMapType); 13184} 13185 13186/* Finalize the Unicode implementation */ 13187 13188int 13189PyUnicode_ClearFreeList(void) 13190{ 13191 return 0; 13192} 13193 13194void 13195_PyUnicode_Fini(void) 13196{ 13197 int i; 13198 13199 Py_XDECREF(unicode_empty); 13200 unicode_empty = NULL; 13201 13202 for (i = 0; i < 256; i++) { 13203 if (unicode_latin1[i]) { 13204 Py_DECREF(unicode_latin1[i]); 13205 unicode_latin1[i] = NULL; 13206 } 13207 } 13208 (void)PyUnicode_ClearFreeList(); 13209} 13210 13211void 13212PyUnicode_InternInPlace(PyObject **p) 13213{ 13214 register PyUnicodeObject *s = (PyUnicodeObject *)(*p); 13215 PyObject *t; 13216#ifdef Py_DEBUG 13217 assert(s != NULL); 13218 assert(_PyUnicode_CHECK(s)); 13219#else 13220 if (s == NULL || !PyUnicode_Check(s)) 13221 return; 13222#endif 13223 /* If it's a subclass, we don't really know what putting 13224 it in the interned dict might do. */ 13225 if (!PyUnicode_CheckExact(s)) 13226 return; 13227 if (PyUnicode_CHECK_INTERNED(s)) 13228 return; 13229 if (_PyUnicode_READY_REPLACE(p)) { 13230 assert(0 && "_PyUnicode_READY_REPLACE fail in PyUnicode_InternInPlace"); 13231 return; 13232 } 13233 s = (PyUnicodeObject *)(*p); 13234 if (interned == NULL) { 13235 interned = PyDict_New(); 13236 if (interned == NULL) { 13237 PyErr_Clear(); /* Don't leave an exception */ 13238 return; 13239 } 13240 } 13241 /* It might be that the GetItem call fails even 13242 though the key is present in the dictionary, 13243 namely when this happens during a stack overflow. */ 13244 Py_ALLOW_RECURSION 13245 t = PyDict_GetItem(interned, (PyObject *)s); 13246 Py_END_ALLOW_RECURSION 13247 13248 if (t) { 13249 Py_INCREF(t); 13250 Py_DECREF(*p); 13251 *p = t; 13252 return; 13253 } 13254 13255 PyThreadState_GET()->recursion_critical = 1; 13256 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) { 13257 PyErr_Clear(); 13258 PyThreadState_GET()->recursion_critical = 0; 13259 return; 13260 } 13261 PyThreadState_GET()->recursion_critical = 0; 13262 /* The two references in interned are not counted by refcnt. 13263 The deallocator will take care of this */ 13264 Py_REFCNT(s) -= 2; 13265 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL; 13266} 13267 13268void 13269PyUnicode_InternImmortal(PyObject **p) 13270{ 13271 PyUnicodeObject *u = (PyUnicodeObject *)*p; 13272 13273 PyUnicode_InternInPlace(p); 13274 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) { 13275 _PyUnicode_STATE(u).interned = SSTATE_INTERNED_IMMORTAL; 13276 Py_INCREF(*p); 13277 } 13278} 13279 13280PyObject * 13281PyUnicode_InternFromString(const char *cp) 13282{ 13283 PyObject *s = PyUnicode_FromString(cp); 13284 if (s == NULL) 13285 return NULL; 13286 PyUnicode_InternInPlace(&s); 13287 return s; 13288} 13289 13290void 13291_Py_ReleaseInternedUnicodeStrings(void) 13292{ 13293 PyObject *keys; 13294 PyUnicodeObject *s; 13295 Py_ssize_t i, n; 13296 Py_ssize_t immortal_size = 0, mortal_size = 0; 13297 13298 if (interned == NULL || !PyDict_Check(interned)) 13299 return; 13300 keys = PyDict_Keys(interned); 13301 if (keys == NULL || !PyList_Check(keys)) { 13302 PyErr_Clear(); 13303 return; 13304 } 13305 13306 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak 13307 detector, interned unicode strings are not forcibly deallocated; 13308 rather, we give them their stolen references back, and then clear 13309 and DECREF the interned dict. */ 13310 13311 n = PyList_GET_SIZE(keys); 13312 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n", 13313 n); 13314 for (i = 0; i < n; i++) { 13315 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i); 13316 if (PyUnicode_READY(s) == -1) { 13317 assert(0 && "could not ready string"); 13318 fprintf(stderr, "could not ready string\n"); 13319 } 13320 switch (PyUnicode_CHECK_INTERNED(s)) { 13321 case SSTATE_NOT_INTERNED: 13322 /* XXX Shouldn't happen */ 13323 break; 13324 case SSTATE_INTERNED_IMMORTAL: 13325 Py_REFCNT(s) += 1; 13326 immortal_size += PyUnicode_GET_LENGTH(s); 13327 break; 13328 case SSTATE_INTERNED_MORTAL: 13329 Py_REFCNT(s) += 2; 13330 mortal_size += PyUnicode_GET_LENGTH(s); 13331 break; 13332 default: 13333 Py_FatalError("Inconsistent interned string state."); 13334 } 13335 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED; 13336 } 13337 fprintf(stderr, "total size of all interned strings: " 13338 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d " 13339 "mortal/immortal\n", mortal_size, immortal_size); 13340 Py_DECREF(keys); 13341 PyDict_Clear(interned); 13342 Py_DECREF(interned); 13343 interned = NULL; 13344} 13345 13346 13347/********************* Unicode Iterator **************************/ 13348 13349typedef struct { 13350 PyObject_HEAD 13351 Py_ssize_t it_index; 13352 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */ 13353} unicodeiterobject; 13354 13355static void 13356unicodeiter_dealloc(unicodeiterobject *it) 13357{ 13358 _PyObject_GC_UNTRACK(it); 13359 Py_XDECREF(it->it_seq); 13360 PyObject_GC_Del(it); 13361} 13362 13363static int 13364unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg) 13365{ 13366 Py_VISIT(it->it_seq); 13367 return 0; 13368} 13369 13370static PyObject * 13371unicodeiter_next(unicodeiterobject *it) 13372{ 13373 PyUnicodeObject *seq; 13374 PyObject *item; 13375 13376 assert(it != NULL); 13377 seq = it->it_seq; 13378 if (seq == NULL) 13379 return NULL; 13380 assert(_PyUnicode_CHECK(seq)); 13381 13382 if (it->it_index < PyUnicode_GET_LENGTH(seq)) { 13383 int kind = PyUnicode_KIND(seq); 13384 void *data = PyUnicode_DATA(seq); 13385 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index); 13386 item = PyUnicode_FromOrdinal(chr); 13387 if (item != NULL) 13388 ++it->it_index; 13389 return item; 13390 } 13391 13392 Py_DECREF(seq); 13393 it->it_seq = NULL; 13394 return NULL; 13395} 13396 13397static PyObject * 13398unicodeiter_len(unicodeiterobject *it) 13399{ 13400 Py_ssize_t len = 0; 13401 if (it->it_seq) 13402 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index; 13403 return PyLong_FromSsize_t(len); 13404} 13405 13406PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it))."); 13407 13408static PyMethodDef unicodeiter_methods[] = { 13409 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS, 13410 length_hint_doc}, 13411 {NULL, NULL} /* sentinel */ 13412}; 13413 13414PyTypeObject PyUnicodeIter_Type = { 13415 PyVarObject_HEAD_INIT(&PyType_Type, 0) 13416 "str_iterator", /* tp_name */ 13417 sizeof(unicodeiterobject), /* tp_basicsize */ 13418 0, /* tp_itemsize */ 13419 /* methods */ 13420 (destructor)unicodeiter_dealloc, /* tp_dealloc */ 13421 0, /* tp_print */ 13422 0, /* tp_getattr */ 13423 0, /* tp_setattr */ 13424 0, /* tp_reserved */ 13425 0, /* tp_repr */ 13426 0, /* tp_as_number */ 13427 0, /* tp_as_sequence */ 13428 0, /* tp_as_mapping */ 13429 0, /* tp_hash */ 13430 0, /* tp_call */ 13431 0, /* tp_str */ 13432 PyObject_GenericGetAttr, /* tp_getattro */ 13433 0, /* tp_setattro */ 13434 0, /* tp_as_buffer */ 13435 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */ 13436 0, /* tp_doc */ 13437 (traverseproc)unicodeiter_traverse, /* tp_traverse */ 13438 0, /* tp_clear */ 13439 0, /* tp_richcompare */ 13440 0, /* tp_weaklistoffset */ 13441 PyObject_SelfIter, /* tp_iter */ 13442 (iternextfunc)unicodeiter_next, /* tp_iternext */ 13443 unicodeiter_methods, /* tp_methods */ 13444 0, 13445}; 13446 13447static PyObject * 13448unicode_iter(PyObject *seq) 13449{ 13450 unicodeiterobject *it; 13451 13452 if (!PyUnicode_Check(seq)) { 13453 PyErr_BadInternalCall(); 13454 return NULL; 13455 } 13456 if (PyUnicode_READY(seq) == -1) 13457 return NULL; 13458 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type); 13459 if (it == NULL) 13460 return NULL; 13461 it->it_index = 0; 13462 Py_INCREF(seq); 13463 it->it_seq = (PyUnicodeObject *)seq; 13464 _PyObject_GC_TRACK(it); 13465 return (PyObject *)it; 13466} 13467 13468#define UNIOP(x) Py_UNICODE_##x 13469#define UNIOP_t Py_UNICODE 13470#include "uniops.h" 13471#undef UNIOP 13472#undef UNIOP_t 13473#define UNIOP(x) Py_UCS4_##x 13474#define UNIOP_t Py_UCS4 13475#include "uniops.h" 13476#undef UNIOP 13477#undef UNIOP_t 13478 13479Py_UNICODE* 13480PyUnicode_AsUnicodeCopy(PyObject *object) 13481{ 13482 PyUnicodeObject *unicode = (PyUnicodeObject *)object; 13483 Py_UNICODE *copy; 13484 Py_ssize_t size; 13485 13486 if (!PyUnicode_Check(unicode)) { 13487 PyErr_BadArgument(); 13488 return NULL; 13489 } 13490 /* Ensure we won't overflow the size. */ 13491 if (PyUnicode_GET_SIZE(unicode) > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) { 13492 PyErr_NoMemory(); 13493 return NULL; 13494 } 13495 size = PyUnicode_GET_SIZE(unicode) + 1; /* copy the nul character */ 13496 size *= sizeof(Py_UNICODE); 13497 copy = PyMem_Malloc(size); 13498 if (copy == NULL) { 13499 PyErr_NoMemory(); 13500 return NULL; 13501 } 13502 memcpy(copy, PyUnicode_AS_UNICODE(unicode), size); 13503 return copy; 13504} 13505 13506/* A _string module, to export formatter_parser and formatter_field_name_split 13507 to the string.Formatter class implemented in Python. */ 13508 13509static PyMethodDef _string_methods[] = { 13510 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split, 13511 METH_O, PyDoc_STR("split the argument as a field name")}, 13512 {"formatter_parser", (PyCFunction) formatter_parser, 13513 METH_O, PyDoc_STR("parse the argument as a format string")}, 13514 {NULL, NULL} 13515}; 13516 13517static struct PyModuleDef _string_module = { 13518 PyModuleDef_HEAD_INIT, 13519 "_string", 13520 PyDoc_STR("string helper module"), 13521 0, 13522 _string_methods, 13523 NULL, 13524 NULL, 13525 NULL, 13526 NULL 13527}; 13528 13529PyMODINIT_FUNC 13530PyInit__string(void) 13531{ 13532 return PyModule_Create(&_string_module); 13533} 13534 13535 13536#ifdef __cplusplus 13537} 13538#endif 13539