unicodeobject.c revision b9275c104e50361fe3a785126e5ecad24d319a7a
1/* 2 3Unicode implementation based on original code by Fredrik Lundh, 4modified by Marc-Andre Lemburg <mal@lemburg.com>. 5 6Major speed upgrades to the method implementations at the Reykjavik 7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke. 8 9Copyright (c) Corporation for National Research Initiatives. 10 11-------------------------------------------------------------------- 12The original string type implementation is: 13 14 Copyright (c) 1999 by Secret Labs AB 15 Copyright (c) 1999 by Fredrik Lundh 16 17By obtaining, using, and/or copying this software and/or its 18associated documentation, you agree that you have read, understood, 19and will comply with the following terms and conditions: 20 21Permission to use, copy, modify, and distribute this software and its 22associated documentation for any purpose and without fee is hereby 23granted, provided that the above copyright notice appears in all 24copies, and that both that copyright notice and this permission notice 25appear in supporting documentation, and that the name of Secret Labs 26AB or the author not be used in advertising or publicity pertaining to 27distribution of the software without specific, written prior 28permission. 29 30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO 31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND 32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR 33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT 36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
37-------------------------------------------------------------------- 38 39*/ 40 41#define PY_SSIZE_T_CLEAN 42#include "Python.h" 43#include "ucnhash.h" 44 45#ifdef MS_WINDOWS 46#include <windows.h> 47#endif 48 49/* Limit for the Unicode object free list */ 50 51#define PyUnicode_MAXFREELIST 1024 52 53/* Limit for the Unicode object free list stay alive optimization. 54 55 The implementation will keep allocated Unicode memory intact for 56 all objects on the free list having a size less than this 57 limit. This reduces malloc() overhead for small Unicode objects. 58 59 At worst this will result in PyUnicode_MAXFREELIST * 60 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT + 61 malloc()-overhead) bytes of unused garbage. 62 63 Setting the limit to 0 effectively turns the feature off. 64 65 Note: This is an experimental feature ! If you get core dumps when 66 using Unicode objects, turn this feature off. 67 68*/ 69 70#define KEEPALIVE_SIZE_LIMIT 9 71 72/* Endianness switches; defaults to little endian */ 73 74#ifdef WORDS_BIGENDIAN 75# define BYTEORDER_IS_BIG_ENDIAN 76#else 77# define BYTEORDER_IS_LITTLE_ENDIAN 78#endif 79 80/* --- Globals ------------------------------------------------------------ 81 82 The globals are initialized by the _PyUnicode_Init() API and should 83 not be used before calling that API. 84 85*/ 86 87 88#ifdef __cplusplus 89extern "C" { 90#endif 91 92#ifdef Py_DEBUG 93# define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op) 94#else 95# define _PyUnicode_CHECK(op) PyUnicode_Check(op) 96#endif 97 98#define _PyUnicode_UTF8(op) \ 99 (((PyCompactUnicodeObject*)(op))->utf8) 100#define PyUnicode_UTF8(op) \ 101 (assert(_PyUnicode_CHECK(op)), \ 102 assert(PyUnicode_IS_READY(op)), \ 103 PyUnicode_IS_COMPACT_ASCII(op) ? 
\ 104 ((char*)((PyASCIIObject*)(op) + 1)) : \ 105 _PyUnicode_UTF8(op)) 106#define _PyUnicode_UTF8_LENGTH(op) \ 107 (((PyCompactUnicodeObject*)(op))->utf8_length) 108#define PyUnicode_UTF8_LENGTH(op) \ 109 (assert(_PyUnicode_CHECK(op)), \ 110 assert(PyUnicode_IS_READY(op)), \ 111 PyUnicode_IS_COMPACT_ASCII(op) ? \ 112 ((PyASCIIObject*)(op))->length : \ 113 _PyUnicode_UTF8_LENGTH(op)) 114#define _PyUnicode_WSTR(op) \ 115 (((PyASCIIObject*)(op))->wstr) 116#define _PyUnicode_WSTR_LENGTH(op) \ 117 (((PyCompactUnicodeObject*)(op))->wstr_length) 118#define _PyUnicode_LENGTH(op) \ 119 (((PyASCIIObject *)(op))->length) 120#define _PyUnicode_STATE(op) \ 121 (((PyASCIIObject *)(op))->state) 122#define _PyUnicode_HASH(op) \ 123 (((PyASCIIObject *)(op))->hash) 124#define _PyUnicode_KIND(op) \ 125 (assert(_PyUnicode_CHECK(op)), \ 126 ((PyASCIIObject *)(op))->state.kind) 127#define _PyUnicode_GET_LENGTH(op) \ 128 (assert(_PyUnicode_CHECK(op)), \ 129 ((PyASCIIObject *)(op))->length) 130#define _PyUnicode_DATA_ANY(op) \ 131 (((PyUnicodeObject*)(op))->data.any) 132 133#undef PyUnicode_READY 134#define PyUnicode_READY(op) \ 135 (assert(_PyUnicode_CHECK(op)), \ 136 (PyUnicode_IS_READY(op) ? \ 137 0 : \ 138 _PyUnicode_Ready((PyObject *)(op)))) 139 140#define _PyUnicode_READY_REPLACE(p_obj) \ 141 (assert(_PyUnicode_CHECK(*p_obj)), \ 142 (PyUnicode_IS_READY(*p_obj) ? 
\ 143 0 : _PyUnicode_ReadyReplace((PyObject **)(p_obj)))) 144 145#define _PyUnicode_SHARE_UTF8(op) \ 146 (assert(_PyUnicode_CHECK(op)), \ 147 assert(!PyUnicode_IS_COMPACT_ASCII(op)), \ 148 (_PyUnicode_UTF8(op) == PyUnicode_DATA(op))) 149#define _PyUnicode_SHARE_WSTR(op) \ 150 (assert(_PyUnicode_CHECK(op)), \ 151 (_PyUnicode_WSTR(unicode) == PyUnicode_DATA(op))) 152 153/* true if the Unicode object has an allocated UTF-8 memory block 154 (not shared with other data) */ 155#define _PyUnicode_HAS_UTF8_MEMORY(op) \ 156 (assert(_PyUnicode_CHECK(op)), \ 157 (!PyUnicode_IS_COMPACT_ASCII(op) \ 158 && _PyUnicode_UTF8(op) \ 159 && _PyUnicode_UTF8(op) != PyUnicode_DATA(op))) 160 161/* true if the Unicode object has an allocated wstr memory block 162 (not shared with other data) */ 163#define _PyUnicode_HAS_WSTR_MEMORY(op) \ 164 (assert(_PyUnicode_CHECK(op)), \ 165 (_PyUnicode_WSTR(op) && \ 166 (!PyUnicode_IS_READY(op) || \ 167 _PyUnicode_WSTR(op) != PyUnicode_DATA(op)))) 168 169/* Generic helper macro to convert characters of different types. 170 from_type and to_type have to be valid type names, begin and end 171 are pointers to the source characters which should be of type 172 "from_type *". to is a pointer of type "to_type *" and points to the 173 buffer where the result characters are written to. */ 174#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \ 175 do { \ 176 const from_type *iter_; to_type *to_; \ 177 for (iter_ = (begin), to_ = (to_type *)(to); \ 178 iter_ < (end); \ 179 ++iter_, ++to_) { \ 180 *to_ = (to_type)*iter_; \ 181 } \ 182 } while (0) 183 184/* The Unicode string has been modified: reset the hash */ 185#define _PyUnicode_DIRTY(op) do { _PyUnicode_HASH(op) = -1; } while (0) 186 187/* This dictionary holds all interned unicode strings. Note that references 188 to strings in this dictionary are *not* counted in the string's ob_refcnt. 
189 When the interned string reaches a refcnt of 0 the string deallocation 190 function will delete the reference from this dictionary. 191 192 Another way to look at this is that to say that the actual reference 193 count of a string is: s->ob_refcnt + (s->state ? 2 : 0) 194*/ 195static PyObject *interned; 196 197/* The empty Unicode object is shared to improve performance. */ 198static PyObject *unicode_empty; 199 200/* Single character Unicode strings in the Latin-1 range are being 201 shared as well. */ 202static PyObject *unicode_latin1[256]; 203 204/* Fast detection of the most frequent whitespace characters */ 205const unsigned char _Py_ascii_whitespace[] = { 206 0, 0, 0, 0, 0, 0, 0, 0, 207/* case 0x0009: * CHARACTER TABULATION */ 208/* case 0x000A: * LINE FEED */ 209/* case 0x000B: * LINE TABULATION */ 210/* case 0x000C: * FORM FEED */ 211/* case 0x000D: * CARRIAGE RETURN */ 212 0, 1, 1, 1, 1, 1, 0, 0, 213 0, 0, 0, 0, 0, 0, 0, 0, 214/* case 0x001C: * FILE SEPARATOR */ 215/* case 0x001D: * GROUP SEPARATOR */ 216/* case 0x001E: * RECORD SEPARATOR */ 217/* case 0x001F: * UNIT SEPARATOR */ 218 0, 0, 0, 0, 1, 1, 1, 1, 219/* case 0x0020: * SPACE */ 220 1, 0, 0, 0, 0, 0, 0, 0, 221 0, 0, 0, 0, 0, 0, 0, 0, 222 0, 0, 0, 0, 0, 0, 0, 0, 223 0, 0, 0, 0, 0, 0, 0, 0, 224 225 0, 0, 0, 0, 0, 0, 0, 0, 226 0, 0, 0, 0, 0, 0, 0, 0, 227 0, 0, 0, 0, 0, 0, 0, 0, 228 0, 0, 0, 0, 0, 0, 0, 0, 229 0, 0, 0, 0, 0, 0, 0, 0, 230 0, 0, 0, 0, 0, 0, 0, 0, 231 0, 0, 0, 0, 0, 0, 0, 0, 232 0, 0, 0, 0, 0, 0, 0, 0 233}; 234 235/* forward */ 236static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length); 237static PyObject* get_latin1_char(unsigned char ch); 238 239static PyObject * 240unicode_encode_call_errorhandler(const char *errors, 241 PyObject **errorHandler,const char *encoding, const char *reason, 242 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject, 243 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos); 244 245static void 246raise_encode_exception(PyObject 
**exceptionObject, 247 const char *encoding, 248 const Py_UNICODE *unicode, Py_ssize_t size, 249 Py_ssize_t startpos, Py_ssize_t endpos, 250 const char *reason); 251 252/* Same for linebreaks */ 253static unsigned char ascii_linebreak[] = { 254 0, 0, 0, 0, 0, 0, 0, 0, 255/* 0x000A, * LINE FEED */ 256/* 0x000B, * LINE TABULATION */ 257/* 0x000C, * FORM FEED */ 258/* 0x000D, * CARRIAGE RETURN */ 259 0, 0, 1, 1, 1, 1, 0, 0, 260 0, 0, 0, 0, 0, 0, 0, 0, 261/* 0x001C, * FILE SEPARATOR */ 262/* 0x001D, * GROUP SEPARATOR */ 263/* 0x001E, * RECORD SEPARATOR */ 264 0, 0, 0, 0, 1, 1, 1, 0, 265 0, 0, 0, 0, 0, 0, 0, 0, 266 0, 0, 0, 0, 0, 0, 0, 0, 267 0, 0, 0, 0, 0, 0, 0, 0, 268 0, 0, 0, 0, 0, 0, 0, 0, 269 270 0, 0, 0, 0, 0, 0, 0, 0, 271 0, 0, 0, 0, 0, 0, 0, 0, 272 0, 0, 0, 0, 0, 0, 0, 0, 273 0, 0, 0, 0, 0, 0, 0, 0, 274 0, 0, 0, 0, 0, 0, 0, 0, 275 0, 0, 0, 0, 0, 0, 0, 0, 276 0, 0, 0, 0, 0, 0, 0, 0, 277 0, 0, 0, 0, 0, 0, 0, 0 278}; 279 280/* The max unicode value is always 0x10FFFF while using the PEP-393 API. 281 This function is kept for backward compatibility with the old API. */ 282Py_UNICODE 283PyUnicode_GetMax(void) 284{ 285#ifdef Py_UNICODE_WIDE 286 return 0x10FFFF; 287#else 288 /* This is actually an illegal character, so it should 289 not be passed to unichr. 
*/ 290 return 0xFFFF; 291#endif 292} 293 294#ifdef Py_DEBUG 295static int 296_PyUnicode_CheckConsistency(void *op) 297{ 298 PyASCIIObject *ascii; 299 unsigned int kind; 300 301 assert(PyUnicode_Check(op)); 302 303 ascii = (PyASCIIObject *)op; 304 kind = ascii->state.kind; 305 306 if (ascii->state.ascii == 1 && ascii->state.compact == 1) { 307 assert(kind == PyUnicode_1BYTE_KIND); 308 assert(ascii->state.ready == 1); 309 } 310 else { 311 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op; 312 void *data; 313 314 if (ascii->state.compact == 1) { 315 data = compact + 1; 316 assert(kind == PyUnicode_1BYTE_KIND 317 || kind == PyUnicode_2BYTE_KIND 318 || kind == PyUnicode_4BYTE_KIND); 319 assert(ascii->state.ascii == 0); 320 assert(ascii->state.ready == 1); 321 assert (compact->utf8 != data); 322 } else { 323 PyUnicodeObject *unicode = (PyUnicodeObject *)op; 324 325 data = unicode->data.any; 326 if (kind == PyUnicode_WCHAR_KIND) { 327 assert(ascii->state.compact == 0); 328 assert(ascii->state.ascii == 0); 329 assert(ascii->state.ready == 0); 330 assert(ascii->wstr != NULL); 331 assert(data == NULL); 332 assert(compact->utf8 == NULL); 333 assert(ascii->state.interned == SSTATE_NOT_INTERNED); 334 } 335 else { 336 assert(kind == PyUnicode_1BYTE_KIND 337 || kind == PyUnicode_2BYTE_KIND 338 || kind == PyUnicode_4BYTE_KIND); 339 assert(ascii->state.compact == 0); 340 assert(ascii->state.ready == 1); 341 assert(data != NULL); 342 if (ascii->state.ascii) { 343 assert (compact->utf8 == data); 344 assert (compact->utf8_length == ascii->length); 345 } 346 else 347 assert (compact->utf8 != data); 348 } 349 } 350 if (kind != PyUnicode_WCHAR_KIND) { 351 if ( 352#if SIZEOF_WCHAR_T == 2 353 kind == PyUnicode_2BYTE_KIND 354#else 355 kind == PyUnicode_4BYTE_KIND 356#endif 357 ) 358 { 359 assert(ascii->wstr == data); 360 assert(compact->wstr_length == ascii->length); 361 } else 362 assert(ascii->wstr != data); 363 } 364 365 if (compact->utf8 == NULL) 366 
assert(compact->utf8_length == 0); 367 if (ascii->wstr == NULL) 368 assert(compact->wstr_length == 0); 369 } 370 return 1; 371} 372#else 373static int 374_PyUnicode_CheckConsistency(void *op) 375{ 376 return 1; 377} 378#endif 379 380/* --- Bloom Filters ----------------------------------------------------- */ 381 382/* stuff to implement simple "bloom filters" for Unicode characters. 383 to keep things simple, we use a single bitmask, using the least 5 384 bits from each unicode characters as the bit index. */ 385 386/* the linebreak mask is set up by Unicode_Init below */ 387 388#if LONG_BIT >= 128 389#define BLOOM_WIDTH 128 390#elif LONG_BIT >= 64 391#define BLOOM_WIDTH 64 392#elif LONG_BIT >= 32 393#define BLOOM_WIDTH 32 394#else 395#error "LONG_BIT is smaller than 32" 396#endif 397 398#define BLOOM_MASK unsigned long 399 400static BLOOM_MASK bloom_linebreak; 401 402#define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1))))) 403#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1))))) 404 405#define BLOOM_LINEBREAK(ch) \ 406 ((ch) < 128U ? 
ascii_linebreak[(ch)] : \ 407 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch))) 408 409Py_LOCAL_INLINE(BLOOM_MASK) 410make_bloom_mask(int kind, void* ptr, Py_ssize_t len) 411{ 412 /* calculate simple bloom-style bitmask for a given unicode string */ 413 414 BLOOM_MASK mask; 415 Py_ssize_t i; 416 417 mask = 0; 418 for (i = 0; i < len; i++) 419 BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i)); 420 421 return mask; 422} 423 424#define BLOOM_MEMBER(mask, chr, str) \ 425 (BLOOM(mask, chr) \ 426 && (PyUnicode_FindChar(str, chr, 0, PyUnicode_GET_LENGTH(str), 1) >= 0)) 427 428/* --- Unicode Object ----------------------------------------------------- */ 429 430static PyObject * 431fixup(PyUnicodeObject *self, Py_UCS4 (*fixfct)(PyUnicodeObject *s)); 432 433Py_LOCAL_INLINE(char *) findchar(void *s, int kind, 434 Py_ssize_t size, Py_UCS4 ch, 435 int direction) 436{ 437 /* like wcschr, but doesn't stop at NULL characters */ 438 Py_ssize_t i; 439 if (direction == 1) { 440 for(i = 0; i < size; i++) 441 if (PyUnicode_READ(kind, s, i) == ch) 442 return (char*)s + PyUnicode_KIND_SIZE(kind, i); 443 } 444 else { 445 for(i = size-1; i >= 0; i--) 446 if (PyUnicode_READ(kind, s, i) == ch) 447 return (char*)s + PyUnicode_KIND_SIZE(kind, i); 448 } 449 return NULL; 450} 451 452static PyObject* 453resize_compact(PyObject *unicode, Py_ssize_t length) 454{ 455 Py_ssize_t char_size; 456 Py_ssize_t struct_size; 457 Py_ssize_t new_size; 458 int share_wstr; 459 460 assert(PyUnicode_IS_READY(unicode)); 461 char_size = PyUnicode_CHARACTER_SIZE(unicode); 462 if (PyUnicode_IS_COMPACT_ASCII(unicode)) 463 struct_size = sizeof(PyASCIIObject); 464 else 465 struct_size = sizeof(PyCompactUnicodeObject); 466 share_wstr = _PyUnicode_SHARE_WSTR(unicode); 467 468 _Py_DEC_REFTOTAL; 469 _Py_ForgetReference(unicode); 470 471 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) { 472 PyErr_NoMemory(); 473 return NULL; 474 } 475 new_size = (struct_size + (length + 1) * char_size); 476 477 unicode 
= (PyObject *)PyObject_REALLOC((char *)unicode, new_size); 478 if (unicode == NULL) { 479 PyObject_Del(unicode); 480 PyErr_NoMemory(); 481 return NULL; 482 } 483 _Py_NewReference(unicode); 484 _PyUnicode_LENGTH(unicode) = length; 485 if (share_wstr) { 486 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode); 487 if (!PyUnicode_IS_COMPACT_ASCII(unicode)) 488 _PyUnicode_WSTR_LENGTH(unicode) = length; 489 } 490 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode), 491 length, 0); 492 return unicode; 493} 494 495static int 496resize_inplace(PyUnicodeObject *unicode, Py_ssize_t length) 497{ 498 wchar_t *wstr; 499 assert(!PyUnicode_IS_COMPACT(unicode)); 500 assert(Py_REFCNT(unicode) == 1); 501 502 _PyUnicode_DIRTY(unicode); 503 504 if (PyUnicode_IS_READY(unicode)) { 505 Py_ssize_t char_size; 506 Py_ssize_t new_size; 507 int share_wstr, share_utf8; 508 void *data; 509 510 data = _PyUnicode_DATA_ANY(unicode); 511 assert(data != NULL); 512 char_size = PyUnicode_CHARACTER_SIZE(unicode); 513 share_wstr = _PyUnicode_SHARE_WSTR(unicode); 514 share_utf8 = _PyUnicode_SHARE_UTF8(unicode); 515 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode)) 516 { 517 PyObject_DEL(_PyUnicode_UTF8(unicode)); 518 _PyUnicode_UTF8(unicode) = NULL; 519 _PyUnicode_UTF8_LENGTH(unicode) = 0; 520 } 521 522 if (length > (PY_SSIZE_T_MAX / char_size - 1)) { 523 PyErr_NoMemory(); 524 return -1; 525 } 526 new_size = (length + 1) * char_size; 527 528 data = (PyObject *)PyObject_REALLOC(data, new_size); 529 if (data == NULL) { 530 PyErr_NoMemory(); 531 return -1; 532 } 533 _PyUnicode_DATA_ANY(unicode) = data; 534 if (share_wstr) { 535 _PyUnicode_WSTR(unicode) = data; 536 _PyUnicode_WSTR_LENGTH(unicode) = length; 537 } 538 if (share_utf8) { 539 _PyUnicode_UTF8(unicode) = data; 540 _PyUnicode_UTF8_LENGTH(unicode) = length; 541 } 542 _PyUnicode_LENGTH(unicode) = length; 543 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0); 544 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) { 545 
_PyUnicode_CheckConsistency(unicode); 546 return 0; 547 } 548 } 549 assert(_PyUnicode_WSTR(unicode) != NULL); 550 551 /* check for integer overflow */ 552 if (length > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) { 553 PyErr_NoMemory(); 554 return -1; 555 } 556 wstr = _PyUnicode_WSTR(unicode); 557 wstr = PyObject_REALLOC(wstr, sizeof(wchar_t) * (length + 1)); 558 if (!wstr) { 559 PyErr_NoMemory(); 560 return -1; 561 } 562 _PyUnicode_WSTR(unicode) = wstr; 563 _PyUnicode_WSTR(unicode)[length] = 0; 564 _PyUnicode_WSTR_LENGTH(unicode) = length; 565 _PyUnicode_CheckConsistency(unicode); 566 return 0; 567} 568 569static PyObject* 570resize_copy(PyObject *unicode, Py_ssize_t length) 571{ 572 Py_ssize_t copy_length; 573 if (PyUnicode_IS_COMPACT(unicode)) { 574 PyObject *copy; 575 assert(PyUnicode_IS_READY(unicode)); 576 577 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode)); 578 if (copy == NULL) 579 return NULL; 580 581 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode)); 582 if (PyUnicode_CopyCharacters(copy, 0, 583 unicode, 0, 584 copy_length) < 0) 585 { 586 Py_DECREF(copy); 587 return NULL; 588 } 589 return copy; 590 } 591 else { 592 PyUnicodeObject *w; 593 assert(_PyUnicode_WSTR(unicode) != NULL); 594 assert(_PyUnicode_DATA_ANY(unicode) == NULL); 595 w = _PyUnicode_New(length); 596 if (w == NULL) 597 return NULL; 598 copy_length = _PyUnicode_WSTR_LENGTH(unicode); 599 copy_length = Py_MIN(copy_length, length); 600 Py_UNICODE_COPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode), 601 copy_length); 602 return (PyObject*)w; 603 } 604} 605 606/* We allocate one more byte to make sure the string is 607 Ux0000 terminated; some code (e.g. new_identifier) 608 relies on that. 609 610 XXX This allocator could further be enhanced by assuring that the 611 free list never reduces its size below 1. 
612 613*/ 614 615#ifdef Py_DEBUG 616int unicode_old_new_calls = 0; 617#endif 618 619static PyUnicodeObject * 620_PyUnicode_New(Py_ssize_t length) 621{ 622 register PyUnicodeObject *unicode; 623 size_t new_size; 624 625 /* Optimization for empty strings */ 626 if (length == 0 && unicode_empty != NULL) { 627 Py_INCREF(unicode_empty); 628 return (PyUnicodeObject*)unicode_empty; 629 } 630 631 /* Ensure we won't overflow the size. */ 632 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) { 633 return (PyUnicodeObject *)PyErr_NoMemory(); 634 } 635 if (length < 0) { 636 PyErr_SetString(PyExc_SystemError, 637 "Negative size passed to _PyUnicode_New"); 638 return NULL; 639 } 640 641#ifdef Py_DEBUG 642 ++unicode_old_new_calls; 643#endif 644 645 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type); 646 if (unicode == NULL) 647 return NULL; 648 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1); 649 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size); 650 if (!_PyUnicode_WSTR(unicode)) { 651 PyErr_NoMemory(); 652 goto onError; 653 } 654 655 /* Initialize the first element to guard against cases where 656 * the caller fails before initializing str -- unicode_resize() 657 * reads str[0], and the Keep-Alive optimization can keep memory 658 * allocated for str alive across a call to unicode_dealloc(unicode). 659 * We don't want unicode_resize to read uninitialized memory in 660 * that case. 
661 */ 662 _PyUnicode_WSTR(unicode)[0] = 0; 663 _PyUnicode_WSTR(unicode)[length] = 0; 664 _PyUnicode_WSTR_LENGTH(unicode) = length; 665 _PyUnicode_HASH(unicode) = -1; 666 _PyUnicode_STATE(unicode).interned = 0; 667 _PyUnicode_STATE(unicode).kind = 0; 668 _PyUnicode_STATE(unicode).compact = 0; 669 _PyUnicode_STATE(unicode).ready = 0; 670 _PyUnicode_STATE(unicode).ascii = 0; 671 _PyUnicode_DATA_ANY(unicode) = NULL; 672 _PyUnicode_LENGTH(unicode) = 0; 673 _PyUnicode_UTF8(unicode) = NULL; 674 _PyUnicode_UTF8_LENGTH(unicode) = 0; 675 return unicode; 676 677 onError: 678 /* XXX UNREF/NEWREF interface should be more symmetrical */ 679 _Py_DEC_REFTOTAL; 680 _Py_ForgetReference((PyObject *)unicode); 681 PyObject_Del(unicode); 682 return NULL; 683} 684 685static const char* 686unicode_kind_name(PyObject *unicode) 687{ 688 /* don't check consistency: unicode_kind_name() is called from 689 _PyUnicode_Dump() */ 690 if (!PyUnicode_IS_COMPACT(unicode)) 691 { 692 if (!PyUnicode_IS_READY(unicode)) 693 return "wstr"; 694 switch(PyUnicode_KIND(unicode)) 695 { 696 case PyUnicode_1BYTE_KIND: 697 if (PyUnicode_IS_ASCII(unicode)) 698 return "legacy ascii"; 699 else 700 return "legacy latin1"; 701 case PyUnicode_2BYTE_KIND: 702 return "legacy UCS2"; 703 case PyUnicode_4BYTE_KIND: 704 return "legacy UCS4"; 705 default: 706 return "<legacy invalid kind>"; 707 } 708 } 709 assert(PyUnicode_IS_READY(unicode)); 710 switch(PyUnicode_KIND(unicode)) 711 { 712 case PyUnicode_1BYTE_KIND: 713 if (PyUnicode_IS_ASCII(unicode)) 714 return "ascii"; 715 else 716 return "latin1"; 717 case PyUnicode_2BYTE_KIND: 718 return "UCS2"; 719 case PyUnicode_4BYTE_KIND: 720 return "UCS4"; 721 default: 722 return "<invalid compact kind>"; 723 } 724} 725 726#ifdef Py_DEBUG 727int unicode_new_new_calls = 0; 728 729/* Functions wrapping macros for use in debugger */ 730char *_PyUnicode_utf8(void *unicode){ 731 return PyUnicode_UTF8(unicode); 732} 733 734void *_PyUnicode_compact_data(void *unicode) { 735 return 
_PyUnicode_COMPACT_DATA(unicode); 736} 737void *_PyUnicode_data(void *unicode){ 738 printf("obj %p\n", unicode); 739 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode)); 740 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode)); 741 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1))); 742 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1))); 743 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode)); 744 return PyUnicode_DATA(unicode); 745} 746 747void 748_PyUnicode_Dump(PyObject *op) 749{ 750 PyASCIIObject *ascii = (PyASCIIObject *)op; 751 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op; 752 PyUnicodeObject *unicode = (PyUnicodeObject *)op; 753 void *data; 754 printf("%s: len=%zu, ",unicode_kind_name(op), ascii->length); 755 if (ascii->state.compact) 756 data = (compact + 1); 757 else 758 data = unicode->data.any; 759 if (ascii->wstr == data) 760 printf("shared "); 761 printf("wstr=%p", ascii->wstr); 762 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) { 763 printf(" (%zu), ", compact->wstr_length); 764 if (!ascii->state.compact && compact->utf8 == unicode->data.any) 765 printf("shared "); 766 printf("utf8=%p (%zu)", compact->utf8, compact->utf8_length); 767 } 768 printf(", data=%p\n", data); 769} 770#endif 771 772PyObject * 773PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar) 774{ 775 PyObject *obj; 776 PyCompactUnicodeObject *unicode; 777 void *data; 778 int kind_state; 779 int is_sharing, is_ascii; 780 Py_ssize_t char_size; 781 Py_ssize_t struct_size; 782 783 /* Optimization for empty strings */ 784 if (size == 0 && unicode_empty != NULL) { 785 Py_INCREF(unicode_empty); 786 return unicode_empty; 787 } 788 789#ifdef Py_DEBUG 790 ++unicode_new_new_calls; 791#endif 792 793 is_ascii = 0; 794 is_sharing = 0; 795 struct_size = sizeof(PyCompactUnicodeObject); 796 if (maxchar < 128) { 797 kind_state = PyUnicode_1BYTE_KIND; 798 char_size = 1; 799 is_ascii = 1; 800 struct_size = 
sizeof(PyASCIIObject); 801 } 802 else if (maxchar < 256) { 803 kind_state = PyUnicode_1BYTE_KIND; 804 char_size = 1; 805 } 806 else if (maxchar < 65536) { 807 kind_state = PyUnicode_2BYTE_KIND; 808 char_size = 2; 809 if (sizeof(wchar_t) == 2) 810 is_sharing = 1; 811 } 812 else { 813 kind_state = PyUnicode_4BYTE_KIND; 814 char_size = 4; 815 if (sizeof(wchar_t) == 4) 816 is_sharing = 1; 817 } 818 819 /* Ensure we won't overflow the size. */ 820 if (size < 0) { 821 PyErr_SetString(PyExc_SystemError, 822 "Negative size passed to PyUnicode_New"); 823 return NULL; 824 } 825 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) 826 return PyErr_NoMemory(); 827 828 /* Duplicated allocation code from _PyObject_New() instead of a call to 829 * PyObject_New() so we are able to allocate space for the object and 830 * it's data buffer. 831 */ 832 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size); 833 if (obj == NULL) 834 return PyErr_NoMemory(); 835 obj = PyObject_INIT(obj, &PyUnicode_Type); 836 if (obj == NULL) 837 return NULL; 838 839 unicode = (PyCompactUnicodeObject *)obj; 840 if (is_ascii) 841 data = ((PyASCIIObject*)obj) + 1; 842 else 843 data = unicode + 1; 844 _PyUnicode_LENGTH(unicode) = size; 845 _PyUnicode_HASH(unicode) = -1; 846 _PyUnicode_STATE(unicode).interned = 0; 847 _PyUnicode_STATE(unicode).kind = kind_state; 848 _PyUnicode_STATE(unicode).compact = 1; 849 _PyUnicode_STATE(unicode).ready = 1; 850 _PyUnicode_STATE(unicode).ascii = is_ascii; 851 if (is_ascii) { 852 ((char*)data)[size] = 0; 853 _PyUnicode_WSTR(unicode) = NULL; 854 } 855 else if (kind_state == PyUnicode_1BYTE_KIND) { 856 ((char*)data)[size] = 0; 857 _PyUnicode_WSTR(unicode) = NULL; 858 _PyUnicode_WSTR_LENGTH(unicode) = 0; 859 unicode->utf8 = NULL; 860 unicode->utf8_length = 0; 861 } 862 else { 863 unicode->utf8 = NULL; 864 unicode->utf8_length = 0; 865 if (kind_state == PyUnicode_2BYTE_KIND) 866 ((Py_UCS2*)data)[size] = 0; 867 else /* kind_state == 
PyUnicode_4BYTE_KIND */ 868 ((Py_UCS4*)data)[size] = 0; 869 if (is_sharing) { 870 _PyUnicode_WSTR_LENGTH(unicode) = size; 871 _PyUnicode_WSTR(unicode) = (wchar_t *)data; 872 } 873 else { 874 _PyUnicode_WSTR_LENGTH(unicode) = 0; 875 _PyUnicode_WSTR(unicode) = NULL; 876 } 877 } 878 return obj; 879} 880 881#if SIZEOF_WCHAR_T == 2 882/* Helper function to convert a 16-bits wchar_t representation to UCS4, this 883 will decode surrogate pairs, the other conversions are implemented as macros 884 for efficency. 885 886 This function assumes that unicode can hold one more code point than wstr 887 characters for a terminating null character. */ 888static void 889unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end, 890 PyUnicodeObject *unicode) 891{ 892 const wchar_t *iter; 893 Py_UCS4 *ucs4_out; 894 895 assert(unicode != NULL); 896 assert(_PyUnicode_CHECK(unicode)); 897 assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND); 898 ucs4_out = PyUnicode_4BYTE_DATA(unicode); 899 900 for (iter = begin; iter < end; ) { 901 assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) + 902 _PyUnicode_GET_LENGTH(unicode))); 903 if (*iter >= 0xD800 && *iter <= 0xDBFF 904 && (iter+1) < end && iter[1] >= 0xDC00 && iter[1] <= 0xDFFF) 905 { 906 *ucs4_out++ = (((iter[0] & 0x3FF)<<10) | (iter[1] & 0x3FF)) + 0x10000; 907 iter += 2; 908 } 909 else { 910 *ucs4_out++ = *iter; 911 iter++; 912 } 913 } 914 assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) + 915 _PyUnicode_GET_LENGTH(unicode))); 916 917} 918#endif 919 920static int 921_PyUnicode_Dirty(PyObject *unicode) 922{ 923 assert(_PyUnicode_CHECK(unicode)); 924 if (Py_REFCNT(unicode) != 1) { 925 PyErr_SetString(PyExc_SystemError, 926 "Cannot modify a string having more than 1 reference"); 927 return -1; 928 } 929 _PyUnicode_DIRTY(unicode); 930 return 0; 931} 932 933Py_ssize_t 934PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start, 935 PyObject *from, Py_ssize_t from_start, 936 Py_ssize_t how_many) 937{ 938 unsigned int 
from_kind, to_kind; 939 void *from_data, *to_data; 940 941 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) { 942 PyErr_BadInternalCall(); 943 return -1; 944 } 945 946 if (PyUnicode_READY(from)) 947 return -1; 948 if (PyUnicode_READY(to)) 949 return -1; 950 951 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many); 952 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) { 953 PyErr_Format(PyExc_SystemError, 954 "Cannot write %zi characters at %zi " 955 "in a string of %zi characters", 956 how_many, to_start, PyUnicode_GET_LENGTH(to)); 957 return -1; 958 } 959 if (how_many == 0) 960 return 0; 961 962 if (_PyUnicode_Dirty(to)) 963 return -1; 964 965 from_kind = PyUnicode_KIND(from); 966 from_data = PyUnicode_DATA(from); 967 to_kind = PyUnicode_KIND(to); 968 to_data = PyUnicode_DATA(to); 969 970 if (from_kind == to_kind 971 /* deny latin1 => ascii */ 972 && !(!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))) 973 { 974 Py_MEMCPY((char*)to_data 975 + PyUnicode_KIND_SIZE(to_kind, to_start), 976 (char*)from_data 977 + PyUnicode_KIND_SIZE(from_kind, from_start), 978 PyUnicode_KIND_SIZE(to_kind, how_many)); 979 } 980 else if (from_kind == PyUnicode_1BYTE_KIND 981 && to_kind == PyUnicode_2BYTE_KIND) 982 { 983 _PyUnicode_CONVERT_BYTES( 984 Py_UCS1, Py_UCS2, 985 PyUnicode_1BYTE_DATA(from) + from_start, 986 PyUnicode_1BYTE_DATA(from) + from_start + how_many, 987 PyUnicode_2BYTE_DATA(to) + to_start 988 ); 989 } 990 else if (from_kind == PyUnicode_1BYTE_KIND 991 && to_kind == PyUnicode_4BYTE_KIND) 992 { 993 _PyUnicode_CONVERT_BYTES( 994 Py_UCS1, Py_UCS4, 995 PyUnicode_1BYTE_DATA(from) + from_start, 996 PyUnicode_1BYTE_DATA(from) + from_start + how_many, 997 PyUnicode_4BYTE_DATA(to) + to_start 998 ); 999 } 1000 else if (from_kind == PyUnicode_2BYTE_KIND 1001 && to_kind == PyUnicode_4BYTE_KIND) 1002 { 1003 _PyUnicode_CONVERT_BYTES( 1004 Py_UCS2, Py_UCS4, 1005 PyUnicode_2BYTE_DATA(from) + from_start, 1006 PyUnicode_2BYTE_DATA(from) + from_start + how_many, 1007 
PyUnicode_4BYTE_DATA(to) + to_start 1008 ); 1009 } 1010 else { 1011 int invalid_kinds; 1012 1013 /* check if max_char(from substring) <= max_char(to) */ 1014 if (from_kind > to_kind 1015 /* latin1 => ascii */ 1016 || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))) 1017 { 1018 /* slow path to check for character overflow */ 1019 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to); 1020 Py_UCS4 ch, maxchar; 1021 Py_ssize_t i; 1022 1023 maxchar = 0; 1024 invalid_kinds = 0; 1025 for (i=0; i < how_many; i++) { 1026 ch = PyUnicode_READ(from_kind, from_data, from_start + i); 1027 if (ch > maxchar) { 1028 maxchar = ch; 1029 if (maxchar > to_maxchar) { 1030 invalid_kinds = 1; 1031 break; 1032 } 1033 } 1034 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch); 1035 } 1036 } 1037 else 1038 invalid_kinds = 1; 1039 if (invalid_kinds) { 1040 PyErr_Format(PyExc_SystemError, 1041 "Cannot copy %s characters " 1042 "into a string of %s characters", 1043 unicode_kind_name(from), 1044 unicode_kind_name(to)); 1045 return -1; 1046 } 1047 } 1048 return how_many; 1049} 1050 1051/* Find the maximum code point and count the number of surrogate pairs so a 1052 correct string length can be computed before converting a string to UCS4. 1053 This function counts single surrogates as a character and not as a pair. 1054 1055 Return 0 on success, or -1 on error. 
*/ 1056static int 1057find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end, 1058 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates) 1059{ 1060 const wchar_t *iter; 1061 1062 assert(num_surrogates != NULL && maxchar != NULL); 1063 if (num_surrogates == NULL || maxchar == NULL) { 1064 PyErr_SetString(PyExc_SystemError, 1065 "unexpected NULL arguments to " 1066 "PyUnicode_FindMaxCharAndNumSurrogatePairs"); 1067 return -1; 1068 } 1069 1070 *num_surrogates = 0; 1071 *maxchar = 0; 1072 1073 for (iter = begin; iter < end; ) { 1074 if (*iter > *maxchar) 1075 *maxchar = *iter; 1076#if SIZEOF_WCHAR_T == 2 1077 if (*iter >= 0xD800 && *iter <= 0xDBFF 1078 && (iter+1) < end && iter[1] >= 0xDC00 && iter[1] <= 0xDFFF) 1079 { 1080 Py_UCS4 surrogate_val; 1081 surrogate_val = (((iter[0] & 0x3FF)<<10) 1082 | (iter[1] & 0x3FF)) + 0x10000; 1083 ++(*num_surrogates); 1084 if (surrogate_val > *maxchar) 1085 *maxchar = surrogate_val; 1086 iter += 2; 1087 } 1088 else 1089 iter++; 1090#else 1091 iter++; 1092#endif 1093 } 1094 return 0; 1095} 1096 1097#ifdef Py_DEBUG 1098int unicode_ready_calls = 0; 1099#endif 1100 1101static int 1102unicode_ready(PyObject **p_obj, int replace) 1103{ 1104 PyUnicodeObject *unicode; 1105 wchar_t *end; 1106 Py_UCS4 maxchar = 0; 1107 Py_ssize_t num_surrogates; 1108#if SIZEOF_WCHAR_T == 2 1109 Py_ssize_t length_wo_surrogates; 1110#endif 1111 1112 assert(p_obj != NULL); 1113 unicode = (PyUnicodeObject *)*p_obj; 1114 1115 /* _PyUnicode_Ready() is only intented for old-style API usage where 1116 strings were created using _PyObject_New() and where no canonical 1117 representation (the str field) has been set yet aka strings 1118 which are not yet ready. 
*/ 1119 assert(_PyUnicode_CHECK(unicode)); 1120 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND); 1121 assert(_PyUnicode_WSTR(unicode) != NULL); 1122 assert(_PyUnicode_DATA_ANY(unicode) == NULL); 1123 assert(_PyUnicode_UTF8(unicode) == NULL); 1124 /* Actually, it should neither be interned nor be anything else: */ 1125 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED); 1126 1127#ifdef Py_DEBUG 1128 ++unicode_ready_calls; 1129#endif 1130 1131#ifdef Py_DEBUG 1132 assert(!replace || Py_REFCNT(unicode) == 1); 1133#else 1134 if (replace && Py_REFCNT(unicode) != 1) 1135 replace = 0; 1136#endif 1137 if (replace) { 1138 Py_ssize_t len = _PyUnicode_WSTR_LENGTH(unicode); 1139 wchar_t *wstr = _PyUnicode_WSTR(unicode); 1140 /* Optimization for empty strings */ 1141 if (len == 0) { 1142 Py_INCREF(unicode_empty); 1143 Py_DECREF(*p_obj); 1144 *p_obj = unicode_empty; 1145 return 0; 1146 } 1147 if (len == 1 && wstr[0] < 256) { 1148 PyObject *latin1_char = get_latin1_char((unsigned char)wstr[0]); 1149 if (latin1_char == NULL) 1150 return -1; 1151 Py_DECREF(*p_obj); 1152 *p_obj = latin1_char; 1153 return 0; 1154 } 1155 } 1156 1157 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode); 1158 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end, 1159 &maxchar, &num_surrogates) == -1) 1160 return -1; 1161 1162 if (maxchar < 256) { 1163 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1); 1164 if (!_PyUnicode_DATA_ANY(unicode)) { 1165 PyErr_NoMemory(); 1166 return -1; 1167 } 1168 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char, 1169 _PyUnicode_WSTR(unicode), end, 1170 PyUnicode_1BYTE_DATA(unicode)); 1171 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0'; 1172 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode); 1173 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND; 1174 if (maxchar < 128) { 1175 _PyUnicode_STATE(unicode).ascii = 1; 1176 _PyUnicode_UTF8(unicode) = 
_PyUnicode_DATA_ANY(unicode); 1177 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode); 1178 } 1179 else { 1180 _PyUnicode_STATE(unicode).ascii = 0; 1181 _PyUnicode_UTF8(unicode) = NULL; 1182 _PyUnicode_UTF8_LENGTH(unicode) = 0; 1183 } 1184 PyObject_FREE(_PyUnicode_WSTR(unicode)); 1185 _PyUnicode_WSTR(unicode) = NULL; 1186 _PyUnicode_WSTR_LENGTH(unicode) = 0; 1187 } 1188 /* In this case we might have to convert down from 4-byte native 1189 wchar_t to 2-byte unicode. */ 1190 else if (maxchar < 65536) { 1191 assert(num_surrogates == 0 && 1192 "FindMaxCharAndNumSurrogatePairs() messed up"); 1193 1194#if SIZEOF_WCHAR_T == 2 1195 /* We can share representations and are done. */ 1196 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode); 1197 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0'; 1198 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode); 1199 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND; 1200 _PyUnicode_UTF8(unicode) = NULL; 1201 _PyUnicode_UTF8_LENGTH(unicode) = 0; 1202#else 1203 /* sizeof(wchar_t) == 4 */ 1204 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC( 1205 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1)); 1206 if (!_PyUnicode_DATA_ANY(unicode)) { 1207 PyErr_NoMemory(); 1208 return -1; 1209 } 1210 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2, 1211 _PyUnicode_WSTR(unicode), end, 1212 PyUnicode_2BYTE_DATA(unicode)); 1213 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0'; 1214 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode); 1215 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND; 1216 _PyUnicode_UTF8(unicode) = NULL; 1217 _PyUnicode_UTF8_LENGTH(unicode) = 0; 1218 PyObject_FREE(_PyUnicode_WSTR(unicode)); 1219 _PyUnicode_WSTR(unicode) = NULL; 1220 _PyUnicode_WSTR_LENGTH(unicode) = 0; 1221#endif 1222 } 1223 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */ 1224 else { 1225#if SIZEOF_WCHAR_T == 2 1226 /* in case the native representation is 2-bytes, we need to 
allocate a 1227 new normalized 4-byte version. */ 1228 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates; 1229 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1)); 1230 if (!_PyUnicode_DATA_ANY(unicode)) { 1231 PyErr_NoMemory(); 1232 return -1; 1233 } 1234 _PyUnicode_LENGTH(unicode) = length_wo_surrogates; 1235 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND; 1236 _PyUnicode_UTF8(unicode) = NULL; 1237 _PyUnicode_UTF8_LENGTH(unicode) = 0; 1238 /* unicode_convert_wchar_to_ucs4() requires a ready string */ 1239 _PyUnicode_STATE(unicode).ready = 1; 1240 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode); 1241 PyObject_FREE(_PyUnicode_WSTR(unicode)); 1242 _PyUnicode_WSTR(unicode) = NULL; 1243 _PyUnicode_WSTR_LENGTH(unicode) = 0; 1244#else 1245 assert(num_surrogates == 0); 1246 1247 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode); 1248 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode); 1249 _PyUnicode_UTF8(unicode) = NULL; 1250 _PyUnicode_UTF8_LENGTH(unicode) = 0; 1251 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND; 1252#endif 1253 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0'; 1254 } 1255 _PyUnicode_STATE(unicode).ready = 1; 1256 return 0; 1257} 1258 1259int 1260_PyUnicode_ReadyReplace(PyObject **op) 1261{ 1262 return unicode_ready(op, 1); 1263} 1264 1265int 1266_PyUnicode_Ready(PyObject *op) 1267{ 1268 return unicode_ready(&op, 0); 1269} 1270 1271static void 1272unicode_dealloc(register PyUnicodeObject *unicode) 1273{ 1274 switch (PyUnicode_CHECK_INTERNED(unicode)) { 1275 case SSTATE_NOT_INTERNED: 1276 break; 1277 1278 case SSTATE_INTERNED_MORTAL: 1279 /* revive dead object temporarily for DelItem */ 1280 Py_REFCNT(unicode) = 3; 1281 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0) 1282 Py_FatalError( 1283 "deletion of interned string failed"); 1284 break; 1285 1286 case SSTATE_INTERNED_IMMORTAL: 1287 Py_FatalError("Immortal interned string 
died."); 1288 1289 default: 1290 Py_FatalError("Inconsistent interned string state."); 1291 } 1292 1293 if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) 1294 PyObject_DEL(_PyUnicode_WSTR(unicode)); 1295 if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) 1296 PyObject_DEL(_PyUnicode_UTF8(unicode)); 1297 1298 if (PyUnicode_IS_COMPACT(unicode)) { 1299 Py_TYPE(unicode)->tp_free((PyObject *)unicode); 1300 } 1301 else { 1302 if (_PyUnicode_DATA_ANY(unicode)) 1303 PyObject_DEL(_PyUnicode_DATA_ANY(unicode)); 1304 Py_TYPE(unicode)->tp_free((PyObject *)unicode); 1305 } 1306} 1307 1308static int 1309unicode_resizable(PyObject *unicode) 1310{ 1311 if (Py_REFCNT(unicode) != 1) 1312 return 0; 1313 if (PyUnicode_CHECK_INTERNED(unicode)) 1314 return 0; 1315 assert(unicode != unicode_empty); 1316#ifdef Py_DEBUG 1317 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND 1318 && PyUnicode_GET_LENGTH(unicode) == 1) 1319 { 1320 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0); 1321 if (ch < 256 && unicode_latin1[ch] == unicode) 1322 return 0; 1323 } 1324#endif 1325 return 1; 1326} 1327 1328static int 1329unicode_resize(PyObject **p_unicode, Py_ssize_t length) 1330{ 1331 PyObject *unicode; 1332 Py_ssize_t old_length; 1333 1334 assert(p_unicode != NULL); 1335 unicode = *p_unicode; 1336 1337 assert(unicode != NULL); 1338 assert(PyUnicode_Check(unicode)); 1339 assert(0 <= length); 1340 1341 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND) 1342 old_length = PyUnicode_WSTR_LENGTH(unicode); 1343 else 1344 old_length = PyUnicode_GET_LENGTH(unicode); 1345 if (old_length == length) 1346 return 0; 1347 1348 if (!unicode_resizable(unicode)) { 1349 PyObject *copy = resize_copy(unicode, length); 1350 if (copy == NULL) 1351 return -1; 1352 Py_DECREF(*p_unicode); 1353 *p_unicode = copy; 1354 return 0; 1355 } 1356 1357 if (PyUnicode_IS_COMPACT(unicode)) { 1358 *p_unicode = resize_compact(unicode, length); 1359 if (*p_unicode == NULL) 1360 return -1; 1361 _PyUnicode_CheckConsistency(*p_unicode); 1362 return 0; 1363 } 
1364 return resize_inplace((PyUnicodeObject*)unicode, length); 1365} 1366 1367int 1368PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length) 1369{ 1370 PyObject *unicode; 1371 if (p_unicode == NULL) { 1372 PyErr_BadInternalCall(); 1373 return -1; 1374 } 1375 unicode = *p_unicode; 1376 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0 1377 || _PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) 1378 { 1379 PyErr_BadInternalCall(); 1380 return -1; 1381 } 1382 return unicode_resize(p_unicode, length); 1383} 1384 1385static PyObject* 1386get_latin1_char(unsigned char ch) 1387{ 1388 PyObject *unicode = unicode_latin1[ch]; 1389 if (!unicode) { 1390 unicode = PyUnicode_New(1, ch); 1391 if (!unicode) 1392 return NULL; 1393 PyUnicode_1BYTE_DATA(unicode)[0] = ch; 1394 unicode_latin1[ch] = unicode; 1395 } 1396 Py_INCREF(unicode); 1397 return unicode; 1398} 1399 1400PyObject * 1401PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size) 1402{ 1403 PyUnicodeObject *unicode; 1404 Py_UCS4 maxchar = 0; 1405 Py_ssize_t num_surrogates; 1406 1407 if (u == NULL) 1408 return (PyObject*)_PyUnicode_New(size); 1409 1410 /* If the Unicode data is known at construction time, we can apply 1411 some optimizations which share commonly used objects. 
*/ 1412 1413 /* Optimization for empty strings */ 1414 if (size == 0 && unicode_empty != NULL) { 1415 Py_INCREF(unicode_empty); 1416 return unicode_empty; 1417 } 1418 1419 /* Single character Unicode objects in the Latin-1 range are 1420 shared when using this constructor */ 1421 if (size == 1 && *u < 256) 1422 return get_latin1_char((unsigned char)*u); 1423 1424 /* If not empty and not single character, copy the Unicode data 1425 into the new object */ 1426 if (find_maxchar_surrogates(u, u + size, 1427 &maxchar, &num_surrogates) == -1) 1428 return NULL; 1429 1430 unicode = (PyUnicodeObject *) PyUnicode_New(size - num_surrogates, 1431 maxchar); 1432 if (!unicode) 1433 return NULL; 1434 1435 switch (PyUnicode_KIND(unicode)) { 1436 case PyUnicode_1BYTE_KIND: 1437 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char, 1438 u, u + size, PyUnicode_1BYTE_DATA(unicode)); 1439 break; 1440 case PyUnicode_2BYTE_KIND: 1441#if Py_UNICODE_SIZE == 2 1442 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2); 1443#else 1444 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2, 1445 u, u + size, PyUnicode_2BYTE_DATA(unicode)); 1446#endif 1447 break; 1448 case PyUnicode_4BYTE_KIND: 1449#if SIZEOF_WCHAR_T == 2 1450 /* This is the only case which has to process surrogates, thus 1451 a simple copy loop is not enough and we need a function. 
*/ 1452 unicode_convert_wchar_to_ucs4(u, u + size, unicode); 1453#else 1454 assert(num_surrogates == 0); 1455 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4); 1456#endif 1457 break; 1458 default: 1459 assert(0 && "Impossible state"); 1460 } 1461 1462 return (PyObject *)unicode; 1463} 1464 1465PyObject * 1466PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size) 1467{ 1468 PyUnicodeObject *unicode; 1469 1470 if (size < 0) { 1471 PyErr_SetString(PyExc_SystemError, 1472 "Negative size passed to PyUnicode_FromStringAndSize"); 1473 return NULL; 1474 } 1475 1476 /* If the Unicode data is known at construction time, we can apply 1477 some optimizations which share commonly used objects. 1478 Also, this means the input must be UTF-8, so fall back to the 1479 UTF-8 decoder at the end. */ 1480 if (u != NULL) { 1481 1482 /* Optimization for empty strings */ 1483 if (size == 0 && unicode_empty != NULL) { 1484 Py_INCREF(unicode_empty); 1485 return unicode_empty; 1486 } 1487 1488 /* Single characters are shared when using this constructor. 1489 Restrict to ASCII, since the input must be UTF-8. 
*/ 1490 if (size == 1 && Py_CHARMASK(*u) < 128) 1491 return get_latin1_char(Py_CHARMASK(*u)); 1492 1493 return PyUnicode_DecodeUTF8(u, size, NULL); 1494 } 1495 1496 unicode = _PyUnicode_New(size); 1497 if (!unicode) 1498 return NULL; 1499 1500 return (PyObject *)unicode; 1501} 1502 1503PyObject * 1504PyUnicode_FromString(const char *u) 1505{ 1506 size_t size = strlen(u); 1507 if (size > PY_SSIZE_T_MAX) { 1508 PyErr_SetString(PyExc_OverflowError, "input too long"); 1509 return NULL; 1510 } 1511 1512 return PyUnicode_FromStringAndSize(u, size); 1513} 1514 1515static PyObject* 1516unicode_fromascii(const unsigned char* u, Py_ssize_t size) 1517{ 1518 PyObject *res = PyUnicode_New(size, 127); 1519 if (!res) 1520 return NULL; 1521 memcpy(PyUnicode_1BYTE_DATA(res), u, size); 1522 return res; 1523} 1524 1525static PyObject* 1526_PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size) 1527{ 1528 PyObject *res; 1529 unsigned char max_char = 127; 1530 Py_ssize_t i; 1531 1532 assert(size >= 0); 1533 for (i = 0; i < size; i++) { 1534 if (u[i] & 0x80) { 1535 max_char = 255; 1536 break; 1537 } 1538 } 1539 res = PyUnicode_New(size, max_char); 1540 if (!res) 1541 return NULL; 1542 memcpy(PyUnicode_1BYTE_DATA(res), u, size); 1543 return res; 1544} 1545 1546static PyObject* 1547_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size) 1548{ 1549 PyObject *res; 1550 Py_UCS2 max_char = 0; 1551 Py_ssize_t i; 1552 1553 assert(size >= 0); 1554 for (i = 0; i < size; i++) { 1555 if (u[i] > max_char) { 1556 max_char = u[i]; 1557 if (max_char >= 256) 1558 break; 1559 } 1560 } 1561 res = PyUnicode_New(size, max_char); 1562 if (!res) 1563 return NULL; 1564 if (max_char >= 256) 1565 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size); 1566 else 1567 for (i = 0; i < size; i++) 1568 PyUnicode_1BYTE_DATA(res)[i] = (Py_UCS1)u[i]; 1569 return res; 1570} 1571 1572static PyObject* 1573_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size) 1574{ 1575 PyObject *res; 1576 Py_UCS4 max_char = 0; 
1577 Py_ssize_t i; 1578 1579 assert(size >= 0); 1580 for (i = 0; i < size; i++) { 1581 if (u[i] > max_char) { 1582 max_char = u[i]; 1583 if (max_char >= 0x10000) 1584 break; 1585 } 1586 } 1587 res = PyUnicode_New(size, max_char); 1588 if (!res) 1589 return NULL; 1590 if (max_char >= 0x10000) 1591 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size); 1592 else { 1593 int kind = PyUnicode_KIND(res); 1594 void *data = PyUnicode_DATA(res); 1595 for (i = 0; i < size; i++) 1596 PyUnicode_WRITE(kind, data, i, u[i]); 1597 } 1598 return res; 1599} 1600 1601PyObject* 1602PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size) 1603{ 1604 switch(kind) { 1605 case PyUnicode_1BYTE_KIND: 1606 return _PyUnicode_FromUCS1(buffer, size); 1607 case PyUnicode_2BYTE_KIND: 1608 return _PyUnicode_FromUCS2(buffer, size); 1609 case PyUnicode_4BYTE_KIND: 1610 return _PyUnicode_FromUCS4(buffer, size); 1611 default: 1612 assert(0 && "invalid kind"); 1613 PyErr_SetString(PyExc_SystemError, "invalid kind"); 1614 return NULL; 1615 } 1616} 1617 1618PyObject* 1619PyUnicode_Copy(PyObject *unicode) 1620{ 1621 Py_ssize_t size; 1622 PyObject *copy; 1623 void *data; 1624 1625 if (!PyUnicode_Check(unicode)) { 1626 PyErr_BadInternalCall(); 1627 return NULL; 1628 } 1629 if (PyUnicode_READY(unicode)) 1630 return NULL; 1631 1632 size = PyUnicode_GET_LENGTH(unicode); 1633 copy = PyUnicode_New(size, PyUnicode_MAX_CHAR_VALUE(unicode)); 1634 if (!copy) 1635 return NULL; 1636 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode)); 1637 1638 data = PyUnicode_DATA(unicode); 1639 switch (PyUnicode_KIND(unicode)) 1640 { 1641 case PyUnicode_1BYTE_KIND: 1642 memcpy(PyUnicode_1BYTE_DATA(copy), data, size); 1643 break; 1644 case PyUnicode_2BYTE_KIND: 1645 memcpy(PyUnicode_2BYTE_DATA(copy), data, sizeof(Py_UCS2) * size); 1646 break; 1647 case PyUnicode_4BYTE_KIND: 1648 memcpy(PyUnicode_4BYTE_DATA(copy), data, sizeof(Py_UCS4) * size); 1649 break; 1650 default: 1651 assert(0); 1652 break; 1653 } 1654 
return copy; 1655} 1656 1657 1658/* Widen Unicode objects to larger buffers. Don't write terminating null 1659 character. Return NULL on error. */ 1660 1661void* 1662_PyUnicode_AsKind(PyObject *s, unsigned int kind) 1663{ 1664 Py_ssize_t len; 1665 void *result; 1666 unsigned int skind; 1667 1668 if (PyUnicode_READY(s)) 1669 return NULL; 1670 1671 len = PyUnicode_GET_LENGTH(s); 1672 skind = PyUnicode_KIND(s); 1673 if (skind >= kind) { 1674 PyErr_SetString(PyExc_SystemError, "invalid widening attempt"); 1675 return NULL; 1676 } 1677 switch(kind) { 1678 case PyUnicode_2BYTE_KIND: 1679 result = PyMem_Malloc(len * sizeof(Py_UCS2)); 1680 if (!result) 1681 return PyErr_NoMemory(); 1682 assert(skind == PyUnicode_1BYTE_KIND); 1683 _PyUnicode_CONVERT_BYTES( 1684 Py_UCS1, Py_UCS2, 1685 PyUnicode_1BYTE_DATA(s), 1686 PyUnicode_1BYTE_DATA(s) + len, 1687 result); 1688 return result; 1689 case PyUnicode_4BYTE_KIND: 1690 result = PyMem_Malloc(len * sizeof(Py_UCS4)); 1691 if (!result) 1692 return PyErr_NoMemory(); 1693 if (skind == PyUnicode_2BYTE_KIND) { 1694 _PyUnicode_CONVERT_BYTES( 1695 Py_UCS2, Py_UCS4, 1696 PyUnicode_2BYTE_DATA(s), 1697 PyUnicode_2BYTE_DATA(s) + len, 1698 result); 1699 } 1700 else { 1701 assert(skind == PyUnicode_1BYTE_KIND); 1702 _PyUnicode_CONVERT_BYTES( 1703 Py_UCS1, Py_UCS4, 1704 PyUnicode_1BYTE_DATA(s), 1705 PyUnicode_1BYTE_DATA(s) + len, 1706 result); 1707 } 1708 return result; 1709 default: 1710 break; 1711 } 1712 PyErr_SetString(PyExc_SystemError, "invalid kind"); 1713 return NULL; 1714} 1715 1716static Py_UCS4* 1717as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize, 1718 int copy_null) 1719{ 1720 int kind; 1721 void *data; 1722 Py_ssize_t len, targetlen; 1723 if (PyUnicode_READY(string) == -1) 1724 return NULL; 1725 kind = PyUnicode_KIND(string); 1726 data = PyUnicode_DATA(string); 1727 len = PyUnicode_GET_LENGTH(string); 1728 targetlen = len; 1729 if (copy_null) 1730 targetlen++; 1731 if (!target) { 1732 if (PY_SSIZE_T_MAX / 
sizeof(Py_UCS4) < targetlen) { 1733 PyErr_NoMemory(); 1734 return NULL; 1735 } 1736 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4)); 1737 if (!target) { 1738 PyErr_NoMemory(); 1739 return NULL; 1740 } 1741 } 1742 else { 1743 if (targetsize < targetlen) { 1744 PyErr_Format(PyExc_SystemError, 1745 "string is longer than the buffer"); 1746 if (copy_null && 0 < targetsize) 1747 target[0] = 0; 1748 return NULL; 1749 } 1750 } 1751 if (kind != PyUnicode_4BYTE_KIND) { 1752 Py_ssize_t i; 1753 for (i = 0; i < len; i++) 1754 target[i] = PyUnicode_READ(kind, data, i); 1755 } 1756 else 1757 Py_MEMCPY(target, data, len * sizeof(Py_UCS4)); 1758 if (copy_null) 1759 target[len] = 0; 1760 return target; 1761} 1762 1763Py_UCS4* 1764PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize, 1765 int copy_null) 1766{ 1767 if (target == NULL || targetsize < 1) { 1768 PyErr_BadInternalCall(); 1769 return NULL; 1770 } 1771 return as_ucs4(string, target, targetsize, copy_null); 1772} 1773 1774Py_UCS4* 1775PyUnicode_AsUCS4Copy(PyObject *string) 1776{ 1777 return as_ucs4(string, NULL, 0, 1); 1778} 1779 1780#ifdef HAVE_WCHAR_H 1781 1782PyObject * 1783PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size) 1784{ 1785 if (w == NULL) { 1786 if (size == 0) 1787 return PyUnicode_New(0, 0); 1788 PyErr_BadInternalCall(); 1789 return NULL; 1790 } 1791 1792 if (size == -1) { 1793 size = wcslen(w); 1794 } 1795 1796 return PyUnicode_FromUnicode(w, size); 1797} 1798 1799#endif /* HAVE_WCHAR_H */ 1800 1801static void 1802makefmt(char *fmt, int longflag, int longlongflag, int size_tflag, 1803 int zeropad, int width, int precision, char c) 1804{ 1805 *fmt++ = '%'; 1806 if (width) { 1807 if (zeropad) 1808 *fmt++ = '0'; 1809 fmt += sprintf(fmt, "%d", width); 1810 } 1811 if (precision) 1812 fmt += sprintf(fmt, ".%d", precision); 1813 if (longflag) 1814 *fmt++ = 'l'; 1815 else if (longlongflag) { 1816 /* longlongflag should only ever be nonzero on machines with 1817 HAVE_LONG_LONG 
defined */ 1818#ifdef HAVE_LONG_LONG 1819 char *f = PY_FORMAT_LONG_LONG; 1820 while (*f) 1821 *fmt++ = *f++; 1822#else 1823 /* we shouldn't ever get here */ 1824 assert(0); 1825 *fmt++ = 'l'; 1826#endif 1827 } 1828 else if (size_tflag) { 1829 char *f = PY_FORMAT_SIZE_T; 1830 while (*f) 1831 *fmt++ = *f++; 1832 } 1833 *fmt++ = c; 1834 *fmt = '\0'; 1835} 1836 1837/* helper for PyUnicode_FromFormatV() */ 1838 1839static const char* 1840parse_format_flags(const char *f, 1841 int *p_width, int *p_precision, 1842 int *p_longflag, int *p_longlongflag, int *p_size_tflag) 1843{ 1844 int width, precision, longflag, longlongflag, size_tflag; 1845 1846 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */ 1847 f++; 1848 width = 0; 1849 while (Py_ISDIGIT((unsigned)*f)) 1850 width = (width*10) + *f++ - '0'; 1851 precision = 0; 1852 if (*f == '.') { 1853 f++; 1854 while (Py_ISDIGIT((unsigned)*f)) 1855 precision = (precision*10) + *f++ - '0'; 1856 if (*f == '%') { 1857 /* "%.3%s" => f points to "3" */ 1858 f--; 1859 } 1860 } 1861 if (*f == '\0') { 1862 /* bogus format "%.1" => go backward, f points to "1" */ 1863 f--; 1864 } 1865 if (p_width != NULL) 1866 *p_width = width; 1867 if (p_precision != NULL) 1868 *p_precision = precision; 1869 1870 /* Handle %ld, %lu, %lld and %llu. */ 1871 longflag = 0; 1872 longlongflag = 0; 1873 size_tflag = 0; 1874 1875 if (*f == 'l') { 1876 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') { 1877 longflag = 1; 1878 ++f; 1879 } 1880#ifdef HAVE_LONG_LONG 1881 else if (f[1] == 'l' && 1882 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) { 1883 longlongflag = 1; 1884 f += 2; 1885 } 1886#endif 1887 } 1888 /* handle the size_t flag. 
*/ 1889 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) { 1890 size_tflag = 1; 1891 ++f; 1892 } 1893 if (p_longflag != NULL) 1894 *p_longflag = longflag; 1895 if (p_longlongflag != NULL) 1896 *p_longlongflag = longlongflag; 1897 if (p_size_tflag != NULL) 1898 *p_size_tflag = size_tflag; 1899 return f; 1900} 1901 1902/* maximum number of characters required for output of %ld. 21 characters 1903 allows for 64-bit integers (in decimal) and an optional sign. */ 1904#define MAX_LONG_CHARS 21 1905/* maximum number of characters required for output of %lld. 1906 We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits, 1907 plus 1 for the sign. 53/22 is an upper bound for log10(256). */ 1908#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22) 1909 1910PyObject * 1911PyUnicode_FromFormatV(const char *format, va_list vargs) 1912{ 1913 va_list count; 1914 Py_ssize_t callcount = 0; 1915 PyObject **callresults = NULL; 1916 PyObject **callresult = NULL; 1917 Py_ssize_t n = 0; 1918 int width = 0; 1919 int precision = 0; 1920 int zeropad; 1921 const char* f; 1922 PyUnicodeObject *string; 1923 /* used by sprintf */ 1924 char fmt[61]; /* should be enough for %0width.precisionlld */ 1925 Py_UCS4 maxchar = 127; /* result is ASCII by default */ 1926 Py_UCS4 argmaxchar; 1927 Py_ssize_t numbersize = 0; 1928 char *numberresults = NULL; 1929 char *numberresult = NULL; 1930 Py_ssize_t i; 1931 int kind; 1932 void *data; 1933 1934 Py_VA_COPY(count, vargs); 1935 /* step 1: count the number of %S/%R/%A/%s format specifications 1936 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/ 1937 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the 1938 * result in an array) 1939 * also esimate a upper bound for all the number formats in the string, 1940 * numbers will be formated in step 3 and be keept in a '\0'-separated 1941 * buffer before putting everything together. 
*/ 1942 for (f = format; *f; f++) { 1943 if (*f == '%') { 1944 int longlongflag; 1945 /* skip width or width.precision (eg. "1.2" of "%1.2f") */ 1946 f = parse_format_flags(f, &width, NULL, NULL, &longlongflag, NULL); 1947 if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V') 1948 ++callcount; 1949 1950 else if (*f == 'd' || *f=='u' || *f=='i' || *f=='x' || *f=='p') { 1951#ifdef HAVE_LONG_LONG 1952 if (longlongflag) { 1953 if (width < MAX_LONG_LONG_CHARS) 1954 width = MAX_LONG_LONG_CHARS; 1955 } 1956 else 1957#endif 1958 /* MAX_LONG_CHARS is enough to hold a 64-bit integer, 1959 including sign. Decimal takes the most space. This 1960 isn't enough for octal. If a width is specified we 1961 need more (which we allocate later). */ 1962 if (width < MAX_LONG_CHARS) 1963 width = MAX_LONG_CHARS; 1964 1965 /* account for the size + '\0' to separate numbers 1966 inside of the numberresults buffer */ 1967 numbersize += (width + 1); 1968 } 1969 } 1970 else if ((unsigned char)*f > 127) { 1971 PyErr_Format(PyExc_ValueError, 1972 "PyUnicode_FromFormatV() expects an ASCII-encoded format " 1973 "string, got a non-ASCII byte: 0x%02x", 1974 (unsigned char)*f); 1975 return NULL; 1976 } 1977 } 1978 /* step 2: allocate memory for the results of 1979 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */ 1980 if (callcount) { 1981 callresults = PyObject_Malloc(sizeof(PyObject *) * callcount); 1982 if (!callresults) { 1983 PyErr_NoMemory(); 1984 return NULL; 1985 } 1986 callresult = callresults; 1987 } 1988 /* step 2.5: allocate memory for the results of formating numbers */ 1989 if (numbersize) { 1990 numberresults = PyObject_Malloc(numbersize); 1991 if (!numberresults) { 1992 PyErr_NoMemory(); 1993 goto fail; 1994 } 1995 numberresult = numberresults; 1996 } 1997 1998 /* step 3: format numbers and figure out how large a buffer we need */ 1999 for (f = format; *f; f++) { 2000 if (*f == '%') { 2001 const char* p; 2002 int longflag; 2003 int longlongflag; 2004 int 
size_tflag; 2005 int numprinted; 2006 2007 p = f; 2008 zeropad = (f[1] == '0'); 2009 f = parse_format_flags(f, &width, &precision, 2010 &longflag, &longlongflag, &size_tflag); 2011 switch (*f) { 2012 case 'c': 2013 { 2014 Py_UCS4 ordinal = va_arg(count, int); 2015 maxchar = Py_MAX(maxchar, ordinal); 2016 n++; 2017 break; 2018 } 2019 case '%': 2020 n++; 2021 break; 2022 case 'i': 2023 case 'd': 2024 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad, 2025 width, precision, *f); 2026 if (longflag) 2027 numprinted = sprintf(numberresult, fmt, 2028 va_arg(count, long)); 2029#ifdef HAVE_LONG_LONG 2030 else if (longlongflag) 2031 numprinted = sprintf(numberresult, fmt, 2032 va_arg(count, PY_LONG_LONG)); 2033#endif 2034 else if (size_tflag) 2035 numprinted = sprintf(numberresult, fmt, 2036 va_arg(count, Py_ssize_t)); 2037 else 2038 numprinted = sprintf(numberresult, fmt, 2039 va_arg(count, int)); 2040 n += numprinted; 2041 /* advance by +1 to skip over the '\0' */ 2042 numberresult += (numprinted + 1); 2043 assert(*(numberresult - 1) == '\0'); 2044 assert(*(numberresult - 2) != '\0'); 2045 assert(numprinted >= 0); 2046 assert(numberresult <= numberresults + numbersize); 2047 break; 2048 case 'u': 2049 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad, 2050 width, precision, 'u'); 2051 if (longflag) 2052 numprinted = sprintf(numberresult, fmt, 2053 va_arg(count, unsigned long)); 2054#ifdef HAVE_LONG_LONG 2055 else if (longlongflag) 2056 numprinted = sprintf(numberresult, fmt, 2057 va_arg(count, unsigned PY_LONG_LONG)); 2058#endif 2059 else if (size_tflag) 2060 numprinted = sprintf(numberresult, fmt, 2061 va_arg(count, size_t)); 2062 else 2063 numprinted = sprintf(numberresult, fmt, 2064 va_arg(count, unsigned int)); 2065 n += numprinted; 2066 numberresult += (numprinted + 1); 2067 assert(*(numberresult - 1) == '\0'); 2068 assert(*(numberresult - 2) != '\0'); 2069 assert(numprinted >= 0); 2070 assert(numberresult <= numberresults + numbersize); 2071 break; 2072 
case 'x': 2073 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x'); 2074 numprinted = sprintf(numberresult, fmt, va_arg(count, int)); 2075 n += numprinted; 2076 numberresult += (numprinted + 1); 2077 assert(*(numberresult - 1) == '\0'); 2078 assert(*(numberresult - 2) != '\0'); 2079 assert(numprinted >= 0); 2080 assert(numberresult <= numberresults + numbersize); 2081 break; 2082 case 'p': 2083 numprinted = sprintf(numberresult, "%p", va_arg(count, void*)); 2084 /* %p is ill-defined: ensure leading 0x. */ 2085 if (numberresult[1] == 'X') 2086 numberresult[1] = 'x'; 2087 else if (numberresult[1] != 'x') { 2088 memmove(numberresult + 2, numberresult, 2089 strlen(numberresult) + 1); 2090 numberresult[0] = '0'; 2091 numberresult[1] = 'x'; 2092 numprinted += 2; 2093 } 2094 n += numprinted; 2095 numberresult += (numprinted + 1); 2096 assert(*(numberresult - 1) == '\0'); 2097 assert(*(numberresult - 2) != '\0'); 2098 assert(numprinted >= 0); 2099 assert(numberresult <= numberresults + numbersize); 2100 break; 2101 case 's': 2102 { 2103 /* UTF-8 */ 2104 const char *s = va_arg(count, const char*); 2105 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace"); 2106 if (!str) 2107 goto fail; 2108 /* since PyUnicode_DecodeUTF8 returns already flexible 2109 unicode objects, there is no need to call ready on them */ 2110 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str); 2111 maxchar = Py_MAX(maxchar, argmaxchar); 2112 n += PyUnicode_GET_LENGTH(str); 2113 /* Remember the str and switch to the next slot */ 2114 *callresult++ = str; 2115 break; 2116 } 2117 case 'U': 2118 { 2119 PyObject *obj = va_arg(count, PyObject *); 2120 assert(obj && _PyUnicode_CHECK(obj)); 2121 if (PyUnicode_READY(obj) == -1) 2122 goto fail; 2123 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj); 2124 maxchar = Py_MAX(maxchar, argmaxchar); 2125 n += PyUnicode_GET_LENGTH(obj); 2126 break; 2127 } 2128 case 'V': 2129 { 2130 PyObject *obj = va_arg(count, PyObject *); 2131 const char *str = va_arg(count, const char *); 
2132 PyObject *str_obj; 2133 assert(obj || str); 2134 assert(!obj || _PyUnicode_CHECK(obj)); 2135 if (obj) { 2136 if (PyUnicode_READY(obj) == -1) 2137 goto fail; 2138 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj); 2139 maxchar = Py_MAX(maxchar, argmaxchar); 2140 n += PyUnicode_GET_LENGTH(obj); 2141 *callresult++ = NULL; 2142 } 2143 else { 2144 str_obj = PyUnicode_DecodeUTF8(str, strlen(str), "replace"); 2145 if (!str_obj) 2146 goto fail; 2147 if (PyUnicode_READY(str_obj)) { 2148 Py_DECREF(str_obj); 2149 goto fail; 2150 } 2151 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str_obj); 2152 maxchar = Py_MAX(maxchar, argmaxchar); 2153 n += PyUnicode_GET_LENGTH(str_obj); 2154 *callresult++ = str_obj; 2155 } 2156 break; 2157 } 2158 case 'S': 2159 { 2160 PyObject *obj = va_arg(count, PyObject *); 2161 PyObject *str; 2162 assert(obj); 2163 str = PyObject_Str(obj); 2164 if (!str || PyUnicode_READY(str) == -1) 2165 goto fail; 2166 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str); 2167 maxchar = Py_MAX(maxchar, argmaxchar); 2168 n += PyUnicode_GET_LENGTH(str); 2169 /* Remember the str and switch to the next slot */ 2170 *callresult++ = str; 2171 break; 2172 } 2173 case 'R': 2174 { 2175 PyObject *obj = va_arg(count, PyObject *); 2176 PyObject *repr; 2177 assert(obj); 2178 repr = PyObject_Repr(obj); 2179 if (!repr || PyUnicode_READY(repr) == -1) 2180 goto fail; 2181 argmaxchar = PyUnicode_MAX_CHAR_VALUE(repr); 2182 maxchar = Py_MAX(maxchar, argmaxchar); 2183 n += PyUnicode_GET_LENGTH(repr); 2184 /* Remember the repr and switch to the next slot */ 2185 *callresult++ = repr; 2186 break; 2187 } 2188 case 'A': 2189 { 2190 PyObject *obj = va_arg(count, PyObject *); 2191 PyObject *ascii; 2192 assert(obj); 2193 ascii = PyObject_ASCII(obj); 2194 if (!ascii || PyUnicode_READY(ascii) == -1) 2195 goto fail; 2196 argmaxchar = PyUnicode_MAX_CHAR_VALUE(ascii); 2197 maxchar = Py_MAX(maxchar, argmaxchar); 2198 n += PyUnicode_GET_LENGTH(ascii); 2199 /* Remember the repr and switch to the next slot */ 2200 
*callresult++ = ascii; 2201 break; 2202 } 2203 default: 2204 /* if we stumble upon an unknown 2205 formatting code, copy the rest of 2206 the format string to the output 2207 string. (we cannot just skip the 2208 code, since there's no way to know 2209 what's in the argument list) */ 2210 n += strlen(p); 2211 goto expand; 2212 } 2213 } else 2214 n++; 2215 } 2216 expand: 2217 /* step 4: fill the buffer */ 2218 /* Since we've analyzed how much space we need, 2219 we don't have to resize the string. 2220 There can be no errors beyond this point. */ 2221 string = (PyUnicodeObject *)PyUnicode_New(n, maxchar); 2222 if (!string) 2223 goto fail; 2224 kind = PyUnicode_KIND(string); 2225 data = PyUnicode_DATA(string); 2226 callresult = callresults; 2227 numberresult = numberresults; 2228 2229 for (i = 0, f = format; *f; f++) { 2230 if (*f == '%') { 2231 const char* p; 2232 2233 p = f; 2234 f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL); 2235 /* checking for == because the last argument could be a empty 2236 string, which causes i to point to end, the assert at the end of 2237 the loop */ 2238 assert(i <= PyUnicode_GET_LENGTH(string)); 2239 2240 switch (*f) { 2241 case 'c': 2242 { 2243 const int ordinal = va_arg(vargs, int); 2244 PyUnicode_WRITE(kind, data, i++, ordinal); 2245 break; 2246 } 2247 case 'i': 2248 case 'd': 2249 case 'u': 2250 case 'x': 2251 case 'p': 2252 /* unused, since we already have the result */ 2253 if (*f == 'p') 2254 (void) va_arg(vargs, void *); 2255 else 2256 (void) va_arg(vargs, int); 2257 /* extract the result from numberresults and append. 
*/ 2258 for (; *numberresult; ++i, ++numberresult) 2259 PyUnicode_WRITE(kind, data, i, *numberresult); 2260 /* skip over the separating '\0' */ 2261 assert(*numberresult == '\0'); 2262 numberresult++; 2263 assert(numberresult <= numberresults + numbersize); 2264 break; 2265 case 's': 2266 { 2267 /* unused, since we already have the result */ 2268 Py_ssize_t size; 2269 (void) va_arg(vargs, char *); 2270 size = PyUnicode_GET_LENGTH(*callresult); 2271 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string)); 2272 if (PyUnicode_CopyCharacters((PyObject*)string, i, 2273 *callresult, 0, 2274 size) < 0) 2275 goto fail; 2276 i += size; 2277 /* We're done with the unicode()/repr() => forget it */ 2278 Py_DECREF(*callresult); 2279 /* switch to next unicode()/repr() result */ 2280 ++callresult; 2281 break; 2282 } 2283 case 'U': 2284 { 2285 PyObject *obj = va_arg(vargs, PyObject *); 2286 Py_ssize_t size; 2287 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string)); 2288 size = PyUnicode_GET_LENGTH(obj); 2289 if (PyUnicode_CopyCharacters((PyObject*)string, i, 2290 obj, 0, 2291 size) < 0) 2292 goto fail; 2293 i += size; 2294 break; 2295 } 2296 case 'V': 2297 { 2298 Py_ssize_t size; 2299 PyObject *obj = va_arg(vargs, PyObject *); 2300 va_arg(vargs, const char *); 2301 if (obj) { 2302 size = PyUnicode_GET_LENGTH(obj); 2303 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string)); 2304 if (PyUnicode_CopyCharacters((PyObject*)string, i, 2305 obj, 0, 2306 size) < 0) 2307 goto fail; 2308 i += size; 2309 } else { 2310 size = PyUnicode_GET_LENGTH(*callresult); 2311 assert(PyUnicode_KIND(*callresult) <= 2312 PyUnicode_KIND(string)); 2313 if (PyUnicode_CopyCharacters((PyObject*)string, i, 2314 *callresult, 2315 0, size) < 0) 2316 goto fail; 2317 i += size; 2318 Py_DECREF(*callresult); 2319 } 2320 ++callresult; 2321 break; 2322 } 2323 case 'S': 2324 case 'R': 2325 case 'A': 2326 { 2327 /* unused, since we already have the result */ 2328 (void) va_arg(vargs, PyObject *); 2329 
assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string)); 2330 if (PyUnicode_CopyCharacters((PyObject*)string, i, 2331 *callresult, 0, 2332 PyUnicode_GET_LENGTH(*callresult)) < 0) 2333 goto fail; 2334 i += PyUnicode_GET_LENGTH(*callresult); 2335 /* We're done with the unicode()/repr() => forget it */ 2336 Py_DECREF(*callresult); 2337 /* switch to next unicode()/repr() result */ 2338 ++callresult; 2339 break; 2340 } 2341 case '%': 2342 PyUnicode_WRITE(kind, data, i++, '%'); 2343 break; 2344 default: 2345 for (; *p; ++p, ++i) 2346 PyUnicode_WRITE(kind, data, i, *p); 2347 assert(i == PyUnicode_GET_LENGTH(string)); 2348 goto end; 2349 } 2350 } 2351 else { 2352 assert(i < PyUnicode_GET_LENGTH(string)); 2353 PyUnicode_WRITE(kind, data, i++, *f); 2354 } 2355 } 2356 assert(i == PyUnicode_GET_LENGTH(string)); 2357 2358 end: 2359 if (callresults) 2360 PyObject_Free(callresults); 2361 if (numberresults) 2362 PyObject_Free(numberresults); 2363 return (PyObject *)string; 2364 fail: 2365 if (callresults) { 2366 PyObject **callresult2 = callresults; 2367 while (callresult2 < callresult) { 2368 Py_XDECREF(*callresult2); 2369 ++callresult2; 2370 } 2371 PyObject_Free(callresults); 2372 } 2373 if (numberresults) 2374 PyObject_Free(numberresults); 2375 return NULL; 2376} 2377 2378PyObject * 2379PyUnicode_FromFormat(const char *format, ...) 2380{ 2381 PyObject* ret; 2382 va_list vargs; 2383 2384#ifdef HAVE_STDARG_PROTOTYPES 2385 va_start(vargs, format); 2386#else 2387 va_start(vargs); 2388#endif 2389 ret = PyUnicode_FromFormatV(format, vargs); 2390 va_end(vargs); 2391 return ret; 2392} 2393 2394#ifdef HAVE_WCHAR_H 2395 2396/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString(): 2397 convert a Unicode object to a wide character string. 2398 2399 - If w is NULL: return the number of wide characters (including the null 2400 character) required to convert the unicode object. Ignore size argument. 
2401 2402 - Otherwise: return the number of wide characters (excluding the null 2403 character) written into w. Write at most size wide characters (including 2404 the null character). */ 2405static Py_ssize_t 2406unicode_aswidechar(PyUnicodeObject *unicode, 2407 wchar_t *w, 2408 Py_ssize_t size) 2409{ 2410 Py_ssize_t res; 2411 const wchar_t *wstr; 2412 2413 wstr = PyUnicode_AsUnicodeAndSize((PyObject *)unicode, &res); 2414 if (wstr == NULL) 2415 return -1; 2416 2417 if (w != NULL) { 2418 if (size > res) 2419 size = res + 1; 2420 else 2421 res = size; 2422 Py_MEMCPY(w, wstr, size * sizeof(wchar_t)); 2423 return res; 2424 } 2425 else 2426 return res + 1; 2427} 2428 2429Py_ssize_t 2430PyUnicode_AsWideChar(PyObject *unicode, 2431 wchar_t *w, 2432 Py_ssize_t size) 2433{ 2434 if (unicode == NULL) { 2435 PyErr_BadInternalCall(); 2436 return -1; 2437 } 2438 return unicode_aswidechar((PyUnicodeObject*)unicode, w, size); 2439} 2440 2441wchar_t* 2442PyUnicode_AsWideCharString(PyObject *unicode, 2443 Py_ssize_t *size) 2444{ 2445 wchar_t* buffer; 2446 Py_ssize_t buflen; 2447 2448 if (unicode == NULL) { 2449 PyErr_BadInternalCall(); 2450 return NULL; 2451 } 2452 2453 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, NULL, 0); 2454 if (buflen == -1) 2455 return NULL; 2456 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) { 2457 PyErr_NoMemory(); 2458 return NULL; 2459 } 2460 2461 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t)); 2462 if (buffer == NULL) { 2463 PyErr_NoMemory(); 2464 return NULL; 2465 } 2466 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, buffer, buflen); 2467 if (buflen == -1) 2468 return NULL; 2469 if (size != NULL) 2470 *size = buflen; 2471 return buffer; 2472} 2473 2474#endif /* HAVE_WCHAR_H */ 2475 2476PyObject * 2477PyUnicode_FromOrdinal(int ordinal) 2478{ 2479 PyObject *v; 2480 if (ordinal < 0 || ordinal > 0x10ffff) { 2481 PyErr_SetString(PyExc_ValueError, 2482 "chr() arg not in range(0x110000)"); 2483 return NULL; 2484 } 2485 2486 if (ordinal < 
256) 2487 return get_latin1_char(ordinal); 2488 2489 v = PyUnicode_New(1, ordinal); 2490 if (v == NULL) 2491 return NULL; 2492 PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal); 2493 return v; 2494} 2495 2496PyObject * 2497PyUnicode_FromObject(register PyObject *obj) 2498{ 2499 /* XXX Perhaps we should make this API an alias of 2500 PyObject_Str() instead ?! */ 2501 if (PyUnicode_CheckExact(obj)) { 2502 if (PyUnicode_READY(obj)) 2503 return NULL; 2504 Py_INCREF(obj); 2505 return obj; 2506 } 2507 if (PyUnicode_Check(obj)) { 2508 /* For a Unicode subtype that's not a Unicode object, 2509 return a true Unicode object with the same data. */ 2510 return PyUnicode_Copy(obj); 2511 } 2512 PyErr_Format(PyExc_TypeError, 2513 "Can't convert '%.100s' object to str implicitly", 2514 Py_TYPE(obj)->tp_name); 2515 return NULL; 2516} 2517 2518PyObject * 2519PyUnicode_FromEncodedObject(register PyObject *obj, 2520 const char *encoding, 2521 const char *errors) 2522{ 2523 Py_buffer buffer; 2524 PyObject *v; 2525 2526 if (obj == NULL) { 2527 PyErr_BadInternalCall(); 2528 return NULL; 2529 } 2530 2531 /* Decoding bytes objects is the most common case and should be fast */ 2532 if (PyBytes_Check(obj)) { 2533 if (PyBytes_GET_SIZE(obj) == 0) { 2534 Py_INCREF(unicode_empty); 2535 v = unicode_empty; 2536 } 2537 else { 2538 v = PyUnicode_Decode( 2539 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj), 2540 encoding, errors); 2541 } 2542 return v; 2543 } 2544 2545 if (PyUnicode_Check(obj)) { 2546 PyErr_SetString(PyExc_TypeError, 2547 "decoding str is not supported"); 2548 return NULL; 2549 } 2550 2551 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */ 2552 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) { 2553 PyErr_Format(PyExc_TypeError, 2554 "coercing to str: need bytes, bytearray " 2555 "or buffer-like object, %.80s found", 2556 Py_TYPE(obj)->tp_name); 2557 return NULL; 2558 } 2559 2560 if (buffer.len == 0) { 2561 Py_INCREF(unicode_empty); 2562 v = 
/* Copy `encoding` into `lower`, converting upper case letters to lower case
   and replacing '_' with '-', so that spellings such as "UTF_8" match the
   shortcut names.

   `lower` has room for `lower_len` chars including the terminating null.
   Returns 1 on success and 0 if the encoding name does not fit (it is
   longer than lower_len-1); in that case `lower` is left unterminated. */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    const char *src;
    char *dst = lower;
    /* Last slot of the buffer, reserved for the terminating null. */
    char *const last = lower + (lower_len - 1);

    for (src = encoding; *src != '\0'; src++) {
        char ch = *src;

        if (dst == last)
            return 0;
        if (Py_ISUPPER(ch))
            ch = Py_TOLOWER(ch);
        else if (ch == '_')
            ch = '-';
        *dst++ = ch;
    }
    *dst = '\0';
    return 1;
}
PyUnicode_DecodeUTF32(s, size, errors, 0); 2636 } 2637 2638 /* Decode via the codec registry */ 2639 buffer = NULL; 2640 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0) 2641 goto onError; 2642 buffer = PyMemoryView_FromBuffer(&info); 2643 if (buffer == NULL) 2644 goto onError; 2645 unicode = PyCodec_Decode(buffer, encoding, errors); 2646 if (unicode == NULL) 2647 goto onError; 2648 if (!PyUnicode_Check(unicode)) { 2649 PyErr_Format(PyExc_TypeError, 2650 "decoder did not return a str object (type=%.400s)", 2651 Py_TYPE(unicode)->tp_name); 2652 Py_DECREF(unicode); 2653 goto onError; 2654 } 2655 Py_DECREF(buffer); 2656#ifndef DONT_MAKE_RESULT_READY 2657 if (_PyUnicode_READY_REPLACE(&unicode)) { 2658 Py_DECREF(unicode); 2659 return NULL; 2660 } 2661#endif 2662 return unicode; 2663 2664 onError: 2665 Py_XDECREF(buffer); 2666 return NULL; 2667} 2668 2669PyObject * 2670PyUnicode_AsDecodedObject(PyObject *unicode, 2671 const char *encoding, 2672 const char *errors) 2673{ 2674 PyObject *v; 2675 2676 if (!PyUnicode_Check(unicode)) { 2677 PyErr_BadArgument(); 2678 goto onError; 2679 } 2680 2681 if (encoding == NULL) 2682 encoding = PyUnicode_GetDefaultEncoding(); 2683 2684 /* Decode via the codec registry */ 2685 v = PyCodec_Decode(unicode, encoding, errors); 2686 if (v == NULL) 2687 goto onError; 2688 return v; 2689 2690 onError: 2691 return NULL; 2692} 2693 2694PyObject * 2695PyUnicode_AsDecodedUnicode(PyObject *unicode, 2696 const char *encoding, 2697 const char *errors) 2698{ 2699 PyObject *v; 2700 2701 if (!PyUnicode_Check(unicode)) { 2702 PyErr_BadArgument(); 2703 goto onError; 2704 } 2705 2706 if (encoding == NULL) 2707 encoding = PyUnicode_GetDefaultEncoding(); 2708 2709 /* Decode via the codec registry */ 2710 v = PyCodec_Decode(unicode, encoding, errors); 2711 if (v == NULL) 2712 goto onError; 2713 if (!PyUnicode_Check(v)) { 2714 PyErr_Format(PyExc_TypeError, 2715 "decoder did not return a str object (type=%.400s)", 2716 
Py_TYPE(v)->tp_name); 2717 Py_DECREF(v); 2718 goto onError; 2719 } 2720 return v; 2721 2722 onError: 2723 return NULL; 2724} 2725 2726PyObject * 2727PyUnicode_Encode(const Py_UNICODE *s, 2728 Py_ssize_t size, 2729 const char *encoding, 2730 const char *errors) 2731{ 2732 PyObject *v, *unicode; 2733 2734 unicode = PyUnicode_FromUnicode(s, size); 2735 if (unicode == NULL) 2736 return NULL; 2737 v = PyUnicode_AsEncodedString(unicode, encoding, errors); 2738 Py_DECREF(unicode); 2739 return v; 2740} 2741 2742PyObject * 2743PyUnicode_AsEncodedObject(PyObject *unicode, 2744 const char *encoding, 2745 const char *errors) 2746{ 2747 PyObject *v; 2748 2749 if (!PyUnicode_Check(unicode)) { 2750 PyErr_BadArgument(); 2751 goto onError; 2752 } 2753 2754 if (encoding == NULL) 2755 encoding = PyUnicode_GetDefaultEncoding(); 2756 2757 /* Encode via the codec registry */ 2758 v = PyCodec_Encode(unicode, encoding, errors); 2759 if (v == NULL) 2760 goto onError; 2761 return v; 2762 2763 onError: 2764 return NULL; 2765} 2766 2767PyObject * 2768PyUnicode_EncodeFSDefault(PyObject *unicode) 2769{ 2770#ifdef HAVE_MBCS 2771 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode), 2772 PyUnicode_GET_SIZE(unicode), 2773 NULL); 2774#elif defined(__APPLE__) 2775 return _PyUnicode_AsUTF8String(unicode, "surrogateescape"); 2776#else 2777 PyInterpreterState *interp = PyThreadState_GET()->interp; 2778 /* Bootstrap check: if the filesystem codec is implemented in Python, we 2779 cannot use it to encode and decode filenames before it is loaded. Load 2780 the Python codec requires to encode at least its own filename. Use the C 2781 version of the locale codec until the codec registry is initialized and 2782 the Python codec is loaded. 2783 2784 Py_FileSystemDefaultEncoding is shared between all interpreters, we 2785 cannot only rely on it: check also interp->fscodec_initialized for 2786 subinterpreters. 
*/ 2787 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) { 2788 return PyUnicode_AsEncodedString(unicode, 2789 Py_FileSystemDefaultEncoding, 2790 "surrogateescape"); 2791 } 2792 else { 2793 /* locale encoding with surrogateescape */ 2794 wchar_t *wchar; 2795 char *bytes; 2796 PyObject *bytes_obj; 2797 size_t error_pos; 2798 2799 wchar = PyUnicode_AsWideCharString(unicode, NULL); 2800 if (wchar == NULL) 2801 return NULL; 2802 bytes = _Py_wchar2char(wchar, &error_pos); 2803 if (bytes == NULL) { 2804 if (error_pos != (size_t)-1) { 2805 char *errmsg = strerror(errno); 2806 PyObject *exc = NULL; 2807 if (errmsg == NULL) 2808 errmsg = "Py_wchar2char() failed"; 2809 raise_encode_exception(&exc, 2810 "filesystemencoding", 2811 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode), 2812 error_pos, error_pos+1, 2813 errmsg); 2814 Py_XDECREF(exc); 2815 } 2816 else 2817 PyErr_NoMemory(); 2818 PyMem_Free(wchar); 2819 return NULL; 2820 } 2821 PyMem_Free(wchar); 2822 2823 bytes_obj = PyBytes_FromString(bytes); 2824 PyMem_Free(bytes); 2825 return bytes_obj; 2826 } 2827#endif 2828} 2829 2830PyObject * 2831PyUnicode_AsEncodedString(PyObject *unicode, 2832 const char *encoding, 2833 const char *errors) 2834{ 2835 PyObject *v; 2836 char lower[11]; /* Enough for any encoding shortcut */ 2837 2838 if (!PyUnicode_Check(unicode)) { 2839 PyErr_BadArgument(); 2840 return NULL; 2841 } 2842 2843 if (encoding == NULL) { 2844 if (errors == NULL || strcmp(errors, "strict") == 0) 2845 return _PyUnicode_AsUTF8String(unicode, NULL); 2846 else 2847 return _PyUnicode_AsUTF8String(unicode, errors); 2848 } 2849 2850 /* Shortcuts for common default encodings */ 2851 if (normalize_encoding(encoding, lower, sizeof(lower))) { 2852 if ((strcmp(lower, "utf-8") == 0) || 2853 (strcmp(lower, "utf8") == 0)) 2854 { 2855 if (errors == NULL || strcmp(errors, "strict") == 0) 2856 return _PyUnicode_AsUTF8String(unicode, NULL); 2857 else 2858 return _PyUnicode_AsUTF8String(unicode, errors); 2859 } 
2860 else if ((strcmp(lower, "latin-1") == 0) || 2861 (strcmp(lower, "latin1") == 0) || 2862 (strcmp(lower, "iso-8859-1") == 0)) 2863 return _PyUnicode_AsLatin1String(unicode, errors); 2864#ifdef HAVE_MBCS 2865 else if (strcmp(lower, "mbcs") == 0) 2866 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode), 2867 PyUnicode_GET_SIZE(unicode), 2868 errors); 2869#endif 2870 else if (strcmp(lower, "ascii") == 0) 2871 return _PyUnicode_AsASCIIString(unicode, errors); 2872 } 2873 2874 /* Encode via the codec registry */ 2875 v = PyCodec_Encode(unicode, encoding, errors); 2876 if (v == NULL) 2877 return NULL; 2878 2879 /* The normal path */ 2880 if (PyBytes_Check(v)) 2881 return v; 2882 2883 /* If the codec returns a buffer, raise a warning and convert to bytes */ 2884 if (PyByteArray_Check(v)) { 2885 int error; 2886 PyObject *b; 2887 2888 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1, 2889 "encoder %s returned bytearray instead of bytes", 2890 encoding); 2891 if (error) { 2892 Py_DECREF(v); 2893 return NULL; 2894 } 2895 2896 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v)); 2897 Py_DECREF(v); 2898 return b; 2899 } 2900 2901 PyErr_Format(PyExc_TypeError, 2902 "encoder did not return a bytes object (type=%.400s)", 2903 Py_TYPE(v)->tp_name); 2904 Py_DECREF(v); 2905 return NULL; 2906} 2907 2908PyObject * 2909PyUnicode_AsEncodedUnicode(PyObject *unicode, 2910 const char *encoding, 2911 const char *errors) 2912{ 2913 PyObject *v; 2914 2915 if (!PyUnicode_Check(unicode)) { 2916 PyErr_BadArgument(); 2917 goto onError; 2918 } 2919 2920 if (encoding == NULL) 2921 encoding = PyUnicode_GetDefaultEncoding(); 2922 2923 /* Encode via the codec registry */ 2924 v = PyCodec_Encode(unicode, encoding, errors); 2925 if (v == NULL) 2926 goto onError; 2927 if (!PyUnicode_Check(v)) { 2928 PyErr_Format(PyExc_TypeError, 2929 "encoder did not return an str object (type=%.400s)", 2930 Py_TYPE(v)->tp_name); 2931 Py_DECREF(v); 2932 goto onError; 2933 } 2934 return v; 2935 
2936 onError: 2937 return NULL; 2938} 2939 2940PyObject* 2941PyUnicode_DecodeFSDefault(const char *s) { 2942 Py_ssize_t size = (Py_ssize_t)strlen(s); 2943 return PyUnicode_DecodeFSDefaultAndSize(s, size); 2944} 2945 2946PyObject* 2947PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size) 2948{ 2949#ifdef HAVE_MBCS 2950 return PyUnicode_DecodeMBCS(s, size, NULL); 2951#elif defined(__APPLE__) 2952 return PyUnicode_DecodeUTF8(s, size, "surrogateescape"); 2953#else 2954 PyInterpreterState *interp = PyThreadState_GET()->interp; 2955 /* Bootstrap check: if the filesystem codec is implemented in Python, we 2956 cannot use it to encode and decode filenames before it is loaded. Load 2957 the Python codec requires to encode at least its own filename. Use the C 2958 version of the locale codec until the codec registry is initialized and 2959 the Python codec is loaded. 2960 2961 Py_FileSystemDefaultEncoding is shared between all interpreters, we 2962 cannot only rely on it: check also interp->fscodec_initialized for 2963 subinterpreters. 
*/ 2964 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) { 2965 return PyUnicode_Decode(s, size, 2966 Py_FileSystemDefaultEncoding, 2967 "surrogateescape"); 2968 } 2969 else { 2970 /* locale encoding with surrogateescape */ 2971 wchar_t *wchar; 2972 PyObject *unicode; 2973 size_t len; 2974 2975 if (s[size] != '\0' || size != strlen(s)) { 2976 PyErr_SetString(PyExc_TypeError, "embedded NUL character"); 2977 return NULL; 2978 } 2979 2980 wchar = _Py_char2wchar(s, &len); 2981 if (wchar == NULL) 2982 return PyErr_NoMemory(); 2983 2984 unicode = PyUnicode_FromWideChar(wchar, len); 2985 PyMem_Free(wchar); 2986 return unicode; 2987 } 2988#endif 2989} 2990 2991 2992int 2993PyUnicode_FSConverter(PyObject* arg, void* addr) 2994{ 2995 PyObject *output = NULL; 2996 Py_ssize_t size; 2997 void *data; 2998 if (arg == NULL) { 2999 Py_DECREF(*(PyObject**)addr); 3000 return 1; 3001 } 3002 if (PyBytes_Check(arg)) { 3003 output = arg; 3004 Py_INCREF(output); 3005 } 3006 else { 3007 arg = PyUnicode_FromObject(arg); 3008 if (!arg) 3009 return 0; 3010 output = PyUnicode_EncodeFSDefault(arg); 3011 Py_DECREF(arg); 3012 if (!output) 3013 return 0; 3014 if (!PyBytes_Check(output)) { 3015 Py_DECREF(output); 3016 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes"); 3017 return 0; 3018 } 3019 } 3020 size = PyBytes_GET_SIZE(output); 3021 data = PyBytes_AS_STRING(output); 3022 if (size != strlen(data)) { 3023 PyErr_SetString(PyExc_TypeError, "embedded NUL character"); 3024 Py_DECREF(output); 3025 return 0; 3026 } 3027 *(PyObject**)addr = output; 3028 return Py_CLEANUP_SUPPORTED; 3029} 3030 3031 3032int 3033PyUnicode_FSDecoder(PyObject* arg, void* addr) 3034{ 3035 PyObject *output = NULL; 3036 if (arg == NULL) { 3037 Py_DECREF(*(PyObject**)addr); 3038 return 1; 3039 } 3040 if (PyUnicode_Check(arg)) { 3041 if (PyUnicode_READY(arg)) 3042 return 0; 3043 output = arg; 3044 Py_INCREF(output); 3045 } 3046 else { 3047 arg = PyBytes_FromObject(arg); 3048 if (!arg) 3049 
return 0; 3050 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg), 3051 PyBytes_GET_SIZE(arg)); 3052 Py_DECREF(arg); 3053 if (!output) 3054 return 0; 3055 if (!PyUnicode_Check(output)) { 3056 Py_DECREF(output); 3057 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode"); 3058 return 0; 3059 } 3060 } 3061 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output), 3062 PyUnicode_GET_LENGTH(output), 0, 1)) { 3063 PyErr_SetString(PyExc_TypeError, "embedded NUL character"); 3064 Py_DECREF(output); 3065 return 0; 3066 } 3067 *(PyObject**)addr = output; 3068 return Py_CLEANUP_SUPPORTED; 3069} 3070 3071 3072char* 3073PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize) 3074{ 3075 PyObject *bytes; 3076 PyUnicodeObject *u = (PyUnicodeObject *)unicode; 3077 3078 if (!PyUnicode_Check(unicode)) { 3079 PyErr_BadArgument(); 3080 return NULL; 3081 } 3082 if (PyUnicode_READY(u) == -1) 3083 return NULL; 3084 3085 if (PyUnicode_UTF8(unicode) == NULL) { 3086 assert(!PyUnicode_IS_COMPACT_ASCII(unicode)); 3087 bytes = _PyUnicode_AsUTF8String(unicode, "strict"); 3088 if (bytes == NULL) 3089 return NULL; 3090 _PyUnicode_UTF8(u) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1); 3091 if (_PyUnicode_UTF8(u) == NULL) { 3092 Py_DECREF(bytes); 3093 return NULL; 3094 } 3095 _PyUnicode_UTF8_LENGTH(u) = PyBytes_GET_SIZE(bytes); 3096 Py_MEMCPY(_PyUnicode_UTF8(u), PyBytes_AS_STRING(bytes), _PyUnicode_UTF8_LENGTH(u) + 1); 3097 Py_DECREF(bytes); 3098 } 3099 3100 if (psize) 3101 *psize = PyUnicode_UTF8_LENGTH(unicode); 3102 return PyUnicode_UTF8(unicode); 3103} 3104 3105char* 3106PyUnicode_AsUTF8(PyObject *unicode) 3107{ 3108 return PyUnicode_AsUTF8AndSize(unicode, NULL); 3109} 3110 3111#ifdef Py_DEBUG 3112int unicode_as_unicode_calls = 0; 3113#endif 3114 3115 3116Py_UNICODE * 3117PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size) 3118{ 3119 PyUnicodeObject *u; 3120 const unsigned char *one_byte; 3121#if SIZEOF_WCHAR_T == 4 3122 const Py_UCS2 
*two_bytes; 3123#else 3124 const Py_UCS4 *four_bytes; 3125 const Py_UCS4 *ucs4_end; 3126 Py_ssize_t num_surrogates; 3127#endif 3128 wchar_t *w; 3129 wchar_t *wchar_end; 3130 3131 if (!PyUnicode_Check(unicode)) { 3132 PyErr_BadArgument(); 3133 return NULL; 3134 } 3135 u = (PyUnicodeObject*)unicode; 3136 if (_PyUnicode_WSTR(u) == NULL) { 3137 /* Non-ASCII compact unicode object */ 3138 assert(_PyUnicode_KIND(u) != 0); 3139 assert(PyUnicode_IS_READY(u)); 3140 3141#ifdef Py_DEBUG 3142 ++unicode_as_unicode_calls; 3143#endif 3144 3145 if (PyUnicode_KIND(u) == PyUnicode_4BYTE_KIND) { 3146#if SIZEOF_WCHAR_T == 2 3147 four_bytes = PyUnicode_4BYTE_DATA(u); 3148 ucs4_end = four_bytes + _PyUnicode_LENGTH(u); 3149 num_surrogates = 0; 3150 3151 for (; four_bytes < ucs4_end; ++four_bytes) { 3152 if (*four_bytes > 0xFFFF) 3153 ++num_surrogates; 3154 } 3155 3156 _PyUnicode_WSTR(u) = (wchar_t *) PyObject_MALLOC( 3157 sizeof(wchar_t) * (_PyUnicode_LENGTH(u) + 1 + num_surrogates)); 3158 if (!_PyUnicode_WSTR(u)) { 3159 PyErr_NoMemory(); 3160 return NULL; 3161 } 3162 _PyUnicode_WSTR_LENGTH(u) = _PyUnicode_LENGTH(u) + num_surrogates; 3163 3164 w = _PyUnicode_WSTR(u); 3165 wchar_end = w + _PyUnicode_WSTR_LENGTH(u); 3166 four_bytes = PyUnicode_4BYTE_DATA(u); 3167 for (; four_bytes < ucs4_end; ++four_bytes, ++w) { 3168 if (*four_bytes > 0xFFFF) { 3169 /* encode surrogate pair in this case */ 3170 *w++ = 0xD800 | ((*four_bytes - 0x10000) >> 10); 3171 *w = 0xDC00 | ((*four_bytes - 0x10000) & 0x3FF); 3172 } 3173 else 3174 *w = *four_bytes; 3175 3176 if (w > wchar_end) { 3177 assert(0 && "Miscalculated string end"); 3178 } 3179 } 3180 *w = 0; 3181#else 3182 /* sizeof(wchar_t) == 4 */ 3183 Py_FatalError("Impossible unicode object state, wstr and str " 3184 "should share memory already."); 3185 return NULL; 3186#endif 3187 } 3188 else { 3189 _PyUnicode_WSTR(u) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) * 3190 (_PyUnicode_LENGTH(u) + 1)); 3191 if (!_PyUnicode_WSTR(u)) { 3192 PyErr_NoMemory(); 
3193 return NULL; 3194 } 3195 if (!PyUnicode_IS_COMPACT_ASCII(u)) 3196 _PyUnicode_WSTR_LENGTH(u) = _PyUnicode_LENGTH(u); 3197 w = _PyUnicode_WSTR(u); 3198 wchar_end = w + _PyUnicode_LENGTH(u); 3199 3200 if (PyUnicode_KIND(u) == PyUnicode_1BYTE_KIND) { 3201 one_byte = PyUnicode_1BYTE_DATA(u); 3202 for (; w < wchar_end; ++one_byte, ++w) 3203 *w = *one_byte; 3204 /* null-terminate the wstr */ 3205 *w = 0; 3206 } 3207 else if (PyUnicode_KIND(u) == PyUnicode_2BYTE_KIND) { 3208#if SIZEOF_WCHAR_T == 4 3209 two_bytes = PyUnicode_2BYTE_DATA(u); 3210 for (; w < wchar_end; ++two_bytes, ++w) 3211 *w = *two_bytes; 3212 /* null-terminate the wstr */ 3213 *w = 0; 3214#else 3215 /* sizeof(wchar_t) == 2 */ 3216 PyObject_FREE(_PyUnicode_WSTR(u)); 3217 _PyUnicode_WSTR(u) = NULL; 3218 Py_FatalError("Impossible unicode object state, wstr " 3219 "and str should share memory already."); 3220 return NULL; 3221#endif 3222 } 3223 else { 3224 assert(0 && "This should never happen."); 3225 } 3226 } 3227 } 3228 if (size != NULL) 3229 *size = PyUnicode_WSTR_LENGTH(u); 3230 return _PyUnicode_WSTR(u); 3231} 3232 3233Py_UNICODE * 3234PyUnicode_AsUnicode(PyObject *unicode) 3235{ 3236 return PyUnicode_AsUnicodeAndSize(unicode, NULL); 3237} 3238 3239 3240Py_ssize_t 3241PyUnicode_GetSize(PyObject *unicode) 3242{ 3243 if (!PyUnicode_Check(unicode)) { 3244 PyErr_BadArgument(); 3245 goto onError; 3246 } 3247 return PyUnicode_GET_SIZE(unicode); 3248 3249 onError: 3250 return -1; 3251} 3252 3253Py_ssize_t 3254PyUnicode_GetLength(PyObject *unicode) 3255{ 3256 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) { 3257 PyErr_BadArgument(); 3258 return -1; 3259 } 3260 3261 return PyUnicode_GET_LENGTH(unicode); 3262} 3263 3264Py_UCS4 3265PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index) 3266{ 3267 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) { 3268 PyErr_BadArgument(); 3269 return (Py_UCS4)-1; 3270 } 3271 if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) { 3272 
PyErr_SetString(PyExc_IndexError, "string index out of range"); 3273 return (Py_UCS4)-1; 3274 } 3275 return PyUnicode_READ_CHAR(unicode, index); 3276} 3277 3278int 3279PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch) 3280{ 3281 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) { 3282 PyErr_BadArgument(); 3283 return -1; 3284 } 3285 if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) { 3286 PyErr_SetString(PyExc_IndexError, "string index out of range"); 3287 return -1; 3288 } 3289 if (_PyUnicode_Dirty(unicode)) 3290 return -1; 3291 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode), 3292 index, ch); 3293 return 0; 3294} 3295 3296const char * 3297PyUnicode_GetDefaultEncoding(void) 3298{ 3299 return "utf-8"; 3300} 3301 3302/* create or adjust a UnicodeDecodeError */ 3303static void 3304make_decode_exception(PyObject **exceptionObject, 3305 const char *encoding, 3306 const char *input, Py_ssize_t length, 3307 Py_ssize_t startpos, Py_ssize_t endpos, 3308 const char *reason) 3309{ 3310 if (*exceptionObject == NULL) { 3311 *exceptionObject = PyUnicodeDecodeError_Create( 3312 encoding, input, length, startpos, endpos, reason); 3313 } 3314 else { 3315 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos)) 3316 goto onError; 3317 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos)) 3318 goto onError; 3319 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason)) 3320 goto onError; 3321 } 3322 return; 3323 3324onError: 3325 Py_DECREF(*exceptionObject); 3326 *exceptionObject = NULL; 3327} 3328 3329/* error handling callback helper: 3330 build arguments, call the callback and check the arguments, 3331 if no exception occurred, copy the replacement to the output 3332 and adjust various state variables. 
3333 return 0 on success, -1 on error 3334*/ 3335 3336static int 3337unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler, 3338 const char *encoding, const char *reason, 3339 const char **input, const char **inend, Py_ssize_t *startinpos, 3340 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr, 3341 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr) 3342{ 3343 static char *argparse = "O!n;decoding error handler must return (str, int) tuple"; 3344 3345 PyObject *restuple = NULL; 3346 PyObject *repunicode = NULL; 3347 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output); 3348 Py_ssize_t insize; 3349 Py_ssize_t requiredsize; 3350 Py_ssize_t newpos; 3351 const Py_UNICODE *repptr; 3352 PyObject *inputobj = NULL; 3353 Py_ssize_t repsize; 3354 int res = -1; 3355 3356 if (*errorHandler == NULL) { 3357 *errorHandler = PyCodec_LookupError(errors); 3358 if (*errorHandler == NULL) 3359 goto onError; 3360 } 3361 3362 make_decode_exception(exceptionObject, 3363 encoding, 3364 *input, *inend - *input, 3365 *startinpos, *endinpos, 3366 reason); 3367 if (*exceptionObject == NULL) 3368 goto onError; 3369 3370 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL); 3371 if (restuple == NULL) 3372 goto onError; 3373 if (!PyTuple_Check(restuple)) { 3374 PyErr_SetString(PyExc_TypeError, &argparse[4]); 3375 goto onError; 3376 } 3377 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos)) 3378 goto onError; 3379 3380 /* Copy back the bytes variables, which might have been modified by the 3381 callback */ 3382 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject); 3383 if (!inputobj) 3384 goto onError; 3385 if (!PyBytes_Check(inputobj)) { 3386 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes"); 3387 } 3388 *input = PyBytes_AS_STRING(inputobj); 3389 insize = PyBytes_GET_SIZE(inputobj); 3390 *inend = *input + insize; 3391 /* we can DECREF safely, as the 
exception has another reference, 3392 so the object won't go away. */ 3393 Py_DECREF(inputobj); 3394 3395 if (newpos<0) 3396 newpos = insize+newpos; 3397 if (newpos<0 || newpos>insize) { 3398 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos); 3399 goto onError; 3400 } 3401 3402 /* need more space? (at least enough for what we 3403 have+the replacement+the rest of the string (starting 3404 at the new input position), so we won't have to check space 3405 when there are no errors in the rest of the string) */ 3406 repptr = PyUnicode_AS_UNICODE(repunicode); 3407 repsize = PyUnicode_GET_SIZE(repunicode); 3408 requiredsize = *outpos + repsize + insize-newpos; 3409 if (requiredsize > outsize) { 3410 if (requiredsize<2*outsize) 3411 requiredsize = 2*outsize; 3412 if (PyUnicode_Resize((PyObject**)output, requiredsize) < 0) 3413 goto onError; 3414 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos; 3415 } 3416 *endinpos = newpos; 3417 *inptr = *input + newpos; 3418 Py_UNICODE_COPY(*outptr, repptr, repsize); 3419 *outptr += repsize; 3420 *outpos += repsize; 3421 3422 /* we made it! */ 3423 res = 0; 3424 3425 onError: 3426 Py_XDECREF(restuple); 3427 return res; 3428} 3429 3430/* --- UTF-7 Codec -------------------------------------------------------- */ 3431 3432/* See RFC2152 for details. We encode conservatively and decode liberally. */ 3433 3434/* Three simple macros defining base-64. */ 3435 3436/* Is c a base-64 character? */ 3437 3438#define IS_BASE64(c) \ 3439 (((c) >= 'A' && (c) <= 'Z') || \ 3440 ((c) >= 'a' && (c) <= 'z') || \ 3441 ((c) >= '0' && (c) <= '9') || \ 3442 (c) == '+' || (c) == '/') 3443 3444/* given that c is a base-64 character, what is its base-64 value? */ 3445 3446#define FROM_BASE64(c) \ 3447 (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' : \ 3448 ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 : \ 3449 ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 : \ 3450 (c) == '+' ? 
62 : 63) 3451 3452/* What is the base-64 character of the bottom 6 bits of n? */ 3453 3454#define TO_BASE64(n) \ 3455 ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f]) 3456 3457/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be 3458 * decoded as itself. We are permissive on decoding; the only ASCII 3459 * byte not decoding to itself is the + which begins a base64 3460 * string. */ 3461 3462#define DECODE_DIRECT(c) \ 3463 ((c) <= 127 && (c) != '+') 3464 3465/* The UTF-7 encoder treats ASCII characters differently according to 3466 * whether they are Set D, Set O, Whitespace, or special (i.e. none of 3467 * the above). See RFC2152. This array identifies these different 3468 * sets: 3469 * 0 : "Set D" 3470 * alphanumeric and '(),-./:? 3471 * 1 : "Set O" 3472 * !"#$%&*;<=>@[]^_`{|} 3473 * 2 : "whitespace" 3474 * ht nl cr sp 3475 * 3 : special (must be base64 encoded) 3476 * everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127) 3477 */ 3478 3479static 3480char utf7_category[128] = { 3481/* nul soh stx etx eot enq ack bel bs ht nl vt np cr so si */ 3482 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3, 3483/* dle dc1 dc2 dc3 dc4 nak syn etb can em sub esc fs gs rs us */ 3484 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3485/* sp ! " # $ % & ' ( ) * + , - . / */ 3486 2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 0, 3487/* 0 1 2 3 4 5 6 7 8 9 : ; < = > ? */ 3488 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 3489/* @ A B C D E F G H I J K L M N O */ 3490 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3491/* P Q R S T U V W X Y Z [ \ ] ^ _ */ 3492 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1, 3493/* ` a b c d e f g h i j k l m n o */ 3494 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3495/* p q r s t u v w x y z { | } ~ del */ 3496 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3, 3497}; 3498 3499/* ENCODE_DIRECT: this character should be encoded as itself. 
 * The answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.  */

/* Note: c is evaluated several times, and the range test 0 < c < 128 is
   performed before c is used as an index into utf7_category.  directO and
   directWS are used as plain truth values. */
#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      (directWS && (utf7_category[(c)] == 2)) ||        \
      (directO && (utf7_category[(c)] == 1))))

/* One-shot UTF-7 decoding: forwards s, size and errors to
   PyUnicode_DecodeUTF7Stateful() with a NULL `consumed` pointer and returns
   whatever that call returns. */
PyObject *
PyUnicode_DecodeUTF7(const char *s,
                     Py_ssize_t size,
                     const char *errors)
{
    return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
}

/* The decoder.  The only state we preserve is our read position,
 * i.e. how many characters we have consumed.  So if we end in the
 * middle of a shift sequence we have to back off the read position
 * and the output to the beginning of the sequence, otherwise we lose
 * all the shift state (seen bits, number of bits seen, high
 * surrogate).
*/ 3525 3526PyObject * 3527PyUnicode_DecodeUTF7Stateful(const char *s, 3528 Py_ssize_t size, 3529 const char *errors, 3530 Py_ssize_t *consumed) 3531{ 3532 const char *starts = s; 3533 Py_ssize_t startinpos; 3534 Py_ssize_t endinpos; 3535 Py_ssize_t outpos; 3536 const char *e; 3537 PyUnicodeObject *unicode; 3538 Py_UNICODE *p; 3539 const char *errmsg = ""; 3540 int inShift = 0; 3541 Py_UNICODE *shiftOutStart; 3542 unsigned int base64bits = 0; 3543 unsigned long base64buffer = 0; 3544 Py_UNICODE surrogate = 0; 3545 PyObject *errorHandler = NULL; 3546 PyObject *exc = NULL; 3547 3548 unicode = _PyUnicode_New(size); 3549 if (!unicode) 3550 return NULL; 3551 if (size == 0) { 3552 if (consumed) 3553 *consumed = 0; 3554 return (PyObject *)unicode; 3555 } 3556 3557 p = PyUnicode_AS_UNICODE(unicode); 3558 shiftOutStart = p; 3559 e = s + size; 3560 3561 while (s < e) { 3562 Py_UNICODE ch; 3563 restart: 3564 ch = (unsigned char) *s; 3565 3566 if (inShift) { /* in a base-64 section */ 3567 if (IS_BASE64(ch)) { /* consume a base-64 character */ 3568 base64buffer = (base64buffer << 6) | FROM_BASE64(ch); 3569 base64bits += 6; 3570 s++; 3571 if (base64bits >= 16) { 3572 /* we have enough bits for a UTF-16 value */ 3573 Py_UNICODE outCh = (Py_UNICODE) 3574 (base64buffer >> (base64bits-16)); 3575 base64bits -= 16; 3576 base64buffer &= (1 << base64bits) - 1; /* clear high bits */ 3577 if (surrogate) { 3578 /* expecting a second surrogate */ 3579 if (outCh >= 0xDC00 && outCh <= 0xDFFF) { 3580#ifdef Py_UNICODE_WIDE 3581 *p++ = (((surrogate & 0x3FF)<<10) 3582 | (outCh & 0x3FF)) + 0x10000; 3583#else 3584 *p++ = surrogate; 3585 *p++ = outCh; 3586#endif 3587 surrogate = 0; 3588 } 3589 else { 3590 surrogate = 0; 3591 errmsg = "second surrogate missing"; 3592 goto utf7Error; 3593 } 3594 } 3595 else if (outCh >= 0xD800 && outCh <= 0xDBFF) { 3596 /* first surrogate */ 3597 surrogate = outCh; 3598 } 3599 else if (outCh >= 0xDC00 && outCh <= 0xDFFF) { 3600 errmsg = "unexpected second surrogate"; 
3601 goto utf7Error; 3602 } 3603 else { 3604 *p++ = outCh; 3605 } 3606 } 3607 } 3608 else { /* now leaving a base-64 section */ 3609 inShift = 0; 3610 s++; 3611 if (surrogate) { 3612 errmsg = "second surrogate missing at end of shift sequence"; 3613 goto utf7Error; 3614 } 3615 if (base64bits > 0) { /* left-over bits */ 3616 if (base64bits >= 6) { 3617 /* We've seen at least one base-64 character */ 3618 errmsg = "partial character in shift sequence"; 3619 goto utf7Error; 3620 } 3621 else { 3622 /* Some bits remain; they should be zero */ 3623 if (base64buffer != 0) { 3624 errmsg = "non-zero padding bits in shift sequence"; 3625 goto utf7Error; 3626 } 3627 } 3628 } 3629 if (ch != '-') { 3630 /* '-' is absorbed; other terminating 3631 characters are preserved */ 3632 *p++ = ch; 3633 } 3634 } 3635 } 3636 else if ( ch == '+' ) { 3637 startinpos = s-starts; 3638 s++; /* consume '+' */ 3639 if (s < e && *s == '-') { /* '+-' encodes '+' */ 3640 s++; 3641 *p++ = '+'; 3642 } 3643 else { /* begin base64-encoded section */ 3644 inShift = 1; 3645 shiftOutStart = p; 3646 base64bits = 0; 3647 } 3648 } 3649 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */ 3650 *p++ = ch; 3651 s++; 3652 } 3653 else { 3654 startinpos = s-starts; 3655 s++; 3656 errmsg = "unexpected special character"; 3657 goto utf7Error; 3658 } 3659 continue; 3660utf7Error: 3661 outpos = p-PyUnicode_AS_UNICODE(unicode); 3662 endinpos = s-starts; 3663 if (unicode_decode_call_errorhandler( 3664 errors, &errorHandler, 3665 "utf7", errmsg, 3666 &starts, &e, &startinpos, &endinpos, &exc, &s, 3667 &unicode, &outpos, &p)) 3668 goto onError; 3669 } 3670 3671 /* end of string */ 3672 3673 if (inShift && !consumed) { /* in shift sequence, no more to follow */ 3674 /* if we're in an inconsistent state, that's an error */ 3675 if (surrogate || 3676 (base64bits >= 6) || 3677 (base64bits > 0 && base64buffer != 0)) { 3678 outpos = p-PyUnicode_AS_UNICODE(unicode); 3679 endinpos = size; 3680 if 
(unicode_decode_call_errorhandler( 3681 errors, &errorHandler, 3682 "utf7", "unterminated shift sequence", 3683 &starts, &e, &startinpos, &endinpos, &exc, &s, 3684 &unicode, &outpos, &p)) 3685 goto onError; 3686 if (s < e) 3687 goto restart; 3688 } 3689 } 3690 3691 /* return state */ 3692 if (consumed) { 3693 if (inShift) { 3694 p = shiftOutStart; /* back off output */ 3695 *consumed = startinpos; 3696 } 3697 else { 3698 *consumed = s-starts; 3699 } 3700 } 3701 3702 if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0) 3703 goto onError; 3704 3705 Py_XDECREF(errorHandler); 3706 Py_XDECREF(exc); 3707#ifndef DONT_MAKE_RESULT_READY 3708 if (_PyUnicode_READY_REPLACE(&unicode)) { 3709 Py_DECREF(unicode); 3710 return NULL; 3711 } 3712#endif 3713 return (PyObject *)unicode; 3714 3715 onError: 3716 Py_XDECREF(errorHandler); 3717 Py_XDECREF(exc); 3718 Py_DECREF(unicode); 3719 return NULL; 3720} 3721 3722 3723PyObject * 3724PyUnicode_EncodeUTF7(const Py_UNICODE *s, 3725 Py_ssize_t size, 3726 int base64SetO, 3727 int base64WhiteSpace, 3728 const char *errors) 3729{ 3730 PyObject *v; 3731 /* It might be possible to tighten this worst case */ 3732 Py_ssize_t allocated = 8 * size; 3733 int inShift = 0; 3734 Py_ssize_t i = 0; 3735 unsigned int base64bits = 0; 3736 unsigned long base64buffer = 0; 3737 char * out; 3738 char * start; 3739 3740 if (size == 0) 3741 return PyBytes_FromStringAndSize(NULL, 0); 3742 3743 if (allocated / 8 != size) 3744 return PyErr_NoMemory(); 3745 3746 v = PyBytes_FromStringAndSize(NULL, allocated); 3747 if (v == NULL) 3748 return NULL; 3749 3750 start = out = PyBytes_AS_STRING(v); 3751 for (;i < size; ++i) { 3752 Py_UNICODE ch = s[i]; 3753 3754 if (inShift) { 3755 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) { 3756 /* shifting out */ 3757 if (base64bits) { /* output remaining bits */ 3758 *out++ = TO_BASE64(base64buffer << (6-base64bits)); 3759 base64buffer = 0; 3760 base64bits = 0; 3761 } 3762 inShift = 0; 3763 /* 
Characters not in the BASE64 set implicitly unshift the sequence 3764 so no '-' is required, except if the character is itself a '-' */ 3765 if (IS_BASE64(ch) || ch == '-') { 3766 *out++ = '-'; 3767 } 3768 *out++ = (char) ch; 3769 } 3770 else { 3771 goto encode_char; 3772 } 3773 } 3774 else { /* not in a shift sequence */ 3775 if (ch == '+') { 3776 *out++ = '+'; 3777 *out++ = '-'; 3778 } 3779 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) { 3780 *out++ = (char) ch; 3781 } 3782 else { 3783 *out++ = '+'; 3784 inShift = 1; 3785 goto encode_char; 3786 } 3787 } 3788 continue; 3789encode_char: 3790#ifdef Py_UNICODE_WIDE 3791 if (ch >= 0x10000) { 3792 /* code first surrogate */ 3793 base64bits += 16; 3794 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10); 3795 while (base64bits >= 6) { 3796 *out++ = TO_BASE64(base64buffer >> (base64bits-6)); 3797 base64bits -= 6; 3798 } 3799 /* prepare second surrogate */ 3800 ch = 0xDC00 | ((ch-0x10000) & 0x3FF); 3801 } 3802#endif 3803 base64bits += 16; 3804 base64buffer = (base64buffer << 16) | ch; 3805 while (base64bits >= 6) { 3806 *out++ = TO_BASE64(base64buffer >> (base64bits-6)); 3807 base64bits -= 6; 3808 } 3809 } 3810 if (base64bits) 3811 *out++= TO_BASE64(base64buffer << (6-base64bits) ); 3812 if (inShift) 3813 *out++ = '-'; 3814 if (_PyBytes_Resize(&v, out - start) < 0) 3815 return NULL; 3816 return v; 3817} 3818 3819#undef IS_BASE64 3820#undef FROM_BASE64 3821#undef TO_BASE64 3822#undef DECODE_DIRECT 3823#undef ENCODE_DIRECT 3824 3825/* --- UTF-8 Codec -------------------------------------------------------- */ 3826 3827static 3828char utf8_code_length[256] = { 3829 /* Map UTF-8 encoded prefix byte to sequence length. Zero means 3830 illegal prefix. 
See RFC 3629 for details */ 3831 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */ 3832 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3833 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3834 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3835 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3836 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3837 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3838 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */ 3839 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */ 3840 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3841 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3842 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */ 3843 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */ 3844 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */ 3845 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */ 3846 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 /* F0-F4 + F5-FF */ 3847}; 3848 3849PyObject * 3850PyUnicode_DecodeUTF8(const char *s, 3851 Py_ssize_t size, 3852 const char *errors) 3853{ 3854 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL); 3855} 3856 3857/* Mask to check or force alignment of a pointer to C 'long' boundaries */ 3858#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1) 3859 3860/* Mask to quickly check whether a C 'long' contains a 3861 non-ASCII, UTF8-encoded char. */ 3862#if (SIZEOF_LONG == 8) 3863# define ASCII_CHAR_MASK 0x8080808080808080L 3864#elif (SIZEOF_LONG == 4) 3865# define ASCII_CHAR_MASK 0x80808080L 3866#else 3867# error C 'long' size should be either 4 or 8! 3868#endif 3869 3870/* Scans a UTF-8 string and returns the maximum character to be expected, 3871 the size of the decoded unicode string and if any major errors were 3872 encountered. 
3873 3874 This function does check basic UTF-8 sanity, it does however NOT CHECK 3875 if the string contains surrogates, and if all continuation bytes are 3876 within the correct ranges, these checks are performed in 3877 PyUnicode_DecodeUTF8Stateful. 3878 3879 If it sets has_errors to 1, it means the value of unicode_size and max_char 3880 will be bogus and you should not rely on useful information in them. 3881 */ 3882static Py_UCS4 3883utf8_max_char_size_and_has_errors(const char *s, Py_ssize_t string_size, 3884 Py_ssize_t *unicode_size, Py_ssize_t* consumed, 3885 int *has_errors) 3886{ 3887 Py_ssize_t n; 3888 Py_ssize_t char_count = 0; 3889 Py_UCS4 max_char = 127, new_max; 3890 Py_UCS4 upper_bound; 3891 const unsigned char *p = (const unsigned char *)s; 3892 const unsigned char *end = p + string_size; 3893 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK); 3894 int err = 0; 3895 3896 for (; p < end && !err; ++p, ++char_count) { 3897 /* Only check value if it's not a ASCII char... */ 3898 if (*p < 0x80) { 3899 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for 3900 an explanation. */ 3901 if (!((size_t) p & LONG_PTR_MASK)) { 3902 /* Help register allocation */ 3903 register const unsigned char *_p = p; 3904 while (_p < aligned_end) { 3905 unsigned long value = *(unsigned long *) _p; 3906 if (value & ASCII_CHAR_MASK) 3907 break; 3908 _p += SIZEOF_LONG; 3909 char_count += SIZEOF_LONG; 3910 } 3911 p = _p; 3912 if (p == end) 3913 break; 3914 } 3915 } 3916 if (*p >= 0x80) { 3917 n = utf8_code_length[*p]; 3918 new_max = max_char; 3919 switch (n) { 3920 /* invalid start byte */ 3921 case 0: 3922 err = 1; 3923 break; 3924 case 2: 3925 /* Code points between 0x00FF and 0x07FF inclusive. 
3926 Approximate the upper bound of the code point, 3927 if this flips over 255 we can be sure it will be more 3928 than 255 and the string will need 2 bytes per code coint, 3929 if it stays under or equal to 255, we can be sure 1 byte 3930 is enough. 3931 ((*p & 0b00011111) << 6) | 0b00111111 */ 3932 upper_bound = ((*p & 0x1F) << 6) | 0x3F; 3933 if (max_char < upper_bound) 3934 new_max = upper_bound; 3935 /* Ensure we track at least that we left ASCII space. */ 3936 if (new_max < 128) 3937 new_max = 128; 3938 break; 3939 case 3: 3940 /* Between 0x0FFF and 0xFFFF inclusive, so values are 3941 always > 255 and <= 65535 and will always need 2 bytes. */ 3942 if (max_char < 65535) 3943 new_max = 65535; 3944 break; 3945 case 4: 3946 /* Code point will be above 0xFFFF for sure in this case. */ 3947 new_max = 65537; 3948 break; 3949 /* Internal error, this should be caught by the first if */ 3950 case 1: 3951 default: 3952 assert(0 && "Impossible case in utf8_max_char_and_size"); 3953 err = 1; 3954 } 3955 /* Instead of number of overall bytes for this code point, 3956 n containts the number of following bytes: */ 3957 --n; 3958 /* Check if the follow up chars are all valid continuation bytes */ 3959 if (n >= 1) { 3960 const unsigned char *cont; 3961 if ((p + n) >= end) { 3962 if (consumed == 0) 3963 /* incomplete data, non-incremental decoding */ 3964 err = 1; 3965 break; 3966 } 3967 for (cont = p + 1; cont < (p + n); ++cont) { 3968 if ((*cont & 0xc0) != 0x80) { 3969 err = 1; 3970 break; 3971 } 3972 } 3973 p += n; 3974 } 3975 else 3976 err = 1; 3977 max_char = new_max; 3978 } 3979 } 3980 3981 if (unicode_size) 3982 *unicode_size = char_count; 3983 if (has_errors) 3984 *has_errors = err; 3985 return max_char; 3986} 3987 3988/* Similar to PyUnicode_WRITE but can also write into wstr field 3989 of the legacy unicode representation */ 3990#define WRITE_FLEXIBLE_OR_WSTR(kind, buf, index, value) \ 3991 do { \ 3992 const int k_ = (kind); \ 3993 if (k_ == PyUnicode_WCHAR_KIND) \ 
3994 ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value); \ 3995 else if (k_ == PyUnicode_1BYTE_KIND) \ 3996 ((unsigned char *)(buf))[(index)] = (unsigned char)(value); \ 3997 else if (k_ == PyUnicode_2BYTE_KIND) \ 3998 ((Py_UCS2 *)(buf))[(index)] = (Py_UCS2)(value); \ 3999 else \ 4000 ((Py_UCS4 *)(buf))[(index)] = (Py_UCS4)(value); \ 4001 } while (0) 4002 4003PyObject * 4004PyUnicode_DecodeUTF8Stateful(const char *s, 4005 Py_ssize_t size, 4006 const char *errors, 4007 Py_ssize_t *consumed) 4008{ 4009 const char *starts = s; 4010 int n; 4011 int k; 4012 Py_ssize_t startinpos; 4013 Py_ssize_t endinpos; 4014 const char *e, *aligned_end; 4015 PyUnicodeObject *unicode; 4016 const char *errmsg = ""; 4017 PyObject *errorHandler = NULL; 4018 PyObject *exc = NULL; 4019 Py_UCS4 maxchar = 0; 4020 Py_ssize_t unicode_size; 4021 Py_ssize_t i; 4022 int kind; 4023 void *data; 4024 int has_errors; 4025 Py_UNICODE *error_outptr; 4026#if SIZEOF_WCHAR_T == 2 4027 Py_ssize_t wchar_offset = 0; 4028#endif 4029 4030 if (size == 0) { 4031 if (consumed) 4032 *consumed = 0; 4033 return (PyObject *)PyUnicode_New(0, 0); 4034 } 4035 maxchar = utf8_max_char_size_and_has_errors(s, size, &unicode_size, 4036 consumed, &has_errors); 4037 if (has_errors) { 4038 unicode = _PyUnicode_New(size); 4039 if (!unicode) 4040 return NULL; 4041 kind = PyUnicode_WCHAR_KIND; 4042 data = PyUnicode_AS_UNICODE(unicode); 4043 assert(data != NULL); 4044 } 4045 else { 4046 unicode = (PyUnicodeObject *)PyUnicode_New(unicode_size, maxchar); 4047 if (!unicode) 4048 return NULL; 4049 /* When the string is ASCII only, just use memcpy and return. 4050 unicode_size may be != size if there is an incomplete UTF-8 4051 sequence at the end of the ASCII block. 
*/ 4052 if (maxchar < 128 && size == unicode_size) { 4053 Py_MEMCPY(PyUnicode_1BYTE_DATA(unicode), s, unicode_size); 4054 return (PyObject *)unicode; 4055 } 4056 kind = PyUnicode_KIND(unicode); 4057 data = PyUnicode_DATA(unicode); 4058 } 4059 /* Unpack UTF-8 encoded data */ 4060 i = 0; 4061 e = s + size; 4062 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK); 4063 4064 while (s < e) { 4065 Py_UCS4 ch = (unsigned char)*s; 4066 4067 if (ch < 0x80) { 4068 /* Fast path for runs of ASCII characters. Given that common UTF-8 4069 input will consist of an overwhelming majority of ASCII 4070 characters, we try to optimize for this case by checking 4071 as many characters as a C 'long' can contain. 4072 First, check if we can do an aligned read, as most CPUs have 4073 a penalty for unaligned reads. 4074 */ 4075 if (!((size_t) s & LONG_PTR_MASK)) { 4076 /* Help register allocation */ 4077 register const char *_s = s; 4078 register Py_ssize_t _i = i; 4079 while (_s < aligned_end) { 4080 /* Read a whole long at a time (either 4 or 8 bytes), 4081 and do a fast unrolled copy if it only contains ASCII 4082 characters. 
*/ 4083 unsigned long value = *(unsigned long *) _s; 4084 if (value & ASCII_CHAR_MASK) 4085 break; 4086 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+0, _s[0]); 4087 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+1, _s[1]); 4088 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+2, _s[2]); 4089 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+3, _s[3]); 4090#if (SIZEOF_LONG == 8) 4091 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+4, _s[4]); 4092 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+5, _s[5]); 4093 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+6, _s[6]); 4094 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+7, _s[7]); 4095#endif 4096 _s += SIZEOF_LONG; 4097 _i += SIZEOF_LONG; 4098 } 4099 s = _s; 4100 i = _i; 4101 if (s == e) 4102 break; 4103 ch = (unsigned char)*s; 4104 } 4105 } 4106 4107 if (ch < 0x80) { 4108 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch); 4109 s++; 4110 continue; 4111 } 4112 4113 n = utf8_code_length[ch]; 4114 4115 if (s + n > e) { 4116 if (consumed) 4117 break; 4118 else { 4119 errmsg = "unexpected end of data"; 4120 startinpos = s-starts; 4121 endinpos = startinpos+1; 4122 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++) 4123 endinpos++; 4124 goto utf8Error; 4125 } 4126 } 4127 4128 switch (n) { 4129 4130 case 0: 4131 errmsg = "invalid start byte"; 4132 startinpos = s-starts; 4133 endinpos = startinpos+1; 4134 goto utf8Error; 4135 4136 case 1: 4137 errmsg = "internal error"; 4138 startinpos = s-starts; 4139 endinpos = startinpos+1; 4140 goto utf8Error; 4141 4142 case 2: 4143 if ((s[1] & 0xc0) != 0x80) { 4144 errmsg = "invalid continuation byte"; 4145 startinpos = s-starts; 4146 endinpos = startinpos + 1; 4147 goto utf8Error; 4148 } 4149 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f); 4150 assert ((ch > 0x007F) && (ch <= 0x07FF)); 4151 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch); 4152 break; 4153 4154 case 3: 4155 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf 4156 will result in surrogates in range d800-dfff. Surrogates are 4157 not valid UTF-8 so they are rejected. 
4158 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf 4159 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */ 4160 if ((s[1] & 0xc0) != 0x80 || 4161 (s[2] & 0xc0) != 0x80 || 4162 ((unsigned char)s[0] == 0xE0 && 4163 (unsigned char)s[1] < 0xA0) || 4164 ((unsigned char)s[0] == 0xED && 4165 (unsigned char)s[1] > 0x9F)) { 4166 errmsg = "invalid continuation byte"; 4167 startinpos = s-starts; 4168 endinpos = startinpos + 1; 4169 4170 /* if s[1] first two bits are 1 and 0, then the invalid 4171 continuation byte is s[2], so increment endinpos by 1, 4172 if not, s[1] is invalid and endinpos doesn't need to 4173 be incremented. */ 4174 if ((s[1] & 0xC0) == 0x80) 4175 endinpos++; 4176 goto utf8Error; 4177 } 4178 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f); 4179 assert ((ch > 0x07FF) && (ch <= 0xFFFF)); 4180 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch); 4181 break; 4182 4183 case 4: 4184 if ((s[1] & 0xc0) != 0x80 || 4185 (s[2] & 0xc0) != 0x80 || 4186 (s[3] & 0xc0) != 0x80 || 4187 ((unsigned char)s[0] == 0xF0 && 4188 (unsigned char)s[1] < 0x90) || 4189 ((unsigned char)s[0] == 0xF4 && 4190 (unsigned char)s[1] > 0x8F)) { 4191 errmsg = "invalid continuation byte"; 4192 startinpos = s-starts; 4193 endinpos = startinpos + 1; 4194 if ((s[1] & 0xC0) == 0x80) { 4195 endinpos++; 4196 if ((s[2] & 0xC0) == 0x80) 4197 endinpos++; 4198 } 4199 goto utf8Error; 4200 } 4201 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) + 4202 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f); 4203 assert ((ch > 0xFFFF) && (ch <= 0x10ffff)); 4204 4205 /* If the string is flexible or we have native UCS-4, write 4206 directly.. 
*/ 4207 if (sizeof(Py_UNICODE) > 2 || kind != PyUnicode_WCHAR_KIND) 4208 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch); 4209 4210 else { 4211 /* compute and append the two surrogates: */ 4212 4213 /* translate from 10000..10FFFF to 0..FFFF */ 4214 ch -= 0x10000; 4215 4216 /* high surrogate = top 10 bits added to D800 */ 4217 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, 4218 (Py_UNICODE)(0xD800 + (ch >> 10))); 4219 4220 /* low surrogate = bottom 10 bits added to DC00 */ 4221 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, 4222 (Py_UNICODE)(0xDC00 + (ch & 0x03FF))); 4223 } 4224#if SIZEOF_WCHAR_T == 2 4225 wchar_offset++; 4226#endif 4227 break; 4228 } 4229 s += n; 4230 continue; 4231 4232 utf8Error: 4233 /* If this is not yet a resizable string, make it one.. */ 4234 if (kind != PyUnicode_WCHAR_KIND) { 4235 const Py_UNICODE *u; 4236 PyUnicodeObject *new_unicode = _PyUnicode_New(size); 4237 if (!new_unicode) 4238 goto onError; 4239 u = PyUnicode_AsUnicode((PyObject *)unicode); 4240 if (!u) 4241 goto onError; 4242#if SIZEOF_WCHAR_T == 2 4243 i += wchar_offset; 4244#endif 4245 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(new_unicode), u, i); 4246 Py_DECREF(unicode); 4247 unicode = new_unicode; 4248 kind = 0; 4249 data = PyUnicode_AS_UNICODE(new_unicode); 4250 assert(data != NULL); 4251 } 4252 error_outptr = PyUnicode_AS_UNICODE(unicode) + i; 4253 if (unicode_decode_call_errorhandler( 4254 errors, &errorHandler, 4255 "utf8", errmsg, 4256 &starts, &e, &startinpos, &endinpos, &exc, &s, 4257 &unicode, &i, &error_outptr)) 4258 goto onError; 4259 /* Update data because unicode_decode_call_errorhandler might have 4260 re-created or resized the unicode object. 
*/ 4261 data = PyUnicode_AS_UNICODE(unicode); 4262 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK); 4263 } 4264 /* Ensure the unicode_size calculation above was correct: */ 4265 assert(kind == PyUnicode_WCHAR_KIND || i == unicode_size); 4266 4267 if (consumed) 4268 *consumed = s-starts; 4269 4270 /* Adjust length and ready string when it contained errors and 4271 is of the old resizable kind. */ 4272 if (kind == PyUnicode_WCHAR_KIND) { 4273 if (PyUnicode_Resize((PyObject**)&unicode, i) < 0) 4274 goto onError; 4275 } 4276 4277 Py_XDECREF(errorHandler); 4278 Py_XDECREF(exc); 4279#ifndef DONT_MAKE_RESULT_READY 4280 if (_PyUnicode_READY_REPLACE(&unicode)) { 4281 Py_DECREF(unicode); 4282 return NULL; 4283 } 4284#endif 4285 return (PyObject *)unicode; 4286 4287 onError: 4288 Py_XDECREF(errorHandler); 4289 Py_XDECREF(exc); 4290 Py_DECREF(unicode); 4291 return NULL; 4292} 4293 4294#undef WRITE_FLEXIBLE_OR_WSTR 4295 4296#ifdef __APPLE__ 4297 4298/* Simplified UTF-8 decoder using surrogateescape error handler, 4299 used to decode the command line arguments on Mac OS X. 
*/ 4300 4301wchar_t* 4302_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size) 4303{ 4304 int n; 4305 const char *e; 4306 wchar_t *unicode, *p; 4307 4308 /* Note: size will always be longer than the resulting Unicode 4309 character count */ 4310 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) { 4311 PyErr_NoMemory(); 4312 return NULL; 4313 } 4314 unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t)); 4315 if (!unicode) 4316 return NULL; 4317 4318 /* Unpack UTF-8 encoded data */ 4319 p = unicode; 4320 e = s + size; 4321 while (s < e) { 4322 Py_UCS4 ch = (unsigned char)*s; 4323 4324 if (ch < 0x80) { 4325 *p++ = (wchar_t)ch; 4326 s++; 4327 continue; 4328 } 4329 4330 n = utf8_code_length[ch]; 4331 if (s + n > e) { 4332 goto surrogateescape; 4333 } 4334 4335 switch (n) { 4336 case 0: 4337 case 1: 4338 goto surrogateescape; 4339 4340 case 2: 4341 if ((s[1] & 0xc0) != 0x80) 4342 goto surrogateescape; 4343 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f); 4344 assert ((ch > 0x007F) && (ch <= 0x07FF)); 4345 *p++ = (wchar_t)ch; 4346 break; 4347 4348 case 3: 4349 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf 4350 will result in surrogates in range d800-dfff. Surrogates are 4351 not valid UTF-8 so they are rejected. 
4352 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf 4353 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */ 4354 if ((s[1] & 0xc0) != 0x80 || 4355 (s[2] & 0xc0) != 0x80 || 4356 ((unsigned char)s[0] == 0xE0 && 4357 (unsigned char)s[1] < 0xA0) || 4358 ((unsigned char)s[0] == 0xED && 4359 (unsigned char)s[1] > 0x9F)) { 4360 4361 goto surrogateescape; 4362 } 4363 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f); 4364 assert ((ch > 0x07FF) && (ch <= 0xFFFF)); 4365 *p++ = (wchar_t)ch; 4366 break; 4367 4368 case 4: 4369 if ((s[1] & 0xc0) != 0x80 || 4370 (s[2] & 0xc0) != 0x80 || 4371 (s[3] & 0xc0) != 0x80 || 4372 ((unsigned char)s[0] == 0xF0 && 4373 (unsigned char)s[1] < 0x90) || 4374 ((unsigned char)s[0] == 0xF4 && 4375 (unsigned char)s[1] > 0x8F)) { 4376 goto surrogateescape; 4377 } 4378 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) + 4379 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f); 4380 assert ((ch > 0xFFFF) && (ch <= 0x10ffff)); 4381 4382#if SIZEOF_WCHAR_T == 4 4383 *p++ = (wchar_t)ch; 4384#else 4385 /* compute and append the two surrogates: */ 4386 4387 /* translate from 10000..10FFFF to 0..FFFF */ 4388 ch -= 0x10000; 4389 4390 /* high surrogate = top 10 bits added to D800 */ 4391 *p++ = (wchar_t)(0xD800 + (ch >> 10)); 4392 4393 /* low surrogate = bottom 10 bits added to DC00 */ 4394 *p++ = (wchar_t)(0xDC00 + (ch & 0x03FF)); 4395#endif 4396 break; 4397 } 4398 s += n; 4399 continue; 4400 4401 surrogateescape: 4402 *p++ = 0xDC00 + ch; 4403 s++; 4404 } 4405 *p = L'\0'; 4406 return unicode; 4407} 4408 4409#endif /* __APPLE__ */ 4410 4411/* Primary internal function which creates utf8 encoded bytes objects. 4412 4413 Allocation strategy: if the string is short, convert into a stack buffer 4414 and allocate exactly as much space needed at the end. Else allocate the 4415 maximum possible needed (4 result bytes per Unicode character), and return 4416 the excess memory at the end. 
4417*/ 4418PyObject * 4419_PyUnicode_AsUTF8String(PyObject *obj, const char *errors) 4420{ 4421#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */ 4422 4423 Py_ssize_t i; /* index into s of next input byte */ 4424 PyObject *result; /* result string object */ 4425 char *p; /* next free byte in output buffer */ 4426 Py_ssize_t nallocated; /* number of result bytes allocated */ 4427 Py_ssize_t nneeded; /* number of result bytes needed */ 4428 char stackbuf[MAX_SHORT_UNICHARS * 4]; 4429 PyObject *errorHandler = NULL; 4430 PyObject *exc = NULL; 4431 int kind; 4432 void *data; 4433 Py_ssize_t size; 4434 PyUnicodeObject *unicode = (PyUnicodeObject *)obj; 4435#if SIZEOF_WCHAR_T == 2 4436 Py_ssize_t wchar_offset = 0; 4437#endif 4438 4439 if (!PyUnicode_Check(unicode)) { 4440 PyErr_BadArgument(); 4441 return NULL; 4442 } 4443 4444 if (PyUnicode_READY(unicode) == -1) 4445 return NULL; 4446 4447 if (PyUnicode_UTF8(unicode)) 4448 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode), 4449 PyUnicode_UTF8_LENGTH(unicode)); 4450 4451 kind = PyUnicode_KIND(unicode); 4452 data = PyUnicode_DATA(unicode); 4453 size = PyUnicode_GET_LENGTH(unicode); 4454 4455 assert(size >= 0); 4456 4457 if (size <= MAX_SHORT_UNICHARS) { 4458 /* Write into the stack buffer; nallocated can't overflow. 4459 * At the end, we'll allocate exactly as much heap space as it 4460 * turns out we need. 4461 */ 4462 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int); 4463 result = NULL; /* will allocate after we're done */ 4464 p = stackbuf; 4465 } 4466 else { 4467 /* Overallocate on the heap, and give the excess back at the end. */ 4468 nallocated = size * 4; 4469 if (nallocated / 4 != size) /* overflow! 
*/ 4470 return PyErr_NoMemory(); 4471 result = PyBytes_FromStringAndSize(NULL, nallocated); 4472 if (result == NULL) 4473 return NULL; 4474 p = PyBytes_AS_STRING(result); 4475 } 4476 4477 for (i = 0; i < size;) { 4478 Py_UCS4 ch = PyUnicode_READ(kind, data, i++); 4479 4480 if (ch < 0x80) 4481 /* Encode ASCII */ 4482 *p++ = (char) ch; 4483 4484 else if (ch < 0x0800) { 4485 /* Encode Latin-1 */ 4486 *p++ = (char)(0xc0 | (ch >> 6)); 4487 *p++ = (char)(0x80 | (ch & 0x3f)); 4488 } else if (0xD800 <= ch && ch <= 0xDFFF) { 4489 Py_ssize_t newpos; 4490 PyObject *rep; 4491 Py_ssize_t repsize, k, startpos; 4492 startpos = i-1; 4493#if SIZEOF_WCHAR_T == 2 4494 startpos += wchar_offset; 4495#endif 4496 rep = unicode_encode_call_errorhandler( 4497 errors, &errorHandler, "utf-8", "surrogates not allowed", 4498 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode), 4499 &exc, startpos, startpos+1, &newpos); 4500 if (!rep) 4501 goto error; 4502 4503 if (PyBytes_Check(rep)) 4504 repsize = PyBytes_GET_SIZE(rep); 4505 else 4506 repsize = PyUnicode_GET_SIZE(rep); 4507 4508 if (repsize > 4) { 4509 Py_ssize_t offset; 4510 4511 if (result == NULL) 4512 offset = p - stackbuf; 4513 else 4514 offset = p - PyBytes_AS_STRING(result); 4515 4516 if (nallocated > PY_SSIZE_T_MAX - repsize + 4) { 4517 /* integer overflow */ 4518 PyErr_NoMemory(); 4519 goto error; 4520 } 4521 nallocated += repsize - 4; 4522 if (result != NULL) { 4523 if (_PyBytes_Resize(&result, nallocated) < 0) 4524 goto error; 4525 } else { 4526 result = PyBytes_FromStringAndSize(NULL, nallocated); 4527 if (result == NULL) 4528 goto error; 4529 Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset); 4530 } 4531 p = PyBytes_AS_STRING(result) + offset; 4532 } 4533 4534 if (PyBytes_Check(rep)) { 4535 char *prep = PyBytes_AS_STRING(rep); 4536 for(k = repsize; k > 0; k--) 4537 *p++ = *prep++; 4538 } else /* rep is unicode */ { 4539 const Py_UNICODE *prep = PyUnicode_AS_UNICODE(rep); 4540 Py_UNICODE c; 4541 4542 for(k=0; 
k<repsize; k++) { 4543 c = prep[k]; 4544 if (0x80 <= c) { 4545 raise_encode_exception(&exc, "utf-8", 4546 PyUnicode_AS_UNICODE(unicode), 4547 size, i-1, i, 4548 "surrogates not allowed"); 4549 goto error; 4550 } 4551 *p++ = (char)prep[k]; 4552 } 4553 } 4554 Py_DECREF(rep); 4555 } else if (ch < 0x10000) { 4556 *p++ = (char)(0xe0 | (ch >> 12)); 4557 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); 4558 *p++ = (char)(0x80 | (ch & 0x3f)); 4559 } else /* ch >= 0x10000 */ { 4560 /* Encode UCS4 Unicode ordinals */ 4561 *p++ = (char)(0xf0 | (ch >> 18)); 4562 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f)); 4563 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); 4564 *p++ = (char)(0x80 | (ch & 0x3f)); 4565#if SIZEOF_WCHAR_T == 2 4566 wchar_offset++; 4567#endif 4568 } 4569 } 4570 4571 if (result == NULL) { 4572 /* This was stack allocated. */ 4573 nneeded = p - stackbuf; 4574 assert(nneeded <= nallocated); 4575 result = PyBytes_FromStringAndSize(stackbuf, nneeded); 4576 } 4577 else { 4578 /* Cut back to size actually needed. 
*/ 4579 nneeded = p - PyBytes_AS_STRING(result); 4580 assert(nneeded <= nallocated); 4581 _PyBytes_Resize(&result, nneeded); 4582 } 4583 4584 Py_XDECREF(errorHandler); 4585 Py_XDECREF(exc); 4586 return result; 4587 error: 4588 Py_XDECREF(errorHandler); 4589 Py_XDECREF(exc); 4590 Py_XDECREF(result); 4591 return NULL; 4592 4593#undef MAX_SHORT_UNICHARS 4594} 4595 4596PyObject * 4597PyUnicode_EncodeUTF8(const Py_UNICODE *s, 4598 Py_ssize_t size, 4599 const char *errors) 4600{ 4601 PyObject *v, *unicode; 4602 4603 unicode = PyUnicode_FromUnicode(s, size); 4604 if (unicode == NULL) 4605 return NULL; 4606 v = _PyUnicode_AsUTF8String(unicode, errors); 4607 Py_DECREF(unicode); 4608 return v; 4609} 4610 4611PyObject * 4612PyUnicode_AsUTF8String(PyObject *unicode) 4613{ 4614 return _PyUnicode_AsUTF8String(unicode, NULL); 4615} 4616 4617/* --- UTF-32 Codec ------------------------------------------------------- */ 4618 4619PyObject * 4620PyUnicode_DecodeUTF32(const char *s, 4621 Py_ssize_t size, 4622 const char *errors, 4623 int *byteorder) 4624{ 4625 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL); 4626} 4627 4628PyObject * 4629PyUnicode_DecodeUTF32Stateful(const char *s, 4630 Py_ssize_t size, 4631 const char *errors, 4632 int *byteorder, 4633 Py_ssize_t *consumed) 4634{ 4635 const char *starts = s; 4636 Py_ssize_t startinpos; 4637 Py_ssize_t endinpos; 4638 Py_ssize_t outpos; 4639 PyUnicodeObject *unicode; 4640 Py_UNICODE *p; 4641#ifndef Py_UNICODE_WIDE 4642 int pairs = 0; 4643 const unsigned char *qq; 4644#else 4645 const int pairs = 0; 4646#endif 4647 const unsigned char *q, *e; 4648 int bo = 0; /* assume native ordering by default */ 4649 const char *errmsg = ""; 4650 /* Offsets from q for retrieving bytes in the right order. 
*/ 4651#ifdef BYTEORDER_IS_LITTLE_ENDIAN 4652 int iorder[] = {0, 1, 2, 3}; 4653#else 4654 int iorder[] = {3, 2, 1, 0}; 4655#endif 4656 PyObject *errorHandler = NULL; 4657 PyObject *exc = NULL; 4658 4659 q = (unsigned char *)s; 4660 e = q + size; 4661 4662 if (byteorder) 4663 bo = *byteorder; 4664 4665 /* Check for BOM marks (U+FEFF) in the input and adjust current 4666 byte order setting accordingly. In native mode, the leading BOM 4667 mark is skipped, in all other modes, it is copied to the output 4668 stream as-is (giving a ZWNBSP character). */ 4669 if (bo == 0) { 4670 if (size >= 4) { 4671 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) | 4672 (q[iorder[1]] << 8) | q[iorder[0]]; 4673#ifdef BYTEORDER_IS_LITTLE_ENDIAN 4674 if (bom == 0x0000FEFF) { 4675 q += 4; 4676 bo = -1; 4677 } 4678 else if (bom == 0xFFFE0000) { 4679 q += 4; 4680 bo = 1; 4681 } 4682#else 4683 if (bom == 0x0000FEFF) { 4684 q += 4; 4685 bo = 1; 4686 } 4687 else if (bom == 0xFFFE0000) { 4688 q += 4; 4689 bo = -1; 4690 } 4691#endif 4692 } 4693 } 4694 4695 if (bo == -1) { 4696 /* force LE */ 4697 iorder[0] = 0; 4698 iorder[1] = 1; 4699 iorder[2] = 2; 4700 iorder[3] = 3; 4701 } 4702 else if (bo == 1) { 4703 /* force BE */ 4704 iorder[0] = 3; 4705 iorder[1] = 2; 4706 iorder[2] = 1; 4707 iorder[3] = 0; 4708 } 4709 4710 /* On narrow builds we split characters outside the BMP into two 4711 codepoints => count how much extra space we need. */ 4712#ifndef Py_UNICODE_WIDE 4713 for (qq = q; qq < e; qq += 4) 4714 if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0) 4715 pairs++; 4716#endif 4717 4718 /* This might be one to much, because of a BOM */ 4719 unicode = _PyUnicode_New((size+3)/4+pairs); 4720 if (!unicode) 4721 return NULL; 4722 if (size == 0) 4723 return (PyObject *)unicode; 4724 4725 /* Unpack UTF-32 encoded data */ 4726 p = PyUnicode_AS_UNICODE(unicode); 4727 4728 while (q < e) { 4729 Py_UCS4 ch; 4730 /* remaining bytes at the end? 
(size should be divisible by 4) */ 4731 if (e-q<4) { 4732 if (consumed) 4733 break; 4734 errmsg = "truncated data"; 4735 startinpos = ((const char *)q)-starts; 4736 endinpos = ((const char *)e)-starts; 4737 goto utf32Error; 4738 /* The remaining input chars are ignored if the callback 4739 chooses to skip the input */ 4740 } 4741 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) | 4742 (q[iorder[1]] << 8) | q[iorder[0]]; 4743 4744 if (ch >= 0x110000) 4745 { 4746 errmsg = "codepoint not in range(0x110000)"; 4747 startinpos = ((const char *)q)-starts; 4748 endinpos = startinpos+4; 4749 goto utf32Error; 4750 } 4751#ifndef Py_UNICODE_WIDE 4752 if (ch >= 0x10000) 4753 { 4754 *p++ = 0xD800 | ((ch-0x10000) >> 10); 4755 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF); 4756 } 4757 else 4758#endif 4759 *p++ = ch; 4760 q += 4; 4761 continue; 4762 utf32Error: 4763 outpos = p-PyUnicode_AS_UNICODE(unicode); 4764 if (unicode_decode_call_errorhandler( 4765 errors, &errorHandler, 4766 "utf32", errmsg, 4767 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q, 4768 &unicode, &outpos, &p)) 4769 goto onError; 4770 } 4771 4772 if (byteorder) 4773 *byteorder = bo; 4774 4775 if (consumed) 4776 *consumed = (const char *)q-starts; 4777 4778 /* Adjust length */ 4779 if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0) 4780 goto onError; 4781 4782 Py_XDECREF(errorHandler); 4783 Py_XDECREF(exc); 4784#ifndef DONT_MAKE_RESULT_READY 4785 if (_PyUnicode_READY_REPLACE(&unicode)) { 4786 Py_DECREF(unicode); 4787 return NULL; 4788 } 4789#endif 4790 return (PyObject *)unicode; 4791 4792 onError: 4793 Py_DECREF(unicode); 4794 Py_XDECREF(errorHandler); 4795 Py_XDECREF(exc); 4796 return NULL; 4797} 4798 4799PyObject * 4800PyUnicode_EncodeUTF32(const Py_UNICODE *s, 4801 Py_ssize_t size, 4802 const char *errors, 4803 int byteorder) 4804{ 4805 PyObject *v; 4806 unsigned char *p; 4807 Py_ssize_t nsize, bytesize; 4808#ifndef Py_UNICODE_WIDE 4809 Py_ssize_t i, pairs; 
4810#else 4811 const int pairs = 0; 4812#endif 4813 /* Offsets from p for storing byte pairs in the right order. */ 4814#ifdef BYTEORDER_IS_LITTLE_ENDIAN 4815 int iorder[] = {0, 1, 2, 3}; 4816#else 4817 int iorder[] = {3, 2, 1, 0}; 4818#endif 4819 4820#define STORECHAR(CH) \ 4821 do { \ 4822 p[iorder[3]] = ((CH) >> 24) & 0xff; \ 4823 p[iorder[2]] = ((CH) >> 16) & 0xff; \ 4824 p[iorder[1]] = ((CH) >> 8) & 0xff; \ 4825 p[iorder[0]] = (CH) & 0xff; \ 4826 p += 4; \ 4827 } while(0) 4828 4829 /* In narrow builds we can output surrogate pairs as one codepoint, 4830 so we need less space. */ 4831#ifndef Py_UNICODE_WIDE 4832 for (i = pairs = 0; i < size-1; i++) 4833 if (0xD800 <= s[i] && s[i] <= 0xDBFF && 4834 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF) 4835 pairs++; 4836#endif 4837 nsize = (size - pairs + (byteorder == 0)); 4838 bytesize = nsize * 4; 4839 if (bytesize / 4 != nsize) 4840 return PyErr_NoMemory(); 4841 v = PyBytes_FromStringAndSize(NULL, bytesize); 4842 if (v == NULL) 4843 return NULL; 4844 4845 p = (unsigned char *)PyBytes_AS_STRING(v); 4846 if (byteorder == 0) 4847 STORECHAR(0xFEFF); 4848 if (size == 0) 4849 goto done; 4850 4851 if (byteorder == -1) { 4852 /* force LE */ 4853 iorder[0] = 0; 4854 iorder[1] = 1; 4855 iorder[2] = 2; 4856 iorder[3] = 3; 4857 } 4858 else if (byteorder == 1) { 4859 /* force BE */ 4860 iorder[0] = 3; 4861 iorder[1] = 2; 4862 iorder[2] = 1; 4863 iorder[3] = 0; 4864 } 4865 4866 while (size-- > 0) { 4867 Py_UCS4 ch = *s++; 4868#ifndef Py_UNICODE_WIDE 4869 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) { 4870 Py_UCS4 ch2 = *s; 4871 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) { 4872 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000; 4873 s++; 4874 size--; 4875 } 4876 } 4877#endif 4878 STORECHAR(ch); 4879 } 4880 4881 done: 4882 return v; 4883#undef STORECHAR 4884} 4885 4886PyObject * 4887PyUnicode_AsUTF32String(PyObject *unicode) 4888{ 4889 if (!PyUnicode_Check(unicode)) { 4890 PyErr_BadArgument(); 4891 return NULL; 4892 } 4893 return 
PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode), 4894 PyUnicode_GET_SIZE(unicode), 4895 NULL, 4896 0); 4897} 4898 4899/* --- UTF-16 Codec ------------------------------------------------------- */ 4900 4901PyObject * 4902PyUnicode_DecodeUTF16(const char *s, 4903 Py_ssize_t size, 4904 const char *errors, 4905 int *byteorder) 4906{ 4907 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL); 4908} 4909 4910/* Two masks for fast checking of whether a C 'long' may contain 4911 UTF16-encoded surrogate characters. This is an efficient heuristic, 4912 assuming that non-surrogate characters with a code point >= 0x8000 are 4913 rare in most input. 4914 FAST_CHAR_MASK is used when the input is in native byte ordering, 4915 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering. 4916*/ 4917#if (SIZEOF_LONG == 8) 4918# define FAST_CHAR_MASK 0x8000800080008000L 4919# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L 4920#elif (SIZEOF_LONG == 4) 4921# define FAST_CHAR_MASK 0x80008000L 4922# define SWAPPED_FAST_CHAR_MASK 0x00800080L 4923#else 4924# error C 'long' size should be either 4 or 8! 4925#endif 4926 4927PyObject * 4928PyUnicode_DecodeUTF16Stateful(const char *s, 4929 Py_ssize_t size, 4930 const char *errors, 4931 int *byteorder, 4932 Py_ssize_t *consumed) 4933{ 4934 const char *starts = s; 4935 Py_ssize_t startinpos; 4936 Py_ssize_t endinpos; 4937 Py_ssize_t outpos; 4938 PyUnicodeObject *unicode; 4939 Py_UNICODE *p; 4940 const unsigned char *q, *e, *aligned_end; 4941 int bo = 0; /* assume native ordering by default */ 4942 int native_ordering = 0; 4943 const char *errmsg = ""; 4944 /* Offsets from q for retrieving byte pairs in the right order. 
*/ 4945#ifdef BYTEORDER_IS_LITTLE_ENDIAN 4946 int ihi = 1, ilo = 0; 4947#else 4948 int ihi = 0, ilo = 1; 4949#endif 4950 PyObject *errorHandler = NULL; 4951 PyObject *exc = NULL; 4952 4953 /* Note: size will always be longer than the resulting Unicode 4954 character count */ 4955 unicode = _PyUnicode_New(size); 4956 if (!unicode) 4957 return NULL; 4958 if (size == 0) 4959 return (PyObject *)unicode; 4960 4961 /* Unpack UTF-16 encoded data */ 4962 p = PyUnicode_AS_UNICODE(unicode); 4963 q = (unsigned char *)s; 4964 e = q + size - 1; 4965 4966 if (byteorder) 4967 bo = *byteorder; 4968 4969 /* Check for BOM marks (U+FEFF) in the input and adjust current 4970 byte order setting accordingly. In native mode, the leading BOM 4971 mark is skipped, in all other modes, it is copied to the output 4972 stream as-is (giving a ZWNBSP character). */ 4973 if (bo == 0) { 4974 if (size >= 2) { 4975 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo]; 4976#ifdef BYTEORDER_IS_LITTLE_ENDIAN 4977 if (bom == 0xFEFF) { 4978 q += 2; 4979 bo = -1; 4980 } 4981 else if (bom == 0xFFFE) { 4982 q += 2; 4983 bo = 1; 4984 } 4985#else 4986 if (bom == 0xFEFF) { 4987 q += 2; 4988 bo = 1; 4989 } 4990 else if (bom == 0xFFFE) { 4991 q += 2; 4992 bo = -1; 4993 } 4994#endif 4995 } 4996 } 4997 4998 if (bo == -1) { 4999 /* force LE */ 5000 ihi = 1; 5001 ilo = 0; 5002 } 5003 else if (bo == 1) { 5004 /* force BE */ 5005 ihi = 0; 5006 ilo = 1; 5007 } 5008#ifdef BYTEORDER_IS_LITTLE_ENDIAN 5009 native_ordering = ilo < ihi; 5010#else 5011 native_ordering = ilo > ihi; 5012#endif 5013 5014 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK); 5015 while (q < e) { 5016 Py_UNICODE ch; 5017 /* First check for possible aligned read of a C 'long'. Unaligned 5018 reads are more expensive, better to defer to another iteration. */ 5019 if (!((size_t) q & LONG_PTR_MASK)) { 5020 /* Fast path for runs of non-surrogate chars. 
*/ 5021 register const unsigned char *_q = q; 5022 Py_UNICODE *_p = p; 5023 if (native_ordering) { 5024 /* Native ordering is simple: as long as the input cannot 5025 possibly contain a surrogate char, do an unrolled copy 5026 of several 16-bit code points to the target object. 5027 The non-surrogate check is done on several input bytes 5028 at a time (as many as a C 'long' can contain). */ 5029 while (_q < aligned_end) { 5030 unsigned long data = * (unsigned long *) _q; 5031 if (data & FAST_CHAR_MASK) 5032 break; 5033 _p[0] = ((unsigned short *) _q)[0]; 5034 _p[1] = ((unsigned short *) _q)[1]; 5035#if (SIZEOF_LONG == 8) 5036 _p[2] = ((unsigned short *) _q)[2]; 5037 _p[3] = ((unsigned short *) _q)[3]; 5038#endif 5039 _q += SIZEOF_LONG; 5040 _p += SIZEOF_LONG / 2; 5041 } 5042 } 5043 else { 5044 /* Byteswapped ordering is similar, but we must decompose 5045 the copy bytewise, and take care of zero'ing out the 5046 upper bytes if the target object is in 32-bit units 5047 (that is, in UCS-4 builds). */ 5048 while (_q < aligned_end) { 5049 unsigned long data = * (unsigned long *) _q; 5050 if (data & SWAPPED_FAST_CHAR_MASK) 5051 break; 5052 /* Zero upper bytes in UCS-4 builds */ 5053#if (Py_UNICODE_SIZE > 2) 5054 _p[0] = 0; 5055 _p[1] = 0; 5056#if (SIZEOF_LONG == 8) 5057 _p[2] = 0; 5058 _p[3] = 0; 5059#endif 5060#endif 5061 /* Issue #4916; UCS-4 builds on big endian machines must 5062 fill the two last bytes of each 4-byte unit. 
*/ 5063#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2) 5064# define OFF 2 5065#else 5066# define OFF 0 5067#endif 5068 ((unsigned char *) _p)[OFF + 1] = _q[0]; 5069 ((unsigned char *) _p)[OFF + 0] = _q[1]; 5070 ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2]; 5071 ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3]; 5072#if (SIZEOF_LONG == 8) 5073 ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4]; 5074 ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5]; 5075 ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6]; 5076 ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7]; 5077#endif 5078#undef OFF 5079 _q += SIZEOF_LONG; 5080 _p += SIZEOF_LONG / 2; 5081 } 5082 } 5083 p = _p; 5084 q = _q; 5085 if (q >= e) 5086 break; 5087 } 5088 ch = (q[ihi] << 8) | q[ilo]; 5089 5090 q += 2; 5091 5092 if (ch < 0xD800 || ch > 0xDFFF) { 5093 *p++ = ch; 5094 continue; 5095 } 5096 5097 /* UTF-16 code pair: */ 5098 if (q > e) { 5099 errmsg = "unexpected end of data"; 5100 startinpos = (((const char *)q) - 2) - starts; 5101 endinpos = ((const char *)e) + 1 - starts; 5102 goto utf16Error; 5103 } 5104 if (0xD800 <= ch && ch <= 0xDBFF) { 5105 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo]; 5106 q += 2; 5107 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) { 5108#ifndef Py_UNICODE_WIDE 5109 *p++ = ch; 5110 *p++ = ch2; 5111#else 5112 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000; 5113#endif 5114 continue; 5115 } 5116 else { 5117 errmsg = "illegal UTF-16 surrogate"; 5118 startinpos = (((const char *)q)-4)-starts; 5119 endinpos = startinpos+2; 5120 goto utf16Error; 5121 } 5122 5123 } 5124 errmsg = "illegal encoding"; 5125 startinpos = (((const char *)q)-2)-starts; 5126 endinpos = startinpos+2; 5127 /* Fall through to report the error */ 5128 5129 utf16Error: 5130 outpos = p - PyUnicode_AS_UNICODE(unicode); 5131 if (unicode_decode_call_errorhandler( 5132 errors, 5133 &errorHandler, 5134 "utf16", errmsg, 5135 &starts, 5136 (const char 
**)&e, 5137 &startinpos, 5138 &endinpos, 5139 &exc, 5140 (const char **)&q, 5141 &unicode, 5142 &outpos, 5143 &p)) 5144 goto onError; 5145 } 5146 /* remaining byte at the end? (size should be even) */ 5147 if (e == q) { 5148 if (!consumed) { 5149 errmsg = "truncated data"; 5150 startinpos = ((const char *)q) - starts; 5151 endinpos = ((const char *)e) + 1 - starts; 5152 outpos = p - PyUnicode_AS_UNICODE(unicode); 5153 if (unicode_decode_call_errorhandler( 5154 errors, 5155 &errorHandler, 5156 "utf16", errmsg, 5157 &starts, 5158 (const char **)&e, 5159 &startinpos, 5160 &endinpos, 5161 &exc, 5162 (const char **)&q, 5163 &unicode, 5164 &outpos, 5165 &p)) 5166 goto onError; 5167 /* The remaining input chars are ignored if the callback 5168 chooses to skip the input */ 5169 } 5170 } 5171 5172 if (byteorder) 5173 *byteorder = bo; 5174 5175 if (consumed) 5176 *consumed = (const char *)q-starts; 5177 5178 /* Adjust length */ 5179 if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0) 5180 goto onError; 5181 5182 Py_XDECREF(errorHandler); 5183 Py_XDECREF(exc); 5184#ifndef DONT_MAKE_RESULT_READY 5185 if (_PyUnicode_READY_REPLACE(&unicode)) { 5186 Py_DECREF(unicode); 5187 return NULL; 5188 } 5189#endif 5190 return (PyObject *)unicode; 5191 5192 onError: 5193 Py_DECREF(unicode); 5194 Py_XDECREF(errorHandler); 5195 Py_XDECREF(exc); 5196 return NULL; 5197} 5198 5199#undef FAST_CHAR_MASK 5200#undef SWAPPED_FAST_CHAR_MASK 5201 5202PyObject * 5203PyUnicode_EncodeUTF16(const Py_UNICODE *s, 5204 Py_ssize_t size, 5205 const char *errors, 5206 int byteorder) 5207{ 5208 PyObject *v; 5209 unsigned char *p; 5210 Py_ssize_t nsize, bytesize; 5211#ifdef Py_UNICODE_WIDE 5212 Py_ssize_t i, pairs; 5213#else 5214 const int pairs = 0; 5215#endif 5216 /* Offsets from p for storing byte pairs in the right order. 
*/ 5217#ifdef BYTEORDER_IS_LITTLE_ENDIAN 5218 int ihi = 1, ilo = 0; 5219#else 5220 int ihi = 0, ilo = 1; 5221#endif 5222 5223#define STORECHAR(CH) \ 5224 do { \ 5225 p[ihi] = ((CH) >> 8) & 0xff; \ 5226 p[ilo] = (CH) & 0xff; \ 5227 p += 2; \ 5228 } while(0) 5229 5230#ifdef Py_UNICODE_WIDE 5231 for (i = pairs = 0; i < size; i++) 5232 if (s[i] >= 0x10000) 5233 pairs++; 5234#endif 5235 /* 2 * (size + pairs + (byteorder == 0)) */ 5236 if (size > PY_SSIZE_T_MAX || 5237 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0)) 5238 return PyErr_NoMemory(); 5239 nsize = size + pairs + (byteorder == 0); 5240 bytesize = nsize * 2; 5241 if (bytesize / 2 != nsize) 5242 return PyErr_NoMemory(); 5243 v = PyBytes_FromStringAndSize(NULL, bytesize); 5244 if (v == NULL) 5245 return NULL; 5246 5247 p = (unsigned char *)PyBytes_AS_STRING(v); 5248 if (byteorder == 0) 5249 STORECHAR(0xFEFF); 5250 if (size == 0) 5251 goto done; 5252 5253 if (byteorder == -1) { 5254 /* force LE */ 5255 ihi = 1; 5256 ilo = 0; 5257 } 5258 else if (byteorder == 1) { 5259 /* force BE */ 5260 ihi = 0; 5261 ilo = 1; 5262 } 5263 5264 while (size-- > 0) { 5265 Py_UNICODE ch = *s++; 5266 Py_UNICODE ch2 = 0; 5267#ifdef Py_UNICODE_WIDE 5268 if (ch >= 0x10000) { 5269 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF); 5270 ch = 0xD800 | ((ch-0x10000) >> 10); 5271 } 5272#endif 5273 STORECHAR(ch); 5274 if (ch2) 5275 STORECHAR(ch2); 5276 } 5277 5278 done: 5279 return v; 5280#undef STORECHAR 5281} 5282 5283PyObject * 5284PyUnicode_AsUTF16String(PyObject *unicode) 5285{ 5286 if (!PyUnicode_Check(unicode)) { 5287 PyErr_BadArgument(); 5288 return NULL; 5289 } 5290 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode), 5291 PyUnicode_GET_SIZE(unicode), 5292 NULL, 5293 0); 5294} 5295 5296/* --- Unicode Escape Codec ----------------------------------------------- */ 5297 5298/* Helper function for PyUnicode_DecodeUnicodeEscape, determines 5299 if all the escapes in the string make it still a valid ASCII string. 
5300 Returns -1 if any escapes were found which cause the string to 5301 pop out of ASCII range. Otherwise returns the length of the 5302 required buffer to hold the string. 5303 */ 5304Py_ssize_t 5305length_of_escaped_ascii_string(const char *s, Py_ssize_t size) 5306{ 5307 const unsigned char *p = (const unsigned char *)s; 5308 const unsigned char *end = p + size; 5309 Py_ssize_t length = 0; 5310 5311 if (size < 0) 5312 return -1; 5313 5314 for (; p < end; ++p) { 5315 if (*p > 127) { 5316 /* Non-ASCII */ 5317 return -1; 5318 } 5319 else if (*p != '\\') { 5320 /* Normal character */ 5321 ++length; 5322 } 5323 else { 5324 /* Backslash-escape, check next char */ 5325 ++p; 5326 /* Escape sequence reaches till end of string or 5327 non-ASCII follow-up. */ 5328 if (p >= end || *p > 127) 5329 return -1; 5330 switch (*p) { 5331 case '\n': 5332 /* backslash + \n result in zero characters */ 5333 break; 5334 case '\\': case '\'': case '\"': 5335 case 'b': case 'f': case 't': 5336 case 'n': case 'r': case 'v': case 'a': 5337 ++length; 5338 break; 5339 case '0': case '1': case '2': case '3': 5340 case '4': case '5': case '6': case '7': 5341 case 'x': case 'u': case 'U': case 'N': 5342 /* these do not guarantee ASCII characters */ 5343 return -1; 5344 default: 5345 /* count the backslash + the other character */ 5346 length += 2; 5347 } 5348 } 5349 } 5350 return length; 5351} 5352 5353/* Similar to PyUnicode_WRITE but either write into wstr field 5354 or treat string as ASCII. 
*/ 5355#define WRITE_ASCII_OR_WSTR(kind, buf, index, value) \ 5356 do { \ 5357 if ((kind) != PyUnicode_WCHAR_KIND) \ 5358 ((unsigned char *)(buf))[(index)] = (unsigned char)(value); \ 5359 else \ 5360 ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value); \ 5361 } while (0) 5362 5363#define WRITE_WSTR(buf, index, value) \ 5364 assert(kind == PyUnicode_WCHAR_KIND), \ 5365 ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value) 5366 5367 5368static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL; 5369 5370PyObject * 5371PyUnicode_DecodeUnicodeEscape(const char *s, 5372 Py_ssize_t size, 5373 const char *errors) 5374{ 5375 const char *starts = s; 5376 Py_ssize_t startinpos; 5377 Py_ssize_t endinpos; 5378 int j; 5379 PyUnicodeObject *v; 5380 Py_UNICODE *p; 5381 const char *end; 5382 char* message; 5383 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */ 5384 PyObject *errorHandler = NULL; 5385 PyObject *exc = NULL; 5386 Py_ssize_t ascii_length; 5387 Py_ssize_t i; 5388 int kind; 5389 void *data; 5390 5391 ascii_length = length_of_escaped_ascii_string(s, size); 5392 5393 /* After length_of_escaped_ascii_string() there are two alternatives, 5394 either the string is pure ASCII with named escapes like \n, etc. 5395 and we determined it's exact size (common case) 5396 or it contains \x, \u, ... escape sequences. then we create a 5397 legacy wchar string and resize it at the end of this function. */ 5398 if (ascii_length >= 0) { 5399 v = (PyUnicodeObject *)PyUnicode_New(ascii_length, 127); 5400 if (!v) 5401 goto onError; 5402 assert(PyUnicode_KIND(v) == PyUnicode_1BYTE_KIND); 5403 kind = PyUnicode_1BYTE_KIND; 5404 data = PyUnicode_DATA(v); 5405 } 5406 else { 5407 /* Escaped strings will always be longer than the resulting 5408 Unicode string, so we start with size here and then reduce the 5409 length after conversion to the true value. 
5410 (but if the error callback returns a long replacement string 5411 we'll have to allocate more space) */ 5412 v = _PyUnicode_New(size); 5413 if (!v) 5414 goto onError; 5415 kind = PyUnicode_WCHAR_KIND; 5416 data = PyUnicode_AS_UNICODE(v); 5417 } 5418 5419 if (size == 0) 5420 return (PyObject *)v; 5421 i = 0; 5422 end = s + size; 5423 5424 while (s < end) { 5425 unsigned char c; 5426 Py_UNICODE x; 5427 int digits; 5428 5429 if (kind == PyUnicode_WCHAR_KIND) { 5430 assert(i < _PyUnicode_WSTR_LENGTH(v)); 5431 } 5432 else { 5433 /* The only case in which i == ascii_length is a backslash 5434 followed by a newline. */ 5435 assert(i <= ascii_length); 5436 } 5437 5438 /* Non-escape characters are interpreted as Unicode ordinals */ 5439 if (*s != '\\') { 5440 WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char) *s++); 5441 continue; 5442 } 5443 5444 startinpos = s-starts; 5445 /* \ - Escapes */ 5446 s++; 5447 c = *s++; 5448 if (s > end) 5449 c = '\0'; /* Invalid after \ */ 5450 5451 if (kind == PyUnicode_WCHAR_KIND) { 5452 assert(i < _PyUnicode_WSTR_LENGTH(v)); 5453 } 5454 else { 5455 /* The only case in which i == ascii_length is a backslash 5456 followed by a newline. 
*/ 5457 assert(i < ascii_length || (i == ascii_length && c == '\n')); 5458 } 5459 5460 switch (c) { 5461 5462 /* \x escapes */ 5463 case '\n': break; 5464 case '\\': WRITE_ASCII_OR_WSTR(kind, data, i++, '\\'); break; 5465 case '\'': WRITE_ASCII_OR_WSTR(kind, data, i++, '\''); break; 5466 case '\"': WRITE_ASCII_OR_WSTR(kind, data, i++, '\"'); break; 5467 case 'b': WRITE_ASCII_OR_WSTR(kind, data, i++, '\b'); break; 5468 /* FF */ 5469 case 'f': WRITE_ASCII_OR_WSTR(kind, data, i++, '\014'); break; 5470 case 't': WRITE_ASCII_OR_WSTR(kind, data, i++, '\t'); break; 5471 case 'n': WRITE_ASCII_OR_WSTR(kind, data, i++, '\n'); break; 5472 case 'r': WRITE_ASCII_OR_WSTR(kind, data, i++, '\r'); break; 5473 /* VT */ 5474 case 'v': WRITE_ASCII_OR_WSTR(kind, data, i++, '\013'); break; 5475 /* BEL, not classic C */ 5476 case 'a': WRITE_ASCII_OR_WSTR(kind, data, i++, '\007'); break; 5477 5478 /* \OOO (octal) escapes */ 5479 case '0': case '1': case '2': case '3': 5480 case '4': case '5': case '6': case '7': 5481 x = s[-1] - '0'; 5482 if (s < end && '0' <= *s && *s <= '7') { 5483 x = (x<<3) + *s++ - '0'; 5484 if (s < end && '0' <= *s && *s <= '7') 5485 x = (x<<3) + *s++ - '0'; 5486 } 5487 WRITE_WSTR(data, i++, x); 5488 break; 5489 5490 /* hex escapes */ 5491 /* \xXX */ 5492 case 'x': 5493 digits = 2; 5494 message = "truncated \\xXX escape"; 5495 goto hexescape; 5496 5497 /* \uXXXX */ 5498 case 'u': 5499 digits = 4; 5500 message = "truncated \\uXXXX escape"; 5501 goto hexescape; 5502 5503 /* \UXXXXXXXX */ 5504 case 'U': 5505 digits = 8; 5506 message = "truncated \\UXXXXXXXX escape"; 5507 hexescape: 5508 chr = 0; 5509 p = PyUnicode_AS_UNICODE(v) + i; 5510 if (s+digits>end) { 5511 endinpos = size; 5512 if (unicode_decode_call_errorhandler( 5513 errors, &errorHandler, 5514 "unicodeescape", "end of string in escape sequence", 5515 &starts, &end, &startinpos, &endinpos, &exc, &s, 5516 &v, &i, &p)) 5517 goto onError; 5518 data = PyUnicode_AS_UNICODE(v); 5519 goto nextByte; 5520 } 5521 for (j 
= 0; j < digits; ++j) { 5522 c = (unsigned char) s[j]; 5523 if (!Py_ISXDIGIT(c)) { 5524 endinpos = (s+j+1)-starts; 5525 p = PyUnicode_AS_UNICODE(v) + i; 5526 if (unicode_decode_call_errorhandler( 5527 errors, &errorHandler, 5528 "unicodeescape", message, 5529 &starts, &end, &startinpos, &endinpos, &exc, &s, 5530 &v, &i, &p)) 5531 goto onError; 5532 data = PyUnicode_AS_UNICODE(v); 5533 goto nextByte; 5534 } 5535 chr = (chr<<4) & ~0xF; 5536 if (c >= '0' && c <= '9') 5537 chr += c - '0'; 5538 else if (c >= 'a' && c <= 'f') 5539 chr += 10 + c - 'a'; 5540 else 5541 chr += 10 + c - 'A'; 5542 } 5543 s += j; 5544 if (chr == 0xffffffff && PyErr_Occurred()) 5545 /* _decoding_error will have already written into the 5546 target buffer. */ 5547 break; 5548 store: 5549 /* when we get here, chr is a 32-bit unicode character */ 5550 if (chr <= 0xffff) 5551 /* UCS-2 character */ 5552 WRITE_WSTR(data, i++, chr); 5553 else if (chr <= 0x10ffff) { 5554 /* UCS-4 character. Either store directly, or as 5555 surrogate pair. 
*/ 5556#ifdef Py_UNICODE_WIDE 5557 WRITE_WSTR(data, i++, chr); 5558#else 5559 chr -= 0x10000L; 5560 WRITE_WSTR(data, i++, 0xD800 + (Py_UNICODE) (chr >> 10)); 5561 WRITE_WSTR(data, i++, 0xDC00 + (Py_UNICODE) (chr & 0x03FF)); 5562#endif 5563 } else { 5564 endinpos = s-starts; 5565 p = PyUnicode_AS_UNICODE(v) + i; 5566 if (unicode_decode_call_errorhandler( 5567 errors, &errorHandler, 5568 "unicodeescape", "illegal Unicode character", 5569 &starts, &end, &startinpos, &endinpos, &exc, &s, 5570 &v, &i, &p)) 5571 goto onError; 5572 data = PyUnicode_AS_UNICODE(v); 5573 } 5574 break; 5575 5576 /* \N{name} */ 5577 case 'N': 5578 message = "malformed \\N character escape"; 5579 if (ucnhash_CAPI == NULL) { 5580 /* load the unicode data module */ 5581 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import( 5582 PyUnicodeData_CAPSULE_NAME, 1); 5583 if (ucnhash_CAPI == NULL) 5584 goto ucnhashError; 5585 } 5586 if (*s == '{') { 5587 const char *start = s+1; 5588 /* look for the closing brace */ 5589 while (*s != '}' && s < end) 5590 s++; 5591 if (s > start && s < end && *s == '}') { 5592 /* found a name. 
look it up in the unicode database */ 5593 message = "unknown Unicode character name"; 5594 s++; 5595 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), 5596 &chr)) 5597 goto store; 5598 } 5599 } 5600 endinpos = s-starts; 5601 p = PyUnicode_AS_UNICODE(v) + i; 5602 if (unicode_decode_call_errorhandler( 5603 errors, &errorHandler, 5604 "unicodeescape", message, 5605 &starts, &end, &startinpos, &endinpos, &exc, &s, 5606 &v, &i, &p)) 5607 goto onError; 5608 data = PyUnicode_AS_UNICODE(v); 5609 break; 5610 5611 default: 5612 if (s > end) { 5613 assert(kind == PyUnicode_WCHAR_KIND); 5614 message = "\\ at end of string"; 5615 s--; 5616 endinpos = s-starts; 5617 p = PyUnicode_AS_UNICODE(v) + i; 5618 if (unicode_decode_call_errorhandler( 5619 errors, &errorHandler, 5620 "unicodeescape", message, 5621 &starts, &end, &startinpos, &endinpos, &exc, &s, 5622 &v, &i, &p)) 5623 goto onError; 5624 data = PyUnicode_AS_UNICODE(v); 5625 } 5626 else { 5627 WRITE_ASCII_OR_WSTR(kind, data, i++, '\\'); 5628 WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char)s[-1]); 5629 } 5630 break; 5631 } 5632 nextByte: 5633 ; 5634 } 5635 /* Ensure the length prediction worked in case of ASCII strings */ 5636 assert(kind == PyUnicode_WCHAR_KIND || i == ascii_length); 5637 5638 if (kind == PyUnicode_WCHAR_KIND) 5639 { 5640 if (PyUnicode_Resize((PyObject**)&v, i) < 0) 5641 goto onError; 5642 } 5643 Py_XDECREF(errorHandler); 5644 Py_XDECREF(exc); 5645#ifndef DONT_MAKE_RESULT_READY 5646 if (_PyUnicode_READY_REPLACE(&v)) { 5647 Py_DECREF(v); 5648 return NULL; 5649 } 5650#endif 5651 return (PyObject *)v; 5652 5653 ucnhashError: 5654 PyErr_SetString( 5655 PyExc_UnicodeError, 5656 "\\N escapes not supported (can't load unicodedata module)" 5657 ); 5658 Py_XDECREF(v); 5659 Py_XDECREF(errorHandler); 5660 Py_XDECREF(exc); 5661 return NULL; 5662 5663 onError: 5664 Py_XDECREF(v); 5665 Py_XDECREF(errorHandler); 5666 Py_XDECREF(exc); 5667 return NULL; 5668} 5669 5670#undef WRITE_ASCII_OR_WSTR 5671#undef 
WRITE_WSTR 5672 5673/* Return a Unicode-Escape string version of the Unicode object. 5674 5675 If quotes is true, the string is enclosed in u"" or u'' quotes as 5676 appropriate. 5677 5678*/ 5679 5680static const char *hexdigits = "0123456789abcdef"; 5681 5682PyObject * 5683PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s, 5684 Py_ssize_t size) 5685{ 5686 PyObject *repr; 5687 char *p; 5688 5689#ifdef Py_UNICODE_WIDE 5690 const Py_ssize_t expandsize = 10; 5691#else 5692 const Py_ssize_t expandsize = 6; 5693#endif 5694 5695 /* XXX(nnorwitz): rather than over-allocating, it would be 5696 better to choose a different scheme. Perhaps scan the 5697 first N-chars of the string and allocate based on that size. 5698 */ 5699 /* Initial allocation is based on the longest-possible unichr 5700 escape. 5701 5702 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source 5703 unichr, so in this case it's the longest unichr escape. In 5704 narrow (UTF-16) builds this is five chars per source unichr 5705 since there are two unichrs in the surrogate pair, so in narrow 5706 (UTF-16) builds it's not the longest unichr escape. 5707 5708 In wide or narrow builds '\uxxxx' is 6 chars per source unichr, 5709 so in the narrow (UTF-16) build case it's the longest unichr 5710 escape. 
5711 */ 5712 5713 if (size == 0) 5714 return PyBytes_FromStringAndSize(NULL, 0); 5715 5716 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize) 5717 return PyErr_NoMemory(); 5718 5719 repr = PyBytes_FromStringAndSize(NULL, 5720 2 5721 + expandsize*size 5722 + 1); 5723 if (repr == NULL) 5724 return NULL; 5725 5726 p = PyBytes_AS_STRING(repr); 5727 5728 while (size-- > 0) { 5729 Py_UNICODE ch = *s++; 5730 5731 /* Escape backslashes */ 5732 if (ch == '\\') { 5733 *p++ = '\\'; 5734 *p++ = (char) ch; 5735 continue; 5736 } 5737 5738#ifdef Py_UNICODE_WIDE 5739 /* Map 21-bit characters to '\U00xxxxxx' */ 5740 else if (ch >= 0x10000) { 5741 *p++ = '\\'; 5742 *p++ = 'U'; 5743 *p++ = hexdigits[(ch >> 28) & 0x0000000F]; 5744 *p++ = hexdigits[(ch >> 24) & 0x0000000F]; 5745 *p++ = hexdigits[(ch >> 20) & 0x0000000F]; 5746 *p++ = hexdigits[(ch >> 16) & 0x0000000F]; 5747 *p++ = hexdigits[(ch >> 12) & 0x0000000F]; 5748 *p++ = hexdigits[(ch >> 8) & 0x0000000F]; 5749 *p++ = hexdigits[(ch >> 4) & 0x0000000F]; 5750 *p++ = hexdigits[ch & 0x0000000F]; 5751 continue; 5752 } 5753#else 5754 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */ 5755 else if (ch >= 0xD800 && ch < 0xDC00) { 5756 Py_UNICODE ch2; 5757 Py_UCS4 ucs; 5758 5759 ch2 = *s++; 5760 size--; 5761 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) { 5762 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000; 5763 *p++ = '\\'; 5764 *p++ = 'U'; 5765 *p++ = hexdigits[(ucs >> 28) & 0x0000000F]; 5766 *p++ = hexdigits[(ucs >> 24) & 0x0000000F]; 5767 *p++ = hexdigits[(ucs >> 20) & 0x0000000F]; 5768 *p++ = hexdigits[(ucs >> 16) & 0x0000000F]; 5769 *p++ = hexdigits[(ucs >> 12) & 0x0000000F]; 5770 *p++ = hexdigits[(ucs >> 8) & 0x0000000F]; 5771 *p++ = hexdigits[(ucs >> 4) & 0x0000000F]; 5772 *p++ = hexdigits[ucs & 0x0000000F]; 5773 continue; 5774 } 5775 /* Fall through: isolated surrogates are copied as-is */ 5776 s--; 5777 size++; 5778 } 5779#endif 5780 5781 /* Map 16-bit characters to '\uxxxx' */ 5782 if (ch >= 256) { 5783 *p++ = '\\'; 5784 *p++ = 
'u'; 5785 *p++ = hexdigits[(ch >> 12) & 0x000F]; 5786 *p++ = hexdigits[(ch >> 8) & 0x000F]; 5787 *p++ = hexdigits[(ch >> 4) & 0x000F]; 5788 *p++ = hexdigits[ch & 0x000F]; 5789 } 5790 5791 /* Map special whitespace to '\t', \n', '\r' */ 5792 else if (ch == '\t') { 5793 *p++ = '\\'; 5794 *p++ = 't'; 5795 } 5796 else if (ch == '\n') { 5797 *p++ = '\\'; 5798 *p++ = 'n'; 5799 } 5800 else if (ch == '\r') { 5801 *p++ = '\\'; 5802 *p++ = 'r'; 5803 } 5804 5805 /* Map non-printable US ASCII to '\xhh' */ 5806 else if (ch < ' ' || ch >= 0x7F) { 5807 *p++ = '\\'; 5808 *p++ = 'x'; 5809 *p++ = hexdigits[(ch >> 4) & 0x000F]; 5810 *p++ = hexdigits[ch & 0x000F]; 5811 } 5812 5813 /* Copy everything else as-is */ 5814 else 5815 *p++ = (char) ch; 5816 } 5817 5818 assert(p - PyBytes_AS_STRING(repr) > 0); 5819 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) 5820 return NULL; 5821 return repr; 5822} 5823 5824PyObject * 5825PyUnicode_AsUnicodeEscapeString(PyObject *unicode) 5826{ 5827 PyObject *s; 5828 if (!PyUnicode_Check(unicode)) { 5829 PyErr_BadArgument(); 5830 return NULL; 5831 } 5832 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode), 5833 PyUnicode_GET_SIZE(unicode)); 5834 return s; 5835} 5836 5837/* --- Raw Unicode Escape Codec ------------------------------------------- */ 5838 5839PyObject * 5840PyUnicode_DecodeRawUnicodeEscape(const char *s, 5841 Py_ssize_t size, 5842 const char *errors) 5843{ 5844 const char *starts = s; 5845 Py_ssize_t startinpos; 5846 Py_ssize_t endinpos; 5847 Py_ssize_t outpos; 5848 PyUnicodeObject *v; 5849 Py_UNICODE *p; 5850 const char *end; 5851 const char *bs; 5852 PyObject *errorHandler = NULL; 5853 PyObject *exc = NULL; 5854 5855 /* Escaped strings will always be longer than the resulting 5856 Unicode string, so we start with size here and then reduce the 5857 length after conversion to the true value. 
(But decoding error 5858 handler might have to resize the string) */ 5859 v = _PyUnicode_New(size); 5860 if (v == NULL) 5861 goto onError; 5862 if (size == 0) 5863 return (PyObject *)v; 5864 p = PyUnicode_AS_UNICODE(v); 5865 end = s + size; 5866 while (s < end) { 5867 unsigned char c; 5868 Py_UCS4 x; 5869 int i; 5870 int count; 5871 5872 /* Non-escape characters are interpreted as Unicode ordinals */ 5873 if (*s != '\\') { 5874 *p++ = (unsigned char)*s++; 5875 continue; 5876 } 5877 startinpos = s-starts; 5878 5879 /* \u-escapes are only interpreted iff the number of leading 5880 backslashes if odd */ 5881 bs = s; 5882 for (;s < end;) { 5883 if (*s != '\\') 5884 break; 5885 *p++ = (unsigned char)*s++; 5886 } 5887 if (((s - bs) & 1) == 0 || 5888 s >= end || 5889 (*s != 'u' && *s != 'U')) { 5890 continue; 5891 } 5892 p--; 5893 count = *s=='u' ? 4 : 8; 5894 s++; 5895 5896 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */ 5897 outpos = p-PyUnicode_AS_UNICODE(v); 5898 for (x = 0, i = 0; i < count; ++i, ++s) { 5899 c = (unsigned char)*s; 5900 if (!Py_ISXDIGIT(c)) { 5901 endinpos = s-starts; 5902 if (unicode_decode_call_errorhandler( 5903 errors, &errorHandler, 5904 "rawunicodeescape", "truncated \\uXXXX", 5905 &starts, &end, &startinpos, &endinpos, &exc, &s, 5906 &v, &outpos, &p)) 5907 goto onError; 5908 goto nextByte; 5909 } 5910 x = (x<<4) & ~0xF; 5911 if (c >= '0' && c <= '9') 5912 x += c - '0'; 5913 else if (c >= 'a' && c <= 'f') 5914 x += 10 + c - 'a'; 5915 else 5916 x += 10 + c - 'A'; 5917 } 5918 if (x <= 0xffff) 5919 /* UCS-2 character */ 5920 *p++ = (Py_UNICODE) x; 5921 else if (x <= 0x10ffff) { 5922 /* UCS-4 character. Either store directly, or as 5923 surrogate pair. 
*/ 5924#ifdef Py_UNICODE_WIDE 5925 *p++ = (Py_UNICODE) x; 5926#else 5927 x -= 0x10000L; 5928 *p++ = 0xD800 + (Py_UNICODE) (x >> 10); 5929 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF); 5930#endif 5931 } else { 5932 endinpos = s-starts; 5933 outpos = p-PyUnicode_AS_UNICODE(v); 5934 if (unicode_decode_call_errorhandler( 5935 errors, &errorHandler, 5936 "rawunicodeescape", "\\Uxxxxxxxx out of range", 5937 &starts, &end, &startinpos, &endinpos, &exc, &s, 5938 &v, &outpos, &p)) 5939 goto onError; 5940 } 5941 nextByte: 5942 ; 5943 } 5944 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0) 5945 goto onError; 5946 Py_XDECREF(errorHandler); 5947 Py_XDECREF(exc); 5948#ifndef DONT_MAKE_RESULT_READY 5949 if (_PyUnicode_READY_REPLACE(&v)) { 5950 Py_DECREF(v); 5951 return NULL; 5952 } 5953#endif 5954 return (PyObject *)v; 5955 5956 onError: 5957 Py_XDECREF(v); 5958 Py_XDECREF(errorHandler); 5959 Py_XDECREF(exc); 5960 return NULL; 5961} 5962 5963PyObject * 5964PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s, 5965 Py_ssize_t size) 5966{ 5967 PyObject *repr; 5968 char *p; 5969 char *q; 5970 5971#ifdef Py_UNICODE_WIDE 5972 const Py_ssize_t expandsize = 10; 5973#else 5974 const Py_ssize_t expandsize = 6; 5975#endif 5976 5977 if (size > PY_SSIZE_T_MAX / expandsize) 5978 return PyErr_NoMemory(); 5979 5980 repr = PyBytes_FromStringAndSize(NULL, expandsize * size); 5981 if (repr == NULL) 5982 return NULL; 5983 if (size == 0) 5984 return repr; 5985 5986 p = q = PyBytes_AS_STRING(repr); 5987 while (size-- > 0) { 5988 Py_UNICODE ch = *s++; 5989#ifdef Py_UNICODE_WIDE 5990 /* Map 32-bit characters to '\Uxxxxxxxx' */ 5991 if (ch >= 0x10000) { 5992 *p++ = '\\'; 5993 *p++ = 'U'; 5994 *p++ = hexdigits[(ch >> 28) & 0xf]; 5995 *p++ = hexdigits[(ch >> 24) & 0xf]; 5996 *p++ = hexdigits[(ch >> 20) & 0xf]; 5997 *p++ = hexdigits[(ch >> 16) & 0xf]; 5998 *p++ = hexdigits[(ch >> 12) & 0xf]; 5999 *p++ = hexdigits[(ch >> 8) & 0xf]; 6000 *p++ = hexdigits[(ch >> 4) & 0xf]; 6001 *p++ = 
hexdigits[ch & 15]; 6002 } 6003 else 6004#else 6005 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */ 6006 if (ch >= 0xD800 && ch < 0xDC00) { 6007 Py_UNICODE ch2; 6008 Py_UCS4 ucs; 6009 6010 ch2 = *s++; 6011 size--; 6012 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) { 6013 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000; 6014 *p++ = '\\'; 6015 *p++ = 'U'; 6016 *p++ = hexdigits[(ucs >> 28) & 0xf]; 6017 *p++ = hexdigits[(ucs >> 24) & 0xf]; 6018 *p++ = hexdigits[(ucs >> 20) & 0xf]; 6019 *p++ = hexdigits[(ucs >> 16) & 0xf]; 6020 *p++ = hexdigits[(ucs >> 12) & 0xf]; 6021 *p++ = hexdigits[(ucs >> 8) & 0xf]; 6022 *p++ = hexdigits[(ucs >> 4) & 0xf]; 6023 *p++ = hexdigits[ucs & 0xf]; 6024 continue; 6025 } 6026 /* Fall through: isolated surrogates are copied as-is */ 6027 s--; 6028 size++; 6029 } 6030#endif 6031 /* Map 16-bit characters to '\uxxxx' */ 6032 if (ch >= 256) { 6033 *p++ = '\\'; 6034 *p++ = 'u'; 6035 *p++ = hexdigits[(ch >> 12) & 0xf]; 6036 *p++ = hexdigits[(ch >> 8) & 0xf]; 6037 *p++ = hexdigits[(ch >> 4) & 0xf]; 6038 *p++ = hexdigits[ch & 15]; 6039 } 6040 /* Copy everything else as-is */ 6041 else 6042 *p++ = (char) ch; 6043 } 6044 size = p - q; 6045 6046 assert(size > 0); 6047 if (_PyBytes_Resize(&repr, size) < 0) 6048 return NULL; 6049 return repr; 6050} 6051 6052PyObject * 6053PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode) 6054{ 6055 PyObject *s; 6056 if (!PyUnicode_Check(unicode)) { 6057 PyErr_BadArgument(); 6058 return NULL; 6059 } 6060 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode), 6061 PyUnicode_GET_SIZE(unicode)); 6062 6063 return s; 6064} 6065 6066/* --- Unicode Internal Codec ------------------------------------------- */ 6067 6068PyObject * 6069_PyUnicode_DecodeUnicodeInternal(const char *s, 6070 Py_ssize_t size, 6071 const char *errors) 6072{ 6073 const char *starts = s; 6074 Py_ssize_t startinpos; 6075 Py_ssize_t endinpos; 6076 Py_ssize_t outpos; 6077 PyUnicodeObject *v; 6078 Py_UNICODE *p; 6079 const char *end; 6080 
const char *reason; 6081 PyObject *errorHandler = NULL; 6082 PyObject *exc = NULL; 6083 6084#ifdef Py_UNICODE_WIDE 6085 Py_UNICODE unimax = PyUnicode_GetMax(); 6086#endif 6087 6088 /* XXX overflow detection missing */ 6089 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE); 6090 if (v == NULL) 6091 goto onError; 6092 /* Intentionally PyUnicode_GET_SIZE instead of PyUnicode_GET_LENGTH 6093 as string was created with the old API. */ 6094 if (PyUnicode_GET_SIZE(v) == 0) 6095 return (PyObject *)v; 6096 p = PyUnicode_AS_UNICODE(v); 6097 end = s + size; 6098 6099 while (s < end) { 6100 memcpy(p, s, sizeof(Py_UNICODE)); 6101 /* We have to sanity check the raw data, otherwise doom looms for 6102 some malformed UCS-4 data. */ 6103 if ( 6104#ifdef Py_UNICODE_WIDE 6105 *p > unimax || *p < 0 || 6106#endif 6107 end-s < Py_UNICODE_SIZE 6108 ) 6109 { 6110 startinpos = s - starts; 6111 if (end-s < Py_UNICODE_SIZE) { 6112 endinpos = end-starts; 6113 reason = "truncated input"; 6114 } 6115 else { 6116 endinpos = s - starts + Py_UNICODE_SIZE; 6117 reason = "illegal code point (> 0x10FFFF)"; 6118 } 6119 outpos = p - PyUnicode_AS_UNICODE(v); 6120 if (unicode_decode_call_errorhandler( 6121 errors, &errorHandler, 6122 "unicode_internal", reason, 6123 &starts, &end, &startinpos, &endinpos, &exc, &s, 6124 &v, &outpos, &p)) { 6125 goto onError; 6126 } 6127 } 6128 else { 6129 p++; 6130 s += Py_UNICODE_SIZE; 6131 } 6132 } 6133 6134 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0) 6135 goto onError; 6136 Py_XDECREF(errorHandler); 6137 Py_XDECREF(exc); 6138#ifndef DONT_MAKE_RESULT_READY 6139 if (_PyUnicode_READY_REPLACE(&v)) { 6140 Py_DECREF(v); 6141 return NULL; 6142 } 6143#endif 6144 return (PyObject *)v; 6145 6146 onError: 6147 Py_XDECREF(v); 6148 Py_XDECREF(errorHandler); 6149 Py_XDECREF(exc); 6150 return NULL; 6151} 6152 6153/* --- Latin-1 Codec ------------------------------------------------------ */ 6154 6155PyObject * 6156PyUnicode_DecodeLatin1(const 
char *s, 6157 Py_ssize_t size, 6158 const char *errors) 6159{ 6160 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */ 6161 return _PyUnicode_FromUCS1((unsigned char*)s, size); 6162} 6163 6164/* create or adjust a UnicodeEncodeError */ 6165static void 6166make_encode_exception(PyObject **exceptionObject, 6167 const char *encoding, 6168 const Py_UNICODE *unicode, Py_ssize_t size, 6169 Py_ssize_t startpos, Py_ssize_t endpos, 6170 const char *reason) 6171{ 6172 if (*exceptionObject == NULL) { 6173 *exceptionObject = PyUnicodeEncodeError_Create( 6174 encoding, unicode, size, startpos, endpos, reason); 6175 } 6176 else { 6177 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos)) 6178 goto onError; 6179 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos)) 6180 goto onError; 6181 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason)) 6182 goto onError; 6183 return; 6184 onError: 6185 Py_DECREF(*exceptionObject); 6186 *exceptionObject = NULL; 6187 } 6188} 6189 6190/* raises a UnicodeEncodeError */ 6191static void 6192raise_encode_exception(PyObject **exceptionObject, 6193 const char *encoding, 6194 const Py_UNICODE *unicode, Py_ssize_t size, 6195 Py_ssize_t startpos, Py_ssize_t endpos, 6196 const char *reason) 6197{ 6198 make_encode_exception(exceptionObject, 6199 encoding, unicode, size, startpos, endpos, reason); 6200 if (*exceptionObject != NULL) 6201 PyCodec_StrictErrors(*exceptionObject); 6202} 6203 6204/* error handling callback helper: 6205 build arguments, call the callback and check the arguments, 6206 put the result into newpos and return the replacement string, which 6207 has to be freed by the caller */ 6208static PyObject * 6209unicode_encode_call_errorhandler(const char *errors, 6210 PyObject **errorHandler, 6211 const char *encoding, const char *reason, 6212 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject, 6213 Py_ssize_t startpos, Py_ssize_t endpos, 6214 Py_ssize_t *newpos) 6215{ 6216 static char 
*argparse = "On;encoding error handler must return (str/bytes, int) tuple"; 6217 6218 PyObject *restuple; 6219 PyObject *resunicode; 6220 6221 if (*errorHandler == NULL) { 6222 *errorHandler = PyCodec_LookupError(errors); 6223 if (*errorHandler == NULL) 6224 return NULL; 6225 } 6226 6227 make_encode_exception(exceptionObject, 6228 encoding, unicode, size, startpos, endpos, reason); 6229 if (*exceptionObject == NULL) 6230 return NULL; 6231 6232 restuple = PyObject_CallFunctionObjArgs( 6233 *errorHandler, *exceptionObject, NULL); 6234 if (restuple == NULL) 6235 return NULL; 6236 if (!PyTuple_Check(restuple)) { 6237 PyErr_SetString(PyExc_TypeError, &argparse[3]); 6238 Py_DECREF(restuple); 6239 return NULL; 6240 } 6241 if (!PyArg_ParseTuple(restuple, argparse, 6242 &resunicode, newpos)) { 6243 Py_DECREF(restuple); 6244 return NULL; 6245 } 6246 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) { 6247 PyErr_SetString(PyExc_TypeError, &argparse[3]); 6248 Py_DECREF(restuple); 6249 return NULL; 6250 } 6251 if (*newpos<0) 6252 *newpos = size+*newpos; 6253 if (*newpos<0 || *newpos>size) { 6254 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos); 6255 Py_DECREF(restuple); 6256 return NULL; 6257 } 6258 Py_INCREF(resunicode); 6259 Py_DECREF(restuple); 6260 return resunicode; 6261} 6262 6263static PyObject * 6264unicode_encode_ucs1(const Py_UNICODE *p, 6265 Py_ssize_t size, 6266 const char *errors, 6267 int limit) 6268{ 6269 /* output object */ 6270 PyObject *res; 6271 /* pointers to the beginning and end+1 of input */ 6272 const Py_UNICODE *startp = p; 6273 const Py_UNICODE *endp = p + size; 6274 /* pointer to the beginning of the unencodable characters */ 6275 /* const Py_UNICODE *badp = NULL; */ 6276 /* pointer into the output */ 6277 char *str; 6278 /* current output position */ 6279 Py_ssize_t ressize; 6280 const char *encoding = (limit == 256) ? "latin-1" : "ascii"; 6281 const char *reason = (limit == 256) ? 
"ordinal not in range(256)" : "ordinal not in range(128)"; 6282 PyObject *errorHandler = NULL; 6283 PyObject *exc = NULL; 6284 /* the following variable is used for caching string comparisons 6285 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */ 6286 int known_errorHandler = -1; 6287 6288 /* allocate enough for a simple encoding without 6289 replacements, if we need more, we'll resize */ 6290 if (size == 0) 6291 return PyBytes_FromStringAndSize(NULL, 0); 6292 res = PyBytes_FromStringAndSize(NULL, size); 6293 if (res == NULL) 6294 return NULL; 6295 str = PyBytes_AS_STRING(res); 6296 ressize = size; 6297 6298 while (p<endp) { 6299 Py_UNICODE c = *p; 6300 6301 /* can we encode this? */ 6302 if (c<limit) { 6303 /* no overflow check, because we know that the space is enough */ 6304 *str++ = (char)c; 6305 ++p; 6306 } 6307 else { 6308 Py_ssize_t unicodepos = p-startp; 6309 Py_ssize_t requiredsize; 6310 PyObject *repunicode; 6311 Py_ssize_t repsize; 6312 Py_ssize_t newpos; 6313 Py_ssize_t respos; 6314 Py_UNICODE *uni2; 6315 /* startpos for collecting unencodable chars */ 6316 const Py_UNICODE *collstart = p; 6317 const Py_UNICODE *collend = p; 6318 /* find all unecodable characters */ 6319 while ((collend < endp) && ((*collend)>=limit)) 6320 ++collend; 6321 /* cache callback name lookup (if not done yet, i.e. 
it's the first error) */ 6322 if (known_errorHandler==-1) { 6323 if ((errors==NULL) || (!strcmp(errors, "strict"))) 6324 known_errorHandler = 1; 6325 else if (!strcmp(errors, "replace")) 6326 known_errorHandler = 2; 6327 else if (!strcmp(errors, "ignore")) 6328 known_errorHandler = 3; 6329 else if (!strcmp(errors, "xmlcharrefreplace")) 6330 known_errorHandler = 4; 6331 else 6332 known_errorHandler = 0; 6333 } 6334 switch (known_errorHandler) { 6335 case 1: /* strict */ 6336 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason); 6337 goto onError; 6338 case 2: /* replace */ 6339 while (collstart++<collend) 6340 *str++ = '?'; /* fall through */ 6341 case 3: /* ignore */ 6342 p = collend; 6343 break; 6344 case 4: /* xmlcharrefreplace */ 6345 respos = str - PyBytes_AS_STRING(res); 6346 /* determine replacement size (temporarily (mis)uses p) */ 6347 for (p = collstart, repsize = 0; p < collend; ++p) { 6348 if (*p<10) 6349 repsize += 2+1+1; 6350 else if (*p<100) 6351 repsize += 2+2+1; 6352 else if (*p<1000) 6353 repsize += 2+3+1; 6354 else if (*p<10000) 6355 repsize += 2+4+1; 6356#ifndef Py_UNICODE_WIDE 6357 else 6358 repsize += 2+5+1; 6359#else 6360 else if (*p<100000) 6361 repsize += 2+5+1; 6362 else if (*p<1000000) 6363 repsize += 2+6+1; 6364 else 6365 repsize += 2+7+1; 6366#endif 6367 } 6368 requiredsize = respos+repsize+(endp-collend); 6369 if (requiredsize > ressize) { 6370 if (requiredsize<2*ressize) 6371 requiredsize = 2*ressize; 6372 if (_PyBytes_Resize(&res, requiredsize)) 6373 goto onError; 6374 str = PyBytes_AS_STRING(res) + respos; 6375 ressize = requiredsize; 6376 } 6377 /* generate replacement (temporarily (mis)uses p) */ 6378 for (p = collstart; p < collend; ++p) { 6379 str += sprintf(str, "&#%d;", (int)*p); 6380 } 6381 p = collend; 6382 break; 6383 default: 6384 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler, 6385 encoding, reason, startp, size, &exc, 6386 collstart-startp, collend-startp, 
&newpos); 6387 if (repunicode == NULL) 6388 goto onError; 6389 if (PyBytes_Check(repunicode)) { 6390 /* Directly copy bytes result to output. */ 6391 repsize = PyBytes_Size(repunicode); 6392 if (repsize > 1) { 6393 /* Make room for all additional bytes. */ 6394 respos = str - PyBytes_AS_STRING(res); 6395 if (_PyBytes_Resize(&res, ressize+repsize-1)) { 6396 Py_DECREF(repunicode); 6397 goto onError; 6398 } 6399 str = PyBytes_AS_STRING(res) + respos; 6400 ressize += repsize-1; 6401 } 6402 memcpy(str, PyBytes_AsString(repunicode), repsize); 6403 str += repsize; 6404 p = startp + newpos; 6405 Py_DECREF(repunicode); 6406 break; 6407 } 6408 /* need more space? (at least enough for what we 6409 have+the replacement+the rest of the string, so 6410 we won't have to check space for encodable characters) */ 6411 respos = str - PyBytes_AS_STRING(res); 6412 repsize = PyUnicode_GET_SIZE(repunicode); 6413 requiredsize = respos+repsize+(endp-collend); 6414 if (requiredsize > ressize) { 6415 if (requiredsize<2*ressize) 6416 requiredsize = 2*ressize; 6417 if (_PyBytes_Resize(&res, requiredsize)) { 6418 Py_DECREF(repunicode); 6419 goto onError; 6420 } 6421 str = PyBytes_AS_STRING(res) + respos; 6422 ressize = requiredsize; 6423 } 6424 /* check if there is anything unencodable in the replacement 6425 and copy it to the output */ 6426 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) { 6427 c = *uni2; 6428 if (c >= limit) { 6429 raise_encode_exception(&exc, encoding, startp, size, 6430 unicodepos, unicodepos+1, reason); 6431 Py_DECREF(repunicode); 6432 goto onError; 6433 } 6434 *str = (char)c; 6435 } 6436 p = startp + newpos; 6437 Py_DECREF(repunicode); 6438 } 6439 } 6440 } 6441 /* Resize if we allocated to much */ 6442 size = str - PyBytes_AS_STRING(res); 6443 if (size < ressize) { /* If this falls res will be NULL */ 6444 assert(size >= 0); 6445 if (_PyBytes_Resize(&res, size) < 0) 6446 goto onError; 6447 } 6448 6449 Py_XDECREF(errorHandler); 6450 
Py_XDECREF(exc); 6451 return res; 6452 6453 onError: 6454 Py_XDECREF(res); 6455 Py_XDECREF(errorHandler); 6456 Py_XDECREF(exc); 6457 return NULL; 6458} 6459 6460PyObject * 6461PyUnicode_EncodeLatin1(const Py_UNICODE *p, 6462 Py_ssize_t size, 6463 const char *errors) 6464{ 6465 return unicode_encode_ucs1(p, size, errors, 256); 6466} 6467 6468PyObject * 6469_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors) 6470{ 6471 if (!PyUnicode_Check(unicode)) { 6472 PyErr_BadArgument(); 6473 return NULL; 6474 } 6475 if (PyUnicode_READY(unicode) == -1) 6476 return NULL; 6477 /* Fast path: if it is a one-byte string, construct 6478 bytes object directly. */ 6479 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) 6480 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode), 6481 PyUnicode_GET_LENGTH(unicode)); 6482 /* Non-Latin-1 characters present. Defer to above function to 6483 raise the exception. */ 6484 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode), 6485 PyUnicode_GET_SIZE(unicode), 6486 errors); 6487} 6488 6489PyObject* 6490PyUnicode_AsLatin1String(PyObject *unicode) 6491{ 6492 return _PyUnicode_AsLatin1String(unicode, NULL); 6493} 6494 6495/* --- 7-bit ASCII Codec -------------------------------------------------- */ 6496 6497PyObject * 6498PyUnicode_DecodeASCII(const char *s, 6499 Py_ssize_t size, 6500 const char *errors) 6501{ 6502 const char *starts = s; 6503 PyUnicodeObject *v; 6504 Py_UNICODE *u; 6505 Py_ssize_t startinpos; 6506 Py_ssize_t endinpos; 6507 Py_ssize_t outpos; 6508 const char *e; 6509 int has_error; 6510 const unsigned char *p = (const unsigned char *)s; 6511 const unsigned char *end = p + size; 6512 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK); 6513 PyObject *errorHandler = NULL; 6514 PyObject *exc = NULL; 6515 6516 /* ASCII is equivalent to the first 128 ordinals in Unicode. 
*/ 6517 if (size == 1 && (unsigned char)s[0] < 128) 6518 return get_latin1_char((unsigned char)s[0]); 6519 6520 has_error = 0; 6521 while (p < end && !has_error) { 6522 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for 6523 an explanation. */ 6524 if (!((size_t) p & LONG_PTR_MASK)) { 6525 /* Help register allocation */ 6526 register const unsigned char *_p = p; 6527 while (_p < aligned_end) { 6528 unsigned long value = *(unsigned long *) _p; 6529 if (value & ASCII_CHAR_MASK) { 6530 has_error = 1; 6531 break; 6532 } 6533 _p += SIZEOF_LONG; 6534 } 6535 if (_p == end) 6536 break; 6537 if (has_error) 6538 break; 6539 p = _p; 6540 } 6541 if (*p & 0x80) { 6542 has_error = 1; 6543 break; 6544 } 6545 else { 6546 ++p; 6547 } 6548 } 6549 if (!has_error) 6550 return unicode_fromascii((const unsigned char *)s, size); 6551 6552 v = _PyUnicode_New(size); 6553 if (v == NULL) 6554 goto onError; 6555 if (size == 0) 6556 return (PyObject *)v; 6557 u = PyUnicode_AS_UNICODE(v); 6558 e = s + size; 6559 while (s < e) { 6560 register unsigned char c = (unsigned char)*s; 6561 if (c < 128) { 6562 *u++ = c; 6563 ++s; 6564 } 6565 else { 6566 startinpos = s-starts; 6567 endinpos = startinpos + 1; 6568 outpos = u - (Py_UNICODE *)PyUnicode_AS_UNICODE(v); 6569 if (unicode_decode_call_errorhandler( 6570 errors, &errorHandler, 6571 "ascii", "ordinal not in range(128)", 6572 &starts, &e, &startinpos, &endinpos, &exc, &s, 6573 &v, &outpos, &u)) 6574 goto onError; 6575 } 6576 } 6577 if (u - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v)) 6578 if (PyUnicode_Resize((PyObject**)&v, u - PyUnicode_AS_UNICODE(v)) < 0) 6579 goto onError; 6580 Py_XDECREF(errorHandler); 6581 Py_XDECREF(exc); 6582#ifndef DONT_MAKE_RESULT_READY 6583 if (_PyUnicode_READY_REPLACE(&v)) { 6584 Py_DECREF(v); 6585 return NULL; 6586 } 6587#endif 6588 return (PyObject *)v; 6589 6590 onError: 6591 Py_XDECREF(v); 6592 Py_XDECREF(errorHandler); 6593 Py_XDECREF(exc); 6594 return NULL; 6595} 6596 6597PyObject * 
6598PyUnicode_EncodeASCII(const Py_UNICODE *p, 6599 Py_ssize_t size, 6600 const char *errors) 6601{ 6602 return unicode_encode_ucs1(p, size, errors, 128); 6603} 6604 6605PyObject * 6606_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors) 6607{ 6608 if (!PyUnicode_Check(unicode)) { 6609 PyErr_BadArgument(); 6610 return NULL; 6611 } 6612 if (PyUnicode_READY(unicode) == -1) 6613 return NULL; 6614 /* Fast path: if it is an ASCII-only string, construct bytes object 6615 directly. Else defer to above function to raise the exception. */ 6616 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128) 6617 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode), 6618 PyUnicode_GET_LENGTH(unicode)); 6619 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode), 6620 PyUnicode_GET_SIZE(unicode), 6621 errors); 6622} 6623 6624PyObject * 6625PyUnicode_AsASCIIString(PyObject *unicode) 6626{ 6627 return _PyUnicode_AsASCIIString(unicode, NULL); 6628} 6629 6630#ifdef HAVE_MBCS 6631 6632/* --- MBCS codecs for Windows -------------------------------------------- */ 6633 6634#if SIZEOF_INT < SIZEOF_SIZE_T 6635#define NEED_RETRY 6636#endif 6637 6638/* XXX This code is limited to "true" double-byte encodings, as 6639 a) it assumes an incomplete character consists of a single byte, and 6640 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte 6641 encodings, see IsDBCSLeadByteEx documentation. */ 6642 6643static int 6644is_dbcs_lead_byte(const char *s, int offset) 6645{ 6646 const char *curr = s + offset; 6647 6648 if (IsDBCSLeadByte(*curr)) { 6649 const char *prev = CharPrev(s, curr); 6650 return (prev == curr) || !IsDBCSLeadByte(*prev) || (curr - prev == 2); 6651 } 6652 return 0; 6653} 6654 6655/* 6656 * Decode MBCS string into unicode object. If 'final' is set, converts 6657 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise. 
6658 */ 6659static int 6660decode_mbcs(PyUnicodeObject **v, 6661 const char *s, /* MBCS string */ 6662 int size, /* sizeof MBCS string */ 6663 int final, 6664 const char *errors) 6665{ 6666 Py_UNICODE *p; 6667 Py_ssize_t n; 6668 DWORD usize; 6669 DWORD flags; 6670 6671 assert(size >= 0); 6672 6673 /* check and handle 'errors' arg */ 6674 if (errors==NULL || strcmp(errors, "strict")==0) 6675 flags = MB_ERR_INVALID_CHARS; 6676 else if (strcmp(errors, "ignore")==0) 6677 flags = 0; 6678 else { 6679 PyErr_Format(PyExc_ValueError, 6680 "mbcs encoding does not support errors='%s'", 6681 errors); 6682 return -1; 6683 } 6684 6685 /* Skip trailing lead-byte unless 'final' is set */ 6686 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1)) 6687 --size; 6688 6689 /* First get the size of the result */ 6690 if (size > 0) { 6691 usize = MultiByteToWideChar(CP_ACP, flags, s, size, NULL, 0); 6692 if (usize==0) 6693 goto mbcs_decode_error; 6694 } else 6695 usize = 0; 6696 6697 if (*v == NULL) { 6698 /* Create unicode object */ 6699 *v = _PyUnicode_New(usize); 6700 if (*v == NULL) 6701 return -1; 6702 n = 0; 6703 } 6704 else { 6705 /* Extend unicode object */ 6706 n = PyUnicode_GET_SIZE(*v); 6707 if (PyUnicode_Resize((PyObject**)v, n + usize) < 0) 6708 return -1; 6709 } 6710 6711 /* Do the conversion */ 6712 if (usize > 0) { 6713 p = PyUnicode_AS_UNICODE(*v) + n; 6714 if (0 == MultiByteToWideChar(CP_ACP, flags, s, size, p, usize)) { 6715 goto mbcs_decode_error; 6716 } 6717 } 6718 return size; 6719 6720mbcs_decode_error: 6721 /* If the last error was ERROR_NO_UNICODE_TRANSLATION, then 6722 we raise a UnicodeDecodeError - else it is a 'generic' 6723 windows error 6724 */ 6725 if (GetLastError()==ERROR_NO_UNICODE_TRANSLATION) { 6726 /* Ideally, we should get reason from FormatMessage - this 6727 is the Windows 2000 English version of the message 6728 */ 6729 PyObject *exc = NULL; 6730 const char *reason = "No mapping for the Unicode character exists " 6731 "in the target 
multi-byte code page."; 6732 make_decode_exception(&exc, "mbcs", s, size, 0, 0, reason); 6733 if (exc != NULL) { 6734 PyCodec_StrictErrors(exc); 6735 Py_DECREF(exc); 6736 } 6737 } else { 6738 PyErr_SetFromWindowsErrWithFilename(0, NULL); 6739 } 6740 return -1; 6741} 6742 6743PyObject * 6744PyUnicode_DecodeMBCSStateful(const char *s, 6745 Py_ssize_t size, 6746 const char *errors, 6747 Py_ssize_t *consumed) 6748{ 6749 PyUnicodeObject *v = NULL; 6750 int done; 6751 6752 if (consumed) 6753 *consumed = 0; 6754 6755#ifdef NEED_RETRY 6756 retry: 6757 if (size > INT_MAX) 6758 done = decode_mbcs(&v, s, INT_MAX, 0, errors); 6759 else 6760#endif 6761 done = decode_mbcs(&v, s, (int)size, !consumed, errors); 6762 6763 if (done < 0) { 6764 Py_XDECREF(v); 6765 return NULL; 6766 } 6767 6768 if (consumed) 6769 *consumed += done; 6770 6771#ifdef NEED_RETRY 6772 if (size > INT_MAX) { 6773 s += done; 6774 size -= done; 6775 goto retry; 6776 } 6777#endif 6778#ifndef DONT_MAKE_RESULT_READY 6779 if (_PyUnicode_READY_REPLACE(&v)) { 6780 Py_DECREF(v); 6781 return NULL; 6782 } 6783#endif 6784 return (PyObject *)v; 6785} 6786 6787PyObject * 6788PyUnicode_DecodeMBCS(const char *s, 6789 Py_ssize_t size, 6790 const char *errors) 6791{ 6792 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL); 6793} 6794 6795/* 6796 * Convert unicode into string object (MBCS). 6797 * Returns 0 if succeed, -1 otherwise. 
6798 */ 6799static int 6800encode_mbcs(PyObject **repr, 6801 const Py_UNICODE *p, /* unicode */ 6802 int size, /* size of unicode */ 6803 const char* errors) 6804{ 6805 BOOL usedDefaultChar = FALSE; 6806 BOOL *pusedDefaultChar; 6807 int mbcssize; 6808 Py_ssize_t n; 6809 PyObject *exc = NULL; 6810 DWORD flags; 6811 6812 assert(size >= 0); 6813 6814 /* check and handle 'errors' arg */ 6815 if (errors==NULL || strcmp(errors, "strict")==0) { 6816 flags = WC_NO_BEST_FIT_CHARS; 6817 pusedDefaultChar = &usedDefaultChar; 6818 } else if (strcmp(errors, "replace")==0) { 6819 flags = 0; 6820 pusedDefaultChar = NULL; 6821 } else { 6822 PyErr_Format(PyExc_ValueError, 6823 "mbcs encoding does not support errors='%s'", 6824 errors); 6825 return -1; 6826 } 6827 6828 /* First get the size of the result */ 6829 if (size > 0) { 6830 mbcssize = WideCharToMultiByte(CP_ACP, flags, p, size, NULL, 0, 6831 NULL, pusedDefaultChar); 6832 if (mbcssize == 0) { 6833 PyErr_SetFromWindowsErrWithFilename(0, NULL); 6834 return -1; 6835 } 6836 /* If we used a default char, then we failed! 
*/ 6837 if (pusedDefaultChar && *pusedDefaultChar) 6838 goto mbcs_encode_error; 6839 } else { 6840 mbcssize = 0; 6841 } 6842 6843 if (*repr == NULL) { 6844 /* Create string object */ 6845 *repr = PyBytes_FromStringAndSize(NULL, mbcssize); 6846 if (*repr == NULL) 6847 return -1; 6848 n = 0; 6849 } 6850 else { 6851 /* Extend string object */ 6852 n = PyBytes_Size(*repr); 6853 if (_PyBytes_Resize(repr, n + mbcssize) < 0) 6854 return -1; 6855 } 6856 6857 /* Do the conversion */ 6858 if (size > 0) { 6859 char *s = PyBytes_AS_STRING(*repr) + n; 6860 if (0 == WideCharToMultiByte(CP_ACP, flags, p, size, s, mbcssize, 6861 NULL, pusedDefaultChar)) { 6862 PyErr_SetFromWindowsErrWithFilename(0, NULL); 6863 return -1; 6864 } 6865 if (pusedDefaultChar && *pusedDefaultChar) 6866 goto mbcs_encode_error; 6867 } 6868 return 0; 6869 6870mbcs_encode_error: 6871 raise_encode_exception(&exc, "mbcs", p, size, 0, 0, "invalid character"); 6872 Py_XDECREF(exc); 6873 return -1; 6874} 6875 6876PyObject * 6877PyUnicode_EncodeMBCS(const Py_UNICODE *p, 6878 Py_ssize_t size, 6879 const char *errors) 6880{ 6881 PyObject *repr = NULL; 6882 int ret; 6883 6884#ifdef NEED_RETRY 6885 retry: 6886 if (size > INT_MAX) 6887 ret = encode_mbcs(&repr, p, INT_MAX, errors); 6888 else 6889#endif 6890 ret = encode_mbcs(&repr, p, (int)size, errors); 6891 6892 if (ret < 0) { 6893 Py_XDECREF(repr); 6894 return NULL; 6895 } 6896 6897#ifdef NEED_RETRY 6898 if (size > INT_MAX) { 6899 p += INT_MAX; 6900 size -= INT_MAX; 6901 goto retry; 6902 } 6903#endif 6904 6905 return repr; 6906} 6907 6908PyObject * 6909PyUnicode_AsMBCSString(PyObject *unicode) 6910{ 6911 if (!PyUnicode_Check(unicode)) { 6912 PyErr_BadArgument(); 6913 return NULL; 6914 } 6915 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode), 6916 PyUnicode_GET_SIZE(unicode), 6917 NULL); 6918} 6919 6920#undef NEED_RETRY 6921 6922#endif /* HAVE_MBCS */ 6923 6924/* --- Character Mapping Codec -------------------------------------------- */ 6925 6926PyObject * 
6927PyUnicode_DecodeCharmap(const char *s, 6928 Py_ssize_t size, 6929 PyObject *mapping, 6930 const char *errors) 6931{ 6932 const char *starts = s; 6933 Py_ssize_t startinpos; 6934 Py_ssize_t endinpos; 6935 Py_ssize_t outpos; 6936 const char *e; 6937 PyUnicodeObject *v; 6938 Py_UNICODE *p; 6939 Py_ssize_t extrachars = 0; 6940 PyObject *errorHandler = NULL; 6941 PyObject *exc = NULL; 6942 Py_UNICODE *mapstring = NULL; 6943 Py_ssize_t maplen = 0; 6944 6945 /* Default to Latin-1 */ 6946 if (mapping == NULL) 6947 return PyUnicode_DecodeLatin1(s, size, errors); 6948 6949 v = _PyUnicode_New(size); 6950 if (v == NULL) 6951 goto onError; 6952 if (size == 0) 6953 return (PyObject *)v; 6954 p = PyUnicode_AS_UNICODE(v); 6955 e = s + size; 6956 if (PyUnicode_CheckExact(mapping)) { 6957 mapstring = PyUnicode_AS_UNICODE(mapping); 6958 maplen = PyUnicode_GET_SIZE(mapping); 6959 while (s < e) { 6960 unsigned char ch = *s; 6961 Py_UNICODE x = 0xfffe; /* illegal value */ 6962 6963 if (ch < maplen) 6964 x = mapstring[ch]; 6965 6966 if (x == 0xfffe) { 6967 /* undefined mapping */ 6968 outpos = p-PyUnicode_AS_UNICODE(v); 6969 startinpos = s-starts; 6970 endinpos = startinpos+1; 6971 if (unicode_decode_call_errorhandler( 6972 errors, &errorHandler, 6973 "charmap", "character maps to <undefined>", 6974 &starts, &e, &startinpos, &endinpos, &exc, &s, 6975 &v, &outpos, &p)) { 6976 goto onError; 6977 } 6978 continue; 6979 } 6980 *p++ = x; 6981 ++s; 6982 } 6983 } 6984 else { 6985 while (s < e) { 6986 unsigned char ch = *s; 6987 PyObject *w, *x; 6988 6989 /* Get mapping (char ordinal -> integer, Unicode char or None) */ 6990 w = PyLong_FromLong((long)ch); 6991 if (w == NULL) 6992 goto onError; 6993 x = PyObject_GetItem(mapping, w); 6994 Py_DECREF(w); 6995 if (x == NULL) { 6996 if (PyErr_ExceptionMatches(PyExc_LookupError)) { 6997 /* No mapping found means: mapping is undefined. 
*/ 6998 PyErr_Clear(); 6999 x = Py_None; 7000 Py_INCREF(x); 7001 } else 7002 goto onError; 7003 } 7004 7005 /* Apply mapping */ 7006 if (PyLong_Check(x)) { 7007 long value = PyLong_AS_LONG(x); 7008 if (value < 0 || value > 65535) { 7009 PyErr_SetString(PyExc_TypeError, 7010 "character mapping must be in range(65536)"); 7011 Py_DECREF(x); 7012 goto onError; 7013 } 7014 *p++ = (Py_UNICODE)value; 7015 } 7016 else if (x == Py_None) { 7017 /* undefined mapping */ 7018 outpos = p-PyUnicode_AS_UNICODE(v); 7019 startinpos = s-starts; 7020 endinpos = startinpos+1; 7021 if (unicode_decode_call_errorhandler( 7022 errors, &errorHandler, 7023 "charmap", "character maps to <undefined>", 7024 &starts, &e, &startinpos, &endinpos, &exc, &s, 7025 &v, &outpos, &p)) { 7026 Py_DECREF(x); 7027 goto onError; 7028 } 7029 Py_DECREF(x); 7030 continue; 7031 } 7032 else if (PyUnicode_Check(x)) { 7033 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x); 7034 7035 if (targetsize == 1) 7036 /* 1-1 mapping */ 7037 *p++ = *PyUnicode_AS_UNICODE(x); 7038 7039 else if (targetsize > 1) { 7040 /* 1-n mapping */ 7041 if (targetsize > extrachars) { 7042 /* resize first */ 7043 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v); 7044 Py_ssize_t needed = (targetsize - extrachars) + \ 7045 (targetsize << 2); 7046 extrachars += needed; 7047 /* XXX overflow detection missing */ 7048 if (PyUnicode_Resize((PyObject**)&v, 7049 PyUnicode_GET_SIZE(v) + needed) < 0) { 7050 Py_DECREF(x); 7051 goto onError; 7052 } 7053 p = PyUnicode_AS_UNICODE(v) + oldpos; 7054 } 7055 Py_UNICODE_COPY(p, 7056 PyUnicode_AS_UNICODE(x), 7057 targetsize); 7058 p += targetsize; 7059 extrachars -= targetsize; 7060 } 7061 /* 1-0 mapping: skip the character */ 7062 } 7063 else { 7064 /* wrong return value */ 7065 PyErr_SetString(PyExc_TypeError, 7066 "character mapping must return integer, None or str"); 7067 Py_DECREF(x); 7068 goto onError; 7069 } 7070 Py_DECREF(x); 7071 ++s; 7072 } 7073 } 7074 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v)) 
7075 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0) 7076 goto onError; 7077 Py_XDECREF(errorHandler); 7078 Py_XDECREF(exc); 7079#ifndef DONT_MAKE_RESULT_READY 7080 if (_PyUnicode_READY_REPLACE(&v)) { 7081 Py_DECREF(v); 7082 return NULL; 7083 } 7084#endif 7085 return (PyObject *)v; 7086 7087 onError: 7088 Py_XDECREF(errorHandler); 7089 Py_XDECREF(exc); 7090 Py_XDECREF(v); 7091 return NULL; 7092} 7093 7094/* Charmap encoding: the lookup table */ 7095 7096struct encoding_map { 7097 PyObject_HEAD 7098 unsigned char level1[32]; 7099 int count2, count3; 7100 unsigned char level23[1]; 7101}; 7102 7103static PyObject* 7104encoding_map_size(PyObject *obj, PyObject* args) 7105{ 7106 struct encoding_map *map = (struct encoding_map*)obj; 7107 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 + 7108 128*map->count3); 7109} 7110 7111static PyMethodDef encoding_map_methods[] = { 7112 {"size", encoding_map_size, METH_NOARGS, 7113 PyDoc_STR("Return the size (in bytes) of this object") }, 7114 { 0 } 7115}; 7116 7117static void 7118encoding_map_dealloc(PyObject* o) 7119{ 7120 PyObject_FREE(o); 7121} 7122 7123static PyTypeObject EncodingMapType = { 7124 PyVarObject_HEAD_INIT(NULL, 0) 7125 "EncodingMap", /*tp_name*/ 7126 sizeof(struct encoding_map), /*tp_basicsize*/ 7127 0, /*tp_itemsize*/ 7128 /* methods */ 7129 encoding_map_dealloc, /*tp_dealloc*/ 7130 0, /*tp_print*/ 7131 0, /*tp_getattr*/ 7132 0, /*tp_setattr*/ 7133 0, /*tp_reserved*/ 7134 0, /*tp_repr*/ 7135 0, /*tp_as_number*/ 7136 0, /*tp_as_sequence*/ 7137 0, /*tp_as_mapping*/ 7138 0, /*tp_hash*/ 7139 0, /*tp_call*/ 7140 0, /*tp_str*/ 7141 0, /*tp_getattro*/ 7142 0, /*tp_setattro*/ 7143 0, /*tp_as_buffer*/ 7144 Py_TPFLAGS_DEFAULT, /*tp_flags*/ 7145 0, /*tp_doc*/ 7146 0, /*tp_traverse*/ 7147 0, /*tp_clear*/ 7148 0, /*tp_richcompare*/ 7149 0, /*tp_weaklistoffset*/ 7150 0, /*tp_iter*/ 7151 0, /*tp_iternext*/ 7152 encoding_map_methods, /*tp_methods*/ 7153 0, /*tp_members*/ 7154 0, /*tp_getset*/ 7155 
0, /*tp_base*/ 7156 0, /*tp_dict*/ 7157 0, /*tp_descr_get*/ 7158 0, /*tp_descr_set*/ 7159 0, /*tp_dictoffset*/ 7160 0, /*tp_init*/ 7161 0, /*tp_alloc*/ 7162 0, /*tp_new*/ 7163 0, /*tp_free*/ 7164 0, /*tp_is_gc*/ 7165}; 7166 7167PyObject* 7168PyUnicode_BuildEncodingMap(PyObject* string) 7169{ 7170 PyObject *result; 7171 struct encoding_map *mresult; 7172 int i; 7173 int need_dict = 0; 7174 unsigned char level1[32]; 7175 unsigned char level2[512]; 7176 unsigned char *mlevel1, *mlevel2, *mlevel3; 7177 int count2 = 0, count3 = 0; 7178 int kind; 7179 void *data; 7180 Py_UCS4 ch; 7181 7182 if (!PyUnicode_Check(string) || PyUnicode_GET_LENGTH(string) != 256) { 7183 PyErr_BadArgument(); 7184 return NULL; 7185 } 7186 kind = PyUnicode_KIND(string); 7187 data = PyUnicode_DATA(string); 7188 memset(level1, 0xFF, sizeof level1); 7189 memset(level2, 0xFF, sizeof level2); 7190 7191 /* If there isn't a one-to-one mapping of NULL to \0, 7192 or if there are non-BMP characters, we need to use 7193 a mapping dictionary. 
*/ 7194 if (PyUnicode_READ(kind, data, 0) != 0) 7195 need_dict = 1; 7196 for (i = 1; i < 256; i++) { 7197 int l1, l2; 7198 ch = PyUnicode_READ(kind, data, i); 7199 if (ch == 0 || ch > 0xFFFF) { 7200 need_dict = 1; 7201 break; 7202 } 7203 if (ch == 0xFFFE) 7204 /* unmapped character */ 7205 continue; 7206 l1 = ch >> 11; 7207 l2 = ch >> 7; 7208 if (level1[l1] == 0xFF) 7209 level1[l1] = count2++; 7210 if (level2[l2] == 0xFF) 7211 level2[l2] = count3++; 7212 } 7213 7214 if (count2 >= 0xFF || count3 >= 0xFF) 7215 need_dict = 1; 7216 7217 if (need_dict) { 7218 PyObject *result = PyDict_New(); 7219 PyObject *key, *value; 7220 if (!result) 7221 return NULL; 7222 for (i = 0; i < 256; i++) { 7223 key = PyLong_FromLong(PyUnicode_READ(kind, data, i)); 7224 value = PyLong_FromLong(i); 7225 if (!key || !value) 7226 goto failed1; 7227 if (PyDict_SetItem(result, key, value) == -1) 7228 goto failed1; 7229 Py_DECREF(key); 7230 Py_DECREF(value); 7231 } 7232 return result; 7233 failed1: 7234 Py_XDECREF(key); 7235 Py_XDECREF(value); 7236 Py_DECREF(result); 7237 return NULL; 7238 } 7239 7240 /* Create a three-level trie */ 7241 result = PyObject_MALLOC(sizeof(struct encoding_map) + 7242 16*count2 + 128*count3 - 1); 7243 if (!result) 7244 return PyErr_NoMemory(); 7245 PyObject_Init(result, &EncodingMapType); 7246 mresult = (struct encoding_map*)result; 7247 mresult->count2 = count2; 7248 mresult->count3 = count3; 7249 mlevel1 = mresult->level1; 7250 mlevel2 = mresult->level23; 7251 mlevel3 = mresult->level23 + 16*count2; 7252 memcpy(mlevel1, level1, 32); 7253 memset(mlevel2, 0xFF, 16*count2); 7254 memset(mlevel3, 0, 128*count3); 7255 count3 = 0; 7256 for (i = 1; i < 256; i++) { 7257 int o1, o2, o3, i2, i3; 7258 if (PyUnicode_READ(kind, data, i) == 0xFFFE) 7259 /* unmapped character */ 7260 continue; 7261 o1 = PyUnicode_READ(kind, data, i)>>11; 7262 o2 = (PyUnicode_READ(kind, data, i)>>7) & 0xF; 7263 i2 = 16*mlevel1[o1] + o2; 7264 if (mlevel2[i2] == 0xFF) 7265 mlevel2[i2] = count3++; 7266 
o3 = PyUnicode_READ(kind, data, i) & 0x7F; 7267 i3 = 128*mlevel2[i2] + o3; 7268 mlevel3[i3] = i; 7269 } 7270 return result; 7271} 7272 7273static int 7274encoding_map_lookup(Py_UNICODE c, PyObject *mapping) 7275{ 7276 struct encoding_map *map = (struct encoding_map*)mapping; 7277 int l1 = c>>11; 7278 int l2 = (c>>7) & 0xF; 7279 int l3 = c & 0x7F; 7280 int i; 7281 7282#ifdef Py_UNICODE_WIDE 7283 if (c > 0xFFFF) { 7284 return -1; 7285 } 7286#endif 7287 if (c == 0) 7288 return 0; 7289 /* level 1*/ 7290 i = map->level1[l1]; 7291 if (i == 0xFF) { 7292 return -1; 7293 } 7294 /* level 2*/ 7295 i = map->level23[16*i+l2]; 7296 if (i == 0xFF) { 7297 return -1; 7298 } 7299 /* level 3 */ 7300 i = map->level23[16*map->count2 + 128*i + l3]; 7301 if (i == 0) { 7302 return -1; 7303 } 7304 return i; 7305} 7306 7307/* Lookup the character ch in the mapping. If the character 7308 can't be found, Py_None is returned (or NULL, if another 7309 error occurred). */ 7310static PyObject * 7311charmapencode_lookup(Py_UNICODE c, PyObject *mapping) 7312{ 7313 PyObject *w = PyLong_FromLong((long)c); 7314 PyObject *x; 7315 7316 if (w == NULL) 7317 return NULL; 7318 x = PyObject_GetItem(mapping, w); 7319 Py_DECREF(w); 7320 if (x == NULL) { 7321 if (PyErr_ExceptionMatches(PyExc_LookupError)) { 7322 /* No mapping found means: mapping is undefined. 
*/ 7323 PyErr_Clear(); 7324 x = Py_None; 7325 Py_INCREF(x); 7326 return x; 7327 } else 7328 return NULL; 7329 } 7330 else if (x == Py_None) 7331 return x; 7332 else if (PyLong_Check(x)) { 7333 long value = PyLong_AS_LONG(x); 7334 if (value < 0 || value > 255) { 7335 PyErr_SetString(PyExc_TypeError, 7336 "character mapping must be in range(256)"); 7337 Py_DECREF(x); 7338 return NULL; 7339 } 7340 return x; 7341 } 7342 else if (PyBytes_Check(x)) 7343 return x; 7344 else { 7345 /* wrong return value */ 7346 PyErr_Format(PyExc_TypeError, 7347 "character mapping must return integer, bytes or None, not %.400s", 7348 x->ob_type->tp_name); 7349 Py_DECREF(x); 7350 return NULL; 7351 } 7352} 7353 7354static int 7355charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize) 7356{ 7357 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj); 7358 /* exponentially overallocate to minimize reallocations */ 7359 if (requiredsize < 2*outsize) 7360 requiredsize = 2*outsize; 7361 if (_PyBytes_Resize(outobj, requiredsize)) 7362 return -1; 7363 return 0; 7364} 7365 7366typedef enum charmapencode_result { 7367 enc_SUCCESS, enc_FAILED, enc_EXCEPTION 7368} charmapencode_result; 7369/* lookup the character, put the result in the output string and adjust 7370 various state variables. Resize the output bytes object if not enough 7371 space is available. Return a new reference to the object that 7372 was put in the output buffer, or Py_None, if the mapping was undefined 7373 (in which case no character was written) or NULL, if a 7374 reallocation error occurred. 
The caller must decref the result */ 7375static charmapencode_result 7376charmapencode_output(Py_UNICODE c, PyObject *mapping, 7377 PyObject **outobj, Py_ssize_t *outpos) 7378{ 7379 PyObject *rep; 7380 char *outstart; 7381 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj); 7382 7383 if (Py_TYPE(mapping) == &EncodingMapType) { 7384 int res = encoding_map_lookup(c, mapping); 7385 Py_ssize_t requiredsize = *outpos+1; 7386 if (res == -1) 7387 return enc_FAILED; 7388 if (outsize<requiredsize) 7389 if (charmapencode_resize(outobj, outpos, requiredsize)) 7390 return enc_EXCEPTION; 7391 outstart = PyBytes_AS_STRING(*outobj); 7392 outstart[(*outpos)++] = (char)res; 7393 return enc_SUCCESS; 7394 } 7395 7396 rep = charmapencode_lookup(c, mapping); 7397 if (rep==NULL) 7398 return enc_EXCEPTION; 7399 else if (rep==Py_None) { 7400 Py_DECREF(rep); 7401 return enc_FAILED; 7402 } else { 7403 if (PyLong_Check(rep)) { 7404 Py_ssize_t requiredsize = *outpos+1; 7405 if (outsize<requiredsize) 7406 if (charmapencode_resize(outobj, outpos, requiredsize)) { 7407 Py_DECREF(rep); 7408 return enc_EXCEPTION; 7409 } 7410 outstart = PyBytes_AS_STRING(*outobj); 7411 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep); 7412 } 7413 else { 7414 const char *repchars = PyBytes_AS_STRING(rep); 7415 Py_ssize_t repsize = PyBytes_GET_SIZE(rep); 7416 Py_ssize_t requiredsize = *outpos+repsize; 7417 if (outsize<requiredsize) 7418 if (charmapencode_resize(outobj, outpos, requiredsize)) { 7419 Py_DECREF(rep); 7420 return enc_EXCEPTION; 7421 } 7422 outstart = PyBytes_AS_STRING(*outobj); 7423 memcpy(outstart + *outpos, repchars, repsize); 7424 *outpos += repsize; 7425 } 7426 } 7427 Py_DECREF(rep); 7428 return enc_SUCCESS; 7429} 7430 7431/* handle an error in PyUnicode_EncodeCharmap 7432 Return 0 on success, -1 on error */ 7433static int 7434charmap_encoding_error( 7435 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping, 7436 PyObject **exceptionObject, 7437 int *known_errorHandler, PyObject 
**errorHandler, const char *errors, 7438 PyObject **res, Py_ssize_t *respos) 7439{ 7440 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */ 7441 Py_ssize_t repsize; 7442 Py_ssize_t newpos; 7443 Py_UNICODE *uni2; 7444 /* startpos for collecting unencodable chars */ 7445 Py_ssize_t collstartpos = *inpos; 7446 Py_ssize_t collendpos = *inpos+1; 7447 Py_ssize_t collpos; 7448 char *encoding = "charmap"; 7449 char *reason = "character maps to <undefined>"; 7450 charmapencode_result x; 7451 7452 /* find all unencodable characters */ 7453 while (collendpos < size) { 7454 PyObject *rep; 7455 if (Py_TYPE(mapping) == &EncodingMapType) { 7456 int res = encoding_map_lookup(p[collendpos], mapping); 7457 if (res != -1) 7458 break; 7459 ++collendpos; 7460 continue; 7461 } 7462 7463 rep = charmapencode_lookup(p[collendpos], mapping); 7464 if (rep==NULL) 7465 return -1; 7466 else if (rep!=Py_None) { 7467 Py_DECREF(rep); 7468 break; 7469 } 7470 Py_DECREF(rep); 7471 ++collendpos; 7472 } 7473 /* cache callback name lookup 7474 * (if not done yet, i.e. 
it's the first error) */ 7475 if (*known_errorHandler==-1) { 7476 if ((errors==NULL) || (!strcmp(errors, "strict"))) 7477 *known_errorHandler = 1; 7478 else if (!strcmp(errors, "replace")) 7479 *known_errorHandler = 2; 7480 else if (!strcmp(errors, "ignore")) 7481 *known_errorHandler = 3; 7482 else if (!strcmp(errors, "xmlcharrefreplace")) 7483 *known_errorHandler = 4; 7484 else 7485 *known_errorHandler = 0; 7486 } 7487 switch (*known_errorHandler) { 7488 case 1: /* strict */ 7489 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason); 7490 return -1; 7491 case 2: /* replace */ 7492 for (collpos = collstartpos; collpos<collendpos; ++collpos) { 7493 x = charmapencode_output('?', mapping, res, respos); 7494 if (x==enc_EXCEPTION) { 7495 return -1; 7496 } 7497 else if (x==enc_FAILED) { 7498 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason); 7499 return -1; 7500 } 7501 } 7502 /* fall through */ 7503 case 3: /* ignore */ 7504 *inpos = collendpos; 7505 break; 7506 case 4: /* xmlcharrefreplace */ 7507 /* generate replacement (temporarily (mis)uses p) */ 7508 for (collpos = collstartpos; collpos < collendpos; ++collpos) { 7509 char buffer[2+29+1+1]; 7510 char *cp; 7511 sprintf(buffer, "&#%d;", (int)p[collpos]); 7512 for (cp = buffer; *cp; ++cp) { 7513 x = charmapencode_output(*cp, mapping, res, respos); 7514 if (x==enc_EXCEPTION) 7515 return -1; 7516 else if (x==enc_FAILED) { 7517 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason); 7518 return -1; 7519 } 7520 } 7521 } 7522 *inpos = collendpos; 7523 break; 7524 default: 7525 repunicode = unicode_encode_call_errorhandler(errors, errorHandler, 7526 encoding, reason, p, size, exceptionObject, 7527 collstartpos, collendpos, &newpos); 7528 if (repunicode == NULL) 7529 return -1; 7530 if (PyBytes_Check(repunicode)) { 7531 /* Directly copy bytes result to output. 
*/ 7532 Py_ssize_t outsize = PyBytes_Size(*res); 7533 Py_ssize_t requiredsize; 7534 repsize = PyBytes_Size(repunicode); 7535 requiredsize = *respos + repsize; 7536 if (requiredsize > outsize) 7537 /* Make room for all additional bytes. */ 7538 if (charmapencode_resize(res, respos, requiredsize)) { 7539 Py_DECREF(repunicode); 7540 return -1; 7541 } 7542 memcpy(PyBytes_AsString(*res) + *respos, 7543 PyBytes_AsString(repunicode), repsize); 7544 *respos += repsize; 7545 *inpos = newpos; 7546 Py_DECREF(repunicode); 7547 break; 7548 } 7549 /* generate replacement */ 7550 repsize = PyUnicode_GET_SIZE(repunicode); 7551 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) { 7552 x = charmapencode_output(*uni2, mapping, res, respos); 7553 if (x==enc_EXCEPTION) { 7554 return -1; 7555 } 7556 else if (x==enc_FAILED) { 7557 Py_DECREF(repunicode); 7558 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason); 7559 return -1; 7560 } 7561 } 7562 *inpos = newpos; 7563 Py_DECREF(repunicode); 7564 } 7565 return 0; 7566} 7567 7568PyObject * 7569PyUnicode_EncodeCharmap(const Py_UNICODE *p, 7570 Py_ssize_t size, 7571 PyObject *mapping, 7572 const char *errors) 7573{ 7574 /* output object */ 7575 PyObject *res = NULL; 7576 /* current input position */ 7577 Py_ssize_t inpos = 0; 7578 /* current output position */ 7579 Py_ssize_t respos = 0; 7580 PyObject *errorHandler = NULL; 7581 PyObject *exc = NULL; 7582 /* the following variable is used for caching string comparisons 7583 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 7584 * 3=ignore, 4=xmlcharrefreplace */ 7585 int known_errorHandler = -1; 7586 7587 /* Default to Latin-1 */ 7588 if (mapping == NULL) 7589 return PyUnicode_EncodeLatin1(p, size, errors); 7590 7591 /* allocate enough for a simple encoding without 7592 replacements, if we need more, we'll resize */ 7593 res = PyBytes_FromStringAndSize(NULL, size); 7594 if (res == NULL) 7595 goto onError; 7596 if (size == 0) 7597 
return res; 7598 7599 while (inpos<size) { 7600 /* try to encode it */ 7601 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos); 7602 if (x==enc_EXCEPTION) /* error */ 7603 goto onError; 7604 if (x==enc_FAILED) { /* unencodable character */ 7605 if (charmap_encoding_error(p, size, &inpos, mapping, 7606 &exc, 7607 &known_errorHandler, &errorHandler, errors, 7608 &res, &respos)) { 7609 goto onError; 7610 } 7611 } 7612 else 7613 /* done with this character => adjust input position */ 7614 ++inpos; 7615 } 7616 7617 /* Resize if we allocated to much */ 7618 if (respos<PyBytes_GET_SIZE(res)) 7619 if (_PyBytes_Resize(&res, respos) < 0) 7620 goto onError; 7621 7622 Py_XDECREF(exc); 7623 Py_XDECREF(errorHandler); 7624 return res; 7625 7626 onError: 7627 Py_XDECREF(res); 7628 Py_XDECREF(exc); 7629 Py_XDECREF(errorHandler); 7630 return NULL; 7631} 7632 7633PyObject * 7634PyUnicode_AsCharmapString(PyObject *unicode, 7635 PyObject *mapping) 7636{ 7637 if (!PyUnicode_Check(unicode) || mapping == NULL) { 7638 PyErr_BadArgument(); 7639 return NULL; 7640 } 7641 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode), 7642 PyUnicode_GET_SIZE(unicode), 7643 mapping, 7644 NULL); 7645} 7646 7647/* create or adjust a UnicodeTranslateError */ 7648static void 7649make_translate_exception(PyObject **exceptionObject, 7650 PyObject *unicode, 7651 Py_ssize_t startpos, Py_ssize_t endpos, 7652 const char *reason) 7653{ 7654 if (*exceptionObject == NULL) { 7655 *exceptionObject = _PyUnicodeTranslateError_Create( 7656 unicode, startpos, endpos, reason); 7657 } 7658 else { 7659 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos)) 7660 goto onError; 7661 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos)) 7662 goto onError; 7663 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason)) 7664 goto onError; 7665 return; 7666 onError: 7667 Py_DECREF(*exceptionObject); 7668 *exceptionObject = NULL; 7669 } 7670} 7671 7672/* raises a 
UnicodeTranslateError */ 7673static void 7674raise_translate_exception(PyObject **exceptionObject, 7675 PyObject *unicode, 7676 Py_ssize_t startpos, Py_ssize_t endpos, 7677 const char *reason) 7678{ 7679 make_translate_exception(exceptionObject, 7680 unicode, startpos, endpos, reason); 7681 if (*exceptionObject != NULL) 7682 PyCodec_StrictErrors(*exceptionObject); 7683} 7684 7685/* error handling callback helper: 7686 build arguments, call the callback and check the arguments, 7687 put the result into newpos and return the replacement string, which 7688 has to be freed by the caller */ 7689static PyObject * 7690unicode_translate_call_errorhandler(const char *errors, 7691 PyObject **errorHandler, 7692 const char *reason, 7693 PyObject *unicode, PyObject **exceptionObject, 7694 Py_ssize_t startpos, Py_ssize_t endpos, 7695 Py_ssize_t *newpos) 7696{ 7697 static char *argparse = "O!n;translating error handler must return (str, int) tuple"; 7698 7699 Py_ssize_t i_newpos; 7700 PyObject *restuple; 7701 PyObject *resunicode; 7702 7703 if (*errorHandler == NULL) { 7704 *errorHandler = PyCodec_LookupError(errors); 7705 if (*errorHandler == NULL) 7706 return NULL; 7707 } 7708 7709 make_translate_exception(exceptionObject, 7710 unicode, startpos, endpos, reason); 7711 if (*exceptionObject == NULL) 7712 return NULL; 7713 7714 restuple = PyObject_CallFunctionObjArgs( 7715 *errorHandler, *exceptionObject, NULL); 7716 if (restuple == NULL) 7717 return NULL; 7718 if (!PyTuple_Check(restuple)) { 7719 PyErr_SetString(PyExc_TypeError, &argparse[4]); 7720 Py_DECREF(restuple); 7721 return NULL; 7722 } 7723 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, 7724 &resunicode, &i_newpos)) { 7725 Py_DECREF(restuple); 7726 return NULL; 7727 } 7728 if (i_newpos<0) 7729 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos; 7730 else 7731 *newpos = i_newpos; 7732 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) { 7733 PyErr_Format(PyExc_IndexError, "position %zd from error handler out 
of bounds", *newpos); 7734 Py_DECREF(restuple); 7735 return NULL; 7736 } 7737 Py_INCREF(resunicode); 7738 Py_DECREF(restuple); 7739 return resunicode; 7740} 7741 7742/* Lookup the character ch in the mapping and put the result in result, 7743 which must be decrefed by the caller. 7744 Return 0 on success, -1 on error */ 7745static int 7746charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result) 7747{ 7748 PyObject *w = PyLong_FromLong((long)c); 7749 PyObject *x; 7750 7751 if (w == NULL) 7752 return -1; 7753 x = PyObject_GetItem(mapping, w); 7754 Py_DECREF(w); 7755 if (x == NULL) { 7756 if (PyErr_ExceptionMatches(PyExc_LookupError)) { 7757 /* No mapping found means: use 1:1 mapping. */ 7758 PyErr_Clear(); 7759 *result = NULL; 7760 return 0; 7761 } else 7762 return -1; 7763 } 7764 else if (x == Py_None) { 7765 *result = x; 7766 return 0; 7767 } 7768 else if (PyLong_Check(x)) { 7769 long value = PyLong_AS_LONG(x); 7770 long max = PyUnicode_GetMax(); 7771 if (value < 0 || value > max) { 7772 PyErr_Format(PyExc_TypeError, 7773 "character mapping must be in range(0x%x)", max+1); 7774 Py_DECREF(x); 7775 return -1; 7776 } 7777 *result = x; 7778 return 0; 7779 } 7780 else if (PyUnicode_Check(x)) { 7781 *result = x; 7782 return 0; 7783 } 7784 else { 7785 /* wrong return value */ 7786 PyErr_SetString(PyExc_TypeError, 7787 "character mapping must return integer, None or str"); 7788 Py_DECREF(x); 7789 return -1; 7790 } 7791} 7792/* ensure that *outobj is at least requiredsize characters long, 7793 if not reallocate and adjust various state variables. 
7794 Return 0 on success, -1 on error */ 7795static int 7796charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize, 7797 Py_ssize_t requiredsize) 7798{ 7799 Py_ssize_t oldsize = *psize; 7800 if (requiredsize > oldsize) { 7801 /* exponentially overallocate to minimize reallocations */ 7802 if (requiredsize < 2 * oldsize) 7803 requiredsize = 2 * oldsize; 7804 *outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4)); 7805 if (*outobj == 0) 7806 return -1; 7807 *psize = requiredsize; 7808 } 7809 return 0; 7810} 7811/* lookup the character, put the result in the output string and adjust 7812 various state variables. Return a new reference to the object that 7813 was put in the output buffer in *result, or Py_None, if the mapping was 7814 undefined (in which case no character was written). 7815 The called must decref result. 7816 Return 0 on success, -1 on error. */ 7817static int 7818charmaptranslate_output(PyObject *input, Py_ssize_t ipos, 7819 PyObject *mapping, Py_UCS4 **output, 7820 Py_ssize_t *osize, Py_ssize_t *opos, 7821 PyObject **res) 7822{ 7823 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos); 7824 if (charmaptranslate_lookup(curinp, mapping, res)) 7825 return -1; 7826 if (*res==NULL) { 7827 /* not found => default to 1:1 mapping */ 7828 (*output)[(*opos)++] = curinp; 7829 } 7830 else if (*res==Py_None) 7831 ; 7832 else if (PyLong_Check(*res)) { 7833 /* no overflow check, because we know that the space is enough */ 7834 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res); 7835 } 7836 else if (PyUnicode_Check(*res)) { 7837 Py_ssize_t repsize; 7838 if (PyUnicode_READY(*res) == -1) 7839 return -1; 7840 repsize = PyUnicode_GET_LENGTH(*res); 7841 if (repsize==1) { 7842 /* no overflow check, because we know that the space is enough */ 7843 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0); 7844 } 7845 else if (repsize!=0) { 7846 /* more than one character */ 7847 Py_ssize_t requiredsize = *opos + 7848 (PyUnicode_GET_LENGTH(input) - ipos) + 7849 
repsize - 1; 7850 Py_ssize_t i; 7851 if (charmaptranslate_makespace(output, osize, requiredsize)) 7852 return -1; 7853 for(i = 0; i < repsize; i++) 7854 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i); 7855 } 7856 } 7857 else 7858 return -1; 7859 return 0; 7860} 7861 7862PyObject * 7863_PyUnicode_TranslateCharmap(PyObject *input, 7864 PyObject *mapping, 7865 const char *errors) 7866{ 7867 /* input object */ 7868 char *idata; 7869 Py_ssize_t size, i; 7870 int kind; 7871 /* output buffer */ 7872 Py_UCS4 *output = NULL; 7873 Py_ssize_t osize; 7874 PyObject *res; 7875 /* current output position */ 7876 Py_ssize_t opos; 7877 char *reason = "character maps to <undefined>"; 7878 PyObject *errorHandler = NULL; 7879 PyObject *exc = NULL; 7880 /* the following variable is used for caching string comparisons 7881 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 7882 * 3=ignore, 4=xmlcharrefreplace */ 7883 int known_errorHandler = -1; 7884 7885 if (mapping == NULL) { 7886 PyErr_BadArgument(); 7887 return NULL; 7888 } 7889 7890 if (PyUnicode_READY(input) == -1) 7891 return NULL; 7892 idata = (char*)PyUnicode_DATA(input); 7893 kind = PyUnicode_KIND(input); 7894 size = PyUnicode_GET_LENGTH(input); 7895 i = 0; 7896 7897 if (size == 0) { 7898 Py_INCREF(input); 7899 return input; 7900 } 7901 7902 /* allocate enough for a simple 1:1 translation without 7903 replacements, if we need more, we'll resize */ 7904 osize = size; 7905 output = PyMem_Malloc(osize * sizeof(Py_UCS4)); 7906 opos = 0; 7907 if (output == NULL) { 7908 PyErr_NoMemory(); 7909 goto onError; 7910 } 7911 7912 while (i<size) { 7913 /* try to encode it */ 7914 PyObject *x = NULL; 7915 if (charmaptranslate_output(input, i, mapping, 7916 &output, &osize, &opos, &x)) { 7917 Py_XDECREF(x); 7918 goto onError; 7919 } 7920 Py_XDECREF(x); 7921 if (x!=Py_None) /* it worked => adjust input pointer */ 7922 ++i; 7923 else { /* untranslatable character */ 7924 PyObject *repunicode = NULL; /* initialize to prevent gcc warning 
*/ 7925 Py_ssize_t repsize; 7926 Py_ssize_t newpos; 7927 Py_ssize_t uni2; 7928 /* startpos for collecting untranslatable chars */ 7929 Py_ssize_t collstart = i; 7930 Py_ssize_t collend = i+1; 7931 Py_ssize_t coll; 7932 7933 /* find all untranslatable characters */ 7934 while (collend < size) { 7935 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x)) 7936 goto onError; 7937 Py_XDECREF(x); 7938 if (x!=Py_None) 7939 break; 7940 ++collend; 7941 } 7942 /* cache callback name lookup 7943 * (if not done yet, i.e. it's the first error) */ 7944 if (known_errorHandler==-1) { 7945 if ((errors==NULL) || (!strcmp(errors, "strict"))) 7946 known_errorHandler = 1; 7947 else if (!strcmp(errors, "replace")) 7948 known_errorHandler = 2; 7949 else if (!strcmp(errors, "ignore")) 7950 known_errorHandler = 3; 7951 else if (!strcmp(errors, "xmlcharrefreplace")) 7952 known_errorHandler = 4; 7953 else 7954 known_errorHandler = 0; 7955 } 7956 switch (known_errorHandler) { 7957 case 1: /* strict */ 7958 raise_translate_exception(&exc, input, collstart, 7959 collend, reason); 7960 goto onError; 7961 case 2: /* replace */ 7962 /* No need to check for space, this is a 1:1 replacement */ 7963 for (coll = collstart; coll<collend; coll++) 7964 output[opos++] = '?'; 7965 /* fall through */ 7966 case 3: /* ignore */ 7967 i = collend; 7968 break; 7969 case 4: /* xmlcharrefreplace */ 7970 /* generate replacement (temporarily (mis)uses i) */ 7971 for (i = collstart; i < collend; ++i) { 7972 char buffer[2+29+1+1]; 7973 char *cp; 7974 sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i)); 7975 if (charmaptranslate_makespace(&output, &osize, 7976 opos+strlen(buffer)+(size-collend))) 7977 goto onError; 7978 for (cp = buffer; *cp; ++cp) 7979 output[opos++] = *cp; 7980 } 7981 i = collend; 7982 break; 7983 default: 7984 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler, 7985 reason, input, &exc, 7986 collstart, collend, &newpos); 7987 if (repunicode == NULL || 
_PyUnicode_READY_REPLACE(&repunicode)) 7988 goto onError; 7989 /* generate replacement */ 7990 repsize = PyUnicode_GET_LENGTH(repunicode); 7991 if (charmaptranslate_makespace(&output, &osize, 7992 opos+repsize+(size-collend))) { 7993 Py_DECREF(repunicode); 7994 goto onError; 7995 } 7996 for (uni2 = 0; repsize-->0; ++uni2) 7997 output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2); 7998 i = newpos; 7999 Py_DECREF(repunicode); 8000 } 8001 } 8002 } 8003 res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos); 8004 if (!res) 8005 goto onError; 8006 PyMem_Free(output); 8007 Py_XDECREF(exc); 8008 Py_XDECREF(errorHandler); 8009 return res; 8010 8011 onError: 8012 PyMem_Free(output); 8013 Py_XDECREF(exc); 8014 Py_XDECREF(errorHandler); 8015 return NULL; 8016} 8017 8018/* Deprecated. Use PyUnicode_Translate instead. */ 8019PyObject * 8020PyUnicode_TranslateCharmap(const Py_UNICODE *p, 8021 Py_ssize_t size, 8022 PyObject *mapping, 8023 const char *errors) 8024{ 8025 PyObject *unicode = PyUnicode_FromUnicode(p, size); 8026 if (!unicode) 8027 return NULL; 8028 return _PyUnicode_TranslateCharmap(unicode, mapping, errors); 8029} 8030 8031PyObject * 8032PyUnicode_Translate(PyObject *str, 8033 PyObject *mapping, 8034 const char *errors) 8035{ 8036 PyObject *result; 8037 8038 str = PyUnicode_FromObject(str); 8039 if (str == NULL) 8040 goto onError; 8041 result = _PyUnicode_TranslateCharmap(str, mapping, errors); 8042 Py_DECREF(str); 8043 return result; 8044 8045 onError: 8046 Py_XDECREF(str); 8047 return NULL; 8048} 8049 8050static Py_UCS4 8051fix_decimal_and_space_to_ascii(PyUnicodeObject *self) 8052{ 8053 /* No need to call PyUnicode_READY(self) because this function is only 8054 called as a callback from fixup() which does it already. 
*/ 8055 const Py_ssize_t len = PyUnicode_GET_LENGTH(self); 8056 const int kind = PyUnicode_KIND(self); 8057 void *data = PyUnicode_DATA(self); 8058 Py_UCS4 maxchar = 0, ch, fixed; 8059 Py_ssize_t i; 8060 8061 for (i = 0; i < len; ++i) { 8062 ch = PyUnicode_READ(kind, data, i); 8063 fixed = 0; 8064 if (ch > 127) { 8065 if (Py_UNICODE_ISSPACE(ch)) 8066 fixed = ' '; 8067 else { 8068 const int decimal = Py_UNICODE_TODECIMAL(ch); 8069 if (decimal >= 0) 8070 fixed = '0' + decimal; 8071 } 8072 if (fixed != 0) { 8073 if (fixed > maxchar) 8074 maxchar = fixed; 8075 PyUnicode_WRITE(kind, data, i, fixed); 8076 } 8077 else if (ch > maxchar) 8078 maxchar = ch; 8079 } 8080 else if (ch > maxchar) 8081 maxchar = ch; 8082 } 8083 8084 return maxchar; 8085} 8086 8087PyObject * 8088_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode) 8089{ 8090 if (!PyUnicode_Check(unicode)) { 8091 PyErr_BadInternalCall(); 8092 return NULL; 8093 } 8094 if (PyUnicode_READY(unicode) == -1) 8095 return NULL; 8096 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) { 8097 /* If the string is already ASCII, just return the same string */ 8098 Py_INCREF(unicode); 8099 return unicode; 8100 } 8101 return fixup((PyUnicodeObject *)unicode, fix_decimal_and_space_to_ascii); 8102} 8103 8104PyObject * 8105PyUnicode_TransformDecimalToASCII(Py_UNICODE *s, 8106 Py_ssize_t length) 8107{ 8108 PyObject *result; 8109 Py_UNICODE *p; /* write pointer into result */ 8110 Py_ssize_t i; 8111 /* Copy to a new string */ 8112 result = (PyObject *)_PyUnicode_New(length); 8113 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(result), s, length); 8114 if (result == NULL) 8115 return result; 8116 p = PyUnicode_AS_UNICODE(result); 8117 /* Iterate over code points */ 8118 for (i = 0; i < length; i++) { 8119 Py_UNICODE ch =s[i]; 8120 if (ch > 127) { 8121 int decimal = Py_UNICODE_TODECIMAL(ch); 8122 if (decimal >= 0) 8123 p[i] = '0' + decimal; 8124 } 8125 } 8126#ifndef DONT_MAKE_RESULT_READY 8127 if (_PyUnicode_READY_REPLACE(&result)) { 8128 
Py_DECREF(result); 8129 return NULL; 8130 } 8131#endif 8132 return result; 8133} 8134/* --- Decimal Encoder ---------------------------------------------------- */ 8135 8136int 8137PyUnicode_EncodeDecimal(Py_UNICODE *s, 8138 Py_ssize_t length, 8139 char *output, 8140 const char *errors) 8141{ 8142 Py_UNICODE *p, *end; 8143 PyObject *errorHandler = NULL; 8144 PyObject *exc = NULL; 8145 const char *encoding = "decimal"; 8146 const char *reason = "invalid decimal Unicode string"; 8147 /* the following variable is used for caching string comparisons 8148 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */ 8149 int known_errorHandler = -1; 8150 8151 if (output == NULL) { 8152 PyErr_BadArgument(); 8153 return -1; 8154 } 8155 8156 p = s; 8157 end = s + length; 8158 while (p < end) { 8159 register Py_UNICODE ch = *p; 8160 int decimal; 8161 PyObject *repunicode; 8162 Py_ssize_t repsize; 8163 Py_ssize_t newpos; 8164 Py_UNICODE *uni2; 8165 Py_UNICODE *collstart; 8166 Py_UNICODE *collend; 8167 8168 if (Py_UNICODE_ISSPACE(ch)) { 8169 *output++ = ' '; 8170 ++p; 8171 continue; 8172 } 8173 decimal = Py_UNICODE_TODECIMAL(ch); 8174 if (decimal >= 0) { 8175 *output++ = '0' + decimal; 8176 ++p; 8177 continue; 8178 } 8179 if (0 < ch && ch < 256) { 8180 *output++ = (char)ch; 8181 ++p; 8182 continue; 8183 } 8184 /* All other characters are considered unencodable */ 8185 collstart = p; 8186 collend = p+1; 8187 while (collend < end) { 8188 if ((0 < *collend && *collend < 256) || 8189 !Py_UNICODE_ISSPACE(*collend) || 8190 Py_UNICODE_TODECIMAL(*collend)) 8191 break; 8192 } 8193 /* cache callback name lookup 8194 * (if not done yet, i.e. 
it's the first error) */ 8195 if (known_errorHandler==-1) { 8196 if ((errors==NULL) || (!strcmp(errors, "strict"))) 8197 known_errorHandler = 1; 8198 else if (!strcmp(errors, "replace")) 8199 known_errorHandler = 2; 8200 else if (!strcmp(errors, "ignore")) 8201 known_errorHandler = 3; 8202 else if (!strcmp(errors, "xmlcharrefreplace")) 8203 known_errorHandler = 4; 8204 else 8205 known_errorHandler = 0; 8206 } 8207 switch (known_errorHandler) { 8208 case 1: /* strict */ 8209 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason); 8210 goto onError; 8211 case 2: /* replace */ 8212 for (p = collstart; p < collend; ++p) 8213 *output++ = '?'; 8214 /* fall through */ 8215 case 3: /* ignore */ 8216 p = collend; 8217 break; 8218 case 4: /* xmlcharrefreplace */ 8219 /* generate replacement (temporarily (mis)uses p) */ 8220 for (p = collstart; p < collend; ++p) 8221 output += sprintf(output, "&#%d;", (int)*p); 8222 p = collend; 8223 break; 8224 default: 8225 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler, 8226 encoding, reason, s, length, &exc, 8227 collstart-s, collend-s, &newpos); 8228 if (repunicode == NULL) 8229 goto onError; 8230 if (!PyUnicode_Check(repunicode)) { 8231 /* Byte results not supported, since they have no decimal property. 
*/ 8232 PyErr_SetString(PyExc_TypeError, "error handler should return unicode"); 8233 Py_DECREF(repunicode); 8234 goto onError; 8235 } 8236 /* generate replacement */ 8237 repsize = PyUnicode_GET_SIZE(repunicode); 8238 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) { 8239 Py_UNICODE ch = *uni2; 8240 if (Py_UNICODE_ISSPACE(ch)) 8241 *output++ = ' '; 8242 else { 8243 decimal = Py_UNICODE_TODECIMAL(ch); 8244 if (decimal >= 0) 8245 *output++ = '0' + decimal; 8246 else if (0 < ch && ch < 256) 8247 *output++ = (char)ch; 8248 else { 8249 Py_DECREF(repunicode); 8250 raise_encode_exception(&exc, encoding, 8251 s, length, collstart-s, collend-s, reason); 8252 goto onError; 8253 } 8254 } 8255 } 8256 p = s + newpos; 8257 Py_DECREF(repunicode); 8258 } 8259 } 8260 /* 0-terminate the output string */ 8261 *output++ = '\0'; 8262 Py_XDECREF(exc); 8263 Py_XDECREF(errorHandler); 8264 return 0; 8265 8266 onError: 8267 Py_XDECREF(exc); 8268 Py_XDECREF(errorHandler); 8269 return -1; 8270} 8271 8272/* --- Helpers ------------------------------------------------------------ */ 8273 8274#include "stringlib/ucs1lib.h" 8275#include "stringlib/fastsearch.h" 8276#include "stringlib/partition.h" 8277#include "stringlib/split.h" 8278#include "stringlib/count.h" 8279#include "stringlib/find.h" 8280#include "stringlib/localeutil.h" 8281#include "stringlib/undef.h" 8282 8283#include "stringlib/ucs2lib.h" 8284#include "stringlib/fastsearch.h" 8285#include "stringlib/partition.h" 8286#include "stringlib/split.h" 8287#include "stringlib/count.h" 8288#include "stringlib/find.h" 8289#include "stringlib/localeutil.h" 8290#include "stringlib/undef.h" 8291 8292#include "stringlib/ucs4lib.h" 8293#include "stringlib/fastsearch.h" 8294#include "stringlib/partition.h" 8295#include "stringlib/split.h" 8296#include "stringlib/count.h" 8297#include "stringlib/find.h" 8298#include "stringlib/localeutil.h" 8299#include "stringlib/undef.h" 8300 8301static Py_ssize_t 8302any_find_slice(Py_ssize_t 
Py_LOCAL_CALLBACK(ucs1)(const Py_UCS1*, Py_ssize_t, 8303 const Py_UCS1*, Py_ssize_t, 8304 Py_ssize_t, Py_ssize_t), 8305 Py_ssize_t Py_LOCAL_CALLBACK(ucs2)(const Py_UCS2*, Py_ssize_t, 8306 const Py_UCS2*, Py_ssize_t, 8307 Py_ssize_t, Py_ssize_t), 8308 Py_ssize_t Py_LOCAL_CALLBACK(ucs4)(const Py_UCS4*, Py_ssize_t, 8309 const Py_UCS4*, Py_ssize_t, 8310 Py_ssize_t, Py_ssize_t), 8311 PyObject* s1, PyObject* s2, 8312 Py_ssize_t start, 8313 Py_ssize_t end) 8314{ 8315 int kind1, kind2, kind; 8316 void *buf1, *buf2; 8317 Py_ssize_t len1, len2, result; 8318 8319 kind1 = PyUnicode_KIND(s1); 8320 kind2 = PyUnicode_KIND(s2); 8321 kind = kind1 > kind2 ? kind1 : kind2; 8322 buf1 = PyUnicode_DATA(s1); 8323 buf2 = PyUnicode_DATA(s2); 8324 if (kind1 != kind) 8325 buf1 = _PyUnicode_AsKind(s1, kind); 8326 if (!buf1) 8327 return -2; 8328 if (kind2 != kind) 8329 buf2 = _PyUnicode_AsKind(s2, kind); 8330 if (!buf2) { 8331 if (kind1 != kind) PyMem_Free(buf1); 8332 return -2; 8333 } 8334 len1 = PyUnicode_GET_LENGTH(s1); 8335 len2 = PyUnicode_GET_LENGTH(s2); 8336 8337 switch(kind) { 8338 case PyUnicode_1BYTE_KIND: 8339 result = ucs1(buf1, len1, buf2, len2, start, end); 8340 break; 8341 case PyUnicode_2BYTE_KIND: 8342 result = ucs2(buf1, len1, buf2, len2, start, end); 8343 break; 8344 case PyUnicode_4BYTE_KIND: 8345 result = ucs4(buf1, len1, buf2, len2, start, end); 8346 break; 8347 default: 8348 assert(0); result = -2; 8349 } 8350 8351 if (kind1 != kind) 8352 PyMem_Free(buf1); 8353 if (kind2 != kind) 8354 PyMem_Free(buf2); 8355 8356 return result; 8357} 8358 8359Py_ssize_t 8360_PyUnicode_InsertThousandsGrouping(int kind, void *data, 8361 Py_ssize_t n_buffer, 8362 void *digits, Py_ssize_t n_digits, 8363 Py_ssize_t min_width, 8364 const char *grouping, 8365 const char *thousands_sep) 8366{ 8367 switch(kind) { 8368 case PyUnicode_1BYTE_KIND: 8369 return _PyUnicode_ucs1_InsertThousandsGrouping( 8370 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits, 8371 min_width, grouping, thousands_sep); 
8372 case PyUnicode_2BYTE_KIND: 8373 return _PyUnicode_ucs2_InsertThousandsGrouping( 8374 (Py_UCS2*)data, n_buffer, (Py_UCS2*)digits, n_digits, 8375 min_width, grouping, thousands_sep); 8376 case PyUnicode_4BYTE_KIND: 8377 return _PyUnicode_ucs4_InsertThousandsGrouping( 8378 (Py_UCS4*)data, n_buffer, (Py_UCS4*)digits, n_digits, 8379 min_width, grouping, thousands_sep); 8380 } 8381 assert(0); 8382 return -1; 8383} 8384 8385 8386#include "stringlib/unicodedefs.h" 8387#include "stringlib/fastsearch.h" 8388 8389#include "stringlib/count.h" 8390#include "stringlib/find.h" 8391 8392/* helper macro to fixup start/end slice values */ 8393#define ADJUST_INDICES(start, end, len) \ 8394 if (end > len) \ 8395 end = len; \ 8396 else if (end < 0) { \ 8397 end += len; \ 8398 if (end < 0) \ 8399 end = 0; \ 8400 } \ 8401 if (start < 0) { \ 8402 start += len; \ 8403 if (start < 0) \ 8404 start = 0; \ 8405 } 8406 8407Py_ssize_t 8408PyUnicode_Count(PyObject *str, 8409 PyObject *substr, 8410 Py_ssize_t start, 8411 Py_ssize_t end) 8412{ 8413 Py_ssize_t result; 8414 PyUnicodeObject* str_obj; 8415 PyUnicodeObject* sub_obj; 8416 int kind1, kind2, kind; 8417 void *buf1 = NULL, *buf2 = NULL; 8418 Py_ssize_t len1, len2; 8419 8420 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str); 8421 if (!str_obj || PyUnicode_READY(str_obj) == -1) 8422 return -1; 8423 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr); 8424 if (!sub_obj || PyUnicode_READY(sub_obj) == -1) { 8425 Py_DECREF(str_obj); 8426 return -1; 8427 } 8428 8429 kind1 = PyUnicode_KIND(str_obj); 8430 kind2 = PyUnicode_KIND(sub_obj); 8431 kind = kind1 > kind2 ? 
kind1 : kind2; 8432 buf1 = PyUnicode_DATA(str_obj); 8433 if (kind1 != kind) 8434 buf1 = _PyUnicode_AsKind((PyObject*)str_obj, kind); 8435 if (!buf1) 8436 goto onError; 8437 buf2 = PyUnicode_DATA(sub_obj); 8438 if (kind2 != kind) 8439 buf2 = _PyUnicode_AsKind((PyObject*)sub_obj, kind); 8440 if (!buf2) 8441 goto onError; 8442 len1 = PyUnicode_GET_LENGTH(str_obj); 8443 len2 = PyUnicode_GET_LENGTH(sub_obj); 8444 8445 ADJUST_INDICES(start, end, len1); 8446 switch(kind) { 8447 case PyUnicode_1BYTE_KIND: 8448 result = ucs1lib_count( 8449 ((Py_UCS1*)buf1) + start, end - start, 8450 buf2, len2, PY_SSIZE_T_MAX 8451 ); 8452 break; 8453 case PyUnicode_2BYTE_KIND: 8454 result = ucs2lib_count( 8455 ((Py_UCS2*)buf1) + start, end - start, 8456 buf2, len2, PY_SSIZE_T_MAX 8457 ); 8458 break; 8459 case PyUnicode_4BYTE_KIND: 8460 result = ucs4lib_count( 8461 ((Py_UCS4*)buf1) + start, end - start, 8462 buf2, len2, PY_SSIZE_T_MAX 8463 ); 8464 break; 8465 default: 8466 assert(0); result = 0; 8467 } 8468 8469 Py_DECREF(sub_obj); 8470 Py_DECREF(str_obj); 8471 8472 if (kind1 != kind) 8473 PyMem_Free(buf1); 8474 if (kind2 != kind) 8475 PyMem_Free(buf2); 8476 8477 return result; 8478 onError: 8479 Py_DECREF(sub_obj); 8480 Py_DECREF(str_obj); 8481 if (kind1 != kind && buf1) 8482 PyMem_Free(buf1); 8483 if (kind2 != kind && buf2) 8484 PyMem_Free(buf2); 8485 return -1; 8486} 8487 8488Py_ssize_t 8489PyUnicode_Find(PyObject *str, 8490 PyObject *sub, 8491 Py_ssize_t start, 8492 Py_ssize_t end, 8493 int direction) 8494{ 8495 Py_ssize_t result; 8496 8497 str = PyUnicode_FromObject(str); 8498 if (!str || PyUnicode_READY(str) == -1) 8499 return -2; 8500 sub = PyUnicode_FromObject(sub); 8501 if (!sub || PyUnicode_READY(sub) == -1) { 8502 Py_DECREF(str); 8503 return -2; 8504 } 8505 8506 if (direction > 0) 8507 result = any_find_slice( 8508 ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice, 8509 str, sub, start, end 8510 ); 8511 else 8512 result = any_find_slice( 8513 ucs1lib_rfind_slice, 
ucs2lib_rfind_slice, ucs4lib_rfind_slice, 8514 str, sub, start, end 8515 ); 8516 8517 Py_DECREF(str); 8518 Py_DECREF(sub); 8519 8520 return result; 8521} 8522 8523Py_ssize_t 8524PyUnicode_FindChar(PyObject *str, Py_UCS4 ch, 8525 Py_ssize_t start, Py_ssize_t end, 8526 int direction) 8527{ 8528 char *result; 8529 int kind; 8530 if (PyUnicode_READY(str) == -1) 8531 return -2; 8532 if (start < 0 || end < 0) { 8533 PyErr_SetString(PyExc_IndexError, "string index out of range"); 8534 return -2; 8535 } 8536 if (end > PyUnicode_GET_LENGTH(str)) 8537 end = PyUnicode_GET_LENGTH(str); 8538 kind = PyUnicode_KIND(str); 8539 result = findchar(PyUnicode_1BYTE_DATA(str) 8540 + PyUnicode_KIND_SIZE(kind, start), 8541 kind, 8542 end-start, ch, direction); 8543 if (!result) 8544 return -1; 8545 return (result-(char*)PyUnicode_DATA(str)) >> (kind-1); 8546} 8547 8548static int 8549tailmatch(PyUnicodeObject *self, 8550 PyUnicodeObject *substring, 8551 Py_ssize_t start, 8552 Py_ssize_t end, 8553 int direction) 8554{ 8555 int kind_self; 8556 int kind_sub; 8557 void *data_self; 8558 void *data_sub; 8559 Py_ssize_t offset; 8560 Py_ssize_t i; 8561 Py_ssize_t end_sub; 8562 8563 if (PyUnicode_READY(self) == -1 || 8564 PyUnicode_READY(substring) == -1) 8565 return 0; 8566 8567 if (PyUnicode_GET_LENGTH(substring) == 0) 8568 return 1; 8569 8570 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self)); 8571 end -= PyUnicode_GET_LENGTH(substring); 8572 if (end < start) 8573 return 0; 8574 8575 kind_self = PyUnicode_KIND(self); 8576 data_self = PyUnicode_DATA(self); 8577 kind_sub = PyUnicode_KIND(substring); 8578 data_sub = PyUnicode_DATA(substring); 8579 end_sub = PyUnicode_GET_LENGTH(substring) - 1; 8580 8581 if (direction > 0) 8582 offset = end; 8583 else 8584 offset = start; 8585 8586 if (PyUnicode_READ(kind_self, data_self, offset) == 8587 PyUnicode_READ(kind_sub, data_sub, 0) && 8588 PyUnicode_READ(kind_self, data_self, offset + end_sub) == 8589 PyUnicode_READ(kind_sub, data_sub, end_sub)) { 8590 
/* If both are of the same kind, memcmp is sufficient */ 8591 if (kind_self == kind_sub) { 8592 return ! memcmp((char *)data_self + 8593 (offset * PyUnicode_CHARACTER_SIZE(substring)), 8594 data_sub, 8595 PyUnicode_GET_LENGTH(substring) * 8596 PyUnicode_CHARACTER_SIZE(substring)); 8597 } 8598 /* otherwise we have to compare each character by first accesing it */ 8599 else { 8600 /* We do not need to compare 0 and len(substring)-1 because 8601 the if statement above ensured already that they are equal 8602 when we end up here. */ 8603 // TODO: honor direction and do a forward or backwards search 8604 for (i = 1; i < end_sub; ++i) { 8605 if (PyUnicode_READ(kind_self, data_self, offset + i) != 8606 PyUnicode_READ(kind_sub, data_sub, i)) 8607 return 0; 8608 } 8609 return 1; 8610 } 8611 } 8612 8613 return 0; 8614} 8615 8616Py_ssize_t 8617PyUnicode_Tailmatch(PyObject *str, 8618 PyObject *substr, 8619 Py_ssize_t start, 8620 Py_ssize_t end, 8621 int direction) 8622{ 8623 Py_ssize_t result; 8624 8625 str = PyUnicode_FromObject(str); 8626 if (str == NULL) 8627 return -1; 8628 substr = PyUnicode_FromObject(substr); 8629 if (substr == NULL) { 8630 Py_DECREF(str); 8631 return -1; 8632 } 8633 8634 result = tailmatch((PyUnicodeObject *)str, 8635 (PyUnicodeObject *)substr, 8636 start, end, direction); 8637 Py_DECREF(str); 8638 Py_DECREF(substr); 8639 return result; 8640} 8641 8642/* Apply fixfct filter to the Unicode object self and return a 8643 reference to the modified object */ 8644 8645static PyObject * 8646fixup(PyUnicodeObject *self, 8647 Py_UCS4 (*fixfct)(PyUnicodeObject *s)) 8648{ 8649 PyObject *u; 8650 Py_UCS4 maxchar_old, maxchar_new = 0; 8651 8652 if (PyUnicode_READY(self) == -1) 8653 return NULL; 8654 maxchar_old = PyUnicode_MAX_CHAR_VALUE(self); 8655 u = PyUnicode_New(PyUnicode_GET_LENGTH(self), 8656 maxchar_old); 8657 if (u == NULL) 8658 return NULL; 8659 8660 Py_MEMCPY(PyUnicode_1BYTE_DATA(u), PyUnicode_1BYTE_DATA(self), 8661 PyUnicode_GET_LENGTH(u) * 
PyUnicode_CHARACTER_SIZE(u)); 8662 8663 /* fix functions return the new maximum character in a string, 8664 if the kind of the resulting unicode object does not change, 8665 everything is fine. Otherwise we need to change the string kind 8666 and re-run the fix function. */ 8667 maxchar_new = fixfct((PyUnicodeObject*)u); 8668 if (maxchar_new == 0) 8669 /* do nothing, keep maxchar_new at 0 which means no changes. */; 8670 else if (maxchar_new <= 127) 8671 maxchar_new = 127; 8672 else if (maxchar_new <= 255) 8673 maxchar_new = 255; 8674 else if (maxchar_new <= 65535) 8675 maxchar_new = 65535; 8676 else 8677 maxchar_new = 1114111; /* 0x10ffff */ 8678 8679 if (!maxchar_new && PyUnicode_CheckExact(self)) { 8680 /* fixfct should return TRUE if it modified the buffer. If 8681 FALSE, return a reference to the original buffer instead 8682 (to save space, not time) */ 8683 Py_INCREF(self); 8684 Py_DECREF(u); 8685 return (PyObject*) self; 8686 } 8687 else if (maxchar_new == maxchar_old) { 8688 return u; 8689 } 8690 else { 8691 /* In case the maximum character changed, we need to 8692 convert the string to the new category. */ 8693 PyObject *v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new); 8694 if (v == NULL) { 8695 Py_DECREF(u); 8696 return NULL; 8697 } 8698 if (maxchar_new > maxchar_old) { 8699 /* If the maxchar increased so that the kind changed, not all 8700 characters are representable anymore and we need to fix the 8701 string again. This only happens in very few cases. 
*/ 8702 if (PyUnicode_CopyCharacters(v, 0, 8703 (PyObject*)self, 0, 8704 PyUnicode_GET_LENGTH(self)) < 0) 8705 { 8706 Py_DECREF(u); 8707 return NULL; 8708 } 8709 maxchar_old = fixfct((PyUnicodeObject*)v); 8710 assert(maxchar_old > 0 && maxchar_old <= maxchar_new); 8711 } 8712 else { 8713 if (PyUnicode_CopyCharacters(v, 0, 8714 u, 0, 8715 PyUnicode_GET_LENGTH(self)) < 0) 8716 { 8717 Py_DECREF(u); 8718 return NULL; 8719 } 8720 } 8721 8722 Py_DECREF(u); 8723 return v; 8724 } 8725} 8726 8727static Py_UCS4 8728fixupper(PyUnicodeObject *self) 8729{ 8730 /* No need to call PyUnicode_READY(self) because this function is only 8731 called as a callback from fixup() which does it already. */ 8732 const Py_ssize_t len = PyUnicode_GET_LENGTH(self); 8733 const int kind = PyUnicode_KIND(self); 8734 void *data = PyUnicode_DATA(self); 8735 int touched = 0; 8736 Py_UCS4 maxchar = 0; 8737 Py_ssize_t i; 8738 8739 for (i = 0; i < len; ++i) { 8740 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); 8741 const Py_UCS4 up = Py_UNICODE_TOUPPER(ch); 8742 if (up != ch) { 8743 if (up > maxchar) 8744 maxchar = up; 8745 PyUnicode_WRITE(kind, data, i, up); 8746 touched = 1; 8747 } 8748 else if (ch > maxchar) 8749 maxchar = ch; 8750 } 8751 8752 if (touched) 8753 return maxchar; 8754 else 8755 return 0; 8756} 8757 8758static Py_UCS4 8759fixlower(PyUnicodeObject *self) 8760{ 8761 /* No need to call PyUnicode_READY(self) because fixup() which does it. 
*/ 8762 const Py_ssize_t len = PyUnicode_GET_LENGTH(self); 8763 const int kind = PyUnicode_KIND(self); 8764 void *data = PyUnicode_DATA(self); 8765 int touched = 0; 8766 Py_UCS4 maxchar = 0; 8767 Py_ssize_t i; 8768 8769 for(i = 0; i < len; ++i) { 8770 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); 8771 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch); 8772 if (lo != ch) { 8773 if (lo > maxchar) 8774 maxchar = lo; 8775 PyUnicode_WRITE(kind, data, i, lo); 8776 touched = 1; 8777 } 8778 else if (ch > maxchar) 8779 maxchar = ch; 8780 } 8781 8782 if (touched) 8783 return maxchar; 8784 else 8785 return 0; 8786} 8787 8788static Py_UCS4 8789fixswapcase(PyUnicodeObject *self) 8790{ 8791 /* No need to call PyUnicode_READY(self) because fixup() which does it. */ 8792 const Py_ssize_t len = PyUnicode_GET_LENGTH(self); 8793 const int kind = PyUnicode_KIND(self); 8794 void *data = PyUnicode_DATA(self); 8795 int touched = 0; 8796 Py_UCS4 maxchar = 0; 8797 Py_ssize_t i; 8798 8799 for(i = 0; i < len; ++i) { 8800 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); 8801 Py_UCS4 nu = 0; 8802 8803 if (Py_UNICODE_ISUPPER(ch)) 8804 nu = Py_UNICODE_TOLOWER(ch); 8805 else if (Py_UNICODE_ISLOWER(ch)) 8806 nu = Py_UNICODE_TOUPPER(ch); 8807 8808 if (nu != 0) { 8809 if (nu > maxchar) 8810 maxchar = nu; 8811 PyUnicode_WRITE(kind, data, i, nu); 8812 touched = 1; 8813 } 8814 else if (ch > maxchar) 8815 maxchar = ch; 8816 } 8817 8818 if (touched) 8819 return maxchar; 8820 else 8821 return 0; 8822} 8823 8824static Py_UCS4 8825fixcapitalize(PyUnicodeObject *self) 8826{ 8827 /* No need to call PyUnicode_READY(self) because fixup() which does it. 
*/ 8828 const Py_ssize_t len = PyUnicode_GET_LENGTH(self); 8829 const int kind = PyUnicode_KIND(self); 8830 void *data = PyUnicode_DATA(self); 8831 int touched = 0; 8832 Py_UCS4 maxchar = 0; 8833 Py_ssize_t i = 0; 8834 Py_UCS4 ch; 8835 8836 if (len == 0) 8837 return 0; 8838 8839 ch = PyUnicode_READ(kind, data, i); 8840 if (!Py_UNICODE_ISUPPER(ch)) { 8841 maxchar = Py_UNICODE_TOUPPER(ch); 8842 PyUnicode_WRITE(kind, data, i, maxchar); 8843 touched = 1; 8844 } 8845 ++i; 8846 for(; i < len; ++i) { 8847 ch = PyUnicode_READ(kind, data, i); 8848 if (!Py_UNICODE_ISLOWER(ch)) { 8849 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch); 8850 if (lo > maxchar) 8851 maxchar = lo; 8852 PyUnicode_WRITE(kind, data, i, lo); 8853 touched = 1; 8854 } 8855 else if (ch > maxchar) 8856 maxchar = ch; 8857 } 8858 8859 if (touched) 8860 return maxchar; 8861 else 8862 return 0; 8863} 8864 8865static Py_UCS4 8866fixtitle(PyUnicodeObject *self) 8867{ 8868 /* No need to call PyUnicode_READY(self) because fixup() which does it. 
*/ 8869 const Py_ssize_t len = PyUnicode_GET_LENGTH(self); 8870 const int kind = PyUnicode_KIND(self); 8871 void *data = PyUnicode_DATA(self); 8872 Py_UCS4 maxchar = 0; 8873 Py_ssize_t i = 0; 8874 int previous_is_cased; 8875 8876 /* Shortcut for single character strings */ 8877 if (len == 1) { 8878 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); 8879 const Py_UCS4 ti = Py_UNICODE_TOTITLE(ch); 8880 if (ti != ch) { 8881 PyUnicode_WRITE(kind, data, i, ti); 8882 return ti; 8883 } 8884 else 8885 return 0; 8886 } 8887 previous_is_cased = 0; 8888 for(; i < len; ++i) { 8889 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); 8890 Py_UCS4 nu; 8891 8892 if (previous_is_cased) 8893 nu = Py_UNICODE_TOLOWER(ch); 8894 else 8895 nu = Py_UNICODE_TOTITLE(ch); 8896 8897 if (nu > maxchar) 8898 maxchar = nu; 8899 PyUnicode_WRITE(kind, data, i, nu); 8900 8901 if (Py_UNICODE_ISLOWER(ch) || 8902 Py_UNICODE_ISUPPER(ch) || 8903 Py_UNICODE_ISTITLE(ch)) 8904 previous_is_cased = 1; 8905 else 8906 previous_is_cased = 0; 8907 } 8908 return maxchar; 8909} 8910 8911PyObject * 8912PyUnicode_Join(PyObject *separator, PyObject *seq) 8913{ 8914 PyObject *sep = NULL; 8915 Py_ssize_t seplen = 1; 8916 PyObject *res = NULL; /* the result */ 8917 PyObject *fseq; /* PySequence_Fast(seq) */ 8918 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */ 8919 PyObject **items; 8920 PyObject *item; 8921 Py_ssize_t sz, i, res_offset; 8922 Py_UCS4 maxchar = 0; 8923 Py_UCS4 item_maxchar; 8924 8925 fseq = PySequence_Fast(seq, ""); 8926 if (fseq == NULL) { 8927 return NULL; 8928 } 8929 8930 /* NOTE: the following code can't call back into Python code, 8931 * so we are sure that fseq won't be mutated. 8932 */ 8933 8934 seqlen = PySequence_Fast_GET_SIZE(fseq); 8935 /* If empty sequence, return u"". */ 8936 if (seqlen == 0) { 8937 res = PyUnicode_New(0, 0); 8938 goto Done; 8939 } 8940 items = PySequence_Fast_ITEMS(fseq); 8941 /* If singleton sequence with an exact Unicode, return that. 
*/ 8942 if (seqlen == 1) { 8943 item = items[0]; 8944 if (PyUnicode_CheckExact(item)) { 8945 Py_INCREF(item); 8946 res = item; 8947 goto Done; 8948 } 8949 } 8950 else { 8951 /* Set up sep and seplen */ 8952 if (separator == NULL) { 8953 /* fall back to a blank space separator */ 8954 sep = PyUnicode_FromOrdinal(' '); 8955 if (!sep) 8956 goto onError; 8957 } 8958 else { 8959 if (!PyUnicode_Check(separator)) { 8960 PyErr_Format(PyExc_TypeError, 8961 "separator: expected str instance," 8962 " %.80s found", 8963 Py_TYPE(separator)->tp_name); 8964 goto onError; 8965 } 8966 if (PyUnicode_READY(separator)) 8967 goto onError; 8968 sep = separator; 8969 seplen = PyUnicode_GET_LENGTH(separator); 8970 maxchar = PyUnicode_MAX_CHAR_VALUE(separator); 8971 /* inc refcount to keep this code path symetric with the 8972 above case of a blank separator */ 8973 Py_INCREF(sep); 8974 } 8975 } 8976 8977 /* There are at least two things to join, or else we have a subclass 8978 * of str in the sequence. 8979 * Do a pre-pass to figure out the total amount of space we'll 8980 * need (sz), and see whether all argument are strings. 8981 */ 8982 sz = 0; 8983 for (i = 0; i < seqlen; i++) { 8984 const Py_ssize_t old_sz = sz; 8985 item = items[i]; 8986 if (!PyUnicode_Check(item)) { 8987 PyErr_Format(PyExc_TypeError, 8988 "sequence item %zd: expected str instance," 8989 " %.80s found", 8990 i, Py_TYPE(item)->tp_name); 8991 goto onError; 8992 } 8993 if (PyUnicode_READY(item) == -1) 8994 goto onError; 8995 sz += PyUnicode_GET_LENGTH(item); 8996 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item); 8997 if (item_maxchar > maxchar) 8998 maxchar = item_maxchar; 8999 if (i != 0) 9000 sz += seplen; 9001 if (sz < old_sz || sz > PY_SSIZE_T_MAX) { 9002 PyErr_SetString(PyExc_OverflowError, 9003 "join() result is too long for a Python string"); 9004 goto onError; 9005 } 9006 } 9007 9008 res = PyUnicode_New(sz, maxchar); 9009 if (res == NULL) 9010 goto onError; 9011 9012 /* Catenate everything. 
*/ 9013 for (i = 0, res_offset = 0; i < seqlen; ++i) { 9014 Py_ssize_t itemlen, copied; 9015 item = items[i]; 9016 /* Copy item, and maybe the separator. */ 9017 if (i && seplen != 0) { 9018 copied = PyUnicode_CopyCharacters(res, res_offset, 9019 sep, 0, seplen); 9020 if (copied < 0) 9021 goto onError; 9022#ifdef Py_DEBUG 9023 res_offset += copied; 9024#else 9025 res_offset += seplen; 9026#endif 9027 } 9028 itemlen = PyUnicode_GET_LENGTH(item); 9029 if (itemlen != 0) { 9030 copied = PyUnicode_CopyCharacters(res, res_offset, 9031 item, 0, itemlen); 9032 if (copied < 0) 9033 goto onError; 9034#ifdef Py_DEBUG 9035 res_offset += copied; 9036#else 9037 res_offset += itemlen; 9038#endif 9039 } 9040 } 9041 assert(res_offset == PyUnicode_GET_LENGTH(res)); 9042 9043 Done: 9044 Py_DECREF(fseq); 9045 Py_XDECREF(sep); 9046 return res; 9047 9048 onError: 9049 Py_DECREF(fseq); 9050 Py_XDECREF(sep); 9051 Py_XDECREF(res); 9052 return NULL; 9053} 9054 9055#define FILL(kind, data, value, start, length) \ 9056 do { \ 9057 Py_ssize_t i_ = 0; \ 9058 assert(kind != PyUnicode_WCHAR_KIND); \ 9059 switch ((kind)) { \ 9060 case PyUnicode_1BYTE_KIND: { \ 9061 unsigned char * to_ = (unsigned char *)((data)) + (start); \ 9062 memset(to_, (unsigned char)value, length); \ 9063 break; \ 9064 } \ 9065 case PyUnicode_2BYTE_KIND: { \ 9066 Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \ 9067 for (; i_ < (length); ++i_, ++to_) *to_ = (value); \ 9068 break; \ 9069 } \ 9070 default: { \ 9071 Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \ 9072 for (; i_ < (length); ++i_, ++to_) *to_ = (value); \ 9073 break; \ 9074 } \ 9075 } \ 9076 } while (0) 9077 9078static PyUnicodeObject * 9079pad(PyUnicodeObject *self, 9080 Py_ssize_t left, 9081 Py_ssize_t right, 9082 Py_UCS4 fill) 9083{ 9084 PyObject *u; 9085 Py_UCS4 maxchar; 9086 int kind; 9087 void *data; 9088 9089 if (left < 0) 9090 left = 0; 9091 if (right < 0) 9092 right = 0; 9093 9094 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) { 9095 
Py_INCREF(self); 9096 return self; 9097 } 9098 9099 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) || 9100 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) { 9101 PyErr_SetString(PyExc_OverflowError, "padded string is too long"); 9102 return NULL; 9103 } 9104 maxchar = PyUnicode_MAX_CHAR_VALUE(self); 9105 if (fill > maxchar) 9106 maxchar = fill; 9107 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar); 9108 if (!u) 9109 return NULL; 9110 9111 kind = PyUnicode_KIND(u); 9112 data = PyUnicode_DATA(u); 9113 if (left) 9114 FILL(kind, data, fill, 0, left); 9115 if (right) 9116 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right); 9117 if (PyUnicode_CopyCharacters(u, left, 9118 (PyObject*)self, 0, 9119 _PyUnicode_LENGTH(self)) < 0) 9120 { 9121 Py_DECREF(u); 9122 return NULL; 9123 } 9124 9125 return (PyUnicodeObject*)u; 9126} 9127#undef FILL 9128 9129PyObject * 9130PyUnicode_Splitlines(PyObject *string, int keepends) 9131{ 9132 PyObject *list; 9133 9134 string = PyUnicode_FromObject(string); 9135 if (string == NULL || PyUnicode_READY(string) == -1) 9136 return NULL; 9137 9138 switch(PyUnicode_KIND(string)) { 9139 case PyUnicode_1BYTE_KIND: 9140 list = ucs1lib_splitlines( 9141 (PyObject*) string, PyUnicode_1BYTE_DATA(string), 9142 PyUnicode_GET_LENGTH(string), keepends); 9143 break; 9144 case PyUnicode_2BYTE_KIND: 9145 list = ucs2lib_splitlines( 9146 (PyObject*) string, PyUnicode_2BYTE_DATA(string), 9147 PyUnicode_GET_LENGTH(string), keepends); 9148 break; 9149 case PyUnicode_4BYTE_KIND: 9150 list = ucs4lib_splitlines( 9151 (PyObject*) string, PyUnicode_4BYTE_DATA(string), 9152 PyUnicode_GET_LENGTH(string), keepends); 9153 break; 9154 default: 9155 assert(0); 9156 list = 0; 9157 } 9158 Py_DECREF(string); 9159 return list; 9160} 9161 9162static PyObject * 9163split(PyUnicodeObject *self, 9164 PyUnicodeObject *substring, 9165 Py_ssize_t maxcount) 9166{ 9167 int kind1, kind2, kind; 9168 void *buf1, *buf2; 9169 Py_ssize_t len1, len2; 9170 
PyObject* out; 9171 9172 if (maxcount < 0) 9173 maxcount = PY_SSIZE_T_MAX; 9174 9175 if (PyUnicode_READY(self) == -1) 9176 return NULL; 9177 9178 if (substring == NULL) 9179 switch(PyUnicode_KIND(self)) { 9180 case PyUnicode_1BYTE_KIND: 9181 return ucs1lib_split_whitespace( 9182 (PyObject*) self, PyUnicode_1BYTE_DATA(self), 9183 PyUnicode_GET_LENGTH(self), maxcount 9184 ); 9185 case PyUnicode_2BYTE_KIND: 9186 return ucs2lib_split_whitespace( 9187 (PyObject*) self, PyUnicode_2BYTE_DATA(self), 9188 PyUnicode_GET_LENGTH(self), maxcount 9189 ); 9190 case PyUnicode_4BYTE_KIND: 9191 return ucs4lib_split_whitespace( 9192 (PyObject*) self, PyUnicode_4BYTE_DATA(self), 9193 PyUnicode_GET_LENGTH(self), maxcount 9194 ); 9195 default: 9196 assert(0); 9197 return NULL; 9198 } 9199 9200 if (PyUnicode_READY(substring) == -1) 9201 return NULL; 9202 9203 kind1 = PyUnicode_KIND(self); 9204 kind2 = PyUnicode_KIND(substring); 9205 kind = kind1 > kind2 ? kind1 : kind2; 9206 buf1 = PyUnicode_DATA(self); 9207 buf2 = PyUnicode_DATA(substring); 9208 if (kind1 != kind) 9209 buf1 = _PyUnicode_AsKind((PyObject*)self, kind); 9210 if (!buf1) 9211 return NULL; 9212 if (kind2 != kind) 9213 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind); 9214 if (!buf2) { 9215 if (kind1 != kind) PyMem_Free(buf1); 9216 return NULL; 9217 } 9218 len1 = PyUnicode_GET_LENGTH(self); 9219 len2 = PyUnicode_GET_LENGTH(substring); 9220 9221 switch(kind) { 9222 case PyUnicode_1BYTE_KIND: 9223 out = ucs1lib_split( 9224 (PyObject*) self, buf1, len1, buf2, len2, maxcount); 9225 break; 9226 case PyUnicode_2BYTE_KIND: 9227 out = ucs2lib_split( 9228 (PyObject*) self, buf1, len1, buf2, len2, maxcount); 9229 break; 9230 case PyUnicode_4BYTE_KIND: 9231 out = ucs4lib_split( 9232 (PyObject*) self, buf1, len1, buf2, len2, maxcount); 9233 break; 9234 default: 9235 out = NULL; 9236 } 9237 if (kind1 != kind) 9238 PyMem_Free(buf1); 9239 if (kind2 != kind) 9240 PyMem_Free(buf2); 9241 return out; 9242} 9243 9244static PyObject * 
9245rsplit(PyUnicodeObject *self, 9246 PyUnicodeObject *substring, 9247 Py_ssize_t maxcount) 9248{ 9249 int kind1, kind2, kind; 9250 void *buf1, *buf2; 9251 Py_ssize_t len1, len2; 9252 PyObject* out; 9253 9254 if (maxcount < 0) 9255 maxcount = PY_SSIZE_T_MAX; 9256 9257 if (PyUnicode_READY(self) == -1) 9258 return NULL; 9259 9260 if (substring == NULL) 9261 switch(PyUnicode_KIND(self)) { 9262 case PyUnicode_1BYTE_KIND: 9263 return ucs1lib_rsplit_whitespace( 9264 (PyObject*) self, PyUnicode_1BYTE_DATA(self), 9265 PyUnicode_GET_LENGTH(self), maxcount 9266 ); 9267 case PyUnicode_2BYTE_KIND: 9268 return ucs2lib_rsplit_whitespace( 9269 (PyObject*) self, PyUnicode_2BYTE_DATA(self), 9270 PyUnicode_GET_LENGTH(self), maxcount 9271 ); 9272 case PyUnicode_4BYTE_KIND: 9273 return ucs4lib_rsplit_whitespace( 9274 (PyObject*) self, PyUnicode_4BYTE_DATA(self), 9275 PyUnicode_GET_LENGTH(self), maxcount 9276 ); 9277 default: 9278 assert(0); 9279 return NULL; 9280 } 9281 9282 if (PyUnicode_READY(substring) == -1) 9283 return NULL; 9284 9285 kind1 = PyUnicode_KIND(self); 9286 kind2 = PyUnicode_KIND(substring); 9287 kind = kind1 > kind2 ? 
kind1 : kind2; 9288 buf1 = PyUnicode_DATA(self); 9289 buf2 = PyUnicode_DATA(substring); 9290 if (kind1 != kind) 9291 buf1 = _PyUnicode_AsKind((PyObject*)self, kind); 9292 if (!buf1) 9293 return NULL; 9294 if (kind2 != kind) 9295 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind); 9296 if (!buf2) { 9297 if (kind1 != kind) PyMem_Free(buf1); 9298 return NULL; 9299 } 9300 len1 = PyUnicode_GET_LENGTH(self); 9301 len2 = PyUnicode_GET_LENGTH(substring); 9302 9303 switch(kind) { 9304 case PyUnicode_1BYTE_KIND: 9305 out = ucs1lib_rsplit( 9306 (PyObject*) self, buf1, len1, buf2, len2, maxcount); 9307 break; 9308 case PyUnicode_2BYTE_KIND: 9309 out = ucs2lib_rsplit( 9310 (PyObject*) self, buf1, len1, buf2, len2, maxcount); 9311 break; 9312 case PyUnicode_4BYTE_KIND: 9313 out = ucs4lib_rsplit( 9314 (PyObject*) self, buf1, len1, buf2, len2, maxcount); 9315 break; 9316 default: 9317 out = NULL; 9318 } 9319 if (kind1 != kind) 9320 PyMem_Free(buf1); 9321 if (kind2 != kind) 9322 PyMem_Free(buf2); 9323 return out; 9324} 9325 9326static Py_ssize_t 9327anylib_find(int kind, void *buf1, Py_ssize_t len1, 9328 void *buf2, Py_ssize_t len2, Py_ssize_t offset) 9329{ 9330 switch(kind) { 9331 case PyUnicode_1BYTE_KIND: 9332 return ucs1lib_find(buf1, len1, buf2, len2, offset); 9333 case PyUnicode_2BYTE_KIND: 9334 return ucs2lib_find(buf1, len1, buf2, len2, offset); 9335 case PyUnicode_4BYTE_KIND: 9336 return ucs4lib_find(buf1, len1, buf2, len2, offset); 9337 } 9338 assert(0); 9339 return -1; 9340} 9341 9342static Py_ssize_t 9343anylib_count(int kind, void* sbuf, Py_ssize_t slen, 9344 void *buf1, Py_ssize_t len1, Py_ssize_t maxcount) 9345{ 9346 switch(kind) { 9347 case PyUnicode_1BYTE_KIND: 9348 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount); 9349 case PyUnicode_2BYTE_KIND: 9350 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount); 9351 case PyUnicode_4BYTE_KIND: 9352 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount); 9353 } 9354 assert(0); 9355 return 0; 9356} 9357 9358static 
PyObject * 9359replace(PyObject *self, PyObject *str1, 9360 PyObject *str2, Py_ssize_t maxcount) 9361{ 9362 PyObject *u; 9363 char *sbuf = PyUnicode_DATA(self); 9364 char *buf1 = PyUnicode_DATA(str1); 9365 char *buf2 = PyUnicode_DATA(str2); 9366 int srelease = 0, release1 = 0, release2 = 0; 9367 int skind = PyUnicode_KIND(self); 9368 int kind1 = PyUnicode_KIND(str1); 9369 int kind2 = PyUnicode_KIND(str2); 9370 Py_ssize_t slen = PyUnicode_GET_LENGTH(self); 9371 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1); 9372 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2); 9373 9374 if (maxcount < 0) 9375 maxcount = PY_SSIZE_T_MAX; 9376 else if (maxcount == 0 || slen == 0) 9377 goto nothing; 9378 9379 if (skind < kind1) 9380 /* substring too wide to be present */ 9381 goto nothing; 9382 9383 if (len1 == len2) { 9384 Py_ssize_t i; 9385 /* same length */ 9386 if (len1 == 0) 9387 goto nothing; 9388 if (len1 == 1) { 9389 /* replace characters */ 9390 Py_UCS4 u1, u2, maxchar; 9391 int mayshrink, rkind; 9392 u1 = PyUnicode_READ_CHAR(str1, 0); 9393 if (!findchar(sbuf, PyUnicode_KIND(self), 9394 slen, u1, 1)) 9395 goto nothing; 9396 u2 = PyUnicode_READ_CHAR(str2, 0); 9397 maxchar = PyUnicode_MAX_CHAR_VALUE(self); 9398 /* Replacing u1 with u2 may cause a maxchar reduction in the 9399 result string. 
*/ 9400 if (u2 > maxchar) { 9401 maxchar = u2; 9402 mayshrink = 0; 9403 } 9404 else 9405 mayshrink = maxchar > 127; 9406 u = PyUnicode_New(slen, maxchar); 9407 if (!u) 9408 goto error; 9409 if (PyUnicode_CopyCharacters(u, 0, 9410 (PyObject*)self, 0, slen) < 0) 9411 { 9412 Py_DECREF(u); 9413 return NULL; 9414 } 9415 rkind = PyUnicode_KIND(u); 9416 for (i = 0; i < PyUnicode_GET_LENGTH(u); i++) 9417 if (PyUnicode_READ(rkind, PyUnicode_DATA(u), i) == u1) { 9418 if (--maxcount < 0) 9419 break; 9420 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), i, u2); 9421 } 9422 if (mayshrink) { 9423 PyObject *tmp = u; 9424 u = PyUnicode_FromKindAndData(rkind, PyUnicode_DATA(tmp), 9425 PyUnicode_GET_LENGTH(tmp)); 9426 Py_DECREF(tmp); 9427 } 9428 } else { 9429 int rkind = skind; 9430 char *res; 9431 if (kind1 < rkind) { 9432 /* widen substring */ 9433 buf1 = _PyUnicode_AsKind(str1, rkind); 9434 if (!buf1) goto error; 9435 release1 = 1; 9436 } 9437 i = anylib_find(rkind, sbuf, slen, buf1, len1, 0); 9438 if (i < 0) 9439 goto nothing; 9440 if (rkind > kind2) { 9441 /* widen replacement */ 9442 buf2 = _PyUnicode_AsKind(str2, rkind); 9443 if (!buf2) goto error; 9444 release2 = 1; 9445 } 9446 else if (rkind < kind2) { 9447 /* widen self and buf1 */ 9448 rkind = kind2; 9449 if (release1) PyMem_Free(buf1); 9450 sbuf = _PyUnicode_AsKind(self, rkind); 9451 if (!sbuf) goto error; 9452 srelease = 1; 9453 buf1 = _PyUnicode_AsKind(str1, rkind); 9454 if (!buf1) goto error; 9455 release1 = 1; 9456 } 9457 res = PyMem_Malloc(PyUnicode_KIND_SIZE(rkind, slen)); 9458 if (!res) { 9459 PyErr_NoMemory(); 9460 goto error; 9461 } 9462 memcpy(res, sbuf, PyUnicode_KIND_SIZE(rkind, slen)); 9463 /* change everything in-place, starting with this one */ 9464 memcpy(res + PyUnicode_KIND_SIZE(rkind, i), 9465 buf2, 9466 PyUnicode_KIND_SIZE(rkind, len2)); 9467 i += len1; 9468 9469 while ( --maxcount > 0) { 9470 i = anylib_find(rkind, sbuf+PyUnicode_KIND_SIZE(rkind, i), 9471 slen-i, 9472 buf1, len1, i); 9473 if (i == -1) 9474 
break; 9475 memcpy(res + PyUnicode_KIND_SIZE(rkind, i), 9476 buf2, 9477 PyUnicode_KIND_SIZE(rkind, len2)); 9478 i += len1; 9479 } 9480 9481 u = PyUnicode_FromKindAndData(rkind, res, slen); 9482 PyMem_Free(res); 9483 if (!u) goto error; 9484 } 9485 } else { 9486 9487 Py_ssize_t n, i, j, ires; 9488 Py_ssize_t product, new_size; 9489 int rkind = skind; 9490 char *res; 9491 9492 if (kind1 < rkind) { 9493 buf1 = _PyUnicode_AsKind(str1, rkind); 9494 if (!buf1) goto error; 9495 release1 = 1; 9496 } 9497 n = anylib_count(rkind, sbuf, slen, buf1, len1, maxcount); 9498 if (n == 0) 9499 goto nothing; 9500 if (kind2 < rkind) { 9501 buf2 = _PyUnicode_AsKind(str2, rkind); 9502 if (!buf2) goto error; 9503 release2 = 1; 9504 } 9505 else if (kind2 > rkind) { 9506 rkind = kind2; 9507 sbuf = _PyUnicode_AsKind(self, rkind); 9508 if (!sbuf) goto error; 9509 srelease = 1; 9510 if (release1) PyMem_Free(buf1); 9511 buf1 = _PyUnicode_AsKind(str1, rkind); 9512 if (!buf1) goto error; 9513 release1 = 1; 9514 } 9515 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) - 9516 PyUnicode_GET_LENGTH(str1))); */ 9517 product = n * (len2-len1); 9518 if ((product / (len2-len1)) != n) { 9519 PyErr_SetString(PyExc_OverflowError, 9520 "replace string is too long"); 9521 goto error; 9522 } 9523 new_size = slen + product; 9524 if (new_size < 0 || new_size > (PY_SSIZE_T_MAX >> (rkind-1))) { 9525 PyErr_SetString(PyExc_OverflowError, 9526 "replace string is too long"); 9527 goto error; 9528 } 9529 res = PyMem_Malloc(PyUnicode_KIND_SIZE(rkind, new_size)); 9530 if (!res) 9531 goto error; 9532 ires = i = 0; 9533 if (len1 > 0) { 9534 while (n-- > 0) { 9535 /* look for next match */ 9536 j = anylib_find(rkind, 9537 sbuf + PyUnicode_KIND_SIZE(rkind, i), 9538 slen-i, buf1, len1, i); 9539 if (j == -1) 9540 break; 9541 else if (j > i) { 9542 /* copy unchanged part [i:j] */ 9543 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires), 9544 sbuf + PyUnicode_KIND_SIZE(rkind, i), 9545 
PyUnicode_KIND_SIZE(rkind, j-i)); 9546 ires += j - i; 9547 } 9548 /* copy substitution string */ 9549 if (len2 > 0) { 9550 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires), 9551 buf2, 9552 PyUnicode_KIND_SIZE(rkind, len2)); 9553 ires += len2; 9554 } 9555 i = j + len1; 9556 } 9557 if (i < slen) 9558 /* copy tail [i:] */ 9559 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires), 9560 sbuf + PyUnicode_KIND_SIZE(rkind, i), 9561 PyUnicode_KIND_SIZE(rkind, slen-i)); 9562 } else { 9563 /* interleave */ 9564 while (n > 0) { 9565 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires), 9566 buf2, 9567 PyUnicode_KIND_SIZE(rkind, len2)); 9568 ires += len2; 9569 if (--n <= 0) 9570 break; 9571 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires), 9572 sbuf + PyUnicode_KIND_SIZE(rkind, i), 9573 PyUnicode_KIND_SIZE(rkind, 1)); 9574 ires++; 9575 i++; 9576 } 9577 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires), 9578 sbuf + PyUnicode_KIND_SIZE(rkind, i), 9579 PyUnicode_KIND_SIZE(rkind, slen-i)); 9580 } 9581 u = PyUnicode_FromKindAndData(rkind, res, new_size); 9582 PyMem_Free(res); 9583 } 9584 if (srelease) 9585 PyMem_FREE(sbuf); 9586 if (release1) 9587 PyMem_FREE(buf1); 9588 if (release2) 9589 PyMem_FREE(buf2); 9590 return u; 9591 9592 nothing: 9593 /* nothing to replace; return original string (when possible) */ 9594 if (srelease) 9595 PyMem_FREE(sbuf); 9596 if (release1) 9597 PyMem_FREE(buf1); 9598 if (release2) 9599 PyMem_FREE(buf2); 9600 if (PyUnicode_CheckExact(self)) { 9601 Py_INCREF(self); 9602 return (PyObject *) self; 9603 } 9604 return PyUnicode_Copy(self); 9605 error: 9606 if (srelease && sbuf) 9607 PyMem_FREE(sbuf); 9608 if (release1 && buf1) 9609 PyMem_FREE(buf1); 9610 if (release2 && buf2) 9611 PyMem_FREE(buf2); 9612 return NULL; 9613} 9614 9615/* --- Unicode Object Methods --------------------------------------------- */ 9616 9617PyDoc_STRVAR(title__doc__, 9618 "S.title() -> str\n\ 9619\n\ 9620Return a titlecased version of S, i.e. 
words start with title case\n\ 9621characters, all remaining cased characters have lower case."); 9622 9623static PyObject* 9624unicode_title(PyUnicodeObject *self) 9625{ 9626 return fixup(self, fixtitle); 9627} 9628 9629PyDoc_STRVAR(capitalize__doc__, 9630 "S.capitalize() -> str\n\ 9631\n\ 9632Return a capitalized version of S, i.e. make the first character\n\ 9633have upper case and the rest lower case."); 9634 9635static PyObject* 9636unicode_capitalize(PyUnicodeObject *self) 9637{ 9638 return fixup(self, fixcapitalize); 9639} 9640 9641#if 0 9642PyDoc_STRVAR(capwords__doc__, 9643 "S.capwords() -> str\n\ 9644\n\ 9645Apply .capitalize() to all words in S and return the result with\n\ 9646normalized whitespace (all whitespace strings are replaced by ' ')."); 9647 9648static PyObject* 9649unicode_capwords(PyUnicodeObject *self) 9650{ 9651 PyObject *list; 9652 PyObject *item; 9653 Py_ssize_t i; 9654 9655 /* Split into words */ 9656 list = split(self, NULL, -1); 9657 if (!list) 9658 return NULL; 9659 9660 /* Capitalize each word */ 9661 for (i = 0; i < PyList_GET_SIZE(list); i++) { 9662 item = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, i), 9663 fixcapitalize); 9664 if (item == NULL) 9665 goto onError; 9666 Py_DECREF(PyList_GET_ITEM(list, i)); 9667 PyList_SET_ITEM(list, i, item); 9668 } 9669 9670 /* Join the words to form a new string */ 9671 item = PyUnicode_Join(NULL, list); 9672 9673 onError: 9674 Py_DECREF(list); 9675 return (PyObject *)item; 9676} 9677#endif 9678 9679/* Argument converter. 
Coerces to a single unicode character */ 9680 9681static int 9682convert_uc(PyObject *obj, void *addr) 9683{ 9684 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr; 9685 PyObject *uniobj; 9686 9687 uniobj = PyUnicode_FromObject(obj); 9688 if (uniobj == NULL) { 9689 PyErr_SetString(PyExc_TypeError, 9690 "The fill character cannot be converted to Unicode"); 9691 return 0; 9692 } 9693 if (PyUnicode_GET_LENGTH(uniobj) != 1) { 9694 PyErr_SetString(PyExc_TypeError, 9695 "The fill character must be exactly one character long"); 9696 Py_DECREF(uniobj); 9697 return 0; 9698 } 9699 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0); 9700 Py_DECREF(uniobj); 9701 return 1; 9702} 9703 9704PyDoc_STRVAR(center__doc__, 9705 "S.center(width[, fillchar]) -> str\n\ 9706\n\ 9707Return S centered in a string of length width. Padding is\n\ 9708done using the specified fill character (default is a space)"); 9709 9710static PyObject * 9711unicode_center(PyUnicodeObject *self, PyObject *args) 9712{ 9713 Py_ssize_t marg, left; 9714 Py_ssize_t width; 9715 Py_UCS4 fillchar = ' '; 9716 9717 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar)) 9718 return NULL; 9719 9720 if (PyUnicode_READY(self) == -1) 9721 return NULL; 9722 9723 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) { 9724 Py_INCREF(self); 9725 return (PyObject*) self; 9726 } 9727 9728 marg = width - _PyUnicode_LENGTH(self); 9729 left = marg / 2 + (marg & width & 1); 9730 9731 return (PyObject*) pad(self, left, marg - left, fillchar); 9732} 9733 9734#if 0 9735 9736/* This code should go into some future Unicode collation support 9737 module. The basic comparison should compare ordinals on a naive 9738 basis (this is what Java does and thus Jython too). 
*/ 9739 9740/* speedy UTF-16 code point order comparison */ 9741/* gleaned from: */ 9742/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */ 9743 9744static short utf16Fixup[32] = 9745{ 9746 0, 0, 0, 0, 0, 0, 0, 0, 9747 0, 0, 0, 0, 0, 0, 0, 0, 9748 0, 0, 0, 0, 0, 0, 0, 0, 9749 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800 9750}; 9751 9752static int 9753unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2) 9754{ 9755 Py_ssize_t len1, len2; 9756 9757 Py_UNICODE *s1 = str1->str; 9758 Py_UNICODE *s2 = str2->str; 9759 9760 len1 = str1->_base._base.length; 9761 len2 = str2->_base._base.length; 9762 9763 while (len1 > 0 && len2 > 0) { 9764 Py_UNICODE c1, c2; 9765 9766 c1 = *s1++; 9767 c2 = *s2++; 9768 9769 if (c1 > (1<<11) * 26) 9770 c1 += utf16Fixup[c1>>11]; 9771 if (c2 > (1<<11) * 26) 9772 c2 += utf16Fixup[c2>>11]; 9773 /* now c1 and c2 are in UTF-32-compatible order */ 9774 9775 if (c1 != c2) 9776 return (c1 < c2) ? -1 : 1; 9777 9778 len1--; len2--; 9779 } 9780 9781 return (len1 < len2) ? -1 : (len1 != len2); 9782} 9783 9784#else 9785 9786/* This function assumes that str1 and str2 are readied by the caller. */ 9787 9788static int 9789unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2) 9790{ 9791 int kind1, kind2; 9792 void *data1, *data2; 9793 Py_ssize_t len1, len2, i; 9794 9795 kind1 = PyUnicode_KIND(str1); 9796 kind2 = PyUnicode_KIND(str2); 9797 data1 = PyUnicode_DATA(str1); 9798 data2 = PyUnicode_DATA(str2); 9799 len1 = PyUnicode_GET_LENGTH(str1); 9800 len2 = PyUnicode_GET_LENGTH(str2); 9801 9802 for (i = 0; i < len1 && i < len2; ++i) { 9803 Py_UCS4 c1, c2; 9804 c1 = PyUnicode_READ(kind1, data1, i); 9805 c2 = PyUnicode_READ(kind2, data2, i); 9806 9807 if (c1 != c2) 9808 return (c1 < c2) ? -1 : 1; 9809 } 9810 9811 return (len1 < len2) ? 
-1 : (len1 != len2); 9812} 9813 9814#endif 9815 9816int 9817PyUnicode_Compare(PyObject *left, PyObject *right) 9818{ 9819 if (PyUnicode_Check(left) && PyUnicode_Check(right)) { 9820 if (PyUnicode_READY(left) == -1 || 9821 PyUnicode_READY(right) == -1) 9822 return -1; 9823 return unicode_compare((PyUnicodeObject *)left, 9824 (PyUnicodeObject *)right); 9825 } 9826 PyErr_Format(PyExc_TypeError, 9827 "Can't compare %.100s and %.100s", 9828 left->ob_type->tp_name, 9829 right->ob_type->tp_name); 9830 return -1; 9831} 9832 9833int 9834PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str) 9835{ 9836 Py_ssize_t i; 9837 int kind; 9838 void *data; 9839 Py_UCS4 chr; 9840 9841 assert(_PyUnicode_CHECK(uni)); 9842 if (PyUnicode_READY(uni) == -1) 9843 return -1; 9844 kind = PyUnicode_KIND(uni); 9845 data = PyUnicode_DATA(uni); 9846 /* Compare Unicode string and source character set string */ 9847 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++) 9848 if (chr != str[i]) 9849 return (chr < (unsigned char)(str[i])) ? -1 : 1; 9850 /* This check keeps Python strings that end in '\0' from comparing equal 9851 to C strings identical up to that point. */ 9852 if (PyUnicode_GET_LENGTH(uni) != i || chr) 9853 return 1; /* uni is longer */ 9854 if (str[i]) 9855 return -1; /* str is longer */ 9856 return 0; 9857} 9858 9859 9860#define TEST_COND(cond) \ 9861 ((cond) ? 
Py_True : Py_False) 9862 9863PyObject * 9864PyUnicode_RichCompare(PyObject *left, PyObject *right, int op) 9865{ 9866 int result; 9867 9868 if (PyUnicode_Check(left) && PyUnicode_Check(right)) { 9869 PyObject *v; 9870 if (PyUnicode_READY(left) == -1 || 9871 PyUnicode_READY(right) == -1) 9872 return NULL; 9873 if (PyUnicode_GET_LENGTH(left) != PyUnicode_GET_LENGTH(right) || 9874 PyUnicode_KIND(left) != PyUnicode_KIND(right)) { 9875 if (op == Py_EQ) { 9876 Py_INCREF(Py_False); 9877 return Py_False; 9878 } 9879 if (op == Py_NE) { 9880 Py_INCREF(Py_True); 9881 return Py_True; 9882 } 9883 } 9884 if (left == right) 9885 result = 0; 9886 else 9887 result = unicode_compare((PyUnicodeObject *)left, 9888 (PyUnicodeObject *)right); 9889 9890 /* Convert the return value to a Boolean */ 9891 switch (op) { 9892 case Py_EQ: 9893 v = TEST_COND(result == 0); 9894 break; 9895 case Py_NE: 9896 v = TEST_COND(result != 0); 9897 break; 9898 case Py_LE: 9899 v = TEST_COND(result <= 0); 9900 break; 9901 case Py_GE: 9902 v = TEST_COND(result >= 0); 9903 break; 9904 case Py_LT: 9905 v = TEST_COND(result == -1); 9906 break; 9907 case Py_GT: 9908 v = TEST_COND(result == 1); 9909 break; 9910 default: 9911 PyErr_BadArgument(); 9912 return NULL; 9913 } 9914 Py_INCREF(v); 9915 return v; 9916 } 9917 9918 Py_RETURN_NOTIMPLEMENTED; 9919} 9920 9921int 9922PyUnicode_Contains(PyObject *container, PyObject *element) 9923{ 9924 PyObject *str, *sub; 9925 int kind1, kind2, kind; 9926 void *buf1, *buf2; 9927 Py_ssize_t len1, len2; 9928 int result; 9929 9930 /* Coerce the two arguments */ 9931 sub = PyUnicode_FromObject(element); 9932 if (!sub) { 9933 PyErr_Format(PyExc_TypeError, 9934 "'in <string>' requires string as left operand, not %s", 9935 element->ob_type->tp_name); 9936 return -1; 9937 } 9938 if (PyUnicode_READY(sub) == -1) 9939 return -1; 9940 9941 str = PyUnicode_FromObject(container); 9942 if (!str || PyUnicode_READY(str) == -1) { 9943 Py_DECREF(sub); 9944 return -1; 9945 } 9946 9947 kind1 = 
PyUnicode_KIND(str); 9948 kind2 = PyUnicode_KIND(sub); 9949 kind = kind1 > kind2 ? kind1 : kind2; 9950 buf1 = PyUnicode_DATA(str); 9951 buf2 = PyUnicode_DATA(sub); 9952 if (kind1 != kind) 9953 buf1 = _PyUnicode_AsKind((PyObject*)str, kind); 9954 if (!buf1) { 9955 Py_DECREF(sub); 9956 return -1; 9957 } 9958 if (kind2 != kind) 9959 buf2 = _PyUnicode_AsKind((PyObject*)sub, kind); 9960 if (!buf2) { 9961 Py_DECREF(sub); 9962 if (kind1 != kind) PyMem_Free(buf1); 9963 return -1; 9964 } 9965 len1 = PyUnicode_GET_LENGTH(str); 9966 len2 = PyUnicode_GET_LENGTH(sub); 9967 9968 switch(kind) { 9969 case PyUnicode_1BYTE_KIND: 9970 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1; 9971 break; 9972 case PyUnicode_2BYTE_KIND: 9973 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1; 9974 break; 9975 case PyUnicode_4BYTE_KIND: 9976 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1; 9977 break; 9978 default: 9979 result = -1; 9980 assert(0); 9981 } 9982 9983 Py_DECREF(str); 9984 Py_DECREF(sub); 9985 9986 if (kind1 != kind) 9987 PyMem_Free(buf1); 9988 if (kind2 != kind) 9989 PyMem_Free(buf2); 9990 9991 return result; 9992} 9993 9994/* Concat to string or Unicode object giving a new Unicode object. 
*/ 9995 9996PyObject * 9997PyUnicode_Concat(PyObject *left, PyObject *right) 9998{ 9999 PyObject *u = NULL, *v = NULL, *w; 10000 Py_UCS4 maxchar; 10001 10002 /* Coerce the two arguments */ 10003 u = PyUnicode_FromObject(left); 10004 if (u == NULL) 10005 goto onError; 10006 v = PyUnicode_FromObject(right); 10007 if (v == NULL) 10008 goto onError; 10009 10010 /* Shortcuts */ 10011 if (v == unicode_empty) { 10012 Py_DECREF(v); 10013 return u; 10014 } 10015 if (u == unicode_empty) { 10016 Py_DECREF(u); 10017 return v; 10018 } 10019 10020 maxchar = PyUnicode_MAX_CHAR_VALUE(u); 10021 maxchar = Py_MAX(maxchar, PyUnicode_MAX_CHAR_VALUE(v)); 10022 10023 /* Concat the two Unicode strings */ 10024 w = PyUnicode_New( 10025 PyUnicode_GET_LENGTH(u) + PyUnicode_GET_LENGTH(v), 10026 maxchar); 10027 if (w == NULL) 10028 goto onError; 10029 if (PyUnicode_CopyCharacters(w, 0, u, 0, PyUnicode_GET_LENGTH(u)) < 0) 10030 goto onError; 10031 if (PyUnicode_CopyCharacters(w, PyUnicode_GET_LENGTH(u), 10032 v, 0, 10033 PyUnicode_GET_LENGTH(v)) < 0) 10034 goto onError; 10035 Py_DECREF(u); 10036 Py_DECREF(v); 10037 return w; 10038 10039 onError: 10040 Py_XDECREF(u); 10041 Py_XDECREF(v); 10042 return NULL; 10043} 10044 10045static void 10046unicode_append_inplace(PyObject **p_left, PyObject *right) 10047{ 10048 Py_ssize_t left_len, right_len, new_len; 10049#ifdef Py_DEBUG 10050 Py_ssize_t copied; 10051#endif 10052 10053 assert(PyUnicode_IS_READY(*p_left)); 10054 assert(PyUnicode_IS_READY(right)); 10055 10056 left_len = PyUnicode_GET_LENGTH(*p_left); 10057 right_len = PyUnicode_GET_LENGTH(right); 10058 if (left_len > PY_SSIZE_T_MAX - right_len) { 10059 PyErr_SetString(PyExc_OverflowError, 10060 "strings are too large to concat"); 10061 goto error; 10062 } 10063 new_len = left_len + right_len; 10064 10065 /* Now we own the last reference to 'left', so we can resize it 10066 * in-place. 
10067 */ 10068 if (unicode_resize(p_left, new_len) != 0) { 10069 /* XXX if _PyUnicode_Resize() fails, 'left' has been 10070 * deallocated so it cannot be put back into 10071 * 'variable'. The MemoryError is raised when there 10072 * is no value in 'variable', which might (very 10073 * remotely) be a cause of incompatibilities. 10074 */ 10075 goto error; 10076 } 10077 /* copy 'right' into the newly allocated area of 'left' */ 10078#ifdef Py_DEBUG 10079 copied = PyUnicode_CopyCharacters(*p_left, left_len, 10080 right, 0, 10081 right_len); 10082 assert(0 <= copied); 10083#else 10084 PyUnicode_CopyCharacters(*p_left, left_len, right, 0, right_len); 10085#endif 10086 return; 10087 10088error: 10089 Py_DECREF(*p_left); 10090 *p_left = NULL; 10091} 10092 10093void 10094PyUnicode_Append(PyObject **p_left, PyObject *right) 10095{ 10096 PyObject *left, *res; 10097 10098 if (p_left == NULL) { 10099 if (!PyErr_Occurred()) 10100 PyErr_BadInternalCall(); 10101 return; 10102 } 10103 left = *p_left; 10104 if (right == NULL || !PyUnicode_Check(left)) { 10105 if (!PyErr_Occurred()) 10106 PyErr_BadInternalCall(); 10107 goto error; 10108 } 10109 10110 if (PyUnicode_READY(left)) 10111 goto error; 10112 if (PyUnicode_READY(right)) 10113 goto error; 10114 10115 if (PyUnicode_CheckExact(left) && left != unicode_empty 10116 && PyUnicode_CheckExact(right) && right != unicode_empty 10117 && unicode_resizable(left) 10118 && (_PyUnicode_KIND(right) <= _PyUnicode_KIND(left) 10119 || _PyUnicode_WSTR(left) != NULL)) 10120 { 10121 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires 10122 to change the structure size, but characters are stored just after 10123 the structure, and so it requires to move all charactres which is 10124 not so different than duplicating the string. 
*/ 10125 if (!(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right))) 10126 { 10127 unicode_append_inplace(p_left, right); 10128 return; 10129 } 10130 } 10131 10132 res = PyUnicode_Concat(left, right); 10133 if (res == NULL) 10134 goto error; 10135 Py_DECREF(left); 10136 *p_left = res; 10137 return; 10138 10139error: 10140 Py_DECREF(*p_left); 10141 *p_left = NULL; 10142} 10143 10144void 10145PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right) 10146{ 10147 PyUnicode_Append(pleft, right); 10148 Py_XDECREF(right); 10149} 10150 10151PyDoc_STRVAR(count__doc__, 10152 "S.count(sub[, start[, end]]) -> int\n\ 10153\n\ 10154Return the number of non-overlapping occurrences of substring sub in\n\ 10155string S[start:end]. Optional arguments start and end are\n\ 10156interpreted as in slice notation."); 10157 10158static PyObject * 10159unicode_count(PyUnicodeObject *self, PyObject *args) 10160{ 10161 PyUnicodeObject *substring; 10162 Py_ssize_t start = 0; 10163 Py_ssize_t end = PY_SSIZE_T_MAX; 10164 PyObject *result; 10165 int kind1, kind2, kind; 10166 void *buf1, *buf2; 10167 Py_ssize_t len1, len2, iresult; 10168 10169 if (!stringlib_parse_args_finds_unicode("count", args, &substring, 10170 &start, &end)) 10171 return NULL; 10172 10173 kind1 = PyUnicode_KIND(self); 10174 kind2 = PyUnicode_KIND(substring); 10175 kind = kind1 > kind2 ? 
kind1 : kind2; 10176 buf1 = PyUnicode_DATA(self); 10177 buf2 = PyUnicode_DATA(substring); 10178 if (kind1 != kind) 10179 buf1 = _PyUnicode_AsKind((PyObject*)self, kind); 10180 if (!buf1) { 10181 Py_DECREF(substring); 10182 return NULL; 10183 } 10184 if (kind2 != kind) 10185 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind); 10186 if (!buf2) { 10187 Py_DECREF(substring); 10188 if (kind1 != kind) PyMem_Free(buf1); 10189 return NULL; 10190 } 10191 len1 = PyUnicode_GET_LENGTH(self); 10192 len2 = PyUnicode_GET_LENGTH(substring); 10193 10194 ADJUST_INDICES(start, end, len1); 10195 switch(kind) { 10196 case PyUnicode_1BYTE_KIND: 10197 iresult = ucs1lib_count( 10198 ((Py_UCS1*)buf1) + start, end - start, 10199 buf2, len2, PY_SSIZE_T_MAX 10200 ); 10201 break; 10202 case PyUnicode_2BYTE_KIND: 10203 iresult = ucs2lib_count( 10204 ((Py_UCS2*)buf1) + start, end - start, 10205 buf2, len2, PY_SSIZE_T_MAX 10206 ); 10207 break; 10208 case PyUnicode_4BYTE_KIND: 10209 iresult = ucs4lib_count( 10210 ((Py_UCS4*)buf1) + start, end - start, 10211 buf2, len2, PY_SSIZE_T_MAX 10212 ); 10213 break; 10214 default: 10215 assert(0); iresult = 0; 10216 } 10217 10218 result = PyLong_FromSsize_t(iresult); 10219 10220 if (kind1 != kind) 10221 PyMem_Free(buf1); 10222 if (kind2 != kind) 10223 PyMem_Free(buf2); 10224 10225 Py_DECREF(substring); 10226 10227 return result; 10228} 10229 10230PyDoc_STRVAR(encode__doc__, 10231 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\ 10232\n\ 10233Encode S using the codec registered for encoding. Default encoding\n\ 10234is 'utf-8'. errors may be given to set a different error\n\ 10235handling scheme. Default is 'strict' meaning that encoding errors raise\n\ 10236a UnicodeEncodeError. 
Other possible values are 'ignore', 'replace' and\n\ 10237'xmlcharrefreplace' as well as any other name registered with\n\ 10238codecs.register_error that can handle UnicodeEncodeErrors."); 10239 10240static PyObject * 10241unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs) 10242{ 10243 static char *kwlist[] = {"encoding", "errors", 0}; 10244 char *encoding = NULL; 10245 char *errors = NULL; 10246 10247 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode", 10248 kwlist, &encoding, &errors)) 10249 return NULL; 10250 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors); 10251} 10252 10253PyDoc_STRVAR(expandtabs__doc__, 10254 "S.expandtabs([tabsize]) -> str\n\ 10255\n\ 10256Return a copy of S where all tab characters are expanded using spaces.\n\ 10257If tabsize is not given, a tab size of 8 characters is assumed."); 10258 10259static PyObject* 10260unicode_expandtabs(PyUnicodeObject *self, PyObject *args) 10261{ 10262 Py_ssize_t i, j, line_pos, src_len, incr; 10263 Py_UCS4 ch; 10264 PyObject *u; 10265 void *src_data, *dest_data; 10266 int tabsize = 8; 10267 int kind; 10268 int found; 10269 10270 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize)) 10271 return NULL; 10272 10273 if (PyUnicode_READY(self) == -1) 10274 return NULL; 10275 10276 /* First pass: determine size of output string */ 10277 src_len = PyUnicode_GET_LENGTH(self); 10278 i = j = line_pos = 0; 10279 kind = PyUnicode_KIND(self); 10280 src_data = PyUnicode_DATA(self); 10281 found = 0; 10282 for (; i < src_len; i++) { 10283 ch = PyUnicode_READ(kind, src_data, i); 10284 if (ch == '\t') { 10285 found = 1; 10286 if (tabsize > 0) { 10287 incr = tabsize - (line_pos % tabsize); /* cannot overflow */ 10288 if (j > PY_SSIZE_T_MAX - incr) 10289 goto overflow; 10290 line_pos += incr; 10291 j += incr; 10292 } 10293 } 10294 else { 10295 if (j > PY_SSIZE_T_MAX - 1) 10296 goto overflow; 10297 line_pos++; 10298 j++; 10299 if (ch == '\n' || ch == '\r') 10300 line_pos = 0; 
10301 } 10302 } 10303 if (!found && PyUnicode_CheckExact(self)) { 10304 Py_INCREF((PyObject *) self); 10305 return (PyObject *) self; 10306 } 10307 10308 /* Second pass: create output string and fill it */ 10309 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self)); 10310 if (!u) 10311 return NULL; 10312 dest_data = PyUnicode_DATA(u); 10313 10314 i = j = line_pos = 0; 10315 10316 for (; i < src_len; i++) { 10317 ch = PyUnicode_READ(kind, src_data, i); 10318 if (ch == '\t') { 10319 if (tabsize > 0) { 10320 incr = tabsize - (line_pos % tabsize); 10321 line_pos += incr; 10322 while (incr--) { 10323 PyUnicode_WRITE(kind, dest_data, j, ' '); 10324 j++; 10325 } 10326 } 10327 } 10328 else { 10329 line_pos++; 10330 PyUnicode_WRITE(kind, dest_data, j, ch); 10331 j++; 10332 if (ch == '\n' || ch == '\r') 10333 line_pos = 0; 10334 } 10335 } 10336 assert (j == PyUnicode_GET_LENGTH(u)); 10337#ifndef DONT_MAKE_RESULT_READY 10338 if (_PyUnicode_READY_REPLACE(&u)) { 10339 Py_DECREF(u); 10340 return NULL; 10341 } 10342#endif 10343 return (PyObject*) u; 10344 10345 overflow: 10346 PyErr_SetString(PyExc_OverflowError, "new string is too long"); 10347 return NULL; 10348} 10349 10350PyDoc_STRVAR(find__doc__, 10351 "S.find(sub[, start[, end]]) -> int\n\ 10352\n\ 10353Return the lowest index in S where substring sub is found,\n\ 10354such that sub is contained within S[start:end]. 
Optional\n\ 10355arguments start and end are interpreted as in slice notation.\n\ 10356\n\ 10357Return -1 on failure."); 10358 10359static PyObject * 10360unicode_find(PyObject *self, PyObject *args) 10361{ 10362 PyUnicodeObject *substring; 10363 Py_ssize_t start; 10364 Py_ssize_t end; 10365 Py_ssize_t result; 10366 10367 if (!stringlib_parse_args_finds_unicode("find", args, &substring, 10368 &start, &end)) 10369 return NULL; 10370 10371 if (PyUnicode_READY(self) == -1) 10372 return NULL; 10373 if (PyUnicode_READY(substring) == -1) 10374 return NULL; 10375 10376 result = any_find_slice( 10377 ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice, 10378 self, (PyObject*)substring, start, end 10379 ); 10380 10381 Py_DECREF(substring); 10382 10383 if (result == -2) 10384 return NULL; 10385 10386 return PyLong_FromSsize_t(result); 10387} 10388 10389static PyObject * 10390unicode_getitem(PyObject *self, Py_ssize_t index) 10391{ 10392 Py_UCS4 ch = PyUnicode_ReadChar(self, index); 10393 if (ch == (Py_UCS4)-1) 10394 return NULL; 10395 return PyUnicode_FromOrdinal(ch); 10396} 10397 10398/* Believe it or not, this produces the same value for ASCII strings 10399 as bytes_hash(). */ 10400static Py_hash_t 10401unicode_hash(PyUnicodeObject *self) 10402{ 10403 Py_ssize_t len; 10404 Py_uhash_t x; 10405 10406 if (_PyUnicode_HASH(self) != -1) 10407 return _PyUnicode_HASH(self); 10408 if (PyUnicode_READY(self) == -1) 10409 return -1; 10410 len = PyUnicode_GET_LENGTH(self); 10411 10412 /* The hash function as a macro, gets expanded three times below. 
*/ 10413#define HASH(P) \ 10414 x = (Py_uhash_t)*P << 7; \ 10415 while (--len >= 0) \ 10416 x = (1000003*x) ^ (Py_uhash_t)*P++; 10417 10418 switch (PyUnicode_KIND(self)) { 10419 case PyUnicode_1BYTE_KIND: { 10420 const unsigned char *c = PyUnicode_1BYTE_DATA(self); 10421 HASH(c); 10422 break; 10423 } 10424 case PyUnicode_2BYTE_KIND: { 10425 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self); 10426 HASH(s); 10427 break; 10428 } 10429 default: { 10430 Py_UCS4 *l; 10431 assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND && 10432 "Impossible switch case in unicode_hash"); 10433 l = PyUnicode_4BYTE_DATA(self); 10434 HASH(l); 10435 break; 10436 } 10437 } 10438 x ^= (Py_uhash_t)PyUnicode_GET_LENGTH(self); 10439 10440 if (x == -1) 10441 x = -2; 10442 _PyUnicode_HASH(self) = x; 10443 return x; 10444} 10445#undef HASH 10446 10447PyDoc_STRVAR(index__doc__, 10448 "S.index(sub[, start[, end]]) -> int\n\ 10449\n\ 10450Like S.find() but raise ValueError when the substring is not found."); 10451 10452static PyObject * 10453unicode_index(PyObject *self, PyObject *args) 10454{ 10455 Py_ssize_t result; 10456 PyUnicodeObject *substring; 10457 Py_ssize_t start; 10458 Py_ssize_t end; 10459 10460 if (!stringlib_parse_args_finds_unicode("index", args, &substring, 10461 &start, &end)) 10462 return NULL; 10463 10464 if (PyUnicode_READY(self) == -1) 10465 return NULL; 10466 if (PyUnicode_READY(substring) == -1) 10467 return NULL; 10468 10469 result = any_find_slice( 10470 ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice, 10471 self, (PyObject*)substring, start, end 10472 ); 10473 10474 Py_DECREF(substring); 10475 10476 if (result == -2) 10477 return NULL; 10478 10479 if (result < 0) { 10480 PyErr_SetString(PyExc_ValueError, "substring not found"); 10481 return NULL; 10482 } 10483 10484 return PyLong_FromSsize_t(result); 10485} 10486 10487PyDoc_STRVAR(islower__doc__, 10488 "S.islower() -> bool\n\ 10489\n\ 10490Return True if all cased characters in S are lowercase and there is\n\ 
10491at least one cased character in S, False otherwise."); 10492 10493static PyObject* 10494unicode_islower(PyUnicodeObject *self) 10495{ 10496 Py_ssize_t i, length; 10497 int kind; 10498 void *data; 10499 int cased; 10500 10501 if (PyUnicode_READY(self) == -1) 10502 return NULL; 10503 length = PyUnicode_GET_LENGTH(self); 10504 kind = PyUnicode_KIND(self); 10505 data = PyUnicode_DATA(self); 10506 10507 /* Shortcut for single character strings */ 10508 if (length == 1) 10509 return PyBool_FromLong( 10510 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0))); 10511 10512 /* Special case for empty strings */ 10513 if (length == 0) 10514 return PyBool_FromLong(0); 10515 10516 cased = 0; 10517 for (i = 0; i < length; i++) { 10518 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); 10519 10520 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) 10521 return PyBool_FromLong(0); 10522 else if (!cased && Py_UNICODE_ISLOWER(ch)) 10523 cased = 1; 10524 } 10525 return PyBool_FromLong(cased); 10526} 10527 10528PyDoc_STRVAR(isupper__doc__, 10529 "S.isupper() -> bool\n\ 10530\n\ 10531Return True if all cased characters in S are uppercase and there is\n\ 10532at least one cased character in S, False otherwise."); 10533 10534static PyObject* 10535unicode_isupper(PyUnicodeObject *self) 10536{ 10537 Py_ssize_t i, length; 10538 int kind; 10539 void *data; 10540 int cased; 10541 10542 if (PyUnicode_READY(self) == -1) 10543 return NULL; 10544 length = PyUnicode_GET_LENGTH(self); 10545 kind = PyUnicode_KIND(self); 10546 data = PyUnicode_DATA(self); 10547 10548 /* Shortcut for single character strings */ 10549 if (length == 1) 10550 return PyBool_FromLong( 10551 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0); 10552 10553 /* Special case for empty strings */ 10554 if (length == 0) 10555 return PyBool_FromLong(0); 10556 10557 cased = 0; 10558 for (i = 0; i < length; i++) { 10559 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); 10560 10561 if (Py_UNICODE_ISLOWER(ch) || 
Py_UNICODE_ISTITLE(ch)) 10562 return PyBool_FromLong(0); 10563 else if (!cased && Py_UNICODE_ISUPPER(ch)) 10564 cased = 1; 10565 } 10566 return PyBool_FromLong(cased); 10567} 10568 10569PyDoc_STRVAR(istitle__doc__, 10570 "S.istitle() -> bool\n\ 10571\n\ 10572Return True if S is a titlecased string and there is at least one\n\ 10573character in S, i.e. upper- and titlecase characters may only\n\ 10574follow uncased characters and lowercase characters only cased ones.\n\ 10575Return False otherwise."); 10576 10577static PyObject* 10578unicode_istitle(PyUnicodeObject *self) 10579{ 10580 Py_ssize_t i, length; 10581 int kind; 10582 void *data; 10583 int cased, previous_is_cased; 10584 10585 if (PyUnicode_READY(self) == -1) 10586 return NULL; 10587 length = PyUnicode_GET_LENGTH(self); 10588 kind = PyUnicode_KIND(self); 10589 data = PyUnicode_DATA(self); 10590 10591 /* Shortcut for single character strings */ 10592 if (length == 1) { 10593 Py_UCS4 ch = PyUnicode_READ(kind, data, 0); 10594 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) || 10595 (Py_UNICODE_ISUPPER(ch) != 0)); 10596 } 10597 10598 /* Special case for empty strings */ 10599 if (length == 0) 10600 return PyBool_FromLong(0); 10601 10602 cased = 0; 10603 previous_is_cased = 0; 10604 for (i = 0; i < length; i++) { 10605 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); 10606 10607 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) { 10608 if (previous_is_cased) 10609 return PyBool_FromLong(0); 10610 previous_is_cased = 1; 10611 cased = 1; 10612 } 10613 else if (Py_UNICODE_ISLOWER(ch)) { 10614 if (!previous_is_cased) 10615 return PyBool_FromLong(0); 10616 previous_is_cased = 1; 10617 cased = 1; 10618 } 10619 else 10620 previous_is_cased = 0; 10621 } 10622 return PyBool_FromLong(cased); 10623} 10624 10625PyDoc_STRVAR(isspace__doc__, 10626 "S.isspace() -> bool\n\ 10627\n\ 10628Return True if all characters in S are whitespace\n\ 10629and there is at least one character in S, False otherwise."); 10630 
10631static PyObject* 10632unicode_isspace(PyUnicodeObject *self) 10633{ 10634 Py_ssize_t i, length; 10635 int kind; 10636 void *data; 10637 10638 if (PyUnicode_READY(self) == -1) 10639 return NULL; 10640 length = PyUnicode_GET_LENGTH(self); 10641 kind = PyUnicode_KIND(self); 10642 data = PyUnicode_DATA(self); 10643 10644 /* Shortcut for single character strings */ 10645 if (length == 1) 10646 return PyBool_FromLong( 10647 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0))); 10648 10649 /* Special case for empty strings */ 10650 if (length == 0) 10651 return PyBool_FromLong(0); 10652 10653 for (i = 0; i < length; i++) { 10654 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); 10655 if (!Py_UNICODE_ISSPACE(ch)) 10656 return PyBool_FromLong(0); 10657 } 10658 return PyBool_FromLong(1); 10659} 10660 10661PyDoc_STRVAR(isalpha__doc__, 10662 "S.isalpha() -> bool\n\ 10663\n\ 10664Return True if all characters in S are alphabetic\n\ 10665and there is at least one character in S, False otherwise."); 10666 10667static PyObject* 10668unicode_isalpha(PyUnicodeObject *self) 10669{ 10670 Py_ssize_t i, length; 10671 int kind; 10672 void *data; 10673 10674 if (PyUnicode_READY(self) == -1) 10675 return NULL; 10676 length = PyUnicode_GET_LENGTH(self); 10677 kind = PyUnicode_KIND(self); 10678 data = PyUnicode_DATA(self); 10679 10680 /* Shortcut for single character strings */ 10681 if (length == 1) 10682 return PyBool_FromLong( 10683 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0))); 10684 10685 /* Special case for empty strings */ 10686 if (length == 0) 10687 return PyBool_FromLong(0); 10688 10689 for (i = 0; i < length; i++) { 10690 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i))) 10691 return PyBool_FromLong(0); 10692 } 10693 return PyBool_FromLong(1); 10694} 10695 10696PyDoc_STRVAR(isalnum__doc__, 10697 "S.isalnum() -> bool\n\ 10698\n\ 10699Return True if all characters in S are alphanumeric\n\ 10700and there is at least one character in S, False otherwise."); 10701 
10702static PyObject* 10703unicode_isalnum(PyUnicodeObject *self) 10704{ 10705 int kind; 10706 void *data; 10707 Py_ssize_t len, i; 10708 10709 if (PyUnicode_READY(self) == -1) 10710 return NULL; 10711 10712 kind = PyUnicode_KIND(self); 10713 data = PyUnicode_DATA(self); 10714 len = PyUnicode_GET_LENGTH(self); 10715 10716 /* Shortcut for single character strings */ 10717 if (len == 1) { 10718 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0); 10719 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch)); 10720 } 10721 10722 /* Special case for empty strings */ 10723 if (len == 0) 10724 return PyBool_FromLong(0); 10725 10726 for (i = 0; i < len; i++) { 10727 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); 10728 if (!Py_UNICODE_ISALNUM(ch)) 10729 return PyBool_FromLong(0); 10730 } 10731 return PyBool_FromLong(1); 10732} 10733 10734PyDoc_STRVAR(isdecimal__doc__, 10735 "S.isdecimal() -> bool\n\ 10736\n\ 10737Return True if there are only decimal characters in S,\n\ 10738False otherwise."); 10739 10740static PyObject* 10741unicode_isdecimal(PyUnicodeObject *self) 10742{ 10743 Py_ssize_t i, length; 10744 int kind; 10745 void *data; 10746 10747 if (PyUnicode_READY(self) == -1) 10748 return NULL; 10749 length = PyUnicode_GET_LENGTH(self); 10750 kind = PyUnicode_KIND(self); 10751 data = PyUnicode_DATA(self); 10752 10753 /* Shortcut for single character strings */ 10754 if (length == 1) 10755 return PyBool_FromLong( 10756 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0))); 10757 10758 /* Special case for empty strings */ 10759 if (length == 0) 10760 return PyBool_FromLong(0); 10761 10762 for (i = 0; i < length; i++) { 10763 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i))) 10764 return PyBool_FromLong(0); 10765 } 10766 return PyBool_FromLong(1); 10767} 10768 10769PyDoc_STRVAR(isdigit__doc__, 10770 "S.isdigit() -> bool\n\ 10771\n\ 10772Return True if all characters in S are digits\n\ 10773and there is at least one character in S, False otherwise."); 10774 10775static 
PyObject* 10776unicode_isdigit(PyUnicodeObject *self) 10777{ 10778 Py_ssize_t i, length; 10779 int kind; 10780 void *data; 10781 10782 if (PyUnicode_READY(self) == -1) 10783 return NULL; 10784 length = PyUnicode_GET_LENGTH(self); 10785 kind = PyUnicode_KIND(self); 10786 data = PyUnicode_DATA(self); 10787 10788 /* Shortcut for single character strings */ 10789 if (length == 1) { 10790 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0); 10791 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch)); 10792 } 10793 10794 /* Special case for empty strings */ 10795 if (length == 0) 10796 return PyBool_FromLong(0); 10797 10798 for (i = 0; i < length; i++) { 10799 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i))) 10800 return PyBool_FromLong(0); 10801 } 10802 return PyBool_FromLong(1); 10803} 10804 10805PyDoc_STRVAR(isnumeric__doc__, 10806 "S.isnumeric() -> bool\n\ 10807\n\ 10808Return True if there are only numeric characters in S,\n\ 10809False otherwise."); 10810 10811static PyObject* 10812unicode_isnumeric(PyUnicodeObject *self) 10813{ 10814 Py_ssize_t i, length; 10815 int kind; 10816 void *data; 10817 10818 if (PyUnicode_READY(self) == -1) 10819 return NULL; 10820 length = PyUnicode_GET_LENGTH(self); 10821 kind = PyUnicode_KIND(self); 10822 data = PyUnicode_DATA(self); 10823 10824 /* Shortcut for single character strings */ 10825 if (length == 1) 10826 return PyBool_FromLong( 10827 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0))); 10828 10829 /* Special case for empty strings */ 10830 if (length == 0) 10831 return PyBool_FromLong(0); 10832 10833 for (i = 0; i < length; i++) { 10834 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i))) 10835 return PyBool_FromLong(0); 10836 } 10837 return PyBool_FromLong(1); 10838} 10839 10840int 10841PyUnicode_IsIdentifier(PyObject *self) 10842{ 10843 int kind; 10844 void *data; 10845 Py_ssize_t i; 10846 Py_UCS4 first; 10847 10848 if (PyUnicode_READY(self) == -1) { 10849 Py_FatalError("identifier not ready"); 10850 return 0; 10851 } 
10852 10853 /* Special case for empty strings */ 10854 if (PyUnicode_GET_LENGTH(self) == 0) 10855 return 0; 10856 kind = PyUnicode_KIND(self); 10857 data = PyUnicode_DATA(self); 10858 10859 /* PEP 3131 says that the first character must be in 10860 XID_Start and subsequent characters in XID_Continue, 10861 and for the ASCII range, the 2.x rules apply (i.e 10862 start with letters and underscore, continue with 10863 letters, digits, underscore). However, given the current 10864 definition of XID_Start and XID_Continue, it is sufficient 10865 to check just for these, except that _ must be allowed 10866 as starting an identifier. */ 10867 first = PyUnicode_READ(kind, data, 0); 10868 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */) 10869 return 0; 10870 10871 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++) 10872 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i))) 10873 return 0; 10874 return 1; 10875} 10876 10877PyDoc_STRVAR(isidentifier__doc__, 10878 "S.isidentifier() -> bool\n\ 10879\n\ 10880Return True if S is a valid identifier according\n\ 10881to the language definition."); 10882 10883static PyObject* 10884unicode_isidentifier(PyObject *self) 10885{ 10886 return PyBool_FromLong(PyUnicode_IsIdentifier(self)); 10887} 10888 10889PyDoc_STRVAR(isprintable__doc__, 10890 "S.isprintable() -> bool\n\ 10891\n\ 10892Return True if all characters in S are considered\n\ 10893printable in repr() or S is empty, False otherwise."); 10894 10895static PyObject* 10896unicode_isprintable(PyObject *self) 10897{ 10898 Py_ssize_t i, length; 10899 int kind; 10900 void *data; 10901 10902 if (PyUnicode_READY(self) == -1) 10903 return NULL; 10904 length = PyUnicode_GET_LENGTH(self); 10905 kind = PyUnicode_KIND(self); 10906 data = PyUnicode_DATA(self); 10907 10908 /* Shortcut for single character strings */ 10909 if (length == 1) 10910 return PyBool_FromLong( 10911 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0))); 10912 10913 for (i = 0; i < length; i++) { 
10914 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) { 10915 Py_RETURN_FALSE; 10916 } 10917 } 10918 Py_RETURN_TRUE; 10919} 10920 10921PyDoc_STRVAR(join__doc__, 10922 "S.join(iterable) -> str\n\ 10923\n\ 10924Return a string which is the concatenation of the strings in the\n\ 10925iterable. The separator between elements is S."); 10926 10927static PyObject* 10928unicode_join(PyObject *self, PyObject *data) 10929{ 10930 return PyUnicode_Join(self, data); 10931} 10932 10933static Py_ssize_t 10934unicode_length(PyUnicodeObject *self) 10935{ 10936 if (PyUnicode_READY(self) == -1) 10937 return -1; 10938 return PyUnicode_GET_LENGTH(self); 10939} 10940 10941PyDoc_STRVAR(ljust__doc__, 10942 "S.ljust(width[, fillchar]) -> str\n\ 10943\n\ 10944Return S left-justified in a Unicode string of length width. Padding is\n\ 10945done using the specified fill character (default is a space)."); 10946 10947static PyObject * 10948unicode_ljust(PyUnicodeObject *self, PyObject *args) 10949{ 10950 Py_ssize_t width; 10951 Py_UCS4 fillchar = ' '; 10952 10953 if (PyUnicode_READY(self) == -1) 10954 return NULL; 10955 10956 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar)) 10957 return NULL; 10958 10959 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) { 10960 Py_INCREF(self); 10961 return (PyObject*) self; 10962 } 10963 10964 return (PyObject*) pad(self, 0, width - _PyUnicode_LENGTH(self), fillchar); 10965} 10966 10967PyDoc_STRVAR(lower__doc__, 10968 "S.lower() -> str\n\ 10969\n\ 10970Return a copy of the string S converted to lowercase."); 10971 10972static PyObject* 10973unicode_lower(PyUnicodeObject *self) 10974{ 10975 return fixup(self, fixlower); 10976} 10977 10978#define LEFTSTRIP 0 10979#define RIGHTSTRIP 1 10980#define BOTHSTRIP 2 10981 10982/* Arrays indexed by above */ 10983static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"}; 10984 10985#define STRIPNAME(i) (stripformat[i]+3) 10986 10987/* externally visible 
for str.strip(unicode) */ 10988PyObject * 10989_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj) 10990{ 10991 void *data; 10992 int kind; 10993 Py_ssize_t i, j, len; 10994 BLOOM_MASK sepmask; 10995 10996 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1) 10997 return NULL; 10998 10999 kind = PyUnicode_KIND(self); 11000 data = PyUnicode_DATA(self); 11001 len = PyUnicode_GET_LENGTH(self); 11002 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj), 11003 PyUnicode_DATA(sepobj), 11004 PyUnicode_GET_LENGTH(sepobj)); 11005 11006 i = 0; 11007 if (striptype != RIGHTSTRIP) { 11008 while (i < len && 11009 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) { 11010 i++; 11011 } 11012 } 11013 11014 j = len; 11015 if (striptype != LEFTSTRIP) { 11016 do { 11017 j--; 11018 } while (j >= i && 11019 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj)); 11020 j++; 11021 } 11022 11023 return PyUnicode_Substring((PyObject*)self, i, j); 11024} 11025 11026PyObject* 11027PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end) 11028{ 11029 unsigned char *data; 11030 int kind; 11031 Py_ssize_t length; 11032 11033 if (PyUnicode_READY(self) == -1) 11034 return NULL; 11035 11036 end = Py_MIN(end, PyUnicode_GET_LENGTH(self)); 11037 11038 if (start == 0 && end == PyUnicode_GET_LENGTH(self)) 11039 { 11040 if (PyUnicode_CheckExact(self)) { 11041 Py_INCREF(self); 11042 return self; 11043 } 11044 else 11045 return PyUnicode_Copy(self); 11046 } 11047 11048 length = end - start; 11049 if (length == 1) 11050 return unicode_getitem(self, start); 11051 11052 if (start < 0 || end < 0) { 11053 PyErr_SetString(PyExc_IndexError, "string index out of range"); 11054 return NULL; 11055 } 11056 11057 if (PyUnicode_IS_ASCII(self)) { 11058 kind = PyUnicode_KIND(self); 11059 data = PyUnicode_1BYTE_DATA(self); 11060 return unicode_fromascii(data + start, length); 11061 } 11062 else { 11063 kind = PyUnicode_KIND(self); 11064 data = 
PyUnicode_1BYTE_DATA(self); 11065 return PyUnicode_FromKindAndData(kind, 11066 data + PyUnicode_KIND_SIZE(kind, start), 11067 length); 11068 } 11069} 11070 11071static PyObject * 11072do_strip(PyUnicodeObject *self, int striptype) 11073{ 11074 int kind; 11075 void *data; 11076 Py_ssize_t len, i, j; 11077 11078 if (PyUnicode_READY(self) == -1) 11079 return NULL; 11080 11081 kind = PyUnicode_KIND(self); 11082 data = PyUnicode_DATA(self); 11083 len = PyUnicode_GET_LENGTH(self); 11084 11085 i = 0; 11086 if (striptype != RIGHTSTRIP) { 11087 while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) { 11088 i++; 11089 } 11090 } 11091 11092 j = len; 11093 if (striptype != LEFTSTRIP) { 11094 do { 11095 j--; 11096 } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j))); 11097 j++; 11098 } 11099 11100 return PyUnicode_Substring((PyObject*)self, i, j); 11101} 11102 11103 11104static PyObject * 11105do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args) 11106{ 11107 PyObject *sep = NULL; 11108 11109 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep)) 11110 return NULL; 11111 11112 if (sep != NULL && sep != Py_None) { 11113 if (PyUnicode_Check(sep)) 11114 return _PyUnicode_XStrip(self, striptype, sep); 11115 else { 11116 PyErr_Format(PyExc_TypeError, 11117 "%s arg must be None or str", 11118 STRIPNAME(striptype)); 11119 return NULL; 11120 } 11121 } 11122 11123 return do_strip(self, striptype); 11124} 11125 11126 11127PyDoc_STRVAR(strip__doc__, 11128 "S.strip([chars]) -> str\n\ 11129\n\ 11130Return a copy of the string S with leading and trailing\n\ 11131whitespace removed.\n\ 11132If chars is given and not None, remove characters in chars instead."); 11133 11134static PyObject * 11135unicode_strip(PyUnicodeObject *self, PyObject *args) 11136{ 11137 if (PyTuple_GET_SIZE(args) == 0) 11138 return do_strip(self, BOTHSTRIP); /* Common case */ 11139 else 11140 return do_argstrip(self, BOTHSTRIP, args); 11141} 11142 11143 
11144PyDoc_STRVAR(lstrip__doc__, 11145 "S.lstrip([chars]) -> str\n\ 11146\n\ 11147Return a copy of the string S with leading whitespace removed.\n\ 11148If chars is given and not None, remove characters in chars instead."); 11149 11150static PyObject * 11151unicode_lstrip(PyUnicodeObject *self, PyObject *args) 11152{ 11153 if (PyTuple_GET_SIZE(args) == 0) 11154 return do_strip(self, LEFTSTRIP); /* Common case */ 11155 else 11156 return do_argstrip(self, LEFTSTRIP, args); 11157} 11158 11159 11160PyDoc_STRVAR(rstrip__doc__, 11161 "S.rstrip([chars]) -> str\n\ 11162\n\ 11163Return a copy of the string S with trailing whitespace removed.\n\ 11164If chars is given and not None, remove characters in chars instead."); 11165 11166static PyObject * 11167unicode_rstrip(PyUnicodeObject *self, PyObject *args) 11168{ 11169 if (PyTuple_GET_SIZE(args) == 0) 11170 return do_strip(self, RIGHTSTRIP); /* Common case */ 11171 else 11172 return do_argstrip(self, RIGHTSTRIP, args); 11173} 11174 11175 11176static PyObject* 11177unicode_repeat(PyUnicodeObject *str, Py_ssize_t len) 11178{ 11179 PyUnicodeObject *u; 11180 Py_ssize_t nchars, n; 11181 11182 if (len < 1) { 11183 Py_INCREF(unicode_empty); 11184 return unicode_empty; 11185 } 11186 11187 if (len == 1 && PyUnicode_CheckExact(str)) { 11188 /* no repeat, return original string */ 11189 Py_INCREF(str); 11190 return (PyObject*) str; 11191 } 11192 11193 if (PyUnicode_READY(str) == -1) 11194 return NULL; 11195 11196 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) { 11197 PyErr_SetString(PyExc_OverflowError, 11198 "repeated string is too long"); 11199 return NULL; 11200 } 11201 nchars = len * PyUnicode_GET_LENGTH(str); 11202 11203 u = (PyUnicodeObject *)PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str)); 11204 if (!u) 11205 return NULL; 11206 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str)); 11207 11208 if (PyUnicode_GET_LENGTH(str) == 1) { 11209 const int kind = PyUnicode_KIND(str); 11210 const Py_UCS4 fill_char = 
PyUnicode_READ(kind, PyUnicode_DATA(str), 0); 11211 void *to = PyUnicode_DATA(u); 11212 if (kind == PyUnicode_1BYTE_KIND) 11213 memset(to, (unsigned char)fill_char, len); 11214 else { 11215 for (n = 0; n < len; ++n) 11216 PyUnicode_WRITE(kind, to, n, fill_char); 11217 } 11218 } 11219 else { 11220 /* number of characters copied this far */ 11221 Py_ssize_t done = PyUnicode_GET_LENGTH(str); 11222 const Py_ssize_t char_size = PyUnicode_CHARACTER_SIZE(str); 11223 char *to = (char *) PyUnicode_DATA(u); 11224 Py_MEMCPY(to, PyUnicode_DATA(str), 11225 PyUnicode_GET_LENGTH(str) * char_size); 11226 while (done < nchars) { 11227 n = (done <= nchars-done) ? done : nchars-done; 11228 Py_MEMCPY(to + (done * char_size), to, n * char_size); 11229 done += n; 11230 } 11231 } 11232 11233 return (PyObject*) u; 11234} 11235 11236PyObject * 11237PyUnicode_Replace(PyObject *obj, 11238 PyObject *subobj, 11239 PyObject *replobj, 11240 Py_ssize_t maxcount) 11241{ 11242 PyObject *self; 11243 PyObject *str1; 11244 PyObject *str2; 11245 PyObject *result; 11246 11247 self = PyUnicode_FromObject(obj); 11248 if (self == NULL || PyUnicode_READY(self) == -1) 11249 return NULL; 11250 str1 = PyUnicode_FromObject(subobj); 11251 if (str1 == NULL || PyUnicode_READY(str1) == -1) { 11252 Py_DECREF(self); 11253 return NULL; 11254 } 11255 str2 = PyUnicode_FromObject(replobj); 11256 if (str2 == NULL || PyUnicode_READY(str2)) { 11257 Py_DECREF(self); 11258 Py_DECREF(str1); 11259 return NULL; 11260 } 11261 result = replace(self, str1, str2, maxcount); 11262 Py_DECREF(self); 11263 Py_DECREF(str1); 11264 Py_DECREF(str2); 11265 return result; 11266} 11267 11268PyDoc_STRVAR(replace__doc__, 11269 "S.replace(old, new[, count]) -> str\n\ 11270\n\ 11271Return a copy of S with all occurrences of substring\n\ 11272old replaced by new. 
If the optional argument count is\n\ 11273given, only the first count occurrences are replaced."); 11274 11275static PyObject* 11276unicode_replace(PyObject *self, PyObject *args) 11277{ 11278 PyObject *str1; 11279 PyObject *str2; 11280 Py_ssize_t maxcount = -1; 11281 PyObject *result; 11282 11283 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount)) 11284 return NULL; 11285 if (!PyUnicode_READY(self) == -1) 11286 return NULL; 11287 str1 = PyUnicode_FromObject(str1); 11288 if (str1 == NULL || PyUnicode_READY(str1) == -1) 11289 return NULL; 11290 str2 = PyUnicode_FromObject(str2); 11291 if (str2 == NULL || PyUnicode_READY(str2) == -1) { 11292 Py_DECREF(str1); 11293 return NULL; 11294 } 11295 11296 result = replace(self, str1, str2, maxcount); 11297 11298 Py_DECREF(str1); 11299 Py_DECREF(str2); 11300 return result; 11301} 11302 11303static PyObject * 11304unicode_repr(PyObject *unicode) 11305{ 11306 PyObject *repr; 11307 Py_ssize_t isize; 11308 Py_ssize_t osize, squote, dquote, i, o; 11309 Py_UCS4 max, quote; 11310 int ikind, okind; 11311 void *idata, *odata; 11312 11313 if (PyUnicode_READY(unicode) == -1) 11314 return NULL; 11315 11316 isize = PyUnicode_GET_LENGTH(unicode); 11317 idata = PyUnicode_DATA(unicode); 11318 11319 /* Compute length of output, quote characters, and 11320 maximum character */ 11321 osize = 2; /* quotes */ 11322 max = 127; 11323 squote = dquote = 0; 11324 ikind = PyUnicode_KIND(unicode); 11325 for (i = 0; i < isize; i++) { 11326 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i); 11327 switch (ch) { 11328 case '\'': squote++; osize++; break; 11329 case '"': dquote++; osize++; break; 11330 case '\\': case '\t': case '\r': case '\n': 11331 osize += 2; break; 11332 default: 11333 /* Fast-path ASCII */ 11334 if (ch < ' ' || ch == 0x7f) 11335 osize += 4; /* \xHH */ 11336 else if (ch < 0x7f) 11337 osize++; 11338 else if (Py_UNICODE_ISPRINTABLE(ch)) { 11339 osize++; 11340 max = ch > max ? 
ch : max; 11341 } 11342 else if (ch < 0x100) 11343 osize += 4; /* \xHH */ 11344 else if (ch < 0x10000) 11345 osize += 6; /* \uHHHH */ 11346 else 11347 osize += 10; /* \uHHHHHHHH */ 11348 } 11349 } 11350 11351 quote = '\''; 11352 if (squote) { 11353 if (dquote) 11354 /* Both squote and dquote present. Use squote, 11355 and escape them */ 11356 osize += squote; 11357 else 11358 quote = '"'; 11359 } 11360 11361 repr = PyUnicode_New(osize, max); 11362 if (repr == NULL) 11363 return NULL; 11364 okind = PyUnicode_KIND(repr); 11365 odata = PyUnicode_DATA(repr); 11366 11367 PyUnicode_WRITE(okind, odata, 0, quote); 11368 PyUnicode_WRITE(okind, odata, osize-1, quote); 11369 11370 for (i = 0, o = 1; i < isize; i++) { 11371 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i); 11372 11373 /* Escape quotes and backslashes */ 11374 if ((ch == quote) || (ch == '\\')) { 11375 PyUnicode_WRITE(okind, odata, o++, '\\'); 11376 PyUnicode_WRITE(okind, odata, o++, ch); 11377 continue; 11378 } 11379 11380 /* Map special whitespace to '\t', \n', '\r' */ 11381 if (ch == '\t') { 11382 PyUnicode_WRITE(okind, odata, o++, '\\'); 11383 PyUnicode_WRITE(okind, odata, o++, 't'); 11384 } 11385 else if (ch == '\n') { 11386 PyUnicode_WRITE(okind, odata, o++, '\\'); 11387 PyUnicode_WRITE(okind, odata, o++, 'n'); 11388 } 11389 else if (ch == '\r') { 11390 PyUnicode_WRITE(okind, odata, o++, '\\'); 11391 PyUnicode_WRITE(okind, odata, o++, 'r'); 11392 } 11393 11394 /* Map non-printable US ASCII to '\xhh' */ 11395 else if (ch < ' ' || ch == 0x7F) { 11396 PyUnicode_WRITE(okind, odata, o++, '\\'); 11397 PyUnicode_WRITE(okind, odata, o++, 'x'); 11398 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0x000F]); 11399 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0x000F]); 11400 } 11401 11402 /* Copy ASCII characters as-is */ 11403 else if (ch < 0x7F) { 11404 PyUnicode_WRITE(okind, odata, o++, ch); 11405 } 11406 11407 /* Non-ASCII characters */ 11408 else { 11409 /* Map Unicode whitespace and control 
characters 11410 (categories Z* and C* except ASCII space) 11411 */ 11412 if (!Py_UNICODE_ISPRINTABLE(ch)) { 11413 /* Map 8-bit characters to '\xhh' */ 11414 if (ch <= 0xff) { 11415 PyUnicode_WRITE(okind, odata, o++, '\\'); 11416 PyUnicode_WRITE(okind, odata, o++, 'x'); 11417 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0x000F]); 11418 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0x000F]); 11419 } 11420 /* Map 21-bit characters to '\U00xxxxxx' */ 11421 else if (ch >= 0x10000) { 11422 PyUnicode_WRITE(okind, odata, o++, '\\'); 11423 PyUnicode_WRITE(okind, odata, o++, 'U'); 11424 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 28) & 0xF]); 11425 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 24) & 0xF]); 11426 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 20) & 0xF]); 11427 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 16) & 0xF]); 11428 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 12) & 0xF]); 11429 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 8) & 0xF]); 11430 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0xF]); 11431 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0xF]); 11432 } 11433 /* Map 16-bit characters to '\uxxxx' */ 11434 else { 11435 PyUnicode_WRITE(okind, odata, o++, '\\'); 11436 PyUnicode_WRITE(okind, odata, o++, 'u'); 11437 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 12) & 0xF]); 11438 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 8) & 0xF]); 11439 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0xF]); 11440 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0xF]); 11441 } 11442 } 11443 /* Copy characters as-is */ 11444 else { 11445 PyUnicode_WRITE(okind, odata, o++, ch); 11446 } 11447 } 11448 } 11449 /* Closing quote already added at the beginning */ 11450 return repr; 11451} 11452 11453PyDoc_STRVAR(rfind__doc__, 11454 "S.rfind(sub[, start[, end]]) -> int\n\ 11455\n\ 11456Return the highest index in S where substring sub is found,\n\ 11457such that sub is contained 
within S[start:end]. Optional\n\ 11458arguments start and end are interpreted as in slice notation.\n\ 11459\n\ 11460Return -1 on failure."); 11461 11462static PyObject * 11463unicode_rfind(PyObject *self, PyObject *args) 11464{ 11465 PyUnicodeObject *substring; 11466 Py_ssize_t start; 11467 Py_ssize_t end; 11468 Py_ssize_t result; 11469 11470 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring, 11471 &start, &end)) 11472 return NULL; 11473 11474 if (PyUnicode_READY(self) == -1) 11475 return NULL; 11476 if (PyUnicode_READY(substring) == -1) 11477 return NULL; 11478 11479 result = any_find_slice( 11480 ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice, 11481 self, (PyObject*)substring, start, end 11482 ); 11483 11484 Py_DECREF(substring); 11485 11486 if (result == -2) 11487 return NULL; 11488 11489 return PyLong_FromSsize_t(result); 11490} 11491 11492PyDoc_STRVAR(rindex__doc__, 11493 "S.rindex(sub[, start[, end]]) -> int\n\ 11494\n\ 11495Like S.rfind() but raise ValueError when the substring is not found."); 11496 11497static PyObject * 11498unicode_rindex(PyObject *self, PyObject *args) 11499{ 11500 PyUnicodeObject *substring; 11501 Py_ssize_t start; 11502 Py_ssize_t end; 11503 Py_ssize_t result; 11504 11505 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring, 11506 &start, &end)) 11507 return NULL; 11508 11509 if (PyUnicode_READY(self) == -1) 11510 return NULL; 11511 if (PyUnicode_READY(substring) == -1) 11512 return NULL; 11513 11514 result = any_find_slice( 11515 ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice, 11516 self, (PyObject*)substring, start, end 11517 ); 11518 11519 Py_DECREF(substring); 11520 11521 if (result == -2) 11522 return NULL; 11523 11524 if (result < 0) { 11525 PyErr_SetString(PyExc_ValueError, "substring not found"); 11526 return NULL; 11527 } 11528 11529 return PyLong_FromSsize_t(result); 11530} 11531 11532PyDoc_STRVAR(rjust__doc__, 11533 "S.rjust(width[, fillchar]) -> str\n\ 11534\n\ 
11535Return S right-justified in a string of length width. Padding is\n\ 11536done using the specified fill character (default is a space)."); 11537 11538static PyObject * 11539unicode_rjust(PyUnicodeObject *self, PyObject *args) 11540{ 11541 Py_ssize_t width; 11542 Py_UCS4 fillchar = ' '; 11543 11544 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar)) 11545 return NULL; 11546 11547 if (PyUnicode_READY(self) == -1) 11548 return NULL; 11549 11550 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) { 11551 Py_INCREF(self); 11552 return (PyObject*) self; 11553 } 11554 11555 return (PyObject*) pad(self, width - _PyUnicode_LENGTH(self), 0, fillchar); 11556} 11557 11558PyObject * 11559PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit) 11560{ 11561 PyObject *result; 11562 11563 s = PyUnicode_FromObject(s); 11564 if (s == NULL) 11565 return NULL; 11566 if (sep != NULL) { 11567 sep = PyUnicode_FromObject(sep); 11568 if (sep == NULL) { 11569 Py_DECREF(s); 11570 return NULL; 11571 } 11572 } 11573 11574 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit); 11575 11576 Py_DECREF(s); 11577 Py_XDECREF(sep); 11578 return result; 11579} 11580 11581PyDoc_STRVAR(split__doc__, 11582 "S.split([sep[, maxsplit]]) -> list of strings\n\ 11583\n\ 11584Return a list of the words in S, using sep as the\n\ 11585delimiter string. If maxsplit is given, at most maxsplit\n\ 11586splits are done. 
If sep is not specified or is None, any\n\ 11587whitespace string is a separator and empty strings are\n\ 11588removed from the result."); 11589 11590static PyObject* 11591unicode_split(PyUnicodeObject *self, PyObject *args) 11592{ 11593 PyObject *substring = Py_None; 11594 Py_ssize_t maxcount = -1; 11595 11596 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount)) 11597 return NULL; 11598 11599 if (substring == Py_None) 11600 return split(self, NULL, maxcount); 11601 else if (PyUnicode_Check(substring)) 11602 return split(self, (PyUnicodeObject *)substring, maxcount); 11603 else 11604 return PyUnicode_Split((PyObject *)self, substring, maxcount); 11605} 11606 11607PyObject * 11608PyUnicode_Partition(PyObject *str_in, PyObject *sep_in) 11609{ 11610 PyObject* str_obj; 11611 PyObject* sep_obj; 11612 PyObject* out; 11613 int kind1, kind2, kind; 11614 void *buf1 = NULL, *buf2 = NULL; 11615 Py_ssize_t len1, len2; 11616 11617 str_obj = PyUnicode_FromObject(str_in); 11618 if (!str_obj || PyUnicode_READY(str_obj) == -1) 11619 return NULL; 11620 sep_obj = PyUnicode_FromObject(sep_in); 11621 if (!sep_obj || PyUnicode_READY(sep_obj) == -1) { 11622 Py_DECREF(str_obj); 11623 return NULL; 11624 } 11625 11626 kind1 = PyUnicode_KIND(str_in); 11627 kind2 = PyUnicode_KIND(sep_obj); 11628 kind = kind1 > kind2 ? 
kind1 : kind2; 11629 buf1 = PyUnicode_DATA(str_in); 11630 if (kind1 != kind) 11631 buf1 = _PyUnicode_AsKind(str_in, kind); 11632 if (!buf1) 11633 goto onError; 11634 buf2 = PyUnicode_DATA(sep_obj); 11635 if (kind2 != kind) 11636 buf2 = _PyUnicode_AsKind(sep_obj, kind); 11637 if (!buf2) 11638 goto onError; 11639 len1 = PyUnicode_GET_LENGTH(str_obj); 11640 len2 = PyUnicode_GET_LENGTH(sep_obj); 11641 11642 switch(PyUnicode_KIND(str_in)) { 11643 case PyUnicode_1BYTE_KIND: 11644 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2); 11645 break; 11646 case PyUnicode_2BYTE_KIND: 11647 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2); 11648 break; 11649 case PyUnicode_4BYTE_KIND: 11650 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2); 11651 break; 11652 default: 11653 assert(0); 11654 out = 0; 11655 } 11656 11657 Py_DECREF(sep_obj); 11658 Py_DECREF(str_obj); 11659 if (kind1 != kind) 11660 PyMem_Free(buf1); 11661 if (kind2 != kind) 11662 PyMem_Free(buf2); 11663 11664 return out; 11665 onError: 11666 Py_DECREF(sep_obj); 11667 Py_DECREF(str_obj); 11668 if (kind1 != kind && buf1) 11669 PyMem_Free(buf1); 11670 if (kind2 != kind && buf2) 11671 PyMem_Free(buf2); 11672 return NULL; 11673} 11674 11675 11676PyObject * 11677PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in) 11678{ 11679 PyObject* str_obj; 11680 PyObject* sep_obj; 11681 PyObject* out; 11682 int kind1, kind2, kind; 11683 void *buf1 = NULL, *buf2 = NULL; 11684 Py_ssize_t len1, len2; 11685 11686 str_obj = PyUnicode_FromObject(str_in); 11687 if (!str_obj) 11688 return NULL; 11689 sep_obj = PyUnicode_FromObject(sep_in); 11690 if (!sep_obj) { 11691 Py_DECREF(str_obj); 11692 return NULL; 11693 } 11694 11695 kind1 = PyUnicode_KIND(str_in); 11696 kind2 = PyUnicode_KIND(sep_obj); 11697 kind = Py_MAX(kind1, kind2); 11698 buf1 = PyUnicode_DATA(str_in); 11699 if (kind1 != kind) 11700 buf1 = _PyUnicode_AsKind(str_in, kind); 11701 if (!buf1) 11702 goto onError; 11703 buf2 = 
PyUnicode_DATA(sep_obj); 11704 if (kind2 != kind) 11705 buf2 = _PyUnicode_AsKind(sep_obj, kind); 11706 if (!buf2) 11707 goto onError; 11708 len1 = PyUnicode_GET_LENGTH(str_obj); 11709 len2 = PyUnicode_GET_LENGTH(sep_obj); 11710 11711 switch(PyUnicode_KIND(str_in)) { 11712 case PyUnicode_1BYTE_KIND: 11713 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2); 11714 break; 11715 case PyUnicode_2BYTE_KIND: 11716 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2); 11717 break; 11718 case PyUnicode_4BYTE_KIND: 11719 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2); 11720 break; 11721 default: 11722 assert(0); 11723 out = 0; 11724 } 11725 11726 Py_DECREF(sep_obj); 11727 Py_DECREF(str_obj); 11728 if (kind1 != kind) 11729 PyMem_Free(buf1); 11730 if (kind2 != kind) 11731 PyMem_Free(buf2); 11732 11733 return out; 11734 onError: 11735 Py_DECREF(sep_obj); 11736 Py_DECREF(str_obj); 11737 if (kind1 != kind && buf1) 11738 PyMem_Free(buf1); 11739 if (kind2 != kind && buf2) 11740 PyMem_Free(buf2); 11741 return NULL; 11742} 11743 11744PyDoc_STRVAR(partition__doc__, 11745 "S.partition(sep) -> (head, sep, tail)\n\ 11746\n\ 11747Search for the separator sep in S, and return the part before it,\n\ 11748the separator itself, and the part after it. If the separator is not\n\ 11749found, return S and two empty strings."); 11750 11751static PyObject* 11752unicode_partition(PyUnicodeObject *self, PyObject *separator) 11753{ 11754 return PyUnicode_Partition((PyObject *)self, separator); 11755} 11756 11757PyDoc_STRVAR(rpartition__doc__, 11758 "S.rpartition(sep) -> (head, sep, tail)\n\ 11759\n\ 11760Search for the separator sep in S, starting at the end of S, and return\n\ 11761the part before it, the separator itself, and the part after it. 
If the\n\ 11762separator is not found, return two empty strings and S."); 11763 11764static PyObject* 11765unicode_rpartition(PyUnicodeObject *self, PyObject *separator) 11766{ 11767 return PyUnicode_RPartition((PyObject *)self, separator); 11768} 11769 11770PyObject * 11771PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit) 11772{ 11773 PyObject *result; 11774 11775 s = PyUnicode_FromObject(s); 11776 if (s == NULL) 11777 return NULL; 11778 if (sep != NULL) { 11779 sep = PyUnicode_FromObject(sep); 11780 if (sep == NULL) { 11781 Py_DECREF(s); 11782 return NULL; 11783 } 11784 } 11785 11786 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit); 11787 11788 Py_DECREF(s); 11789 Py_XDECREF(sep); 11790 return result; 11791} 11792 11793PyDoc_STRVAR(rsplit__doc__, 11794 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\ 11795\n\ 11796Return a list of the words in S, using sep as the\n\ 11797delimiter string, starting at the end of the string and\n\ 11798working to the front. If maxsplit is given, at most maxsplit\n\ 11799splits are done. 
If sep is not specified, any whitespace string\n\ 11800is a separator."); 11801 11802static PyObject* 11803unicode_rsplit(PyUnicodeObject *self, PyObject *args) 11804{ 11805 PyObject *substring = Py_None; 11806 Py_ssize_t maxcount = -1; 11807 11808 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount)) 11809 return NULL; 11810 11811 if (substring == Py_None) 11812 return rsplit(self, NULL, maxcount); 11813 else if (PyUnicode_Check(substring)) 11814 return rsplit(self, (PyUnicodeObject *)substring, maxcount); 11815 else 11816 return PyUnicode_RSplit((PyObject *)self, substring, maxcount); 11817} 11818 11819PyDoc_STRVAR(splitlines__doc__, 11820 "S.splitlines([keepends]) -> list of strings\n\ 11821\n\ 11822Return a list of the lines in S, breaking at line boundaries.\n\ 11823Line breaks are not included in the resulting list unless keepends\n\ 11824is given and true."); 11825 11826static PyObject* 11827unicode_splitlines(PyUnicodeObject *self, PyObject *args, PyObject *kwds) 11828{ 11829 static char *kwlist[] = {"keepends", 0}; 11830 int keepends = 0; 11831 11832 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines", 11833 kwlist, &keepends)) 11834 return NULL; 11835 11836 return PyUnicode_Splitlines((PyObject *)self, keepends); 11837} 11838 11839static 11840PyObject *unicode_str(PyObject *self) 11841{ 11842 if (PyUnicode_CheckExact(self)) { 11843 Py_INCREF(self); 11844 return self; 11845 } else 11846 /* Subtype -- return genuine unicode string with the same value. 
*/ 11847 return PyUnicode_Copy(self); 11848} 11849 11850PyDoc_STRVAR(swapcase__doc__, 11851 "S.swapcase() -> str\n\ 11852\n\ 11853Return a copy of S with uppercase characters converted to lowercase\n\ 11854and vice versa."); 11855 11856static PyObject* 11857unicode_swapcase(PyUnicodeObject *self) 11858{ 11859 return fixup(self, fixswapcase); 11860} 11861 11862PyDoc_STRVAR(maketrans__doc__, 11863 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\ 11864\n\ 11865Return a translation table usable for str.translate().\n\ 11866If there is only one argument, it must be a dictionary mapping Unicode\n\ 11867ordinals (integers) or characters to Unicode ordinals, strings or None.\n\ 11868Character keys will be then converted to ordinals.\n\ 11869If there are two arguments, they must be strings of equal length, and\n\ 11870in the resulting dictionary, each character in x will be mapped to the\n\ 11871character at the same position in y. If there is a third argument, it\n\ 11872must be a string, whose characters will be mapped to None in the result."); 11873 11874static PyObject* 11875unicode_maketrans(PyUnicodeObject *null, PyObject *args) 11876{ 11877 PyObject *x, *y = NULL, *z = NULL; 11878 PyObject *new = NULL, *key, *value; 11879 Py_ssize_t i = 0; 11880 int res; 11881 11882 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z)) 11883 return NULL; 11884 new = PyDict_New(); 11885 if (!new) 11886 return NULL; 11887 if (y != NULL) { 11888 int x_kind, y_kind, z_kind; 11889 void *x_data, *y_data, *z_data; 11890 11891 /* x must be a string too, of equal length */ 11892 if (!PyUnicode_Check(x)) { 11893 PyErr_SetString(PyExc_TypeError, "first maketrans argument must " 11894 "be a string if there is a second argument"); 11895 goto err; 11896 } 11897 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) { 11898 PyErr_SetString(PyExc_ValueError, "the first two maketrans " 11899 "arguments must have equal length"); 11900 goto err; 11901 } 11902 /* create entries for 
translating chars in x to those in y */ 11903 x_kind = PyUnicode_KIND(x); 11904 y_kind = PyUnicode_KIND(y); 11905 x_data = PyUnicode_DATA(x); 11906 y_data = PyUnicode_DATA(y); 11907 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) { 11908 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i)); 11909 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i)); 11910 if (!key || !value) 11911 goto err; 11912 res = PyDict_SetItem(new, key, value); 11913 Py_DECREF(key); 11914 Py_DECREF(value); 11915 if (res < 0) 11916 goto err; 11917 } 11918 /* create entries for deleting chars in z */ 11919 if (z != NULL) { 11920 z_kind = PyUnicode_KIND(z); 11921 z_data = PyUnicode_DATA(z); 11922 for (i = 0; i < PyUnicode_GET_SIZE(z); i++) { 11923 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i)); 11924 if (!key) 11925 goto err; 11926 res = PyDict_SetItem(new, key, Py_None); 11927 Py_DECREF(key); 11928 if (res < 0) 11929 goto err; 11930 } 11931 } 11932 } else { 11933 int kind; 11934 void *data; 11935 11936 /* x must be a dict */ 11937 if (!PyDict_CheckExact(x)) { 11938 PyErr_SetString(PyExc_TypeError, "if you give only one argument " 11939 "to maketrans it must be a dict"); 11940 goto err; 11941 } 11942 /* copy entries into the new dict, converting string keys to int keys */ 11943 while (PyDict_Next(x, &i, &key, &value)) { 11944 if (PyUnicode_Check(key)) { 11945 /* convert string keys to integer keys */ 11946 PyObject *newkey; 11947 if (PyUnicode_GET_SIZE(key) != 1) { 11948 PyErr_SetString(PyExc_ValueError, "string keys in translate " 11949 "table must be of length 1"); 11950 goto err; 11951 } 11952 kind = PyUnicode_KIND(key); 11953 data = PyUnicode_DATA(key); 11954 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0)); 11955 if (!newkey) 11956 goto err; 11957 res = PyDict_SetItem(new, newkey, value); 11958 Py_DECREF(newkey); 11959 if (res < 0) 11960 goto err; 11961 } else if (PyLong_Check(key)) { 11962 /* just keep integer keys */ 11963 if (PyDict_SetItem(new, key, value) < 0) 
11964 goto err; 11965 } else { 11966 PyErr_SetString(PyExc_TypeError, "keys in translate table must " 11967 "be strings or integers"); 11968 goto err; 11969 } 11970 } 11971 } 11972 return new; 11973 err: 11974 Py_DECREF(new); 11975 return NULL; 11976} 11977 11978PyDoc_STRVAR(translate__doc__, 11979 "S.translate(table) -> str\n\ 11980\n\ 11981Return a copy of the string S, where all characters have been mapped\n\ 11982through the given translation table, which must be a mapping of\n\ 11983Unicode ordinals to Unicode ordinals, strings, or None.\n\ 11984Unmapped characters are left untouched. Characters mapped to None\n\ 11985are deleted."); 11986 11987static PyObject* 11988unicode_translate(PyObject *self, PyObject *table) 11989{ 11990 return _PyUnicode_TranslateCharmap(self, table, "ignore"); 11991} 11992 11993PyDoc_STRVAR(upper__doc__, 11994 "S.upper() -> str\n\ 11995\n\ 11996Return a copy of S converted to uppercase."); 11997 11998static PyObject* 11999unicode_upper(PyUnicodeObject *self) 12000{ 12001 return fixup(self, fixupper); 12002} 12003 12004PyDoc_STRVAR(zfill__doc__, 12005 "S.zfill(width) -> str\n\ 12006\n\ 12007Pad a numeric string S with zeros on the left, to fill a field\n\ 12008of the specified width. 
The string S is never truncated."); 12009 12010static PyObject * 12011unicode_zfill(PyUnicodeObject *self, PyObject *args) 12012{ 12013 Py_ssize_t fill; 12014 PyUnicodeObject *u; 12015 Py_ssize_t width; 12016 int kind; 12017 void *data; 12018 Py_UCS4 chr; 12019 12020 if (PyUnicode_READY(self) == -1) 12021 return NULL; 12022 12023 if (!PyArg_ParseTuple(args, "n:zfill", &width)) 12024 return NULL; 12025 12026 if (PyUnicode_GET_LENGTH(self) >= width) { 12027 if (PyUnicode_CheckExact(self)) { 12028 Py_INCREF(self); 12029 return (PyObject*) self; 12030 } 12031 else 12032 return PyUnicode_Copy((PyObject*)self); 12033 } 12034 12035 fill = width - _PyUnicode_LENGTH(self); 12036 12037 u = pad(self, fill, 0, '0'); 12038 12039 if (u == NULL) 12040 return NULL; 12041 12042 kind = PyUnicode_KIND(u); 12043 data = PyUnicode_DATA(u); 12044 chr = PyUnicode_READ(kind, data, fill); 12045 12046 if (chr == '+' || chr == '-') { 12047 /* move sign to beginning of string */ 12048 PyUnicode_WRITE(kind, data, 0, chr); 12049 PyUnicode_WRITE(kind, data, fill, '0'); 12050 } 12051 12052 return (PyObject*) u; 12053} 12054 12055#if 0 12056static PyObject * 12057unicode__decimal2ascii(PyObject *self) 12058{ 12059 return PyUnicode_TransformDecimalAndSpaceToASCII(self); 12060} 12061#endif 12062 12063PyDoc_STRVAR(startswith__doc__, 12064 "S.startswith(prefix[, start[, end]]) -> bool\n\ 12065\n\ 12066Return True if S starts with the specified prefix, False otherwise.\n\ 12067With optional start, test S beginning at that position.\n\ 12068With optional end, stop comparing S at that position.\n\ 12069prefix can also be a tuple of strings to try."); 12070 12071static PyObject * 12072unicode_startswith(PyUnicodeObject *self, 12073 PyObject *args) 12074{ 12075 PyObject *subobj; 12076 PyUnicodeObject *substring; 12077 Py_ssize_t start = 0; 12078 Py_ssize_t end = PY_SSIZE_T_MAX; 12079 int result; 12080 12081 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end)) 12082 return NULL; 12083 
if (PyTuple_Check(subobj)) { 12084 Py_ssize_t i; 12085 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) { 12086 substring = (PyUnicodeObject *)PyUnicode_FromObject( 12087 PyTuple_GET_ITEM(subobj, i)); 12088 if (substring == NULL) 12089 return NULL; 12090 result = tailmatch(self, substring, start, end, -1); 12091 Py_DECREF(substring); 12092 if (result) { 12093 Py_RETURN_TRUE; 12094 } 12095 } 12096 /* nothing matched */ 12097 Py_RETURN_FALSE; 12098 } 12099 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj); 12100 if (substring == NULL) { 12101 if (PyErr_ExceptionMatches(PyExc_TypeError)) 12102 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or " 12103 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name); 12104 return NULL; 12105 } 12106 result = tailmatch(self, substring, start, end, -1); 12107 Py_DECREF(substring); 12108 return PyBool_FromLong(result); 12109} 12110 12111 12112PyDoc_STRVAR(endswith__doc__, 12113 "S.endswith(suffix[, start[, end]]) -> bool\n\ 12114\n\ 12115Return True if S ends with the specified suffix, False otherwise.\n\ 12116With optional start, test S beginning at that position.\n\ 12117With optional end, stop comparing S at that position.\n\ 12118suffix can also be a tuple of strings to try."); 12119 12120static PyObject * 12121unicode_endswith(PyUnicodeObject *self, 12122 PyObject *args) 12123{ 12124 PyObject *subobj; 12125 PyUnicodeObject *substring; 12126 Py_ssize_t start = 0; 12127 Py_ssize_t end = PY_SSIZE_T_MAX; 12128 int result; 12129 12130 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end)) 12131 return NULL; 12132 if (PyTuple_Check(subobj)) { 12133 Py_ssize_t i; 12134 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) { 12135 substring = (PyUnicodeObject *)PyUnicode_FromObject( 12136 PyTuple_GET_ITEM(subobj, i)); 12137 if (substring == NULL) 12138 return NULL; 12139 result = tailmatch(self, substring, start, end, +1); 12140 Py_DECREF(substring); 12141 if (result) { 12142 Py_RETURN_TRUE; 12143 } 
12144 } 12145 Py_RETURN_FALSE; 12146 } 12147 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj); 12148 if (substring == NULL) { 12149 if (PyErr_ExceptionMatches(PyExc_TypeError)) 12150 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or " 12151 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name); 12152 return NULL; 12153 } 12154 result = tailmatch(self, substring, start, end, +1); 12155 Py_DECREF(substring); 12156 return PyBool_FromLong(result); 12157} 12158 12159#include "stringlib/unicode_format.h" 12160 12161PyDoc_STRVAR(format__doc__, 12162 "S.format(*args, **kwargs) -> str\n\ 12163\n\ 12164Return a formatted version of S, using substitutions from args and kwargs.\n\ 12165The substitutions are identified by braces ('{' and '}')."); 12166 12167PyDoc_STRVAR(format_map__doc__, 12168 "S.format_map(mapping) -> str\n\ 12169\n\ 12170Return a formatted version of S, using substitutions from mapping.\n\ 12171The substitutions are identified by braces ('{' and '}')."); 12172 12173static PyObject * 12174unicode__format__(PyObject* self, PyObject* args) 12175{ 12176 PyObject *format_spec; 12177 12178 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec)) 12179 return NULL; 12180 12181 return _PyUnicode_FormatAdvanced(self, format_spec, 0, 12182 PyUnicode_GET_LENGTH(format_spec)); 12183} 12184 12185PyDoc_STRVAR(p_format__doc__, 12186 "S.__format__(format_spec) -> str\n\ 12187\n\ 12188Return a formatted version of S as described by format_spec."); 12189 12190static PyObject * 12191unicode__sizeof__(PyUnicodeObject *v) 12192{ 12193 Py_ssize_t size; 12194 12195 /* If it's a compact object, account for base structure + 12196 character data. 
*/ 12197 if (PyUnicode_IS_COMPACT_ASCII(v)) 12198 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1; 12199 else if (PyUnicode_IS_COMPACT(v)) 12200 size = sizeof(PyCompactUnicodeObject) + 12201 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_CHARACTER_SIZE(v); 12202 else { 12203 /* If it is a two-block object, account for base object, and 12204 for character block if present. */ 12205 size = sizeof(PyUnicodeObject); 12206 if (_PyUnicode_DATA_ANY(v)) 12207 size += (PyUnicode_GET_LENGTH(v) + 1) * 12208 PyUnicode_CHARACTER_SIZE(v); 12209 } 12210 /* If the wstr pointer is present, account for it unless it is shared 12211 with the data pointer. Check if the data is not shared. */ 12212 if (_PyUnicode_HAS_WSTR_MEMORY(v)) 12213 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t); 12214 if (_PyUnicode_HAS_UTF8_MEMORY(v)) 12215 size += PyUnicode_UTF8_LENGTH(v) + 1; 12216 12217 return PyLong_FromSsize_t(size); 12218} 12219 12220PyDoc_STRVAR(sizeof__doc__, 12221 "S.__sizeof__() -> size of S in memory, in bytes"); 12222 12223static PyObject * 12224unicode_getnewargs(PyObject *v) 12225{ 12226 PyObject *copy = PyUnicode_Copy(v); 12227 if (!copy) 12228 return NULL; 12229 return Py_BuildValue("(N)", copy); 12230} 12231 12232static PyMethodDef unicode_methods[] = { 12233 12234 /* Order is according to common usage: often used methods should 12235 appear first, since lookup is done sequentially. 
*/ 12236 12237 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__}, 12238 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__}, 12239 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__}, 12240 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__}, 12241 {"join", (PyCFunction) unicode_join, METH_O, join__doc__}, 12242 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__}, 12243 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__}, 12244 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__}, 12245 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__}, 12246 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__}, 12247 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__}, 12248 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__}, 12249 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__}, 12250 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__}, 12251 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__}, 12252 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__}, 12253 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__}, 12254 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__}, 12255 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__}, 12256 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__}, 12257 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__}, 12258 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__}, 12259 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__}, 12260 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__}, 12261 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__}, 12262 {"upper", 
(PyCFunction) unicode_upper, METH_NOARGS, upper__doc__}, 12263 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__}, 12264 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__}, 12265 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__}, 12266 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__}, 12267 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__}, 12268 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__}, 12269 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__}, 12270 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__}, 12271 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__}, 12272 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__}, 12273 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__}, 12274 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__}, 12275 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__}, 12276 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__}, 12277 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__}, 12278 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__}, 12279 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__}, 12280 {"maketrans", (PyCFunction) unicode_maketrans, 12281 METH_VARARGS | METH_STATIC, maketrans__doc__}, 12282 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__}, 12283#if 0 12284 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__}, 12285#endif 12286 12287#if 0 12288 /* These methods are just used for debugging the implementation. 
*/ 12289 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS}, 12290#endif 12291 12292 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS}, 12293 {NULL, NULL} 12294}; 12295 12296static PyObject * 12297unicode_mod(PyObject *v, PyObject *w) 12298{ 12299 if (!PyUnicode_Check(v)) 12300 Py_RETURN_NOTIMPLEMENTED; 12301 return PyUnicode_Format(v, w); 12302} 12303 12304static PyNumberMethods unicode_as_number = { 12305 0, /*nb_add*/ 12306 0, /*nb_subtract*/ 12307 0, /*nb_multiply*/ 12308 unicode_mod, /*nb_remainder*/ 12309}; 12310 12311static PySequenceMethods unicode_as_sequence = { 12312 (lenfunc) unicode_length, /* sq_length */ 12313 PyUnicode_Concat, /* sq_concat */ 12314 (ssizeargfunc) unicode_repeat, /* sq_repeat */ 12315 (ssizeargfunc) unicode_getitem, /* sq_item */ 12316 0, /* sq_slice */ 12317 0, /* sq_ass_item */ 12318 0, /* sq_ass_slice */ 12319 PyUnicode_Contains, /* sq_contains */ 12320}; 12321 12322static PyObject* 12323unicode_subscript(PyUnicodeObject* self, PyObject* item) 12324{ 12325 if (PyUnicode_READY(self) == -1) 12326 return NULL; 12327 12328 if (PyIndex_Check(item)) { 12329 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError); 12330 if (i == -1 && PyErr_Occurred()) 12331 return NULL; 12332 if (i < 0) 12333 i += PyUnicode_GET_LENGTH(self); 12334 return unicode_getitem((PyObject*)self, i); 12335 } else if (PySlice_Check(item)) { 12336 Py_ssize_t start, stop, step, slicelength, cur, i; 12337 PyObject *result; 12338 void *src_data, *dest_data; 12339 int src_kind, dest_kind; 12340 Py_UCS4 ch, max_char; 12341 12342 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self), 12343 &start, &stop, &step, &slicelength) < 0) { 12344 return NULL; 12345 } 12346 12347 if (slicelength <= 0) { 12348 return PyUnicode_New(0, 0); 12349 } else if (start == 0 && step == 1 && 12350 slicelength == PyUnicode_GET_LENGTH(self) && 12351 PyUnicode_CheckExact(self)) { 12352 Py_INCREF(self); 12353 return (PyObject *)self; 12354 } else if (step 
== 1) { 12355 return PyUnicode_Substring((PyObject*)self, 12356 start, start + slicelength); 12357 } 12358 /* General case */ 12359 max_char = 127; 12360 src_kind = PyUnicode_KIND(self); 12361 src_data = PyUnicode_DATA(self); 12362 for (cur = start, i = 0; i < slicelength; cur += step, i++) { 12363 ch = PyUnicode_READ(src_kind, src_data, cur); 12364 if (ch > max_char) 12365 max_char = ch; 12366 } 12367 result = PyUnicode_New(slicelength, max_char); 12368 if (result == NULL) 12369 return NULL; 12370 dest_kind = PyUnicode_KIND(result); 12371 dest_data = PyUnicode_DATA(result); 12372 12373 for (cur = start, i = 0; i < slicelength; cur += step, i++) { 12374 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur); 12375 PyUnicode_WRITE(dest_kind, dest_data, i, ch); 12376 } 12377 return result; 12378 } else { 12379 PyErr_SetString(PyExc_TypeError, "string indices must be integers"); 12380 return NULL; 12381 } 12382} 12383 12384static PyMappingMethods unicode_as_mapping = { 12385 (lenfunc)unicode_length, /* mp_length */ 12386 (binaryfunc)unicode_subscript, /* mp_subscript */ 12387 (objobjargproc)0, /* mp_ass_subscript */ 12388}; 12389 12390 12391/* Helpers for PyUnicode_Format() */ 12392 12393static PyObject * 12394getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx) 12395{ 12396 Py_ssize_t argidx = *p_argidx; 12397 if (argidx < arglen) { 12398 (*p_argidx)++; 12399 if (arglen < 0) 12400 return args; 12401 else 12402 return PyTuple_GetItem(args, argidx); 12403 } 12404 PyErr_SetString(PyExc_TypeError, 12405 "not enough arguments for format string"); 12406 return NULL; 12407} 12408 12409/* Returns a new reference to a PyUnicode object, or NULL on failure. 
*/ 12410 12411static PyObject * 12412formatfloat(PyObject *v, int flags, int prec, int type) 12413{ 12414 char *p; 12415 PyObject *result; 12416 double x; 12417 12418 x = PyFloat_AsDouble(v); 12419 if (x == -1.0 && PyErr_Occurred()) 12420 return NULL; 12421 12422 if (prec < 0) 12423 prec = 6; 12424 12425 p = PyOS_double_to_string(x, type, prec, 12426 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL); 12427 if (p == NULL) 12428 return NULL; 12429 result = PyUnicode_DecodeASCII(p, strlen(p), NULL); 12430 PyMem_Free(p); 12431 return result; 12432} 12433 12434static PyObject* 12435formatlong(PyObject *val, int flags, int prec, int type) 12436{ 12437 char *buf; 12438 int len; 12439 PyObject *str; /* temporary string object. */ 12440 PyObject *result; 12441 12442 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len); 12443 if (!str) 12444 return NULL; 12445 result = PyUnicode_DecodeASCII(buf, len, NULL); 12446 Py_DECREF(str); 12447 return result; 12448} 12449 12450static int 12451formatchar(Py_UCS4 *buf, 12452 size_t buflen, 12453 PyObject *v) 12454{ 12455 /* presume that the buffer is at least 3 characters long */ 12456 if (PyUnicode_Check(v)) { 12457 if (PyUnicode_GET_LENGTH(v) == 1) { 12458 buf[0] = PyUnicode_READ_CHAR(v, 0); 12459 buf[1] = '\0'; 12460 return 1; 12461 } 12462 goto onError; 12463 } 12464 else { 12465 /* Integer input truncated to a character */ 12466 long x; 12467 x = PyLong_AsLong(v); 12468 if (x == -1 && PyErr_Occurred()) 12469 goto onError; 12470 12471 if (x < 0 || x > 0x10ffff) { 12472 PyErr_SetString(PyExc_OverflowError, 12473 "%c arg not in range(0x110000)"); 12474 return -1; 12475 } 12476 12477 buf[0] = (Py_UCS4) x; 12478 buf[1] = '\0'; 12479 return 1; 12480 } 12481 12482 onError: 12483 PyErr_SetString(PyExc_TypeError, 12484 "%c requires int or char"); 12485 return -1; 12486} 12487 12488/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...) 12489 FORMATBUFLEN is the length of the buffer in which chars are formatted. 
12490*/ 12491#define FORMATBUFLEN (size_t)10 12492 12493PyObject * 12494PyUnicode_Format(PyObject *format, PyObject *args) 12495{ 12496 void *fmt; 12497 int fmtkind; 12498 PyObject *result; 12499 Py_UCS4 *res, *res0; 12500 Py_UCS4 max; 12501 int kind; 12502 Py_ssize_t fmtcnt, fmtpos, rescnt, reslen, arglen, argidx; 12503 int args_owned = 0; 12504 PyObject *dict = NULL; 12505 PyUnicodeObject *uformat; 12506 12507 if (format == NULL || args == NULL) { 12508 PyErr_BadInternalCall(); 12509 return NULL; 12510 } 12511 uformat = (PyUnicodeObject*)PyUnicode_FromObject(format); 12512 if (uformat == NULL || PyUnicode_READY(uformat) == -1) 12513 return NULL; 12514 fmt = PyUnicode_DATA(uformat); 12515 fmtkind = PyUnicode_KIND(uformat); 12516 fmtcnt = PyUnicode_GET_LENGTH(uformat); 12517 fmtpos = 0; 12518 12519 reslen = rescnt = fmtcnt + 100; 12520 res = res0 = PyMem_Malloc(reslen * sizeof(Py_UCS4)); 12521 if (res0 == NULL) { 12522 PyErr_NoMemory(); 12523 goto onError; 12524 } 12525 12526 if (PyTuple_Check(args)) { 12527 arglen = PyTuple_Size(args); 12528 argidx = 0; 12529 } 12530 else { 12531 arglen = -1; 12532 argidx = -2; 12533 } 12534 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) && 12535 !PyUnicode_Check(args)) 12536 dict = args; 12537 12538 while (--fmtcnt >= 0) { 12539 if (PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') { 12540 if (--rescnt < 0) { 12541 rescnt = fmtcnt + 100; 12542 reslen += rescnt; 12543 res0 = PyMem_Realloc(res0, reslen*sizeof(Py_UCS4)); 12544 if (res0 == NULL){ 12545 PyErr_NoMemory(); 12546 goto onError; 12547 } 12548 res = res0 + reslen - rescnt; 12549 --rescnt; 12550 } 12551 *res++ = PyUnicode_READ(fmtkind, fmt, fmtpos++); 12552 } 12553 else { 12554 /* Got a format specifier */ 12555 int flags = 0; 12556 Py_ssize_t width = -1; 12557 int prec = -1; 12558 Py_UCS4 c = '\0'; 12559 Py_UCS4 fill; 12560 int isnumok; 12561 PyObject *v = NULL; 12562 PyObject *temp = NULL; 12563 void *pbuf; 12564 Py_ssize_t pindex; 12565 Py_UNICODE sign; 12566 
Py_ssize_t len, len1; 12567 Py_UCS4 formatbuf[FORMATBUFLEN]; /* For formatchar() */ 12568 12569 fmtpos++; 12570 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(') { 12571 Py_ssize_t keystart; 12572 Py_ssize_t keylen; 12573 PyObject *key; 12574 int pcount = 1; 12575 12576 if (dict == NULL) { 12577 PyErr_SetString(PyExc_TypeError, 12578 "format requires a mapping"); 12579 goto onError; 12580 } 12581 ++fmtpos; 12582 --fmtcnt; 12583 keystart = fmtpos; 12584 /* Skip over balanced parentheses */ 12585 while (pcount > 0 && --fmtcnt >= 0) { 12586 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == ')') 12587 --pcount; 12588 else if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(') 12589 ++pcount; 12590 fmtpos++; 12591 } 12592 keylen = fmtpos - keystart - 1; 12593 if (fmtcnt < 0 || pcount > 0) { 12594 PyErr_SetString(PyExc_ValueError, 12595 "incomplete format key"); 12596 goto onError; 12597 } 12598 key = PyUnicode_Substring((PyObject*)uformat, 12599 keystart, keystart + keylen); 12600 if (key == NULL) 12601 goto onError; 12602 if (args_owned) { 12603 Py_DECREF(args); 12604 args_owned = 0; 12605 } 12606 args = PyObject_GetItem(dict, key); 12607 Py_DECREF(key); 12608 if (args == NULL) { 12609 goto onError; 12610 } 12611 args_owned = 1; 12612 arglen = -1; 12613 argidx = -2; 12614 } 12615 while (--fmtcnt >= 0) { 12616 switch (c = PyUnicode_READ(fmtkind, fmt, fmtpos++)) { 12617 case '-': flags |= F_LJUST; continue; 12618 case '+': flags |= F_SIGN; continue; 12619 case ' ': flags |= F_BLANK; continue; 12620 case '#': flags |= F_ALT; continue; 12621 case '0': flags |= F_ZERO; continue; 12622 } 12623 break; 12624 } 12625 if (c == '*') { 12626 v = getnextarg(args, arglen, &argidx); 12627 if (v == NULL) 12628 goto onError; 12629 if (!PyLong_Check(v)) { 12630 PyErr_SetString(PyExc_TypeError, 12631 "* wants int"); 12632 goto onError; 12633 } 12634 width = PyLong_AsLong(v); 12635 if (width == -1 && PyErr_Occurred()) 12636 goto onError; 12637 if (width < 0) { 12638 flags |= F_LJUST; 12639 width = 
-width; 12640 } 12641 if (--fmtcnt >= 0) 12642 c = PyUnicode_READ(fmtkind, fmt, fmtpos++); 12643 } 12644 else if (c >= '0' && c <= '9') { 12645 width = c - '0'; 12646 while (--fmtcnt >= 0) { 12647 c = PyUnicode_READ(fmtkind, fmt, fmtpos++); 12648 if (c < '0' || c > '9') 12649 break; 12650 if ((width*10) / 10 != width) { 12651 PyErr_SetString(PyExc_ValueError, 12652 "width too big"); 12653 goto onError; 12654 } 12655 width = width*10 + (c - '0'); 12656 } 12657 } 12658 if (c == '.') { 12659 prec = 0; 12660 if (--fmtcnt >= 0) 12661 c = PyUnicode_READ(fmtkind, fmt, fmtpos++); 12662 if (c == '*') { 12663 v = getnextarg(args, arglen, &argidx); 12664 if (v == NULL) 12665 goto onError; 12666 if (!PyLong_Check(v)) { 12667 PyErr_SetString(PyExc_TypeError, 12668 "* wants int"); 12669 goto onError; 12670 } 12671 prec = PyLong_AsLong(v); 12672 if (prec == -1 && PyErr_Occurred()) 12673 goto onError; 12674 if (prec < 0) 12675 prec = 0; 12676 if (--fmtcnt >= 0) 12677 c = PyUnicode_READ(fmtkind, fmt, fmtpos++); 12678 } 12679 else if (c >= '0' && c <= '9') { 12680 prec = c - '0'; 12681 while (--fmtcnt >= 0) { 12682 c = PyUnicode_READ(fmtkind, fmt, fmtpos++); 12683 if (c < '0' || c > '9') 12684 break; 12685 if ((prec*10) / 10 != prec) { 12686 PyErr_SetString(PyExc_ValueError, 12687 "prec too big"); 12688 goto onError; 12689 } 12690 prec = prec*10 + (c - '0'); 12691 } 12692 } 12693 } /* prec */ 12694 if (fmtcnt >= 0) { 12695 if (c == 'h' || c == 'l' || c == 'L') { 12696 if (--fmtcnt >= 0) 12697 c = PyUnicode_READ(fmtkind, fmt, fmtpos++); 12698 } 12699 } 12700 if (fmtcnt < 0) { 12701 PyErr_SetString(PyExc_ValueError, 12702 "incomplete format"); 12703 goto onError; 12704 } 12705 if (c != '%') { 12706 v = getnextarg(args, arglen, &argidx); 12707 if (v == NULL) 12708 goto onError; 12709 } 12710 sign = 0; 12711 fill = ' '; 12712 switch (c) { 12713 12714 case '%': 12715 pbuf = formatbuf; 12716 kind = PyUnicode_4BYTE_KIND; 12717 /* presume that buffer length is at least 1 */ 12718 
PyUnicode_WRITE(kind, pbuf, 0, '%'); 12719 len = 1; 12720 break; 12721 12722 case 's': 12723 case 'r': 12724 case 'a': 12725 if (PyUnicode_CheckExact(v) && c == 's') { 12726 temp = v; 12727 Py_INCREF(temp); 12728 } 12729 else { 12730 if (c == 's') 12731 temp = PyObject_Str(v); 12732 else if (c == 'r') 12733 temp = PyObject_Repr(v); 12734 else 12735 temp = PyObject_ASCII(v); 12736 if (temp == NULL) 12737 goto onError; 12738 if (PyUnicode_Check(temp)) 12739 /* nothing to do */; 12740 else { 12741 Py_DECREF(temp); 12742 PyErr_SetString(PyExc_TypeError, 12743 "%s argument has non-string str()"); 12744 goto onError; 12745 } 12746 } 12747 if (PyUnicode_READY(temp) == -1) { 12748 Py_CLEAR(temp); 12749 goto onError; 12750 } 12751 pbuf = PyUnicode_DATA(temp); 12752 kind = PyUnicode_KIND(temp); 12753 len = PyUnicode_GET_LENGTH(temp); 12754 if (prec >= 0 && len > prec) 12755 len = prec; 12756 break; 12757 12758 case 'i': 12759 case 'd': 12760 case 'u': 12761 case 'o': 12762 case 'x': 12763 case 'X': 12764 isnumok = 0; 12765 if (PyNumber_Check(v)) { 12766 PyObject *iobj=NULL; 12767 12768 if (PyLong_Check(v)) { 12769 iobj = v; 12770 Py_INCREF(iobj); 12771 } 12772 else { 12773 iobj = PyNumber_Long(v); 12774 } 12775 if (iobj!=NULL) { 12776 if (PyLong_Check(iobj)) { 12777 isnumok = 1; 12778 temp = formatlong(iobj, flags, prec, (c == 'i'? 
'd': c)); 12779 Py_DECREF(iobj); 12780 if (!temp) 12781 goto onError; 12782 if (PyUnicode_READY(temp) == -1) { 12783 Py_CLEAR(temp); 12784 goto onError; 12785 } 12786 pbuf = PyUnicode_DATA(temp); 12787 kind = PyUnicode_KIND(temp); 12788 len = PyUnicode_GET_LENGTH(temp); 12789 sign = 1; 12790 } 12791 else { 12792 Py_DECREF(iobj); 12793 } 12794 } 12795 } 12796 if (!isnumok) { 12797 PyErr_Format(PyExc_TypeError, 12798 "%%%c format: a number is required, " 12799 "not %.200s", (char)c, Py_TYPE(v)->tp_name); 12800 goto onError; 12801 } 12802 if (flags & F_ZERO) 12803 fill = '0'; 12804 break; 12805 12806 case 'e': 12807 case 'E': 12808 case 'f': 12809 case 'F': 12810 case 'g': 12811 case 'G': 12812 temp = formatfloat(v, flags, prec, c); 12813 if (!temp) 12814 goto onError; 12815 if (PyUnicode_READY(temp) == -1) { 12816 Py_CLEAR(temp); 12817 goto onError; 12818 } 12819 pbuf = PyUnicode_DATA(temp); 12820 kind = PyUnicode_KIND(temp); 12821 len = PyUnicode_GET_LENGTH(temp); 12822 sign = 1; 12823 if (flags & F_ZERO) 12824 fill = '0'; 12825 break; 12826 12827 case 'c': 12828 pbuf = formatbuf; 12829 kind = PyUnicode_4BYTE_KIND; 12830 len = formatchar(pbuf, Py_ARRAY_LENGTH(formatbuf), v); 12831 if (len < 0) 12832 goto onError; 12833 break; 12834 12835 default: 12836 PyErr_Format(PyExc_ValueError, 12837 "unsupported format character '%c' (0x%x) " 12838 "at index %zd", 12839 (31<=c && c<=126) ? (char)c : '?', 12840 (int)c, 12841 fmtpos - 1); 12842 goto onError; 12843 } 12844 /* pbuf is initialized here. 
*/ 12845 pindex = 0; 12846 if (sign) { 12847 if (PyUnicode_READ(kind, pbuf, pindex) == '-' || 12848 PyUnicode_READ(kind, pbuf, pindex) == '+') { 12849 sign = PyUnicode_READ(kind, pbuf, pindex++); 12850 len--; 12851 } 12852 else if (flags & F_SIGN) 12853 sign = '+'; 12854 else if (flags & F_BLANK) 12855 sign = ' '; 12856 else 12857 sign = 0; 12858 } 12859 if (width < len) 12860 width = len; 12861 if (rescnt - (sign != 0) < width) { 12862 reslen -= rescnt; 12863 rescnt = width + fmtcnt + 100; 12864 reslen += rescnt; 12865 if (reslen < 0) { 12866 Py_XDECREF(temp); 12867 PyErr_NoMemory(); 12868 goto onError; 12869 } 12870 res0 = PyMem_Realloc(res0, reslen*sizeof(Py_UCS4)); 12871 if (res0 == 0) { 12872 PyErr_NoMemory(); 12873 Py_XDECREF(temp); 12874 goto onError; 12875 } 12876 res = res0 + reslen - rescnt; 12877 } 12878 if (sign) { 12879 if (fill != ' ') 12880 *res++ = sign; 12881 rescnt--; 12882 if (width > len) 12883 width--; 12884 } 12885 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) { 12886 assert(PyUnicode_READ(kind, pbuf, pindex) == '0'); 12887 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c); 12888 if (fill != ' ') { 12889 *res++ = PyUnicode_READ(kind, pbuf, pindex++); 12890 *res++ = PyUnicode_READ(kind, pbuf, pindex++); 12891 } 12892 rescnt -= 2; 12893 width -= 2; 12894 if (width < 0) 12895 width = 0; 12896 len -= 2; 12897 } 12898 if (width > len && !(flags & F_LJUST)) { 12899 do { 12900 --rescnt; 12901 *res++ = fill; 12902 } while (--width > len); 12903 } 12904 if (fill == ' ') { 12905 if (sign) 12906 *res++ = sign; 12907 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) { 12908 assert(PyUnicode_READ(kind, pbuf, pindex) == '0'); 12909 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c); 12910 *res++ = PyUnicode_READ(kind, pbuf, pindex++); 12911 *res++ = PyUnicode_READ(kind, pbuf, pindex++); 12912 } 12913 } 12914 /* Copy all characters, preserving len */ 12915 len1 = len; 12916 while (len1--) { 12917 *res++ = PyUnicode_READ(kind, pbuf, 
pindex++); 12918 rescnt--; 12919 } 12920 while (--width >= len) { 12921 --rescnt; 12922 *res++ = ' '; 12923 } 12924 if (dict && (argidx < arglen) && c != '%') { 12925 PyErr_SetString(PyExc_TypeError, 12926 "not all arguments converted during string formatting"); 12927 Py_XDECREF(temp); 12928 goto onError; 12929 } 12930 Py_XDECREF(temp); 12931 } /* '%' */ 12932 } /* until end */ 12933 if (argidx < arglen && !dict) { 12934 PyErr_SetString(PyExc_TypeError, 12935 "not all arguments converted during string formatting"); 12936 goto onError; 12937 } 12938 12939 12940 for (max=0, res = res0; res < res0+reslen-rescnt; res++) 12941 if (*res > max) 12942 max = *res; 12943 result = PyUnicode_New(reslen - rescnt, max); 12944 if (!result) 12945 goto onError; 12946 kind = PyUnicode_KIND(result); 12947 for (res = res0; res < res0+reslen-rescnt; res++) 12948 PyUnicode_WRITE(kind, PyUnicode_DATA(result), res-res0, *res); 12949 PyMem_Free(res0); 12950 if (args_owned) { 12951 Py_DECREF(args); 12952 } 12953 Py_DECREF(uformat); 12954 return (PyObject *)result; 12955 12956 onError: 12957 PyMem_Free(res0); 12958 Py_DECREF(uformat); 12959 if (args_owned) { 12960 Py_DECREF(args); 12961 } 12962 return NULL; 12963} 12964 12965static PyObject * 12966unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds); 12967 12968static PyObject * 12969unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds) 12970{ 12971 PyObject *x = NULL; 12972 static char *kwlist[] = {"object", "encoding", "errors", 0}; 12973 char *encoding = NULL; 12974 char *errors = NULL; 12975 12976 if (type != &PyUnicode_Type) 12977 return unicode_subtype_new(type, args, kwds); 12978 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str", 12979 kwlist, &x, &encoding, &errors)) 12980 return NULL; 12981 if (x == NULL) 12982 return (PyObject *)PyUnicode_New(0, 0); 12983 if (encoding == NULL && errors == NULL) 12984 return PyObject_Str(x); 12985 else 12986 return PyUnicode_FromEncodedObject(x, encoding, errors); 
12987} 12988 12989static PyObject * 12990unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds) 12991{ 12992 PyUnicodeObject *unicode, *self; 12993 Py_ssize_t length, char_size; 12994 int share_wstr, share_utf8; 12995 unsigned int kind; 12996 void *data; 12997 12998 assert(PyType_IsSubtype(type, &PyUnicode_Type)); 12999 13000 unicode = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds); 13001 if (unicode == NULL) 13002 return NULL; 13003 assert(_PyUnicode_CHECK(unicode)); 13004 if (PyUnicode_READY(unicode)) 13005 return NULL; 13006 13007 self = (PyUnicodeObject *) type->tp_alloc(type, 0); 13008 if (self == NULL) { 13009 Py_DECREF(unicode); 13010 return NULL; 13011 } 13012 kind = PyUnicode_KIND(unicode); 13013 length = PyUnicode_GET_LENGTH(unicode); 13014 13015 _PyUnicode_LENGTH(self) = length; 13016 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode); 13017 _PyUnicode_STATE(self).interned = 0; 13018 _PyUnicode_STATE(self).kind = kind; 13019 _PyUnicode_STATE(self).compact = 0; 13020 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii; 13021 _PyUnicode_STATE(self).ready = 1; 13022 _PyUnicode_WSTR(self) = NULL; 13023 _PyUnicode_UTF8_LENGTH(self) = 0; 13024 _PyUnicode_UTF8(self) = NULL; 13025 _PyUnicode_WSTR_LENGTH(self) = 0; 13026 _PyUnicode_DATA_ANY(self) = NULL; 13027 13028 share_utf8 = 0; 13029 share_wstr = 0; 13030 if (kind == PyUnicode_1BYTE_KIND) { 13031 char_size = 1; 13032 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128) 13033 share_utf8 = 1; 13034 } 13035 else if (kind == PyUnicode_2BYTE_KIND) { 13036 char_size = 2; 13037 if (sizeof(wchar_t) == 2) 13038 share_wstr = 1; 13039 } 13040 else { 13041 assert(kind == PyUnicode_4BYTE_KIND); 13042 char_size = 4; 13043 if (sizeof(wchar_t) == 4) 13044 share_wstr = 1; 13045 } 13046 13047 /* Ensure we won't overflow the length. 
*/ 13048 if (length > (PY_SSIZE_T_MAX / char_size - 1)) { 13049 PyErr_NoMemory(); 13050 goto onError; 13051 } 13052 data = PyObject_MALLOC((length + 1) * char_size); 13053 if (data == NULL) { 13054 PyErr_NoMemory(); 13055 goto onError; 13056 } 13057 13058 _PyUnicode_DATA_ANY(self) = data; 13059 if (share_utf8) { 13060 _PyUnicode_UTF8_LENGTH(self) = length; 13061 _PyUnicode_UTF8(self) = data; 13062 } 13063 if (share_wstr) { 13064 _PyUnicode_WSTR_LENGTH(self) = length; 13065 _PyUnicode_WSTR(self) = (wchar_t *)data; 13066 } 13067 13068 Py_MEMCPY(data, PyUnicode_DATA(unicode), 13069 PyUnicode_KIND_SIZE(kind, length + 1)); 13070 Py_DECREF(unicode); 13071 return (PyObject *)self; 13072 13073onError: 13074 Py_DECREF(unicode); 13075 Py_DECREF(self); 13076 return NULL; 13077} 13078 13079PyDoc_STRVAR(unicode_doc, 13080 "str(string[, encoding[, errors]]) -> str\n\ 13081\n\ 13082Create a new string object from the given encoded string.\n\ 13083encoding defaults to the current default string encoding.\n\ 13084errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'."); 13085 13086static PyObject *unicode_iter(PyObject *seq); 13087 13088PyTypeObject PyUnicode_Type = { 13089 PyVarObject_HEAD_INIT(&PyType_Type, 0) 13090 "str", /* tp_name */ 13091 sizeof(PyUnicodeObject), /* tp_size */ 13092 0, /* tp_itemsize */ 13093 /* Slots */ 13094 (destructor)unicode_dealloc, /* tp_dealloc */ 13095 0, /* tp_print */ 13096 0, /* tp_getattr */ 13097 0, /* tp_setattr */ 13098 0, /* tp_reserved */ 13099 unicode_repr, /* tp_repr */ 13100 &unicode_as_number, /* tp_as_number */ 13101 &unicode_as_sequence, /* tp_as_sequence */ 13102 &unicode_as_mapping, /* tp_as_mapping */ 13103 (hashfunc) unicode_hash, /* tp_hash*/ 13104 0, /* tp_call*/ 13105 (reprfunc) unicode_str, /* tp_str */ 13106 PyObject_GenericGetAttr, /* tp_getattro */ 13107 0, /* tp_setattro */ 13108 0, /* tp_as_buffer */ 13109 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE | 13110 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */ 13111 
unicode_doc, /* tp_doc */ 13112 0, /* tp_traverse */ 13113 0, /* tp_clear */ 13114 PyUnicode_RichCompare, /* tp_richcompare */ 13115 0, /* tp_weaklistoffset */ 13116 unicode_iter, /* tp_iter */ 13117 0, /* tp_iternext */ 13118 unicode_methods, /* tp_methods */ 13119 0, /* tp_members */ 13120 0, /* tp_getset */ 13121 &PyBaseObject_Type, /* tp_base */ 13122 0, /* tp_dict */ 13123 0, /* tp_descr_get */ 13124 0, /* tp_descr_set */ 13125 0, /* tp_dictoffset */ 13126 0, /* tp_init */ 13127 0, /* tp_alloc */ 13128 unicode_new, /* tp_new */ 13129 PyObject_Del, /* tp_free */ 13130}; 13131 13132/* Initialize the Unicode implementation */ 13133 13134void _PyUnicode_Init(void) 13135{ 13136 int i; 13137 13138 /* XXX - move this array to unicodectype.c ? */ 13139 Py_UCS2 linebreak[] = { 13140 0x000A, /* LINE FEED */ 13141 0x000D, /* CARRIAGE RETURN */ 13142 0x001C, /* FILE SEPARATOR */ 13143 0x001D, /* GROUP SEPARATOR */ 13144 0x001E, /* RECORD SEPARATOR */ 13145 0x0085, /* NEXT LINE */ 13146 0x2028, /* LINE SEPARATOR */ 13147 0x2029, /* PARAGRAPH SEPARATOR */ 13148 }; 13149 13150 /* Init the implementation */ 13151 unicode_empty = PyUnicode_New(0, 0); 13152 if (!unicode_empty) 13153 Py_FatalError("Can't create empty string"); 13154 13155 for (i = 0; i < 256; i++) 13156 unicode_latin1[i] = NULL; 13157 if (PyType_Ready(&PyUnicode_Type) < 0) 13158 Py_FatalError("Can't initialize 'unicode'"); 13159 13160 /* initialize the linebreak bloom filter */ 13161 bloom_linebreak = make_bloom_mask( 13162 PyUnicode_2BYTE_KIND, linebreak, 13163 Py_ARRAY_LENGTH(linebreak)); 13164 13165 PyType_Ready(&EncodingMapType); 13166} 13167 13168/* Finalize the Unicode implementation */ 13169 13170int 13171PyUnicode_ClearFreeList(void) 13172{ 13173 return 0; 13174} 13175 13176void 13177_PyUnicode_Fini(void) 13178{ 13179 int i; 13180 13181 Py_XDECREF(unicode_empty); 13182 unicode_empty = NULL; 13183 13184 for (i = 0; i < 256; i++) { 13185 if (unicode_latin1[i]) { 13186 Py_DECREF(unicode_latin1[i]); 13187 
unicode_latin1[i] = NULL; 13188 } 13189 } 13190 (void)PyUnicode_ClearFreeList(); 13191} 13192 13193void 13194PyUnicode_InternInPlace(PyObject **p) 13195{ 13196 register PyUnicodeObject *s = (PyUnicodeObject *)(*p); 13197 PyObject *t; 13198#ifdef Py_DEBUG 13199 assert(s != NULL); 13200 assert(_PyUnicode_CHECK(s)); 13201#else 13202 if (s == NULL || !PyUnicode_Check(s)) 13203 return; 13204#endif 13205 /* If it's a subclass, we don't really know what putting 13206 it in the interned dict might do. */ 13207 if (!PyUnicode_CheckExact(s)) 13208 return; 13209 if (PyUnicode_CHECK_INTERNED(s)) 13210 return; 13211 if (_PyUnicode_READY_REPLACE(p)) { 13212 assert(0 && "_PyUnicode_READY_REPLACE fail in PyUnicode_InternInPlace"); 13213 return; 13214 } 13215 s = (PyUnicodeObject *)(*p); 13216 if (interned == NULL) { 13217 interned = PyDict_New(); 13218 if (interned == NULL) { 13219 PyErr_Clear(); /* Don't leave an exception */ 13220 return; 13221 } 13222 } 13223 /* It might be that the GetItem call fails even 13224 though the key is present in the dictionary, 13225 namely when this happens during a stack overflow. */ 13226 Py_ALLOW_RECURSION 13227 t = PyDict_GetItem(interned, (PyObject *)s); 13228 Py_END_ALLOW_RECURSION 13229 13230 if (t) { 13231 Py_INCREF(t); 13232 Py_DECREF(*p); 13233 *p = t; 13234 return; 13235 } 13236 13237 PyThreadState_GET()->recursion_critical = 1; 13238 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) { 13239 PyErr_Clear(); 13240 PyThreadState_GET()->recursion_critical = 0; 13241 return; 13242 } 13243 PyThreadState_GET()->recursion_critical = 0; 13244 /* The two references in interned are not counted by refcnt. 
13245 The deallocator will take care of this */ 13246 Py_REFCNT(s) -= 2; 13247 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL; 13248} 13249 13250void 13251PyUnicode_InternImmortal(PyObject **p) 13252{ 13253 PyUnicodeObject *u = (PyUnicodeObject *)*p; 13254 13255 PyUnicode_InternInPlace(p); 13256 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) { 13257 _PyUnicode_STATE(u).interned = SSTATE_INTERNED_IMMORTAL; 13258 Py_INCREF(*p); 13259 } 13260} 13261 13262PyObject * 13263PyUnicode_InternFromString(const char *cp) 13264{ 13265 PyObject *s = PyUnicode_FromString(cp); 13266 if (s == NULL) 13267 return NULL; 13268 PyUnicode_InternInPlace(&s); 13269 return s; 13270} 13271 13272void 13273_Py_ReleaseInternedUnicodeStrings(void) 13274{ 13275 PyObject *keys; 13276 PyUnicodeObject *s; 13277 Py_ssize_t i, n; 13278 Py_ssize_t immortal_size = 0, mortal_size = 0; 13279 13280 if (interned == NULL || !PyDict_Check(interned)) 13281 return; 13282 keys = PyDict_Keys(interned); 13283 if (keys == NULL || !PyList_Check(keys)) { 13284 PyErr_Clear(); 13285 return; 13286 } 13287 13288 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak 13289 detector, interned unicode strings are not forcibly deallocated; 13290 rather, we give them their stolen references back, and then clear 13291 and DECREF the interned dict. 
*/ 13292 13293 n = PyList_GET_SIZE(keys); 13294 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n", 13295 n); 13296 for (i = 0; i < n; i++) { 13297 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i); 13298 if (PyUnicode_READY(s) == -1) { 13299 assert(0 && "could not ready string"); 13300 fprintf(stderr, "could not ready string\n"); 13301 } 13302 switch (PyUnicode_CHECK_INTERNED(s)) { 13303 case SSTATE_NOT_INTERNED: 13304 /* XXX Shouldn't happen */ 13305 break; 13306 case SSTATE_INTERNED_IMMORTAL: 13307 Py_REFCNT(s) += 1; 13308 immortal_size += PyUnicode_GET_LENGTH(s); 13309 break; 13310 case SSTATE_INTERNED_MORTAL: 13311 Py_REFCNT(s) += 2; 13312 mortal_size += PyUnicode_GET_LENGTH(s); 13313 break; 13314 default: 13315 Py_FatalError("Inconsistent interned string state."); 13316 } 13317 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED; 13318 } 13319 fprintf(stderr, "total size of all interned strings: " 13320 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d " 13321 "mortal/immortal\n", mortal_size, immortal_size); 13322 Py_DECREF(keys); 13323 PyDict_Clear(interned); 13324 Py_DECREF(interned); 13325 interned = NULL; 13326} 13327 13328 13329/********************* Unicode Iterator **************************/ 13330 13331typedef struct { 13332 PyObject_HEAD 13333 Py_ssize_t it_index; 13334 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */ 13335} unicodeiterobject; 13336 13337static void 13338unicodeiter_dealloc(unicodeiterobject *it) 13339{ 13340 _PyObject_GC_UNTRACK(it); 13341 Py_XDECREF(it->it_seq); 13342 PyObject_GC_Del(it); 13343} 13344 13345static int 13346unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg) 13347{ 13348 Py_VISIT(it->it_seq); 13349 return 0; 13350} 13351 13352static PyObject * 13353unicodeiter_next(unicodeiterobject *it) 13354{ 13355 PyUnicodeObject *seq; 13356 PyObject *item; 13357 13358 assert(it != NULL); 13359 seq = it->it_seq; 13360 if (seq == NULL) 13361 return NULL; 13362 
assert(_PyUnicode_CHECK(seq)); 13363 13364 if (it->it_index < PyUnicode_GET_LENGTH(seq)) { 13365 int kind = PyUnicode_KIND(seq); 13366 void *data = PyUnicode_DATA(seq); 13367 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index); 13368 item = PyUnicode_FromOrdinal(chr); 13369 if (item != NULL) 13370 ++it->it_index; 13371 return item; 13372 } 13373 13374 Py_DECREF(seq); 13375 it->it_seq = NULL; 13376 return NULL; 13377} 13378 13379static PyObject * 13380unicodeiter_len(unicodeiterobject *it) 13381{ 13382 Py_ssize_t len = 0; 13383 if (it->it_seq) 13384 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index; 13385 return PyLong_FromSsize_t(len); 13386} 13387 13388PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it))."); 13389 13390static PyMethodDef unicodeiter_methods[] = { 13391 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS, 13392 length_hint_doc}, 13393 {NULL, NULL} /* sentinel */ 13394}; 13395 13396PyTypeObject PyUnicodeIter_Type = { 13397 PyVarObject_HEAD_INIT(&PyType_Type, 0) 13398 "str_iterator", /* tp_name */ 13399 sizeof(unicodeiterobject), /* tp_basicsize */ 13400 0, /* tp_itemsize */ 13401 /* methods */ 13402 (destructor)unicodeiter_dealloc, /* tp_dealloc */ 13403 0, /* tp_print */ 13404 0, /* tp_getattr */ 13405 0, /* tp_setattr */ 13406 0, /* tp_reserved */ 13407 0, /* tp_repr */ 13408 0, /* tp_as_number */ 13409 0, /* tp_as_sequence */ 13410 0, /* tp_as_mapping */ 13411 0, /* tp_hash */ 13412 0, /* tp_call */ 13413 0, /* tp_str */ 13414 PyObject_GenericGetAttr, /* tp_getattro */ 13415 0, /* tp_setattro */ 13416 0, /* tp_as_buffer */ 13417 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */ 13418 0, /* tp_doc */ 13419 (traverseproc)unicodeiter_traverse, /* tp_traverse */ 13420 0, /* tp_clear */ 13421 0, /* tp_richcompare */ 13422 0, /* tp_weaklistoffset */ 13423 PyObject_SelfIter, /* tp_iter */ 13424 (iternextfunc)unicodeiter_next, /* tp_iternext */ 13425 unicodeiter_methods, /* tp_methods */ 13426 0, 
13427}; 13428 13429static PyObject * 13430unicode_iter(PyObject *seq) 13431{ 13432 unicodeiterobject *it; 13433 13434 if (!PyUnicode_Check(seq)) { 13435 PyErr_BadInternalCall(); 13436 return NULL; 13437 } 13438 if (PyUnicode_READY(seq) == -1) 13439 return NULL; 13440 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type); 13441 if (it == NULL) 13442 return NULL; 13443 it->it_index = 0; 13444 Py_INCREF(seq); 13445 it->it_seq = (PyUnicodeObject *)seq; 13446 _PyObject_GC_TRACK(it); 13447 return (PyObject *)it; 13448} 13449 13450#define UNIOP(x) Py_UNICODE_##x 13451#define UNIOP_t Py_UNICODE 13452#include "uniops.h" 13453#undef UNIOP 13454#undef UNIOP_t 13455#define UNIOP(x) Py_UCS4_##x 13456#define UNIOP_t Py_UCS4 13457#include "uniops.h" 13458#undef UNIOP 13459#undef UNIOP_t 13460 13461Py_UNICODE* 13462PyUnicode_AsUnicodeCopy(PyObject *object) 13463{ 13464 PyUnicodeObject *unicode = (PyUnicodeObject *)object; 13465 Py_UNICODE *copy; 13466 Py_ssize_t size; 13467 13468 if (!PyUnicode_Check(unicode)) { 13469 PyErr_BadArgument(); 13470 return NULL; 13471 } 13472 /* Ensure we won't overflow the size. */ 13473 if (PyUnicode_GET_SIZE(unicode) > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) { 13474 PyErr_NoMemory(); 13475 return NULL; 13476 } 13477 size = PyUnicode_GET_SIZE(unicode) + 1; /* copy the nul character */ 13478 size *= sizeof(Py_UNICODE); 13479 copy = PyMem_Malloc(size); 13480 if (copy == NULL) { 13481 PyErr_NoMemory(); 13482 return NULL; 13483 } 13484 memcpy(copy, PyUnicode_AS_UNICODE(unicode), size); 13485 return copy; 13486} 13487 13488/* A _string module, to export formatter_parser and formatter_field_name_split 13489 to the string.Formatter class implemented in Python. 
*/ 13490 13491static PyMethodDef _string_methods[] = { 13492 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split, 13493 METH_O, PyDoc_STR("split the argument as a field name")}, 13494 {"formatter_parser", (PyCFunction) formatter_parser, 13495 METH_O, PyDoc_STR("parse the argument as a format string")}, 13496 {NULL, NULL} 13497}; 13498 13499static struct PyModuleDef _string_module = { 13500 PyModuleDef_HEAD_INIT, 13501 "_string", 13502 PyDoc_STR("string helper module"), 13503 0, 13504 _string_methods, 13505 NULL, 13506 NULL, 13507 NULL, 13508 NULL 13509}; 13510 13511PyMODINIT_FUNC 13512PyInit__string(void) 13513{ 13514 return PyModule_Create(&_string_module); 13515} 13516 13517 13518#ifdef __cplusplus 13519} 13520#endif 13521