unicodeobject.c revision 7f11ad4594f63dec8cd18a16243fb58cf0e9589b
1/* 2 3Unicode implementation based on original code by Fredrik Lundh, 4modified by Marc-Andre Lemburg <mal@lemburg.com>. 5 6Major speed upgrades to the method implementations at the Reykjavik 7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke. 8 9Copyright (c) Corporation for National Research Initiatives. 10 11-------------------------------------------------------------------- 12The original string type implementation is: 13 14 Copyright (c) 1999 by Secret Labs AB 15 Copyright (c) 1999 by Fredrik Lundh 16 17By obtaining, using, and/or copying this software and/or its 18associated documentation, you agree that you have read, understood, 19and will comply with the following terms and conditions: 20 21Permission to use, copy, modify, and distribute this software and its 22associated documentation for any purpose and without fee is hereby 23granted, provided that the above copyright notice appears in all 24copies, and that both that copyright notice and this permission notice 25appear in supporting documentation, and that the name of Secret Labs 26AB or the author not be used in advertising or publicity pertaining to 27distribution of the software without specific, written prior 28permission. 29 30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO 31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND 32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR 33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT 36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
37-------------------------------------------------------------------- 38 39*/ 40 41#define PY_SSIZE_T_CLEAN 42#include "Python.h" 43#include "ucnhash.h" 44 45#ifdef MS_WINDOWS 46#include <windows.h> 47#endif 48 49/* Limit for the Unicode object free list */ 50 51#define PyUnicode_MAXFREELIST 1024 52 53/* Limit for the Unicode object free list stay alive optimization. 54 55 The implementation will keep allocated Unicode memory intact for 56 all objects on the free list having a size less than this 57 limit. This reduces malloc() overhead for small Unicode objects. 58 59 At worst this will result in PyUnicode_MAXFREELIST * 60 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT + 61 malloc()-overhead) bytes of unused garbage. 62 63 Setting the limit to 0 effectively turns the feature off. 64 65 Note: This is an experimental feature ! If you get core dumps when 66 using Unicode objects, turn this feature off. 67 68*/ 69 70#define KEEPALIVE_SIZE_LIMIT 9 71 72/* Endianness switches; defaults to little endian */ 73 74#ifdef WORDS_BIGENDIAN 75# define BYTEORDER_IS_BIG_ENDIAN 76#else 77# define BYTEORDER_IS_LITTLE_ENDIAN 78#endif 79 80/* --- Globals ------------------------------------------------------------ 81 82 The globals are initialized by the _PyUnicode_Init() API and should 83 not be used before calling that API. 84 85*/ 86 87 88#ifdef __cplusplus 89extern "C" { 90#endif 91 92#ifdef Py_DEBUG 93# define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op) 94#else 95# define _PyUnicode_CHECK(op) PyUnicode_Check(op) 96#endif 97 98#define _PyUnicode_UTF8(op) \ 99 (((PyCompactUnicodeObject*)(op))->utf8) 100#define PyUnicode_UTF8(op) \ 101 (assert(_PyUnicode_CHECK(op)), \ 102 assert(PyUnicode_IS_READY(op)), \ 103 PyUnicode_IS_COMPACT_ASCII(op) ? 
\ 104 ((char*)((PyASCIIObject*)(op) + 1)) : \ 105 _PyUnicode_UTF8(op)) 106#define _PyUnicode_UTF8_LENGTH(op) \ 107 (((PyCompactUnicodeObject*)(op))->utf8_length) 108#define PyUnicode_UTF8_LENGTH(op) \ 109 (assert(_PyUnicode_CHECK(op)), \ 110 assert(PyUnicode_IS_READY(op)), \ 111 PyUnicode_IS_COMPACT_ASCII(op) ? \ 112 ((PyASCIIObject*)(op))->length : \ 113 _PyUnicode_UTF8_LENGTH(op)) 114#define _PyUnicode_WSTR(op) (((PyASCIIObject*)(op))->wstr) 115#define _PyUnicode_WSTR_LENGTH(op) (((PyCompactUnicodeObject*)(op))->wstr_length) 116#define _PyUnicode_LENGTH(op) (((PyASCIIObject *)(op))->length) 117#define _PyUnicode_STATE(op) (((PyASCIIObject *)(op))->state) 118#define _PyUnicode_HASH(op) (((PyASCIIObject *)(op))->hash) 119#define _PyUnicode_KIND(op) \ 120 (assert(_PyUnicode_CHECK(op)), \ 121 ((PyASCIIObject *)(op))->state.kind) 122#define _PyUnicode_GET_LENGTH(op) \ 123 (assert(_PyUnicode_CHECK(op)), \ 124 ((PyASCIIObject *)(op))->length) 125#define _PyUnicode_DATA_ANY(op) (((PyUnicodeObject*)(op))->data.any) 126 127#undef PyUnicode_READY 128#define PyUnicode_READY(op) \ 129 (assert(_PyUnicode_CHECK(op)), \ 130 (PyUnicode_IS_READY(op) ? \ 131 0 : _PyUnicode_Ready((PyObject *)(op)))) 132 133#define _PyUnicode_READY_REPLACE(p_obj) \ 134 (assert(_PyUnicode_CHECK(*p_obj)), \ 135 (PyUnicode_IS_READY(*p_obj) ? 
\ 136 0 : _PyUnicode_ReadyReplace((PyObject **)(p_obj)))) 137 138#define _PyUnicode_SHARE_UTF8(op) \ 139 (assert(_PyUnicode_CHECK(op)), \ 140 assert(!PyUnicode_IS_COMPACT_ASCII(op)), \ 141 (_PyUnicode_UTF8(op) == PyUnicode_DATA(op))) 142#define _PyUnicode_SHARE_WSTR(op) \ 143 (assert(_PyUnicode_CHECK(op)), \ 144 (_PyUnicode_WSTR(unicode) == PyUnicode_DATA(op))) 145 146/* true if the Unicode object has an allocated UTF-8 memory block 147 (not shared with other data) */ 148#define _PyUnicode_HAS_UTF8_MEMORY(op) \ 149 (assert(_PyUnicode_CHECK(op)), \ 150 (!PyUnicode_IS_COMPACT_ASCII(op) \ 151 && _PyUnicode_UTF8(op) \ 152 && _PyUnicode_UTF8(op) != PyUnicode_DATA(op))) 153 154/* true if the Unicode object has an allocated wstr memory block 155 (not shared with other data) */ 156#define _PyUnicode_HAS_WSTR_MEMORY(op) \ 157 (assert(_PyUnicode_CHECK(op)), \ 158 (_PyUnicode_WSTR(op) && \ 159 (!PyUnicode_IS_READY(op) || \ 160 _PyUnicode_WSTR(op) != PyUnicode_DATA(op)))) 161 162/* Generic helper macro to convert characters of different types. 163 from_type and to_type have to be valid type names, begin and end 164 are pointers to the source characters which should be of type 165 "from_type *". to is a pointer of type "to_type *" and points to the 166 buffer where the result characters are written to. */ 167#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \ 168 do { \ 169 const from_type *iter_; to_type *to_; \ 170 for (iter_ = (begin), to_ = (to_type *)(to); \ 171 iter_ < (end); \ 172 ++iter_, ++to_) { \ 173 *to_ = (to_type)*iter_; \ 174 } \ 175 } while (0) 176 177/* The Unicode string has been modified: reset the hash */ 178#define _PyUnicode_DIRTY(op) do { _PyUnicode_HASH(op) = -1; } while (0) 179 180/* This dictionary holds all interned unicode strings. Note that references 181 to strings in this dictionary are *not* counted in the string's ob_refcnt. 
182 When the interned string reaches a refcnt of 0 the string deallocation 183 function will delete the reference from this dictionary. 184 185 Another way to look at this is that to say that the actual reference 186 count of a string is: s->ob_refcnt + (s->state ? 2 : 0) 187*/ 188static PyObject *interned; 189 190/* The empty Unicode object is shared to improve performance. */ 191static PyObject *unicode_empty; 192 193/* Single character Unicode strings in the Latin-1 range are being 194 shared as well. */ 195static PyObject *unicode_latin1[256]; 196 197/* Fast detection of the most frequent whitespace characters */ 198const unsigned char _Py_ascii_whitespace[] = { 199 0, 0, 0, 0, 0, 0, 0, 0, 200/* case 0x0009: * CHARACTER TABULATION */ 201/* case 0x000A: * LINE FEED */ 202/* case 0x000B: * LINE TABULATION */ 203/* case 0x000C: * FORM FEED */ 204/* case 0x000D: * CARRIAGE RETURN */ 205 0, 1, 1, 1, 1, 1, 0, 0, 206 0, 0, 0, 0, 0, 0, 0, 0, 207/* case 0x001C: * FILE SEPARATOR */ 208/* case 0x001D: * GROUP SEPARATOR */ 209/* case 0x001E: * RECORD SEPARATOR */ 210/* case 0x001F: * UNIT SEPARATOR */ 211 0, 0, 0, 0, 1, 1, 1, 1, 212/* case 0x0020: * SPACE */ 213 1, 0, 0, 0, 0, 0, 0, 0, 214 0, 0, 0, 0, 0, 0, 0, 0, 215 0, 0, 0, 0, 0, 0, 0, 0, 216 0, 0, 0, 0, 0, 0, 0, 0, 217 218 0, 0, 0, 0, 0, 0, 0, 0, 219 0, 0, 0, 0, 0, 0, 0, 0, 220 0, 0, 0, 0, 0, 0, 0, 0, 221 0, 0, 0, 0, 0, 0, 0, 0, 222 0, 0, 0, 0, 0, 0, 0, 0, 223 0, 0, 0, 0, 0, 0, 0, 0, 224 0, 0, 0, 0, 0, 0, 0, 0, 225 0, 0, 0, 0, 0, 0, 0, 0 226}; 227 228/* forward */ 229static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length); 230static PyObject* get_latin1_char(unsigned char ch); 231 232static PyObject * 233unicode_encode_call_errorhandler(const char *errors, 234 PyObject **errorHandler,const char *encoding, const char *reason, 235 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject, 236 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos); 237 238static void 239raise_encode_exception(PyObject 
**exceptionObject, 240 const char *encoding, 241 const Py_UNICODE *unicode, Py_ssize_t size, 242 Py_ssize_t startpos, Py_ssize_t endpos, 243 const char *reason); 244 245/* Same for linebreaks */ 246static unsigned char ascii_linebreak[] = { 247 0, 0, 0, 0, 0, 0, 0, 0, 248/* 0x000A, * LINE FEED */ 249/* 0x000B, * LINE TABULATION */ 250/* 0x000C, * FORM FEED */ 251/* 0x000D, * CARRIAGE RETURN */ 252 0, 0, 1, 1, 1, 1, 0, 0, 253 0, 0, 0, 0, 0, 0, 0, 0, 254/* 0x001C, * FILE SEPARATOR */ 255/* 0x001D, * GROUP SEPARATOR */ 256/* 0x001E, * RECORD SEPARATOR */ 257 0, 0, 0, 0, 1, 1, 1, 0, 258 0, 0, 0, 0, 0, 0, 0, 0, 259 0, 0, 0, 0, 0, 0, 0, 0, 260 0, 0, 0, 0, 0, 0, 0, 0, 261 0, 0, 0, 0, 0, 0, 0, 0, 262 263 0, 0, 0, 0, 0, 0, 0, 0, 264 0, 0, 0, 0, 0, 0, 0, 0, 265 0, 0, 0, 0, 0, 0, 0, 0, 266 0, 0, 0, 0, 0, 0, 0, 0, 267 0, 0, 0, 0, 0, 0, 0, 0, 268 0, 0, 0, 0, 0, 0, 0, 0, 269 0, 0, 0, 0, 0, 0, 0, 0, 270 0, 0, 0, 0, 0, 0, 0, 0 271}; 272 273/* The max unicode value is always 0x10FFFF while using the PEP-393 API. 274 This function is kept for backward compatibility with the old API. */ 275Py_UNICODE 276PyUnicode_GetMax(void) 277{ 278#ifdef Py_UNICODE_WIDE 279 return 0x10FFFF; 280#else 281 /* This is actually an illegal character, so it should 282 not be passed to unichr. 
*/ 283 return 0xFFFF; 284#endif 285} 286 287#ifdef Py_DEBUG 288static int 289_PyUnicode_CheckConsistency(void *op) 290{ 291 PyASCIIObject *ascii; 292 unsigned int kind; 293 294 assert(PyUnicode_Check(op)); 295 296 ascii = (PyASCIIObject *)op; 297 kind = ascii->state.kind; 298 299 if (ascii->state.ascii == 1 && ascii->state.compact == 1) { 300 assert(kind == PyUnicode_1BYTE_KIND); 301 assert(ascii->state.ready == 1); 302 } 303 else if (ascii->state.compact == 1) { 304 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op; 305 void *data; 306 assert(kind == PyUnicode_1BYTE_KIND 307 || kind == PyUnicode_2BYTE_KIND 308 || kind == PyUnicode_4BYTE_KIND); 309 assert(ascii->state.ascii == 0); 310 assert(ascii->state.ready == 1); 311 data = compact + 1; 312 assert (compact->utf8 != data); 313 if ( 314#if SIZEOF_WCHAR_T == 2 315 kind == PyUnicode_2BYTE_KIND 316#else 317 kind == PyUnicode_4BYTE_KIND 318#endif 319 ) 320 assert(ascii->wstr == data); 321 else 322 assert(ascii->wstr != data); 323 } else { 324 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op; 325 PyUnicodeObject *unicode = (PyUnicodeObject *)op; 326 327 if (kind == PyUnicode_WCHAR_KIND) { 328 assert(ascii->state.compact == 0); 329 assert(ascii->state.ascii == 0); 330 assert(ascii->state.ready == 0); 331 assert(ascii->wstr != NULL); 332 assert(unicode->data.any == NULL); 333 assert(compact->utf8 == NULL); 334 assert(ascii->state.interned == SSTATE_NOT_INTERNED); 335 } 336 else { 337 assert(kind == PyUnicode_1BYTE_KIND 338 || kind == PyUnicode_2BYTE_KIND 339 || kind == PyUnicode_4BYTE_KIND); 340 assert(ascii->state.compact == 0); 341 assert(ascii->state.ready == 1); 342 assert(unicode->data.any != NULL); 343 if (ascii->state.ascii) 344 assert (compact->utf8 == unicode->data.any); 345 else 346 assert (compact->utf8 != unicode->data.any); 347 if ( 348#if SIZEOF_WCHAR_T == 2 349 kind == PyUnicode_2BYTE_KIND 350#else 351 kind == PyUnicode_4BYTE_KIND 352#endif 353 ) 354 assert(ascii->wstr == 
unicode->data.any); 355 else 356 assert(ascii->wstr != unicode->data.any); 357 } 358 } 359 return 1; 360} 361#endif 362 363/* --- Bloom Filters ----------------------------------------------------- */ 364 365/* stuff to implement simple "bloom filters" for Unicode characters. 366 to keep things simple, we use a single bitmask, using the least 5 367 bits from each unicode characters as the bit index. */ 368 369/* the linebreak mask is set up by Unicode_Init below */ 370 371#if LONG_BIT >= 128 372#define BLOOM_WIDTH 128 373#elif LONG_BIT >= 64 374#define BLOOM_WIDTH 64 375#elif LONG_BIT >= 32 376#define BLOOM_WIDTH 32 377#else 378#error "LONG_BIT is smaller than 32" 379#endif 380 381#define BLOOM_MASK unsigned long 382 383static BLOOM_MASK bloom_linebreak; 384 385#define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1))))) 386#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1))))) 387 388#define BLOOM_LINEBREAK(ch) \ 389 ((ch) < 128U ? ascii_linebreak[(ch)] : \ 390 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch))) 391 392Py_LOCAL_INLINE(BLOOM_MASK) 393make_bloom_mask(int kind, void* ptr, Py_ssize_t len) 394{ 395 /* calculate simple bloom-style bitmask for a given unicode string */ 396 397 BLOOM_MASK mask; 398 Py_ssize_t i; 399 400 mask = 0; 401 for (i = 0; i < len; i++) 402 BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i)); 403 404 return mask; 405} 406 407#define BLOOM_MEMBER(mask, chr, str) \ 408 (BLOOM(mask, chr) \ 409 && (PyUnicode_FindChar(str, chr, 0, PyUnicode_GET_LENGTH(str), 1) >= 0)) 410 411/* --- Unicode Object ----------------------------------------------------- */ 412 413static PyObject * 414fixup(PyUnicodeObject *self, Py_UCS4 (*fixfct)(PyUnicodeObject *s)); 415 416Py_LOCAL_INLINE(char *) findchar(void *s, int kind, 417 Py_ssize_t size, Py_UCS4 ch, 418 int direction) 419{ 420 /* like wcschr, but doesn't stop at NULL characters */ 421 Py_ssize_t i; 422 if (direction == 1) { 423 for(i = 0; i < size; i++) 424 if 
(PyUnicode_READ(kind, s, i) == ch) 425 return (char*)s + PyUnicode_KIND_SIZE(kind, i); 426 } 427 else { 428 for(i = size-1; i >= 0; i--) 429 if (PyUnicode_READ(kind, s, i) == ch) 430 return (char*)s + PyUnicode_KIND_SIZE(kind, i); 431 } 432 return NULL; 433} 434 435static PyObject* 436resize_compact(PyObject *unicode, Py_ssize_t length) 437{ 438 Py_ssize_t char_size; 439 Py_ssize_t struct_size; 440 Py_ssize_t new_size; 441 int share_wstr; 442 443 assert(PyUnicode_IS_READY(unicode)); 444 char_size = PyUnicode_CHARACTER_SIZE(unicode); 445 if (PyUnicode_IS_COMPACT_ASCII(unicode)) 446 struct_size = sizeof(PyASCIIObject); 447 else 448 struct_size = sizeof(PyCompactUnicodeObject); 449 share_wstr = _PyUnicode_SHARE_WSTR(unicode); 450 451 _Py_DEC_REFTOTAL; 452 _Py_ForgetReference(unicode); 453 454 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) { 455 PyErr_NoMemory(); 456 return NULL; 457 } 458 new_size = (struct_size + (length + 1) * char_size); 459 460 unicode = (PyObject *)PyObject_REALLOC((char *)unicode, new_size); 461 if (unicode == NULL) { 462 PyObject_Del(unicode); 463 PyErr_NoMemory(); 464 return NULL; 465 } 466 _Py_NewReference(unicode); 467 _PyUnicode_LENGTH(unicode) = length; 468 if (share_wstr) { 469 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode); 470 if (!PyUnicode_IS_COMPACT_ASCII(unicode)) 471 _PyUnicode_WSTR_LENGTH(unicode) = length; 472 } 473 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode), 474 length, 0); 475 return unicode; 476} 477 478static int 479resize_inplace(register PyUnicodeObject *unicode, Py_ssize_t length) 480{ 481 void *oldstr; 482 483 assert(!PyUnicode_IS_COMPACT(unicode)); 484 485 assert(Py_REFCNT(unicode) == 1); 486 _PyUnicode_DIRTY(unicode); 487 488 if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) 489 { 490 PyObject_DEL(_PyUnicode_UTF8(unicode)); 491 _PyUnicode_UTF8(unicode) = NULL; 492 } 493 494 if (PyUnicode_IS_READY(unicode)) { 495 Py_ssize_t char_size; 496 Py_ssize_t new_size; 497 int share_wstr, 
share_utf8; 498 void *data; 499 500 data = _PyUnicode_DATA_ANY(unicode); 501 assert(data != NULL); 502 char_size = PyUnicode_CHARACTER_SIZE(unicode); 503 share_wstr = _PyUnicode_SHARE_WSTR(unicode); 504 share_utf8 = _PyUnicode_SHARE_UTF8(unicode); 505 506 if (length > (PY_SSIZE_T_MAX / char_size - 1)) { 507 PyErr_NoMemory(); 508 return -1; 509 } 510 new_size = (length + 1) * char_size; 511 512 data = (PyObject *)PyObject_REALLOC(data, new_size); 513 if (data == NULL) { 514 PyErr_NoMemory(); 515 return -1; 516 } 517 _PyUnicode_DATA_ANY(unicode) = data; 518 if (share_wstr) { 519 _PyUnicode_WSTR(unicode) = data; 520 _PyUnicode_WSTR_LENGTH(unicode) = length; 521 } 522 if (share_utf8) { 523 _PyUnicode_UTF8(unicode) = data; 524 _PyUnicode_UTF8_LENGTH(unicode) = length; 525 } 526 _PyUnicode_LENGTH(unicode) = length; 527 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0); 528 if (share_wstr) 529 return 0; 530 } 531 if (_PyUnicode_WSTR(unicode) != NULL) { 532 assert(_PyUnicode_WSTR(unicode) != NULL); 533 534 oldstr = _PyUnicode_WSTR(unicode); 535 _PyUnicode_WSTR(unicode) = PyObject_REALLOC(_PyUnicode_WSTR(unicode), 536 sizeof(Py_UNICODE) * (length + 1)); 537 if (!_PyUnicode_WSTR(unicode)) { 538 _PyUnicode_WSTR(unicode) = (Py_UNICODE *)oldstr; 539 PyErr_NoMemory(); 540 return -1; 541 } 542 _PyUnicode_WSTR(unicode)[length] = 0; 543 _PyUnicode_WSTR_LENGTH(unicode) = length; 544 } 545 return 0; 546} 547 548static PyObject* 549resize_copy(PyObject *unicode, Py_ssize_t length) 550{ 551 Py_ssize_t copy_length; 552 if (PyUnicode_IS_COMPACT(unicode)) { 553 PyObject *copy; 554 assert(PyUnicode_IS_READY(unicode)); 555 556 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode)); 557 if (copy == NULL) 558 return NULL; 559 560 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode)); 561 if (PyUnicode_CopyCharacters(copy, 0, 562 unicode, 0, 563 copy_length) < 0) 564 { 565 Py_DECREF(copy); 566 return NULL; 567 } 568 return copy; 569 } 570 else { 571 PyUnicodeObject *w; 
572 assert(_PyUnicode_WSTR(unicode) != NULL); 573 assert(_PyUnicode_DATA_ANY(unicode) == NULL); 574 w = _PyUnicode_New(length); 575 if (w == NULL) 576 return NULL; 577 copy_length = _PyUnicode_WSTR_LENGTH(unicode); 578 copy_length = Py_MIN(copy_length, length); 579 Py_UNICODE_COPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode), 580 copy_length); 581 return (PyObject*)w; 582 } 583} 584 585/* We allocate one more byte to make sure the string is 586 Ux0000 terminated; some code (e.g. new_identifier) 587 relies on that. 588 589 XXX This allocator could further be enhanced by assuring that the 590 free list never reduces its size below 1. 591 592*/ 593 594#ifdef Py_DEBUG 595int unicode_old_new_calls = 0; 596#endif 597 598static PyUnicodeObject * 599_PyUnicode_New(Py_ssize_t length) 600{ 601 register PyUnicodeObject *unicode; 602 size_t new_size; 603 604 /* Optimization for empty strings */ 605 if (length == 0 && unicode_empty != NULL) { 606 Py_INCREF(unicode_empty); 607 return (PyUnicodeObject*)unicode_empty; 608 } 609 610 /* Ensure we won't overflow the size. */ 611 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) { 612 return (PyUnicodeObject *)PyErr_NoMemory(); 613 } 614 if (length < 0) { 615 PyErr_SetString(PyExc_SystemError, 616 "Negative size passed to _PyUnicode_New"); 617 return NULL; 618 } 619 620#ifdef Py_DEBUG 621 ++unicode_old_new_calls; 622#endif 623 624 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type); 625 if (unicode == NULL) 626 return NULL; 627 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1); 628 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size); 629 if (!_PyUnicode_WSTR(unicode)) { 630 PyErr_NoMemory(); 631 goto onError; 632 } 633 634 /* Initialize the first element to guard against cases where 635 * the caller fails before initializing str -- unicode_resize() 636 * reads str[0], and the Keep-Alive optimization can keep memory 637 * allocated for str alive across a call to unicode_dealloc(unicode). 
638 * We don't want unicode_resize to read uninitialized memory in 639 * that case. 640 */ 641 _PyUnicode_WSTR(unicode)[0] = 0; 642 _PyUnicode_WSTR(unicode)[length] = 0; 643 _PyUnicode_WSTR_LENGTH(unicode) = length; 644 _PyUnicode_HASH(unicode) = -1; 645 _PyUnicode_STATE(unicode).interned = 0; 646 _PyUnicode_STATE(unicode).kind = 0; 647 _PyUnicode_STATE(unicode).compact = 0; 648 _PyUnicode_STATE(unicode).ready = 0; 649 _PyUnicode_STATE(unicode).ascii = 0; 650 _PyUnicode_DATA_ANY(unicode) = NULL; 651 _PyUnicode_LENGTH(unicode) = 0; 652 _PyUnicode_UTF8(unicode) = NULL; 653 _PyUnicode_UTF8_LENGTH(unicode) = 0; 654 return unicode; 655 656 onError: 657 /* XXX UNREF/NEWREF interface should be more symmetrical */ 658 _Py_DEC_REFTOTAL; 659 _Py_ForgetReference((PyObject *)unicode); 660 PyObject_Del(unicode); 661 return NULL; 662} 663 664static const char* 665unicode_kind_name(PyObject *unicode) 666{ 667 /* don't check consistency: unicode_kind_name() is called from 668 _PyUnicode_Dump() */ 669 if (!PyUnicode_IS_COMPACT(unicode)) 670 { 671 if (!PyUnicode_IS_READY(unicode)) 672 return "wstr"; 673 switch(PyUnicode_KIND(unicode)) 674 { 675 case PyUnicode_1BYTE_KIND: 676 if (PyUnicode_IS_ASCII(unicode)) 677 return "legacy ascii"; 678 else 679 return "legacy latin1"; 680 case PyUnicode_2BYTE_KIND: 681 return "legacy UCS2"; 682 case PyUnicode_4BYTE_KIND: 683 return "legacy UCS4"; 684 default: 685 return "<legacy invalid kind>"; 686 } 687 } 688 assert(PyUnicode_IS_READY(unicode)); 689 switch(PyUnicode_KIND(unicode)) 690 { 691 case PyUnicode_1BYTE_KIND: 692 if (PyUnicode_IS_ASCII(unicode)) 693 return "ascii"; 694 else 695 return "latin1"; 696 case PyUnicode_2BYTE_KIND: 697 return "UCS2"; 698 case PyUnicode_4BYTE_KIND: 699 return "UCS4"; 700 default: 701 return "<invalid compact kind>"; 702 } 703} 704 705#ifdef Py_DEBUG 706int unicode_new_new_calls = 0; 707 708/* Functions wrapping macros for use in debugger */ 709char *_PyUnicode_utf8(void *unicode){ 710 return 
PyUnicode_UTF8(unicode); 711} 712 713void *_PyUnicode_compact_data(void *unicode) { 714 return _PyUnicode_COMPACT_DATA(unicode); 715} 716void *_PyUnicode_data(void *unicode){ 717 printf("obj %p\n", unicode); 718 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode)); 719 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode)); 720 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1))); 721 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1))); 722 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode)); 723 return PyUnicode_DATA(unicode); 724} 725 726void 727_PyUnicode_Dump(PyObject *op) 728{ 729 PyASCIIObject *ascii = (PyASCIIObject *)op; 730 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op; 731 PyUnicodeObject *unicode = (PyUnicodeObject *)op; 732 void *data; 733 printf("%s: len=%zu, ",unicode_kind_name(op), ascii->length); 734 if (ascii->state.compact) 735 data = (compact + 1); 736 else 737 data = unicode->data.any; 738 if (ascii->wstr == data) 739 printf("shared "); 740 printf("wstr=%p", ascii->wstr); 741 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) { 742 printf(" (%zu), ", compact->wstr_length); 743 if (!ascii->state.compact && compact->utf8 == unicode->data.any) 744 printf("shared "); 745 printf("utf8=%p (%zu)", compact->utf8, compact->utf8_length); 746 } 747 printf(", data=%p\n", data); 748} 749#endif 750 751PyObject * 752PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar) 753{ 754 PyObject *obj; 755 PyCompactUnicodeObject *unicode; 756 void *data; 757 int kind_state; 758 int is_sharing = 0, is_ascii = 0; 759 Py_ssize_t char_size; 760 Py_ssize_t struct_size; 761 762 /* Optimization for empty strings */ 763 if (size == 0 && unicode_empty != NULL) { 764 Py_INCREF(unicode_empty); 765 return unicode_empty; 766 } 767 768#ifdef Py_DEBUG 769 ++unicode_new_new_calls; 770#endif 771 772 struct_size = sizeof(PyCompactUnicodeObject); 773 if (maxchar < 128) { 774 kind_state = PyUnicode_1BYTE_KIND; 
775 char_size = 1; 776 is_ascii = 1; 777 struct_size = sizeof(PyASCIIObject); 778 } 779 else if (maxchar < 256) { 780 kind_state = PyUnicode_1BYTE_KIND; 781 char_size = 1; 782 } 783 else if (maxchar < 65536) { 784 kind_state = PyUnicode_2BYTE_KIND; 785 char_size = 2; 786 if (sizeof(wchar_t) == 2) 787 is_sharing = 1; 788 } 789 else { 790 kind_state = PyUnicode_4BYTE_KIND; 791 char_size = 4; 792 if (sizeof(wchar_t) == 4) 793 is_sharing = 1; 794 } 795 796 /* Ensure we won't overflow the size. */ 797 if (size < 0) { 798 PyErr_SetString(PyExc_SystemError, 799 "Negative size passed to PyUnicode_New"); 800 return NULL; 801 } 802 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) 803 return PyErr_NoMemory(); 804 805 /* Duplicated allocation code from _PyObject_New() instead of a call to 806 * PyObject_New() so we are able to allocate space for the object and 807 * it's data buffer. 808 */ 809 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size); 810 if (obj == NULL) 811 return PyErr_NoMemory(); 812 obj = PyObject_INIT(obj, &PyUnicode_Type); 813 if (obj == NULL) 814 return NULL; 815 816 unicode = (PyCompactUnicodeObject *)obj; 817 if (is_ascii) 818 data = ((PyASCIIObject*)obj) + 1; 819 else 820 data = unicode + 1; 821 _PyUnicode_LENGTH(unicode) = size; 822 _PyUnicode_HASH(unicode) = -1; 823 _PyUnicode_STATE(unicode).interned = 0; 824 _PyUnicode_STATE(unicode).kind = kind_state; 825 _PyUnicode_STATE(unicode).compact = 1; 826 _PyUnicode_STATE(unicode).ready = 1; 827 _PyUnicode_STATE(unicode).ascii = is_ascii; 828 if (is_ascii) { 829 ((char*)data)[size] = 0; 830 _PyUnicode_WSTR(unicode) = NULL; 831 } 832 else if (kind_state == PyUnicode_1BYTE_KIND) { 833 ((char*)data)[size] = 0; 834 _PyUnicode_WSTR(unicode) = NULL; 835 _PyUnicode_WSTR_LENGTH(unicode) = 0; 836 unicode->utf8_length = 0; 837 unicode->utf8 = NULL; 838 } 839 else { 840 unicode->utf8 = NULL; 841 if (kind_state == PyUnicode_2BYTE_KIND) 842 ((Py_UCS2*)data)[size] = 0; 843 else /* 
kind_state == PyUnicode_4BYTE_KIND */ 844 ((Py_UCS4*)data)[size] = 0; 845 if (is_sharing) { 846 _PyUnicode_WSTR_LENGTH(unicode) = size; 847 _PyUnicode_WSTR(unicode) = (wchar_t *)data; 848 } 849 else { 850 _PyUnicode_WSTR_LENGTH(unicode) = 0; 851 _PyUnicode_WSTR(unicode) = NULL; 852 } 853 } 854 return obj; 855} 856 857#if SIZEOF_WCHAR_T == 2 858/* Helper function to convert a 16-bits wchar_t representation to UCS4, this 859 will decode surrogate pairs, the other conversions are implemented as macros 860 for efficency. 861 862 This function assumes that unicode can hold one more code point than wstr 863 characters for a terminating null character. */ 864static void 865unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end, 866 PyUnicodeObject *unicode) 867{ 868 const wchar_t *iter; 869 Py_UCS4 *ucs4_out; 870 871 assert(unicode != NULL); 872 assert(_PyUnicode_CHECK(unicode)); 873 assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND); 874 ucs4_out = PyUnicode_4BYTE_DATA(unicode); 875 876 for (iter = begin; iter < end; ) { 877 assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) + 878 _PyUnicode_GET_LENGTH(unicode))); 879 if (*iter >= 0xD800 && *iter <= 0xDBFF 880 && (iter+1) < end && iter[1] >= 0xDC00 && iter[1] <= 0xDFFF) 881 { 882 *ucs4_out++ = (((iter[0] & 0x3FF)<<10) | (iter[1] & 0x3FF)) + 0x10000; 883 iter += 2; 884 } 885 else { 886 *ucs4_out++ = *iter; 887 iter++; 888 } 889 } 890 assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) + 891 _PyUnicode_GET_LENGTH(unicode))); 892 893} 894#endif 895 896static int 897_PyUnicode_Dirty(PyObject *unicode) 898{ 899 assert(_PyUnicode_CHECK(unicode)); 900 if (Py_REFCNT(unicode) != 1) { 901 PyErr_SetString(PyExc_ValueError, 902 "Cannot modify a string having more than 1 reference"); 903 return -1; 904 } 905 _PyUnicode_DIRTY(unicode); 906 return 0; 907} 908 909Py_ssize_t 910PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start, 911 PyObject *from, Py_ssize_t from_start, 912 Py_ssize_t how_many) 913{ 914 
unsigned int from_kind, to_kind; 915 void *from_data, *to_data; 916 917 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) { 918 PyErr_BadInternalCall(); 919 return -1; 920 } 921 922 if (PyUnicode_READY(from)) 923 return -1; 924 if (PyUnicode_READY(to)) 925 return -1; 926 927 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many); 928 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) { 929 PyErr_Format(PyExc_ValueError, 930 "Cannot write %zi characters at %zi " 931 "in a string of %zi characters", 932 how_many, to_start, PyUnicode_GET_LENGTH(to)); 933 return -1; 934 } 935 if (how_many == 0) 936 return 0; 937 938 if (_PyUnicode_Dirty(to)) 939 return -1; 940 941 from_kind = PyUnicode_KIND(from); 942 from_data = PyUnicode_DATA(from); 943 to_kind = PyUnicode_KIND(to); 944 to_data = PyUnicode_DATA(to); 945 946 if (from_kind == to_kind 947 /* deny latin1 => ascii */ 948 && PyUnicode_MAX_CHAR_VALUE(to) >= PyUnicode_MAX_CHAR_VALUE(from)) 949 { 950 Py_MEMCPY((char*)to_data 951 + PyUnicode_KIND_SIZE(to_kind, to_start), 952 (char*)from_data 953 + PyUnicode_KIND_SIZE(from_kind, from_start), 954 PyUnicode_KIND_SIZE(to_kind, how_many)); 955 } 956 else if (from_kind == PyUnicode_1BYTE_KIND 957 && to_kind == PyUnicode_2BYTE_KIND) 958 { 959 _PyUnicode_CONVERT_BYTES( 960 Py_UCS1, Py_UCS2, 961 PyUnicode_1BYTE_DATA(from) + from_start, 962 PyUnicode_1BYTE_DATA(from) + from_start + how_many, 963 PyUnicode_2BYTE_DATA(to) + to_start 964 ); 965 } 966 else if (from_kind == PyUnicode_1BYTE_KIND 967 && to_kind == PyUnicode_4BYTE_KIND) 968 { 969 _PyUnicode_CONVERT_BYTES( 970 Py_UCS1, Py_UCS4, 971 PyUnicode_1BYTE_DATA(from) + from_start, 972 PyUnicode_1BYTE_DATA(from) + from_start + how_many, 973 PyUnicode_4BYTE_DATA(to) + to_start 974 ); 975 } 976 else if (from_kind == PyUnicode_2BYTE_KIND 977 && to_kind == PyUnicode_4BYTE_KIND) 978 { 979 _PyUnicode_CONVERT_BYTES( 980 Py_UCS2, Py_UCS4, 981 PyUnicode_2BYTE_DATA(from) + from_start, 982 PyUnicode_2BYTE_DATA(from) + from_start + how_many, 983 
PyUnicode_4BYTE_DATA(to) + to_start 984 ); 985 } 986 else { 987 int invalid_kinds; 988 989 /* check if max_char(from substring) <= max_char(to) */ 990 if (from_kind > to_kind 991 /* latin1 => ascii */ 992 || (PyUnicode_IS_ASCII(to) 993 && to_kind == PyUnicode_1BYTE_KIND 994 && !PyUnicode_IS_ASCII(from))) 995 { 996 /* slow path to check for character overflow */ 997 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to); 998 Py_UCS4 ch, maxchar; 999 Py_ssize_t i; 1000 1001 maxchar = 0; 1002 invalid_kinds = 0; 1003 for (i=0; i < how_many; i++) { 1004 ch = PyUnicode_READ(from_kind, from_data, from_start + i); 1005 if (ch > maxchar) { 1006 maxchar = ch; 1007 if (maxchar > to_maxchar) { 1008 invalid_kinds = 1; 1009 break; 1010 } 1011 } 1012 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch); 1013 } 1014 } 1015 else 1016 invalid_kinds = 1; 1017 if (invalid_kinds) { 1018 PyErr_Format(PyExc_ValueError, 1019 "Cannot copy %s characters " 1020 "into a string of %s characters", 1021 unicode_kind_name(from), 1022 unicode_kind_name(to)); 1023 return -1; 1024 } 1025 } 1026 return how_many; 1027} 1028 1029/* Find the maximum code point and count the number of surrogate pairs so a 1030 correct string length can be computed before converting a string to UCS4. 1031 This function counts single surrogates as a character and not as a pair. 1032 1033 Return 0 on success, or -1 on error. 
*/ 1034static int 1035find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end, 1036 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates) 1037{ 1038 const wchar_t *iter; 1039 1040 assert(num_surrogates != NULL && maxchar != NULL); 1041 if (num_surrogates == NULL || maxchar == NULL) { 1042 PyErr_SetString(PyExc_SystemError, 1043 "unexpected NULL arguments to " 1044 "PyUnicode_FindMaxCharAndNumSurrogatePairs"); 1045 return -1; 1046 } 1047 1048 *num_surrogates = 0; 1049 *maxchar = 0; 1050 1051 for (iter = begin; iter < end; ) { 1052 if (*iter > *maxchar) 1053 *maxchar = *iter; 1054#if SIZEOF_WCHAR_T == 2 1055 if (*iter >= 0xD800 && *iter <= 0xDBFF 1056 && (iter+1) < end && iter[1] >= 0xDC00 && iter[1] <= 0xDFFF) 1057 { 1058 Py_UCS4 surrogate_val; 1059 surrogate_val = (((iter[0] & 0x3FF)<<10) 1060 | (iter[1] & 0x3FF)) + 0x10000; 1061 ++(*num_surrogates); 1062 if (surrogate_val > *maxchar) 1063 *maxchar = surrogate_val; 1064 iter += 2; 1065 } 1066 else 1067 iter++; 1068#else 1069 iter++; 1070#endif 1071 } 1072 return 0; 1073} 1074 1075#ifdef Py_DEBUG 1076int unicode_ready_calls = 0; 1077#endif 1078 1079static int 1080unicode_ready(PyObject **p_obj, int replace) 1081{ 1082 PyUnicodeObject *unicode; 1083 wchar_t *end; 1084 Py_UCS4 maxchar = 0; 1085 Py_ssize_t num_surrogates; 1086#if SIZEOF_WCHAR_T == 2 1087 Py_ssize_t length_wo_surrogates; 1088#endif 1089 1090 assert(p_obj != NULL); 1091 unicode = (PyUnicodeObject *)*p_obj; 1092 1093 /* _PyUnicode_Ready() is only intented for old-style API usage where 1094 strings were created using _PyObject_New() and where no canonical 1095 representation (the str field) has been set yet aka strings 1096 which are not yet ready. 
*/ 1097 assert(_PyUnicode_CHECK(unicode)); 1098 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND); 1099 assert(_PyUnicode_WSTR(unicode) != NULL); 1100 assert(_PyUnicode_DATA_ANY(unicode) == NULL); 1101 assert(_PyUnicode_UTF8(unicode) == NULL); 1102 /* Actually, it should neither be interned nor be anything else: */ 1103 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED); 1104 1105#ifdef Py_DEBUG 1106 ++unicode_ready_calls; 1107#endif 1108 1109#ifdef Py_DEBUG 1110 assert(!replace || Py_REFCNT(unicode) == 1); 1111#else 1112 if (replace && Py_REFCNT(unicode) != 1) 1113 replace = 0; 1114#endif 1115 if (replace) { 1116 Py_ssize_t len = _PyUnicode_WSTR_LENGTH(unicode); 1117 wchar_t *wstr = _PyUnicode_WSTR(unicode); 1118 /* Optimization for empty strings */ 1119 if (len == 0) { 1120 Py_INCREF(unicode_empty); 1121 Py_DECREF(*p_obj); 1122 *p_obj = unicode_empty; 1123 return 0; 1124 } 1125 if (len == 1 && wstr[0] < 256) { 1126 PyObject *latin1_char = get_latin1_char((unsigned char)wstr[0]); 1127 if (latin1_char == NULL) 1128 return -1; 1129 Py_DECREF(*p_obj); 1130 *p_obj = latin1_char; 1131 return 0; 1132 } 1133 } 1134 1135 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode); 1136 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end, 1137 &maxchar, &num_surrogates) == -1) 1138 return -1; 1139 1140 if (maxchar < 256) { 1141 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1); 1142 if (!_PyUnicode_DATA_ANY(unicode)) { 1143 PyErr_NoMemory(); 1144 return -1; 1145 } 1146 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char, 1147 _PyUnicode_WSTR(unicode), end, 1148 PyUnicode_1BYTE_DATA(unicode)); 1149 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0'; 1150 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode); 1151 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND; 1152 if (maxchar < 128) { 1153 _PyUnicode_STATE(unicode).ascii = 1; 1154 _PyUnicode_UTF8(unicode) = 
_PyUnicode_DATA_ANY(unicode); 1155 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode); 1156 } 1157 else { 1158 _PyUnicode_STATE(unicode).ascii = 0; 1159 _PyUnicode_UTF8(unicode) = NULL; 1160 _PyUnicode_UTF8_LENGTH(unicode) = 0; 1161 } 1162 PyObject_FREE(_PyUnicode_WSTR(unicode)); 1163 _PyUnicode_WSTR(unicode) = NULL; 1164 _PyUnicode_WSTR_LENGTH(unicode) = 0; 1165 } 1166 /* In this case we might have to convert down from 4-byte native 1167 wchar_t to 2-byte unicode. */ 1168 else if (maxchar < 65536) { 1169 assert(num_surrogates == 0 && 1170 "FindMaxCharAndNumSurrogatePairs() messed up"); 1171 1172#if SIZEOF_WCHAR_T == 2 1173 /* We can share representations and are done. */ 1174 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode); 1175 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0'; 1176 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode); 1177 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND; 1178 _PyUnicode_UTF8(unicode) = NULL; 1179 _PyUnicode_UTF8_LENGTH(unicode) = 0; 1180#else 1181 /* sizeof(wchar_t) == 4 */ 1182 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC( 1183 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1)); 1184 if (!_PyUnicode_DATA_ANY(unicode)) { 1185 PyErr_NoMemory(); 1186 return -1; 1187 } 1188 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2, 1189 _PyUnicode_WSTR(unicode), end, 1190 PyUnicode_2BYTE_DATA(unicode)); 1191 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0'; 1192 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode); 1193 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND; 1194 _PyUnicode_UTF8(unicode) = NULL; 1195 _PyUnicode_UTF8_LENGTH(unicode) = 0; 1196 PyObject_FREE(_PyUnicode_WSTR(unicode)); 1197 _PyUnicode_WSTR(unicode) = NULL; 1198 _PyUnicode_WSTR_LENGTH(unicode) = 0; 1199#endif 1200 } 1201 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */ 1202 else { 1203#if SIZEOF_WCHAR_T == 2 1204 /* in case the native representation is 2-bytes, we need to 
allocate a 1205 new normalized 4-byte version. */ 1206 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates; 1207 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1)); 1208 if (!_PyUnicode_DATA_ANY(unicode)) { 1209 PyErr_NoMemory(); 1210 return -1; 1211 } 1212 _PyUnicode_LENGTH(unicode) = length_wo_surrogates; 1213 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND; 1214 _PyUnicode_UTF8(unicode) = NULL; 1215 _PyUnicode_UTF8_LENGTH(unicode) = 0; 1216 /* unicode_convert_wchar_to_ucs4() requires a ready string */ 1217 _PyUnicode_STATE(unicode).ready = 1; 1218 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode); 1219 PyObject_FREE(_PyUnicode_WSTR(unicode)); 1220 _PyUnicode_WSTR(unicode) = NULL; 1221 _PyUnicode_WSTR_LENGTH(unicode) = 0; 1222#else 1223 assert(num_surrogates == 0); 1224 1225 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode); 1226 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode); 1227 _PyUnicode_UTF8(unicode) = NULL; 1228 _PyUnicode_UTF8_LENGTH(unicode) = 0; 1229 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND; 1230#endif 1231 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0'; 1232 } 1233 _PyUnicode_STATE(unicode).ready = 1; 1234 return 0; 1235} 1236 1237int 1238_PyUnicode_ReadyReplace(PyObject **op) 1239{ 1240 return unicode_ready(op, 1); 1241} 1242 1243int 1244_PyUnicode_Ready(PyObject *op) 1245{ 1246 return unicode_ready(&op, 0); 1247} 1248 1249static void 1250unicode_dealloc(register PyUnicodeObject *unicode) 1251{ 1252 switch (PyUnicode_CHECK_INTERNED(unicode)) { 1253 case SSTATE_NOT_INTERNED: 1254 break; 1255 1256 case SSTATE_INTERNED_MORTAL: 1257 /* revive dead object temporarily for DelItem */ 1258 Py_REFCNT(unicode) = 3; 1259 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0) 1260 Py_FatalError( 1261 "deletion of interned string failed"); 1262 break; 1263 1264 case SSTATE_INTERNED_IMMORTAL: 1265 Py_FatalError("Immortal interned string 
died."); 1266 1267 default: 1268 Py_FatalError("Inconsistent interned string state."); 1269 } 1270 1271 if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) 1272 PyObject_DEL(_PyUnicode_WSTR(unicode)); 1273 if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) 1274 PyObject_DEL(_PyUnicode_UTF8(unicode)); 1275 1276 if (PyUnicode_IS_COMPACT(unicode)) { 1277 Py_TYPE(unicode)->tp_free((PyObject *)unicode); 1278 } 1279 else { 1280 if (_PyUnicode_DATA_ANY(unicode)) 1281 PyObject_DEL(_PyUnicode_DATA_ANY(unicode)); 1282 Py_TYPE(unicode)->tp_free((PyObject *)unicode); 1283 } 1284} 1285 1286static int 1287unicode_resizable(PyObject *unicode) 1288{ 1289 if (Py_REFCNT(unicode) != 1) 1290 return 0; 1291 if (PyUnicode_CHECK_INTERNED(unicode)) 1292 return 0; 1293 assert (unicode != unicode_empty); 1294#ifdef Py_DEBUG 1295 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND 1296 && PyUnicode_GET_LENGTH(unicode) == 1) 1297 { 1298 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0); 1299 if (ch < 256 && unicode_latin1[ch] == unicode) 1300 return 0; 1301 } 1302#endif 1303 return 1; 1304} 1305 1306static int 1307unicode_resize(PyObject **p_unicode, Py_ssize_t length) 1308{ 1309 PyObject *unicode; 1310 Py_ssize_t old_length; 1311 1312 assert(p_unicode != NULL); 1313 unicode = *p_unicode; 1314 1315 assert(unicode != NULL); 1316 assert(PyUnicode_Check(unicode)); 1317 assert(0 <= length); 1318 1319 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND) 1320 old_length = PyUnicode_WSTR_LENGTH(unicode); 1321 else 1322 old_length = PyUnicode_GET_LENGTH(unicode); 1323 if (old_length == length) 1324 return 0; 1325 1326 if (!unicode_resizable(unicode)) { 1327 PyObject *copy = resize_copy(unicode, length); 1328 if (copy == NULL) 1329 return -1; 1330 Py_DECREF(*p_unicode); 1331 *p_unicode = copy; 1332 return 0; 1333 } 1334 1335 if (PyUnicode_IS_COMPACT(unicode)) { 1336 *p_unicode = resize_compact(unicode, length); 1337 if (*p_unicode == NULL) 1338 return -1; 1339 return 0; 1340 } else 1341 return 
resize_inplace((PyUnicodeObject*)unicode, length); 1342} 1343 1344int 1345PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length) 1346{ 1347 PyObject *unicode; 1348 if (p_unicode == NULL) { 1349 PyErr_BadInternalCall(); 1350 return -1; 1351 } 1352 unicode = *p_unicode; 1353 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0 1354 || _PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) 1355 { 1356 PyErr_BadInternalCall(); 1357 return -1; 1358 } 1359 return unicode_resize(p_unicode, length); 1360} 1361 1362static PyObject* 1363get_latin1_char(unsigned char ch) 1364{ 1365 PyObject *unicode = unicode_latin1[ch]; 1366 if (!unicode) { 1367 unicode = PyUnicode_New(1, ch); 1368 if (!unicode) 1369 return NULL; 1370 PyUnicode_1BYTE_DATA(unicode)[0] = ch; 1371 unicode_latin1[ch] = unicode; 1372 } 1373 Py_INCREF(unicode); 1374 return unicode; 1375} 1376 1377PyObject * 1378PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size) 1379{ 1380 PyUnicodeObject *unicode; 1381 Py_UCS4 maxchar = 0; 1382 Py_ssize_t num_surrogates; 1383 1384 if (u == NULL) 1385 return (PyObject*)_PyUnicode_New(size); 1386 1387 /* If the Unicode data is known at construction time, we can apply 1388 some optimizations which share commonly used objects. 
*/ 1389 1390 /* Optimization for empty strings */ 1391 if (size == 0 && unicode_empty != NULL) { 1392 Py_INCREF(unicode_empty); 1393 return unicode_empty; 1394 } 1395 1396 /* Single character Unicode objects in the Latin-1 range are 1397 shared when using this constructor */ 1398 if (size == 1 && *u < 256) 1399 return get_latin1_char((unsigned char)*u); 1400 1401 /* If not empty and not single character, copy the Unicode data 1402 into the new object */ 1403 if (find_maxchar_surrogates(u, u + size, 1404 &maxchar, &num_surrogates) == -1) 1405 return NULL; 1406 1407 unicode = (PyUnicodeObject *) PyUnicode_New(size - num_surrogates, 1408 maxchar); 1409 if (!unicode) 1410 return NULL; 1411 1412 switch (PyUnicode_KIND(unicode)) { 1413 case PyUnicode_1BYTE_KIND: 1414 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char, 1415 u, u + size, PyUnicode_1BYTE_DATA(unicode)); 1416 break; 1417 case PyUnicode_2BYTE_KIND: 1418#if Py_UNICODE_SIZE == 2 1419 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2); 1420#else 1421 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2, 1422 u, u + size, PyUnicode_2BYTE_DATA(unicode)); 1423#endif 1424 break; 1425 case PyUnicode_4BYTE_KIND: 1426#if SIZEOF_WCHAR_T == 2 1427 /* This is the only case which has to process surrogates, thus 1428 a simple copy loop is not enough and we need a function. 
*/ 1429 unicode_convert_wchar_to_ucs4(u, u + size, unicode); 1430#else 1431 assert(num_surrogates == 0); 1432 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4); 1433#endif 1434 break; 1435 default: 1436 assert(0 && "Impossible state"); 1437 } 1438 1439 return (PyObject *)unicode; 1440} 1441 1442PyObject * 1443PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size) 1444{ 1445 PyUnicodeObject *unicode; 1446 1447 if (size < 0) { 1448 PyErr_SetString(PyExc_SystemError, 1449 "Negative size passed to PyUnicode_FromStringAndSize"); 1450 return NULL; 1451 } 1452 1453 /* If the Unicode data is known at construction time, we can apply 1454 some optimizations which share commonly used objects. 1455 Also, this means the input must be UTF-8, so fall back to the 1456 UTF-8 decoder at the end. */ 1457 if (u != NULL) { 1458 1459 /* Optimization for empty strings */ 1460 if (size == 0 && unicode_empty != NULL) { 1461 Py_INCREF(unicode_empty); 1462 return unicode_empty; 1463 } 1464 1465 /* Single characters are shared when using this constructor. 1466 Restrict to ASCII, since the input must be UTF-8. 
*/ 1467 if (size == 1 && Py_CHARMASK(*u) < 128) 1468 return get_latin1_char(Py_CHARMASK(*u)); 1469 1470 return PyUnicode_DecodeUTF8(u, size, NULL); 1471 } 1472 1473 unicode = _PyUnicode_New(size); 1474 if (!unicode) 1475 return NULL; 1476 1477 return (PyObject *)unicode; 1478} 1479 1480PyObject * 1481PyUnicode_FromString(const char *u) 1482{ 1483 size_t size = strlen(u); 1484 if (size > PY_SSIZE_T_MAX) { 1485 PyErr_SetString(PyExc_OverflowError, "input too long"); 1486 return NULL; 1487 } 1488 1489 return PyUnicode_FromStringAndSize(u, size); 1490} 1491 1492static PyObject* 1493_PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size) 1494{ 1495 PyObject *res; 1496 unsigned char max = 127; 1497 Py_ssize_t i; 1498 for (i = 0; i < size; i++) { 1499 if (u[i] & 0x80) { 1500 max = 255; 1501 break; 1502 } 1503 } 1504 res = PyUnicode_New(size, max); 1505 if (!res) 1506 return NULL; 1507 memcpy(PyUnicode_1BYTE_DATA(res), u, size); 1508 return res; 1509} 1510 1511static PyObject* 1512_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size) 1513{ 1514 PyObject *res; 1515 Py_UCS2 max = 0; 1516 Py_ssize_t i; 1517 for (i = 0; i < size; i++) 1518 if (u[i] > max) 1519 max = u[i]; 1520 res = PyUnicode_New(size, max); 1521 if (!res) 1522 return NULL; 1523 if (max >= 256) 1524 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size); 1525 else 1526 for (i = 0; i < size; i++) 1527 PyUnicode_1BYTE_DATA(res)[i] = (Py_UCS1)u[i]; 1528 return res; 1529} 1530 1531static PyObject* 1532_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size) 1533{ 1534 PyObject *res; 1535 Py_UCS4 max = 0; 1536 Py_ssize_t i; 1537 for (i = 0; i < size; i++) 1538 if (u[i] > max) 1539 max = u[i]; 1540 res = PyUnicode_New(size, max); 1541 if (!res) 1542 return NULL; 1543 if (max >= 0x10000) 1544 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size); 1545 else { 1546 int kind = PyUnicode_KIND(res); 1547 void *data = PyUnicode_DATA(res); 1548 for (i = 0; i < size; i++) 1549 PyUnicode_WRITE(kind, data, i, 
u[i]); 1550 } 1551 return res; 1552} 1553 1554PyObject* 1555PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size) 1556{ 1557 switch(kind) { 1558 case PyUnicode_1BYTE_KIND: 1559 return _PyUnicode_FromUCS1(buffer, size); 1560 case PyUnicode_2BYTE_KIND: 1561 return _PyUnicode_FromUCS2(buffer, size); 1562 case PyUnicode_4BYTE_KIND: 1563 return _PyUnicode_FromUCS4(buffer, size); 1564 } 1565 PyErr_SetString(PyExc_ValueError, "invalid kind"); 1566 return NULL; 1567} 1568 1569PyObject* 1570PyUnicode_Copy(PyObject *unicode) 1571{ 1572 Py_ssize_t size; 1573 PyObject *copy; 1574 void *data; 1575 1576 if (!PyUnicode_Check(unicode)) { 1577 PyErr_BadInternalCall(); 1578 return NULL; 1579 } 1580 if (PyUnicode_READY(unicode)) 1581 return NULL; 1582 1583 size = PyUnicode_GET_LENGTH(unicode); 1584 copy = PyUnicode_New(size, PyUnicode_MAX_CHAR_VALUE(unicode)); 1585 if (!copy) 1586 return NULL; 1587 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode)); 1588 1589 data = PyUnicode_DATA(unicode); 1590 switch (PyUnicode_KIND(unicode)) 1591 { 1592 case PyUnicode_1BYTE_KIND: 1593 memcpy(PyUnicode_1BYTE_DATA(copy), data, size); 1594 break; 1595 case PyUnicode_2BYTE_KIND: 1596 memcpy(PyUnicode_2BYTE_DATA(copy), data, sizeof(Py_UCS2) * size); 1597 break; 1598 case PyUnicode_4BYTE_KIND: 1599 memcpy(PyUnicode_4BYTE_DATA(copy), data, sizeof(Py_UCS4) * size); 1600 break; 1601 default: 1602 assert(0); 1603 break; 1604 } 1605 return copy; 1606} 1607 1608 1609/* Widen Unicode objects to larger buffers. Don't write terminating null 1610 character. Return NULL on error. 
*/ 1611 1612void* 1613_PyUnicode_AsKind(PyObject *s, unsigned int kind) 1614{ 1615 Py_ssize_t len; 1616 void *result; 1617 unsigned int skind; 1618 1619 if (PyUnicode_READY(s)) 1620 return NULL; 1621 1622 len = PyUnicode_GET_LENGTH(s); 1623 skind = PyUnicode_KIND(s); 1624 if (skind >= kind) { 1625 PyErr_SetString(PyExc_RuntimeError, "invalid widening attempt"); 1626 return NULL; 1627 } 1628 switch(kind) { 1629 case PyUnicode_2BYTE_KIND: 1630 result = PyMem_Malloc(len * sizeof(Py_UCS2)); 1631 if (!result) 1632 return PyErr_NoMemory(); 1633 assert(skind == PyUnicode_1BYTE_KIND); 1634 _PyUnicode_CONVERT_BYTES( 1635 Py_UCS1, Py_UCS2, 1636 PyUnicode_1BYTE_DATA(s), 1637 PyUnicode_1BYTE_DATA(s) + len, 1638 result); 1639 return result; 1640 case PyUnicode_4BYTE_KIND: 1641 result = PyMem_Malloc(len * sizeof(Py_UCS4)); 1642 if (!result) 1643 return PyErr_NoMemory(); 1644 if (skind == PyUnicode_2BYTE_KIND) { 1645 _PyUnicode_CONVERT_BYTES( 1646 Py_UCS2, Py_UCS4, 1647 PyUnicode_2BYTE_DATA(s), 1648 PyUnicode_2BYTE_DATA(s) + len, 1649 result); 1650 } 1651 else { 1652 assert(skind == PyUnicode_1BYTE_KIND); 1653 _PyUnicode_CONVERT_BYTES( 1654 Py_UCS1, Py_UCS4, 1655 PyUnicode_1BYTE_DATA(s), 1656 PyUnicode_1BYTE_DATA(s) + len, 1657 result); 1658 } 1659 return result; 1660 default: 1661 break; 1662 } 1663 PyErr_SetString(PyExc_ValueError, "invalid kind"); 1664 return NULL; 1665} 1666 1667static Py_UCS4* 1668as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize, 1669 int copy_null) 1670{ 1671 int kind; 1672 void *data; 1673 Py_ssize_t len, targetlen; 1674 if (PyUnicode_READY(string) == -1) 1675 return NULL; 1676 kind = PyUnicode_KIND(string); 1677 data = PyUnicode_DATA(string); 1678 len = PyUnicode_GET_LENGTH(string); 1679 targetlen = len; 1680 if (copy_null) 1681 targetlen++; 1682 if (!target) { 1683 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) { 1684 PyErr_NoMemory(); 1685 return NULL; 1686 } 1687 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4)); 1688 if (!target) { 
1689 PyErr_NoMemory(); 1690 return NULL; 1691 } 1692 } 1693 else { 1694 if (targetsize < targetlen) { 1695 PyErr_Format(PyExc_SystemError, 1696 "string is longer than the buffer"); 1697 if (copy_null && 0 < targetsize) 1698 target[0] = 0; 1699 return NULL; 1700 } 1701 } 1702 if (kind != PyUnicode_4BYTE_KIND) { 1703 Py_ssize_t i; 1704 for (i = 0; i < len; i++) 1705 target[i] = PyUnicode_READ(kind, data, i); 1706 } 1707 else 1708 Py_MEMCPY(target, data, len * sizeof(Py_UCS4)); 1709 if (copy_null) 1710 target[len] = 0; 1711 return target; 1712} 1713 1714Py_UCS4* 1715PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize, 1716 int copy_null) 1717{ 1718 if (target == NULL || targetsize < 1) { 1719 PyErr_BadInternalCall(); 1720 return NULL; 1721 } 1722 return as_ucs4(string, target, targetsize, copy_null); 1723} 1724 1725Py_UCS4* 1726PyUnicode_AsUCS4Copy(PyObject *string) 1727{ 1728 return as_ucs4(string, NULL, 0, 1); 1729} 1730 1731#ifdef HAVE_WCHAR_H 1732 1733PyObject * 1734PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size) 1735{ 1736 if (w == NULL) { 1737 if (size == 0) 1738 return PyUnicode_New(0, 0); 1739 PyErr_BadInternalCall(); 1740 return NULL; 1741 } 1742 1743 if (size == -1) { 1744 size = wcslen(w); 1745 } 1746 1747 return PyUnicode_FromUnicode(w, size); 1748} 1749 1750#endif /* HAVE_WCHAR_H */ 1751 1752static void 1753makefmt(char *fmt, int longflag, int longlongflag, int size_tflag, 1754 int zeropad, int width, int precision, char c) 1755{ 1756 *fmt++ = '%'; 1757 if (width) { 1758 if (zeropad) 1759 *fmt++ = '0'; 1760 fmt += sprintf(fmt, "%d", width); 1761 } 1762 if (precision) 1763 fmt += sprintf(fmt, ".%d", precision); 1764 if (longflag) 1765 *fmt++ = 'l'; 1766 else if (longlongflag) { 1767 /* longlongflag should only ever be nonzero on machines with 1768 HAVE_LONG_LONG defined */ 1769#ifdef HAVE_LONG_LONG 1770 char *f = PY_FORMAT_LONG_LONG; 1771 while (*f) 1772 *fmt++ = *f++; 1773#else 1774 /* we shouldn't ever get here */ 
1775 assert(0); 1776 *fmt++ = 'l'; 1777#endif 1778 } 1779 else if (size_tflag) { 1780 char *f = PY_FORMAT_SIZE_T; 1781 while (*f) 1782 *fmt++ = *f++; 1783 } 1784 *fmt++ = c; 1785 *fmt = '\0'; 1786} 1787 1788/* helper for PyUnicode_FromFormatV() */ 1789 1790static const char* 1791parse_format_flags(const char *f, 1792 int *p_width, int *p_precision, 1793 int *p_longflag, int *p_longlongflag, int *p_size_tflag) 1794{ 1795 int width, precision, longflag, longlongflag, size_tflag; 1796 1797 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */ 1798 f++; 1799 width = 0; 1800 while (Py_ISDIGIT((unsigned)*f)) 1801 width = (width*10) + *f++ - '0'; 1802 precision = 0; 1803 if (*f == '.') { 1804 f++; 1805 while (Py_ISDIGIT((unsigned)*f)) 1806 precision = (precision*10) + *f++ - '0'; 1807 if (*f == '%') { 1808 /* "%.3%s" => f points to "3" */ 1809 f--; 1810 } 1811 } 1812 if (*f == '\0') { 1813 /* bogus format "%.1" => go backward, f points to "1" */ 1814 f--; 1815 } 1816 if (p_width != NULL) 1817 *p_width = width; 1818 if (p_precision != NULL) 1819 *p_precision = precision; 1820 1821 /* Handle %ld, %lu, %lld and %llu. */ 1822 longflag = 0; 1823 longlongflag = 0; 1824 size_tflag = 0; 1825 1826 if (*f == 'l') { 1827 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') { 1828 longflag = 1; 1829 ++f; 1830 } 1831#ifdef HAVE_LONG_LONG 1832 else if (f[1] == 'l' && 1833 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) { 1834 longlongflag = 1; 1835 f += 2; 1836 } 1837#endif 1838 } 1839 /* handle the size_t flag. */ 1840 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) { 1841 size_tflag = 1; 1842 ++f; 1843 } 1844 if (p_longflag != NULL) 1845 *p_longflag = longflag; 1846 if (p_longlongflag != NULL) 1847 *p_longlongflag = longlongflag; 1848 if (p_size_tflag != NULL) 1849 *p_size_tflag = size_tflag; 1850 return f; 1851} 1852 1853/* maximum number of characters required for output of %ld. 
21 characters 1854 allows for 64-bit integers (in decimal) and an optional sign. */ 1855#define MAX_LONG_CHARS 21 1856/* maximum number of characters required for output of %lld. 1857 We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits, 1858 plus 1 for the sign. 53/22 is an upper bound for log10(256). */ 1859#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22) 1860 1861PyObject * 1862PyUnicode_FromFormatV(const char *format, va_list vargs) 1863{ 1864 va_list count; 1865 Py_ssize_t callcount = 0; 1866 PyObject **callresults = NULL; 1867 PyObject **callresult = NULL; 1868 Py_ssize_t n = 0; 1869 int width = 0; 1870 int precision = 0; 1871 int zeropad; 1872 const char* f; 1873 PyUnicodeObject *string; 1874 /* used by sprintf */ 1875 char fmt[61]; /* should be enough for %0width.precisionlld */ 1876 Py_UCS4 maxchar = 127; /* result is ASCII by default */ 1877 Py_UCS4 argmaxchar; 1878 Py_ssize_t numbersize = 0; 1879 char *numberresults = NULL; 1880 char *numberresult = NULL; 1881 Py_ssize_t i; 1882 int kind; 1883 void *data; 1884 1885 Py_VA_COPY(count, vargs); 1886 /* step 1: count the number of %S/%R/%A/%s format specifications 1887 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/ 1888 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the 1889 * result in an array) 1890 * also esimate a upper bound for all the number formats in the string, 1891 * numbers will be formated in step 3 and be keept in a '\0'-separated 1892 * buffer before putting everything together. */ 1893 for (f = format; *f; f++) { 1894 if (*f == '%') { 1895 int longlongflag; 1896 /* skip width or width.precision (eg. 
"1.2" of "%1.2f") */ 1897 f = parse_format_flags(f, &width, NULL, NULL, &longlongflag, NULL); 1898 if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V') 1899 ++callcount; 1900 1901 else if (*f == 'd' || *f=='u' || *f=='i' || *f=='x' || *f=='p') { 1902#ifdef HAVE_LONG_LONG 1903 if (longlongflag) { 1904 if (width < MAX_LONG_LONG_CHARS) 1905 width = MAX_LONG_LONG_CHARS; 1906 } 1907 else 1908#endif 1909 /* MAX_LONG_CHARS is enough to hold a 64-bit integer, 1910 including sign. Decimal takes the most space. This 1911 isn't enough for octal. If a width is specified we 1912 need more (which we allocate later). */ 1913 if (width < MAX_LONG_CHARS) 1914 width = MAX_LONG_CHARS; 1915 1916 /* account for the size + '\0' to separate numbers 1917 inside of the numberresults buffer */ 1918 numbersize += (width + 1); 1919 } 1920 } 1921 else if ((unsigned char)*f > 127) { 1922 PyErr_Format(PyExc_ValueError, 1923 "PyUnicode_FromFormatV() expects an ASCII-encoded format " 1924 "string, got a non-ASCII byte: 0x%02x", 1925 (unsigned char)*f); 1926 return NULL; 1927 } 1928 } 1929 /* step 2: allocate memory for the results of 1930 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */ 1931 if (callcount) { 1932 callresults = PyObject_Malloc(sizeof(PyObject *) * callcount); 1933 if (!callresults) { 1934 PyErr_NoMemory(); 1935 return NULL; 1936 } 1937 callresult = callresults; 1938 } 1939 /* step 2.5: allocate memory for the results of formating numbers */ 1940 if (numbersize) { 1941 numberresults = PyObject_Malloc(numbersize); 1942 if (!numberresults) { 1943 PyErr_NoMemory(); 1944 goto fail; 1945 } 1946 numberresult = numberresults; 1947 } 1948 1949 /* step 3: format numbers and figure out how large a buffer we need */ 1950 for (f = format; *f; f++) { 1951 if (*f == '%') { 1952 const char* p; 1953 int longflag; 1954 int longlongflag; 1955 int size_tflag; 1956 int numprinted; 1957 1958 p = f; 1959 zeropad = (f[1] == '0'); 1960 f = parse_format_flags(f, &width, &precision, 
1961 &longflag, &longlongflag, &size_tflag); 1962 switch (*f) { 1963 case 'c': 1964 { 1965 Py_UCS4 ordinal = va_arg(count, int); 1966 maxchar = Py_MAX(maxchar, ordinal); 1967 n++; 1968 break; 1969 } 1970 case '%': 1971 n++; 1972 break; 1973 case 'i': 1974 case 'd': 1975 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad, 1976 width, precision, *f); 1977 if (longflag) 1978 numprinted = sprintf(numberresult, fmt, 1979 va_arg(count, long)); 1980#ifdef HAVE_LONG_LONG 1981 else if (longlongflag) 1982 numprinted = sprintf(numberresult, fmt, 1983 va_arg(count, PY_LONG_LONG)); 1984#endif 1985 else if (size_tflag) 1986 numprinted = sprintf(numberresult, fmt, 1987 va_arg(count, Py_ssize_t)); 1988 else 1989 numprinted = sprintf(numberresult, fmt, 1990 va_arg(count, int)); 1991 n += numprinted; 1992 /* advance by +1 to skip over the '\0' */ 1993 numberresult += (numprinted + 1); 1994 assert(*(numberresult - 1) == '\0'); 1995 assert(*(numberresult - 2) != '\0'); 1996 assert(numprinted >= 0); 1997 assert(numberresult <= numberresults + numbersize); 1998 break; 1999 case 'u': 2000 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad, 2001 width, precision, 'u'); 2002 if (longflag) 2003 numprinted = sprintf(numberresult, fmt, 2004 va_arg(count, unsigned long)); 2005#ifdef HAVE_LONG_LONG 2006 else if (longlongflag) 2007 numprinted = sprintf(numberresult, fmt, 2008 va_arg(count, unsigned PY_LONG_LONG)); 2009#endif 2010 else if (size_tflag) 2011 numprinted = sprintf(numberresult, fmt, 2012 va_arg(count, size_t)); 2013 else 2014 numprinted = sprintf(numberresult, fmt, 2015 va_arg(count, unsigned int)); 2016 n += numprinted; 2017 numberresult += (numprinted + 1); 2018 assert(*(numberresult - 1) == '\0'); 2019 assert(*(numberresult - 2) != '\0'); 2020 assert(numprinted >= 0); 2021 assert(numberresult <= numberresults + numbersize); 2022 break; 2023 case 'x': 2024 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x'); 2025 numprinted = sprintf(numberresult, fmt, va_arg(count, 
int)); 2026 n += numprinted; 2027 numberresult += (numprinted + 1); 2028 assert(*(numberresult - 1) == '\0'); 2029 assert(*(numberresult - 2) != '\0'); 2030 assert(numprinted >= 0); 2031 assert(numberresult <= numberresults + numbersize); 2032 break; 2033 case 'p': 2034 numprinted = sprintf(numberresult, "%p", va_arg(count, void*)); 2035 /* %p is ill-defined: ensure leading 0x. */ 2036 if (numberresult[1] == 'X') 2037 numberresult[1] = 'x'; 2038 else if (numberresult[1] != 'x') { 2039 memmove(numberresult + 2, numberresult, 2040 strlen(numberresult) + 1); 2041 numberresult[0] = '0'; 2042 numberresult[1] = 'x'; 2043 numprinted += 2; 2044 } 2045 n += numprinted; 2046 numberresult += (numprinted + 1); 2047 assert(*(numberresult - 1) == '\0'); 2048 assert(*(numberresult - 2) != '\0'); 2049 assert(numprinted >= 0); 2050 assert(numberresult <= numberresults + numbersize); 2051 break; 2052 case 's': 2053 { 2054 /* UTF-8 */ 2055 const char *s = va_arg(count, const char*); 2056 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace"); 2057 if (!str) 2058 goto fail; 2059 /* since PyUnicode_DecodeUTF8 returns already flexible 2060 unicode objects, there is no need to call ready on them */ 2061 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str); 2062 maxchar = Py_MAX(maxchar, argmaxchar); 2063 n += PyUnicode_GET_LENGTH(str); 2064 /* Remember the str and switch to the next slot */ 2065 *callresult++ = str; 2066 break; 2067 } 2068 case 'U': 2069 { 2070 PyObject *obj = va_arg(count, PyObject *); 2071 assert(obj && _PyUnicode_CHECK(obj)); 2072 if (PyUnicode_READY(obj) == -1) 2073 goto fail; 2074 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj); 2075 maxchar = Py_MAX(maxchar, argmaxchar); 2076 n += PyUnicode_GET_LENGTH(obj); 2077 break; 2078 } 2079 case 'V': 2080 { 2081 PyObject *obj = va_arg(count, PyObject *); 2082 const char *str = va_arg(count, const char *); 2083 PyObject *str_obj; 2084 assert(obj || str); 2085 assert(!obj || _PyUnicode_CHECK(obj)); 2086 if (obj) { 2087 if 
(PyUnicode_READY(obj) == -1) 2088 goto fail; 2089 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj); 2090 maxchar = Py_MAX(maxchar, argmaxchar); 2091 n += PyUnicode_GET_LENGTH(obj); 2092 *callresult++ = NULL; 2093 } 2094 else { 2095 str_obj = PyUnicode_DecodeUTF8(str, strlen(str), "replace"); 2096 if (!str_obj) 2097 goto fail; 2098 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str_obj); 2099 maxchar = Py_MAX(maxchar, argmaxchar); 2100 n += PyUnicode_GET_LENGTH(str_obj); 2101 *callresult++ = str_obj; 2102 } 2103 break; 2104 } 2105 case 'S': 2106 { 2107 PyObject *obj = va_arg(count, PyObject *); 2108 PyObject *str; 2109 assert(obj); 2110 str = PyObject_Str(obj); 2111 if (!str || PyUnicode_READY(str) == -1) 2112 goto fail; 2113 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str); 2114 maxchar = Py_MAX(maxchar, argmaxchar); 2115 n += PyUnicode_GET_LENGTH(str); 2116 /* Remember the str and switch to the next slot */ 2117 *callresult++ = str; 2118 break; 2119 } 2120 case 'R': 2121 { 2122 PyObject *obj = va_arg(count, PyObject *); 2123 PyObject *repr; 2124 assert(obj); 2125 repr = PyObject_Repr(obj); 2126 if (!repr || PyUnicode_READY(repr) == -1) 2127 goto fail; 2128 argmaxchar = PyUnicode_MAX_CHAR_VALUE(repr); 2129 maxchar = Py_MAX(maxchar, argmaxchar); 2130 n += PyUnicode_GET_LENGTH(repr); 2131 /* Remember the repr and switch to the next slot */ 2132 *callresult++ = repr; 2133 break; 2134 } 2135 case 'A': 2136 { 2137 PyObject *obj = va_arg(count, PyObject *); 2138 PyObject *ascii; 2139 assert(obj); 2140 ascii = PyObject_ASCII(obj); 2141 if (!ascii || PyUnicode_READY(ascii) == -1) 2142 goto fail; 2143 argmaxchar = PyUnicode_MAX_CHAR_VALUE(ascii); 2144 maxchar = Py_MAX(maxchar, argmaxchar); 2145 n += PyUnicode_GET_LENGTH(ascii); 2146 /* Remember the repr and switch to the next slot */ 2147 *callresult++ = ascii; 2148 break; 2149 } 2150 default: 2151 /* if we stumble upon an unknown 2152 formatting code, copy the rest of 2153 the format string to the output 2154 string. 
(we cannot just skip the 2155 code, since there's no way to know 2156 what's in the argument list) */ 2157 n += strlen(p); 2158 goto expand; 2159 } 2160 } else 2161 n++; 2162 } 2163 expand: 2164 /* step 4: fill the buffer */ 2165 /* Since we've analyzed how much space we need, 2166 we don't have to resize the string. 2167 There can be no errors beyond this point. */ 2168 string = (PyUnicodeObject *)PyUnicode_New(n, maxchar); 2169 if (!string) 2170 goto fail; 2171 kind = PyUnicode_KIND(string); 2172 data = PyUnicode_DATA(string); 2173 callresult = callresults; 2174 numberresult = numberresults; 2175 2176 for (i = 0, f = format; *f; f++) { 2177 if (*f == '%') { 2178 const char* p; 2179 2180 p = f; 2181 f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL); 2182 /* checking for == because the last argument could be a empty 2183 string, which causes i to point to end, the assert at the end of 2184 the loop */ 2185 assert(i <= PyUnicode_GET_LENGTH(string)); 2186 2187 switch (*f) { 2188 case 'c': 2189 { 2190 const int ordinal = va_arg(vargs, int); 2191 PyUnicode_WRITE(kind, data, i++, ordinal); 2192 break; 2193 } 2194 case 'i': 2195 case 'd': 2196 case 'u': 2197 case 'x': 2198 case 'p': 2199 /* unused, since we already have the result */ 2200 if (*f == 'p') 2201 (void) va_arg(vargs, void *); 2202 else 2203 (void) va_arg(vargs, int); 2204 /* extract the result from numberresults and append. 
*/ 2205 for (; *numberresult; ++i, ++numberresult) 2206 PyUnicode_WRITE(kind, data, i, *numberresult); 2207 /* skip over the separating '\0' */ 2208 assert(*numberresult == '\0'); 2209 numberresult++; 2210 assert(numberresult <= numberresults + numbersize); 2211 break; 2212 case 's': 2213 { 2214 /* unused, since we already have the result */ 2215 Py_ssize_t size; 2216 (void) va_arg(vargs, char *); 2217 size = PyUnicode_GET_LENGTH(*callresult); 2218 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string)); 2219 if (PyUnicode_CopyCharacters((PyObject*)string, i, 2220 *callresult, 0, 2221 size) < 0) 2222 goto fail; 2223 i += size; 2224 /* We're done with the unicode()/repr() => forget it */ 2225 Py_DECREF(*callresult); 2226 /* switch to next unicode()/repr() result */ 2227 ++callresult; 2228 break; 2229 } 2230 case 'U': 2231 { 2232 PyObject *obj = va_arg(vargs, PyObject *); 2233 Py_ssize_t size; 2234 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string)); 2235 size = PyUnicode_GET_LENGTH(obj); 2236 if (PyUnicode_CopyCharacters((PyObject*)string, i, 2237 obj, 0, 2238 size) < 0) 2239 goto fail; 2240 i += size; 2241 break; 2242 } 2243 case 'V': 2244 { 2245 Py_ssize_t size; 2246 PyObject *obj = va_arg(vargs, PyObject *); 2247 va_arg(vargs, const char *); 2248 if (obj) { 2249 size = PyUnicode_GET_LENGTH(obj); 2250 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string)); 2251 if (PyUnicode_CopyCharacters((PyObject*)string, i, 2252 obj, 0, 2253 size) < 0) 2254 goto fail; 2255 i += size; 2256 } else { 2257 size = PyUnicode_GET_LENGTH(*callresult); 2258 assert(PyUnicode_KIND(*callresult) <= 2259 PyUnicode_KIND(string)); 2260 if (PyUnicode_CopyCharacters((PyObject*)string, i, 2261 *callresult, 2262 0, size) < 0) 2263 goto fail; 2264 i += size; 2265 Py_DECREF(*callresult); 2266 } 2267 ++callresult; 2268 break; 2269 } 2270 case 'S': 2271 case 'R': 2272 case 'A': 2273 { 2274 /* unused, since we already have the result */ 2275 (void) va_arg(vargs, PyObject *); 2276 
assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string)); 2277 if (PyUnicode_CopyCharacters((PyObject*)string, i, 2278 *callresult, 0, 2279 PyUnicode_GET_LENGTH(*callresult)) < 0) 2280 goto fail; 2281 i += PyUnicode_GET_LENGTH(*callresult); 2282 /* We're done with the unicode()/repr() => forget it */ 2283 Py_DECREF(*callresult); 2284 /* switch to next unicode()/repr() result */ 2285 ++callresult; 2286 break; 2287 } 2288 case '%': 2289 PyUnicode_WRITE(kind, data, i++, '%'); 2290 break; 2291 default: 2292 for (; *p; ++p, ++i) 2293 PyUnicode_WRITE(kind, data, i, *p); 2294 assert(i == PyUnicode_GET_LENGTH(string)); 2295 goto end; 2296 } 2297 } 2298 else { 2299 assert(i < PyUnicode_GET_LENGTH(string)); 2300 PyUnicode_WRITE(kind, data, i++, *f); 2301 } 2302 } 2303 assert(i == PyUnicode_GET_LENGTH(string)); 2304 2305 end: 2306 if (callresults) 2307 PyObject_Free(callresults); 2308 if (numberresults) 2309 PyObject_Free(numberresults); 2310 return (PyObject *)string; 2311 fail: 2312 if (callresults) { 2313 PyObject **callresult2 = callresults; 2314 while (callresult2 < callresult) { 2315 Py_XDECREF(*callresult2); 2316 ++callresult2; 2317 } 2318 PyObject_Free(callresults); 2319 } 2320 if (numberresults) 2321 PyObject_Free(numberresults); 2322 return NULL; 2323} 2324 2325PyObject * 2326PyUnicode_FromFormat(const char *format, ...) 2327{ 2328 PyObject* ret; 2329 va_list vargs; 2330 2331#ifdef HAVE_STDARG_PROTOTYPES 2332 va_start(vargs, format); 2333#else 2334 va_start(vargs); 2335#endif 2336 ret = PyUnicode_FromFormatV(format, vargs); 2337 va_end(vargs); 2338 return ret; 2339} 2340 2341#ifdef HAVE_WCHAR_H 2342 2343/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString(): 2344 convert a Unicode object to a wide character string. 2345 2346 - If w is NULL: return the number of wide characters (including the null 2347 character) required to convert the unicode object. Ignore size argument. 
2348 2349 - Otherwise: return the number of wide characters (excluding the null 2350 character) written into w. Write at most size wide characters (including 2351 the null character). */ 2352static Py_ssize_t 2353unicode_aswidechar(PyUnicodeObject *unicode, 2354 wchar_t *w, 2355 Py_ssize_t size) 2356{ 2357 Py_ssize_t res; 2358 const wchar_t *wstr; 2359 2360 wstr = PyUnicode_AsUnicodeAndSize((PyObject *)unicode, &res); 2361 if (wstr == NULL) 2362 return -1; 2363 2364 if (w != NULL) { 2365 if (size > res) 2366 size = res + 1; 2367 else 2368 res = size; 2369 Py_MEMCPY(w, wstr, size * sizeof(wchar_t)); 2370 return res; 2371 } 2372 else 2373 return res + 1; 2374} 2375 2376Py_ssize_t 2377PyUnicode_AsWideChar(PyObject *unicode, 2378 wchar_t *w, 2379 Py_ssize_t size) 2380{ 2381 if (unicode == NULL) { 2382 PyErr_BadInternalCall(); 2383 return -1; 2384 } 2385 return unicode_aswidechar((PyUnicodeObject*)unicode, w, size); 2386} 2387 2388wchar_t* 2389PyUnicode_AsWideCharString(PyObject *unicode, 2390 Py_ssize_t *size) 2391{ 2392 wchar_t* buffer; 2393 Py_ssize_t buflen; 2394 2395 if (unicode == NULL) { 2396 PyErr_BadInternalCall(); 2397 return NULL; 2398 } 2399 2400 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, NULL, 0); 2401 if (buflen == -1) 2402 return NULL; 2403 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) { 2404 PyErr_NoMemory(); 2405 return NULL; 2406 } 2407 2408 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t)); 2409 if (buffer == NULL) { 2410 PyErr_NoMemory(); 2411 return NULL; 2412 } 2413 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, buffer, buflen); 2414 if (buflen == -1) 2415 return NULL; 2416 if (size != NULL) 2417 *size = buflen; 2418 return buffer; 2419} 2420 2421#endif /* HAVE_WCHAR_H */ 2422 2423PyObject * 2424PyUnicode_FromOrdinal(int ordinal) 2425{ 2426 PyObject *v; 2427 if (ordinal < 0 || ordinal > 0x10ffff) { 2428 PyErr_SetString(PyExc_ValueError, 2429 "chr() arg not in range(0x110000)"); 2430 return NULL; 2431 } 2432 2433 if (ordinal < 
256) 2434 return get_latin1_char(ordinal); 2435 2436 v = PyUnicode_New(1, ordinal); 2437 if (v == NULL) 2438 return NULL; 2439 PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal); 2440 return v; 2441} 2442 2443PyObject * 2444PyUnicode_FromObject(register PyObject *obj) 2445{ 2446 /* XXX Perhaps we should make this API an alias of 2447 PyObject_Str() instead ?! */ 2448 if (PyUnicode_CheckExact(obj)) { 2449 if (PyUnicode_READY(obj)) 2450 return NULL; 2451 Py_INCREF(obj); 2452 return obj; 2453 } 2454 if (PyUnicode_Check(obj)) { 2455 /* For a Unicode subtype that's not a Unicode object, 2456 return a true Unicode object with the same data. */ 2457 return PyUnicode_Copy(obj); 2458 } 2459 PyErr_Format(PyExc_TypeError, 2460 "Can't convert '%.100s' object to str implicitly", 2461 Py_TYPE(obj)->tp_name); 2462 return NULL; 2463} 2464 2465PyObject * 2466PyUnicode_FromEncodedObject(register PyObject *obj, 2467 const char *encoding, 2468 const char *errors) 2469{ 2470 Py_buffer buffer; 2471 PyObject *v; 2472 2473 if (obj == NULL) { 2474 PyErr_BadInternalCall(); 2475 return NULL; 2476 } 2477 2478 /* Decoding bytes objects is the most common case and should be fast */ 2479 if (PyBytes_Check(obj)) { 2480 if (PyBytes_GET_SIZE(obj) == 0) { 2481 Py_INCREF(unicode_empty); 2482 v = unicode_empty; 2483 } 2484 else { 2485 v = PyUnicode_Decode( 2486 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj), 2487 encoding, errors); 2488 } 2489 return v; 2490 } 2491 2492 if (PyUnicode_Check(obj)) { 2493 PyErr_SetString(PyExc_TypeError, 2494 "decoding str is not supported"); 2495 return NULL; 2496 } 2497 2498 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */ 2499 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) { 2500 PyErr_Format(PyExc_TypeError, 2501 "coercing to str: need bytes, bytearray " 2502 "or buffer-like object, %.80s found", 2503 Py_TYPE(obj)->tp_name); 2504 return NULL; 2505 } 2506 2507 if (buffer.len == 0) { 2508 Py_INCREF(unicode_empty); 2509 v = 
unicode_empty; 2510 } 2511 else 2512 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors); 2513 2514 PyBuffer_Release(&buffer); 2515 return v; 2516} 2517 2518/* Convert encoding to lower case and replace '_' with '-' in order to 2519 catch e.g. UTF_8. Return 0 on error (encoding is longer than lower_len-1), 2520 1 on success. */ 2521static int 2522normalize_encoding(const char *encoding, 2523 char *lower, 2524 size_t lower_len) 2525{ 2526 const char *e; 2527 char *l; 2528 char *l_end; 2529 2530 e = encoding; 2531 l = lower; 2532 l_end = &lower[lower_len - 1]; 2533 while (*e) { 2534 if (l == l_end) 2535 return 0; 2536 if (Py_ISUPPER(*e)) { 2537 *l++ = Py_TOLOWER(*e++); 2538 } 2539 else if (*e == '_') { 2540 *l++ = '-'; 2541 e++; 2542 } 2543 else { 2544 *l++ = *e++; 2545 } 2546 } 2547 *l = '\0'; 2548 return 1; 2549} 2550 2551PyObject * 2552PyUnicode_Decode(const char *s, 2553 Py_ssize_t size, 2554 const char *encoding, 2555 const char *errors) 2556{ 2557 PyObject *buffer = NULL, *unicode; 2558 Py_buffer info; 2559 char lower[11]; /* Enough for any encoding shortcut */ 2560 2561 if (encoding == NULL) 2562 return PyUnicode_DecodeUTF8(s, size, errors); 2563 2564 /* Shortcuts for common default encodings */ 2565 if (normalize_encoding(encoding, lower, sizeof(lower))) { 2566 if ((strcmp(lower, "utf-8") == 0) || 2567 (strcmp(lower, "utf8") == 0)) 2568 return PyUnicode_DecodeUTF8(s, size, errors); 2569 else if ((strcmp(lower, "latin-1") == 0) || 2570 (strcmp(lower, "latin1") == 0) || 2571 (strcmp(lower, "iso-8859-1") == 0)) 2572 return PyUnicode_DecodeLatin1(s, size, errors); 2573#ifdef HAVE_MBCS 2574 else if (strcmp(lower, "mbcs") == 0) 2575 return PyUnicode_DecodeMBCS(s, size, errors); 2576#endif 2577 else if (strcmp(lower, "ascii") == 0) 2578 return PyUnicode_DecodeASCII(s, size, errors); 2579 else if (strcmp(lower, "utf-16") == 0) 2580 return PyUnicode_DecodeUTF16(s, size, errors, 0); 2581 else if (strcmp(lower, "utf-32") == 0) 2582 return 
PyUnicode_DecodeUTF32(s, size, errors, 0); 2583 } 2584 2585 /* Decode via the codec registry */ 2586 buffer = NULL; 2587 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0) 2588 goto onError; 2589 buffer = PyMemoryView_FromBuffer(&info); 2590 if (buffer == NULL) 2591 goto onError; 2592 unicode = PyCodec_Decode(buffer, encoding, errors); 2593 if (unicode == NULL) 2594 goto onError; 2595 if (!PyUnicode_Check(unicode)) { 2596 PyErr_Format(PyExc_TypeError, 2597 "decoder did not return a str object (type=%.400s)", 2598 Py_TYPE(unicode)->tp_name); 2599 Py_DECREF(unicode); 2600 goto onError; 2601 } 2602 Py_DECREF(buffer); 2603 if (_PyUnicode_READY_REPLACE(&unicode)) { 2604 Py_DECREF(unicode); 2605 return NULL; 2606 } 2607 return unicode; 2608 2609 onError: 2610 Py_XDECREF(buffer); 2611 return NULL; 2612} 2613 2614PyObject * 2615PyUnicode_AsDecodedObject(PyObject *unicode, 2616 const char *encoding, 2617 const char *errors) 2618{ 2619 PyObject *v; 2620 2621 if (!PyUnicode_Check(unicode)) { 2622 PyErr_BadArgument(); 2623 goto onError; 2624 } 2625 2626 if (encoding == NULL) 2627 encoding = PyUnicode_GetDefaultEncoding(); 2628 2629 /* Decode via the codec registry */ 2630 v = PyCodec_Decode(unicode, encoding, errors); 2631 if (v == NULL) 2632 goto onError; 2633 return v; 2634 2635 onError: 2636 return NULL; 2637} 2638 2639PyObject * 2640PyUnicode_AsDecodedUnicode(PyObject *unicode, 2641 const char *encoding, 2642 const char *errors) 2643{ 2644 PyObject *v; 2645 2646 if (!PyUnicode_Check(unicode)) { 2647 PyErr_BadArgument(); 2648 goto onError; 2649 } 2650 2651 if (encoding == NULL) 2652 encoding = PyUnicode_GetDefaultEncoding(); 2653 2654 /* Decode via the codec registry */ 2655 v = PyCodec_Decode(unicode, encoding, errors); 2656 if (v == NULL) 2657 goto onError; 2658 if (!PyUnicode_Check(v)) { 2659 PyErr_Format(PyExc_TypeError, 2660 "decoder did not return a str object (type=%.400s)", 2661 Py_TYPE(v)->tp_name); 2662 Py_DECREF(v); 2663 goto onError; 2664 
} 2665 return v; 2666 2667 onError: 2668 return NULL; 2669} 2670 2671PyObject * 2672PyUnicode_Encode(const Py_UNICODE *s, 2673 Py_ssize_t size, 2674 const char *encoding, 2675 const char *errors) 2676{ 2677 PyObject *v, *unicode; 2678 2679 unicode = PyUnicode_FromUnicode(s, size); 2680 if (unicode == NULL) 2681 return NULL; 2682 v = PyUnicode_AsEncodedString(unicode, encoding, errors); 2683 Py_DECREF(unicode); 2684 return v; 2685} 2686 2687PyObject * 2688PyUnicode_AsEncodedObject(PyObject *unicode, 2689 const char *encoding, 2690 const char *errors) 2691{ 2692 PyObject *v; 2693 2694 if (!PyUnicode_Check(unicode)) { 2695 PyErr_BadArgument(); 2696 goto onError; 2697 } 2698 2699 if (encoding == NULL) 2700 encoding = PyUnicode_GetDefaultEncoding(); 2701 2702 /* Encode via the codec registry */ 2703 v = PyCodec_Encode(unicode, encoding, errors); 2704 if (v == NULL) 2705 goto onError; 2706 return v; 2707 2708 onError: 2709 return NULL; 2710} 2711 2712PyObject * 2713PyUnicode_EncodeFSDefault(PyObject *unicode) 2714{ 2715#ifdef HAVE_MBCS 2716 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode), 2717 PyUnicode_GET_SIZE(unicode), 2718 NULL); 2719#elif defined(__APPLE__) 2720 return _PyUnicode_AsUTF8String(unicode, "surrogateescape"); 2721#else 2722 PyInterpreterState *interp = PyThreadState_GET()->interp; 2723 /* Bootstrap check: if the filesystem codec is implemented in Python, we 2724 cannot use it to encode and decode filenames before it is loaded. Load 2725 the Python codec requires to encode at least its own filename. Use the C 2726 version of the locale codec until the codec registry is initialized and 2727 the Python codec is loaded. 2728 2729 Py_FileSystemDefaultEncoding is shared between all interpreters, we 2730 cannot only rely on it: check also interp->fscodec_initialized for 2731 subinterpreters. 
*/ 2732 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) { 2733 return PyUnicode_AsEncodedString(unicode, 2734 Py_FileSystemDefaultEncoding, 2735 "surrogateescape"); 2736 } 2737 else { 2738 /* locale encoding with surrogateescape */ 2739 wchar_t *wchar; 2740 char *bytes; 2741 PyObject *bytes_obj; 2742 size_t error_pos; 2743 2744 wchar = PyUnicode_AsWideCharString(unicode, NULL); 2745 if (wchar == NULL) 2746 return NULL; 2747 bytes = _Py_wchar2char(wchar, &error_pos); 2748 if (bytes == NULL) { 2749 if (error_pos != (size_t)-1) { 2750 char *errmsg = strerror(errno); 2751 PyObject *exc = NULL; 2752 if (errmsg == NULL) 2753 errmsg = "Py_wchar2char() failed"; 2754 raise_encode_exception(&exc, 2755 "filesystemencoding", 2756 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode), 2757 error_pos, error_pos+1, 2758 errmsg); 2759 Py_XDECREF(exc); 2760 } 2761 else 2762 PyErr_NoMemory(); 2763 PyMem_Free(wchar); 2764 return NULL; 2765 } 2766 PyMem_Free(wchar); 2767 2768 bytes_obj = PyBytes_FromString(bytes); 2769 PyMem_Free(bytes); 2770 return bytes_obj; 2771 } 2772#endif 2773} 2774 2775PyObject * 2776PyUnicode_AsEncodedString(PyObject *unicode, 2777 const char *encoding, 2778 const char *errors) 2779{ 2780 PyObject *v; 2781 char lower[11]; /* Enough for any encoding shortcut */ 2782 2783 if (!PyUnicode_Check(unicode)) { 2784 PyErr_BadArgument(); 2785 return NULL; 2786 } 2787 2788 if (encoding == NULL) { 2789 if (errors == NULL || strcmp(errors, "strict") == 0) 2790 return _PyUnicode_AsUTF8String(unicode, NULL); 2791 else 2792 return _PyUnicode_AsUTF8String(unicode, errors); 2793 } 2794 2795 /* Shortcuts for common default encodings */ 2796 if (normalize_encoding(encoding, lower, sizeof(lower))) { 2797 if ((strcmp(lower, "utf-8") == 0) || 2798 (strcmp(lower, "utf8") == 0)) 2799 { 2800 if (errors == NULL || strcmp(errors, "strict") == 0) 2801 return _PyUnicode_AsUTF8String(unicode, NULL); 2802 else 2803 return _PyUnicode_AsUTF8String(unicode, errors); 2804 } 
2805 else if ((strcmp(lower, "latin-1") == 0) || 2806 (strcmp(lower, "latin1") == 0) || 2807 (strcmp(lower, "iso-8859-1") == 0)) 2808 return _PyUnicode_AsLatin1String(unicode, errors); 2809#ifdef HAVE_MBCS 2810 else if (strcmp(lower, "mbcs") == 0) 2811 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode), 2812 PyUnicode_GET_SIZE(unicode), 2813 errors); 2814#endif 2815 else if (strcmp(lower, "ascii") == 0) 2816 return _PyUnicode_AsASCIIString(unicode, errors); 2817 } 2818 2819 /* Encode via the codec registry */ 2820 v = PyCodec_Encode(unicode, encoding, errors); 2821 if (v == NULL) 2822 return NULL; 2823 2824 /* The normal path */ 2825 if (PyBytes_Check(v)) 2826 return v; 2827 2828 /* If the codec returns a buffer, raise a warning and convert to bytes */ 2829 if (PyByteArray_Check(v)) { 2830 int error; 2831 PyObject *b; 2832 2833 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1, 2834 "encoder %s returned bytearray instead of bytes", 2835 encoding); 2836 if (error) { 2837 Py_DECREF(v); 2838 return NULL; 2839 } 2840 2841 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v)); 2842 Py_DECREF(v); 2843 return b; 2844 } 2845 2846 PyErr_Format(PyExc_TypeError, 2847 "encoder did not return a bytes object (type=%.400s)", 2848 Py_TYPE(v)->tp_name); 2849 Py_DECREF(v); 2850 return NULL; 2851} 2852 2853PyObject * 2854PyUnicode_AsEncodedUnicode(PyObject *unicode, 2855 const char *encoding, 2856 const char *errors) 2857{ 2858 PyObject *v; 2859 2860 if (!PyUnicode_Check(unicode)) { 2861 PyErr_BadArgument(); 2862 goto onError; 2863 } 2864 2865 if (encoding == NULL) 2866 encoding = PyUnicode_GetDefaultEncoding(); 2867 2868 /* Encode via the codec registry */ 2869 v = PyCodec_Encode(unicode, encoding, errors); 2870 if (v == NULL) 2871 goto onError; 2872 if (!PyUnicode_Check(v)) { 2873 PyErr_Format(PyExc_TypeError, 2874 "encoder did not return an str object (type=%.400s)", 2875 Py_TYPE(v)->tp_name); 2876 Py_DECREF(v); 2877 goto onError; 2878 } 2879 return v; 2880 
2881 onError: 2882 return NULL; 2883} 2884 2885PyObject* 2886PyUnicode_DecodeFSDefault(const char *s) { 2887 Py_ssize_t size = (Py_ssize_t)strlen(s); 2888 return PyUnicode_DecodeFSDefaultAndSize(s, size); 2889} 2890 2891PyObject* 2892PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size) 2893{ 2894#ifdef HAVE_MBCS 2895 return PyUnicode_DecodeMBCS(s, size, NULL); 2896#elif defined(__APPLE__) 2897 return PyUnicode_DecodeUTF8(s, size, "surrogateescape"); 2898#else 2899 PyInterpreterState *interp = PyThreadState_GET()->interp; 2900 /* Bootstrap check: if the filesystem codec is implemented in Python, we 2901 cannot use it to encode and decode filenames before it is loaded. Load 2902 the Python codec requires to encode at least its own filename. Use the C 2903 version of the locale codec until the codec registry is initialized and 2904 the Python codec is loaded. 2905 2906 Py_FileSystemDefaultEncoding is shared between all interpreters, we 2907 cannot only rely on it: check also interp->fscodec_initialized for 2908 subinterpreters. 
*/ 2909 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) { 2910 return PyUnicode_Decode(s, size, 2911 Py_FileSystemDefaultEncoding, 2912 "surrogateescape"); 2913 } 2914 else { 2915 /* locale encoding with surrogateescape */ 2916 wchar_t *wchar; 2917 PyObject *unicode; 2918 size_t len; 2919 2920 if (s[size] != '\0' || size != strlen(s)) { 2921 PyErr_SetString(PyExc_TypeError, "embedded NUL character"); 2922 return NULL; 2923 } 2924 2925 wchar = _Py_char2wchar(s, &len); 2926 if (wchar == NULL) 2927 return PyErr_NoMemory(); 2928 2929 unicode = PyUnicode_FromWideChar(wchar, len); 2930 PyMem_Free(wchar); 2931 return unicode; 2932 } 2933#endif 2934} 2935 2936 2937int 2938PyUnicode_FSConverter(PyObject* arg, void* addr) 2939{ 2940 PyObject *output = NULL; 2941 Py_ssize_t size; 2942 void *data; 2943 if (arg == NULL) { 2944 Py_DECREF(*(PyObject**)addr); 2945 return 1; 2946 } 2947 if (PyBytes_Check(arg)) { 2948 output = arg; 2949 Py_INCREF(output); 2950 } 2951 else { 2952 arg = PyUnicode_FromObject(arg); 2953 if (!arg) 2954 return 0; 2955 output = PyUnicode_EncodeFSDefault(arg); 2956 Py_DECREF(arg); 2957 if (!output) 2958 return 0; 2959 if (!PyBytes_Check(output)) { 2960 Py_DECREF(output); 2961 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes"); 2962 return 0; 2963 } 2964 } 2965 size = PyBytes_GET_SIZE(output); 2966 data = PyBytes_AS_STRING(output); 2967 if (size != strlen(data)) { 2968 PyErr_SetString(PyExc_TypeError, "embedded NUL character"); 2969 Py_DECREF(output); 2970 return 0; 2971 } 2972 *(PyObject**)addr = output; 2973 return Py_CLEANUP_SUPPORTED; 2974} 2975 2976 2977int 2978PyUnicode_FSDecoder(PyObject* arg, void* addr) 2979{ 2980 PyObject *output = NULL; 2981 if (arg == NULL) { 2982 Py_DECREF(*(PyObject**)addr); 2983 return 1; 2984 } 2985 if (PyUnicode_Check(arg)) { 2986 if (PyUnicode_READY(arg)) 2987 return 0; 2988 output = arg; 2989 Py_INCREF(output); 2990 } 2991 else { 2992 arg = PyBytes_FromObject(arg); 2993 if (!arg) 2994 
return 0; 2995 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg), 2996 PyBytes_GET_SIZE(arg)); 2997 Py_DECREF(arg); 2998 if (!output) 2999 return 0; 3000 if (!PyUnicode_Check(output)) { 3001 Py_DECREF(output); 3002 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode"); 3003 return 0; 3004 } 3005 } 3006 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output), 3007 PyUnicode_GET_LENGTH(output), 0, 1)) { 3008 PyErr_SetString(PyExc_TypeError, "embedded NUL character"); 3009 Py_DECREF(output); 3010 return 0; 3011 } 3012 *(PyObject**)addr = output; 3013 return Py_CLEANUP_SUPPORTED; 3014} 3015 3016 3017char* 3018PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize) 3019{ 3020 PyObject *bytes; 3021 PyUnicodeObject *u = (PyUnicodeObject *)unicode; 3022 3023 if (!PyUnicode_Check(unicode)) { 3024 PyErr_BadArgument(); 3025 return NULL; 3026 } 3027 if (PyUnicode_READY(u) == -1) 3028 return NULL; 3029 3030 if (PyUnicode_UTF8(unicode) == NULL) { 3031 assert(!PyUnicode_IS_COMPACT_ASCII(unicode)); 3032 bytes = _PyUnicode_AsUTF8String(unicode, "strict"); 3033 if (bytes == NULL) 3034 return NULL; 3035 _PyUnicode_UTF8(u) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1); 3036 if (_PyUnicode_UTF8(u) == NULL) { 3037 Py_DECREF(bytes); 3038 return NULL; 3039 } 3040 _PyUnicode_UTF8_LENGTH(u) = PyBytes_GET_SIZE(bytes); 3041 Py_MEMCPY(_PyUnicode_UTF8(u), PyBytes_AS_STRING(bytes), _PyUnicode_UTF8_LENGTH(u) + 1); 3042 Py_DECREF(bytes); 3043 } 3044 3045 if (psize) 3046 *psize = PyUnicode_UTF8_LENGTH(unicode); 3047 return PyUnicode_UTF8(unicode); 3048} 3049 3050char* 3051PyUnicode_AsUTF8(PyObject *unicode) 3052{ 3053 return PyUnicode_AsUTF8AndSize(unicode, NULL); 3054} 3055 3056#ifdef Py_DEBUG 3057int unicode_as_unicode_calls = 0; 3058#endif 3059 3060 3061Py_UNICODE * 3062PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size) 3063{ 3064 PyUnicodeObject *u; 3065 const unsigned char *one_byte; 3066#if SIZEOF_WCHAR_T == 4 3067 const Py_UCS2 
*two_bytes; 3068#else 3069 const Py_UCS4 *four_bytes; 3070 const Py_UCS4 *ucs4_end; 3071 Py_ssize_t num_surrogates; 3072#endif 3073 wchar_t *w; 3074 wchar_t *wchar_end; 3075 3076 if (!PyUnicode_Check(unicode)) { 3077 PyErr_BadArgument(); 3078 return NULL; 3079 } 3080 u = (PyUnicodeObject*)unicode; 3081 if (_PyUnicode_WSTR(u) == NULL) { 3082 /* Non-ASCII compact unicode object */ 3083 assert(_PyUnicode_KIND(u) != 0); 3084 assert(PyUnicode_IS_READY(u)); 3085 3086#ifdef Py_DEBUG 3087 ++unicode_as_unicode_calls; 3088#endif 3089 3090 if (PyUnicode_KIND(u) == PyUnicode_4BYTE_KIND) { 3091#if SIZEOF_WCHAR_T == 2 3092 four_bytes = PyUnicode_4BYTE_DATA(u); 3093 ucs4_end = four_bytes + _PyUnicode_LENGTH(u); 3094 num_surrogates = 0; 3095 3096 for (; four_bytes < ucs4_end; ++four_bytes) { 3097 if (*four_bytes > 0xFFFF) 3098 ++num_surrogates; 3099 } 3100 3101 _PyUnicode_WSTR(u) = (wchar_t *) PyObject_MALLOC( 3102 sizeof(wchar_t) * (_PyUnicode_LENGTH(u) + 1 + num_surrogates)); 3103 if (!_PyUnicode_WSTR(u)) { 3104 PyErr_NoMemory(); 3105 return NULL; 3106 } 3107 _PyUnicode_WSTR_LENGTH(u) = _PyUnicode_LENGTH(u) + num_surrogates; 3108 3109 w = _PyUnicode_WSTR(u); 3110 wchar_end = w + _PyUnicode_WSTR_LENGTH(u); 3111 four_bytes = PyUnicode_4BYTE_DATA(u); 3112 for (; four_bytes < ucs4_end; ++four_bytes, ++w) { 3113 if (*four_bytes > 0xFFFF) { 3114 /* encode surrogate pair in this case */ 3115 *w++ = 0xD800 | ((*four_bytes - 0x10000) >> 10); 3116 *w = 0xDC00 | ((*four_bytes - 0x10000) & 0x3FF); 3117 } 3118 else 3119 *w = *four_bytes; 3120 3121 if (w > wchar_end) { 3122 assert(0 && "Miscalculated string end"); 3123 } 3124 } 3125 *w = 0; 3126#else 3127 /* sizeof(wchar_t) == 4 */ 3128 Py_FatalError("Impossible unicode object state, wstr and str " 3129 "should share memory already."); 3130 return NULL; 3131#endif 3132 } 3133 else { 3134 _PyUnicode_WSTR(u) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) * 3135 (_PyUnicode_LENGTH(u) + 1)); 3136 if (!_PyUnicode_WSTR(u)) { 3137 PyErr_NoMemory(); 
3138 return NULL; 3139 } 3140 if (!PyUnicode_IS_COMPACT_ASCII(u)) 3141 _PyUnicode_WSTR_LENGTH(u) = _PyUnicode_LENGTH(u); 3142 w = _PyUnicode_WSTR(u); 3143 wchar_end = w + _PyUnicode_LENGTH(u); 3144 3145 if (PyUnicode_KIND(u) == PyUnicode_1BYTE_KIND) { 3146 one_byte = PyUnicode_1BYTE_DATA(u); 3147 for (; w < wchar_end; ++one_byte, ++w) 3148 *w = *one_byte; 3149 /* null-terminate the wstr */ 3150 *w = 0; 3151 } 3152 else if (PyUnicode_KIND(u) == PyUnicode_2BYTE_KIND) { 3153#if SIZEOF_WCHAR_T == 4 3154 two_bytes = PyUnicode_2BYTE_DATA(u); 3155 for (; w < wchar_end; ++two_bytes, ++w) 3156 *w = *two_bytes; 3157 /* null-terminate the wstr */ 3158 *w = 0; 3159#else 3160 /* sizeof(wchar_t) == 2 */ 3161 PyObject_FREE(_PyUnicode_WSTR(u)); 3162 _PyUnicode_WSTR(u) = NULL; 3163 Py_FatalError("Impossible unicode object state, wstr " 3164 "and str should share memory already."); 3165 return NULL; 3166#endif 3167 } 3168 else { 3169 assert(0 && "This should never happen."); 3170 } 3171 } 3172 } 3173 if (size != NULL) 3174 *size = PyUnicode_WSTR_LENGTH(u); 3175 return _PyUnicode_WSTR(u); 3176} 3177 3178Py_UNICODE * 3179PyUnicode_AsUnicode(PyObject *unicode) 3180{ 3181 return PyUnicode_AsUnicodeAndSize(unicode, NULL); 3182} 3183 3184 3185Py_ssize_t 3186PyUnicode_GetSize(PyObject *unicode) 3187{ 3188 if (!PyUnicode_Check(unicode)) { 3189 PyErr_BadArgument(); 3190 goto onError; 3191 } 3192 return PyUnicode_GET_SIZE(unicode); 3193 3194 onError: 3195 return -1; 3196} 3197 3198Py_ssize_t 3199PyUnicode_GetLength(PyObject *unicode) 3200{ 3201 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) { 3202 PyErr_BadArgument(); 3203 return -1; 3204 } 3205 3206 return PyUnicode_GET_LENGTH(unicode); 3207} 3208 3209Py_UCS4 3210PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index) 3211{ 3212 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) { 3213 PyErr_BadArgument(); 3214 return (Py_UCS4)-1; 3215 } 3216 if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) { 3217 
PyErr_SetString(PyExc_IndexError, "string index out of range"); 3218 return (Py_UCS4)-1; 3219 } 3220 return PyUnicode_READ_CHAR(unicode, index); 3221} 3222 3223int 3224PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch) 3225{ 3226 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) { 3227 PyErr_BadArgument(); 3228 return -1; 3229 } 3230 if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) { 3231 PyErr_SetString(PyExc_IndexError, "string index out of range"); 3232 return -1; 3233 } 3234 if (_PyUnicode_Dirty(unicode)) 3235 return -1; 3236 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode), 3237 index, ch); 3238 return 0; 3239} 3240 3241const char * 3242PyUnicode_GetDefaultEncoding(void) 3243{ 3244 return "utf-8"; 3245} 3246 3247/* create or adjust a UnicodeDecodeError */ 3248static void 3249make_decode_exception(PyObject **exceptionObject, 3250 const char *encoding, 3251 const char *input, Py_ssize_t length, 3252 Py_ssize_t startpos, Py_ssize_t endpos, 3253 const char *reason) 3254{ 3255 if (*exceptionObject == NULL) { 3256 *exceptionObject = PyUnicodeDecodeError_Create( 3257 encoding, input, length, startpos, endpos, reason); 3258 } 3259 else { 3260 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos)) 3261 goto onError; 3262 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos)) 3263 goto onError; 3264 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason)) 3265 goto onError; 3266 } 3267 return; 3268 3269onError: 3270 Py_DECREF(*exceptionObject); 3271 *exceptionObject = NULL; 3272} 3273 3274/* error handling callback helper: 3275 build arguments, call the callback and check the arguments, 3276 if no exception occurred, copy the replacement to the output 3277 and adjust various state variables. 
3278 return 0 on success, -1 on error 3279*/ 3280 3281static int 3282unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler, 3283 const char *encoding, const char *reason, 3284 const char **input, const char **inend, Py_ssize_t *startinpos, 3285 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr, 3286 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr) 3287{ 3288 static char *argparse = "O!n;decoding error handler must return (str, int) tuple"; 3289 3290 PyObject *restuple = NULL; 3291 PyObject *repunicode = NULL; 3292 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output); 3293 Py_ssize_t insize; 3294 Py_ssize_t requiredsize; 3295 Py_ssize_t newpos; 3296 const Py_UNICODE *repptr; 3297 PyObject *inputobj = NULL; 3298 Py_ssize_t repsize; 3299 int res = -1; 3300 3301 if (*errorHandler == NULL) { 3302 *errorHandler = PyCodec_LookupError(errors); 3303 if (*errorHandler == NULL) 3304 goto onError; 3305 } 3306 3307 make_decode_exception(exceptionObject, 3308 encoding, 3309 *input, *inend - *input, 3310 *startinpos, *endinpos, 3311 reason); 3312 if (*exceptionObject == NULL) 3313 goto onError; 3314 3315 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL); 3316 if (restuple == NULL) 3317 goto onError; 3318 if (!PyTuple_Check(restuple)) { 3319 PyErr_SetString(PyExc_TypeError, &argparse[4]); 3320 goto onError; 3321 } 3322 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos)) 3323 goto onError; 3324 3325 /* Copy back the bytes variables, which might have been modified by the 3326 callback */ 3327 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject); 3328 if (!inputobj) 3329 goto onError; 3330 if (!PyBytes_Check(inputobj)) { 3331 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes"); 3332 } 3333 *input = PyBytes_AS_STRING(inputobj); 3334 insize = PyBytes_GET_SIZE(inputobj); 3335 *inend = *input + insize; 3336 /* we can DECREF safely, as the 
exception has another reference, 3337 so the object won't go away. */ 3338 Py_DECREF(inputobj); 3339 3340 if (newpos<0) 3341 newpos = insize+newpos; 3342 if (newpos<0 || newpos>insize) { 3343 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos); 3344 goto onError; 3345 } 3346 3347 /* need more space? (at least enough for what we 3348 have+the replacement+the rest of the string (starting 3349 at the new input position), so we won't have to check space 3350 when there are no errors in the rest of the string) */ 3351 repptr = PyUnicode_AS_UNICODE(repunicode); 3352 repsize = PyUnicode_GET_SIZE(repunicode); 3353 requiredsize = *outpos + repsize + insize-newpos; 3354 if (requiredsize > outsize) { 3355 if (requiredsize<2*outsize) 3356 requiredsize = 2*outsize; 3357 if (PyUnicode_Resize((PyObject**)output, requiredsize) < 0) 3358 goto onError; 3359 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos; 3360 } 3361 *endinpos = newpos; 3362 *inptr = *input + newpos; 3363 Py_UNICODE_COPY(*outptr, repptr, repsize); 3364 *outptr += repsize; 3365 *outpos += repsize; 3366 3367 /* we made it! */ 3368 res = 0; 3369 3370 onError: 3371 Py_XDECREF(restuple); 3372 return res; 3373} 3374 3375/* --- UTF-7 Codec -------------------------------------------------------- */ 3376 3377/* See RFC2152 for details. We encode conservatively and decode liberally. */ 3378 3379/* Three simple macros defining base-64. */ 3380 3381/* Is c a base-64 character? */ 3382 3383#define IS_BASE64(c) \ 3384 (((c) >= 'A' && (c) <= 'Z') || \ 3385 ((c) >= 'a' && (c) <= 'z') || \ 3386 ((c) >= '0' && (c) <= '9') || \ 3387 (c) == '+' || (c) == '/') 3388 3389/* given that c is a base-64 character, what is its base-64 value? */ 3390 3391#define FROM_BASE64(c) \ 3392 (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' : \ 3393 ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 : \ 3394 ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 : \ 3395 (c) == '+' ? 
62 : 63) 3396 3397/* What is the base-64 character of the bottom 6 bits of n? */ 3398 3399#define TO_BASE64(n) \ 3400 ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f]) 3401 3402/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be 3403 * decoded as itself. We are permissive on decoding; the only ASCII 3404 * byte not decoding to itself is the + which begins a base64 3405 * string. */ 3406 3407#define DECODE_DIRECT(c) \ 3408 ((c) <= 127 && (c) != '+') 3409 3410/* The UTF-7 encoder treats ASCII characters differently according to 3411 * whether they are Set D, Set O, Whitespace, or special (i.e. none of 3412 * the above). See RFC2152. This array identifies these different 3413 * sets: 3414 * 0 : "Set D" 3415 * alphanumeric and '(),-./:? 3416 * 1 : "Set O" 3417 * !"#$%&*;<=>@[]^_`{|} 3418 * 2 : "whitespace" 3419 * ht nl cr sp 3420 * 3 : special (must be base64 encoded) 3421 * everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127) 3422 */ 3423 3424static 3425char utf7_category[128] = { 3426/* nul soh stx etx eot enq ack bel bs ht nl vt np cr so si */ 3427 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3, 3428/* dle dc1 dc2 dc3 dc4 nak syn etb can em sub esc fs gs rs us */ 3429 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3430/* sp ! " # $ % & ' ( ) * + , - . / */ 3431 2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 0, 3432/* 0 1 2 3 4 5 6 7 8 9 : ; < = > ? */ 3433 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 3434/* @ A B C D E F G H I J K L M N O */ 3435 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3436/* P Q R S T U V W X Y Z [ \ ] ^ _ */ 3437 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1, 3438/* ` a b c d e f g h i j k l m n o */ 3439 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3440/* p q r s t u v w x y z { | } ~ del */ 3441 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3, 3442}; 3443 3444/* ENCODE_DIRECT: this character should be encoded as itself. 
The 3445 * answer depends on whether we are encoding set O as itself, and also 3446 * on whether we are encoding whitespace as itself. RFC2152 makes it 3447 * clear that the answers to these questions vary between 3448 * applications, so this code needs to be flexible. */ 3449 3450#define ENCODE_DIRECT(c, directO, directWS) \ 3451 ((c) < 128 && (c) > 0 && \ 3452 ((utf7_category[(c)] == 0) || \ 3453 (directWS && (utf7_category[(c)] == 2)) || \ 3454 (directO && (utf7_category[(c)] == 1)))) 3455 3456PyObject * 3457PyUnicode_DecodeUTF7(const char *s, 3458 Py_ssize_t size, 3459 const char *errors) 3460{ 3461 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL); 3462} 3463 3464/* The decoder. The only state we preserve is our read position, 3465 * i.e. how many characters we have consumed. So if we end in the 3466 * middle of a shift sequence we have to back off the read position 3467 * and the output to the beginning of the sequence, otherwise we lose 3468 * all the shift state (seen bits, number of bits seen, high 3469 * surrogate). 
*/ 3470 3471PyObject * 3472PyUnicode_DecodeUTF7Stateful(const char *s, 3473 Py_ssize_t size, 3474 const char *errors, 3475 Py_ssize_t *consumed) 3476{ 3477 const char *starts = s; 3478 Py_ssize_t startinpos; 3479 Py_ssize_t endinpos; 3480 Py_ssize_t outpos; 3481 const char *e; 3482 PyUnicodeObject *unicode; 3483 Py_UNICODE *p; 3484 const char *errmsg = ""; 3485 int inShift = 0; 3486 Py_UNICODE *shiftOutStart; 3487 unsigned int base64bits = 0; 3488 unsigned long base64buffer = 0; 3489 Py_UNICODE surrogate = 0; 3490 PyObject *errorHandler = NULL; 3491 PyObject *exc = NULL; 3492 3493 unicode = _PyUnicode_New(size); 3494 if (!unicode) 3495 return NULL; 3496 if (size == 0) { 3497 if (consumed) 3498 *consumed = 0; 3499 return (PyObject *)unicode; 3500 } 3501 3502 p = PyUnicode_AS_UNICODE(unicode); 3503 shiftOutStart = p; 3504 e = s + size; 3505 3506 while (s < e) { 3507 Py_UNICODE ch; 3508 restart: 3509 ch = (unsigned char) *s; 3510 3511 if (inShift) { /* in a base-64 section */ 3512 if (IS_BASE64(ch)) { /* consume a base-64 character */ 3513 base64buffer = (base64buffer << 6) | FROM_BASE64(ch); 3514 base64bits += 6; 3515 s++; 3516 if (base64bits >= 16) { 3517 /* we have enough bits for a UTF-16 value */ 3518 Py_UNICODE outCh = (Py_UNICODE) 3519 (base64buffer >> (base64bits-16)); 3520 base64bits -= 16; 3521 base64buffer &= (1 << base64bits) - 1; /* clear high bits */ 3522 if (surrogate) { 3523 /* expecting a second surrogate */ 3524 if (outCh >= 0xDC00 && outCh <= 0xDFFF) { 3525#ifdef Py_UNICODE_WIDE 3526 *p++ = (((surrogate & 0x3FF)<<10) 3527 | (outCh & 0x3FF)) + 0x10000; 3528#else 3529 *p++ = surrogate; 3530 *p++ = outCh; 3531#endif 3532 surrogate = 0; 3533 } 3534 else { 3535 surrogate = 0; 3536 errmsg = "second surrogate missing"; 3537 goto utf7Error; 3538 } 3539 } 3540 else if (outCh >= 0xD800 && outCh <= 0xDBFF) { 3541 /* first surrogate */ 3542 surrogate = outCh; 3543 } 3544 else if (outCh >= 0xDC00 && outCh <= 0xDFFF) { 3545 errmsg = "unexpected second surrogate"; 
3546 goto utf7Error; 3547 } 3548 else { 3549 *p++ = outCh; 3550 } 3551 } 3552 } 3553 else { /* now leaving a base-64 section */ 3554 inShift = 0; 3555 s++; 3556 if (surrogate) { 3557 errmsg = "second surrogate missing at end of shift sequence"; 3558 goto utf7Error; 3559 } 3560 if (base64bits > 0) { /* left-over bits */ 3561 if (base64bits >= 6) { 3562 /* We've seen at least one base-64 character */ 3563 errmsg = "partial character in shift sequence"; 3564 goto utf7Error; 3565 } 3566 else { 3567 /* Some bits remain; they should be zero */ 3568 if (base64buffer != 0) { 3569 errmsg = "non-zero padding bits in shift sequence"; 3570 goto utf7Error; 3571 } 3572 } 3573 } 3574 if (ch != '-') { 3575 /* '-' is absorbed; other terminating 3576 characters are preserved */ 3577 *p++ = ch; 3578 } 3579 } 3580 } 3581 else if ( ch == '+' ) { 3582 startinpos = s-starts; 3583 s++; /* consume '+' */ 3584 if (s < e && *s == '-') { /* '+-' encodes '+' */ 3585 s++; 3586 *p++ = '+'; 3587 } 3588 else { /* begin base64-encoded section */ 3589 inShift = 1; 3590 shiftOutStart = p; 3591 base64bits = 0; 3592 } 3593 } 3594 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */ 3595 *p++ = ch; 3596 s++; 3597 } 3598 else { 3599 startinpos = s-starts; 3600 s++; 3601 errmsg = "unexpected special character"; 3602 goto utf7Error; 3603 } 3604 continue; 3605utf7Error: 3606 outpos = p-PyUnicode_AS_UNICODE(unicode); 3607 endinpos = s-starts; 3608 if (unicode_decode_call_errorhandler( 3609 errors, &errorHandler, 3610 "utf7", errmsg, 3611 &starts, &e, &startinpos, &endinpos, &exc, &s, 3612 &unicode, &outpos, &p)) 3613 goto onError; 3614 } 3615 3616 /* end of string */ 3617 3618 if (inShift && !consumed) { /* in shift sequence, no more to follow */ 3619 /* if we're in an inconsistent state, that's an error */ 3620 if (surrogate || 3621 (base64bits >= 6) || 3622 (base64bits > 0 && base64buffer != 0)) { 3623 outpos = p-PyUnicode_AS_UNICODE(unicode); 3624 endinpos = size; 3625 if 
(unicode_decode_call_errorhandler( 3626 errors, &errorHandler, 3627 "utf7", "unterminated shift sequence", 3628 &starts, &e, &startinpos, &endinpos, &exc, &s, 3629 &unicode, &outpos, &p)) 3630 goto onError; 3631 if (s < e) 3632 goto restart; 3633 } 3634 } 3635 3636 /* return state */ 3637 if (consumed) { 3638 if (inShift) { 3639 p = shiftOutStart; /* back off output */ 3640 *consumed = startinpos; 3641 } 3642 else { 3643 *consumed = s-starts; 3644 } 3645 } 3646 3647 if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0) 3648 goto onError; 3649 3650 Py_XDECREF(errorHandler); 3651 Py_XDECREF(exc); 3652 if (_PyUnicode_READY_REPLACE(&unicode)) { 3653 Py_DECREF(unicode); 3654 return NULL; 3655 } 3656 return (PyObject *)unicode; 3657 3658 onError: 3659 Py_XDECREF(errorHandler); 3660 Py_XDECREF(exc); 3661 Py_DECREF(unicode); 3662 return NULL; 3663} 3664 3665 3666PyObject * 3667PyUnicode_EncodeUTF7(const Py_UNICODE *s, 3668 Py_ssize_t size, 3669 int base64SetO, 3670 int base64WhiteSpace, 3671 const char *errors) 3672{ 3673 PyObject *v; 3674 /* It might be possible to tighten this worst case */ 3675 Py_ssize_t allocated = 8 * size; 3676 int inShift = 0; 3677 Py_ssize_t i = 0; 3678 unsigned int base64bits = 0; 3679 unsigned long base64buffer = 0; 3680 char * out; 3681 char * start; 3682 3683 if (size == 0) 3684 return PyBytes_FromStringAndSize(NULL, 0); 3685 3686 if (allocated / 8 != size) 3687 return PyErr_NoMemory(); 3688 3689 v = PyBytes_FromStringAndSize(NULL, allocated); 3690 if (v == NULL) 3691 return NULL; 3692 3693 start = out = PyBytes_AS_STRING(v); 3694 for (;i < size; ++i) { 3695 Py_UNICODE ch = s[i]; 3696 3697 if (inShift) { 3698 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) { 3699 /* shifting out */ 3700 if (base64bits) { /* output remaining bits */ 3701 *out++ = TO_BASE64(base64buffer << (6-base64bits)); 3702 base64buffer = 0; 3703 base64bits = 0; 3704 } 3705 inShift = 0; 3706 /* Characters not in the BASE64 set implicitly 
unshift the sequence 3707 so no '-' is required, except if the character is itself a '-' */ 3708 if (IS_BASE64(ch) || ch == '-') { 3709 *out++ = '-'; 3710 } 3711 *out++ = (char) ch; 3712 } 3713 else { 3714 goto encode_char; 3715 } 3716 } 3717 else { /* not in a shift sequence */ 3718 if (ch == '+') { 3719 *out++ = '+'; 3720 *out++ = '-'; 3721 } 3722 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) { 3723 *out++ = (char) ch; 3724 } 3725 else { 3726 *out++ = '+'; 3727 inShift = 1; 3728 goto encode_char; 3729 } 3730 } 3731 continue; 3732encode_char: 3733#ifdef Py_UNICODE_WIDE 3734 if (ch >= 0x10000) { 3735 /* code first surrogate */ 3736 base64bits += 16; 3737 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10); 3738 while (base64bits >= 6) { 3739 *out++ = TO_BASE64(base64buffer >> (base64bits-6)); 3740 base64bits -= 6; 3741 } 3742 /* prepare second surrogate */ 3743 ch = 0xDC00 | ((ch-0x10000) & 0x3FF); 3744 } 3745#endif 3746 base64bits += 16; 3747 base64buffer = (base64buffer << 16) | ch; 3748 while (base64bits >= 6) { 3749 *out++ = TO_BASE64(base64buffer >> (base64bits-6)); 3750 base64bits -= 6; 3751 } 3752 } 3753 if (base64bits) 3754 *out++= TO_BASE64(base64buffer << (6-base64bits) ); 3755 if (inShift) 3756 *out++ = '-'; 3757 if (_PyBytes_Resize(&v, out - start) < 0) 3758 return NULL; 3759 return v; 3760} 3761 3762#undef IS_BASE64 3763#undef FROM_BASE64 3764#undef TO_BASE64 3765#undef DECODE_DIRECT 3766#undef ENCODE_DIRECT 3767 3768/* --- UTF-8 Codec -------------------------------------------------------- */ 3769 3770static 3771char utf8_code_length[256] = { 3772 /* Map UTF-8 encoded prefix byte to sequence length. Zero means 3773 illegal prefix. 
See RFC 3629 for details */ 3774 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */ 3775 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3776 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3777 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3778 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3779 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3780 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3781 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */ 3782 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */ 3783 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3784 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3785 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */ 3786 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */ 3787 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */ 3788 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */ 3789 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 /* F0-F4 + F5-FF */ 3790}; 3791 3792PyObject * 3793PyUnicode_DecodeUTF8(const char *s, 3794 Py_ssize_t size, 3795 const char *errors) 3796{ 3797 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL); 3798} 3799 3800/* Mask to check or force alignment of a pointer to C 'long' boundaries */ 3801#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1) 3802 3803/* Mask to quickly check whether a C 'long' contains a 3804 non-ASCII, UTF8-encoded char. */ 3805#if (SIZEOF_LONG == 8) 3806# define ASCII_CHAR_MASK 0x8080808080808080L 3807#elif (SIZEOF_LONG == 4) 3808# define ASCII_CHAR_MASK 0x80808080L 3809#else 3810# error C 'long' size should be either 4 or 8! 3811#endif 3812 3813/* Scans a UTF-8 string and returns the maximum character to be expected, 3814 the size of the decoded unicode string and if any major errors were 3815 encountered. 
3816 3817 This function does check basic UTF-8 sanity, it does however NOT CHECK 3818 if the string contains surrogates, and if all continuation bytes are 3819 within the correct ranges, these checks are performed in 3820 PyUnicode_DecodeUTF8Stateful. 3821 3822 If it sets has_errors to 1, it means the value of unicode_size and max_char 3823 will be bogus and you should not rely on useful information in them. 3824 */ 3825static Py_UCS4 3826utf8_max_char_size_and_has_errors(const char *s, Py_ssize_t string_size, 3827 Py_ssize_t *unicode_size, Py_ssize_t* consumed, 3828 int *has_errors) 3829{ 3830 Py_ssize_t n; 3831 Py_ssize_t char_count = 0; 3832 Py_UCS4 max_char = 127, new_max; 3833 Py_UCS4 upper_bound; 3834 const unsigned char *p = (const unsigned char *)s; 3835 const unsigned char *end = p + string_size; 3836 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK); 3837 int err = 0; 3838 3839 for (; p < end && !err; ++p, ++char_count) { 3840 /* Only check value if it's not a ASCII char... */ 3841 if (*p < 0x80) { 3842 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for 3843 an explanation. */ 3844 if (!((size_t) p & LONG_PTR_MASK)) { 3845 /* Help register allocation */ 3846 register const unsigned char *_p = p; 3847 while (_p < aligned_end) { 3848 unsigned long value = *(unsigned long *) _p; 3849 if (value & ASCII_CHAR_MASK) 3850 break; 3851 _p += SIZEOF_LONG; 3852 char_count += SIZEOF_LONG; 3853 } 3854 p = _p; 3855 if (p == end) 3856 break; 3857 } 3858 } 3859 if (*p >= 0x80) { 3860 n = utf8_code_length[*p]; 3861 new_max = max_char; 3862 switch (n) { 3863 /* invalid start byte */ 3864 case 0: 3865 err = 1; 3866 break; 3867 case 2: 3868 /* Code points between 0x00FF and 0x07FF inclusive. 
3869 Approximate the upper bound of the code point, 3870 if this flips over 255 we can be sure it will be more 3871 than 255 and the string will need 2 bytes per code coint, 3872 if it stays under or equal to 255, we can be sure 1 byte 3873 is enough. 3874 ((*p & 0b00011111) << 6) | 0b00111111 */ 3875 upper_bound = ((*p & 0x1F) << 6) | 0x3F; 3876 if (max_char < upper_bound) 3877 new_max = upper_bound; 3878 /* Ensure we track at least that we left ASCII space. */ 3879 if (new_max < 128) 3880 new_max = 128; 3881 break; 3882 case 3: 3883 /* Between 0x0FFF and 0xFFFF inclusive, so values are 3884 always > 255 and <= 65535 and will always need 2 bytes. */ 3885 if (max_char < 65535) 3886 new_max = 65535; 3887 break; 3888 case 4: 3889 /* Code point will be above 0xFFFF for sure in this case. */ 3890 new_max = 65537; 3891 break; 3892 /* Internal error, this should be caught by the first if */ 3893 case 1: 3894 default: 3895 assert(0 && "Impossible case in utf8_max_char_and_size"); 3896 err = 1; 3897 } 3898 /* Instead of number of overall bytes for this code point, 3899 n containts the number of following bytes: */ 3900 --n; 3901 /* Check if the follow up chars are all valid continuation bytes */ 3902 if (n >= 1) { 3903 const unsigned char *cont; 3904 if ((p + n) >= end) { 3905 if (consumed == 0) 3906 /* incomplete data, non-incremental decoding */ 3907 err = 1; 3908 break; 3909 } 3910 for (cont = p + 1; cont < (p + n); ++cont) { 3911 if ((*cont & 0xc0) != 0x80) { 3912 err = 1; 3913 break; 3914 } 3915 } 3916 p += n; 3917 } 3918 else 3919 err = 1; 3920 max_char = new_max; 3921 } 3922 } 3923 3924 if (unicode_size) 3925 *unicode_size = char_count; 3926 if (has_errors) 3927 *has_errors = err; 3928 return max_char; 3929} 3930 3931/* Similar to PyUnicode_WRITE but can also write into wstr field 3932 of the legacy unicode representation */ 3933#define WRITE_FLEXIBLE_OR_WSTR(kind, buf, index, value) \ 3934 do { \ 3935 const int k_ = (kind); \ 3936 if (k_ == PyUnicode_WCHAR_KIND) \ 
3937 ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value); \ 3938 else if (k_ == PyUnicode_1BYTE_KIND) \ 3939 ((unsigned char *)(buf))[(index)] = (unsigned char)(value); \ 3940 else if (k_ == PyUnicode_2BYTE_KIND) \ 3941 ((Py_UCS2 *)(buf))[(index)] = (Py_UCS2)(value); \ 3942 else \ 3943 ((Py_UCS4 *)(buf))[(index)] = (Py_UCS4)(value); \ 3944 } while (0) 3945 3946PyObject * 3947PyUnicode_DecodeUTF8Stateful(const char *s, 3948 Py_ssize_t size, 3949 const char *errors, 3950 Py_ssize_t *consumed) 3951{ 3952 const char *starts = s; 3953 int n; 3954 int k; 3955 Py_ssize_t startinpos; 3956 Py_ssize_t endinpos; 3957 const char *e, *aligned_end; 3958 PyUnicodeObject *unicode; 3959 const char *errmsg = ""; 3960 PyObject *errorHandler = NULL; 3961 PyObject *exc = NULL; 3962 Py_UCS4 maxchar = 0; 3963 Py_ssize_t unicode_size; 3964 Py_ssize_t i; 3965 int kind; 3966 void *data; 3967 int has_errors; 3968 Py_UNICODE *error_outptr; 3969#if SIZEOF_WCHAR_T == 2 3970 Py_ssize_t wchar_offset = 0; 3971#endif 3972 3973 if (size == 0) { 3974 if (consumed) 3975 *consumed = 0; 3976 return (PyObject *)PyUnicode_New(0, 0); 3977 } 3978 maxchar = utf8_max_char_size_and_has_errors(s, size, &unicode_size, 3979 consumed, &has_errors); 3980 if (has_errors) { 3981 unicode = _PyUnicode_New(size); 3982 if (!unicode) 3983 return NULL; 3984 kind = PyUnicode_WCHAR_KIND; 3985 data = PyUnicode_AS_UNICODE(unicode); 3986 assert(data != NULL); 3987 } 3988 else { 3989 unicode = (PyUnicodeObject *)PyUnicode_New(unicode_size, maxchar); 3990 if (!unicode) 3991 return NULL; 3992 /* When the string is ASCII only, just use memcpy and return. 3993 unicode_size may be != size if there is an incomplete UTF-8 3994 sequence at the end of the ASCII block. 
*/ 3995 if (maxchar < 128 && size == unicode_size) { 3996 Py_MEMCPY(PyUnicode_1BYTE_DATA(unicode), s, unicode_size); 3997 return (PyObject *)unicode; 3998 } 3999 kind = PyUnicode_KIND(unicode); 4000 data = PyUnicode_DATA(unicode); 4001 } 4002 /* Unpack UTF-8 encoded data */ 4003 i = 0; 4004 e = s + size; 4005 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK); 4006 4007 while (s < e) { 4008 Py_UCS4 ch = (unsigned char)*s; 4009 4010 if (ch < 0x80) { 4011 /* Fast path for runs of ASCII characters. Given that common UTF-8 4012 input will consist of an overwhelming majority of ASCII 4013 characters, we try to optimize for this case by checking 4014 as many characters as a C 'long' can contain. 4015 First, check if we can do an aligned read, as most CPUs have 4016 a penalty for unaligned reads. 4017 */ 4018 if (!((size_t) s & LONG_PTR_MASK)) { 4019 /* Help register allocation */ 4020 register const char *_s = s; 4021 register Py_ssize_t _i = i; 4022 while (_s < aligned_end) { 4023 /* Read a whole long at a time (either 4 or 8 bytes), 4024 and do a fast unrolled copy if it only contains ASCII 4025 characters. 
*/ 4026 unsigned long value = *(unsigned long *) _s; 4027 if (value & ASCII_CHAR_MASK) 4028 break; 4029 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+0, _s[0]); 4030 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+1, _s[1]); 4031 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+2, _s[2]); 4032 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+3, _s[3]); 4033#if (SIZEOF_LONG == 8) 4034 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+4, _s[4]); 4035 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+5, _s[5]); 4036 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+6, _s[6]); 4037 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+7, _s[7]); 4038#endif 4039 _s += SIZEOF_LONG; 4040 _i += SIZEOF_LONG; 4041 } 4042 s = _s; 4043 i = _i; 4044 if (s == e) 4045 break; 4046 ch = (unsigned char)*s; 4047 } 4048 } 4049 4050 if (ch < 0x80) { 4051 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch); 4052 s++; 4053 continue; 4054 } 4055 4056 n = utf8_code_length[ch]; 4057 4058 if (s + n > e) { 4059 if (consumed) 4060 break; 4061 else { 4062 errmsg = "unexpected end of data"; 4063 startinpos = s-starts; 4064 endinpos = startinpos+1; 4065 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++) 4066 endinpos++; 4067 goto utf8Error; 4068 } 4069 } 4070 4071 switch (n) { 4072 4073 case 0: 4074 errmsg = "invalid start byte"; 4075 startinpos = s-starts; 4076 endinpos = startinpos+1; 4077 goto utf8Error; 4078 4079 case 1: 4080 errmsg = "internal error"; 4081 startinpos = s-starts; 4082 endinpos = startinpos+1; 4083 goto utf8Error; 4084 4085 case 2: 4086 if ((s[1] & 0xc0) != 0x80) { 4087 errmsg = "invalid continuation byte"; 4088 startinpos = s-starts; 4089 endinpos = startinpos + 1; 4090 goto utf8Error; 4091 } 4092 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f); 4093 assert ((ch > 0x007F) && (ch <= 0x07FF)); 4094 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch); 4095 break; 4096 4097 case 3: 4098 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf 4099 will result in surrogates in range d800-dfff. Surrogates are 4100 not valid UTF-8 so they are rejected. 
4101 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf 4102 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */ 4103 if ((s[1] & 0xc0) != 0x80 || 4104 (s[2] & 0xc0) != 0x80 || 4105 ((unsigned char)s[0] == 0xE0 && 4106 (unsigned char)s[1] < 0xA0) || 4107 ((unsigned char)s[0] == 0xED && 4108 (unsigned char)s[1] > 0x9F)) { 4109 errmsg = "invalid continuation byte"; 4110 startinpos = s-starts; 4111 endinpos = startinpos + 1; 4112 4113 /* if s[1] first two bits are 1 and 0, then the invalid 4114 continuation byte is s[2], so increment endinpos by 1, 4115 if not, s[1] is invalid and endinpos doesn't need to 4116 be incremented. */ 4117 if ((s[1] & 0xC0) == 0x80) 4118 endinpos++; 4119 goto utf8Error; 4120 } 4121 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f); 4122 assert ((ch > 0x07FF) && (ch <= 0xFFFF)); 4123 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch); 4124 break; 4125 4126 case 4: 4127 if ((s[1] & 0xc0) != 0x80 || 4128 (s[2] & 0xc0) != 0x80 || 4129 (s[3] & 0xc0) != 0x80 || 4130 ((unsigned char)s[0] == 0xF0 && 4131 (unsigned char)s[1] < 0x90) || 4132 ((unsigned char)s[0] == 0xF4 && 4133 (unsigned char)s[1] > 0x8F)) { 4134 errmsg = "invalid continuation byte"; 4135 startinpos = s-starts; 4136 endinpos = startinpos + 1; 4137 if ((s[1] & 0xC0) == 0x80) { 4138 endinpos++; 4139 if ((s[2] & 0xC0) == 0x80) 4140 endinpos++; 4141 } 4142 goto utf8Error; 4143 } 4144 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) + 4145 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f); 4146 assert ((ch > 0xFFFF) && (ch <= 0x10ffff)); 4147 4148 /* If the string is flexible or we have native UCS-4, write 4149 directly.. 
*/ 4150 if (sizeof(Py_UNICODE) > 2 || kind != PyUnicode_WCHAR_KIND) 4151 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch); 4152 4153 else { 4154 /* compute and append the two surrogates: */ 4155 4156 /* translate from 10000..10FFFF to 0..FFFF */ 4157 ch -= 0x10000; 4158 4159 /* high surrogate = top 10 bits added to D800 */ 4160 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, 4161 (Py_UNICODE)(0xD800 + (ch >> 10))); 4162 4163 /* low surrogate = bottom 10 bits added to DC00 */ 4164 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, 4165 (Py_UNICODE)(0xDC00 + (ch & 0x03FF))); 4166 } 4167#if SIZEOF_WCHAR_T == 2 4168 wchar_offset++; 4169#endif 4170 break; 4171 } 4172 s += n; 4173 continue; 4174 4175 utf8Error: 4176 /* If this is not yet a resizable string, make it one.. */ 4177 if (kind != PyUnicode_WCHAR_KIND) { 4178 const Py_UNICODE *u; 4179 PyUnicodeObject *new_unicode = _PyUnicode_New(size); 4180 if (!new_unicode) 4181 goto onError; 4182 u = PyUnicode_AsUnicode((PyObject *)unicode); 4183 if (!u) 4184 goto onError; 4185#if SIZEOF_WCHAR_T == 2 4186 i += wchar_offset; 4187#endif 4188 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(new_unicode), u, i); 4189 Py_DECREF(unicode); 4190 unicode = new_unicode; 4191 kind = 0; 4192 data = PyUnicode_AS_UNICODE(new_unicode); 4193 assert(data != NULL); 4194 } 4195 error_outptr = PyUnicode_AS_UNICODE(unicode) + i; 4196 if (unicode_decode_call_errorhandler( 4197 errors, &errorHandler, 4198 "utf8", errmsg, 4199 &starts, &e, &startinpos, &endinpos, &exc, &s, 4200 &unicode, &i, &error_outptr)) 4201 goto onError; 4202 /* Update data because unicode_decode_call_errorhandler might have 4203 re-created or resized the unicode object. 
*/ 4204 data = PyUnicode_AS_UNICODE(unicode); 4205 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK); 4206 } 4207 /* Ensure the unicode_size calculation above was correct: */ 4208 assert(kind == PyUnicode_WCHAR_KIND || i == unicode_size); 4209 4210 if (consumed) 4211 *consumed = s-starts; 4212 4213 /* Adjust length and ready string when it contained errors and 4214 is of the old resizable kind. */ 4215 if (kind == PyUnicode_WCHAR_KIND) { 4216 if (PyUnicode_Resize((PyObject**)&unicode, i) < 0) 4217 goto onError; 4218 } 4219 4220 Py_XDECREF(errorHandler); 4221 Py_XDECREF(exc); 4222 if (_PyUnicode_READY_REPLACE(&unicode)) { 4223 Py_DECREF(unicode); 4224 return NULL; 4225 } 4226 return (PyObject *)unicode; 4227 4228 onError: 4229 Py_XDECREF(errorHandler); 4230 Py_XDECREF(exc); 4231 Py_DECREF(unicode); 4232 return NULL; 4233} 4234 4235#undef WRITE_FLEXIBLE_OR_WSTR 4236 4237#ifdef __APPLE__ 4238 4239/* Simplified UTF-8 decoder using surrogateescape error handler, 4240 used to decode the command line arguments on Mac OS X. 
*/ 4241 4242wchar_t* 4243_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size) 4244{ 4245 int n; 4246 const char *e; 4247 wchar_t *unicode, *p; 4248 4249 /* Note: size will always be longer than the resulting Unicode 4250 character count */ 4251 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) { 4252 PyErr_NoMemory(); 4253 return NULL; 4254 } 4255 unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t)); 4256 if (!unicode) 4257 return NULL; 4258 4259 /* Unpack UTF-8 encoded data */ 4260 p = unicode; 4261 e = s + size; 4262 while (s < e) { 4263 Py_UCS4 ch = (unsigned char)*s; 4264 4265 if (ch < 0x80) { 4266 *p++ = (wchar_t)ch; 4267 s++; 4268 continue; 4269 } 4270 4271 n = utf8_code_length[ch]; 4272 if (s + n > e) { 4273 goto surrogateescape; 4274 } 4275 4276 switch (n) { 4277 case 0: 4278 case 1: 4279 goto surrogateescape; 4280 4281 case 2: 4282 if ((s[1] & 0xc0) != 0x80) 4283 goto surrogateescape; 4284 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f); 4285 assert ((ch > 0x007F) && (ch <= 0x07FF)); 4286 *p++ = (wchar_t)ch; 4287 break; 4288 4289 case 3: 4290 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf 4291 will result in surrogates in range d800-dfff. Surrogates are 4292 not valid UTF-8 so they are rejected. 
4293 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf 4294 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */ 4295 if ((s[1] & 0xc0) != 0x80 || 4296 (s[2] & 0xc0) != 0x80 || 4297 ((unsigned char)s[0] == 0xE0 && 4298 (unsigned char)s[1] < 0xA0) || 4299 ((unsigned char)s[0] == 0xED && 4300 (unsigned char)s[1] > 0x9F)) { 4301 4302 goto surrogateescape; 4303 } 4304 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f); 4305 assert ((ch > 0x07FF) && (ch <= 0xFFFF)); 4306 *p++ = (wchar_t)ch; 4307 break; 4308 4309 case 4: 4310 if ((s[1] & 0xc0) != 0x80 || 4311 (s[2] & 0xc0) != 0x80 || 4312 (s[3] & 0xc0) != 0x80 || 4313 ((unsigned char)s[0] == 0xF0 && 4314 (unsigned char)s[1] < 0x90) || 4315 ((unsigned char)s[0] == 0xF4 && 4316 (unsigned char)s[1] > 0x8F)) { 4317 goto surrogateescape; 4318 } 4319 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) + 4320 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f); 4321 assert ((ch > 0xFFFF) && (ch <= 0x10ffff)); 4322 4323#if SIZEOF_WCHAR_T == 4 4324 *p++ = (wchar_t)ch; 4325#else 4326 /* compute and append the two surrogates: */ 4327 4328 /* translate from 10000..10FFFF to 0..FFFF */ 4329 ch -= 0x10000; 4330 4331 /* high surrogate = top 10 bits added to D800 */ 4332 *p++ = (wchar_t)(0xD800 + (ch >> 10)); 4333 4334 /* low surrogate = bottom 10 bits added to DC00 */ 4335 *p++ = (wchar_t)(0xDC00 + (ch & 0x03FF)); 4336#endif 4337 break; 4338 } 4339 s += n; 4340 continue; 4341 4342 surrogateescape: 4343 *p++ = 0xDC00 + ch; 4344 s++; 4345 } 4346 *p = L'\0'; 4347 return unicode; 4348} 4349 4350#endif /* __APPLE__ */ 4351 4352/* Primary internal function which creates utf8 encoded bytes objects. 4353 4354 Allocation strategy: if the string is short, convert into a stack buffer 4355 and allocate exactly as much space needed at the end. Else allocate the 4356 maximum possible needed (4 result bytes per Unicode character), and return 4357 the excess memory at the end. 
4358*/ 4359PyObject * 4360_PyUnicode_AsUTF8String(PyObject *obj, const char *errors) 4361{ 4362#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */ 4363 4364 Py_ssize_t i; /* index into s of next input byte */ 4365 PyObject *result; /* result string object */ 4366 char *p; /* next free byte in output buffer */ 4367 Py_ssize_t nallocated; /* number of result bytes allocated */ 4368 Py_ssize_t nneeded; /* number of result bytes needed */ 4369 char stackbuf[MAX_SHORT_UNICHARS * 4]; 4370 PyObject *errorHandler = NULL; 4371 PyObject *exc = NULL; 4372 int kind; 4373 void *data; 4374 Py_ssize_t size; 4375 PyUnicodeObject *unicode = (PyUnicodeObject *)obj; 4376#if SIZEOF_WCHAR_T == 2 4377 Py_ssize_t wchar_offset = 0; 4378#endif 4379 4380 if (!PyUnicode_Check(unicode)) { 4381 PyErr_BadArgument(); 4382 return NULL; 4383 } 4384 4385 if (PyUnicode_READY(unicode) == -1) 4386 return NULL; 4387 4388 if (PyUnicode_UTF8(unicode)) 4389 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode), 4390 PyUnicode_UTF8_LENGTH(unicode)); 4391 4392 kind = PyUnicode_KIND(unicode); 4393 data = PyUnicode_DATA(unicode); 4394 size = PyUnicode_GET_LENGTH(unicode); 4395 4396 assert(size >= 0); 4397 4398 if (size <= MAX_SHORT_UNICHARS) { 4399 /* Write into the stack buffer; nallocated can't overflow. 4400 * At the end, we'll allocate exactly as much heap space as it 4401 * turns out we need. 4402 */ 4403 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int); 4404 result = NULL; /* will allocate after we're done */ 4405 p = stackbuf; 4406 } 4407 else { 4408 /* Overallocate on the heap, and give the excess back at the end. */ 4409 nallocated = size * 4; 4410 if (nallocated / 4 != size) /* overflow! 
*/ 4411 return PyErr_NoMemory(); 4412 result = PyBytes_FromStringAndSize(NULL, nallocated); 4413 if (result == NULL) 4414 return NULL; 4415 p = PyBytes_AS_STRING(result); 4416 } 4417 4418 for (i = 0; i < size;) { 4419 Py_UCS4 ch = PyUnicode_READ(kind, data, i++); 4420 4421 if (ch < 0x80) 4422 /* Encode ASCII */ 4423 *p++ = (char) ch; 4424 4425 else if (ch < 0x0800) { 4426 /* Encode Latin-1 */ 4427 *p++ = (char)(0xc0 | (ch >> 6)); 4428 *p++ = (char)(0x80 | (ch & 0x3f)); 4429 } else if (0xD800 <= ch && ch <= 0xDFFF) { 4430 Py_ssize_t newpos; 4431 PyObject *rep; 4432 Py_ssize_t repsize, k, startpos; 4433 startpos = i-1; 4434#if SIZEOF_WCHAR_T == 2 4435 startpos += wchar_offset; 4436#endif 4437 rep = unicode_encode_call_errorhandler( 4438 errors, &errorHandler, "utf-8", "surrogates not allowed", 4439 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode), 4440 &exc, startpos, startpos+1, &newpos); 4441 if (!rep) 4442 goto error; 4443 4444 if (PyBytes_Check(rep)) 4445 repsize = PyBytes_GET_SIZE(rep); 4446 else 4447 repsize = PyUnicode_GET_SIZE(rep); 4448 4449 if (repsize > 4) { 4450 Py_ssize_t offset; 4451 4452 if (result == NULL) 4453 offset = p - stackbuf; 4454 else 4455 offset = p - PyBytes_AS_STRING(result); 4456 4457 if (nallocated > PY_SSIZE_T_MAX - repsize + 4) { 4458 /* integer overflow */ 4459 PyErr_NoMemory(); 4460 goto error; 4461 } 4462 nallocated += repsize - 4; 4463 if (result != NULL) { 4464 if (_PyBytes_Resize(&result, nallocated) < 0) 4465 goto error; 4466 } else { 4467 result = PyBytes_FromStringAndSize(NULL, nallocated); 4468 if (result == NULL) 4469 goto error; 4470 Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset); 4471 } 4472 p = PyBytes_AS_STRING(result) + offset; 4473 } 4474 4475 if (PyBytes_Check(rep)) { 4476 char *prep = PyBytes_AS_STRING(rep); 4477 for(k = repsize; k > 0; k--) 4478 *p++ = *prep++; 4479 } else /* rep is unicode */ { 4480 const Py_UNICODE *prep = PyUnicode_AS_UNICODE(rep); 4481 Py_UNICODE c; 4482 4483 for(k=0; 
k<repsize; k++) { 4484 c = prep[k]; 4485 if (0x80 <= c) { 4486 raise_encode_exception(&exc, "utf-8", 4487 PyUnicode_AS_UNICODE(unicode), 4488 size, i-1, i, 4489 "surrogates not allowed"); 4490 goto error; 4491 } 4492 *p++ = (char)prep[k]; 4493 } 4494 } 4495 Py_DECREF(rep); 4496 } else if (ch < 0x10000) { 4497 *p++ = (char)(0xe0 | (ch >> 12)); 4498 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); 4499 *p++ = (char)(0x80 | (ch & 0x3f)); 4500 } else /* ch >= 0x10000 */ { 4501 /* Encode UCS4 Unicode ordinals */ 4502 *p++ = (char)(0xf0 | (ch >> 18)); 4503 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f)); 4504 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); 4505 *p++ = (char)(0x80 | (ch & 0x3f)); 4506#if SIZEOF_WCHAR_T == 2 4507 wchar_offset++; 4508#endif 4509 } 4510 } 4511 4512 if (result == NULL) { 4513 /* This was stack allocated. */ 4514 nneeded = p - stackbuf; 4515 assert(nneeded <= nallocated); 4516 result = PyBytes_FromStringAndSize(stackbuf, nneeded); 4517 } 4518 else { 4519 /* Cut back to size actually needed. 
*/ 4520 nneeded = p - PyBytes_AS_STRING(result); 4521 assert(nneeded <= nallocated); 4522 _PyBytes_Resize(&result, nneeded); 4523 } 4524 4525 Py_XDECREF(errorHandler); 4526 Py_XDECREF(exc); 4527 return result; 4528 error: 4529 Py_XDECREF(errorHandler); 4530 Py_XDECREF(exc); 4531 Py_XDECREF(result); 4532 return NULL; 4533 4534#undef MAX_SHORT_UNICHARS 4535} 4536 4537PyObject * 4538PyUnicode_EncodeUTF8(const Py_UNICODE *s, 4539 Py_ssize_t size, 4540 const char *errors) 4541{ 4542 PyObject *v, *unicode; 4543 4544 unicode = PyUnicode_FromUnicode(s, size); 4545 if (unicode == NULL) 4546 return NULL; 4547 v = _PyUnicode_AsUTF8String(unicode, errors); 4548 Py_DECREF(unicode); 4549 return v; 4550} 4551 4552PyObject * 4553PyUnicode_AsUTF8String(PyObject *unicode) 4554{ 4555 return _PyUnicode_AsUTF8String(unicode, NULL); 4556} 4557 4558/* --- UTF-32 Codec ------------------------------------------------------- */ 4559 4560PyObject * 4561PyUnicode_DecodeUTF32(const char *s, 4562 Py_ssize_t size, 4563 const char *errors, 4564 int *byteorder) 4565{ 4566 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL); 4567} 4568 4569PyObject * 4570PyUnicode_DecodeUTF32Stateful(const char *s, 4571 Py_ssize_t size, 4572 const char *errors, 4573 int *byteorder, 4574 Py_ssize_t *consumed) 4575{ 4576 const char *starts = s; 4577 Py_ssize_t startinpos; 4578 Py_ssize_t endinpos; 4579 Py_ssize_t outpos; 4580 PyUnicodeObject *unicode; 4581 Py_UNICODE *p; 4582#ifndef Py_UNICODE_WIDE 4583 int pairs = 0; 4584 const unsigned char *qq; 4585#else 4586 const int pairs = 0; 4587#endif 4588 const unsigned char *q, *e; 4589 int bo = 0; /* assume native ordering by default */ 4590 const char *errmsg = ""; 4591 /* Offsets from q for retrieving bytes in the right order. 
*/ 4592#ifdef BYTEORDER_IS_LITTLE_ENDIAN 4593 int iorder[] = {0, 1, 2, 3}; 4594#else 4595 int iorder[] = {3, 2, 1, 0}; 4596#endif 4597 PyObject *errorHandler = NULL; 4598 PyObject *exc = NULL; 4599 4600 q = (unsigned char *)s; 4601 e = q + size; 4602 4603 if (byteorder) 4604 bo = *byteorder; 4605 4606 /* Check for BOM marks (U+FEFF) in the input and adjust current 4607 byte order setting accordingly. In native mode, the leading BOM 4608 mark is skipped, in all other modes, it is copied to the output 4609 stream as-is (giving a ZWNBSP character). */ 4610 if (bo == 0) { 4611 if (size >= 4) { 4612 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) | 4613 (q[iorder[1]] << 8) | q[iorder[0]]; 4614#ifdef BYTEORDER_IS_LITTLE_ENDIAN 4615 if (bom == 0x0000FEFF) { 4616 q += 4; 4617 bo = -1; 4618 } 4619 else if (bom == 0xFFFE0000) { 4620 q += 4; 4621 bo = 1; 4622 } 4623#else 4624 if (bom == 0x0000FEFF) { 4625 q += 4; 4626 bo = 1; 4627 } 4628 else if (bom == 0xFFFE0000) { 4629 q += 4; 4630 bo = -1; 4631 } 4632#endif 4633 } 4634 } 4635 4636 if (bo == -1) { 4637 /* force LE */ 4638 iorder[0] = 0; 4639 iorder[1] = 1; 4640 iorder[2] = 2; 4641 iorder[3] = 3; 4642 } 4643 else if (bo == 1) { 4644 /* force BE */ 4645 iorder[0] = 3; 4646 iorder[1] = 2; 4647 iorder[2] = 1; 4648 iorder[3] = 0; 4649 } 4650 4651 /* On narrow builds we split characters outside the BMP into two 4652 codepoints => count how much extra space we need. */ 4653#ifndef Py_UNICODE_WIDE 4654 for (qq = q; qq < e; qq += 4) 4655 if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0) 4656 pairs++; 4657#endif 4658 4659 /* This might be one to much, because of a BOM */ 4660 unicode = _PyUnicode_New((size+3)/4+pairs); 4661 if (!unicode) 4662 return NULL; 4663 if (size == 0) 4664 return (PyObject *)unicode; 4665 4666 /* Unpack UTF-32 encoded data */ 4667 p = PyUnicode_AS_UNICODE(unicode); 4668 4669 while (q < e) { 4670 Py_UCS4 ch; 4671 /* remaining bytes at the end? 
(size should be divisible by 4) */ 4672 if (e-q<4) { 4673 if (consumed) 4674 break; 4675 errmsg = "truncated data"; 4676 startinpos = ((const char *)q)-starts; 4677 endinpos = ((const char *)e)-starts; 4678 goto utf32Error; 4679 /* The remaining input chars are ignored if the callback 4680 chooses to skip the input */ 4681 } 4682 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) | 4683 (q[iorder[1]] << 8) | q[iorder[0]]; 4684 4685 if (ch >= 0x110000) 4686 { 4687 errmsg = "codepoint not in range(0x110000)"; 4688 startinpos = ((const char *)q)-starts; 4689 endinpos = startinpos+4; 4690 goto utf32Error; 4691 } 4692#ifndef Py_UNICODE_WIDE 4693 if (ch >= 0x10000) 4694 { 4695 *p++ = 0xD800 | ((ch-0x10000) >> 10); 4696 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF); 4697 } 4698 else 4699#endif 4700 *p++ = ch; 4701 q += 4; 4702 continue; 4703 utf32Error: 4704 outpos = p-PyUnicode_AS_UNICODE(unicode); 4705 if (unicode_decode_call_errorhandler( 4706 errors, &errorHandler, 4707 "utf32", errmsg, 4708 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q, 4709 &unicode, &outpos, &p)) 4710 goto onError; 4711 } 4712 4713 if (byteorder) 4714 *byteorder = bo; 4715 4716 if (consumed) 4717 *consumed = (const char *)q-starts; 4718 4719 /* Adjust length */ 4720 if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0) 4721 goto onError; 4722 4723 Py_XDECREF(errorHandler); 4724 Py_XDECREF(exc); 4725 if (_PyUnicode_READY_REPLACE(&unicode)) { 4726 Py_DECREF(unicode); 4727 return NULL; 4728 } 4729 return (PyObject *)unicode; 4730 4731 onError: 4732 Py_DECREF(unicode); 4733 Py_XDECREF(errorHandler); 4734 Py_XDECREF(exc); 4735 return NULL; 4736} 4737 4738PyObject * 4739PyUnicode_EncodeUTF32(const Py_UNICODE *s, 4740 Py_ssize_t size, 4741 const char *errors, 4742 int byteorder) 4743{ 4744 PyObject *v; 4745 unsigned char *p; 4746 Py_ssize_t nsize, bytesize; 4747#ifndef Py_UNICODE_WIDE 4748 Py_ssize_t i, pairs; 4749#else 4750 const int pairs = 0; 4751#endif 
4752 /* Offsets from p for storing byte pairs in the right order. */ 4753#ifdef BYTEORDER_IS_LITTLE_ENDIAN 4754 int iorder[] = {0, 1, 2, 3}; 4755#else 4756 int iorder[] = {3, 2, 1, 0}; 4757#endif 4758 4759#define STORECHAR(CH) \ 4760 do { \ 4761 p[iorder[3]] = ((CH) >> 24) & 0xff; \ 4762 p[iorder[2]] = ((CH) >> 16) & 0xff; \ 4763 p[iorder[1]] = ((CH) >> 8) & 0xff; \ 4764 p[iorder[0]] = (CH) & 0xff; \ 4765 p += 4; \ 4766 } while(0) 4767 4768 /* In narrow builds we can output surrogate pairs as one codepoint, 4769 so we need less space. */ 4770#ifndef Py_UNICODE_WIDE 4771 for (i = pairs = 0; i < size-1; i++) 4772 if (0xD800 <= s[i] && s[i] <= 0xDBFF && 4773 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF) 4774 pairs++; 4775#endif 4776 nsize = (size - pairs + (byteorder == 0)); 4777 bytesize = nsize * 4; 4778 if (bytesize / 4 != nsize) 4779 return PyErr_NoMemory(); 4780 v = PyBytes_FromStringAndSize(NULL, bytesize); 4781 if (v == NULL) 4782 return NULL; 4783 4784 p = (unsigned char *)PyBytes_AS_STRING(v); 4785 if (byteorder == 0) 4786 STORECHAR(0xFEFF); 4787 if (size == 0) 4788 goto done; 4789 4790 if (byteorder == -1) { 4791 /* force LE */ 4792 iorder[0] = 0; 4793 iorder[1] = 1; 4794 iorder[2] = 2; 4795 iorder[3] = 3; 4796 } 4797 else if (byteorder == 1) { 4798 /* force BE */ 4799 iorder[0] = 3; 4800 iorder[1] = 2; 4801 iorder[2] = 1; 4802 iorder[3] = 0; 4803 } 4804 4805 while (size-- > 0) { 4806 Py_UCS4 ch = *s++; 4807#ifndef Py_UNICODE_WIDE 4808 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) { 4809 Py_UCS4 ch2 = *s; 4810 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) { 4811 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000; 4812 s++; 4813 size--; 4814 } 4815 } 4816#endif 4817 STORECHAR(ch); 4818 } 4819 4820 done: 4821 return v; 4822#undef STORECHAR 4823} 4824 4825PyObject * 4826PyUnicode_AsUTF32String(PyObject *unicode) 4827{ 4828 if (!PyUnicode_Check(unicode)) { 4829 PyErr_BadArgument(); 4830 return NULL; 4831 } 4832 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode), 4833 
PyUnicode_GET_SIZE(unicode), 4834 NULL, 4835 0); 4836} 4837 4838/* --- UTF-16 Codec ------------------------------------------------------- */ 4839 4840PyObject * 4841PyUnicode_DecodeUTF16(const char *s, 4842 Py_ssize_t size, 4843 const char *errors, 4844 int *byteorder) 4845{ 4846 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL); 4847} 4848 4849/* Two masks for fast checking of whether a C 'long' may contain 4850 UTF16-encoded surrogate characters. This is an efficient heuristic, 4851 assuming that non-surrogate characters with a code point >= 0x8000 are 4852 rare in most input. 4853 FAST_CHAR_MASK is used when the input is in native byte ordering, 4854 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering. 4855*/ 4856#if (SIZEOF_LONG == 8) 4857# define FAST_CHAR_MASK 0x8000800080008000L 4858# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L 4859#elif (SIZEOF_LONG == 4) 4860# define FAST_CHAR_MASK 0x80008000L 4861# define SWAPPED_FAST_CHAR_MASK 0x00800080L 4862#else 4863# error C 'long' size should be either 4 or 8! 4864#endif 4865 4866PyObject * 4867PyUnicode_DecodeUTF16Stateful(const char *s, 4868 Py_ssize_t size, 4869 const char *errors, 4870 int *byteorder, 4871 Py_ssize_t *consumed) 4872{ 4873 const char *starts = s; 4874 Py_ssize_t startinpos; 4875 Py_ssize_t endinpos; 4876 Py_ssize_t outpos; 4877 PyUnicodeObject *unicode; 4878 Py_UNICODE *p; 4879 const unsigned char *q, *e, *aligned_end; 4880 int bo = 0; /* assume native ordering by default */ 4881 int native_ordering = 0; 4882 const char *errmsg = ""; 4883 /* Offsets from q for retrieving byte pairs in the right order. 
*/ 4884#ifdef BYTEORDER_IS_LITTLE_ENDIAN 4885 int ihi = 1, ilo = 0; 4886#else 4887 int ihi = 0, ilo = 1; 4888#endif 4889 PyObject *errorHandler = NULL; 4890 PyObject *exc = NULL; 4891 4892 /* Note: size will always be longer than the resulting Unicode 4893 character count */ 4894 unicode = _PyUnicode_New(size); 4895 if (!unicode) 4896 return NULL; 4897 if (size == 0) 4898 return (PyObject *)unicode; 4899 4900 /* Unpack UTF-16 encoded data */ 4901 p = PyUnicode_AS_UNICODE(unicode); 4902 q = (unsigned char *)s; 4903 e = q + size - 1; 4904 4905 if (byteorder) 4906 bo = *byteorder; 4907 4908 /* Check for BOM marks (U+FEFF) in the input and adjust current 4909 byte order setting accordingly. In native mode, the leading BOM 4910 mark is skipped, in all other modes, it is copied to the output 4911 stream as-is (giving a ZWNBSP character). */ 4912 if (bo == 0) { 4913 if (size >= 2) { 4914 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo]; 4915#ifdef BYTEORDER_IS_LITTLE_ENDIAN 4916 if (bom == 0xFEFF) { 4917 q += 2; 4918 bo = -1; 4919 } 4920 else if (bom == 0xFFFE) { 4921 q += 2; 4922 bo = 1; 4923 } 4924#else 4925 if (bom == 0xFEFF) { 4926 q += 2; 4927 bo = 1; 4928 } 4929 else if (bom == 0xFFFE) { 4930 q += 2; 4931 bo = -1; 4932 } 4933#endif 4934 } 4935 } 4936 4937 if (bo == -1) { 4938 /* force LE */ 4939 ihi = 1; 4940 ilo = 0; 4941 } 4942 else if (bo == 1) { 4943 /* force BE */ 4944 ihi = 0; 4945 ilo = 1; 4946 } 4947#ifdef BYTEORDER_IS_LITTLE_ENDIAN 4948 native_ordering = ilo < ihi; 4949#else 4950 native_ordering = ilo > ihi; 4951#endif 4952 4953 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK); 4954 while (q < e) { 4955 Py_UNICODE ch; 4956 /* First check for possible aligned read of a C 'long'. Unaligned 4957 reads are more expensive, better to defer to another iteration. */ 4958 if (!((size_t) q & LONG_PTR_MASK)) { 4959 /* Fast path for runs of non-surrogate chars. 
*/ 4960 register const unsigned char *_q = q; 4961 Py_UNICODE *_p = p; 4962 if (native_ordering) { 4963 /* Native ordering is simple: as long as the input cannot 4964 possibly contain a surrogate char, do an unrolled copy 4965 of several 16-bit code points to the target object. 4966 The non-surrogate check is done on several input bytes 4967 at a time (as many as a C 'long' can contain). */ 4968 while (_q < aligned_end) { 4969 unsigned long data = * (unsigned long *) _q; 4970 if (data & FAST_CHAR_MASK) 4971 break; 4972 _p[0] = ((unsigned short *) _q)[0]; 4973 _p[1] = ((unsigned short *) _q)[1]; 4974#if (SIZEOF_LONG == 8) 4975 _p[2] = ((unsigned short *) _q)[2]; 4976 _p[3] = ((unsigned short *) _q)[3]; 4977#endif 4978 _q += SIZEOF_LONG; 4979 _p += SIZEOF_LONG / 2; 4980 } 4981 } 4982 else { 4983 /* Byteswapped ordering is similar, but we must decompose 4984 the copy bytewise, and take care of zero'ing out the 4985 upper bytes if the target object is in 32-bit units 4986 (that is, in UCS-4 builds). */ 4987 while (_q < aligned_end) { 4988 unsigned long data = * (unsigned long *) _q; 4989 if (data & SWAPPED_FAST_CHAR_MASK) 4990 break; 4991 /* Zero upper bytes in UCS-4 builds */ 4992#if (Py_UNICODE_SIZE > 2) 4993 _p[0] = 0; 4994 _p[1] = 0; 4995#if (SIZEOF_LONG == 8) 4996 _p[2] = 0; 4997 _p[3] = 0; 4998#endif 4999#endif 5000 /* Issue #4916; UCS-4 builds on big endian machines must 5001 fill the two last bytes of each 4-byte unit. 
*/ 5002#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2) 5003# define OFF 2 5004#else 5005# define OFF 0 5006#endif 5007 ((unsigned char *) _p)[OFF + 1] = _q[0]; 5008 ((unsigned char *) _p)[OFF + 0] = _q[1]; 5009 ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2]; 5010 ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3]; 5011#if (SIZEOF_LONG == 8) 5012 ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4]; 5013 ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5]; 5014 ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6]; 5015 ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7]; 5016#endif 5017#undef OFF 5018 _q += SIZEOF_LONG; 5019 _p += SIZEOF_LONG / 2; 5020 } 5021 } 5022 p = _p; 5023 q = _q; 5024 if (q >= e) 5025 break; 5026 } 5027 ch = (q[ihi] << 8) | q[ilo]; 5028 5029 q += 2; 5030 5031 if (ch < 0xD800 || ch > 0xDFFF) { 5032 *p++ = ch; 5033 continue; 5034 } 5035 5036 /* UTF-16 code pair: */ 5037 if (q > e) { 5038 errmsg = "unexpected end of data"; 5039 startinpos = (((const char *)q) - 2) - starts; 5040 endinpos = ((const char *)e) + 1 - starts; 5041 goto utf16Error; 5042 } 5043 if (0xD800 <= ch && ch <= 0xDBFF) { 5044 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo]; 5045 q += 2; 5046 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) { 5047#ifndef Py_UNICODE_WIDE 5048 *p++ = ch; 5049 *p++ = ch2; 5050#else 5051 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000; 5052#endif 5053 continue; 5054 } 5055 else { 5056 errmsg = "illegal UTF-16 surrogate"; 5057 startinpos = (((const char *)q)-4)-starts; 5058 endinpos = startinpos+2; 5059 goto utf16Error; 5060 } 5061 5062 } 5063 errmsg = "illegal encoding"; 5064 startinpos = (((const char *)q)-2)-starts; 5065 endinpos = startinpos+2; 5066 /* Fall through to report the error */ 5067 5068 utf16Error: 5069 outpos = p - PyUnicode_AS_UNICODE(unicode); 5070 if (unicode_decode_call_errorhandler( 5071 errors, 5072 &errorHandler, 5073 "utf16", errmsg, 5074 &starts, 5075 (const char 
**)&e, 5076 &startinpos, 5077 &endinpos, 5078 &exc, 5079 (const char **)&q, 5080 &unicode, 5081 &outpos, 5082 &p)) 5083 goto onError; 5084 } 5085 /* remaining byte at the end? (size should be even) */ 5086 if (e == q) { 5087 if (!consumed) { 5088 errmsg = "truncated data"; 5089 startinpos = ((const char *)q) - starts; 5090 endinpos = ((const char *)e) + 1 - starts; 5091 outpos = p - PyUnicode_AS_UNICODE(unicode); 5092 if (unicode_decode_call_errorhandler( 5093 errors, 5094 &errorHandler, 5095 "utf16", errmsg, 5096 &starts, 5097 (const char **)&e, 5098 &startinpos, 5099 &endinpos, 5100 &exc, 5101 (const char **)&q, 5102 &unicode, 5103 &outpos, 5104 &p)) 5105 goto onError; 5106 /* The remaining input chars are ignored if the callback 5107 chooses to skip the input */ 5108 } 5109 } 5110 5111 if (byteorder) 5112 *byteorder = bo; 5113 5114 if (consumed) 5115 *consumed = (const char *)q-starts; 5116 5117 /* Adjust length */ 5118 if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0) 5119 goto onError; 5120 5121 Py_XDECREF(errorHandler); 5122 Py_XDECREF(exc); 5123 if (_PyUnicode_READY_REPLACE(&unicode)) { 5124 Py_DECREF(unicode); 5125 return NULL; 5126 } 5127 return (PyObject *)unicode; 5128 5129 onError: 5130 Py_DECREF(unicode); 5131 Py_XDECREF(errorHandler); 5132 Py_XDECREF(exc); 5133 return NULL; 5134} 5135 5136#undef FAST_CHAR_MASK 5137#undef SWAPPED_FAST_CHAR_MASK 5138 5139PyObject * 5140PyUnicode_EncodeUTF16(const Py_UNICODE *s, 5141 Py_ssize_t size, 5142 const char *errors, 5143 int byteorder) 5144{ 5145 PyObject *v; 5146 unsigned char *p; 5147 Py_ssize_t nsize, bytesize; 5148#ifdef Py_UNICODE_WIDE 5149 Py_ssize_t i, pairs; 5150#else 5151 const int pairs = 0; 5152#endif 5153 /* Offsets from p for storing byte pairs in the right order. 
*/ 5154#ifdef BYTEORDER_IS_LITTLE_ENDIAN 5155 int ihi = 1, ilo = 0; 5156#else 5157 int ihi = 0, ilo = 1; 5158#endif 5159 5160#define STORECHAR(CH) \ 5161 do { \ 5162 p[ihi] = ((CH) >> 8) & 0xff; \ 5163 p[ilo] = (CH) & 0xff; \ 5164 p += 2; \ 5165 } while(0) 5166 5167#ifdef Py_UNICODE_WIDE 5168 for (i = pairs = 0; i < size; i++) 5169 if (s[i] >= 0x10000) 5170 pairs++; 5171#endif 5172 /* 2 * (size + pairs + (byteorder == 0)) */ 5173 if (size > PY_SSIZE_T_MAX || 5174 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0)) 5175 return PyErr_NoMemory(); 5176 nsize = size + pairs + (byteorder == 0); 5177 bytesize = nsize * 2; 5178 if (bytesize / 2 != nsize) 5179 return PyErr_NoMemory(); 5180 v = PyBytes_FromStringAndSize(NULL, bytesize); 5181 if (v == NULL) 5182 return NULL; 5183 5184 p = (unsigned char *)PyBytes_AS_STRING(v); 5185 if (byteorder == 0) 5186 STORECHAR(0xFEFF); 5187 if (size == 0) 5188 goto done; 5189 5190 if (byteorder == -1) { 5191 /* force LE */ 5192 ihi = 1; 5193 ilo = 0; 5194 } 5195 else if (byteorder == 1) { 5196 /* force BE */ 5197 ihi = 0; 5198 ilo = 1; 5199 } 5200 5201 while (size-- > 0) { 5202 Py_UNICODE ch = *s++; 5203 Py_UNICODE ch2 = 0; 5204#ifdef Py_UNICODE_WIDE 5205 if (ch >= 0x10000) { 5206 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF); 5207 ch = 0xD800 | ((ch-0x10000) >> 10); 5208 } 5209#endif 5210 STORECHAR(ch); 5211 if (ch2) 5212 STORECHAR(ch2); 5213 } 5214 5215 done: 5216 return v; 5217#undef STORECHAR 5218} 5219 5220PyObject * 5221PyUnicode_AsUTF16String(PyObject *unicode) 5222{ 5223 if (!PyUnicode_Check(unicode)) { 5224 PyErr_BadArgument(); 5225 return NULL; 5226 } 5227 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode), 5228 PyUnicode_GET_SIZE(unicode), 5229 NULL, 5230 0); 5231} 5232 5233/* --- Unicode Escape Codec ----------------------------------------------- */ 5234 5235/* Helper function for PyUnicode_DecodeUnicodeEscape, determines 5236 if all the escapes in the string make it still a valid ASCII string. 
/* Helper function for PyUnicode_DecodeUnicodeEscape: determines whether
   the escaped string s of size bytes decodes to a pure ASCII string.

   Returns the number of characters the decoded string will have, or -1
   if that cannot be guaranteed: size is negative, the input contains a
   non-ASCII byte, a backslash is the last byte, or an escape is used
   whose result may fall outside ASCII (octal, \x, \u, \U, \N).
 */
Py_ssize_t
length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
{
    const unsigned char *p;
    const unsigned char *end;
    Py_ssize_t length = 0;

    /* Reject a negative size before forming any pointer from it. */
    if (size < 0)
        return -1;

    p = (const unsigned char *)s;
    end = p + size;

    for (; p < end; ++p) {
        if (*p > 127) {
            /* Non-ASCII */
            return -1;
        }
        else if (*p != '\\') {
            /* Normal character */
            ++length;
        }
        else {
            /* Backslash-escape, check next char */
            ++p;
            /* Escape sequence reaches till end of string or
               non-ASCII follow-up. */
            if (p >= end || *p > 127)
                return -1;
            switch (*p) {
            case '\n':
                /* backslash + \n result in zero characters */
                break;
            case '\\': case '\'': case '\"':
            case 'b': case 'f': case 't':
            case 'n': case 'r': case 'v': case 'a':
                ++length;
                break;
            case '0': case '1': case '2': case '3':
            case '4': case '5': case '6': case '7':
            case 'x': case 'u': case 'U': case 'N':
                /* these do not guarantee ASCII characters */
                return -1;
            default:
                /* unknown escape: the backslash and the following
                   character are both kept */
                length += 2;
            }
        }
    }
    return length;
}
*/ 5292#define WRITE_ASCII_OR_WSTR(kind, buf, index, value) \ 5293 do { \ 5294 if ((kind) != PyUnicode_WCHAR_KIND) \ 5295 ((unsigned char *)(buf))[(index)] = (unsigned char)(value); \ 5296 else \ 5297 ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value); \ 5298 } while (0) 5299 5300#define WRITE_WSTR(buf, index, value) \ 5301 assert(kind == PyUnicode_WCHAR_KIND), \ 5302 ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value) 5303 5304 5305static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL; 5306 5307PyObject * 5308PyUnicode_DecodeUnicodeEscape(const char *s, 5309 Py_ssize_t size, 5310 const char *errors) 5311{ 5312 const char *starts = s; 5313 Py_ssize_t startinpos; 5314 Py_ssize_t endinpos; 5315 int j; 5316 PyUnicodeObject *v; 5317 Py_UNICODE *p; 5318 const char *end; 5319 char* message; 5320 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */ 5321 PyObject *errorHandler = NULL; 5322 PyObject *exc = NULL; 5323 Py_ssize_t ascii_length; 5324 Py_ssize_t i; 5325 int kind; 5326 void *data; 5327 5328 ascii_length = length_of_escaped_ascii_string(s, size); 5329 5330 /* After length_of_escaped_ascii_string() there are two alternatives, 5331 either the string is pure ASCII with named escapes like \n, etc. 5332 and we determined it's exact size (common case) 5333 or it contains \x, \u, ... escape sequences. then we create a 5334 legacy wchar string and resize it at the end of this function. */ 5335 if (ascii_length >= 0) { 5336 v = (PyUnicodeObject *)PyUnicode_New(ascii_length, 127); 5337 if (!v) 5338 goto onError; 5339 assert(PyUnicode_KIND(v) == PyUnicode_1BYTE_KIND); 5340 kind = PyUnicode_1BYTE_KIND; 5341 data = PyUnicode_DATA(v); 5342 } 5343 else { 5344 /* Escaped strings will always be longer than the resulting 5345 Unicode string, so we start with size here and then reduce the 5346 length after conversion to the true value. 
5347 (but if the error callback returns a long replacement string 5348 we'll have to allocate more space) */ 5349 v = _PyUnicode_New(size); 5350 if (!v) 5351 goto onError; 5352 kind = PyUnicode_WCHAR_KIND; 5353 data = PyUnicode_AS_UNICODE(v); 5354 } 5355 5356 if (size == 0) 5357 return (PyObject *)v; 5358 i = 0; 5359 end = s + size; 5360 5361 while (s < end) { 5362 unsigned char c; 5363 Py_UNICODE x; 5364 int digits; 5365 5366 if (kind == PyUnicode_WCHAR_KIND) { 5367 assert(i < _PyUnicode_WSTR_LENGTH(v)); 5368 } 5369 else { 5370 /* The only case in which i == ascii_length is a backslash 5371 followed by a newline. */ 5372 assert(i <= ascii_length); 5373 } 5374 5375 /* Non-escape characters are interpreted as Unicode ordinals */ 5376 if (*s != '\\') { 5377 WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char) *s++); 5378 continue; 5379 } 5380 5381 startinpos = s-starts; 5382 /* \ - Escapes */ 5383 s++; 5384 c = *s++; 5385 if (s > end) 5386 c = '\0'; /* Invalid after \ */ 5387 5388 if (kind == PyUnicode_WCHAR_KIND) { 5389 assert(i < _PyUnicode_WSTR_LENGTH(v)); 5390 } 5391 else { 5392 /* The only case in which i == ascii_length is a backslash 5393 followed by a newline. 
*/ 5394 assert(i < ascii_length || (i == ascii_length && c == '\n')); 5395 } 5396 5397 switch (c) { 5398 5399 /* \x escapes */ 5400 case '\n': break; 5401 case '\\': WRITE_ASCII_OR_WSTR(kind, data, i++, '\\'); break; 5402 case '\'': WRITE_ASCII_OR_WSTR(kind, data, i++, '\''); break; 5403 case '\"': WRITE_ASCII_OR_WSTR(kind, data, i++, '\"'); break; 5404 case 'b': WRITE_ASCII_OR_WSTR(kind, data, i++, '\b'); break; 5405 /* FF */ 5406 case 'f': WRITE_ASCII_OR_WSTR(kind, data, i++, '\014'); break; 5407 case 't': WRITE_ASCII_OR_WSTR(kind, data, i++, '\t'); break; 5408 case 'n': WRITE_ASCII_OR_WSTR(kind, data, i++, '\n'); break; 5409 case 'r': WRITE_ASCII_OR_WSTR(kind, data, i++, '\r'); break; 5410 /* VT */ 5411 case 'v': WRITE_ASCII_OR_WSTR(kind, data, i++, '\013'); break; 5412 /* BEL, not classic C */ 5413 case 'a': WRITE_ASCII_OR_WSTR(kind, data, i++, '\007'); break; 5414 5415 /* \OOO (octal) escapes */ 5416 case '0': case '1': case '2': case '3': 5417 case '4': case '5': case '6': case '7': 5418 x = s[-1] - '0'; 5419 if (s < end && '0' <= *s && *s <= '7') { 5420 x = (x<<3) + *s++ - '0'; 5421 if (s < end && '0' <= *s && *s <= '7') 5422 x = (x<<3) + *s++ - '0'; 5423 } 5424 WRITE_WSTR(data, i++, x); 5425 break; 5426 5427 /* hex escapes */ 5428 /* \xXX */ 5429 case 'x': 5430 digits = 2; 5431 message = "truncated \\xXX escape"; 5432 goto hexescape; 5433 5434 /* \uXXXX */ 5435 case 'u': 5436 digits = 4; 5437 message = "truncated \\uXXXX escape"; 5438 goto hexescape; 5439 5440 /* \UXXXXXXXX */ 5441 case 'U': 5442 digits = 8; 5443 message = "truncated \\UXXXXXXXX escape"; 5444 hexescape: 5445 chr = 0; 5446 p = PyUnicode_AS_UNICODE(v) + i; 5447 if (s+digits>end) { 5448 endinpos = size; 5449 if (unicode_decode_call_errorhandler( 5450 errors, &errorHandler, 5451 "unicodeescape", "end of string in escape sequence", 5452 &starts, &end, &startinpos, &endinpos, &exc, &s, 5453 &v, &i, &p)) 5454 goto onError; 5455 data = PyUnicode_AS_UNICODE(v); 5456 goto nextByte; 5457 } 5458 for (j 
= 0; j < digits; ++j) { 5459 c = (unsigned char) s[j]; 5460 if (!Py_ISXDIGIT(c)) { 5461 endinpos = (s+j+1)-starts; 5462 p = PyUnicode_AS_UNICODE(v) + i; 5463 if (unicode_decode_call_errorhandler( 5464 errors, &errorHandler, 5465 "unicodeescape", message, 5466 &starts, &end, &startinpos, &endinpos, &exc, &s, 5467 &v, &i, &p)) 5468 goto onError; 5469 data = PyUnicode_AS_UNICODE(v); 5470 goto nextByte; 5471 } 5472 chr = (chr<<4) & ~0xF; 5473 if (c >= '0' && c <= '9') 5474 chr += c - '0'; 5475 else if (c >= 'a' && c <= 'f') 5476 chr += 10 + c - 'a'; 5477 else 5478 chr += 10 + c - 'A'; 5479 } 5480 s += j; 5481 if (chr == 0xffffffff && PyErr_Occurred()) 5482 /* _decoding_error will have already written into the 5483 target buffer. */ 5484 break; 5485 store: 5486 /* when we get here, chr is a 32-bit unicode character */ 5487 if (chr <= 0xffff) 5488 /* UCS-2 character */ 5489 WRITE_WSTR(data, i++, chr); 5490 else if (chr <= 0x10ffff) { 5491 /* UCS-4 character. Either store directly, or as 5492 surrogate pair. 
*/ 5493#ifdef Py_UNICODE_WIDE 5494 WRITE_WSTR(data, i++, chr); 5495#else 5496 chr -= 0x10000L; 5497 WRITE_WSTR(data, i++, 0xD800 + (Py_UNICODE) (chr >> 10)); 5498 WRITE_WSTR(data, i++, 0xDC00 + (Py_UNICODE) (chr & 0x03FF)); 5499#endif 5500 } else { 5501 endinpos = s-starts; 5502 p = PyUnicode_AS_UNICODE(v) + i; 5503 if (unicode_decode_call_errorhandler( 5504 errors, &errorHandler, 5505 "unicodeescape", "illegal Unicode character", 5506 &starts, &end, &startinpos, &endinpos, &exc, &s, 5507 &v, &i, &p)) 5508 goto onError; 5509 data = PyUnicode_AS_UNICODE(v); 5510 } 5511 break; 5512 5513 /* \N{name} */ 5514 case 'N': 5515 message = "malformed \\N character escape"; 5516 if (ucnhash_CAPI == NULL) { 5517 /* load the unicode data module */ 5518 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import( 5519 PyUnicodeData_CAPSULE_NAME, 1); 5520 if (ucnhash_CAPI == NULL) 5521 goto ucnhashError; 5522 } 5523 if (*s == '{') { 5524 const char *start = s+1; 5525 /* look for the closing brace */ 5526 while (*s != '}' && s < end) 5527 s++; 5528 if (s > start && s < end && *s == '}') { 5529 /* found a name. 
look it up in the unicode database */ 5530 message = "unknown Unicode character name"; 5531 s++; 5532 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), 5533 &chr)) 5534 goto store; 5535 } 5536 } 5537 endinpos = s-starts; 5538 p = PyUnicode_AS_UNICODE(v) + i; 5539 if (unicode_decode_call_errorhandler( 5540 errors, &errorHandler, 5541 "unicodeescape", message, 5542 &starts, &end, &startinpos, &endinpos, &exc, &s, 5543 &v, &i, &p)) 5544 goto onError; 5545 data = PyUnicode_AS_UNICODE(v); 5546 break; 5547 5548 default: 5549 if (s > end) { 5550 assert(kind == PyUnicode_WCHAR_KIND); 5551 message = "\\ at end of string"; 5552 s--; 5553 endinpos = s-starts; 5554 p = PyUnicode_AS_UNICODE(v) + i; 5555 if (unicode_decode_call_errorhandler( 5556 errors, &errorHandler, 5557 "unicodeescape", message, 5558 &starts, &end, &startinpos, &endinpos, &exc, &s, 5559 &v, &i, &p)) 5560 goto onError; 5561 data = PyUnicode_AS_UNICODE(v); 5562 } 5563 else { 5564 WRITE_ASCII_OR_WSTR(kind, data, i++, '\\'); 5565 WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char)s[-1]); 5566 } 5567 break; 5568 } 5569 nextByte: 5570 ; 5571 } 5572 /* Ensure the length prediction worked in case of ASCII strings */ 5573 assert(kind == PyUnicode_WCHAR_KIND || i == ascii_length); 5574 5575 if (kind == PyUnicode_WCHAR_KIND) 5576 { 5577 if (PyUnicode_Resize((PyObject**)&v, i) < 0) 5578 goto onError; 5579 } 5580 Py_XDECREF(errorHandler); 5581 Py_XDECREF(exc); 5582 if (_PyUnicode_READY_REPLACE(&v)) { 5583 Py_DECREF(v); 5584 return NULL; 5585 } 5586 return (PyObject *)v; 5587 5588 ucnhashError: 5589 PyErr_SetString( 5590 PyExc_UnicodeError, 5591 "\\N escapes not supported (can't load unicodedata module)" 5592 ); 5593 Py_XDECREF(v); 5594 Py_XDECREF(errorHandler); 5595 Py_XDECREF(exc); 5596 return NULL; 5597 5598 onError: 5599 Py_XDECREF(v); 5600 Py_XDECREF(errorHandler); 5601 Py_XDECREF(exc); 5602 return NULL; 5603} 5604 5605#undef WRITE_ASCII_OR_WSTR 5606#undef WRITE_WSTR 5607 5608/* Return a Unicode-Escape string 
version of the Unicode object. 5609 5610 If quotes is true, the string is enclosed in u"" or u'' quotes as 5611 appropriate. 5612 5613*/ 5614 5615static const char *hexdigits = "0123456789abcdef"; 5616 5617PyObject * 5618PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s, 5619 Py_ssize_t size) 5620{ 5621 PyObject *repr; 5622 char *p; 5623 5624#ifdef Py_UNICODE_WIDE 5625 const Py_ssize_t expandsize = 10; 5626#else 5627 const Py_ssize_t expandsize = 6; 5628#endif 5629 5630 /* XXX(nnorwitz): rather than over-allocating, it would be 5631 better to choose a different scheme. Perhaps scan the 5632 first N-chars of the string and allocate based on that size. 5633 */ 5634 /* Initial allocation is based on the longest-possible unichr 5635 escape. 5636 5637 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source 5638 unichr, so in this case it's the longest unichr escape. In 5639 narrow (UTF-16) builds this is five chars per source unichr 5640 since there are two unichrs in the surrogate pair, so in narrow 5641 (UTF-16) builds it's not the longest unichr escape. 5642 5643 In wide or narrow builds '\uxxxx' is 6 chars per source unichr, 5644 so in the narrow (UTF-16) build case it's the longest unichr 5645 escape. 
5646 */ 5647 5648 if (size == 0) 5649 return PyBytes_FromStringAndSize(NULL, 0); 5650 5651 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize) 5652 return PyErr_NoMemory(); 5653 5654 repr = PyBytes_FromStringAndSize(NULL, 5655 2 5656 + expandsize*size 5657 + 1); 5658 if (repr == NULL) 5659 return NULL; 5660 5661 p = PyBytes_AS_STRING(repr); 5662 5663 while (size-- > 0) { 5664 Py_UNICODE ch = *s++; 5665 5666 /* Escape backslashes */ 5667 if (ch == '\\') { 5668 *p++ = '\\'; 5669 *p++ = (char) ch; 5670 continue; 5671 } 5672 5673#ifdef Py_UNICODE_WIDE 5674 /* Map 21-bit characters to '\U00xxxxxx' */ 5675 else if (ch >= 0x10000) { 5676 *p++ = '\\'; 5677 *p++ = 'U'; 5678 *p++ = hexdigits[(ch >> 28) & 0x0000000F]; 5679 *p++ = hexdigits[(ch >> 24) & 0x0000000F]; 5680 *p++ = hexdigits[(ch >> 20) & 0x0000000F]; 5681 *p++ = hexdigits[(ch >> 16) & 0x0000000F]; 5682 *p++ = hexdigits[(ch >> 12) & 0x0000000F]; 5683 *p++ = hexdigits[(ch >> 8) & 0x0000000F]; 5684 *p++ = hexdigits[(ch >> 4) & 0x0000000F]; 5685 *p++ = hexdigits[ch & 0x0000000F]; 5686 continue; 5687 } 5688#else 5689 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */ 5690 else if (ch >= 0xD800 && ch < 0xDC00) { 5691 Py_UNICODE ch2; 5692 Py_UCS4 ucs; 5693 5694 ch2 = *s++; 5695 size--; 5696 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) { 5697 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000; 5698 *p++ = '\\'; 5699 *p++ = 'U'; 5700 *p++ = hexdigits[(ucs >> 28) & 0x0000000F]; 5701 *p++ = hexdigits[(ucs >> 24) & 0x0000000F]; 5702 *p++ = hexdigits[(ucs >> 20) & 0x0000000F]; 5703 *p++ = hexdigits[(ucs >> 16) & 0x0000000F]; 5704 *p++ = hexdigits[(ucs >> 12) & 0x0000000F]; 5705 *p++ = hexdigits[(ucs >> 8) & 0x0000000F]; 5706 *p++ = hexdigits[(ucs >> 4) & 0x0000000F]; 5707 *p++ = hexdigits[ucs & 0x0000000F]; 5708 continue; 5709 } 5710 /* Fall through: isolated surrogates are copied as-is */ 5711 s--; 5712 size++; 5713 } 5714#endif 5715 5716 /* Map 16-bit characters to '\uxxxx' */ 5717 if (ch >= 256) { 5718 *p++ = '\\'; 5719 *p++ = 
'u'; 5720 *p++ = hexdigits[(ch >> 12) & 0x000F]; 5721 *p++ = hexdigits[(ch >> 8) & 0x000F]; 5722 *p++ = hexdigits[(ch >> 4) & 0x000F]; 5723 *p++ = hexdigits[ch & 0x000F]; 5724 } 5725 5726 /* Map special whitespace to '\t', \n', '\r' */ 5727 else if (ch == '\t') { 5728 *p++ = '\\'; 5729 *p++ = 't'; 5730 } 5731 else if (ch == '\n') { 5732 *p++ = '\\'; 5733 *p++ = 'n'; 5734 } 5735 else if (ch == '\r') { 5736 *p++ = '\\'; 5737 *p++ = 'r'; 5738 } 5739 5740 /* Map non-printable US ASCII to '\xhh' */ 5741 else if (ch < ' ' || ch >= 0x7F) { 5742 *p++ = '\\'; 5743 *p++ = 'x'; 5744 *p++ = hexdigits[(ch >> 4) & 0x000F]; 5745 *p++ = hexdigits[ch & 0x000F]; 5746 } 5747 5748 /* Copy everything else as-is */ 5749 else 5750 *p++ = (char) ch; 5751 } 5752 5753 assert(p - PyBytes_AS_STRING(repr) > 0); 5754 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) 5755 return NULL; 5756 return repr; 5757} 5758 5759PyObject * 5760PyUnicode_AsUnicodeEscapeString(PyObject *unicode) 5761{ 5762 PyObject *s; 5763 if (!PyUnicode_Check(unicode)) { 5764 PyErr_BadArgument(); 5765 return NULL; 5766 } 5767 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode), 5768 PyUnicode_GET_SIZE(unicode)); 5769 return s; 5770} 5771 5772/* --- Raw Unicode Escape Codec ------------------------------------------- */ 5773 5774PyObject * 5775PyUnicode_DecodeRawUnicodeEscape(const char *s, 5776 Py_ssize_t size, 5777 const char *errors) 5778{ 5779 const char *starts = s; 5780 Py_ssize_t startinpos; 5781 Py_ssize_t endinpos; 5782 Py_ssize_t outpos; 5783 PyUnicodeObject *v; 5784 Py_UNICODE *p; 5785 const char *end; 5786 const char *bs; 5787 PyObject *errorHandler = NULL; 5788 PyObject *exc = NULL; 5789 5790 /* Escaped strings will always be longer than the resulting 5791 Unicode string, so we start with size here and then reduce the 5792 length after conversion to the true value. 
(But decoding error 5793 handler might have to resize the string) */ 5794 v = _PyUnicode_New(size); 5795 if (v == NULL) 5796 goto onError; 5797 if (size == 0) 5798 return (PyObject *)v; 5799 p = PyUnicode_AS_UNICODE(v); 5800 end = s + size; 5801 while (s < end) { 5802 unsigned char c; 5803 Py_UCS4 x; 5804 int i; 5805 int count; 5806 5807 /* Non-escape characters are interpreted as Unicode ordinals */ 5808 if (*s != '\\') { 5809 *p++ = (unsigned char)*s++; 5810 continue; 5811 } 5812 startinpos = s-starts; 5813 5814 /* \u-escapes are only interpreted iff the number of leading 5815 backslashes if odd */ 5816 bs = s; 5817 for (;s < end;) { 5818 if (*s != '\\') 5819 break; 5820 *p++ = (unsigned char)*s++; 5821 } 5822 if (((s - bs) & 1) == 0 || 5823 s >= end || 5824 (*s != 'u' && *s != 'U')) { 5825 continue; 5826 } 5827 p--; 5828 count = *s=='u' ? 4 : 8; 5829 s++; 5830 5831 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */ 5832 outpos = p-PyUnicode_AS_UNICODE(v); 5833 for (x = 0, i = 0; i < count; ++i, ++s) { 5834 c = (unsigned char)*s; 5835 if (!Py_ISXDIGIT(c)) { 5836 endinpos = s-starts; 5837 if (unicode_decode_call_errorhandler( 5838 errors, &errorHandler, 5839 "rawunicodeescape", "truncated \\uXXXX", 5840 &starts, &end, &startinpos, &endinpos, &exc, &s, 5841 &v, &outpos, &p)) 5842 goto onError; 5843 goto nextByte; 5844 } 5845 x = (x<<4) & ~0xF; 5846 if (c >= '0' && c <= '9') 5847 x += c - '0'; 5848 else if (c >= 'a' && c <= 'f') 5849 x += 10 + c - 'a'; 5850 else 5851 x += 10 + c - 'A'; 5852 } 5853 if (x <= 0xffff) 5854 /* UCS-2 character */ 5855 *p++ = (Py_UNICODE) x; 5856 else if (x <= 0x10ffff) { 5857 /* UCS-4 character. Either store directly, or as 5858 surrogate pair. 
*/ 5859#ifdef Py_UNICODE_WIDE 5860 *p++ = (Py_UNICODE) x; 5861#else 5862 x -= 0x10000L; 5863 *p++ = 0xD800 + (Py_UNICODE) (x >> 10); 5864 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF); 5865#endif 5866 } else { 5867 endinpos = s-starts; 5868 outpos = p-PyUnicode_AS_UNICODE(v); 5869 if (unicode_decode_call_errorhandler( 5870 errors, &errorHandler, 5871 "rawunicodeescape", "\\Uxxxxxxxx out of range", 5872 &starts, &end, &startinpos, &endinpos, &exc, &s, 5873 &v, &outpos, &p)) 5874 goto onError; 5875 } 5876 nextByte: 5877 ; 5878 } 5879 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0) 5880 goto onError; 5881 Py_XDECREF(errorHandler); 5882 Py_XDECREF(exc); 5883 if (_PyUnicode_READY_REPLACE(&v)) { 5884 Py_DECREF(v); 5885 return NULL; 5886 } 5887 return (PyObject *)v; 5888 5889 onError: 5890 Py_XDECREF(v); 5891 Py_XDECREF(errorHandler); 5892 Py_XDECREF(exc); 5893 return NULL; 5894} 5895 5896PyObject * 5897PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s, 5898 Py_ssize_t size) 5899{ 5900 PyObject *repr; 5901 char *p; 5902 char *q; 5903 5904#ifdef Py_UNICODE_WIDE 5905 const Py_ssize_t expandsize = 10; 5906#else 5907 const Py_ssize_t expandsize = 6; 5908#endif 5909 5910 if (size > PY_SSIZE_T_MAX / expandsize) 5911 return PyErr_NoMemory(); 5912 5913 repr = PyBytes_FromStringAndSize(NULL, expandsize * size); 5914 if (repr == NULL) 5915 return NULL; 5916 if (size == 0) 5917 return repr; 5918 5919 p = q = PyBytes_AS_STRING(repr); 5920 while (size-- > 0) { 5921 Py_UNICODE ch = *s++; 5922#ifdef Py_UNICODE_WIDE 5923 /* Map 32-bit characters to '\Uxxxxxxxx' */ 5924 if (ch >= 0x10000) { 5925 *p++ = '\\'; 5926 *p++ = 'U'; 5927 *p++ = hexdigits[(ch >> 28) & 0xf]; 5928 *p++ = hexdigits[(ch >> 24) & 0xf]; 5929 *p++ = hexdigits[(ch >> 20) & 0xf]; 5930 *p++ = hexdigits[(ch >> 16) & 0xf]; 5931 *p++ = hexdigits[(ch >> 12) & 0xf]; 5932 *p++ = hexdigits[(ch >> 8) & 0xf]; 5933 *p++ = hexdigits[(ch >> 4) & 0xf]; 5934 *p++ = hexdigits[ch & 15]; 5935 } 5936 else 5937#else 5938 
/* Map UTF-16 surrogate pairs to '\U00xxxxxx' */ 5939 if (ch >= 0xD800 && ch < 0xDC00) { 5940 Py_UNICODE ch2; 5941 Py_UCS4 ucs; 5942 5943 ch2 = *s++; 5944 size--; 5945 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) { 5946 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000; 5947 *p++ = '\\'; 5948 *p++ = 'U'; 5949 *p++ = hexdigits[(ucs >> 28) & 0xf]; 5950 *p++ = hexdigits[(ucs >> 24) & 0xf]; 5951 *p++ = hexdigits[(ucs >> 20) & 0xf]; 5952 *p++ = hexdigits[(ucs >> 16) & 0xf]; 5953 *p++ = hexdigits[(ucs >> 12) & 0xf]; 5954 *p++ = hexdigits[(ucs >> 8) & 0xf]; 5955 *p++ = hexdigits[(ucs >> 4) & 0xf]; 5956 *p++ = hexdigits[ucs & 0xf]; 5957 continue; 5958 } 5959 /* Fall through: isolated surrogates are copied as-is */ 5960 s--; 5961 size++; 5962 } 5963#endif 5964 /* Map 16-bit characters to '\uxxxx' */ 5965 if (ch >= 256) { 5966 *p++ = '\\'; 5967 *p++ = 'u'; 5968 *p++ = hexdigits[(ch >> 12) & 0xf]; 5969 *p++ = hexdigits[(ch >> 8) & 0xf]; 5970 *p++ = hexdigits[(ch >> 4) & 0xf]; 5971 *p++ = hexdigits[ch & 15]; 5972 } 5973 /* Copy everything else as-is */ 5974 else 5975 *p++ = (char) ch; 5976 } 5977 size = p - q; 5978 5979 assert(size > 0); 5980 if (_PyBytes_Resize(&repr, size) < 0) 5981 return NULL; 5982 return repr; 5983} 5984 5985PyObject * 5986PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode) 5987{ 5988 PyObject *s; 5989 if (!PyUnicode_Check(unicode)) { 5990 PyErr_BadArgument(); 5991 return NULL; 5992 } 5993 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode), 5994 PyUnicode_GET_SIZE(unicode)); 5995 5996 return s; 5997} 5998 5999/* --- Unicode Internal Codec ------------------------------------------- */ 6000 6001PyObject * 6002_PyUnicode_DecodeUnicodeInternal(const char *s, 6003 Py_ssize_t size, 6004 const char *errors) 6005{ 6006 const char *starts = s; 6007 Py_ssize_t startinpos; 6008 Py_ssize_t endinpos; 6009 Py_ssize_t outpos; 6010 PyUnicodeObject *v; 6011 Py_UNICODE *p; 6012 const char *end; 6013 const char *reason; 6014 PyObject *errorHandler = 
NULL; 6015 PyObject *exc = NULL; 6016 6017#ifdef Py_UNICODE_WIDE 6018 Py_UNICODE unimax = PyUnicode_GetMax(); 6019#endif 6020 6021 /* XXX overflow detection missing */ 6022 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE); 6023 if (v == NULL) 6024 goto onError; 6025 /* Intentionally PyUnicode_GET_SIZE instead of PyUnicode_GET_LENGTH 6026 as string was created with the old API. */ 6027 if (PyUnicode_GET_SIZE(v) == 0) 6028 return (PyObject *)v; 6029 p = PyUnicode_AS_UNICODE(v); 6030 end = s + size; 6031 6032 while (s < end) { 6033 memcpy(p, s, sizeof(Py_UNICODE)); 6034 /* We have to sanity check the raw data, otherwise doom looms for 6035 some malformed UCS-4 data. */ 6036 if ( 6037#ifdef Py_UNICODE_WIDE 6038 *p > unimax || *p < 0 || 6039#endif 6040 end-s < Py_UNICODE_SIZE 6041 ) 6042 { 6043 startinpos = s - starts; 6044 if (end-s < Py_UNICODE_SIZE) { 6045 endinpos = end-starts; 6046 reason = "truncated input"; 6047 } 6048 else { 6049 endinpos = s - starts + Py_UNICODE_SIZE; 6050 reason = "illegal code point (> 0x10FFFF)"; 6051 } 6052 outpos = p - PyUnicode_AS_UNICODE(v); 6053 if (unicode_decode_call_errorhandler( 6054 errors, &errorHandler, 6055 "unicode_internal", reason, 6056 &starts, &end, &startinpos, &endinpos, &exc, &s, 6057 &v, &outpos, &p)) { 6058 goto onError; 6059 } 6060 } 6061 else { 6062 p++; 6063 s += Py_UNICODE_SIZE; 6064 } 6065 } 6066 6067 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0) 6068 goto onError; 6069 Py_XDECREF(errorHandler); 6070 Py_XDECREF(exc); 6071 if (_PyUnicode_READY_REPLACE(&v)) { 6072 Py_DECREF(v); 6073 return NULL; 6074 } 6075 return (PyObject *)v; 6076 6077 onError: 6078 Py_XDECREF(v); 6079 Py_XDECREF(errorHandler); 6080 Py_XDECREF(exc); 6081 return NULL; 6082} 6083 6084/* --- Latin-1 Codec ------------------------------------------------------ */ 6085 6086PyObject * 6087PyUnicode_DecodeLatin1(const char *s, 6088 Py_ssize_t size, 6089 const char *errors) 6090{ 6091 /* Latin-1 is equivalent to the 
       first 256 ordinals in Unicode. */
    return _PyUnicode_FromUCS1((unsigned char*)s, size);
}

/* create or adjust a UnicodeEncodeError

   If *exceptionObject is NULL a new exception is created from the
   arguments and stored there; otherwise only the start, end and reason
   of the existing exception are updated.  If one of those updates
   fails, the exception is released and *exceptionObject is set to
   NULL, so callers detect failure by checking for NULL. */
static void
make_encode_exception(PyObject **exceptionObject,
                      const char *encoding,
                      const Py_UNICODE *unicode, Py_ssize_t size,
                      Py_ssize_t startpos, Py_ssize_t endpos,
                      const char *reason)
{
    if (*exceptionObject == NULL) {
        *exceptionObject = PyUnicodeEncodeError_Create(
            encoding, unicode, size, startpos, endpos, reason);
    }
    else {
        if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
            goto onError;
        if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
            goto onError;
        if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
            goto onError;
        return;
      onError:
        Py_DECREF(*exceptionObject);
        *exceptionObject = NULL;
    }
}

/* raises a UnicodeEncodeError

   Builds or updates the exception through make_encode_exception() and,
   if that produced an object, hands it to PyCodec_StrictErrors().  The
   object stays in *exceptionObject; this function does not release
   it. */
static void
raise_encode_exception(PyObject **exceptionObject,
                       const char *encoding,
                       const Py_UNICODE *unicode, Py_ssize_t size,
                       Py_ssize_t startpos, Py_ssize_t endpos,
                       const char *reason)
{
    make_encode_exception(exceptionObject,
                          encoding, unicode, size, startpos, endpos, reason);
    if (*exceptionObject != NULL)
        PyCodec_StrictErrors(*exceptionObject);
}

/* error handling callback helper:
   build arguments, call the callback and check the arguments,
   put the result into newpos and return the replacement string, which
   has to be freed by the caller */
static PyObject *
unicode_encode_call_errorhandler(const char *errors,
                                 PyObject **errorHandler,
                                 const char *encoding, const char *reason,
                                 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
                                 Py_ssize_t startpos, Py_ssize_t endpos,
                                 Py_ssize_t *newpos)
{
    /* PyArg_ParseTuple format; the text after "On;" doubles as the
       TypeError message (see the &argparse[3] uses in the body). */
    static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";

    PyObject
        *restuple;
    PyObject *resunicode;

    /* Look the handler up once; it is cached in *errorHandler for
       later calls. */
    if (*errorHandler == NULL) {
        *errorHandler = PyCodec_LookupError(errors);
        if (*errorHandler == NULL)
            return NULL;
    }

    make_encode_exception(exceptionObject,
                          encoding, unicode, size, startpos, endpos, reason);
    if (*exceptionObject == NULL)
        return NULL;

    restuple = PyObject_CallFunctionObjArgs(
        *errorHandler, *exceptionObject, NULL);
    if (restuple == NULL)
        return NULL;
    if (!PyTuple_Check(restuple)) {
        /* &argparse[3] skips the "On;" prefix and leaves the message */
        PyErr_SetString(PyExc_TypeError, &argparse[3]);
        Py_DECREF(restuple);
        return NULL;
    }
    if (!PyArg_ParseTuple(restuple, argparse,
                          &resunicode, newpos)) {
        Py_DECREF(restuple);
        return NULL;
    }
    if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
        PyErr_SetString(PyExc_TypeError, &argparse[3]);
        Py_DECREF(restuple);
        return NULL;
    }
    /* a negative position counts from the end of the input */
    if (*newpos<0)
        *newpos = size+*newpos;
    if (*newpos<0 || *newpos>size) {
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
        Py_DECREF(restuple);
        return NULL;
    }
    /* take our own reference before the tuple that holds it goes away */
    Py_INCREF(resunicode);
    Py_DECREF(restuple);
    return resunicode;
}

/* Shared encoder for the ASCII and Latin-1 codecs.

   Code units below `limit` are stored as single bytes.  Runs of code
   units >= `limit` are handled according to `errors`: "strict" (or
   NULL), "replace", "ignore" and "xmlcharrefreplace" are handled
   inline, any other name goes through
   unicode_encode_call_errorhandler().  `limit` == 256 selects the
   "latin-1" encoding name and reason text, any other value the "ascii"
   ones.

   Returns a new bytes object, or NULL with an exception set. */
static PyObject *
unicode_encode_ucs1(const Py_UNICODE *p,
                    Py_ssize_t size,
                    const char *errors,
                    int limit)
{
    /* output object */
    PyObject *res;
    /* pointers to the beginning and end+1 of input */
    const Py_UNICODE *startp = p;
    const Py_UNICODE *endp = p + size;
    /* pointer to the beginning of the unencodable characters */
    /* const Py_UNICODE *badp = NULL; */
    /* pointer into the output */
    char *str;
    /* currently allocated size of the output */
    Py_ssize_t ressize;
    const char *encoding = (limit == 256) ? "latin-1" : "ascii";
    const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    /* the following variable is used for caching string comparisons
     * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
    int known_errorHandler = -1;

    /* allocate enough for a simple encoding without
       replacements, if we need more, we'll resize */
    if (size == 0)
        return PyBytes_FromStringAndSize(NULL, 0);
    res = PyBytes_FromStringAndSize(NULL, size);
    if (res == NULL)
        return NULL;
    str = PyBytes_AS_STRING(res);
    ressize = size;

    while (p<endp) {
        Py_UNICODE c = *p;

        /* can we encode this? */
        if (c<limit) {
            /* no overflow check, because we know that the space is enough */
            *str++ = (char)c;
            ++p;
        }
        else {
            /* input index of the first unencodable character */
            Py_ssize_t unicodepos = p-startp;
            Py_ssize_t requiredsize;
            PyObject *repunicode;
            Py_ssize_t repsize;
            Py_ssize_t newpos;
            Py_ssize_t respos;
            Py_UNICODE *uni2;
            /* startpos for collecting unencodable chars */
            const Py_UNICODE *collstart = p;
            const Py_UNICODE *collend = p;
            /* find all unencodable characters */
            while ((collend < endp) && ((*collend)>=limit))
                ++collend;
            /* cache callback name lookup (if not done yet, i.e. it's the first error) */
            if (known_errorHandler==-1) {
                if ((errors==NULL) || (!strcmp(errors, "strict")))
                    known_errorHandler = 1;
                else if (!strcmp(errors, "replace"))
                    known_errorHandler = 2;
                else if (!strcmp(errors, "ignore"))
                    known_errorHandler = 3;
                else if (!strcmp(errors, "xmlcharrefreplace"))
                    known_errorHandler = 4;
                else
                    known_errorHandler = 0;
            }
            switch (known_errorHandler) {
            case 1: /* strict */
                raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
                goto onError;
            case 2: /* replace */
                /* one '?' per unencodable code unit; never needs more room
                   than the input it replaces */
                while (collstart++<collend)
                    *str++ = '?'; /* fall through */
            case 3: /* ignore */
                p = collend;
                break;
            case 4: /* xmlcharrefreplace */
                respos = str - PyBytes_AS_STRING(res);
                /* determine replacement size (temporarily (mis)uses p):
                   each "&#N;" costs 2 + number of decimal digits + 1 */
                for (p = collstart, repsize = 0; p < collend; ++p) {
                    if (*p<10)
                        repsize += 2+1+1;
                    else if (*p<100)
                        repsize += 2+2+1;
                    else if (*p<1000)
                        repsize += 2+3+1;
                    else if (*p<10000)
                        repsize += 2+4+1;
#ifndef Py_UNICODE_WIDE
                    else
                        repsize += 2+5+1;
#else
                    else if (*p<100000)
                        repsize += 2+5+1;
                    else if (*p<1000000)
                        repsize += 2+6+1;
                    else
                        repsize += 2+7+1;
#endif
                }
                requiredsize = respos+repsize+(endp-collend);
                if (requiredsize > ressize) {
                    /* grow at least geometrically */
                    if (requiredsize<2*ressize)
                        requiredsize = 2*ressize;
                    if (_PyBytes_Resize(&res, requiredsize))
                        goto onError;
                    /* the buffer may have moved: re-derive the write pointer */
                    str = PyBytes_AS_STRING(res) + respos;
                    ressize = requiredsize;
                }
                /* generate replacement (temporarily (mis)uses p) */
                for (p = collstart; p < collend; ++p) {
                    str += sprintf(str, "&#%d;", (int)*p);
                }
                p = collend;
                break;
            default:
                repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
                                                              encoding, reason, startp, size, &exc,
                                                              collstart-startp, collend-startp, &newpos);
                if (repunicode == NULL)
                    goto onError;
                if (PyBytes_Check(repunicode)) {
                    /* Directly copy bytes result to output. */
                    repsize = PyBytes_Size(repunicode);
                    if (repsize > 1) {
                        /* Make room for all additional bytes. */
                        respos = str - PyBytes_AS_STRING(res);
                        if (_PyBytes_Resize(&res, ressize+repsize-1)) {
                            Py_DECREF(repunicode);
                            goto onError;
                        }
                        str = PyBytes_AS_STRING(res) + respos;
                        ressize += repsize-1;
                    }
                    memcpy(str, PyBytes_AsString(repunicode), repsize);
                    str += repsize;
                    /* resume where the error handler told us to */
                    p = startp + newpos;
                    Py_DECREF(repunicode);
                    break;
                }
                /* need more space? (at least enough for what we
                   have+the replacement+the rest of the string, so
                   we won't have to check space for encodable characters) */
                /* NOTE(review): this estimate (and the bytes branch above)
                   only counts the input after collend; it looks like a
                   handler returning newpos < collend would make more input
                   be re-encoded than was reserved -- confirm that cannot
                   happen. */
                respos = str - PyBytes_AS_STRING(res);
                repsize = PyUnicode_GET_SIZE(repunicode);
                requiredsize = respos+repsize+(endp-collend);
                if (requiredsize > ressize) {
                    if (requiredsize<2*ressize)
                        requiredsize = 2*ressize;
                    if (_PyBytes_Resize(&res, requiredsize)) {
                        Py_DECREF(repunicode);
                        goto onError;
                    }
                    str = PyBytes_AS_STRING(res) + respos;
                    ressize = requiredsize;
                }
                /* check if there is anything unencodable in the replacement
                   and copy it to the output */
                for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
                    c = *uni2;
                    if (c >= limit) {
                        raise_encode_exception(&exc, encoding, startp, size,
                                               unicodepos, unicodepos+1, reason);
                        Py_DECREF(repunicode);
                        goto onError;
                    }
                    *str = (char)c;
                }
                p = startp + newpos;
                Py_DECREF(repunicode);
            }
        }
    }
    /* Resize if we allocated too much */
    size = str - PyBytes_AS_STRING(res);
    if (size < ressize) { /* If this fails res will be NULL */
        assert(size >= 0);
        if (_PyBytes_Resize(&res, size) < 0)
            goto onError;
    }

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return res;

  onError:
    Py_XDECREF(res);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return NULL;
}

/* Encode a Py_UNICODE buffer as Latin-1 (unicode_encode_ucs1 with a
   limit of 256). */
PyObject *
PyUnicode_EncodeLatin1(const Py_UNICODE *p,
                       Py_ssize_t size,
                       const char *errors)
{
    return unicode_encode_ucs1(p, size, errors, 256);
}

/* Encode a str object as Latin-1 using the error handler `errors`.
   Calls PyErr_BadArgument() and returns NULL if `unicode` is not a
   str. */
PyObject *
_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
{
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }
    if (PyUnicode_READY(unicode) == -1)
        return NULL;
    /* Fast path: if it is a one-byte string, construct
       bytes object directly. */
    if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
        return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
                                         PyUnicode_GET_LENGTH(unicode));
    /* Non-Latin-1 characters present. Defer to above function to
       raise the exception. */
    return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
                                  PyUnicode_GET_SIZE(unicode),
                                  errors);
}

/* Same as _PyUnicode_AsLatin1String() with errors == NULL. */
PyObject*
PyUnicode_AsLatin1String(PyObject *unicode)
{
    return _PyUnicode_AsLatin1String(unicode, NULL);
}

/* --- 7-bit ASCII Codec -------------------------------------------------- */

PyObject *
PyUnicode_DecodeASCII(const char *s,
                      Py_ssize_t size,
                      const char *errors)
{
    const char *starts = s;
    PyUnicodeObject *v;
    Py_UNICODE *p;
    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    Py_ssize_t outpos;
    const char *e;
    unsigned char* d;
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    Py_ssize_t i;

    /* ASCII is equivalent to the first 128 ordinals in Unicode. */
    if (size == 1 && *(unsigned char*)s < 128)
        return PyUnicode_FromOrdinal(*(unsigned char*)s);

    /* Fast path.  Assume the input actually *is* ASCII, and allocate
       a single-block Unicode object with that assumption.
       If there is
       an error, drop the object and start over. */
    v = (PyUnicodeObject*)PyUnicode_New(size, 127);
    if (v == NULL)
        goto onError;
    d = PyUnicode_1BYTE_DATA(v);
    /* copy bytes until the first one that is not ASCII */
    for (i = 0; i < size; i++) {
        unsigned char ch = ((unsigned char*)s)[i];
        if (ch < 128)
            d[i] = ch;
        else
            break;
    }
    /* the whole input was ASCII: the fast-path object is the result */
    if (i == size)
        return (PyObject*)v;
    Py_DECREF(v); /* start over */

    /* Slow path: decode into a Py_UNICODE buffer and route every
       non-ASCII byte through the error handler. */
    v = _PyUnicode_New(size);
    if (v == NULL)
        goto onError;
    if (size == 0)
        return (PyObject *)v;
    p = PyUnicode_AS_UNICODE(v);
    e = s + size;
    while (s < e) {
        register unsigned char c = (unsigned char)*s;
        if (c < 128) {
            *p++ = c;
            ++s;
        }
        else {
            /* report exactly one offending byte */
            startinpos = s-starts;
            endinpos = startinpos + 1;
            outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
            if (unicode_decode_call_errorhandler(
                    errors, &errorHandler,
                    "ascii", "ordinal not in range(128)",
                    &starts, &e, &startinpos, &endinpos, &exc, &s,
                    &v, &outpos, &p))
                goto onError;
        }
    }
    /* shrink only if fewer code units were produced than allocated */
    if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
        if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
            goto onError;
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    if (_PyUnicode_READY_REPLACE(&v)) {
        Py_DECREF(v);
        return NULL;
    }
    return (PyObject *)v;

  onError:
    Py_XDECREF(v);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return NULL;
}

/* Encode a Py_UNICODE buffer as ASCII (unicode_encode_ucs1 with a
   limit of 128). */
PyObject *
PyUnicode_EncodeASCII(const Py_UNICODE *p,
                      Py_ssize_t size,
                      const char *errors)
{
    return unicode_encode_ucs1(p, size, errors, 128);
}

/* Encode a str object as ASCII using the error handler `errors`.
   Calls PyErr_BadArgument() and returns NULL if `unicode` is not a
   str. */
PyObject *
_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
{
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }
    if (PyUnicode_READY(unicode) == -1)
        return NULL;
    /* Fast path: if it is an ASCII-only string, construct bytes object
       directly. Else defer to above function to raise the exception. */
    if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
        return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
                                         PyUnicode_GET_LENGTH(unicode));
    return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
                                 PyUnicode_GET_SIZE(unicode),
                                 errors);
}

/* Same as _PyUnicode_AsASCIIString() with errors == NULL. */
PyObject *
PyUnicode_AsASCIIString(PyObject *unicode)
{
    return _PyUnicode_AsASCIIString(unicode, NULL);
}

#ifdef HAVE_MBCS

/* --- MBCS codecs for Windows -------------------------------------------- */

/* The MBCS helpers below take an int size; when Py_ssize_t is wider
   than int the public entry points feed them the data in pieces. */
#if SIZEOF_INT < SIZEOF_SIZE_T
#define NEED_RETRY
#endif

/* XXX This code is limited to "true" double-byte encodings, as
   a) it assumes an incomplete character consists of a single byte, and
   b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
   encodings, see IsDBCSLeadByteEx documentation. */

/* Return non-zero if the byte at s[offset] is a DBCS lead byte that
   starts a character, 0 otherwise.  A byte that IsDBCSLeadByte()
   accepts is only counted when CharPrev() shows it is not itself the
   trail byte of the preceding character. */
static int
is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;

    if (IsDBCSLeadByte(*curr)) {
        const char *prev = CharPrev(s, curr);
        return (prev == curr) || !IsDBCSLeadByte(*prev) || (curr - prev == 2);
    }
    return 0;
}

/*
 * Decode MBCS string into unicode object. If 'final' is set, converts
 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
 */
static int
decode_mbcs(PyUnicodeObject **v,
            const char *s, /* MBCS string */
            int size, /* sizeof MBCS string */
            int final,
            const char *errors)
{
    Py_UNICODE *p;
    Py_ssize_t n;
    DWORD usize;
    DWORD flags;

    assert(size >= 0);

    /* check and handle 'errors' arg */
    if (errors==NULL || strcmp(errors, "strict")==0)
        flags = MB_ERR_INVALID_CHARS;
    else if (strcmp(errors, "ignore")==0)
        flags = 0;
    else {
        PyErr_Format(PyExc_ValueError,
                     "mbcs encoding does not support errors='%s'",
                     errors);
        return -1;
    }

    /* Skip trailing lead-byte unless 'final' is set */
    if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
        --size;

    /* First get the size of the result */
    if (size > 0) {
        usize = MultiByteToWideChar(CP_ACP, flags, s, size, NULL, 0);
        if (usize==0)
            goto mbcs_decode_error;
    } else
        usize = 0;

    if (*v == NULL) {
        /* Create unicode object */
        *v = _PyUnicode_New(usize);
        if (*v == NULL)
            return -1;
        n = 0;
    }
    else {
        /* Extend unicode object */
        n = PyUnicode_GET_SIZE(*v);
        if (PyUnicode_Resize((PyObject**)v, n + usize) < 0)
            return -1;
    }

    /* Do the conversion */
    if (usize > 0) {
        /* append after the n code units that were already there */
        p = PyUnicode_AS_UNICODE(*v) + n;
        if (0 == MultiByteToWideChar(CP_ACP, flags, s, size, p, usize)) {
            goto mbcs_decode_error;
        }
    }
    return size;

mbcs_decode_error:
    /* If the last error was ERROR_NO_UNICODE_TRANSLATION, then
       we raise a UnicodeDecodeError - else it is a 'generic'
       windows error
     */
    if (GetLastError()==ERROR_NO_UNICODE_TRANSLATION) {
        /* Ideally, we should get reason from FormatMessage - this
           is the Windows 2000 English version of the message
        */
        PyObject *exc = NULL;
        const char *reason = "No mapping for the Unicode character exists "
                             "in the target multi-byte code page.";
        make_decode_exception(&exc, "mbcs", s, size, 0, 0, reason);
        if (exc != NULL) {
            PyCodec_StrictErrors(exc);
            Py_DECREF(exc);
        }
    } else {
        PyErr_SetFromWindowsErrWithFilename(0, NULL);
    }
    return -1;
}

/* Decode `size` bytes of MBCS data at `s`.

   If `consumed` is NULL the input is treated as final (decode_mbcs is
   called with final = !consumed); otherwise *consumed is set to the
   number of bytes that were decoded.  With NEED_RETRY, input longer
   than INT_MAX is passed to decode_mbcs in pieces of at most INT_MAX
   bytes.  Returns a new object, or NULL on error. */
PyObject *
PyUnicode_DecodeMBCSStateful(const char *s,
                             Py_ssize_t size,
                             const char *errors,
                             Py_ssize_t *consumed)
{
    PyUnicodeObject *v = NULL;
    int done;

    if (consumed)
        *consumed = 0;

#ifdef NEED_RETRY
  retry:
    if (size > INT_MAX)
        /* not the last piece, so never final */
        done = decode_mbcs(&v, s, INT_MAX, 0, errors);
    else
#endif
        done = decode_mbcs(&v, s, (int)size, !consumed, errors);

    if (done < 0) {
        Py_XDECREF(v);
        return NULL;
    }

    if (consumed)
        *consumed += done;

#ifdef NEED_RETRY
    if (size > INT_MAX) {
        s += done;
        size -= done;
        goto retry;
    }
#endif
    if (_PyUnicode_READY_REPLACE(&v)) {
        Py_DECREF(v);
        return NULL;
    }
    return (PyObject *)v;
}

/* Non-stateful variant: decode the whole input as final data. */
PyObject *
PyUnicode_DecodeMBCS(const char *s,
                     Py_ssize_t size,
                     const char *errors)
{
    return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
}

/*
 * Convert unicode into string object (MBCS).
 * Returns 0 if succeed, -1 otherwise.
 */
static int
encode_mbcs(PyObject **repr,
            const Py_UNICODE *p, /* unicode */
            int size, /* size of unicode */
            const char* errors)
{
    BOOL usedDefaultChar = FALSE;
    BOOL *pusedDefaultChar;
    int mbcssize;
    Py_ssize_t n;
    PyObject *exc = NULL;
    DWORD flags;

    assert(size >= 0);

    /* check and handle 'errors' arg */
    if (errors==NULL || strcmp(errors, "strict")==0) {
        flags = WC_NO_BEST_FIT_CHARS;
        /* strict: ask the API to report use of the default character */
        pusedDefaultChar = &usedDefaultChar;
    } else if (strcmp(errors, "replace")==0) {
        flags = 0;
        pusedDefaultChar = NULL;
    } else {
        PyErr_Format(PyExc_ValueError,
                     "mbcs encoding does not support errors='%s'",
                     errors);
        return -1;
    }

    /* First get the size of the result */
    if (size > 0) {
        mbcssize = WideCharToMultiByte(CP_ACP, flags, p, size, NULL, 0,
                                       NULL, pusedDefaultChar);
        if (mbcssize == 0) {
            PyErr_SetFromWindowsErrWithFilename(0, NULL);
            return -1;
        }
        /* If we used a default char, then we failed! */
        if (pusedDefaultChar && *pusedDefaultChar)
            goto mbcs_encode_error;
    } else {
        mbcssize = 0;
    }

    if (*repr == NULL) {
        /* Create string object */
        *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
        if (*repr == NULL)
            return -1;
        n = 0;
    }
    else {
        /* Extend string object */
        n = PyBytes_Size(*repr);
        if (_PyBytes_Resize(repr, n + mbcssize) < 0)
            return -1;
    }

    /* Do the conversion */
    if (size > 0) {
        /* append after the n bytes that were already there */
        char *s = PyBytes_AS_STRING(*repr) + n;
        if (0 == WideCharToMultiByte(CP_ACP, flags, p, size, s, mbcssize,
                                     NULL, pusedDefaultChar)) {
            PyErr_SetFromWindowsErrWithFilename(0, NULL);
            return -1;
        }
        if (pusedDefaultChar && *pusedDefaultChar)
            goto mbcs_encode_error;
    }
    return 0;

mbcs_encode_error:
    raise_encode_exception(&exc, "mbcs", p, size, 0, 0, "invalid character");
    Py_XDECREF(exc);
    return -1;
}

/* Encode `size` code units at `p` as MBCS.  With NEED_RETRY, input
   longer than INT_MAX is passed to encode_mbcs in pieces of INT_MAX
   code units, each piece being appended to the same bytes object.
   Returns a new bytes object, or NULL on error. */
PyObject *
PyUnicode_EncodeMBCS(const Py_UNICODE *p,
                     Py_ssize_t size,
                     const char *errors)
{
    PyObject *repr = NULL;
    int ret;

#ifdef NEED_RETRY
  retry:
    if (size > INT_MAX)
        ret = encode_mbcs(&repr, p, INT_MAX, errors);
    else
#endif
        ret = encode_mbcs(&repr, p, (int)size, errors);

    if (ret < 0) {
        Py_XDECREF(repr);
        return NULL;
    }

#ifdef NEED_RETRY
    if (size > INT_MAX) {
        p += INT_MAX;
        size -= INT_MAX;
        goto retry;
    }
#endif

    return repr;
}

/* Object-level wrapper: encode a str object as MBCS with
   errors == NULL.  Calls PyErr_BadArgument() and returns NULL if
   `unicode` is not a str. */
PyObject *
PyUnicode_AsMBCSString(PyObject *unicode)
{
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }
    return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
                                PyUnicode_GET_SIZE(unicode),
                                NULL);
}

#undef NEED_RETRY

#endif /* HAVE_MBCS */

/* --- Character Mapping Codec -------------------------------------------- */

PyObject *
6838PyUnicode_DecodeCharmap(const char *s, 6839 Py_ssize_t size, 6840 PyObject *mapping, 6841 const char *errors) 6842{ 6843 const char *starts = s; 6844 Py_ssize_t startinpos; 6845 Py_ssize_t endinpos; 6846 Py_ssize_t outpos; 6847 const char *e; 6848 PyUnicodeObject *v; 6849 Py_UNICODE *p; 6850 Py_ssize_t extrachars = 0; 6851 PyObject *errorHandler = NULL; 6852 PyObject *exc = NULL; 6853 Py_UNICODE *mapstring = NULL; 6854 Py_ssize_t maplen = 0; 6855 6856 /* Default to Latin-1 */ 6857 if (mapping == NULL) 6858 return PyUnicode_DecodeLatin1(s, size, errors); 6859 6860 v = _PyUnicode_New(size); 6861 if (v == NULL) 6862 goto onError; 6863 if (size == 0) 6864 return (PyObject *)v; 6865 p = PyUnicode_AS_UNICODE(v); 6866 e = s + size; 6867 if (PyUnicode_CheckExact(mapping)) { 6868 mapstring = PyUnicode_AS_UNICODE(mapping); 6869 maplen = PyUnicode_GET_SIZE(mapping); 6870 while (s < e) { 6871 unsigned char ch = *s; 6872 Py_UNICODE x = 0xfffe; /* illegal value */ 6873 6874 if (ch < maplen) 6875 x = mapstring[ch]; 6876 6877 if (x == 0xfffe) { 6878 /* undefined mapping */ 6879 outpos = p-PyUnicode_AS_UNICODE(v); 6880 startinpos = s-starts; 6881 endinpos = startinpos+1; 6882 if (unicode_decode_call_errorhandler( 6883 errors, &errorHandler, 6884 "charmap", "character maps to <undefined>", 6885 &starts, &e, &startinpos, &endinpos, &exc, &s, 6886 &v, &outpos, &p)) { 6887 goto onError; 6888 } 6889 continue; 6890 } 6891 *p++ = x; 6892 ++s; 6893 } 6894 } 6895 else { 6896 while (s < e) { 6897 unsigned char ch = *s; 6898 PyObject *w, *x; 6899 6900 /* Get mapping (char ordinal -> integer, Unicode char or None) */ 6901 w = PyLong_FromLong((long)ch); 6902 if (w == NULL) 6903 goto onError; 6904 x = PyObject_GetItem(mapping, w); 6905 Py_DECREF(w); 6906 if (x == NULL) { 6907 if (PyErr_ExceptionMatches(PyExc_LookupError)) { 6908 /* No mapping found means: mapping is undefined. 
*/ 6909 PyErr_Clear(); 6910 x = Py_None; 6911 Py_INCREF(x); 6912 } else 6913 goto onError; 6914 } 6915 6916 /* Apply mapping */ 6917 if (PyLong_Check(x)) { 6918 long value = PyLong_AS_LONG(x); 6919 if (value < 0 || value > 65535) { 6920 PyErr_SetString(PyExc_TypeError, 6921 "character mapping must be in range(65536)"); 6922 Py_DECREF(x); 6923 goto onError; 6924 } 6925 *p++ = (Py_UNICODE)value; 6926 } 6927 else if (x == Py_None) { 6928 /* undefined mapping */ 6929 outpos = p-PyUnicode_AS_UNICODE(v); 6930 startinpos = s-starts; 6931 endinpos = startinpos+1; 6932 if (unicode_decode_call_errorhandler( 6933 errors, &errorHandler, 6934 "charmap", "character maps to <undefined>", 6935 &starts, &e, &startinpos, &endinpos, &exc, &s, 6936 &v, &outpos, &p)) { 6937 Py_DECREF(x); 6938 goto onError; 6939 } 6940 Py_DECREF(x); 6941 continue; 6942 } 6943 else if (PyUnicode_Check(x)) { 6944 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x); 6945 6946 if (targetsize == 1) 6947 /* 1-1 mapping */ 6948 *p++ = *PyUnicode_AS_UNICODE(x); 6949 6950 else if (targetsize > 1) { 6951 /* 1-n mapping */ 6952 if (targetsize > extrachars) { 6953 /* resize first */ 6954 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v); 6955 Py_ssize_t needed = (targetsize - extrachars) + \ 6956 (targetsize << 2); 6957 extrachars += needed; 6958 /* XXX overflow detection missing */ 6959 if (PyUnicode_Resize((PyObject**)&v, 6960 PyUnicode_GET_SIZE(v) + needed) < 0) { 6961 Py_DECREF(x); 6962 goto onError; 6963 } 6964 p = PyUnicode_AS_UNICODE(v) + oldpos; 6965 } 6966 Py_UNICODE_COPY(p, 6967 PyUnicode_AS_UNICODE(x), 6968 targetsize); 6969 p += targetsize; 6970 extrachars -= targetsize; 6971 } 6972 /* 1-0 mapping: skip the character */ 6973 } 6974 else { 6975 /* wrong return value */ 6976 PyErr_SetString(PyExc_TypeError, 6977 "character mapping must return integer, None or str"); 6978 Py_DECREF(x); 6979 goto onError; 6980 } 6981 Py_DECREF(x); 6982 ++s; 6983 } 6984 } 6985 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v)) 
6986 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0) 6987 goto onError; 6988 Py_XDECREF(errorHandler); 6989 Py_XDECREF(exc); 6990 if (_PyUnicode_READY_REPLACE(&v)) { 6991 Py_DECREF(v); 6992 return NULL; 6993 } 6994 return (PyObject *)v; 6995 6996 onError: 6997 Py_XDECREF(errorHandler); 6998 Py_XDECREF(exc); 6999 Py_XDECREF(v); 7000 return NULL; 7001} 7002 7003/* Charmap encoding: the lookup table */ 7004 7005struct encoding_map { 7006 PyObject_HEAD 7007 unsigned char level1[32]; 7008 int count2, count3; 7009 unsigned char level23[1]; 7010}; 7011 7012static PyObject* 7013encoding_map_size(PyObject *obj, PyObject* args) 7014{ 7015 struct encoding_map *map = (struct encoding_map*)obj; 7016 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 + 7017 128*map->count3); 7018} 7019 7020static PyMethodDef encoding_map_methods[] = { 7021 {"size", encoding_map_size, METH_NOARGS, 7022 PyDoc_STR("Return the size (in bytes) of this object") }, 7023 { 0 } 7024}; 7025 7026static void 7027encoding_map_dealloc(PyObject* o) 7028{ 7029 PyObject_FREE(o); 7030} 7031 7032static PyTypeObject EncodingMapType = { 7033 PyVarObject_HEAD_INIT(NULL, 0) 7034 "EncodingMap", /*tp_name*/ 7035 sizeof(struct encoding_map), /*tp_basicsize*/ 7036 0, /*tp_itemsize*/ 7037 /* methods */ 7038 encoding_map_dealloc, /*tp_dealloc*/ 7039 0, /*tp_print*/ 7040 0, /*tp_getattr*/ 7041 0, /*tp_setattr*/ 7042 0, /*tp_reserved*/ 7043 0, /*tp_repr*/ 7044 0, /*tp_as_number*/ 7045 0, /*tp_as_sequence*/ 7046 0, /*tp_as_mapping*/ 7047 0, /*tp_hash*/ 7048 0, /*tp_call*/ 7049 0, /*tp_str*/ 7050 0, /*tp_getattro*/ 7051 0, /*tp_setattro*/ 7052 0, /*tp_as_buffer*/ 7053 Py_TPFLAGS_DEFAULT, /*tp_flags*/ 7054 0, /*tp_doc*/ 7055 0, /*tp_traverse*/ 7056 0, /*tp_clear*/ 7057 0, /*tp_richcompare*/ 7058 0, /*tp_weaklistoffset*/ 7059 0, /*tp_iter*/ 7060 0, /*tp_iternext*/ 7061 encoding_map_methods, /*tp_methods*/ 7062 0, /*tp_members*/ 7063 0, /*tp_getset*/ 7064 0, /*tp_base*/ 7065 0, /*tp_dict*/ 7066 0, 
/*tp_descr_get*/ 7067 0, /*tp_descr_set*/ 7068 0, /*tp_dictoffset*/ 7069 0, /*tp_init*/ 7070 0, /*tp_alloc*/ 7071 0, /*tp_new*/ 7072 0, /*tp_free*/ 7073 0, /*tp_is_gc*/ 7074}; 7075 7076PyObject* 7077PyUnicode_BuildEncodingMap(PyObject* string) 7078{ 7079 PyObject *result; 7080 struct encoding_map *mresult; 7081 int i; 7082 int need_dict = 0; 7083 unsigned char level1[32]; 7084 unsigned char level2[512]; 7085 unsigned char *mlevel1, *mlevel2, *mlevel3; 7086 int count2 = 0, count3 = 0; 7087 int kind; 7088 void *data; 7089 Py_UCS4 ch; 7090 7091 if (!PyUnicode_Check(string) || PyUnicode_GET_LENGTH(string) != 256) { 7092 PyErr_BadArgument(); 7093 return NULL; 7094 } 7095 kind = PyUnicode_KIND(string); 7096 data = PyUnicode_DATA(string); 7097 memset(level1, 0xFF, sizeof level1); 7098 memset(level2, 0xFF, sizeof level2); 7099 7100 /* If there isn't a one-to-one mapping of NULL to \0, 7101 or if there are non-BMP characters, we need to use 7102 a mapping dictionary. */ 7103 if (PyUnicode_READ(kind, data, 0) != 0) 7104 need_dict = 1; 7105 for (i = 1; i < 256; i++) { 7106 int l1, l2; 7107 ch = PyUnicode_READ(kind, data, i); 7108 if (ch == 0 || ch > 0xFFFF) { 7109 need_dict = 1; 7110 break; 7111 } 7112 if (ch == 0xFFFE) 7113 /* unmapped character */ 7114 continue; 7115 l1 = ch >> 11; 7116 l2 = ch >> 7; 7117 if (level1[l1] == 0xFF) 7118 level1[l1] = count2++; 7119 if (level2[l2] == 0xFF) 7120 level2[l2] = count3++; 7121 } 7122 7123 if (count2 >= 0xFF || count3 >= 0xFF) 7124 need_dict = 1; 7125 7126 if (need_dict) { 7127 PyObject *result = PyDict_New(); 7128 PyObject *key, *value; 7129 if (!result) 7130 return NULL; 7131 for (i = 0; i < 256; i++) { 7132 key = PyLong_FromLong(PyUnicode_READ(kind, data, i)); 7133 value = PyLong_FromLong(i); 7134 if (!key || !value) 7135 goto failed1; 7136 if (PyDict_SetItem(result, key, value) == -1) 7137 goto failed1; 7138 Py_DECREF(key); 7139 Py_DECREF(value); 7140 } 7141 return result; 7142 failed1: 7143 Py_XDECREF(key); 7144 
Py_XDECREF(value); 7145 Py_DECREF(result); 7146 return NULL; 7147 } 7148 7149 /* Create a three-level trie */ 7150 result = PyObject_MALLOC(sizeof(struct encoding_map) + 7151 16*count2 + 128*count3 - 1); 7152 if (!result) 7153 return PyErr_NoMemory(); 7154 PyObject_Init(result, &EncodingMapType); 7155 mresult = (struct encoding_map*)result; 7156 mresult->count2 = count2; 7157 mresult->count3 = count3; 7158 mlevel1 = mresult->level1; 7159 mlevel2 = mresult->level23; 7160 mlevel3 = mresult->level23 + 16*count2; 7161 memcpy(mlevel1, level1, 32); 7162 memset(mlevel2, 0xFF, 16*count2); 7163 memset(mlevel3, 0, 128*count3); 7164 count3 = 0; 7165 for (i = 1; i < 256; i++) { 7166 int o1, o2, o3, i2, i3; 7167 if (PyUnicode_READ(kind, data, i) == 0xFFFE) 7168 /* unmapped character */ 7169 continue; 7170 o1 = PyUnicode_READ(kind, data, i)>>11; 7171 o2 = (PyUnicode_READ(kind, data, i)>>7) & 0xF; 7172 i2 = 16*mlevel1[o1] + o2; 7173 if (mlevel2[i2] == 0xFF) 7174 mlevel2[i2] = count3++; 7175 o3 = PyUnicode_READ(kind, data, i) & 0x7F; 7176 i3 = 128*mlevel2[i2] + o3; 7177 mlevel3[i3] = i; 7178 } 7179 return result; 7180} 7181 7182static int 7183encoding_map_lookup(Py_UNICODE c, PyObject *mapping) 7184{ 7185 struct encoding_map *map = (struct encoding_map*)mapping; 7186 int l1 = c>>11; 7187 int l2 = (c>>7) & 0xF; 7188 int l3 = c & 0x7F; 7189 int i; 7190 7191#ifdef Py_UNICODE_WIDE 7192 if (c > 0xFFFF) { 7193 return -1; 7194 } 7195#endif 7196 if (c == 0) 7197 return 0; 7198 /* level 1*/ 7199 i = map->level1[l1]; 7200 if (i == 0xFF) { 7201 return -1; 7202 } 7203 /* level 2*/ 7204 i = map->level23[16*i+l2]; 7205 if (i == 0xFF) { 7206 return -1; 7207 } 7208 /* level 3 */ 7209 i = map->level23[16*map->count2 + 128*i + l3]; 7210 if (i == 0) { 7211 return -1; 7212 } 7213 return i; 7214} 7215 7216/* Lookup the character ch in the mapping. If the character 7217 can't be found, Py_None is returned (or NULL, if another 7218 error occurred). 
*/ 7219static PyObject * 7220charmapencode_lookup(Py_UNICODE c, PyObject *mapping) 7221{ 7222 PyObject *w = PyLong_FromLong((long)c); 7223 PyObject *x; 7224 7225 if (w == NULL) 7226 return NULL; 7227 x = PyObject_GetItem(mapping, w); 7228 Py_DECREF(w); 7229 if (x == NULL) { 7230 if (PyErr_ExceptionMatches(PyExc_LookupError)) { 7231 /* No mapping found means: mapping is undefined. */ 7232 PyErr_Clear(); 7233 x = Py_None; 7234 Py_INCREF(x); 7235 return x; 7236 } else 7237 return NULL; 7238 } 7239 else if (x == Py_None) 7240 return x; 7241 else if (PyLong_Check(x)) { 7242 long value = PyLong_AS_LONG(x); 7243 if (value < 0 || value > 255) { 7244 PyErr_SetString(PyExc_TypeError, 7245 "character mapping must be in range(256)"); 7246 Py_DECREF(x); 7247 return NULL; 7248 } 7249 return x; 7250 } 7251 else if (PyBytes_Check(x)) 7252 return x; 7253 else { 7254 /* wrong return value */ 7255 PyErr_Format(PyExc_TypeError, 7256 "character mapping must return integer, bytes or None, not %.400s", 7257 x->ob_type->tp_name); 7258 Py_DECREF(x); 7259 return NULL; 7260 } 7261} 7262 7263static int 7264charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize) 7265{ 7266 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj); 7267 /* exponentially overallocate to minimize reallocations */ 7268 if (requiredsize < 2*outsize) 7269 requiredsize = 2*outsize; 7270 if (_PyBytes_Resize(outobj, requiredsize)) 7271 return -1; 7272 return 0; 7273} 7274 7275typedef enum charmapencode_result { 7276 enc_SUCCESS, enc_FAILED, enc_EXCEPTION 7277} charmapencode_result; 7278/* lookup the character, put the result in the output string and adjust 7279 various state variables. Resize the output bytes object if not enough 7280 space is available. Return a new reference to the object that 7281 was put in the output buffer, or Py_None, if the mapping was undefined 7282 (in which case no character was written) or NULL, if a 7283 reallocation error occurred. 
The caller must decref the result */ 7284static charmapencode_result 7285charmapencode_output(Py_UNICODE c, PyObject *mapping, 7286 PyObject **outobj, Py_ssize_t *outpos) 7287{ 7288 PyObject *rep; 7289 char *outstart; 7290 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj); 7291 7292 if (Py_TYPE(mapping) == &EncodingMapType) { 7293 int res = encoding_map_lookup(c, mapping); 7294 Py_ssize_t requiredsize = *outpos+1; 7295 if (res == -1) 7296 return enc_FAILED; 7297 if (outsize<requiredsize) 7298 if (charmapencode_resize(outobj, outpos, requiredsize)) 7299 return enc_EXCEPTION; 7300 outstart = PyBytes_AS_STRING(*outobj); 7301 outstart[(*outpos)++] = (char)res; 7302 return enc_SUCCESS; 7303 } 7304 7305 rep = charmapencode_lookup(c, mapping); 7306 if (rep==NULL) 7307 return enc_EXCEPTION; 7308 else if (rep==Py_None) { 7309 Py_DECREF(rep); 7310 return enc_FAILED; 7311 } else { 7312 if (PyLong_Check(rep)) { 7313 Py_ssize_t requiredsize = *outpos+1; 7314 if (outsize<requiredsize) 7315 if (charmapencode_resize(outobj, outpos, requiredsize)) { 7316 Py_DECREF(rep); 7317 return enc_EXCEPTION; 7318 } 7319 outstart = PyBytes_AS_STRING(*outobj); 7320 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep); 7321 } 7322 else { 7323 const char *repchars = PyBytes_AS_STRING(rep); 7324 Py_ssize_t repsize = PyBytes_GET_SIZE(rep); 7325 Py_ssize_t requiredsize = *outpos+repsize; 7326 if (outsize<requiredsize) 7327 if (charmapencode_resize(outobj, outpos, requiredsize)) { 7328 Py_DECREF(rep); 7329 return enc_EXCEPTION; 7330 } 7331 outstart = PyBytes_AS_STRING(*outobj); 7332 memcpy(outstart + *outpos, repchars, repsize); 7333 *outpos += repsize; 7334 } 7335 } 7336 Py_DECREF(rep); 7337 return enc_SUCCESS; 7338} 7339 7340/* handle an error in PyUnicode_EncodeCharmap 7341 Return 0 on success, -1 on error */ 7342static int 7343charmap_encoding_error( 7344 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping, 7345 PyObject **exceptionObject, 7346 int *known_errorHandler, PyObject 
**errorHandler, const char *errors, 7347 PyObject **res, Py_ssize_t *respos) 7348{ 7349 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */ 7350 Py_ssize_t repsize; 7351 Py_ssize_t newpos; 7352 Py_UNICODE *uni2; 7353 /* startpos for collecting unencodable chars */ 7354 Py_ssize_t collstartpos = *inpos; 7355 Py_ssize_t collendpos = *inpos+1; 7356 Py_ssize_t collpos; 7357 char *encoding = "charmap"; 7358 char *reason = "character maps to <undefined>"; 7359 charmapencode_result x; 7360 7361 /* find all unencodable characters */ 7362 while (collendpos < size) { 7363 PyObject *rep; 7364 if (Py_TYPE(mapping) == &EncodingMapType) { 7365 int res = encoding_map_lookup(p[collendpos], mapping); 7366 if (res != -1) 7367 break; 7368 ++collendpos; 7369 continue; 7370 } 7371 7372 rep = charmapencode_lookup(p[collendpos], mapping); 7373 if (rep==NULL) 7374 return -1; 7375 else if (rep!=Py_None) { 7376 Py_DECREF(rep); 7377 break; 7378 } 7379 Py_DECREF(rep); 7380 ++collendpos; 7381 } 7382 /* cache callback name lookup 7383 * (if not done yet, i.e. 
it's the first error) */ 7384 if (*known_errorHandler==-1) { 7385 if ((errors==NULL) || (!strcmp(errors, "strict"))) 7386 *known_errorHandler = 1; 7387 else if (!strcmp(errors, "replace")) 7388 *known_errorHandler = 2; 7389 else if (!strcmp(errors, "ignore")) 7390 *known_errorHandler = 3; 7391 else if (!strcmp(errors, "xmlcharrefreplace")) 7392 *known_errorHandler = 4; 7393 else 7394 *known_errorHandler = 0; 7395 } 7396 switch (*known_errorHandler) { 7397 case 1: /* strict */ 7398 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason); 7399 return -1; 7400 case 2: /* replace */ 7401 for (collpos = collstartpos; collpos<collendpos; ++collpos) { 7402 x = charmapencode_output('?', mapping, res, respos); 7403 if (x==enc_EXCEPTION) { 7404 return -1; 7405 } 7406 else if (x==enc_FAILED) { 7407 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason); 7408 return -1; 7409 } 7410 } 7411 /* fall through */ 7412 case 3: /* ignore */ 7413 *inpos = collendpos; 7414 break; 7415 case 4: /* xmlcharrefreplace */ 7416 /* generate replacement (temporarily (mis)uses p) */ 7417 for (collpos = collstartpos; collpos < collendpos; ++collpos) { 7418 char buffer[2+29+1+1]; 7419 char *cp; 7420 sprintf(buffer, "&#%d;", (int)p[collpos]); 7421 for (cp = buffer; *cp; ++cp) { 7422 x = charmapencode_output(*cp, mapping, res, respos); 7423 if (x==enc_EXCEPTION) 7424 return -1; 7425 else if (x==enc_FAILED) { 7426 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason); 7427 return -1; 7428 } 7429 } 7430 } 7431 *inpos = collendpos; 7432 break; 7433 default: 7434 repunicode = unicode_encode_call_errorhandler(errors, errorHandler, 7435 encoding, reason, p, size, exceptionObject, 7436 collstartpos, collendpos, &newpos); 7437 if (repunicode == NULL) 7438 return -1; 7439 if (PyBytes_Check(repunicode)) { 7440 /* Directly copy bytes result to output. 
*/ 7441 Py_ssize_t outsize = PyBytes_Size(*res); 7442 Py_ssize_t requiredsize; 7443 repsize = PyBytes_Size(repunicode); 7444 requiredsize = *respos + repsize; 7445 if (requiredsize > outsize) 7446 /* Make room for all additional bytes. */ 7447 if (charmapencode_resize(res, respos, requiredsize)) { 7448 Py_DECREF(repunicode); 7449 return -1; 7450 } 7451 memcpy(PyBytes_AsString(*res) + *respos, 7452 PyBytes_AsString(repunicode), repsize); 7453 *respos += repsize; 7454 *inpos = newpos; 7455 Py_DECREF(repunicode); 7456 break; 7457 } 7458 /* generate replacement */ 7459 repsize = PyUnicode_GET_SIZE(repunicode); 7460 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) { 7461 x = charmapencode_output(*uni2, mapping, res, respos); 7462 if (x==enc_EXCEPTION) { 7463 return -1; 7464 } 7465 else if (x==enc_FAILED) { 7466 Py_DECREF(repunicode); 7467 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason); 7468 return -1; 7469 } 7470 } 7471 *inpos = newpos; 7472 Py_DECREF(repunicode); 7473 } 7474 return 0; 7475} 7476 7477PyObject * 7478PyUnicode_EncodeCharmap(const Py_UNICODE *p, 7479 Py_ssize_t size, 7480 PyObject *mapping, 7481 const char *errors) 7482{ 7483 /* output object */ 7484 PyObject *res = NULL; 7485 /* current input position */ 7486 Py_ssize_t inpos = 0; 7487 /* current output position */ 7488 Py_ssize_t respos = 0; 7489 PyObject *errorHandler = NULL; 7490 PyObject *exc = NULL; 7491 /* the following variable is used for caching string comparisons 7492 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 7493 * 3=ignore, 4=xmlcharrefreplace */ 7494 int known_errorHandler = -1; 7495 7496 /* Default to Latin-1 */ 7497 if (mapping == NULL) 7498 return PyUnicode_EncodeLatin1(p, size, errors); 7499 7500 /* allocate enough for a simple encoding without 7501 replacements, if we need more, we'll resize */ 7502 res = PyBytes_FromStringAndSize(NULL, size); 7503 if (res == NULL) 7504 goto onError; 7505 if (size == 0) 7506 
return res; 7507 7508 while (inpos<size) { 7509 /* try to encode it */ 7510 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos); 7511 if (x==enc_EXCEPTION) /* error */ 7512 goto onError; 7513 if (x==enc_FAILED) { /* unencodable character */ 7514 if (charmap_encoding_error(p, size, &inpos, mapping, 7515 &exc, 7516 &known_errorHandler, &errorHandler, errors, 7517 &res, &respos)) { 7518 goto onError; 7519 } 7520 } 7521 else 7522 /* done with this character => adjust input position */ 7523 ++inpos; 7524 } 7525 7526 /* Resize if we allocated to much */ 7527 if (respos<PyBytes_GET_SIZE(res)) 7528 if (_PyBytes_Resize(&res, respos) < 0) 7529 goto onError; 7530 7531 Py_XDECREF(exc); 7532 Py_XDECREF(errorHandler); 7533 return res; 7534 7535 onError: 7536 Py_XDECREF(res); 7537 Py_XDECREF(exc); 7538 Py_XDECREF(errorHandler); 7539 return NULL; 7540} 7541 7542PyObject * 7543PyUnicode_AsCharmapString(PyObject *unicode, 7544 PyObject *mapping) 7545{ 7546 if (!PyUnicode_Check(unicode) || mapping == NULL) { 7547 PyErr_BadArgument(); 7548 return NULL; 7549 } 7550 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode), 7551 PyUnicode_GET_SIZE(unicode), 7552 mapping, 7553 NULL); 7554} 7555 7556/* create or adjust a UnicodeTranslateError */ 7557static void 7558make_translate_exception(PyObject **exceptionObject, 7559 PyObject *unicode, 7560 Py_ssize_t startpos, Py_ssize_t endpos, 7561 const char *reason) 7562{ 7563 if (*exceptionObject == NULL) { 7564 *exceptionObject = _PyUnicodeTranslateError_Create( 7565 unicode, startpos, endpos, reason); 7566 } 7567 else { 7568 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos)) 7569 goto onError; 7570 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos)) 7571 goto onError; 7572 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason)) 7573 goto onError; 7574 return; 7575 onError: 7576 Py_DECREF(*exceptionObject); 7577 *exceptionObject = NULL; 7578 } 7579} 7580 7581/* raises a 
UnicodeTranslateError */ 7582static void 7583raise_translate_exception(PyObject **exceptionObject, 7584 PyObject *unicode, 7585 Py_ssize_t startpos, Py_ssize_t endpos, 7586 const char *reason) 7587{ 7588 make_translate_exception(exceptionObject, 7589 unicode, startpos, endpos, reason); 7590 if (*exceptionObject != NULL) 7591 PyCodec_StrictErrors(*exceptionObject); 7592} 7593 7594/* error handling callback helper: 7595 build arguments, call the callback and check the arguments, 7596 put the result into newpos and return the replacement string, which 7597 has to be freed by the caller */ 7598static PyObject * 7599unicode_translate_call_errorhandler(const char *errors, 7600 PyObject **errorHandler, 7601 const char *reason, 7602 PyObject *unicode, PyObject **exceptionObject, 7603 Py_ssize_t startpos, Py_ssize_t endpos, 7604 Py_ssize_t *newpos) 7605{ 7606 static char *argparse = "O!n;translating error handler must return (str, int) tuple"; 7607 7608 Py_ssize_t i_newpos; 7609 PyObject *restuple; 7610 PyObject *resunicode; 7611 7612 if (*errorHandler == NULL) { 7613 *errorHandler = PyCodec_LookupError(errors); 7614 if (*errorHandler == NULL) 7615 return NULL; 7616 } 7617 7618 make_translate_exception(exceptionObject, 7619 unicode, startpos, endpos, reason); 7620 if (*exceptionObject == NULL) 7621 return NULL; 7622 7623 restuple = PyObject_CallFunctionObjArgs( 7624 *errorHandler, *exceptionObject, NULL); 7625 if (restuple == NULL) 7626 return NULL; 7627 if (!PyTuple_Check(restuple)) { 7628 PyErr_SetString(PyExc_TypeError, &argparse[4]); 7629 Py_DECREF(restuple); 7630 return NULL; 7631 } 7632 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, 7633 &resunicode, &i_newpos)) { 7634 Py_DECREF(restuple); 7635 return NULL; 7636 } 7637 if (i_newpos<0) 7638 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos; 7639 else 7640 *newpos = i_newpos; 7641 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) { 7642 PyErr_Format(PyExc_IndexError, "position %zd from error handler out 
of bounds", *newpos); 7643 Py_DECREF(restuple); 7644 return NULL; 7645 } 7646 Py_INCREF(resunicode); 7647 Py_DECREF(restuple); 7648 return resunicode; 7649} 7650 7651/* Lookup the character ch in the mapping and put the result in result, 7652 which must be decrefed by the caller. 7653 Return 0 on success, -1 on error */ 7654static int 7655charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result) 7656{ 7657 PyObject *w = PyLong_FromLong((long)c); 7658 PyObject *x; 7659 7660 if (w == NULL) 7661 return -1; 7662 x = PyObject_GetItem(mapping, w); 7663 Py_DECREF(w); 7664 if (x == NULL) { 7665 if (PyErr_ExceptionMatches(PyExc_LookupError)) { 7666 /* No mapping found means: use 1:1 mapping. */ 7667 PyErr_Clear(); 7668 *result = NULL; 7669 return 0; 7670 } else 7671 return -1; 7672 } 7673 else if (x == Py_None) { 7674 *result = x; 7675 return 0; 7676 } 7677 else if (PyLong_Check(x)) { 7678 long value = PyLong_AS_LONG(x); 7679 long max = PyUnicode_GetMax(); 7680 if (value < 0 || value > max) { 7681 PyErr_Format(PyExc_TypeError, 7682 "character mapping must be in range(0x%x)", max+1); 7683 Py_DECREF(x); 7684 return -1; 7685 } 7686 *result = x; 7687 return 0; 7688 } 7689 else if (PyUnicode_Check(x)) { 7690 *result = x; 7691 return 0; 7692 } 7693 else { 7694 /* wrong return value */ 7695 PyErr_SetString(PyExc_TypeError, 7696 "character mapping must return integer, None or str"); 7697 Py_DECREF(x); 7698 return -1; 7699 } 7700} 7701/* ensure that *outobj is at least requiredsize characters long, 7702 if not reallocate and adjust various state variables. 
7703 Return 0 on success, -1 on error */ 7704static int 7705charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize, 7706 Py_ssize_t requiredsize) 7707{ 7708 Py_ssize_t oldsize = *psize; 7709 if (requiredsize > oldsize) { 7710 /* exponentially overallocate to minimize reallocations */ 7711 if (requiredsize < 2 * oldsize) 7712 requiredsize = 2 * oldsize; 7713 *outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4)); 7714 if (*outobj == 0) 7715 return -1; 7716 *psize = requiredsize; 7717 } 7718 return 0; 7719} 7720/* lookup the character, put the result in the output string and adjust 7721 various state variables. Return a new reference to the object that 7722 was put in the output buffer in *result, or Py_None, if the mapping was 7723 undefined (in which case no character was written). 7724 The called must decref result. 7725 Return 0 on success, -1 on error. */ 7726static int 7727charmaptranslate_output(PyObject *input, Py_ssize_t ipos, 7728 PyObject *mapping, Py_UCS4 **output, 7729 Py_ssize_t *osize, Py_ssize_t *opos, 7730 PyObject **res) 7731{ 7732 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos); 7733 if (charmaptranslate_lookup(curinp, mapping, res)) 7734 return -1; 7735 if (*res==NULL) { 7736 /* not found => default to 1:1 mapping */ 7737 (*output)[(*opos)++] = curinp; 7738 } 7739 else if (*res==Py_None) 7740 ; 7741 else if (PyLong_Check(*res)) { 7742 /* no overflow check, because we know that the space is enough */ 7743 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res); 7744 } 7745 else if (PyUnicode_Check(*res)) { 7746 Py_ssize_t repsize; 7747 if (PyUnicode_READY(*res) == -1) 7748 return -1; 7749 repsize = PyUnicode_GET_LENGTH(*res); 7750 if (repsize==1) { 7751 /* no overflow check, because we know that the space is enough */ 7752 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0); 7753 } 7754 else if (repsize!=0) { 7755 /* more than one character */ 7756 Py_ssize_t requiredsize = *opos + 7757 (PyUnicode_GET_LENGTH(input) - ipos) + 7758 
repsize - 1; 7759 Py_ssize_t i; 7760 if (charmaptranslate_makespace(output, osize, requiredsize)) 7761 return -1; 7762 for(i = 0; i < repsize; i++) 7763 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i); 7764 } 7765 } 7766 else 7767 return -1; 7768 return 0; 7769} 7770 7771PyObject * 7772_PyUnicode_TranslateCharmap(PyObject *input, 7773 PyObject *mapping, 7774 const char *errors) 7775{ 7776 /* input object */ 7777 char *idata; 7778 Py_ssize_t size, i; 7779 int kind; 7780 /* output buffer */ 7781 Py_UCS4 *output = NULL; 7782 Py_ssize_t osize; 7783 PyObject *res; 7784 /* current output position */ 7785 Py_ssize_t opos; 7786 char *reason = "character maps to <undefined>"; 7787 PyObject *errorHandler = NULL; 7788 PyObject *exc = NULL; 7789 /* the following variable is used for caching string comparisons 7790 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 7791 * 3=ignore, 4=xmlcharrefreplace */ 7792 int known_errorHandler = -1; 7793 7794 if (mapping == NULL) { 7795 PyErr_BadArgument(); 7796 return NULL; 7797 } 7798 7799 if (PyUnicode_READY(input) == -1) 7800 return NULL; 7801 idata = (char*)PyUnicode_DATA(input); 7802 kind = PyUnicode_KIND(input); 7803 size = PyUnicode_GET_LENGTH(input); 7804 i = 0; 7805 7806 if (size == 0) { 7807 Py_INCREF(input); 7808 return input; 7809 } 7810 7811 /* allocate enough for a simple 1:1 translation without 7812 replacements, if we need more, we'll resize */ 7813 osize = size; 7814 output = PyMem_Malloc(osize * sizeof(Py_UCS4)); 7815 opos = 0; 7816 if (output == NULL) { 7817 PyErr_NoMemory(); 7818 goto onError; 7819 } 7820 7821 while (i<size) { 7822 /* try to encode it */ 7823 PyObject *x = NULL; 7824 if (charmaptranslate_output(input, i, mapping, 7825 &output, &osize, &opos, &x)) { 7826 Py_XDECREF(x); 7827 goto onError; 7828 } 7829 Py_XDECREF(x); 7830 if (x!=Py_None) /* it worked => adjust input pointer */ 7831 ++i; 7832 else { /* untranslatable character */ 7833 PyObject *repunicode = NULL; /* initialize to prevent gcc warning 
*/ 7834 Py_ssize_t repsize; 7835 Py_ssize_t newpos; 7836 Py_ssize_t uni2; 7837 /* startpos for collecting untranslatable chars */ 7838 Py_ssize_t collstart = i; 7839 Py_ssize_t collend = i+1; 7840 Py_ssize_t coll; 7841 7842 /* find all untranslatable characters */ 7843 while (collend < size) { 7844 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x)) 7845 goto onError; 7846 Py_XDECREF(x); 7847 if (x!=Py_None) 7848 break; 7849 ++collend; 7850 } 7851 /* cache callback name lookup 7852 * (if not done yet, i.e. it's the first error) */ 7853 if (known_errorHandler==-1) { 7854 if ((errors==NULL) || (!strcmp(errors, "strict"))) 7855 known_errorHandler = 1; 7856 else if (!strcmp(errors, "replace")) 7857 known_errorHandler = 2; 7858 else if (!strcmp(errors, "ignore")) 7859 known_errorHandler = 3; 7860 else if (!strcmp(errors, "xmlcharrefreplace")) 7861 known_errorHandler = 4; 7862 else 7863 known_errorHandler = 0; 7864 } 7865 switch (known_errorHandler) { 7866 case 1: /* strict */ 7867 raise_translate_exception(&exc, input, collstart, 7868 collend, reason); 7869 goto onError; 7870 case 2: /* replace */ 7871 /* No need to check for space, this is a 1:1 replacement */ 7872 for (coll = collstart; coll<collend; coll++) 7873 output[opos++] = '?'; 7874 /* fall through */ 7875 case 3: /* ignore */ 7876 i = collend; 7877 break; 7878 case 4: /* xmlcharrefreplace */ 7879 /* generate replacement (temporarily (mis)uses i) */ 7880 for (i = collstart; i < collend; ++i) { 7881 char buffer[2+29+1+1]; 7882 char *cp; 7883 sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i)); 7884 if (charmaptranslate_makespace(&output, &osize, 7885 opos+strlen(buffer)+(size-collend))) 7886 goto onError; 7887 for (cp = buffer; *cp; ++cp) 7888 output[opos++] = *cp; 7889 } 7890 i = collend; 7891 break; 7892 default: 7893 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler, 7894 reason, input, &exc, 7895 collstart, collend, &newpos); 7896 if (repunicode == NULL || 
_PyUnicode_READY_REPLACE(&repunicode)) 7897 goto onError; 7898 /* generate replacement */ 7899 repsize = PyUnicode_GET_LENGTH(repunicode); 7900 if (charmaptranslate_makespace(&output, &osize, 7901 opos+repsize+(size-collend))) { 7902 Py_DECREF(repunicode); 7903 goto onError; 7904 } 7905 for (uni2 = 0; repsize-->0; ++uni2) 7906 output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2); 7907 i = newpos; 7908 Py_DECREF(repunicode); 7909 } 7910 } 7911 } 7912 res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos); 7913 if (!res) 7914 goto onError; 7915 PyMem_Free(output); 7916 Py_XDECREF(exc); 7917 Py_XDECREF(errorHandler); 7918 return res; 7919 7920 onError: 7921 PyMem_Free(output); 7922 Py_XDECREF(exc); 7923 Py_XDECREF(errorHandler); 7924 return NULL; 7925} 7926 7927/* Deprecated. Use PyUnicode_Translate instead. */ 7928PyObject * 7929PyUnicode_TranslateCharmap(const Py_UNICODE *p, 7930 Py_ssize_t size, 7931 PyObject *mapping, 7932 const char *errors) 7933{ 7934 PyObject *unicode = PyUnicode_FromUnicode(p, size); 7935 if (!unicode) 7936 return NULL; 7937 return _PyUnicode_TranslateCharmap(unicode, mapping, errors); 7938} 7939 7940PyObject * 7941PyUnicode_Translate(PyObject *str, 7942 PyObject *mapping, 7943 const char *errors) 7944{ 7945 PyObject *result; 7946 7947 str = PyUnicode_FromObject(str); 7948 if (str == NULL) 7949 goto onError; 7950 result = _PyUnicode_TranslateCharmap(str, mapping, errors); 7951 Py_DECREF(str); 7952 return result; 7953 7954 onError: 7955 Py_XDECREF(str); 7956 return NULL; 7957} 7958 7959static Py_UCS4 7960fix_decimal_and_space_to_ascii(PyUnicodeObject *self) 7961{ 7962 /* No need to call PyUnicode_READY(self) because this function is only 7963 called as a callback from fixup() which does it already. 
*/ 7964 const Py_ssize_t len = PyUnicode_GET_LENGTH(self); 7965 const int kind = PyUnicode_KIND(self); 7966 void *data = PyUnicode_DATA(self); 7967 Py_UCS4 maxchar = 0, ch, fixed; 7968 Py_ssize_t i; 7969 7970 for (i = 0; i < len; ++i) { 7971 ch = PyUnicode_READ(kind, data, i); 7972 fixed = 0; 7973 if (ch > 127) { 7974 if (Py_UNICODE_ISSPACE(ch)) 7975 fixed = ' '; 7976 else { 7977 const int decimal = Py_UNICODE_TODECIMAL(ch); 7978 if (decimal >= 0) 7979 fixed = '0' + decimal; 7980 } 7981 if (fixed != 0) { 7982 if (fixed > maxchar) 7983 maxchar = fixed; 7984 PyUnicode_WRITE(kind, data, i, fixed); 7985 } 7986 else if (ch > maxchar) 7987 maxchar = ch; 7988 } 7989 else if (ch > maxchar) 7990 maxchar = ch; 7991 } 7992 7993 return maxchar; 7994} 7995 7996PyObject * 7997_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode) 7998{ 7999 if (!PyUnicode_Check(unicode)) { 8000 PyErr_BadInternalCall(); 8001 return NULL; 8002 } 8003 if (PyUnicode_READY(unicode) == -1) 8004 return NULL; 8005 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) { 8006 /* If the string is already ASCII, just return the same string */ 8007 Py_INCREF(unicode); 8008 return unicode; 8009 } 8010 return fixup((PyUnicodeObject *)unicode, fix_decimal_and_space_to_ascii); 8011} 8012 8013PyObject * 8014PyUnicode_TransformDecimalToASCII(Py_UNICODE *s, 8015 Py_ssize_t length) 8016{ 8017 PyObject *result; 8018 Py_UNICODE *p; /* write pointer into result */ 8019 Py_ssize_t i; 8020 /* Copy to a new string */ 8021 result = (PyObject *)_PyUnicode_New(length); 8022 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(result), s, length); 8023 if (result == NULL) 8024 return result; 8025 p = PyUnicode_AS_UNICODE(result); 8026 /* Iterate over code points */ 8027 for (i = 0; i < length; i++) { 8028 Py_UNICODE ch =s[i]; 8029 if (ch > 127) { 8030 int decimal = Py_UNICODE_TODECIMAL(ch); 8031 if (decimal >= 0) 8032 p[i] = '0' + decimal; 8033 } 8034 } 8035 if (PyUnicode_READY((PyUnicodeObject*)result) == -1) { 8036 Py_DECREF(result); 8037 
return NULL; 8038 } 8039 return result; 8040} 8041/* --- Decimal Encoder ---------------------------------------------------- */ 8042 8043int 8044PyUnicode_EncodeDecimal(Py_UNICODE *s, 8045 Py_ssize_t length, 8046 char *output, 8047 const char *errors) 8048{ 8049 Py_UNICODE *p, *end; 8050 PyObject *errorHandler = NULL; 8051 PyObject *exc = NULL; 8052 const char *encoding = "decimal"; 8053 const char *reason = "invalid decimal Unicode string"; 8054 /* the following variable is used for caching string comparisons 8055 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */ 8056 int known_errorHandler = -1; 8057 8058 if (output == NULL) { 8059 PyErr_BadArgument(); 8060 return -1; 8061 } 8062 8063 p = s; 8064 end = s + length; 8065 while (p < end) { 8066 register Py_UNICODE ch = *p; 8067 int decimal; 8068 PyObject *repunicode; 8069 Py_ssize_t repsize; 8070 Py_ssize_t newpos; 8071 Py_UNICODE *uni2; 8072 Py_UNICODE *collstart; 8073 Py_UNICODE *collend; 8074 8075 if (Py_UNICODE_ISSPACE(ch)) { 8076 *output++ = ' '; 8077 ++p; 8078 continue; 8079 } 8080 decimal = Py_UNICODE_TODECIMAL(ch); 8081 if (decimal >= 0) { 8082 *output++ = '0' + decimal; 8083 ++p; 8084 continue; 8085 } 8086 if (0 < ch && ch < 256) { 8087 *output++ = (char)ch; 8088 ++p; 8089 continue; 8090 } 8091 /* All other characters are considered unencodable */ 8092 collstart = p; 8093 collend = p+1; 8094 while (collend < end) { 8095 if ((0 < *collend && *collend < 256) || 8096 !Py_UNICODE_ISSPACE(*collend) || 8097 Py_UNICODE_TODECIMAL(*collend)) 8098 break; 8099 } 8100 /* cache callback name lookup 8101 * (if not done yet, i.e. 
it's the first error) */ 8102 if (known_errorHandler==-1) { 8103 if ((errors==NULL) || (!strcmp(errors, "strict"))) 8104 known_errorHandler = 1; 8105 else if (!strcmp(errors, "replace")) 8106 known_errorHandler = 2; 8107 else if (!strcmp(errors, "ignore")) 8108 known_errorHandler = 3; 8109 else if (!strcmp(errors, "xmlcharrefreplace")) 8110 known_errorHandler = 4; 8111 else 8112 known_errorHandler = 0; 8113 } 8114 switch (known_errorHandler) { 8115 case 1: /* strict */ 8116 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason); 8117 goto onError; 8118 case 2: /* replace */ 8119 for (p = collstart; p < collend; ++p) 8120 *output++ = '?'; 8121 /* fall through */ 8122 case 3: /* ignore */ 8123 p = collend; 8124 break; 8125 case 4: /* xmlcharrefreplace */ 8126 /* generate replacement (temporarily (mis)uses p) */ 8127 for (p = collstart; p < collend; ++p) 8128 output += sprintf(output, "&#%d;", (int)*p); 8129 p = collend; 8130 break; 8131 default: 8132 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler, 8133 encoding, reason, s, length, &exc, 8134 collstart-s, collend-s, &newpos); 8135 if (repunicode == NULL) 8136 goto onError; 8137 if (!PyUnicode_Check(repunicode)) { 8138 /* Byte results not supported, since they have no decimal property. 
*/ 8139 PyErr_SetString(PyExc_TypeError, "error handler should return unicode"); 8140 Py_DECREF(repunicode); 8141 goto onError; 8142 } 8143 /* generate replacement */ 8144 repsize = PyUnicode_GET_SIZE(repunicode); 8145 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) { 8146 Py_UNICODE ch = *uni2; 8147 if (Py_UNICODE_ISSPACE(ch)) 8148 *output++ = ' '; 8149 else { 8150 decimal = Py_UNICODE_TODECIMAL(ch); 8151 if (decimal >= 0) 8152 *output++ = '0' + decimal; 8153 else if (0 < ch && ch < 256) 8154 *output++ = (char)ch; 8155 else { 8156 Py_DECREF(repunicode); 8157 raise_encode_exception(&exc, encoding, 8158 s, length, collstart-s, collend-s, reason); 8159 goto onError; 8160 } 8161 } 8162 } 8163 p = s + newpos; 8164 Py_DECREF(repunicode); 8165 } 8166 } 8167 /* 0-terminate the output string */ 8168 *output++ = '\0'; 8169 Py_XDECREF(exc); 8170 Py_XDECREF(errorHandler); 8171 return 0; 8172 8173 onError: 8174 Py_XDECREF(exc); 8175 Py_XDECREF(errorHandler); 8176 return -1; 8177} 8178 8179/* --- Helpers ------------------------------------------------------------ */ 8180 8181#include "stringlib/ucs1lib.h" 8182#include "stringlib/fastsearch.h" 8183#include "stringlib/partition.h" 8184#include "stringlib/split.h" 8185#include "stringlib/count.h" 8186#include "stringlib/find.h" 8187#include "stringlib/localeutil.h" 8188#include "stringlib/undef.h" 8189 8190#include "stringlib/ucs2lib.h" 8191#include "stringlib/fastsearch.h" 8192#include "stringlib/partition.h" 8193#include "stringlib/split.h" 8194#include "stringlib/count.h" 8195#include "stringlib/find.h" 8196#include "stringlib/localeutil.h" 8197#include "stringlib/undef.h" 8198 8199#include "stringlib/ucs4lib.h" 8200#include "stringlib/fastsearch.h" 8201#include "stringlib/partition.h" 8202#include "stringlib/split.h" 8203#include "stringlib/count.h" 8204#include "stringlib/find.h" 8205#include "stringlib/localeutil.h" 8206#include "stringlib/undef.h" 8207 8208static Py_ssize_t 8209any_find_slice(Py_ssize_t 
Py_LOCAL_CALLBACK(ucs1)(const Py_UCS1*, Py_ssize_t,
                                              const Py_UCS1*, Py_ssize_t,
                                              Py_ssize_t, Py_ssize_t),
               Py_ssize_t Py_LOCAL_CALLBACK(ucs2)(const Py_UCS2*, Py_ssize_t,
                                              const Py_UCS2*, Py_ssize_t,
                                              Py_ssize_t, Py_ssize_t),
               Py_ssize_t Py_LOCAL_CALLBACK(ucs4)(const Py_UCS4*, Py_ssize_t,
                                              const Py_UCS4*, Py_ssize_t,
                                              Py_ssize_t, Py_ssize_t),
               PyObject* s1, PyObject* s2,
               Py_ssize_t start,
               Py_ssize_t end)
{
    /* Dispatch a find-slice search of s2 in s1 to the callback matching
       the wider of the two string kinds.  The narrower string is first
       widened into a temporary buffer.  Returns the callback's result,
       or -2 when a temporary buffer could not be obtained. */
    int kind1, kind2, kind;
    void *buf1, *buf2;
    Py_ssize_t len1, len2, result;

    kind1 = PyUnicode_KIND(s1);
    kind2 = PyUnicode_KIND(s2);
    if (kind1 > kind2)
        kind = kind1;
    else
        kind = kind2;

    buf1 = PyUnicode_DATA(s1);
    buf2 = PyUnicode_DATA(s2);
    if (kind1 != kind) {
        buf1 = _PyUnicode_AsKind(s1, kind);
    }
    if (buf1 == NULL) {
        return -2;
    }
    if (kind2 != kind) {
        buf2 = _PyUnicode_AsKind(s2, kind);
    }
    if (buf2 == NULL) {
        /* buf1 is only owned here when it is a widened copy */
        if (kind1 != kind)
            PyMem_Free(buf1);
        return -2;
    }
    len1 = PyUnicode_GET_LENGTH(s1);
    len2 = PyUnicode_GET_LENGTH(s2);

    switch(kind) {
    case PyUnicode_1BYTE_KIND:
        result = ucs1(buf1, len1, buf2, len2, start, end);
        break;
    case PyUnicode_2BYTE_KIND:
        result = ucs2(buf1, len1, buf2, len2, start, end);
        break;
    case PyUnicode_4BYTE_KIND:
        result = ucs4(buf1, len1, buf2, len2, start, end);
        break;
    default:
        assert(0);
        result = -2;
    }

    /* release whichever buffers are temporary widened copies */
    if (kind1 != kind) {
        PyMem_Free(buf1);
    }
    if (kind2 != kind) {
        PyMem_Free(buf2);
    }

    return result;
}

/* Forward to the InsertThousandsGrouping implementation that matches
   kind.  Returns -1 (after a failed assert) for an unknown kind. */
Py_ssize_t
_PyUnicode_InsertThousandsGrouping(int kind, void *data,
                                   Py_ssize_t n_buffer,
                                   void *digits, Py_ssize_t n_digits,
                                   Py_ssize_t min_width,
                                   const char *grouping,
                                   const char *thousands_sep)
{
    switch(kind) {
    case PyUnicode_1BYTE_KIND:
        return _PyUnicode_ucs1_InsertThousandsGrouping(
            (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
            min_width, grouping, thousands_sep);
8279 case PyUnicode_2BYTE_KIND: 8280 return _PyUnicode_ucs2_InsertThousandsGrouping( 8281 (Py_UCS2*)data, n_buffer, (Py_UCS2*)digits, n_digits, 8282 min_width, grouping, thousands_sep); 8283 case PyUnicode_4BYTE_KIND: 8284 return _PyUnicode_ucs4_InsertThousandsGrouping( 8285 (Py_UCS4*)data, n_buffer, (Py_UCS4*)digits, n_digits, 8286 min_width, grouping, thousands_sep); 8287 } 8288 assert(0); 8289 return -1; 8290} 8291 8292 8293#include "stringlib/unicodedefs.h" 8294#include "stringlib/fastsearch.h" 8295 8296#include "stringlib/count.h" 8297#include "stringlib/find.h" 8298 8299/* helper macro to fixup start/end slice values */ 8300#define ADJUST_INDICES(start, end, len) \ 8301 if (end > len) \ 8302 end = len; \ 8303 else if (end < 0) { \ 8304 end += len; \ 8305 if (end < 0) \ 8306 end = 0; \ 8307 } \ 8308 if (start < 0) { \ 8309 start += len; \ 8310 if (start < 0) \ 8311 start = 0; \ 8312 } 8313 8314Py_ssize_t 8315PyUnicode_Count(PyObject *str, 8316 PyObject *substr, 8317 Py_ssize_t start, 8318 Py_ssize_t end) 8319{ 8320 Py_ssize_t result; 8321 PyUnicodeObject* str_obj; 8322 PyUnicodeObject* sub_obj; 8323 int kind1, kind2, kind; 8324 void *buf1 = NULL, *buf2 = NULL; 8325 Py_ssize_t len1, len2; 8326 8327 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str); 8328 if (!str_obj || PyUnicode_READY(str_obj) == -1) 8329 return -1; 8330 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr); 8331 if (!sub_obj || PyUnicode_READY(sub_obj) == -1) { 8332 Py_DECREF(str_obj); 8333 return -1; 8334 } 8335 8336 kind1 = PyUnicode_KIND(str_obj); 8337 kind2 = PyUnicode_KIND(sub_obj); 8338 kind = kind1 > kind2 ? 
kind1 : kind2; 8339 buf1 = PyUnicode_DATA(str_obj); 8340 if (kind1 != kind) 8341 buf1 = _PyUnicode_AsKind((PyObject*)str_obj, kind); 8342 if (!buf1) 8343 goto onError; 8344 buf2 = PyUnicode_DATA(sub_obj); 8345 if (kind2 != kind) 8346 buf2 = _PyUnicode_AsKind((PyObject*)sub_obj, kind); 8347 if (!buf2) 8348 goto onError; 8349 len1 = PyUnicode_GET_LENGTH(str_obj); 8350 len2 = PyUnicode_GET_LENGTH(sub_obj); 8351 8352 ADJUST_INDICES(start, end, len1); 8353 switch(kind) { 8354 case PyUnicode_1BYTE_KIND: 8355 result = ucs1lib_count( 8356 ((Py_UCS1*)buf1) + start, end - start, 8357 buf2, len2, PY_SSIZE_T_MAX 8358 ); 8359 break; 8360 case PyUnicode_2BYTE_KIND: 8361 result = ucs2lib_count( 8362 ((Py_UCS2*)buf1) + start, end - start, 8363 buf2, len2, PY_SSIZE_T_MAX 8364 ); 8365 break; 8366 case PyUnicode_4BYTE_KIND: 8367 result = ucs4lib_count( 8368 ((Py_UCS4*)buf1) + start, end - start, 8369 buf2, len2, PY_SSIZE_T_MAX 8370 ); 8371 break; 8372 default: 8373 assert(0); result = 0; 8374 } 8375 8376 Py_DECREF(sub_obj); 8377 Py_DECREF(str_obj); 8378 8379 if (kind1 != kind) 8380 PyMem_Free(buf1); 8381 if (kind2 != kind) 8382 PyMem_Free(buf2); 8383 8384 return result; 8385 onError: 8386 Py_DECREF(sub_obj); 8387 Py_DECREF(str_obj); 8388 if (kind1 != kind && buf1) 8389 PyMem_Free(buf1); 8390 if (kind2 != kind && buf2) 8391 PyMem_Free(buf2); 8392 return -1; 8393} 8394 8395Py_ssize_t 8396PyUnicode_Find(PyObject *str, 8397 PyObject *sub, 8398 Py_ssize_t start, 8399 Py_ssize_t end, 8400 int direction) 8401{ 8402 Py_ssize_t result; 8403 8404 str = PyUnicode_FromObject(str); 8405 if (!str || PyUnicode_READY(str) == -1) 8406 return -2; 8407 sub = PyUnicode_FromObject(sub); 8408 if (!sub || PyUnicode_READY(sub) == -1) { 8409 Py_DECREF(str); 8410 return -2; 8411 } 8412 8413 if (direction > 0) 8414 result = any_find_slice( 8415 ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice, 8416 str, sub, start, end 8417 ); 8418 else 8419 result = any_find_slice( 8420 ucs1lib_rfind_slice, 
ucs2lib_rfind_slice, ucs4lib_rfind_slice, 8421 str, sub, start, end 8422 ); 8423 8424 Py_DECREF(str); 8425 Py_DECREF(sub); 8426 8427 return result; 8428} 8429 8430Py_ssize_t 8431PyUnicode_FindChar(PyObject *str, Py_UCS4 ch, 8432 Py_ssize_t start, Py_ssize_t end, 8433 int direction) 8434{ 8435 char *result; 8436 int kind; 8437 if (PyUnicode_READY(str) == -1) 8438 return -2; 8439 if (start < 0 || end < 0) { 8440 PyErr_SetString(PyExc_IndexError, "string index out of range"); 8441 return -2; 8442 } 8443 if (end > PyUnicode_GET_LENGTH(str)) 8444 end = PyUnicode_GET_LENGTH(str); 8445 kind = PyUnicode_KIND(str); 8446 result = findchar(PyUnicode_1BYTE_DATA(str) 8447 + PyUnicode_KIND_SIZE(kind, start), 8448 kind, 8449 end-start, ch, direction); 8450 if (!result) 8451 return -1; 8452 return (result-(char*)PyUnicode_DATA(str)) >> (kind-1); 8453} 8454 8455static int 8456tailmatch(PyUnicodeObject *self, 8457 PyUnicodeObject *substring, 8458 Py_ssize_t start, 8459 Py_ssize_t end, 8460 int direction) 8461{ 8462 int kind_self; 8463 int kind_sub; 8464 void *data_self; 8465 void *data_sub; 8466 Py_ssize_t offset; 8467 Py_ssize_t i; 8468 Py_ssize_t end_sub; 8469 8470 if (PyUnicode_READY(self) == -1 || 8471 PyUnicode_READY(substring) == -1) 8472 return 0; 8473 8474 if (PyUnicode_GET_LENGTH(substring) == 0) 8475 return 1; 8476 8477 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self)); 8478 end -= PyUnicode_GET_LENGTH(substring); 8479 if (end < start) 8480 return 0; 8481 8482 kind_self = PyUnicode_KIND(self); 8483 data_self = PyUnicode_DATA(self); 8484 kind_sub = PyUnicode_KIND(substring); 8485 data_sub = PyUnicode_DATA(substring); 8486 end_sub = PyUnicode_GET_LENGTH(substring) - 1; 8487 8488 if (direction > 0) 8489 offset = end; 8490 else 8491 offset = start; 8492 8493 if (PyUnicode_READ(kind_self, data_self, offset) == 8494 PyUnicode_READ(kind_sub, data_sub, 0) && 8495 PyUnicode_READ(kind_self, data_self, offset + end_sub) == 8496 PyUnicode_READ(kind_sub, data_sub, end_sub)) { 8497 
/* If both are of the same kind, memcmp is sufficient */ 8498 if (kind_self == kind_sub) { 8499 return ! memcmp((char *)data_self + 8500 (offset * PyUnicode_CHARACTER_SIZE(substring)), 8501 data_sub, 8502 PyUnicode_GET_LENGTH(substring) * 8503 PyUnicode_CHARACTER_SIZE(substring)); 8504 } 8505 /* otherwise we have to compare each character by first accesing it */ 8506 else { 8507 /* We do not need to compare 0 and len(substring)-1 because 8508 the if statement above ensured already that they are equal 8509 when we end up here. */ 8510 // TODO: honor direction and do a forward or backwards search 8511 for (i = 1; i < end_sub; ++i) { 8512 if (PyUnicode_READ(kind_self, data_self, offset + i) != 8513 PyUnicode_READ(kind_sub, data_sub, i)) 8514 return 0; 8515 } 8516 return 1; 8517 } 8518 } 8519 8520 return 0; 8521} 8522 8523Py_ssize_t 8524PyUnicode_Tailmatch(PyObject *str, 8525 PyObject *substr, 8526 Py_ssize_t start, 8527 Py_ssize_t end, 8528 int direction) 8529{ 8530 Py_ssize_t result; 8531 8532 str = PyUnicode_FromObject(str); 8533 if (str == NULL) 8534 return -1; 8535 substr = PyUnicode_FromObject(substr); 8536 if (substr == NULL) { 8537 Py_DECREF(str); 8538 return -1; 8539 } 8540 8541 result = tailmatch((PyUnicodeObject *)str, 8542 (PyUnicodeObject *)substr, 8543 start, end, direction); 8544 Py_DECREF(str); 8545 Py_DECREF(substr); 8546 return result; 8547} 8548 8549/* Apply fixfct filter to the Unicode object self and return a 8550 reference to the modified object */ 8551 8552static PyObject * 8553fixup(PyUnicodeObject *self, 8554 Py_UCS4 (*fixfct)(PyUnicodeObject *s)) 8555{ 8556 PyObject *u; 8557 Py_UCS4 maxchar_old, maxchar_new = 0; 8558 8559 if (PyUnicode_READY(self) == -1) 8560 return NULL; 8561 maxchar_old = PyUnicode_MAX_CHAR_VALUE(self); 8562 u = PyUnicode_New(PyUnicode_GET_LENGTH(self), 8563 maxchar_old); 8564 if (u == NULL) 8565 return NULL; 8566 8567 Py_MEMCPY(PyUnicode_1BYTE_DATA(u), PyUnicode_1BYTE_DATA(self), 8568 PyUnicode_GET_LENGTH(u) * 
PyUnicode_CHARACTER_SIZE(u)); 8569 8570 /* fix functions return the new maximum character in a string, 8571 if the kind of the resulting unicode object does not change, 8572 everything is fine. Otherwise we need to change the string kind 8573 and re-run the fix function. */ 8574 maxchar_new = fixfct((PyUnicodeObject*)u); 8575 if (maxchar_new == 0) 8576 /* do nothing, keep maxchar_new at 0 which means no changes. */; 8577 else if (maxchar_new <= 127) 8578 maxchar_new = 127; 8579 else if (maxchar_new <= 255) 8580 maxchar_new = 255; 8581 else if (maxchar_new <= 65535) 8582 maxchar_new = 65535; 8583 else 8584 maxchar_new = 1114111; /* 0x10ffff */ 8585 8586 if (!maxchar_new && PyUnicode_CheckExact(self)) { 8587 /* fixfct should return TRUE if it modified the buffer. If 8588 FALSE, return a reference to the original buffer instead 8589 (to save space, not time) */ 8590 Py_INCREF(self); 8591 Py_DECREF(u); 8592 return (PyObject*) self; 8593 } 8594 else if (maxchar_new == maxchar_old) { 8595 return u; 8596 } 8597 else { 8598 /* In case the maximum character changed, we need to 8599 convert the string to the new category. */ 8600 PyObject *v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new); 8601 if (v == NULL) { 8602 Py_DECREF(u); 8603 return NULL; 8604 } 8605 if (maxchar_new > maxchar_old) { 8606 /* If the maxchar increased so that the kind changed, not all 8607 characters are representable anymore and we need to fix the 8608 string again. This only happens in very few cases. 
*/ 8609 if (PyUnicode_CopyCharacters(v, 0, 8610 (PyObject*)self, 0, 8611 PyUnicode_GET_LENGTH(self)) < 0) 8612 { 8613 Py_DECREF(u); 8614 return NULL; 8615 } 8616 maxchar_old = fixfct((PyUnicodeObject*)v); 8617 assert(maxchar_old > 0 && maxchar_old <= maxchar_new); 8618 } 8619 else { 8620 if (PyUnicode_CopyCharacters(v, 0, 8621 u, 0, 8622 PyUnicode_GET_LENGTH(self)) < 0) 8623 { 8624 Py_DECREF(u); 8625 return NULL; 8626 } 8627 } 8628 8629 Py_DECREF(u); 8630 return v; 8631 } 8632} 8633 8634static Py_UCS4 8635fixupper(PyUnicodeObject *self) 8636{ 8637 /* No need to call PyUnicode_READY(self) because this function is only 8638 called as a callback from fixup() which does it already. */ 8639 const Py_ssize_t len = PyUnicode_GET_LENGTH(self); 8640 const int kind = PyUnicode_KIND(self); 8641 void *data = PyUnicode_DATA(self); 8642 int touched = 0; 8643 Py_UCS4 maxchar = 0; 8644 Py_ssize_t i; 8645 8646 for (i = 0; i < len; ++i) { 8647 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); 8648 const Py_UCS4 up = Py_UNICODE_TOUPPER(ch); 8649 if (up != ch) { 8650 if (up > maxchar) 8651 maxchar = up; 8652 PyUnicode_WRITE(kind, data, i, up); 8653 touched = 1; 8654 } 8655 else if (ch > maxchar) 8656 maxchar = ch; 8657 } 8658 8659 if (touched) 8660 return maxchar; 8661 else 8662 return 0; 8663} 8664 8665static Py_UCS4 8666fixlower(PyUnicodeObject *self) 8667{ 8668 /* No need to call PyUnicode_READY(self) because fixup() which does it. 
*/ 8669 const Py_ssize_t len = PyUnicode_GET_LENGTH(self); 8670 const int kind = PyUnicode_KIND(self); 8671 void *data = PyUnicode_DATA(self); 8672 int touched = 0; 8673 Py_UCS4 maxchar = 0; 8674 Py_ssize_t i; 8675 8676 for(i = 0; i < len; ++i) { 8677 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); 8678 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch); 8679 if (lo != ch) { 8680 if (lo > maxchar) 8681 maxchar = lo; 8682 PyUnicode_WRITE(kind, data, i, lo); 8683 touched = 1; 8684 } 8685 else if (ch > maxchar) 8686 maxchar = ch; 8687 } 8688 8689 if (touched) 8690 return maxchar; 8691 else 8692 return 0; 8693} 8694 8695static Py_UCS4 8696fixswapcase(PyUnicodeObject *self) 8697{ 8698 /* No need to call PyUnicode_READY(self) because fixup() which does it. */ 8699 const Py_ssize_t len = PyUnicode_GET_LENGTH(self); 8700 const int kind = PyUnicode_KIND(self); 8701 void *data = PyUnicode_DATA(self); 8702 int touched = 0; 8703 Py_UCS4 maxchar = 0; 8704 Py_ssize_t i; 8705 8706 for(i = 0; i < len; ++i) { 8707 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); 8708 Py_UCS4 nu = 0; 8709 8710 if (Py_UNICODE_ISUPPER(ch)) 8711 nu = Py_UNICODE_TOLOWER(ch); 8712 else if (Py_UNICODE_ISLOWER(ch)) 8713 nu = Py_UNICODE_TOUPPER(ch); 8714 8715 if (nu != 0) { 8716 if (nu > maxchar) 8717 maxchar = nu; 8718 PyUnicode_WRITE(kind, data, i, nu); 8719 touched = 1; 8720 } 8721 else if (ch > maxchar) 8722 maxchar = ch; 8723 } 8724 8725 if (touched) 8726 return maxchar; 8727 else 8728 return 0; 8729} 8730 8731static Py_UCS4 8732fixcapitalize(PyUnicodeObject *self) 8733{ 8734 /* No need to call PyUnicode_READY(self) because fixup() which does it. 
*/ 8735 const Py_ssize_t len = PyUnicode_GET_LENGTH(self); 8736 const int kind = PyUnicode_KIND(self); 8737 void *data = PyUnicode_DATA(self); 8738 int touched = 0; 8739 Py_UCS4 maxchar = 0; 8740 Py_ssize_t i = 0; 8741 Py_UCS4 ch; 8742 8743 if (len == 0) 8744 return 0; 8745 8746 ch = PyUnicode_READ(kind, data, i); 8747 if (!Py_UNICODE_ISUPPER(ch)) { 8748 maxchar = Py_UNICODE_TOUPPER(ch); 8749 PyUnicode_WRITE(kind, data, i, maxchar); 8750 touched = 1; 8751 } 8752 ++i; 8753 for(; i < len; ++i) { 8754 ch = PyUnicode_READ(kind, data, i); 8755 if (!Py_UNICODE_ISLOWER(ch)) { 8756 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch); 8757 if (lo > maxchar) 8758 maxchar = lo; 8759 PyUnicode_WRITE(kind, data, i, lo); 8760 touched = 1; 8761 } 8762 else if (ch > maxchar) 8763 maxchar = ch; 8764 } 8765 8766 if (touched) 8767 return maxchar; 8768 else 8769 return 0; 8770} 8771 8772static Py_UCS4 8773fixtitle(PyUnicodeObject *self) 8774{ 8775 /* No need to call PyUnicode_READY(self) because fixup() which does it. 
*/ 8776 const Py_ssize_t len = PyUnicode_GET_LENGTH(self); 8777 const int kind = PyUnicode_KIND(self); 8778 void *data = PyUnicode_DATA(self); 8779 Py_UCS4 maxchar = 0; 8780 Py_ssize_t i = 0; 8781 int previous_is_cased; 8782 8783 /* Shortcut for single character strings */ 8784 if (len == 1) { 8785 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); 8786 const Py_UCS4 ti = Py_UNICODE_TOTITLE(ch); 8787 if (ti != ch) { 8788 PyUnicode_WRITE(kind, data, i, ti); 8789 return ti; 8790 } 8791 else 8792 return 0; 8793 } 8794 previous_is_cased = 0; 8795 for(; i < len; ++i) { 8796 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); 8797 Py_UCS4 nu; 8798 8799 if (previous_is_cased) 8800 nu = Py_UNICODE_TOLOWER(ch); 8801 else 8802 nu = Py_UNICODE_TOTITLE(ch); 8803 8804 if (nu > maxchar) 8805 maxchar = nu; 8806 PyUnicode_WRITE(kind, data, i, nu); 8807 8808 if (Py_UNICODE_ISLOWER(ch) || 8809 Py_UNICODE_ISUPPER(ch) || 8810 Py_UNICODE_ISTITLE(ch)) 8811 previous_is_cased = 1; 8812 else 8813 previous_is_cased = 0; 8814 } 8815 return maxchar; 8816} 8817 8818PyObject * 8819PyUnicode_Join(PyObject *separator, PyObject *seq) 8820{ 8821 PyObject *sep = NULL; 8822 Py_ssize_t seplen = 1; 8823 PyObject *res = NULL; /* the result */ 8824 PyObject *fseq; /* PySequence_Fast(seq) */ 8825 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */ 8826 PyObject **items; 8827 PyObject *item; 8828 Py_ssize_t sz, i, res_offset; 8829 Py_UCS4 maxchar = 0; 8830 Py_UCS4 item_maxchar; 8831 8832 fseq = PySequence_Fast(seq, ""); 8833 if (fseq == NULL) { 8834 return NULL; 8835 } 8836 8837 /* NOTE: the following code can't call back into Python code, 8838 * so we are sure that fseq won't be mutated. 8839 */ 8840 8841 seqlen = PySequence_Fast_GET_SIZE(fseq); 8842 /* If empty sequence, return u"". */ 8843 if (seqlen == 0) { 8844 res = PyUnicode_New(0, 0); 8845 goto Done; 8846 } 8847 items = PySequence_Fast_ITEMS(fseq); 8848 /* If singleton sequence with an exact Unicode, return that. 
*/ 8849 if (seqlen == 1) { 8850 item = items[0]; 8851 if (PyUnicode_CheckExact(item)) { 8852 Py_INCREF(item); 8853 res = item; 8854 goto Done; 8855 } 8856 } 8857 else { 8858 /* Set up sep and seplen */ 8859 if (separator == NULL) { 8860 /* fall back to a blank space separator */ 8861 sep = PyUnicode_FromOrdinal(' '); 8862 if (!sep) 8863 goto onError; 8864 } 8865 else { 8866 if (!PyUnicode_Check(separator)) { 8867 PyErr_Format(PyExc_TypeError, 8868 "separator: expected str instance," 8869 " %.80s found", 8870 Py_TYPE(separator)->tp_name); 8871 goto onError; 8872 } 8873 if (PyUnicode_READY(separator)) 8874 goto onError; 8875 sep = separator; 8876 seplen = PyUnicode_GET_LENGTH(separator); 8877 maxchar = PyUnicode_MAX_CHAR_VALUE(separator); 8878 /* inc refcount to keep this code path symetric with the 8879 above case of a blank separator */ 8880 Py_INCREF(sep); 8881 } 8882 } 8883 8884 /* There are at least two things to join, or else we have a subclass 8885 * of str in the sequence. 8886 * Do a pre-pass to figure out the total amount of space we'll 8887 * need (sz), and see whether all argument are strings. 8888 */ 8889 sz = 0; 8890 for (i = 0; i < seqlen; i++) { 8891 const Py_ssize_t old_sz = sz; 8892 item = items[i]; 8893 if (!PyUnicode_Check(item)) { 8894 PyErr_Format(PyExc_TypeError, 8895 "sequence item %zd: expected str instance," 8896 " %.80s found", 8897 i, Py_TYPE(item)->tp_name); 8898 goto onError; 8899 } 8900 if (PyUnicode_READY(item) == -1) 8901 goto onError; 8902 sz += PyUnicode_GET_LENGTH(item); 8903 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item); 8904 if (item_maxchar > maxchar) 8905 maxchar = item_maxchar; 8906 if (i != 0) 8907 sz += seplen; 8908 if (sz < old_sz || sz > PY_SSIZE_T_MAX) { 8909 PyErr_SetString(PyExc_OverflowError, 8910 "join() result is too long for a Python string"); 8911 goto onError; 8912 } 8913 } 8914 8915 res = PyUnicode_New(sz, maxchar); 8916 if (res == NULL) 8917 goto onError; 8918 8919 /* Catenate everything. 
*/ 8920 for (i = 0, res_offset = 0; i < seqlen; ++i) { 8921 Py_ssize_t itemlen, copied; 8922 item = items[i]; 8923 /* Copy item, and maybe the separator. */ 8924 if (i && seplen != 0) { 8925 copied = PyUnicode_CopyCharacters(res, res_offset, 8926 sep, 0, seplen); 8927 if (copied < 0) 8928 goto onError; 8929#ifdef Py_DEBUG 8930 res_offset += copied; 8931#else 8932 res_offset += seplen; 8933#endif 8934 } 8935 itemlen = PyUnicode_GET_LENGTH(item); 8936 if (itemlen != 0) { 8937 copied = PyUnicode_CopyCharacters(res, res_offset, 8938 item, 0, itemlen); 8939 if (copied < 0) 8940 goto onError; 8941#ifdef Py_DEBUG 8942 res_offset += copied; 8943#else 8944 res_offset += itemlen; 8945#endif 8946 } 8947 } 8948 assert(res_offset == PyUnicode_GET_LENGTH(res)); 8949 8950 Done: 8951 Py_DECREF(fseq); 8952 Py_XDECREF(sep); 8953 return res; 8954 8955 onError: 8956 Py_DECREF(fseq); 8957 Py_XDECREF(sep); 8958 Py_XDECREF(res); 8959 return NULL; 8960} 8961 8962#define FILL(kind, data, value, start, length) \ 8963 do { \ 8964 Py_ssize_t i_ = 0; \ 8965 assert(kind != PyUnicode_WCHAR_KIND); \ 8966 switch ((kind)) { \ 8967 case PyUnicode_1BYTE_KIND: { \ 8968 unsigned char * to_ = (unsigned char *)((data)) + (start); \ 8969 memset(to_, (unsigned char)value, length); \ 8970 break; \ 8971 } \ 8972 case PyUnicode_2BYTE_KIND: { \ 8973 Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \ 8974 for (; i_ < (length); ++i_, ++to_) *to_ = (value); \ 8975 break; \ 8976 } \ 8977 default: { \ 8978 Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \ 8979 for (; i_ < (length); ++i_, ++to_) *to_ = (value); \ 8980 break; \ 8981 } \ 8982 } \ 8983 } while (0) 8984 8985static PyUnicodeObject * 8986pad(PyUnicodeObject *self, 8987 Py_ssize_t left, 8988 Py_ssize_t right, 8989 Py_UCS4 fill) 8990{ 8991 PyObject *u; 8992 Py_UCS4 maxchar; 8993 int kind; 8994 void *data; 8995 8996 if (left < 0) 8997 left = 0; 8998 if (right < 0) 8999 right = 0; 9000 9001 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) { 9002 
Py_INCREF(self); 9003 return self; 9004 } 9005 9006 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) || 9007 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) { 9008 PyErr_SetString(PyExc_OverflowError, "padded string is too long"); 9009 return NULL; 9010 } 9011 maxchar = PyUnicode_MAX_CHAR_VALUE(self); 9012 if (fill > maxchar) 9013 maxchar = fill; 9014 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar); 9015 if (!u) 9016 return NULL; 9017 9018 kind = PyUnicode_KIND(u); 9019 data = PyUnicode_DATA(u); 9020 if (left) 9021 FILL(kind, data, fill, 0, left); 9022 if (right) 9023 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right); 9024 if (PyUnicode_CopyCharacters(u, left, 9025 (PyObject*)self, 0, 9026 _PyUnicode_LENGTH(self)) < 0) 9027 { 9028 Py_DECREF(u); 9029 return NULL; 9030 } 9031 9032 return (PyUnicodeObject*)u; 9033} 9034#undef FILL 9035 9036PyObject * 9037PyUnicode_Splitlines(PyObject *string, int keepends) 9038{ 9039 PyObject *list; 9040 9041 string = PyUnicode_FromObject(string); 9042 if (string == NULL || PyUnicode_READY(string) == -1) 9043 return NULL; 9044 9045 switch(PyUnicode_KIND(string)) { 9046 case PyUnicode_1BYTE_KIND: 9047 list = ucs1lib_splitlines( 9048 (PyObject*) string, PyUnicode_1BYTE_DATA(string), 9049 PyUnicode_GET_LENGTH(string), keepends); 9050 break; 9051 case PyUnicode_2BYTE_KIND: 9052 list = ucs2lib_splitlines( 9053 (PyObject*) string, PyUnicode_2BYTE_DATA(string), 9054 PyUnicode_GET_LENGTH(string), keepends); 9055 break; 9056 case PyUnicode_4BYTE_KIND: 9057 list = ucs4lib_splitlines( 9058 (PyObject*) string, PyUnicode_4BYTE_DATA(string), 9059 PyUnicode_GET_LENGTH(string), keepends); 9060 break; 9061 default: 9062 assert(0); 9063 list = 0; 9064 } 9065 Py_DECREF(string); 9066 return list; 9067} 9068 9069static PyObject * 9070split(PyUnicodeObject *self, 9071 PyUnicodeObject *substring, 9072 Py_ssize_t maxcount) 9073{ 9074 int kind1, kind2, kind; 9075 void *buf1, *buf2; 9076 Py_ssize_t len1, len2; 9077 
PyObject* out; 9078 9079 if (maxcount < 0) 9080 maxcount = PY_SSIZE_T_MAX; 9081 9082 if (PyUnicode_READY(self) == -1) 9083 return NULL; 9084 9085 if (substring == NULL) 9086 switch(PyUnicode_KIND(self)) { 9087 case PyUnicode_1BYTE_KIND: 9088 return ucs1lib_split_whitespace( 9089 (PyObject*) self, PyUnicode_1BYTE_DATA(self), 9090 PyUnicode_GET_LENGTH(self), maxcount 9091 ); 9092 case PyUnicode_2BYTE_KIND: 9093 return ucs2lib_split_whitespace( 9094 (PyObject*) self, PyUnicode_2BYTE_DATA(self), 9095 PyUnicode_GET_LENGTH(self), maxcount 9096 ); 9097 case PyUnicode_4BYTE_KIND: 9098 return ucs4lib_split_whitespace( 9099 (PyObject*) self, PyUnicode_4BYTE_DATA(self), 9100 PyUnicode_GET_LENGTH(self), maxcount 9101 ); 9102 default: 9103 assert(0); 9104 return NULL; 9105 } 9106 9107 if (PyUnicode_READY(substring) == -1) 9108 return NULL; 9109 9110 kind1 = PyUnicode_KIND(self); 9111 kind2 = PyUnicode_KIND(substring); 9112 kind = kind1 > kind2 ? kind1 : kind2; 9113 buf1 = PyUnicode_DATA(self); 9114 buf2 = PyUnicode_DATA(substring); 9115 if (kind1 != kind) 9116 buf1 = _PyUnicode_AsKind((PyObject*)self, kind); 9117 if (!buf1) 9118 return NULL; 9119 if (kind2 != kind) 9120 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind); 9121 if (!buf2) { 9122 if (kind1 != kind) PyMem_Free(buf1); 9123 return NULL; 9124 } 9125 len1 = PyUnicode_GET_LENGTH(self); 9126 len2 = PyUnicode_GET_LENGTH(substring); 9127 9128 switch(kind) { 9129 case PyUnicode_1BYTE_KIND: 9130 out = ucs1lib_split( 9131 (PyObject*) self, buf1, len1, buf2, len2, maxcount); 9132 break; 9133 case PyUnicode_2BYTE_KIND: 9134 out = ucs2lib_split( 9135 (PyObject*) self, buf1, len1, buf2, len2, maxcount); 9136 break; 9137 case PyUnicode_4BYTE_KIND: 9138 out = ucs4lib_split( 9139 (PyObject*) self, buf1, len1, buf2, len2, maxcount); 9140 break; 9141 default: 9142 out = NULL; 9143 } 9144 if (kind1 != kind) 9145 PyMem_Free(buf1); 9146 if (kind2 != kind) 9147 PyMem_Free(buf2); 9148 return out; 9149} 9150 9151static PyObject * 
9152rsplit(PyUnicodeObject *self, 9153 PyUnicodeObject *substring, 9154 Py_ssize_t maxcount) 9155{ 9156 int kind1, kind2, kind; 9157 void *buf1, *buf2; 9158 Py_ssize_t len1, len2; 9159 PyObject* out; 9160 9161 if (maxcount < 0) 9162 maxcount = PY_SSIZE_T_MAX; 9163 9164 if (PyUnicode_READY(self) == -1) 9165 return NULL; 9166 9167 if (substring == NULL) 9168 switch(PyUnicode_KIND(self)) { 9169 case PyUnicode_1BYTE_KIND: 9170 return ucs1lib_rsplit_whitespace( 9171 (PyObject*) self, PyUnicode_1BYTE_DATA(self), 9172 PyUnicode_GET_LENGTH(self), maxcount 9173 ); 9174 case PyUnicode_2BYTE_KIND: 9175 return ucs2lib_rsplit_whitespace( 9176 (PyObject*) self, PyUnicode_2BYTE_DATA(self), 9177 PyUnicode_GET_LENGTH(self), maxcount 9178 ); 9179 case PyUnicode_4BYTE_KIND: 9180 return ucs4lib_rsplit_whitespace( 9181 (PyObject*) self, PyUnicode_4BYTE_DATA(self), 9182 PyUnicode_GET_LENGTH(self), maxcount 9183 ); 9184 default: 9185 assert(0); 9186 return NULL; 9187 } 9188 9189 if (PyUnicode_READY(substring) == -1) 9190 return NULL; 9191 9192 kind1 = PyUnicode_KIND(self); 9193 kind2 = PyUnicode_KIND(substring); 9194 kind = kind1 > kind2 ? 
kind1 : kind2; 9195 buf1 = PyUnicode_DATA(self); 9196 buf2 = PyUnicode_DATA(substring); 9197 if (kind1 != kind) 9198 buf1 = _PyUnicode_AsKind((PyObject*)self, kind); 9199 if (!buf1) 9200 return NULL; 9201 if (kind2 != kind) 9202 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind); 9203 if (!buf2) { 9204 if (kind1 != kind) PyMem_Free(buf1); 9205 return NULL; 9206 } 9207 len1 = PyUnicode_GET_LENGTH(self); 9208 len2 = PyUnicode_GET_LENGTH(substring); 9209 9210 switch(kind) { 9211 case PyUnicode_1BYTE_KIND: 9212 out = ucs1lib_rsplit( 9213 (PyObject*) self, buf1, len1, buf2, len2, maxcount); 9214 break; 9215 case PyUnicode_2BYTE_KIND: 9216 out = ucs2lib_rsplit( 9217 (PyObject*) self, buf1, len1, buf2, len2, maxcount); 9218 break; 9219 case PyUnicode_4BYTE_KIND: 9220 out = ucs4lib_rsplit( 9221 (PyObject*) self, buf1, len1, buf2, len2, maxcount); 9222 break; 9223 default: 9224 out = NULL; 9225 } 9226 if (kind1 != kind) 9227 PyMem_Free(buf1); 9228 if (kind2 != kind) 9229 PyMem_Free(buf2); 9230 return out; 9231} 9232 9233static Py_ssize_t 9234anylib_find(int kind, void *buf1, Py_ssize_t len1, 9235 void *buf2, Py_ssize_t len2, Py_ssize_t offset) 9236{ 9237 switch(kind) { 9238 case PyUnicode_1BYTE_KIND: 9239 return ucs1lib_find(buf1, len1, buf2, len2, offset); 9240 case PyUnicode_2BYTE_KIND: 9241 return ucs2lib_find(buf1, len1, buf2, len2, offset); 9242 case PyUnicode_4BYTE_KIND: 9243 return ucs4lib_find(buf1, len1, buf2, len2, offset); 9244 } 9245 assert(0); 9246 return -1; 9247} 9248 9249static Py_ssize_t 9250anylib_count(int kind, void* sbuf, Py_ssize_t slen, 9251 void *buf1, Py_ssize_t len1, Py_ssize_t maxcount) 9252{ 9253 switch(kind) { 9254 case PyUnicode_1BYTE_KIND: 9255 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount); 9256 case PyUnicode_2BYTE_KIND: 9257 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount); 9258 case PyUnicode_4BYTE_KIND: 9259 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount); 9260 } 9261 assert(0); 9262 return 0; 9263} 9264 9265static 
PyObject * 9266replace(PyObject *self, PyObject *str1, 9267 PyObject *str2, Py_ssize_t maxcount) 9268{ 9269 PyObject *u; 9270 char *sbuf = PyUnicode_DATA(self); 9271 char *buf1 = PyUnicode_DATA(str1); 9272 char *buf2 = PyUnicode_DATA(str2); 9273 int srelease = 0, release1 = 0, release2 = 0; 9274 int skind = PyUnicode_KIND(self); 9275 int kind1 = PyUnicode_KIND(str1); 9276 int kind2 = PyUnicode_KIND(str2); 9277 Py_ssize_t slen = PyUnicode_GET_LENGTH(self); 9278 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1); 9279 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2); 9280 9281 if (maxcount < 0) 9282 maxcount = PY_SSIZE_T_MAX; 9283 else if (maxcount == 0 || slen == 0) 9284 goto nothing; 9285 9286 if (skind < kind1) 9287 /* substring too wide to be present */ 9288 goto nothing; 9289 9290 if (len1 == len2) { 9291 Py_ssize_t i; 9292 /* same length */ 9293 if (len1 == 0) 9294 goto nothing; 9295 if (len1 == 1) { 9296 /* replace characters */ 9297 Py_UCS4 u1, u2, maxchar; 9298 int mayshrink, rkind; 9299 u1 = PyUnicode_READ_CHAR(str1, 0); 9300 if (!findchar(sbuf, PyUnicode_KIND(self), 9301 slen, u1, 1)) 9302 goto nothing; 9303 u2 = PyUnicode_READ_CHAR(str2, 0); 9304 maxchar = PyUnicode_MAX_CHAR_VALUE(self); 9305 /* Replacing u1 with u2 may cause a maxchar reduction in the 9306 result string. 
*/ 9307 mayshrink = maxchar > 127; 9308 if (u2 > maxchar) { 9309 maxchar = u2; 9310 mayshrink = 0; 9311 } 9312 u = PyUnicode_New(slen, maxchar); 9313 if (!u) 9314 goto error; 9315 if (PyUnicode_CopyCharacters(u, 0, 9316 (PyObject*)self, 0, slen) < 0) 9317 { 9318 Py_DECREF(u); 9319 return NULL; 9320 } 9321 rkind = PyUnicode_KIND(u); 9322 for (i = 0; i < PyUnicode_GET_LENGTH(u); i++) 9323 if (PyUnicode_READ(rkind, PyUnicode_DATA(u), i) == u1) { 9324 if (--maxcount < 0) 9325 break; 9326 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), i, u2); 9327 } 9328 if (mayshrink) { 9329 PyObject *tmp = u; 9330 u = PyUnicode_FromKindAndData(rkind, PyUnicode_DATA(tmp), 9331 PyUnicode_GET_LENGTH(tmp)); 9332 Py_DECREF(tmp); 9333 } 9334 } else { 9335 int rkind = skind; 9336 char *res; 9337 if (kind1 < rkind) { 9338 /* widen substring */ 9339 buf1 = _PyUnicode_AsKind(str1, rkind); 9340 if (!buf1) goto error; 9341 release1 = 1; 9342 } 9343 i = anylib_find(rkind, sbuf, slen, buf1, len1, 0); 9344 if (i < 0) 9345 goto nothing; 9346 if (rkind > kind2) { 9347 /* widen replacement */ 9348 buf2 = _PyUnicode_AsKind(str2, rkind); 9349 if (!buf2) goto error; 9350 release2 = 1; 9351 } 9352 else if (rkind < kind2) { 9353 /* widen self and buf1 */ 9354 rkind = kind2; 9355 if (release1) PyMem_Free(buf1); 9356 sbuf = _PyUnicode_AsKind(self, rkind); 9357 if (!sbuf) goto error; 9358 srelease = 1; 9359 buf1 = _PyUnicode_AsKind(str1, rkind); 9360 if (!buf1) goto error; 9361 release1 = 1; 9362 } 9363 res = PyMem_Malloc(PyUnicode_KIND_SIZE(rkind, slen)); 9364 if (!res) { 9365 PyErr_NoMemory(); 9366 goto error; 9367 } 9368 memcpy(res, sbuf, PyUnicode_KIND_SIZE(rkind, slen)); 9369 /* change everything in-place, starting with this one */ 9370 memcpy(res + PyUnicode_KIND_SIZE(rkind, i), 9371 buf2, 9372 PyUnicode_KIND_SIZE(rkind, len2)); 9373 i += len1; 9374 9375 while ( --maxcount > 0) { 9376 i = anylib_find(rkind, sbuf+PyUnicode_KIND_SIZE(rkind, i), 9377 slen-i, 9378 buf1, len1, i); 9379 if (i == -1) 9380 break; 
9381 memcpy(res + PyUnicode_KIND_SIZE(rkind, i), 9382 buf2, 9383 PyUnicode_KIND_SIZE(rkind, len2)); 9384 i += len1; 9385 } 9386 9387 u = PyUnicode_FromKindAndData(rkind, res, slen); 9388 PyMem_Free(res); 9389 if (!u) goto error; 9390 } 9391 } else { 9392 9393 Py_ssize_t n, i, j, ires; 9394 Py_ssize_t product, new_size; 9395 int rkind = skind; 9396 char *res; 9397 9398 if (kind1 < rkind) { 9399 buf1 = _PyUnicode_AsKind(str1, rkind); 9400 if (!buf1) goto error; 9401 release1 = 1; 9402 } 9403 n = anylib_count(rkind, sbuf, slen, buf1, len1, maxcount); 9404 if (n == 0) 9405 goto nothing; 9406 if (kind2 < rkind) { 9407 buf2 = _PyUnicode_AsKind(str2, rkind); 9408 if (!buf2) goto error; 9409 release2 = 1; 9410 } 9411 else if (kind2 > rkind) { 9412 rkind = kind2; 9413 sbuf = _PyUnicode_AsKind(self, rkind); 9414 if (!sbuf) goto error; 9415 srelease = 1; 9416 if (release1) PyMem_Free(buf1); 9417 buf1 = _PyUnicode_AsKind(str1, rkind); 9418 if (!buf1) goto error; 9419 release1 = 1; 9420 } 9421 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) - 9422 PyUnicode_GET_LENGTH(str1))); */ 9423 product = n * (len2-len1); 9424 if ((product / (len2-len1)) != n) { 9425 PyErr_SetString(PyExc_OverflowError, 9426 "replace string is too long"); 9427 goto error; 9428 } 9429 new_size = slen + product; 9430 if (new_size < 0 || new_size > (PY_SSIZE_T_MAX >> (rkind-1))) { 9431 PyErr_SetString(PyExc_OverflowError, 9432 "replace string is too long"); 9433 goto error; 9434 } 9435 res = PyMem_Malloc(PyUnicode_KIND_SIZE(rkind, new_size)); 9436 if (!res) 9437 goto error; 9438 ires = i = 0; 9439 if (len1 > 0) { 9440 while (n-- > 0) { 9441 /* look for next match */ 9442 j = anylib_find(rkind, 9443 sbuf + PyUnicode_KIND_SIZE(rkind, i), 9444 slen-i, buf1, len1, i); 9445 if (j == -1) 9446 break; 9447 else if (j > i) { 9448 /* copy unchanged part [i:j] */ 9449 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires), 9450 sbuf + PyUnicode_KIND_SIZE(rkind, i), 9451 PyUnicode_KIND_SIZE(rkind, 
j-i)); 9452 ires += j - i; 9453 } 9454 /* copy substitution string */ 9455 if (len2 > 0) { 9456 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires), 9457 buf2, 9458 PyUnicode_KIND_SIZE(rkind, len2)); 9459 ires += len2; 9460 } 9461 i = j + len1; 9462 } 9463 if (i < slen) 9464 /* copy tail [i:] */ 9465 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires), 9466 sbuf + PyUnicode_KIND_SIZE(rkind, i), 9467 PyUnicode_KIND_SIZE(rkind, slen-i)); 9468 } else { 9469 /* interleave */ 9470 while (n > 0) { 9471 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires), 9472 buf2, 9473 PyUnicode_KIND_SIZE(rkind, len2)); 9474 ires += len2; 9475 if (--n <= 0) 9476 break; 9477 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires), 9478 sbuf + PyUnicode_KIND_SIZE(rkind, i), 9479 PyUnicode_KIND_SIZE(rkind, 1)); 9480 ires++; 9481 i++; 9482 } 9483 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires), 9484 sbuf + PyUnicode_KIND_SIZE(rkind, i), 9485 PyUnicode_KIND_SIZE(rkind, slen-i)); 9486 } 9487 u = PyUnicode_FromKindAndData(rkind, res, new_size); 9488 PyMem_Free(res); 9489 } 9490 if (srelease) 9491 PyMem_FREE(sbuf); 9492 if (release1) 9493 PyMem_FREE(buf1); 9494 if (release2) 9495 PyMem_FREE(buf2); 9496 return u; 9497 9498 nothing: 9499 /* nothing to replace; return original string (when possible) */ 9500 if (srelease) 9501 PyMem_FREE(sbuf); 9502 if (release1) 9503 PyMem_FREE(buf1); 9504 if (release2) 9505 PyMem_FREE(buf2); 9506 if (PyUnicode_CheckExact(self)) { 9507 Py_INCREF(self); 9508 return (PyObject *) self; 9509 } 9510 return PyUnicode_Copy(self); 9511 error: 9512 if (srelease && sbuf) 9513 PyMem_FREE(sbuf); 9514 if (release1 && buf1) 9515 PyMem_FREE(buf1); 9516 if (release2 && buf2) 9517 PyMem_FREE(buf2); 9518 return NULL; 9519} 9520 9521/* --- Unicode Object Methods --------------------------------------------- */ 9522 9523PyDoc_STRVAR(title__doc__, 9524 "S.title() -> str\n\ 9525\n\ 9526Return a titlecased version of S, i.e. 
words start with title case\n\ 9527characters, all remaining cased characters have lower case."); 9528 9529static PyObject* 9530unicode_title(PyUnicodeObject *self) 9531{ 9532 return fixup(self, fixtitle); 9533} 9534 9535PyDoc_STRVAR(capitalize__doc__, 9536 "S.capitalize() -> str\n\ 9537\n\ 9538Return a capitalized version of S, i.e. make the first character\n\ 9539have upper case and the rest lower case."); 9540 9541static PyObject* 9542unicode_capitalize(PyUnicodeObject *self) 9543{ 9544 return fixup(self, fixcapitalize); 9545} 9546 9547#if 0 9548PyDoc_STRVAR(capwords__doc__, 9549 "S.capwords() -> str\n\ 9550\n\ 9551Apply .capitalize() to all words in S and return the result with\n\ 9552normalized whitespace (all whitespace strings are replaced by ' ')."); 9553 9554static PyObject* 9555unicode_capwords(PyUnicodeObject *self) 9556{ 9557 PyObject *list; 9558 PyObject *item; 9559 Py_ssize_t i; 9560 9561 /* Split into words */ 9562 list = split(self, NULL, -1); 9563 if (!list) 9564 return NULL; 9565 9566 /* Capitalize each word */ 9567 for (i = 0; i < PyList_GET_SIZE(list); i++) { 9568 item = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, i), 9569 fixcapitalize); 9570 if (item == NULL) 9571 goto onError; 9572 Py_DECREF(PyList_GET_ITEM(list, i)); 9573 PyList_SET_ITEM(list, i, item); 9574 } 9575 9576 /* Join the words to form a new string */ 9577 item = PyUnicode_Join(NULL, list); 9578 9579 onError: 9580 Py_DECREF(list); 9581 return (PyObject *)item; 9582} 9583#endif 9584 9585/* Argument converter. 
Coerces to a single unicode character */ 9586 9587static int 9588convert_uc(PyObject *obj, void *addr) 9589{ 9590 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr; 9591 PyObject *uniobj; 9592 9593 uniobj = PyUnicode_FromObject(obj); 9594 if (uniobj == NULL) { 9595 PyErr_SetString(PyExc_TypeError, 9596 "The fill character cannot be converted to Unicode"); 9597 return 0; 9598 } 9599 if (PyUnicode_GET_LENGTH(uniobj) != 1) { 9600 PyErr_SetString(PyExc_TypeError, 9601 "The fill character must be exactly one character long"); 9602 Py_DECREF(uniobj); 9603 return 0; 9604 } 9605 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0); 9606 Py_DECREF(uniobj); 9607 return 1; 9608} 9609 9610PyDoc_STRVAR(center__doc__, 9611 "S.center(width[, fillchar]) -> str\n\ 9612\n\ 9613Return S centered in a string of length width. Padding is\n\ 9614done using the specified fill character (default is a space)"); 9615 9616static PyObject * 9617unicode_center(PyUnicodeObject *self, PyObject *args) 9618{ 9619 Py_ssize_t marg, left; 9620 Py_ssize_t width; 9621 Py_UCS4 fillchar = ' '; 9622 9623 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar)) 9624 return NULL; 9625 9626 if (PyUnicode_READY(self) == -1) 9627 return NULL; 9628 9629 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) { 9630 Py_INCREF(self); 9631 return (PyObject*) self; 9632 } 9633 9634 marg = width - _PyUnicode_LENGTH(self); 9635 left = marg / 2 + (marg & width & 1); 9636 9637 return (PyObject*) pad(self, left, marg - left, fillchar); 9638} 9639 9640#if 0 9641 9642/* This code should go into some future Unicode collation support 9643 module. The basic comparison should compare ordinals on a naive 9644 basis (this is what Java does and thus Jython too). 
*/ 9645 9646/* speedy UTF-16 code point order comparison */ 9647/* gleaned from: */ 9648/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */ 9649 9650static short utf16Fixup[32] = 9651{ 9652 0, 0, 0, 0, 0, 0, 0, 0, 9653 0, 0, 0, 0, 0, 0, 0, 0, 9654 0, 0, 0, 0, 0, 0, 0, 0, 9655 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800 9656}; 9657 9658static int 9659unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2) 9660{ 9661 Py_ssize_t len1, len2; 9662 9663 Py_UNICODE *s1 = str1->str; 9664 Py_UNICODE *s2 = str2->str; 9665 9666 len1 = str1->_base._base.length; 9667 len2 = str2->_base._base.length; 9668 9669 while (len1 > 0 && len2 > 0) { 9670 Py_UNICODE c1, c2; 9671 9672 c1 = *s1++; 9673 c2 = *s2++; 9674 9675 if (c1 > (1<<11) * 26) 9676 c1 += utf16Fixup[c1>>11]; 9677 if (c2 > (1<<11) * 26) 9678 c2 += utf16Fixup[c2>>11]; 9679 /* now c1 and c2 are in UTF-32-compatible order */ 9680 9681 if (c1 != c2) 9682 return (c1 < c2) ? -1 : 1; 9683 9684 len1--; len2--; 9685 } 9686 9687 return (len1 < len2) ? -1 : (len1 != len2); 9688} 9689 9690#else 9691 9692/* This function assumes that str1 and str2 are readied by the caller. */ 9693 9694static int 9695unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2) 9696{ 9697 int kind1, kind2; 9698 void *data1, *data2; 9699 Py_ssize_t len1, len2, i; 9700 9701 kind1 = PyUnicode_KIND(str1); 9702 kind2 = PyUnicode_KIND(str2); 9703 data1 = PyUnicode_DATA(str1); 9704 data2 = PyUnicode_DATA(str2); 9705 len1 = PyUnicode_GET_LENGTH(str1); 9706 len2 = PyUnicode_GET_LENGTH(str2); 9707 9708 for (i = 0; i < len1 && i < len2; ++i) { 9709 Py_UCS4 c1, c2; 9710 c1 = PyUnicode_READ(kind1, data1, i); 9711 c2 = PyUnicode_READ(kind2, data2, i); 9712 9713 if (c1 != c2) 9714 return (c1 < c2) ? -1 : 1; 9715 } 9716 9717 return (len1 < len2) ? 
-1 : (len1 != len2); 9718} 9719 9720#endif 9721 9722int 9723PyUnicode_Compare(PyObject *left, PyObject *right) 9724{ 9725 if (PyUnicode_Check(left) && PyUnicode_Check(right)) { 9726 if (PyUnicode_READY(left) == -1 || 9727 PyUnicode_READY(right) == -1) 9728 return -1; 9729 return unicode_compare((PyUnicodeObject *)left, 9730 (PyUnicodeObject *)right); 9731 } 9732 PyErr_Format(PyExc_TypeError, 9733 "Can't compare %.100s and %.100s", 9734 left->ob_type->tp_name, 9735 right->ob_type->tp_name); 9736 return -1; 9737} 9738 9739int 9740PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str) 9741{ 9742 Py_ssize_t i; 9743 int kind; 9744 void *data; 9745 Py_UCS4 chr; 9746 9747 assert(_PyUnicode_CHECK(uni)); 9748 if (PyUnicode_READY(uni) == -1) 9749 return -1; 9750 kind = PyUnicode_KIND(uni); 9751 data = PyUnicode_DATA(uni); 9752 /* Compare Unicode string and source character set string */ 9753 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++) 9754 if (chr != str[i]) 9755 return (chr < (unsigned char)(str[i])) ? -1 : 1; 9756 /* This check keeps Python strings that end in '\0' from comparing equal 9757 to C strings identical up to that point. */ 9758 if (PyUnicode_GET_LENGTH(uni) != i || chr) 9759 return 1; /* uni is longer */ 9760 if (str[i]) 9761 return -1; /* str is longer */ 9762 return 0; 9763} 9764 9765 9766#define TEST_COND(cond) \ 9767 ((cond) ? 
Py_True : Py_False) 9768 9769PyObject * 9770PyUnicode_RichCompare(PyObject *left, PyObject *right, int op) 9771{ 9772 int result; 9773 9774 if (PyUnicode_Check(left) && PyUnicode_Check(right)) { 9775 PyObject *v; 9776 if (PyUnicode_READY(left) == -1 || 9777 PyUnicode_READY(right) == -1) 9778 return NULL; 9779 if (PyUnicode_GET_LENGTH(left) != PyUnicode_GET_LENGTH(right) || 9780 PyUnicode_KIND(left) != PyUnicode_KIND(right)) { 9781 if (op == Py_EQ) { 9782 Py_INCREF(Py_False); 9783 return Py_False; 9784 } 9785 if (op == Py_NE) { 9786 Py_INCREF(Py_True); 9787 return Py_True; 9788 } 9789 } 9790 if (left == right) 9791 result = 0; 9792 else 9793 result = unicode_compare((PyUnicodeObject *)left, 9794 (PyUnicodeObject *)right); 9795 9796 /* Convert the return value to a Boolean */ 9797 switch (op) { 9798 case Py_EQ: 9799 v = TEST_COND(result == 0); 9800 break; 9801 case Py_NE: 9802 v = TEST_COND(result != 0); 9803 break; 9804 case Py_LE: 9805 v = TEST_COND(result <= 0); 9806 break; 9807 case Py_GE: 9808 v = TEST_COND(result >= 0); 9809 break; 9810 case Py_LT: 9811 v = TEST_COND(result == -1); 9812 break; 9813 case Py_GT: 9814 v = TEST_COND(result == 1); 9815 break; 9816 default: 9817 PyErr_BadArgument(); 9818 return NULL; 9819 } 9820 Py_INCREF(v); 9821 return v; 9822 } 9823 9824 Py_RETURN_NOTIMPLEMENTED; 9825} 9826 9827int 9828PyUnicode_Contains(PyObject *container, PyObject *element) 9829{ 9830 PyObject *str, *sub; 9831 int kind1, kind2, kind; 9832 void *buf1, *buf2; 9833 Py_ssize_t len1, len2; 9834 int result; 9835 9836 /* Coerce the two arguments */ 9837 sub = PyUnicode_FromObject(element); 9838 if (!sub) { 9839 PyErr_Format(PyExc_TypeError, 9840 "'in <string>' requires string as left operand, not %s", 9841 element->ob_type->tp_name); 9842 return -1; 9843 } 9844 if (PyUnicode_READY(sub) == -1) 9845 return -1; 9846 9847 str = PyUnicode_FromObject(container); 9848 if (!str || PyUnicode_READY(str) == -1) { 9849 Py_DECREF(sub); 9850 return -1; 9851 } 9852 9853 kind1 = 
PyUnicode_KIND(str); 9854 kind2 = PyUnicode_KIND(sub); 9855 kind = kind1 > kind2 ? kind1 : kind2; 9856 buf1 = PyUnicode_DATA(str); 9857 buf2 = PyUnicode_DATA(sub); 9858 if (kind1 != kind) 9859 buf1 = _PyUnicode_AsKind((PyObject*)str, kind); 9860 if (!buf1) { 9861 Py_DECREF(sub); 9862 return -1; 9863 } 9864 if (kind2 != kind) 9865 buf2 = _PyUnicode_AsKind((PyObject*)sub, kind); 9866 if (!buf2) { 9867 Py_DECREF(sub); 9868 if (kind1 != kind) PyMem_Free(buf1); 9869 return -1; 9870 } 9871 len1 = PyUnicode_GET_LENGTH(str); 9872 len2 = PyUnicode_GET_LENGTH(sub); 9873 9874 switch(kind) { 9875 case PyUnicode_1BYTE_KIND: 9876 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1; 9877 break; 9878 case PyUnicode_2BYTE_KIND: 9879 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1; 9880 break; 9881 case PyUnicode_4BYTE_KIND: 9882 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1; 9883 break; 9884 default: 9885 result = -1; 9886 assert(0); 9887 } 9888 9889 Py_DECREF(str); 9890 Py_DECREF(sub); 9891 9892 if (kind1 != kind) 9893 PyMem_Free(buf1); 9894 if (kind2 != kind) 9895 PyMem_Free(buf2); 9896 9897 return result; 9898} 9899 9900/* Concat to string or Unicode object giving a new Unicode object. 
*/ 9901 9902PyObject * 9903PyUnicode_Concat(PyObject *left, PyObject *right) 9904{ 9905 PyObject *u = NULL, *v = NULL, *w; 9906 Py_UCS4 maxchar; 9907 9908 /* Coerce the two arguments */ 9909 u = PyUnicode_FromObject(left); 9910 if (u == NULL) 9911 goto onError; 9912 v = PyUnicode_FromObject(right); 9913 if (v == NULL) 9914 goto onError; 9915 9916 /* Shortcuts */ 9917 if (v == unicode_empty) { 9918 Py_DECREF(v); 9919 return u; 9920 } 9921 if (u == unicode_empty) { 9922 Py_DECREF(u); 9923 return v; 9924 } 9925 9926 maxchar = PyUnicode_MAX_CHAR_VALUE(u); 9927 maxchar = Py_MAX(maxchar, PyUnicode_MAX_CHAR_VALUE(v)); 9928 9929 /* Concat the two Unicode strings */ 9930 w = PyUnicode_New( 9931 PyUnicode_GET_LENGTH(u) + PyUnicode_GET_LENGTH(v), 9932 maxchar); 9933 if (w == NULL) 9934 goto onError; 9935 if (PyUnicode_CopyCharacters(w, 0, u, 0, PyUnicode_GET_LENGTH(u)) < 0) 9936 goto onError; 9937 if (PyUnicode_CopyCharacters(w, PyUnicode_GET_LENGTH(u), 9938 v, 0, 9939 PyUnicode_GET_LENGTH(v)) < 0) 9940 goto onError; 9941 Py_DECREF(u); 9942 Py_DECREF(v); 9943 return w; 9944 9945 onError: 9946 Py_XDECREF(u); 9947 Py_XDECREF(v); 9948 return NULL; 9949} 9950 9951void 9952PyUnicode_Append(PyObject **p_left, PyObject *right) 9953{ 9954 PyObject *left, *res; 9955 9956 if (p_left == NULL) { 9957 if (!PyErr_Occurred()) 9958 PyErr_BadInternalCall(); 9959 return; 9960 } 9961 left = *p_left; 9962 if (right == NULL || !PyUnicode_Check(left)) { 9963 if (!PyErr_Occurred()) 9964 PyErr_BadInternalCall(); 9965 goto error; 9966 } 9967 9968 if (PyUnicode_CheckExact(left) && left != unicode_empty 9969 && PyUnicode_CheckExact(right) && right != unicode_empty 9970 && unicode_resizable(left) 9971 && (_PyUnicode_KIND(right) <= _PyUnicode_KIND(left) 9972 || _PyUnicode_WSTR(left) != NULL)) 9973 { 9974 Py_ssize_t left_len, right_len, new_len; 9975#ifdef Py_DEBUG 9976 Py_ssize_t copied; 9977#endif 9978 9979 if (PyUnicode_READY(left)) 9980 goto error; 9981 if (PyUnicode_READY(right)) 9982 goto error; 
9983 9984 /* FIXME: support ascii+latin1, PyASCIIObject => PyCompactUnicodeObject */ 9985 if (PyUnicode_MAX_CHAR_VALUE(right) <= PyUnicode_MAX_CHAR_VALUE(left)) 9986 { 9987 left_len = PyUnicode_GET_LENGTH(left); 9988 right_len = PyUnicode_GET_LENGTH(right); 9989 if (left_len > PY_SSIZE_T_MAX - right_len) { 9990 PyErr_SetString(PyExc_OverflowError, 9991 "strings are too large to concat"); 9992 goto error; 9993 } 9994 new_len = left_len + right_len; 9995 9996 /* Now we own the last reference to 'left', so we can resize it 9997 * in-place. 9998 */ 9999 if (unicode_resize(&left, new_len) != 0) { 10000 /* XXX if _PyUnicode_Resize() fails, 'left' has been 10001 * deallocated so it cannot be put back into 10002 * 'variable'. The MemoryError is raised when there 10003 * is no value in 'variable', which might (very 10004 * remotely) be a cause of incompatibilities. 10005 */ 10006 goto error; 10007 } 10008 /* copy 'right' into the newly allocated area of 'left' */ 10009#ifdef Py_DEBUG 10010 copied = PyUnicode_CopyCharacters(left, left_len, 10011 right, 0, 10012 right_len); 10013 assert(0 <= copied); 10014#else 10015 PyUnicode_CopyCharacters(left, left_len, right, 0, right_len); 10016#endif 10017 *p_left = left; 10018 return; 10019 } 10020 } 10021 10022 res = PyUnicode_Concat(left, right); 10023 if (res == NULL) 10024 goto error; 10025 Py_DECREF(left); 10026 *p_left = res; 10027 return; 10028 10029error: 10030 Py_DECREF(*p_left); 10031 *p_left = NULL; 10032} 10033 10034void 10035PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right) 10036{ 10037 PyUnicode_Append(pleft, right); 10038 Py_XDECREF(right); 10039} 10040 10041PyDoc_STRVAR(count__doc__, 10042 "S.count(sub[, start[, end]]) -> int\n\ 10043\n\ 10044Return the number of non-overlapping occurrences of substring sub in\n\ 10045string S[start:end]. 
Optional arguments start and end are\n\ 10046interpreted as in slice notation."); 10047 10048static PyObject * 10049unicode_count(PyUnicodeObject *self, PyObject *args) 10050{ 10051 PyUnicodeObject *substring; 10052 Py_ssize_t start = 0; 10053 Py_ssize_t end = PY_SSIZE_T_MAX; 10054 PyObject *result; 10055 int kind1, kind2, kind; 10056 void *buf1, *buf2; 10057 Py_ssize_t len1, len2, iresult; 10058 10059 if (!stringlib_parse_args_finds_unicode("count", args, &substring, 10060 &start, &end)) 10061 return NULL; 10062 10063 kind1 = PyUnicode_KIND(self); 10064 kind2 = PyUnicode_KIND(substring); 10065 kind = kind1 > kind2 ? kind1 : kind2; 10066 buf1 = PyUnicode_DATA(self); 10067 buf2 = PyUnicode_DATA(substring); 10068 if (kind1 != kind) 10069 buf1 = _PyUnicode_AsKind((PyObject*)self, kind); 10070 if (!buf1) { 10071 Py_DECREF(substring); 10072 return NULL; 10073 } 10074 if (kind2 != kind) 10075 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind); 10076 if (!buf2) { 10077 Py_DECREF(substring); 10078 if (kind1 != kind) PyMem_Free(buf1); 10079 return NULL; 10080 } 10081 len1 = PyUnicode_GET_LENGTH(self); 10082 len2 = PyUnicode_GET_LENGTH(substring); 10083 10084 ADJUST_INDICES(start, end, len1); 10085 switch(kind) { 10086 case PyUnicode_1BYTE_KIND: 10087 iresult = ucs1lib_count( 10088 ((Py_UCS1*)buf1) + start, end - start, 10089 buf2, len2, PY_SSIZE_T_MAX 10090 ); 10091 break; 10092 case PyUnicode_2BYTE_KIND: 10093 iresult = ucs2lib_count( 10094 ((Py_UCS2*)buf1) + start, end - start, 10095 buf2, len2, PY_SSIZE_T_MAX 10096 ); 10097 break; 10098 case PyUnicode_4BYTE_KIND: 10099 iresult = ucs4lib_count( 10100 ((Py_UCS4*)buf1) + start, end - start, 10101 buf2, len2, PY_SSIZE_T_MAX 10102 ); 10103 break; 10104 default: 10105 assert(0); iresult = 0; 10106 } 10107 10108 result = PyLong_FromSsize_t(iresult); 10109 10110 if (kind1 != kind) 10111 PyMem_Free(buf1); 10112 if (kind2 != kind) 10113 PyMem_Free(buf2); 10114 10115 Py_DECREF(substring); 10116 10117 return result; 10118} 10119 
10120PyDoc_STRVAR(encode__doc__, 10121 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\ 10122\n\ 10123Encode S using the codec registered for encoding. Default encoding\n\ 10124is 'utf-8'. errors may be given to set a different error\n\ 10125handling scheme. Default is 'strict' meaning that encoding errors raise\n\ 10126a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\ 10127'xmlcharrefreplace' as well as any other name registered with\n\ 10128codecs.register_error that can handle UnicodeEncodeErrors."); 10129 10130static PyObject * 10131unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs) 10132{ 10133 static char *kwlist[] = {"encoding", "errors", 0}; 10134 char *encoding = NULL; 10135 char *errors = NULL; 10136 10137 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode", 10138 kwlist, &encoding, &errors)) 10139 return NULL; 10140 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors); 10141} 10142 10143PyDoc_STRVAR(expandtabs__doc__, 10144 "S.expandtabs([tabsize]) -> str\n\ 10145\n\ 10146Return a copy of S where all tab characters are expanded using spaces.\n\ 10147If tabsize is not given, a tab size of 8 characters is assumed."); 10148 10149static PyObject* 10150unicode_expandtabs(PyUnicodeObject *self, PyObject *args) 10151{ 10152 Py_UNICODE *e; 10153 Py_UNICODE *p; 10154 Py_UNICODE *q; 10155 Py_UNICODE *qe; 10156 Py_ssize_t i, j, incr, wstr_length; 10157 PyUnicodeObject *u; 10158 int tabsize = 8; 10159 10160 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize)) 10161 return NULL; 10162 10163 if (PyUnicode_AsUnicodeAndSize((PyObject *)self, &wstr_length) == NULL) 10164 return NULL; 10165 10166 /* First pass: determine size of output string */ 10167 i = 0; /* chars up to and including most recent \n or \r */ 10168 j = 0; /* chars since most recent \n or \r (use in tab calculations) */ 10169 e = _PyUnicode_WSTR(self) + wstr_length; /* end of input */ 10170 for (p = _PyUnicode_WSTR(self); p < 
e; p++) 10171 if (*p == '\t') { 10172 if (tabsize > 0) { 10173 incr = tabsize - (j % tabsize); /* cannot overflow */ 10174 if (j > PY_SSIZE_T_MAX - incr) 10175 goto overflow1; 10176 j += incr; 10177 } 10178 } 10179 else { 10180 if (j > PY_SSIZE_T_MAX - 1) 10181 goto overflow1; 10182 j++; 10183 if (*p == '\n' || *p == '\r') { 10184 if (i > PY_SSIZE_T_MAX - j) 10185 goto overflow1; 10186 i += j; 10187 j = 0; 10188 } 10189 } 10190 10191 if (i > PY_SSIZE_T_MAX - j) 10192 goto overflow1; 10193 10194 /* Second pass: create output string and fill it */ 10195 u = _PyUnicode_New(i + j); 10196 if (!u) 10197 return NULL; 10198 10199 j = 0; /* same as in first pass */ 10200 q = _PyUnicode_WSTR(u); /* next output char */ 10201 qe = _PyUnicode_WSTR(u) + PyUnicode_GET_SIZE(u); /* end of output */ 10202 10203 for (p = _PyUnicode_WSTR(self); p < e; p++) 10204 if (*p == '\t') { 10205 if (tabsize > 0) { 10206 i = tabsize - (j % tabsize); 10207 j += i; 10208 while (i--) { 10209 if (q >= qe) 10210 goto overflow2; 10211 *q++ = ' '; 10212 } 10213 } 10214 } 10215 else { 10216 if (q >= qe) 10217 goto overflow2; 10218 *q++ = *p; 10219 j++; 10220 if (*p == '\n' || *p == '\r') 10221 j = 0; 10222 } 10223 10224 if (_PyUnicode_READY_REPLACE(&u)) { 10225 Py_DECREF(u); 10226 return NULL; 10227 } 10228 return (PyObject*) u; 10229 10230 overflow2: 10231 Py_DECREF(u); 10232 overflow1: 10233 PyErr_SetString(PyExc_OverflowError, "new string is too long"); 10234 return NULL; 10235} 10236 10237PyDoc_STRVAR(find__doc__, 10238 "S.find(sub[, start[, end]]) -> int\n\ 10239\n\ 10240Return the lowest index in S where substring sub is found,\n\ 10241such that sub is contained within S[start:end]. 
Optional\n\ 10242arguments start and end are interpreted as in slice notation.\n\ 10243\n\ 10244Return -1 on failure."); 10245 10246static PyObject * 10247unicode_find(PyObject *self, PyObject *args) 10248{ 10249 PyUnicodeObject *substring; 10250 Py_ssize_t start; 10251 Py_ssize_t end; 10252 Py_ssize_t result; 10253 10254 if (!stringlib_parse_args_finds_unicode("find", args, &substring, 10255 &start, &end)) 10256 return NULL; 10257 10258 if (PyUnicode_READY(self) == -1) 10259 return NULL; 10260 if (PyUnicode_READY(substring) == -1) 10261 return NULL; 10262 10263 result = any_find_slice( 10264 ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice, 10265 self, (PyObject*)substring, start, end 10266 ); 10267 10268 Py_DECREF(substring); 10269 10270 if (result == -2) 10271 return NULL; 10272 10273 return PyLong_FromSsize_t(result); 10274} 10275 10276static PyObject * 10277unicode_getitem(PyObject *self, Py_ssize_t index) 10278{ 10279 Py_UCS4 ch = PyUnicode_ReadChar(self, index); 10280 if (ch == (Py_UCS4)-1) 10281 return NULL; 10282 return PyUnicode_FromOrdinal(ch); 10283} 10284 10285/* Believe it or not, this produces the same value for ASCII strings 10286 as bytes_hash(). */ 10287static Py_hash_t 10288unicode_hash(PyUnicodeObject *self) 10289{ 10290 Py_ssize_t len; 10291 Py_uhash_t x; 10292 10293 if (_PyUnicode_HASH(self) != -1) 10294 return _PyUnicode_HASH(self); 10295 if (PyUnicode_READY(self) == -1) 10296 return -1; 10297 len = PyUnicode_GET_LENGTH(self); 10298 10299 /* The hash function as a macro, gets expanded three times below. 
*/ 10300#define HASH(P) \ 10301 x = (Py_uhash_t)*P << 7; \ 10302 while (--len >= 0) \ 10303 x = (1000003*x) ^ (Py_uhash_t)*P++; 10304 10305 switch (PyUnicode_KIND(self)) { 10306 case PyUnicode_1BYTE_KIND: { 10307 const unsigned char *c = PyUnicode_1BYTE_DATA(self); 10308 HASH(c); 10309 break; 10310 } 10311 case PyUnicode_2BYTE_KIND: { 10312 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self); 10313 HASH(s); 10314 break; 10315 } 10316 default: { 10317 Py_UCS4 *l; 10318 assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND && 10319 "Impossible switch case in unicode_hash"); 10320 l = PyUnicode_4BYTE_DATA(self); 10321 HASH(l); 10322 break; 10323 } 10324 } 10325 x ^= (Py_uhash_t)PyUnicode_GET_LENGTH(self); 10326 10327 if (x == -1) 10328 x = -2; 10329 _PyUnicode_HASH(self) = x; 10330 return x; 10331} 10332#undef HASH 10333 10334PyDoc_STRVAR(index__doc__, 10335 "S.index(sub[, start[, end]]) -> int\n\ 10336\n\ 10337Like S.find() but raise ValueError when the substring is not found."); 10338 10339static PyObject * 10340unicode_index(PyObject *self, PyObject *args) 10341{ 10342 Py_ssize_t result; 10343 PyUnicodeObject *substring; 10344 Py_ssize_t start; 10345 Py_ssize_t end; 10346 10347 if (!stringlib_parse_args_finds_unicode("index", args, &substring, 10348 &start, &end)) 10349 return NULL; 10350 10351 if (PyUnicode_READY(self) == -1) 10352 return NULL; 10353 if (PyUnicode_READY(substring) == -1) 10354 return NULL; 10355 10356 result = any_find_slice( 10357 ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice, 10358 self, (PyObject*)substring, start, end 10359 ); 10360 10361 Py_DECREF(substring); 10362 10363 if (result == -2) 10364 return NULL; 10365 10366 if (result < 0) { 10367 PyErr_SetString(PyExc_ValueError, "substring not found"); 10368 return NULL; 10369 } 10370 10371 return PyLong_FromSsize_t(result); 10372} 10373 10374PyDoc_STRVAR(islower__doc__, 10375 "S.islower() -> bool\n\ 10376\n\ 10377Return True if all cased characters in S are lowercase and there is\n\ 
10378at least one cased character in S, False otherwise."); 10379 10380static PyObject* 10381unicode_islower(PyUnicodeObject *self) 10382{ 10383 Py_ssize_t i, length; 10384 int kind; 10385 void *data; 10386 int cased; 10387 10388 if (PyUnicode_READY(self) == -1) 10389 return NULL; 10390 length = PyUnicode_GET_LENGTH(self); 10391 kind = PyUnicode_KIND(self); 10392 data = PyUnicode_DATA(self); 10393 10394 /* Shortcut for single character strings */ 10395 if (length == 1) 10396 return PyBool_FromLong( 10397 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0))); 10398 10399 /* Special case for empty strings */ 10400 if (length == 0) 10401 return PyBool_FromLong(0); 10402 10403 cased = 0; 10404 for (i = 0; i < length; i++) { 10405 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); 10406 10407 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) 10408 return PyBool_FromLong(0); 10409 else if (!cased && Py_UNICODE_ISLOWER(ch)) 10410 cased = 1; 10411 } 10412 return PyBool_FromLong(cased); 10413} 10414 10415PyDoc_STRVAR(isupper__doc__, 10416 "S.isupper() -> bool\n\ 10417\n\ 10418Return True if all cased characters in S are uppercase and there is\n\ 10419at least one cased character in S, False otherwise."); 10420 10421static PyObject* 10422unicode_isupper(PyUnicodeObject *self) 10423{ 10424 Py_ssize_t i, length; 10425 int kind; 10426 void *data; 10427 int cased; 10428 10429 if (PyUnicode_READY(self) == -1) 10430 return NULL; 10431 length = PyUnicode_GET_LENGTH(self); 10432 kind = PyUnicode_KIND(self); 10433 data = PyUnicode_DATA(self); 10434 10435 /* Shortcut for single character strings */ 10436 if (length == 1) 10437 return PyBool_FromLong( 10438 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0); 10439 10440 /* Special case for empty strings */ 10441 if (length == 0) 10442 return PyBool_FromLong(0); 10443 10444 cased = 0; 10445 for (i = 0; i < length; i++) { 10446 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); 10447 10448 if (Py_UNICODE_ISLOWER(ch) || 
Py_UNICODE_ISTITLE(ch)) 10449 return PyBool_FromLong(0); 10450 else if (!cased && Py_UNICODE_ISUPPER(ch)) 10451 cased = 1; 10452 } 10453 return PyBool_FromLong(cased); 10454} 10455 10456PyDoc_STRVAR(istitle__doc__, 10457 "S.istitle() -> bool\n\ 10458\n\ 10459Return True if S is a titlecased string and there is at least one\n\ 10460character in S, i.e. upper- and titlecase characters may only\n\ 10461follow uncased characters and lowercase characters only cased ones.\n\ 10462Return False otherwise."); 10463 10464static PyObject* 10465unicode_istitle(PyUnicodeObject *self) 10466{ 10467 Py_ssize_t i, length; 10468 int kind; 10469 void *data; 10470 int cased, previous_is_cased; 10471 10472 if (PyUnicode_READY(self) == -1) 10473 return NULL; 10474 length = PyUnicode_GET_LENGTH(self); 10475 kind = PyUnicode_KIND(self); 10476 data = PyUnicode_DATA(self); 10477 10478 /* Shortcut for single character strings */ 10479 if (length == 1) { 10480 Py_UCS4 ch = PyUnicode_READ(kind, data, 0); 10481 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) || 10482 (Py_UNICODE_ISUPPER(ch) != 0)); 10483 } 10484 10485 /* Special case for empty strings */ 10486 if (length == 0) 10487 return PyBool_FromLong(0); 10488 10489 cased = 0; 10490 previous_is_cased = 0; 10491 for (i = 0; i < length; i++) { 10492 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); 10493 10494 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) { 10495 if (previous_is_cased) 10496 return PyBool_FromLong(0); 10497 previous_is_cased = 1; 10498 cased = 1; 10499 } 10500 else if (Py_UNICODE_ISLOWER(ch)) { 10501 if (!previous_is_cased) 10502 return PyBool_FromLong(0); 10503 previous_is_cased = 1; 10504 cased = 1; 10505 } 10506 else 10507 previous_is_cased = 0; 10508 } 10509 return PyBool_FromLong(cased); 10510} 10511 10512PyDoc_STRVAR(isspace__doc__, 10513 "S.isspace() -> bool\n\ 10514\n\ 10515Return True if all characters in S are whitespace\n\ 10516and there is at least one character in S, False otherwise."); 10517 
10518static PyObject* 10519unicode_isspace(PyUnicodeObject *self) 10520{ 10521 Py_ssize_t i, length; 10522 int kind; 10523 void *data; 10524 10525 if (PyUnicode_READY(self) == -1) 10526 return NULL; 10527 length = PyUnicode_GET_LENGTH(self); 10528 kind = PyUnicode_KIND(self); 10529 data = PyUnicode_DATA(self); 10530 10531 /* Shortcut for single character strings */ 10532 if (length == 1) 10533 return PyBool_FromLong( 10534 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0))); 10535 10536 /* Special case for empty strings */ 10537 if (length == 0) 10538 return PyBool_FromLong(0); 10539 10540 for (i = 0; i < length; i++) { 10541 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); 10542 if (!Py_UNICODE_ISSPACE(ch)) 10543 return PyBool_FromLong(0); 10544 } 10545 return PyBool_FromLong(1); 10546} 10547 10548PyDoc_STRVAR(isalpha__doc__, 10549 "S.isalpha() -> bool\n\ 10550\n\ 10551Return True if all characters in S are alphabetic\n\ 10552and there is at least one character in S, False otherwise."); 10553 10554static PyObject* 10555unicode_isalpha(PyUnicodeObject *self) 10556{ 10557 Py_ssize_t i, length; 10558 int kind; 10559 void *data; 10560 10561 if (PyUnicode_READY(self) == -1) 10562 return NULL; 10563 length = PyUnicode_GET_LENGTH(self); 10564 kind = PyUnicode_KIND(self); 10565 data = PyUnicode_DATA(self); 10566 10567 /* Shortcut for single character strings */ 10568 if (length == 1) 10569 return PyBool_FromLong( 10570 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0))); 10571 10572 /* Special case for empty strings */ 10573 if (length == 0) 10574 return PyBool_FromLong(0); 10575 10576 for (i = 0; i < length; i++) { 10577 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i))) 10578 return PyBool_FromLong(0); 10579 } 10580 return PyBool_FromLong(1); 10581} 10582 10583PyDoc_STRVAR(isalnum__doc__, 10584 "S.isalnum() -> bool\n\ 10585\n\ 10586Return True if all characters in S are alphanumeric\n\ 10587and there is at least one character in S, False otherwise."); 10588 
10589static PyObject* 10590unicode_isalnum(PyUnicodeObject *self) 10591{ 10592 int kind; 10593 void *data; 10594 Py_ssize_t len, i; 10595 10596 if (PyUnicode_READY(self) == -1) 10597 return NULL; 10598 10599 kind = PyUnicode_KIND(self); 10600 data = PyUnicode_DATA(self); 10601 len = PyUnicode_GET_LENGTH(self); 10602 10603 /* Shortcut for single character strings */ 10604 if (len == 1) { 10605 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0); 10606 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch)); 10607 } 10608 10609 /* Special case for empty strings */ 10610 if (len == 0) 10611 return PyBool_FromLong(0); 10612 10613 for (i = 0; i < len; i++) { 10614 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); 10615 if (!Py_UNICODE_ISALNUM(ch)) 10616 return PyBool_FromLong(0); 10617 } 10618 return PyBool_FromLong(1); 10619} 10620 10621PyDoc_STRVAR(isdecimal__doc__, 10622 "S.isdecimal() -> bool\n\ 10623\n\ 10624Return True if there are only decimal characters in S,\n\ 10625False otherwise."); 10626 10627static PyObject* 10628unicode_isdecimal(PyUnicodeObject *self) 10629{ 10630 Py_ssize_t i, length; 10631 int kind; 10632 void *data; 10633 10634 if (PyUnicode_READY(self) == -1) 10635 return NULL; 10636 length = PyUnicode_GET_LENGTH(self); 10637 kind = PyUnicode_KIND(self); 10638 data = PyUnicode_DATA(self); 10639 10640 /* Shortcut for single character strings */ 10641 if (length == 1) 10642 return PyBool_FromLong( 10643 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0))); 10644 10645 /* Special case for empty strings */ 10646 if (length == 0) 10647 return PyBool_FromLong(0); 10648 10649 for (i = 0; i < length; i++) { 10650 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i))) 10651 return PyBool_FromLong(0); 10652 } 10653 return PyBool_FromLong(1); 10654} 10655 10656PyDoc_STRVAR(isdigit__doc__, 10657 "S.isdigit() -> bool\n\ 10658\n\ 10659Return True if all characters in S are digits\n\ 10660and there is at least one character in S, False otherwise."); 10661 10662static 
/* Return 1 if self is a valid identifier, 0 otherwise (the empty
   string is not an identifier).  If the string cannot be made ready
   this calls Py_FatalError(). */
int
PyUnicode_IsIdentifier(PyObject *self)
{
    int kind;
    void *data;
    Py_ssize_t pos, nchars;
    Py_UCS4 head;

    if (PyUnicode_READY(self) == -1) {
        Py_FatalError("identifier not ready");
        return 0;
    }

    /* An empty string is never an identifier. */
    nchars = PyUnicode_GET_LENGTH(self);
    if (nchars == 0)
        return 0;

    kind = PyUnicode_KIND(self);
    data = PyUnicode_DATA(self);

    /* Per PEP 3131 the first character must be in XID_Start and the
       remaining ones in XID_Continue, while the ASCII range keeps the
       2.x rules (letter or underscore first, then letters, digits,
       underscore).  With the current definitions of XID_Start and
       XID_Continue, testing those two properties is enough, provided
       '_' is additionally accepted as the first character. */
    head = PyUnicode_READ(kind, data, 0);
    if (head != 0x5F /* LOW LINE */ && !_PyUnicode_IsXidStart(head))
        return 0;

    for (pos = 1; pos < nchars; pos++) {
        const Py_UCS4 c = PyUnicode_READ(kind, data, pos);
        if (!_PyUnicode_IsXidContinue(c))
            return 0;
    }
    return 1;
}
10801 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) { 10802 Py_RETURN_FALSE; 10803 } 10804 } 10805 Py_RETURN_TRUE; 10806} 10807 10808PyDoc_STRVAR(join__doc__, 10809 "S.join(iterable) -> str\n\ 10810\n\ 10811Return a string which is the concatenation of the strings in the\n\ 10812iterable. The separator between elements is S."); 10813 10814static PyObject* 10815unicode_join(PyObject *self, PyObject *data) 10816{ 10817 return PyUnicode_Join(self, data); 10818} 10819 10820static Py_ssize_t 10821unicode_length(PyUnicodeObject *self) 10822{ 10823 if (PyUnicode_READY(self) == -1) 10824 return -1; 10825 return PyUnicode_GET_LENGTH(self); 10826} 10827 10828PyDoc_STRVAR(ljust__doc__, 10829 "S.ljust(width[, fillchar]) -> str\n\ 10830\n\ 10831Return S left-justified in a Unicode string of length width. Padding is\n\ 10832done using the specified fill character (default is a space)."); 10833 10834static PyObject * 10835unicode_ljust(PyUnicodeObject *self, PyObject *args) 10836{ 10837 Py_ssize_t width; 10838 Py_UCS4 fillchar = ' '; 10839 10840 if (PyUnicode_READY(self) == -1) 10841 return NULL; 10842 10843 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar)) 10844 return NULL; 10845 10846 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) { 10847 Py_INCREF(self); 10848 return (PyObject*) self; 10849 } 10850 10851 return (PyObject*) pad(self, 0, width - _PyUnicode_LENGTH(self), fillchar); 10852} 10853 10854PyDoc_STRVAR(lower__doc__, 10855 "S.lower() -> str\n\ 10856\n\ 10857Return a copy of the string S converted to lowercase."); 10858 10859static PyObject* 10860unicode_lower(PyUnicodeObject *self) 10861{ 10862 return fixup(self, fixlower); 10863} 10864 10865#define LEFTSTRIP 0 10866#define RIGHTSTRIP 1 10867#define BOTHSTRIP 2 10868 10869/* Arrays indexed by above */ 10870static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"}; 10871 10872#define STRIPNAME(i) (stripformat[i]+3) 10873 10874/* externally visible 
for str.strip(unicode) */ 10875PyObject * 10876_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj) 10877{ 10878 void *data; 10879 int kind; 10880 Py_ssize_t i, j, len; 10881 BLOOM_MASK sepmask; 10882 10883 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1) 10884 return NULL; 10885 10886 kind = PyUnicode_KIND(self); 10887 data = PyUnicode_DATA(self); 10888 len = PyUnicode_GET_LENGTH(self); 10889 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj), 10890 PyUnicode_DATA(sepobj), 10891 PyUnicode_GET_LENGTH(sepobj)); 10892 10893 i = 0; 10894 if (striptype != RIGHTSTRIP) { 10895 while (i < len && 10896 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) { 10897 i++; 10898 } 10899 } 10900 10901 j = len; 10902 if (striptype != LEFTSTRIP) { 10903 do { 10904 j--; 10905 } while (j >= i && 10906 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj)); 10907 j++; 10908 } 10909 10910 return PyUnicode_Substring((PyObject*)self, i, j); 10911} 10912 10913PyObject* 10914PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end) 10915{ 10916 unsigned char *data; 10917 int kind; 10918 Py_ssize_t length; 10919 10920 if (PyUnicode_READY(self) == -1) 10921 return NULL; 10922 10923 end = Py_MIN(end, PyUnicode_GET_LENGTH(self)); 10924 10925 if (start == 0 && end == PyUnicode_GET_LENGTH(self)) 10926 { 10927 if (PyUnicode_CheckExact(self)) { 10928 Py_INCREF(self); 10929 return self; 10930 } 10931 else 10932 return PyUnicode_Copy(self); 10933 } 10934 10935 length = end - start; 10936 if (length == 1) 10937 return unicode_getitem(self, start); 10938 10939 if (start < 0 || end < 0) { 10940 PyErr_SetString(PyExc_IndexError, "string index out of range"); 10941 return NULL; 10942 } 10943 10944 kind = PyUnicode_KIND(self); 10945 data = PyUnicode_1BYTE_DATA(self); 10946 return PyUnicode_FromKindAndData(kind, 10947 data + PyUnicode_KIND_SIZE(kind, start), 10948 length); 10949} 10950 10951static PyObject * 10952do_strip(PyUnicodeObject 
*self, int striptype) 10953{ 10954 int kind; 10955 void *data; 10956 Py_ssize_t len, i, j; 10957 10958 if (PyUnicode_READY(self) == -1) 10959 return NULL; 10960 10961 kind = PyUnicode_KIND(self); 10962 data = PyUnicode_DATA(self); 10963 len = PyUnicode_GET_LENGTH(self); 10964 10965 i = 0; 10966 if (striptype != RIGHTSTRIP) { 10967 while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) { 10968 i++; 10969 } 10970 } 10971 10972 j = len; 10973 if (striptype != LEFTSTRIP) { 10974 do { 10975 j--; 10976 } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j))); 10977 j++; 10978 } 10979 10980 return PyUnicode_Substring((PyObject*)self, i, j); 10981} 10982 10983 10984static PyObject * 10985do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args) 10986{ 10987 PyObject *sep = NULL; 10988 10989 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep)) 10990 return NULL; 10991 10992 if (sep != NULL && sep != Py_None) { 10993 if (PyUnicode_Check(sep)) 10994 return _PyUnicode_XStrip(self, striptype, sep); 10995 else { 10996 PyErr_Format(PyExc_TypeError, 10997 "%s arg must be None or str", 10998 STRIPNAME(striptype)); 10999 return NULL; 11000 } 11001 } 11002 11003 return do_strip(self, striptype); 11004} 11005 11006 11007PyDoc_STRVAR(strip__doc__, 11008 "S.strip([chars]) -> str\n\ 11009\n\ 11010Return a copy of the string S with leading and trailing\n\ 11011whitespace removed.\n\ 11012If chars is given and not None, remove characters in chars instead."); 11013 11014static PyObject * 11015unicode_strip(PyUnicodeObject *self, PyObject *args) 11016{ 11017 if (PyTuple_GET_SIZE(args) == 0) 11018 return do_strip(self, BOTHSTRIP); /* Common case */ 11019 else 11020 return do_argstrip(self, BOTHSTRIP, args); 11021} 11022 11023 11024PyDoc_STRVAR(lstrip__doc__, 11025 "S.lstrip([chars]) -> str\n\ 11026\n\ 11027Return a copy of the string S with leading whitespace removed.\n\ 11028If chars is given and not None, remove characters in chars 
instead."); 11029 11030static PyObject * 11031unicode_lstrip(PyUnicodeObject *self, PyObject *args) 11032{ 11033 if (PyTuple_GET_SIZE(args) == 0) 11034 return do_strip(self, LEFTSTRIP); /* Common case */ 11035 else 11036 return do_argstrip(self, LEFTSTRIP, args); 11037} 11038 11039 11040PyDoc_STRVAR(rstrip__doc__, 11041 "S.rstrip([chars]) -> str\n\ 11042\n\ 11043Return a copy of the string S with trailing whitespace removed.\n\ 11044If chars is given and not None, remove characters in chars instead."); 11045 11046static PyObject * 11047unicode_rstrip(PyUnicodeObject *self, PyObject *args) 11048{ 11049 if (PyTuple_GET_SIZE(args) == 0) 11050 return do_strip(self, RIGHTSTRIP); /* Common case */ 11051 else 11052 return do_argstrip(self, RIGHTSTRIP, args); 11053} 11054 11055 11056static PyObject* 11057unicode_repeat(PyUnicodeObject *str, Py_ssize_t len) 11058{ 11059 PyUnicodeObject *u; 11060 Py_ssize_t nchars, n; 11061 11062 if (len < 1) { 11063 Py_INCREF(unicode_empty); 11064 return unicode_empty; 11065 } 11066 11067 if (len == 1 && PyUnicode_CheckExact(str)) { 11068 /* no repeat, return original string */ 11069 Py_INCREF(str); 11070 return (PyObject*) str; 11071 } 11072 11073 if (PyUnicode_READY(str) == -1) 11074 return NULL; 11075 11076 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) { 11077 PyErr_SetString(PyExc_OverflowError, 11078 "repeated string is too long"); 11079 return NULL; 11080 } 11081 nchars = len * PyUnicode_GET_LENGTH(str); 11082 11083 u = (PyUnicodeObject *)PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str)); 11084 if (!u) 11085 return NULL; 11086 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str)); 11087 11088 if (PyUnicode_GET_LENGTH(str) == 1) { 11089 const int kind = PyUnicode_KIND(str); 11090 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0); 11091 void *to = PyUnicode_DATA(u); 11092 if (kind == PyUnicode_1BYTE_KIND) 11093 memset(to, (unsigned char)fill_char, len); 11094 else { 11095 for (n = 0; n < len; ++n) 11096 
PyUnicode_WRITE(kind, to, n, fill_char); 11097 } 11098 } 11099 else { 11100 /* number of characters copied this far */ 11101 Py_ssize_t done = PyUnicode_GET_LENGTH(str); 11102 const Py_ssize_t char_size = PyUnicode_CHARACTER_SIZE(str); 11103 char *to = (char *) PyUnicode_DATA(u); 11104 Py_MEMCPY(to, PyUnicode_DATA(str), 11105 PyUnicode_GET_LENGTH(str) * char_size); 11106 while (done < nchars) { 11107 n = (done <= nchars-done) ? done : nchars-done; 11108 Py_MEMCPY(to + (done * char_size), to, n * char_size); 11109 done += n; 11110 } 11111 } 11112 11113 return (PyObject*) u; 11114} 11115 11116PyObject * 11117PyUnicode_Replace(PyObject *obj, 11118 PyObject *subobj, 11119 PyObject *replobj, 11120 Py_ssize_t maxcount) 11121{ 11122 PyObject *self; 11123 PyObject *str1; 11124 PyObject *str2; 11125 PyObject *result; 11126 11127 self = PyUnicode_FromObject(obj); 11128 if (self == NULL || PyUnicode_READY(self) == -1) 11129 return NULL; 11130 str1 = PyUnicode_FromObject(subobj); 11131 if (str1 == NULL || PyUnicode_READY(str1) == -1) { 11132 Py_DECREF(self); 11133 return NULL; 11134 } 11135 str2 = PyUnicode_FromObject(replobj); 11136 if (str2 == NULL || PyUnicode_READY(str2)) { 11137 Py_DECREF(self); 11138 Py_DECREF(str1); 11139 return NULL; 11140 } 11141 result = replace(self, str1, str2, maxcount); 11142 Py_DECREF(self); 11143 Py_DECREF(str1); 11144 Py_DECREF(str2); 11145 return result; 11146} 11147 11148PyDoc_STRVAR(replace__doc__, 11149 "S.replace(old, new[, count]) -> str\n\ 11150\n\ 11151Return a copy of S with all occurrences of substring\n\ 11152old replaced by new. 
If the optional argument count is\n\ 11153given, only the first count occurrences are replaced."); 11154 11155static PyObject* 11156unicode_replace(PyObject *self, PyObject *args) 11157{ 11158 PyObject *str1; 11159 PyObject *str2; 11160 Py_ssize_t maxcount = -1; 11161 PyObject *result; 11162 11163 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount)) 11164 return NULL; 11165 if (!PyUnicode_READY(self) == -1) 11166 return NULL; 11167 str1 = PyUnicode_FromObject(str1); 11168 if (str1 == NULL || PyUnicode_READY(str1) == -1) 11169 return NULL; 11170 str2 = PyUnicode_FromObject(str2); 11171 if (str2 == NULL || PyUnicode_READY(str2) == -1) { 11172 Py_DECREF(str1); 11173 return NULL; 11174 } 11175 11176 result = replace(self, str1, str2, maxcount); 11177 11178 Py_DECREF(str1); 11179 Py_DECREF(str2); 11180 return result; 11181} 11182 11183static PyObject * 11184unicode_repr(PyObject *unicode) 11185{ 11186 PyObject *repr; 11187 Py_ssize_t isize; 11188 Py_ssize_t osize, squote, dquote, i, o; 11189 Py_UCS4 max, quote; 11190 int ikind, okind; 11191 void *idata, *odata; 11192 11193 if (PyUnicode_READY(unicode) == -1) 11194 return NULL; 11195 11196 isize = PyUnicode_GET_LENGTH(unicode); 11197 idata = PyUnicode_DATA(unicode); 11198 11199 /* Compute length of output, quote characters, and 11200 maximum character */ 11201 osize = 2; /* quotes */ 11202 max = 127; 11203 squote = dquote = 0; 11204 ikind = PyUnicode_KIND(unicode); 11205 for (i = 0; i < isize; i++) { 11206 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i); 11207 switch (ch) { 11208 case '\'': squote++; osize++; break; 11209 case '"': dquote++; osize++; break; 11210 case '\\': case '\t': case '\r': case '\n': 11211 osize += 2; break; 11212 default: 11213 /* Fast-path ASCII */ 11214 if (ch < ' ' || ch == 0x7f) 11215 osize += 4; /* \xHH */ 11216 else if (ch < 0x7f) 11217 osize++; 11218 else if (Py_UNICODE_ISPRINTABLE(ch)) { 11219 osize++; 11220 max = ch > max ? 
ch : max; 11221 } 11222 else if (ch < 0x100) 11223 osize += 4; /* \xHH */ 11224 else if (ch < 0x10000) 11225 osize += 6; /* \uHHHH */ 11226 else 11227 osize += 10; /* \uHHHHHHHH */ 11228 } 11229 } 11230 11231 quote = '\''; 11232 if (squote) { 11233 if (dquote) 11234 /* Both squote and dquote present. Use squote, 11235 and escape them */ 11236 osize += squote; 11237 else 11238 quote = '"'; 11239 } 11240 11241 repr = PyUnicode_New(osize, max); 11242 if (repr == NULL) 11243 return NULL; 11244 okind = PyUnicode_KIND(repr); 11245 odata = PyUnicode_DATA(repr); 11246 11247 PyUnicode_WRITE(okind, odata, 0, quote); 11248 PyUnicode_WRITE(okind, odata, osize-1, quote); 11249 11250 for (i = 0, o = 1; i < isize; i++) { 11251 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i); 11252 11253 /* Escape quotes and backslashes */ 11254 if ((ch == quote) || (ch == '\\')) { 11255 PyUnicode_WRITE(okind, odata, o++, '\\'); 11256 PyUnicode_WRITE(okind, odata, o++, ch); 11257 continue; 11258 } 11259 11260 /* Map special whitespace to '\t', \n', '\r' */ 11261 if (ch == '\t') { 11262 PyUnicode_WRITE(okind, odata, o++, '\\'); 11263 PyUnicode_WRITE(okind, odata, o++, 't'); 11264 } 11265 else if (ch == '\n') { 11266 PyUnicode_WRITE(okind, odata, o++, '\\'); 11267 PyUnicode_WRITE(okind, odata, o++, 'n'); 11268 } 11269 else if (ch == '\r') { 11270 PyUnicode_WRITE(okind, odata, o++, '\\'); 11271 PyUnicode_WRITE(okind, odata, o++, 'r'); 11272 } 11273 11274 /* Map non-printable US ASCII to '\xhh' */ 11275 else if (ch < ' ' || ch == 0x7F) { 11276 PyUnicode_WRITE(okind, odata, o++, '\\'); 11277 PyUnicode_WRITE(okind, odata, o++, 'x'); 11278 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0x000F]); 11279 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0x000F]); 11280 } 11281 11282 /* Copy ASCII characters as-is */ 11283 else if (ch < 0x7F) { 11284 PyUnicode_WRITE(okind, odata, o++, ch); 11285 } 11286 11287 /* Non-ASCII characters */ 11288 else { 11289 /* Map Unicode whitespace and control 
characters 11290 (categories Z* and C* except ASCII space) 11291 */ 11292 if (!Py_UNICODE_ISPRINTABLE(ch)) { 11293 /* Map 8-bit characters to '\xhh' */ 11294 if (ch <= 0xff) { 11295 PyUnicode_WRITE(okind, odata, o++, '\\'); 11296 PyUnicode_WRITE(okind, odata, o++, 'x'); 11297 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0x000F]); 11298 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0x000F]); 11299 } 11300 /* Map 21-bit characters to '\U00xxxxxx' */ 11301 else if (ch >= 0x10000) { 11302 PyUnicode_WRITE(okind, odata, o++, '\\'); 11303 PyUnicode_WRITE(okind, odata, o++, 'U'); 11304 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 28) & 0xF]); 11305 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 24) & 0xF]); 11306 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 20) & 0xF]); 11307 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 16) & 0xF]); 11308 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 12) & 0xF]); 11309 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 8) & 0xF]); 11310 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0xF]); 11311 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0xF]); 11312 } 11313 /* Map 16-bit characters to '\uxxxx' */ 11314 else { 11315 PyUnicode_WRITE(okind, odata, o++, '\\'); 11316 PyUnicode_WRITE(okind, odata, o++, 'u'); 11317 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 12) & 0xF]); 11318 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 8) & 0xF]); 11319 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0xF]); 11320 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0xF]); 11321 } 11322 } 11323 /* Copy characters as-is */ 11324 else { 11325 PyUnicode_WRITE(okind, odata, o++, ch); 11326 } 11327 } 11328 } 11329 /* Closing quote already added at the beginning */ 11330 return repr; 11331} 11332 11333PyDoc_STRVAR(rfind__doc__, 11334 "S.rfind(sub[, start[, end]]) -> int\n\ 11335\n\ 11336Return the highest index in S where substring sub is found,\n\ 11337such that sub is contained 
within S[start:end]. Optional\n\ 11338arguments start and end are interpreted as in slice notation.\n\ 11339\n\ 11340Return -1 on failure."); 11341 11342static PyObject * 11343unicode_rfind(PyObject *self, PyObject *args) 11344{ 11345 PyUnicodeObject *substring; 11346 Py_ssize_t start; 11347 Py_ssize_t end; 11348 Py_ssize_t result; 11349 11350 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring, 11351 &start, &end)) 11352 return NULL; 11353 11354 if (PyUnicode_READY(self) == -1) 11355 return NULL; 11356 if (PyUnicode_READY(substring) == -1) 11357 return NULL; 11358 11359 result = any_find_slice( 11360 ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice, 11361 self, (PyObject*)substring, start, end 11362 ); 11363 11364 Py_DECREF(substring); 11365 11366 if (result == -2) 11367 return NULL; 11368 11369 return PyLong_FromSsize_t(result); 11370} 11371 11372PyDoc_STRVAR(rindex__doc__, 11373 "S.rindex(sub[, start[, end]]) -> int\n\ 11374\n\ 11375Like S.rfind() but raise ValueError when the substring is not found."); 11376 11377static PyObject * 11378unicode_rindex(PyObject *self, PyObject *args) 11379{ 11380 PyUnicodeObject *substring; 11381 Py_ssize_t start; 11382 Py_ssize_t end; 11383 Py_ssize_t result; 11384 11385 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring, 11386 &start, &end)) 11387 return NULL; 11388 11389 if (PyUnicode_READY(self) == -1) 11390 return NULL; 11391 if (PyUnicode_READY(substring) == -1) 11392 return NULL; 11393 11394 result = any_find_slice( 11395 ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice, 11396 self, (PyObject*)substring, start, end 11397 ); 11398 11399 Py_DECREF(substring); 11400 11401 if (result == -2) 11402 return NULL; 11403 11404 if (result < 0) { 11405 PyErr_SetString(PyExc_ValueError, "substring not found"); 11406 return NULL; 11407 } 11408 11409 return PyLong_FromSsize_t(result); 11410} 11411 11412PyDoc_STRVAR(rjust__doc__, 11413 "S.rjust(width[, fillchar]) -> str\n\ 11414\n\ 
11415Return S right-justified in a string of length width. Padding is\n\ 11416done using the specified fill character (default is a space)."); 11417 11418static PyObject * 11419unicode_rjust(PyUnicodeObject *self, PyObject *args) 11420{ 11421 Py_ssize_t width; 11422 Py_UCS4 fillchar = ' '; 11423 11424 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar)) 11425 return NULL; 11426 11427 if (PyUnicode_READY(self) == -1) 11428 return NULL; 11429 11430 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) { 11431 Py_INCREF(self); 11432 return (PyObject*) self; 11433 } 11434 11435 return (PyObject*) pad(self, width - _PyUnicode_LENGTH(self), 0, fillchar); 11436} 11437 11438PyObject * 11439PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit) 11440{ 11441 PyObject *result; 11442 11443 s = PyUnicode_FromObject(s); 11444 if (s == NULL) 11445 return NULL; 11446 if (sep != NULL) { 11447 sep = PyUnicode_FromObject(sep); 11448 if (sep == NULL) { 11449 Py_DECREF(s); 11450 return NULL; 11451 } 11452 } 11453 11454 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit); 11455 11456 Py_DECREF(s); 11457 Py_XDECREF(sep); 11458 return result; 11459} 11460 11461PyDoc_STRVAR(split__doc__, 11462 "S.split([sep[, maxsplit]]) -> list of strings\n\ 11463\n\ 11464Return a list of the words in S, using sep as the\n\ 11465delimiter string. If maxsplit is given, at most maxsplit\n\ 11466splits are done. 
If sep is not specified or is None, any\n\ 11467whitespace string is a separator and empty strings are\n\ 11468removed from the result."); 11469 11470static PyObject* 11471unicode_split(PyUnicodeObject *self, PyObject *args) 11472{ 11473 PyObject *substring = Py_None; 11474 Py_ssize_t maxcount = -1; 11475 11476 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount)) 11477 return NULL; 11478 11479 if (substring == Py_None) 11480 return split(self, NULL, maxcount); 11481 else if (PyUnicode_Check(substring)) 11482 return split(self, (PyUnicodeObject *)substring, maxcount); 11483 else 11484 return PyUnicode_Split((PyObject *)self, substring, maxcount); 11485} 11486 11487PyObject * 11488PyUnicode_Partition(PyObject *str_in, PyObject *sep_in) 11489{ 11490 PyObject* str_obj; 11491 PyObject* sep_obj; 11492 PyObject* out; 11493 int kind1, kind2, kind; 11494 void *buf1 = NULL, *buf2 = NULL; 11495 Py_ssize_t len1, len2; 11496 11497 str_obj = PyUnicode_FromObject(str_in); 11498 if (!str_obj || PyUnicode_READY(str_obj) == -1) 11499 return NULL; 11500 sep_obj = PyUnicode_FromObject(sep_in); 11501 if (!sep_obj || PyUnicode_READY(sep_obj) == -1) { 11502 Py_DECREF(str_obj); 11503 return NULL; 11504 } 11505 11506 kind1 = PyUnicode_KIND(str_in); 11507 kind2 = PyUnicode_KIND(sep_obj); 11508 kind = kind1 > kind2 ? 
kind1 : kind2; 11509 buf1 = PyUnicode_DATA(str_in); 11510 if (kind1 != kind) 11511 buf1 = _PyUnicode_AsKind(str_in, kind); 11512 if (!buf1) 11513 goto onError; 11514 buf2 = PyUnicode_DATA(sep_obj); 11515 if (kind2 != kind) 11516 buf2 = _PyUnicode_AsKind(sep_obj, kind); 11517 if (!buf2) 11518 goto onError; 11519 len1 = PyUnicode_GET_LENGTH(str_obj); 11520 len2 = PyUnicode_GET_LENGTH(sep_obj); 11521 11522 switch(PyUnicode_KIND(str_in)) { 11523 case PyUnicode_1BYTE_KIND: 11524 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2); 11525 break; 11526 case PyUnicode_2BYTE_KIND: 11527 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2); 11528 break; 11529 case PyUnicode_4BYTE_KIND: 11530 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2); 11531 break; 11532 default: 11533 assert(0); 11534 out = 0; 11535 } 11536 11537 Py_DECREF(sep_obj); 11538 Py_DECREF(str_obj); 11539 if (kind1 != kind) 11540 PyMem_Free(buf1); 11541 if (kind2 != kind) 11542 PyMem_Free(buf2); 11543 11544 return out; 11545 onError: 11546 Py_DECREF(sep_obj); 11547 Py_DECREF(str_obj); 11548 if (kind1 != kind && buf1) 11549 PyMem_Free(buf1); 11550 if (kind2 != kind && buf2) 11551 PyMem_Free(buf2); 11552 return NULL; 11553} 11554 11555 11556PyObject * 11557PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in) 11558{ 11559 PyObject* str_obj; 11560 PyObject* sep_obj; 11561 PyObject* out; 11562 int kind1, kind2, kind; 11563 void *buf1 = NULL, *buf2 = NULL; 11564 Py_ssize_t len1, len2; 11565 11566 str_obj = PyUnicode_FromObject(str_in); 11567 if (!str_obj) 11568 return NULL; 11569 sep_obj = PyUnicode_FromObject(sep_in); 11570 if (!sep_obj) { 11571 Py_DECREF(str_obj); 11572 return NULL; 11573 } 11574 11575 kind1 = PyUnicode_KIND(str_in); 11576 kind2 = PyUnicode_KIND(sep_obj); 11577 kind = Py_MAX(kind1, kind2); 11578 buf1 = PyUnicode_DATA(str_in); 11579 if (kind1 != kind) 11580 buf1 = _PyUnicode_AsKind(str_in, kind); 11581 if (!buf1) 11582 goto onError; 11583 buf2 = 
PyUnicode_DATA(sep_obj); 11584 if (kind2 != kind) 11585 buf2 = _PyUnicode_AsKind(sep_obj, kind); 11586 if (!buf2) 11587 goto onError; 11588 len1 = PyUnicode_GET_LENGTH(str_obj); 11589 len2 = PyUnicode_GET_LENGTH(sep_obj); 11590 11591 switch(PyUnicode_KIND(str_in)) { 11592 case PyUnicode_1BYTE_KIND: 11593 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2); 11594 break; 11595 case PyUnicode_2BYTE_KIND: 11596 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2); 11597 break; 11598 case PyUnicode_4BYTE_KIND: 11599 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2); 11600 break; 11601 default: 11602 assert(0); 11603 out = 0; 11604 } 11605 11606 Py_DECREF(sep_obj); 11607 Py_DECREF(str_obj); 11608 if (kind1 != kind) 11609 PyMem_Free(buf1); 11610 if (kind2 != kind) 11611 PyMem_Free(buf2); 11612 11613 return out; 11614 onError: 11615 Py_DECREF(sep_obj); 11616 Py_DECREF(str_obj); 11617 if (kind1 != kind && buf1) 11618 PyMem_Free(buf1); 11619 if (kind2 != kind && buf2) 11620 PyMem_Free(buf2); 11621 return NULL; 11622} 11623 11624PyDoc_STRVAR(partition__doc__, 11625 "S.partition(sep) -> (head, sep, tail)\n\ 11626\n\ 11627Search for the separator sep in S, and return the part before it,\n\ 11628the separator itself, and the part after it. If the separator is not\n\ 11629found, return S and two empty strings."); 11630 11631static PyObject* 11632unicode_partition(PyUnicodeObject *self, PyObject *separator) 11633{ 11634 return PyUnicode_Partition((PyObject *)self, separator); 11635} 11636 11637PyDoc_STRVAR(rpartition__doc__, 11638 "S.rpartition(sep) -> (head, sep, tail)\n\ 11639\n\ 11640Search for the separator sep in S, starting at the end of S, and return\n\ 11641the part before it, the separator itself, and the part after it. 
If the\n\ 11642separator is not found, return two empty strings and S."); 11643 11644static PyObject* 11645unicode_rpartition(PyUnicodeObject *self, PyObject *separator) 11646{ 11647 return PyUnicode_RPartition((PyObject *)self, separator); 11648} 11649 11650PyObject * 11651PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit) 11652{ 11653 PyObject *result; 11654 11655 s = PyUnicode_FromObject(s); 11656 if (s == NULL) 11657 return NULL; 11658 if (sep != NULL) { 11659 sep = PyUnicode_FromObject(sep); 11660 if (sep == NULL) { 11661 Py_DECREF(s); 11662 return NULL; 11663 } 11664 } 11665 11666 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit); 11667 11668 Py_DECREF(s); 11669 Py_XDECREF(sep); 11670 return result; 11671} 11672 11673PyDoc_STRVAR(rsplit__doc__, 11674 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\ 11675\n\ 11676Return a list of the words in S, using sep as the\n\ 11677delimiter string, starting at the end of the string and\n\ 11678working to the front. If maxsplit is given, at most maxsplit\n\ 11679splits are done. 
If sep is not specified, any whitespace string\n\ 11680is a separator."); 11681 11682static PyObject* 11683unicode_rsplit(PyUnicodeObject *self, PyObject *args) 11684{ 11685 PyObject *substring = Py_None; 11686 Py_ssize_t maxcount = -1; 11687 11688 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount)) 11689 return NULL; 11690 11691 if (substring == Py_None) 11692 return rsplit(self, NULL, maxcount); 11693 else if (PyUnicode_Check(substring)) 11694 return rsplit(self, (PyUnicodeObject *)substring, maxcount); 11695 else 11696 return PyUnicode_RSplit((PyObject *)self, substring, maxcount); 11697} 11698 11699PyDoc_STRVAR(splitlines__doc__, 11700 "S.splitlines([keepends]) -> list of strings\n\ 11701\n\ 11702Return a list of the lines in S, breaking at line boundaries.\n\ 11703Line breaks are not included in the resulting list unless keepends\n\ 11704is given and true."); 11705 11706static PyObject* 11707unicode_splitlines(PyUnicodeObject *self, PyObject *args, PyObject *kwds) 11708{ 11709 static char *kwlist[] = {"keepends", 0}; 11710 int keepends = 0; 11711 11712 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines", 11713 kwlist, &keepends)) 11714 return NULL; 11715 11716 return PyUnicode_Splitlines((PyObject *)self, keepends); 11717} 11718 11719static 11720PyObject *unicode_str(PyObject *self) 11721{ 11722 if (PyUnicode_CheckExact(self)) { 11723 Py_INCREF(self); 11724 return self; 11725 } else 11726 /* Subtype -- return genuine unicode string with the same value. 
*/ 11727 return PyUnicode_Copy(self); 11728} 11729 11730PyDoc_STRVAR(swapcase__doc__, 11731 "S.swapcase() -> str\n\ 11732\n\ 11733Return a copy of S with uppercase characters converted to lowercase\n\ 11734and vice versa."); 11735 11736static PyObject* 11737unicode_swapcase(PyUnicodeObject *self) 11738{ 11739 return fixup(self, fixswapcase); 11740} 11741 11742PyDoc_STRVAR(maketrans__doc__, 11743 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\ 11744\n\ 11745Return a translation table usable for str.translate().\n\ 11746If there is only one argument, it must be a dictionary mapping Unicode\n\ 11747ordinals (integers) or characters to Unicode ordinals, strings or None.\n\ 11748Character keys will be then converted to ordinals.\n\ 11749If there are two arguments, they must be strings of equal length, and\n\ 11750in the resulting dictionary, each character in x will be mapped to the\n\ 11751character at the same position in y. If there is a third argument, it\n\ 11752must be a string, whose characters will be mapped to None in the result."); 11753 11754static PyObject* 11755unicode_maketrans(PyUnicodeObject *null, PyObject *args) 11756{ 11757 PyObject *x, *y = NULL, *z = NULL; 11758 PyObject *new = NULL, *key, *value; 11759 Py_ssize_t i = 0; 11760 int res; 11761 11762 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z)) 11763 return NULL; 11764 new = PyDict_New(); 11765 if (!new) 11766 return NULL; 11767 if (y != NULL) { 11768 int x_kind, y_kind, z_kind; 11769 void *x_data, *y_data, *z_data; 11770 11771 /* x must be a string too, of equal length */ 11772 if (!PyUnicode_Check(x)) { 11773 PyErr_SetString(PyExc_TypeError, "first maketrans argument must " 11774 "be a string if there is a second argument"); 11775 goto err; 11776 } 11777 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) { 11778 PyErr_SetString(PyExc_ValueError, "the first two maketrans " 11779 "arguments must have equal length"); 11780 goto err; 11781 } 11782 /* create entries for 
translating chars in x to those in y */ 11783 x_kind = PyUnicode_KIND(x); 11784 y_kind = PyUnicode_KIND(y); 11785 x_data = PyUnicode_DATA(x); 11786 y_data = PyUnicode_DATA(y); 11787 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) { 11788 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i)); 11789 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i)); 11790 if (!key || !value) 11791 goto err; 11792 res = PyDict_SetItem(new, key, value); 11793 Py_DECREF(key); 11794 Py_DECREF(value); 11795 if (res < 0) 11796 goto err; 11797 } 11798 /* create entries for deleting chars in z */ 11799 if (z != NULL) { 11800 z_kind = PyUnicode_KIND(z); 11801 z_data = PyUnicode_DATA(z); 11802 for (i = 0; i < PyUnicode_GET_SIZE(z); i++) { 11803 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i)); 11804 if (!key) 11805 goto err; 11806 res = PyDict_SetItem(new, key, Py_None); 11807 Py_DECREF(key); 11808 if (res < 0) 11809 goto err; 11810 } 11811 } 11812 } else { 11813 int kind; 11814 void *data; 11815 11816 /* x must be a dict */ 11817 if (!PyDict_CheckExact(x)) { 11818 PyErr_SetString(PyExc_TypeError, "if you give only one argument " 11819 "to maketrans it must be a dict"); 11820 goto err; 11821 } 11822 /* copy entries into the new dict, converting string keys to int keys */ 11823 while (PyDict_Next(x, &i, &key, &value)) { 11824 if (PyUnicode_Check(key)) { 11825 /* convert string keys to integer keys */ 11826 PyObject *newkey; 11827 if (PyUnicode_GET_SIZE(key) != 1) { 11828 PyErr_SetString(PyExc_ValueError, "string keys in translate " 11829 "table must be of length 1"); 11830 goto err; 11831 } 11832 kind = PyUnicode_KIND(key); 11833 data = PyUnicode_DATA(key); 11834 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0)); 11835 if (!newkey) 11836 goto err; 11837 res = PyDict_SetItem(new, newkey, value); 11838 Py_DECREF(newkey); 11839 if (res < 0) 11840 goto err; 11841 } else if (PyLong_Check(key)) { 11842 /* just keep integer keys */ 11843 if (PyDict_SetItem(new, key, value) < 0) 
11844 goto err; 11845 } else { 11846 PyErr_SetString(PyExc_TypeError, "keys in translate table must " 11847 "be strings or integers"); 11848 goto err; 11849 } 11850 } 11851 } 11852 return new; 11853 err: 11854 Py_DECREF(new); 11855 return NULL; 11856} 11857 11858PyDoc_STRVAR(translate__doc__, 11859 "S.translate(table) -> str\n\ 11860\n\ 11861Return a copy of the string S, where all characters have been mapped\n\ 11862through the given translation table, which must be a mapping of\n\ 11863Unicode ordinals to Unicode ordinals, strings, or None.\n\ 11864Unmapped characters are left untouched. Characters mapped to None\n\ 11865are deleted."); 11866 11867static PyObject* 11868unicode_translate(PyObject *self, PyObject *table) 11869{ 11870 return _PyUnicode_TranslateCharmap(self, table, "ignore"); 11871} 11872 11873PyDoc_STRVAR(upper__doc__, 11874 "S.upper() -> str\n\ 11875\n\ 11876Return a copy of S converted to uppercase."); 11877 11878static PyObject* 11879unicode_upper(PyUnicodeObject *self) 11880{ 11881 return fixup(self, fixupper); 11882} 11883 11884PyDoc_STRVAR(zfill__doc__, 11885 "S.zfill(width) -> str\n\ 11886\n\ 11887Pad a numeric string S with zeros on the left, to fill a field\n\ 11888of the specified width. 
The string S is never truncated."); 11889 11890static PyObject * 11891unicode_zfill(PyUnicodeObject *self, PyObject *args) 11892{ 11893 Py_ssize_t fill; 11894 PyUnicodeObject *u; 11895 Py_ssize_t width; 11896 int kind; 11897 void *data; 11898 Py_UCS4 chr; 11899 11900 if (PyUnicode_READY(self) == -1) 11901 return NULL; 11902 11903 if (!PyArg_ParseTuple(args, "n:zfill", &width)) 11904 return NULL; 11905 11906 if (PyUnicode_GET_LENGTH(self) >= width) { 11907 if (PyUnicode_CheckExact(self)) { 11908 Py_INCREF(self); 11909 return (PyObject*) self; 11910 } 11911 else 11912 return PyUnicode_Copy((PyObject*)self); 11913 } 11914 11915 fill = width - _PyUnicode_LENGTH(self); 11916 11917 u = pad(self, fill, 0, '0'); 11918 11919 if (u == NULL) 11920 return NULL; 11921 11922 kind = PyUnicode_KIND(u); 11923 data = PyUnicode_DATA(u); 11924 chr = PyUnicode_READ(kind, data, fill); 11925 11926 if (chr == '+' || chr == '-') { 11927 /* move sign to beginning of string */ 11928 PyUnicode_WRITE(kind, data, 0, chr); 11929 PyUnicode_WRITE(kind, data, fill, '0'); 11930 } 11931 11932 return (PyObject*) u; 11933} 11934 11935#if 0 11936static PyObject * 11937unicode__decimal2ascii(PyObject *self) 11938{ 11939 return PyUnicode_TransformDecimalAndSpaceToASCII(self); 11940} 11941#endif 11942 11943PyDoc_STRVAR(startswith__doc__, 11944 "S.startswith(prefix[, start[, end]]) -> bool\n\ 11945\n\ 11946Return True if S starts with the specified prefix, False otherwise.\n\ 11947With optional start, test S beginning at that position.\n\ 11948With optional end, stop comparing S at that position.\n\ 11949prefix can also be a tuple of strings to try."); 11950 11951static PyObject * 11952unicode_startswith(PyUnicodeObject *self, 11953 PyObject *args) 11954{ 11955 PyObject *subobj; 11956 PyUnicodeObject *substring; 11957 Py_ssize_t start = 0; 11958 Py_ssize_t end = PY_SSIZE_T_MAX; 11959 int result; 11960 11961 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end)) 11962 return NULL; 11963 
if (PyTuple_Check(subobj)) { 11964 Py_ssize_t i; 11965 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) { 11966 substring = (PyUnicodeObject *)PyUnicode_FromObject( 11967 PyTuple_GET_ITEM(subobj, i)); 11968 if (substring == NULL) 11969 return NULL; 11970 result = tailmatch(self, substring, start, end, -1); 11971 Py_DECREF(substring); 11972 if (result) { 11973 Py_RETURN_TRUE; 11974 } 11975 } 11976 /* nothing matched */ 11977 Py_RETURN_FALSE; 11978 } 11979 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj); 11980 if (substring == NULL) { 11981 if (PyErr_ExceptionMatches(PyExc_TypeError)) 11982 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or " 11983 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name); 11984 return NULL; 11985 } 11986 result = tailmatch(self, substring, start, end, -1); 11987 Py_DECREF(substring); 11988 return PyBool_FromLong(result); 11989} 11990 11991 11992PyDoc_STRVAR(endswith__doc__, 11993 "S.endswith(suffix[, start[, end]]) -> bool\n\ 11994\n\ 11995Return True if S ends with the specified suffix, False otherwise.\n\ 11996With optional start, test S beginning at that position.\n\ 11997With optional end, stop comparing S at that position.\n\ 11998suffix can also be a tuple of strings to try."); 11999 12000static PyObject * 12001unicode_endswith(PyUnicodeObject *self, 12002 PyObject *args) 12003{ 12004 PyObject *subobj; 12005 PyUnicodeObject *substring; 12006 Py_ssize_t start = 0; 12007 Py_ssize_t end = PY_SSIZE_T_MAX; 12008 int result; 12009 12010 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end)) 12011 return NULL; 12012 if (PyTuple_Check(subobj)) { 12013 Py_ssize_t i; 12014 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) { 12015 substring = (PyUnicodeObject *)PyUnicode_FromObject( 12016 PyTuple_GET_ITEM(subobj, i)); 12017 if (substring == NULL) 12018 return NULL; 12019 result = tailmatch(self, substring, start, end, +1); 12020 Py_DECREF(substring); 12021 if (result) { 12022 Py_RETURN_TRUE; 12023 } 
12024 } 12025 Py_RETURN_FALSE; 12026 } 12027 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj); 12028 if (substring == NULL) { 12029 if (PyErr_ExceptionMatches(PyExc_TypeError)) 12030 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or " 12031 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name); 12032 return NULL; 12033 } 12034 result = tailmatch(self, substring, start, end, +1); 12035 Py_DECREF(substring); 12036 return PyBool_FromLong(result); 12037} 12038 12039#include "stringlib/unicode_format.h" 12040 12041PyDoc_STRVAR(format__doc__, 12042 "S.format(*args, **kwargs) -> str\n\ 12043\n\ 12044Return a formatted version of S, using substitutions from args and kwargs.\n\ 12045The substitutions are identified by braces ('{' and '}')."); 12046 12047PyDoc_STRVAR(format_map__doc__, 12048 "S.format_map(mapping) -> str\n\ 12049\n\ 12050Return a formatted version of S, using substitutions from mapping.\n\ 12051The substitutions are identified by braces ('{' and '}')."); 12052 12053static PyObject * 12054unicode__format__(PyObject* self, PyObject* args) 12055{ 12056 PyObject *format_spec; 12057 12058 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec)) 12059 return NULL; 12060 12061 return _PyUnicode_FormatAdvanced(self, format_spec, 0, 12062 PyUnicode_GET_LENGTH(format_spec)); 12063} 12064 12065PyDoc_STRVAR(p_format__doc__, 12066 "S.__format__(format_spec) -> str\n\ 12067\n\ 12068Return a formatted version of S as described by format_spec."); 12069 12070static PyObject * 12071unicode__sizeof__(PyUnicodeObject *v) 12072{ 12073 Py_ssize_t size; 12074 12075 /* If it's a compact object, account for base structure + 12076 character data. 
*/ 12077 if (PyUnicode_IS_COMPACT_ASCII(v)) 12078 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1; 12079 else if (PyUnicode_IS_COMPACT(v)) 12080 size = sizeof(PyCompactUnicodeObject) + 12081 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_CHARACTER_SIZE(v); 12082 else { 12083 /* If it is a two-block object, account for base object, and 12084 for character block if present. */ 12085 size = sizeof(PyUnicodeObject); 12086 if (_PyUnicode_DATA_ANY(v)) 12087 size += (PyUnicode_GET_LENGTH(v) + 1) * 12088 PyUnicode_CHARACTER_SIZE(v); 12089 } 12090 /* If the wstr pointer is present, account for it unless it is shared 12091 with the data pointer. Check if the data is not shared. */ 12092 if (_PyUnicode_HAS_WSTR_MEMORY(v)) 12093 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t); 12094 if (_PyUnicode_HAS_UTF8_MEMORY(v)) 12095 size += PyUnicode_UTF8_LENGTH(v) + 1; 12096 12097 return PyLong_FromSsize_t(size); 12098} 12099 12100PyDoc_STRVAR(sizeof__doc__, 12101 "S.__sizeof__() -> size of S in memory, in bytes"); 12102 12103static PyObject * 12104unicode_getnewargs(PyObject *v) 12105{ 12106 PyObject *copy = PyUnicode_Copy(v); 12107 if (!copy) 12108 return NULL; 12109 return Py_BuildValue("(N)", copy); 12110} 12111 12112static PyMethodDef unicode_methods[] = { 12113 12114 /* Order is according to common usage: often used methods should 12115 appear first, since lookup is done sequentially. 
*/ 12116 12117 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__}, 12118 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__}, 12119 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__}, 12120 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__}, 12121 {"join", (PyCFunction) unicode_join, METH_O, join__doc__}, 12122 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__}, 12123 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__}, 12124 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__}, 12125 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__}, 12126 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__}, 12127 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__}, 12128 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__}, 12129 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__}, 12130 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__}, 12131 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__}, 12132 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__}, 12133 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__}, 12134 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__}, 12135 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__}, 12136 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__}, 12137 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__}, 12138 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__}, 12139 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__}, 12140 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__}, 12141 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__}, 12142 {"upper", 
(PyCFunction) unicode_upper, METH_NOARGS, upper__doc__}, 12143 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__}, 12144 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__}, 12145 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__}, 12146 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__}, 12147 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__}, 12148 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__}, 12149 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__}, 12150 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__}, 12151 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__}, 12152 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__}, 12153 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__}, 12154 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__}, 12155 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__}, 12156 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__}, 12157 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__}, 12158 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__}, 12159 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__}, 12160 {"maketrans", (PyCFunction) unicode_maketrans, 12161 METH_VARARGS | METH_STATIC, maketrans__doc__}, 12162 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__}, 12163#if 0 12164 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__}, 12165#endif 12166 12167#if 0 12168 /* These methods are just used for debugging the implementation. 
*/ 12169 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS}, 12170#endif 12171 12172 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS}, 12173 {NULL, NULL} 12174}; 12175 12176static PyObject * 12177unicode_mod(PyObject *v, PyObject *w) 12178{ 12179 if (!PyUnicode_Check(v)) 12180 Py_RETURN_NOTIMPLEMENTED; 12181 return PyUnicode_Format(v, w); 12182} 12183 12184static PyNumberMethods unicode_as_number = { 12185 0, /*nb_add*/ 12186 0, /*nb_subtract*/ 12187 0, /*nb_multiply*/ 12188 unicode_mod, /*nb_remainder*/ 12189}; 12190 12191static PySequenceMethods unicode_as_sequence = { 12192 (lenfunc) unicode_length, /* sq_length */ 12193 PyUnicode_Concat, /* sq_concat */ 12194 (ssizeargfunc) unicode_repeat, /* sq_repeat */ 12195 (ssizeargfunc) unicode_getitem, /* sq_item */ 12196 0, /* sq_slice */ 12197 0, /* sq_ass_item */ 12198 0, /* sq_ass_slice */ 12199 PyUnicode_Contains, /* sq_contains */ 12200}; 12201 12202static PyObject* 12203unicode_subscript(PyUnicodeObject* self, PyObject* item) 12204{ 12205 if (PyUnicode_READY(self) == -1) 12206 return NULL; 12207 12208 if (PyIndex_Check(item)) { 12209 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError); 12210 if (i == -1 && PyErr_Occurred()) 12211 return NULL; 12212 if (i < 0) 12213 i += PyUnicode_GET_LENGTH(self); 12214 return unicode_getitem((PyObject*)self, i); 12215 } else if (PySlice_Check(item)) { 12216 Py_ssize_t start, stop, step, slicelength, cur, i; 12217 const Py_UNICODE* source_buf; 12218 Py_UNICODE* result_buf; 12219 PyObject* result; 12220 12221 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self), 12222 &start, &stop, &step, &slicelength) < 0) { 12223 return NULL; 12224 } 12225 12226 if (slicelength <= 0) { 12227 return PyUnicode_New(0, 0); 12228 } else if (start == 0 && step == 1 && 12229 slicelength == PyUnicode_GET_LENGTH(self) && 12230 PyUnicode_CheckExact(self)) { 12231 Py_INCREF(self); 12232 return (PyObject *)self; 12233 } else if (step == 1) { 12234 return 
PyUnicode_Substring((PyObject*)self, 12235 start, start + slicelength); 12236 } else { 12237 source_buf = PyUnicode_AS_UNICODE((PyObject*)self); 12238 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength* 12239 sizeof(Py_UNICODE)); 12240 12241 if (result_buf == NULL) 12242 return PyErr_NoMemory(); 12243 12244 for (cur = start, i = 0; i < slicelength; cur += step, i++) { 12245 result_buf[i] = source_buf[cur]; 12246 } 12247 12248 result = PyUnicode_FromUnicode(result_buf, slicelength); 12249 PyObject_FREE(result_buf); 12250 return result; 12251 } 12252 } else { 12253 PyErr_SetString(PyExc_TypeError, "string indices must be integers"); 12254 return NULL; 12255 } 12256} 12257 12258static PyMappingMethods unicode_as_mapping = { 12259 (lenfunc)unicode_length, /* mp_length */ 12260 (binaryfunc)unicode_subscript, /* mp_subscript */ 12261 (objobjargproc)0, /* mp_ass_subscript */ 12262}; 12263 12264 12265/* Helpers for PyUnicode_Format() */ 12266 12267static PyObject * 12268getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx) 12269{ 12270 Py_ssize_t argidx = *p_argidx; 12271 if (argidx < arglen) { 12272 (*p_argidx)++; 12273 if (arglen < 0) 12274 return args; 12275 else 12276 return PyTuple_GetItem(args, argidx); 12277 } 12278 PyErr_SetString(PyExc_TypeError, 12279 "not enough arguments for format string"); 12280 return NULL; 12281} 12282 12283/* Returns a new reference to a PyUnicode object, or NULL on failure. */ 12284 12285static PyObject * 12286formatfloat(PyObject *v, int flags, int prec, int type) 12287{ 12288 char *p; 12289 PyObject *result; 12290 double x; 12291 12292 x = PyFloat_AsDouble(v); 12293 if (x == -1.0 && PyErr_Occurred()) 12294 return NULL; 12295 12296 if (prec < 0) 12297 prec = 6; 12298 12299 p = PyOS_double_to_string(x, type, prec, 12300 (flags & F_ALT) ? 
Py_DTSF_ALT : 0, NULL); 12301 if (p == NULL) 12302 return NULL; 12303 result = PyUnicode_DecodeASCII(p, strlen(p), NULL); 12304 PyMem_Free(p); 12305 return result; 12306} 12307 12308static PyObject* 12309formatlong(PyObject *val, int flags, int prec, int type) 12310{ 12311 char *buf; 12312 int len; 12313 PyObject *str; /* temporary string object. */ 12314 PyObject *result; 12315 12316 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len); 12317 if (!str) 12318 return NULL; 12319 result = PyUnicode_DecodeASCII(buf, len, NULL); 12320 Py_DECREF(str); 12321 return result; 12322} 12323 12324static int 12325formatchar(Py_UCS4 *buf, 12326 size_t buflen, 12327 PyObject *v) 12328{ 12329 /* presume that the buffer is at least 3 characters long */ 12330 if (PyUnicode_Check(v)) { 12331 if (PyUnicode_GET_LENGTH(v) == 1) { 12332 buf[0] = PyUnicode_READ_CHAR(v, 0); 12333 buf[1] = '\0'; 12334 return 1; 12335 } 12336 goto onError; 12337 } 12338 else { 12339 /* Integer input truncated to a character */ 12340 long x; 12341 x = PyLong_AsLong(v); 12342 if (x == -1 && PyErr_Occurred()) 12343 goto onError; 12344 12345 if (x < 0 || x > 0x10ffff) { 12346 PyErr_SetString(PyExc_OverflowError, 12347 "%c arg not in range(0x110000)"); 12348 return -1; 12349 } 12350 12351 buf[0] = (Py_UCS4) x; 12352 buf[1] = '\0'; 12353 return 1; 12354 } 12355 12356 onError: 12357 PyErr_SetString(PyExc_TypeError, 12358 "%c requires int or char"); 12359 return -1; 12360} 12361 12362/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...) 12363 FORMATBUFLEN is the length of the buffer in which chars are formatted. 
12364*/ 12365#define FORMATBUFLEN (size_t)10 12366 12367PyObject * 12368PyUnicode_Format(PyObject *format, PyObject *args) 12369{ 12370 void *fmt; 12371 int fmtkind; 12372 PyObject *result; 12373 Py_UCS4 *res, *res0; 12374 Py_UCS4 max; 12375 int kind; 12376 Py_ssize_t fmtcnt, fmtpos, rescnt, reslen, arglen, argidx; 12377 int args_owned = 0; 12378 PyObject *dict = NULL; 12379 PyUnicodeObject *uformat; 12380 12381 if (format == NULL || args == NULL) { 12382 PyErr_BadInternalCall(); 12383 return NULL; 12384 } 12385 uformat = (PyUnicodeObject*)PyUnicode_FromObject(format); 12386 if (uformat == NULL || PyUnicode_READY(uformat) == -1) 12387 return NULL; 12388 fmt = PyUnicode_DATA(uformat); 12389 fmtkind = PyUnicode_KIND(uformat); 12390 fmtcnt = PyUnicode_GET_LENGTH(uformat); 12391 fmtpos = 0; 12392 12393 reslen = rescnt = fmtcnt + 100; 12394 res = res0 = PyMem_Malloc(reslen * sizeof(Py_UCS4)); 12395 if (res0 == NULL) { 12396 PyErr_NoMemory(); 12397 goto onError; 12398 } 12399 12400 if (PyTuple_Check(args)) { 12401 arglen = PyTuple_Size(args); 12402 argidx = 0; 12403 } 12404 else { 12405 arglen = -1; 12406 argidx = -2; 12407 } 12408 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) && 12409 !PyUnicode_Check(args)) 12410 dict = args; 12411 12412 while (--fmtcnt >= 0) { 12413 if (PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') { 12414 if (--rescnt < 0) { 12415 rescnt = fmtcnt + 100; 12416 reslen += rescnt; 12417 res0 = PyMem_Realloc(res0, reslen*sizeof(Py_UCS4)); 12418 if (res0 == NULL){ 12419 PyErr_NoMemory(); 12420 goto onError; 12421 } 12422 res = res0 + reslen - rescnt; 12423 --rescnt; 12424 } 12425 *res++ = PyUnicode_READ(fmtkind, fmt, fmtpos++); 12426 } 12427 else { 12428 /* Got a format specifier */ 12429 int flags = 0; 12430 Py_ssize_t width = -1; 12431 int prec = -1; 12432 Py_UCS4 c = '\0'; 12433 Py_UCS4 fill; 12434 int isnumok; 12435 PyObject *v = NULL; 12436 PyObject *temp = NULL; 12437 void *pbuf; 12438 Py_ssize_t pindex; 12439 Py_UNICODE sign; 12440 
Py_ssize_t len, len1; 12441 Py_UCS4 formatbuf[FORMATBUFLEN]; /* For formatchar() */ 12442 12443 fmtpos++; 12444 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(') { 12445 Py_ssize_t keystart; 12446 Py_ssize_t keylen; 12447 PyObject *key; 12448 int pcount = 1; 12449 12450 if (dict == NULL) { 12451 PyErr_SetString(PyExc_TypeError, 12452 "format requires a mapping"); 12453 goto onError; 12454 } 12455 ++fmtpos; 12456 --fmtcnt; 12457 keystart = fmtpos; 12458 /* Skip over balanced parentheses */ 12459 while (pcount > 0 && --fmtcnt >= 0) { 12460 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == ')') 12461 --pcount; 12462 else if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(') 12463 ++pcount; 12464 fmtpos++; 12465 } 12466 keylen = fmtpos - keystart - 1; 12467 if (fmtcnt < 0 || pcount > 0) { 12468 PyErr_SetString(PyExc_ValueError, 12469 "incomplete format key"); 12470 goto onError; 12471 } 12472 key = PyUnicode_Substring((PyObject*)uformat, 12473 keystart, keystart + keylen); 12474 if (key == NULL) 12475 goto onError; 12476 if (args_owned) { 12477 Py_DECREF(args); 12478 args_owned = 0; 12479 } 12480 args = PyObject_GetItem(dict, key); 12481 Py_DECREF(key); 12482 if (args == NULL) { 12483 goto onError; 12484 } 12485 args_owned = 1; 12486 arglen = -1; 12487 argidx = -2; 12488 } 12489 while (--fmtcnt >= 0) { 12490 switch (c = PyUnicode_READ(fmtkind, fmt, fmtpos++)) { 12491 case '-': flags |= F_LJUST; continue; 12492 case '+': flags |= F_SIGN; continue; 12493 case ' ': flags |= F_BLANK; continue; 12494 case '#': flags |= F_ALT; continue; 12495 case '0': flags |= F_ZERO; continue; 12496 } 12497 break; 12498 } 12499 if (c == '*') { 12500 v = getnextarg(args, arglen, &argidx); 12501 if (v == NULL) 12502 goto onError; 12503 if (!PyLong_Check(v)) { 12504 PyErr_SetString(PyExc_TypeError, 12505 "* wants int"); 12506 goto onError; 12507 } 12508 width = PyLong_AsLong(v); 12509 if (width == -1 && PyErr_Occurred()) 12510 goto onError; 12511 if (width < 0) { 12512 flags |= F_LJUST; 12513 width = 
-width; 12514 } 12515 if (--fmtcnt >= 0) 12516 c = PyUnicode_READ(fmtkind, fmt, fmtpos++); 12517 } 12518 else if (c >= '0' && c <= '9') { 12519 width = c - '0'; 12520 while (--fmtcnt >= 0) { 12521 c = PyUnicode_READ(fmtkind, fmt, fmtpos++); 12522 if (c < '0' || c > '9') 12523 break; 12524 if ((width*10) / 10 != width) { 12525 PyErr_SetString(PyExc_ValueError, 12526 "width too big"); 12527 goto onError; 12528 } 12529 width = width*10 + (c - '0'); 12530 } 12531 } 12532 if (c == '.') { 12533 prec = 0; 12534 if (--fmtcnt >= 0) 12535 c = PyUnicode_READ(fmtkind, fmt, fmtpos++); 12536 if (c == '*') { 12537 v = getnextarg(args, arglen, &argidx); 12538 if (v == NULL) 12539 goto onError; 12540 if (!PyLong_Check(v)) { 12541 PyErr_SetString(PyExc_TypeError, 12542 "* wants int"); 12543 goto onError; 12544 } 12545 prec = PyLong_AsLong(v); 12546 if (prec == -1 && PyErr_Occurred()) 12547 goto onError; 12548 if (prec < 0) 12549 prec = 0; 12550 if (--fmtcnt >= 0) 12551 c = PyUnicode_READ(fmtkind, fmt, fmtpos++); 12552 } 12553 else if (c >= '0' && c <= '9') { 12554 prec = c - '0'; 12555 while (--fmtcnt >= 0) { 12556 c = PyUnicode_READ(fmtkind, fmt, fmtpos++); 12557 if (c < '0' || c > '9') 12558 break; 12559 if ((prec*10) / 10 != prec) { 12560 PyErr_SetString(PyExc_ValueError, 12561 "prec too big"); 12562 goto onError; 12563 } 12564 prec = prec*10 + (c - '0'); 12565 } 12566 } 12567 } /* prec */ 12568 if (fmtcnt >= 0) { 12569 if (c == 'h' || c == 'l' || c == 'L') { 12570 if (--fmtcnt >= 0) 12571 c = PyUnicode_READ(fmtkind, fmt, fmtpos++); 12572 } 12573 } 12574 if (fmtcnt < 0) { 12575 PyErr_SetString(PyExc_ValueError, 12576 "incomplete format"); 12577 goto onError; 12578 } 12579 if (c != '%') { 12580 v = getnextarg(args, arglen, &argidx); 12581 if (v == NULL) 12582 goto onError; 12583 } 12584 sign = 0; 12585 fill = ' '; 12586 switch (c) { 12587 12588 case '%': 12589 pbuf = formatbuf; 12590 kind = PyUnicode_4BYTE_KIND; 12591 /* presume that buffer length is at least 1 */ 12592 
PyUnicode_WRITE(kind, pbuf, 0, '%'); 12593 len = 1; 12594 break; 12595 12596 case 's': 12597 case 'r': 12598 case 'a': 12599 if (PyUnicode_CheckExact(v) && c == 's') { 12600 temp = v; 12601 Py_INCREF(temp); 12602 } 12603 else { 12604 if (c == 's') 12605 temp = PyObject_Str(v); 12606 else if (c == 'r') 12607 temp = PyObject_Repr(v); 12608 else 12609 temp = PyObject_ASCII(v); 12610 if (temp == NULL) 12611 goto onError; 12612 if (PyUnicode_Check(temp)) 12613 /* nothing to do */; 12614 else { 12615 Py_DECREF(temp); 12616 PyErr_SetString(PyExc_TypeError, 12617 "%s argument has non-string str()"); 12618 goto onError; 12619 } 12620 } 12621 if (PyUnicode_READY(temp) == -1) { 12622 Py_CLEAR(temp); 12623 goto onError; 12624 } 12625 pbuf = PyUnicode_DATA(temp); 12626 kind = PyUnicode_KIND(temp); 12627 len = PyUnicode_GET_LENGTH(temp); 12628 if (prec >= 0 && len > prec) 12629 len = prec; 12630 break; 12631 12632 case 'i': 12633 case 'd': 12634 case 'u': 12635 case 'o': 12636 case 'x': 12637 case 'X': 12638 isnumok = 0; 12639 if (PyNumber_Check(v)) { 12640 PyObject *iobj=NULL; 12641 12642 if (PyLong_Check(v)) { 12643 iobj = v; 12644 Py_INCREF(iobj); 12645 } 12646 else { 12647 iobj = PyNumber_Long(v); 12648 } 12649 if (iobj!=NULL) { 12650 if (PyLong_Check(iobj)) { 12651 isnumok = 1; 12652 temp = formatlong(iobj, flags, prec, (c == 'i'? 
'd': c)); 12653 Py_DECREF(iobj); 12654 if (!temp) 12655 goto onError; 12656 if (PyUnicode_READY(temp) == -1) { 12657 Py_CLEAR(temp); 12658 goto onError; 12659 } 12660 pbuf = PyUnicode_DATA(temp); 12661 kind = PyUnicode_KIND(temp); 12662 len = PyUnicode_GET_LENGTH(temp); 12663 sign = 1; 12664 } 12665 else { 12666 Py_DECREF(iobj); 12667 } 12668 } 12669 } 12670 if (!isnumok) { 12671 PyErr_Format(PyExc_TypeError, 12672 "%%%c format: a number is required, " 12673 "not %.200s", (char)c, Py_TYPE(v)->tp_name); 12674 goto onError; 12675 } 12676 if (flags & F_ZERO) 12677 fill = '0'; 12678 break; 12679 12680 case 'e': 12681 case 'E': 12682 case 'f': 12683 case 'F': 12684 case 'g': 12685 case 'G': 12686 temp = formatfloat(v, flags, prec, c); 12687 if (!temp) 12688 goto onError; 12689 if (PyUnicode_READY(temp) == -1) { 12690 Py_CLEAR(temp); 12691 goto onError; 12692 } 12693 pbuf = PyUnicode_DATA(temp); 12694 kind = PyUnicode_KIND(temp); 12695 len = PyUnicode_GET_LENGTH(temp); 12696 sign = 1; 12697 if (flags & F_ZERO) 12698 fill = '0'; 12699 break; 12700 12701 case 'c': 12702 pbuf = formatbuf; 12703 kind = PyUnicode_4BYTE_KIND; 12704 len = formatchar(pbuf, Py_ARRAY_LENGTH(formatbuf), v); 12705 if (len < 0) 12706 goto onError; 12707 break; 12708 12709 default: 12710 PyErr_Format(PyExc_ValueError, 12711 "unsupported format character '%c' (0x%x) " 12712 "at index %zd", 12713 (31<=c && c<=126) ? (char)c : '?', 12714 (int)c, 12715 fmtpos - 1); 12716 goto onError; 12717 } 12718 /* pbuf is initialized here. 
*/ 12719 pindex = 0; 12720 if (sign) { 12721 if (PyUnicode_READ(kind, pbuf, pindex) == '-' || 12722 PyUnicode_READ(kind, pbuf, pindex) == '+') { 12723 sign = PyUnicode_READ(kind, pbuf, pindex++); 12724 len--; 12725 } 12726 else if (flags & F_SIGN) 12727 sign = '+'; 12728 else if (flags & F_BLANK) 12729 sign = ' '; 12730 else 12731 sign = 0; 12732 } 12733 if (width < len) 12734 width = len; 12735 if (rescnt - (sign != 0) < width) { 12736 reslen -= rescnt; 12737 rescnt = width + fmtcnt + 100; 12738 reslen += rescnt; 12739 if (reslen < 0) { 12740 Py_XDECREF(temp); 12741 PyErr_NoMemory(); 12742 goto onError; 12743 } 12744 res0 = PyMem_Realloc(res0, reslen*sizeof(Py_UCS4)); 12745 if (res0 == 0) { 12746 PyErr_NoMemory(); 12747 Py_XDECREF(temp); 12748 goto onError; 12749 } 12750 res = res0 + reslen - rescnt; 12751 } 12752 if (sign) { 12753 if (fill != ' ') 12754 *res++ = sign; 12755 rescnt--; 12756 if (width > len) 12757 width--; 12758 } 12759 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) { 12760 assert(PyUnicode_READ(kind, pbuf, pindex) == '0'); 12761 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c); 12762 if (fill != ' ') { 12763 *res++ = PyUnicode_READ(kind, pbuf, pindex++); 12764 *res++ = PyUnicode_READ(kind, pbuf, pindex++); 12765 } 12766 rescnt -= 2; 12767 width -= 2; 12768 if (width < 0) 12769 width = 0; 12770 len -= 2; 12771 } 12772 if (width > len && !(flags & F_LJUST)) { 12773 do { 12774 --rescnt; 12775 *res++ = fill; 12776 } while (--width > len); 12777 } 12778 if (fill == ' ') { 12779 if (sign) 12780 *res++ = sign; 12781 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) { 12782 assert(PyUnicode_READ(kind, pbuf, pindex) == '0'); 12783 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c); 12784 *res++ = PyUnicode_READ(kind, pbuf, pindex++); 12785 *res++ = PyUnicode_READ(kind, pbuf, pindex++); 12786 } 12787 } 12788 /* Copy all characters, preserving len */ 12789 len1 = len; 12790 while (len1--) { 12791 *res++ = PyUnicode_READ(kind, pbuf, 
pindex++); 12792 rescnt--; 12793 } 12794 while (--width >= len) { 12795 --rescnt; 12796 *res++ = ' '; 12797 } 12798 if (dict && (argidx < arglen) && c != '%') { 12799 PyErr_SetString(PyExc_TypeError, 12800 "not all arguments converted during string formatting"); 12801 Py_XDECREF(temp); 12802 goto onError; 12803 } 12804 Py_XDECREF(temp); 12805 } /* '%' */ 12806 } /* until end */ 12807 if (argidx < arglen && !dict) { 12808 PyErr_SetString(PyExc_TypeError, 12809 "not all arguments converted during string formatting"); 12810 goto onError; 12811 } 12812 12813 12814 for (max=0, res = res0; res < res0+reslen-rescnt; res++) 12815 if (*res > max) 12816 max = *res; 12817 result = PyUnicode_New(reslen - rescnt, max); 12818 if (!result) 12819 goto onError; 12820 kind = PyUnicode_KIND(result); 12821 for (res = res0; res < res0+reslen-rescnt; res++) 12822 PyUnicode_WRITE(kind, PyUnicode_DATA(result), res-res0, *res); 12823 PyMem_Free(res0); 12824 if (args_owned) { 12825 Py_DECREF(args); 12826 } 12827 Py_DECREF(uformat); 12828 return (PyObject *)result; 12829 12830 onError: 12831 PyMem_Free(res0); 12832 Py_DECREF(uformat); 12833 if (args_owned) { 12834 Py_DECREF(args); 12835 } 12836 return NULL; 12837} 12838 12839static PyObject * 12840unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds); 12841 12842static PyObject * 12843unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds) 12844{ 12845 PyObject *x = NULL; 12846 static char *kwlist[] = {"object", "encoding", "errors", 0}; 12847 char *encoding = NULL; 12848 char *errors = NULL; 12849 12850 if (type != &PyUnicode_Type) 12851 return unicode_subtype_new(type, args, kwds); 12852 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str", 12853 kwlist, &x, &encoding, &errors)) 12854 return NULL; 12855 if (x == NULL) 12856 return (PyObject *)PyUnicode_New(0, 0); 12857 if (encoding == NULL && errors == NULL) 12858 return PyObject_Str(x); 12859 else 12860 return PyUnicode_FromEncodedObject(x, encoding, errors); 
12861} 12862 12863static PyObject * 12864unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds) 12865{ 12866 PyUnicodeObject *unicode, *self; 12867 Py_ssize_t length, char_size; 12868 int share_wstr, share_utf8; 12869 unsigned int kind; 12870 void *data; 12871 12872 assert(PyType_IsSubtype(type, &PyUnicode_Type)); 12873 12874 unicode = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds); 12875 if (unicode == NULL) 12876 return NULL; 12877 assert(_PyUnicode_CHECK(unicode)); 12878 if (_PyUnicode_READY_REPLACE(&unicode)) 12879 return NULL; 12880 12881 self = (PyUnicodeObject *) type->tp_alloc(type, 0); 12882 if (self == NULL) { 12883 Py_DECREF(unicode); 12884 return NULL; 12885 } 12886 kind = PyUnicode_KIND(unicode); 12887 length = PyUnicode_GET_LENGTH(unicode); 12888 12889 _PyUnicode_LENGTH(self) = length; 12890 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode); 12891 _PyUnicode_STATE(self).interned = 0; 12892 _PyUnicode_STATE(self).kind = kind; 12893 _PyUnicode_STATE(self).compact = 0; 12894 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii; 12895 _PyUnicode_STATE(self).ready = 1; 12896 _PyUnicode_WSTR(self) = NULL; 12897 _PyUnicode_UTF8_LENGTH(self) = 0; 12898 _PyUnicode_UTF8(self) = NULL; 12899 _PyUnicode_WSTR_LENGTH(self) = 0; 12900 _PyUnicode_DATA_ANY(self) = NULL; 12901 12902 share_utf8 = 0; 12903 share_wstr = 0; 12904 if (kind == PyUnicode_1BYTE_KIND) { 12905 char_size = 1; 12906 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128) 12907 share_utf8 = 1; 12908 } 12909 else if (kind == PyUnicode_2BYTE_KIND) { 12910 char_size = 2; 12911 if (sizeof(wchar_t) == 2) 12912 share_wstr = 1; 12913 } 12914 else { 12915 assert(kind == PyUnicode_4BYTE_KIND); 12916 char_size = 4; 12917 if (sizeof(wchar_t) == 4) 12918 share_wstr = 1; 12919 } 12920 12921 /* Ensure we won't overflow the length. 
*/ 12922 if (length > (PY_SSIZE_T_MAX / char_size - 1)) { 12923 PyErr_NoMemory(); 12924 goto onError; 12925 } 12926 data = PyObject_MALLOC((length + 1) * char_size); 12927 if (data == NULL) { 12928 PyErr_NoMemory(); 12929 goto onError; 12930 } 12931 12932 _PyUnicode_DATA_ANY(self) = data; 12933 if (share_utf8) { 12934 _PyUnicode_UTF8_LENGTH(self) = length; 12935 _PyUnicode_UTF8(self) = data; 12936 } 12937 if (share_wstr) { 12938 _PyUnicode_WSTR_LENGTH(self) = length; 12939 _PyUnicode_WSTR(self) = (wchar_t *)data; 12940 } 12941 12942 Py_MEMCPY(data, PyUnicode_DATA(unicode), 12943 PyUnicode_KIND_SIZE(kind, length + 1)); 12944 Py_DECREF(unicode); 12945 return (PyObject *)self; 12946 12947onError: 12948 Py_DECREF(unicode); 12949 Py_DECREF(self); 12950 return NULL; 12951} 12952 12953PyDoc_STRVAR(unicode_doc, 12954 "str(string[, encoding[, errors]]) -> str\n\ 12955\n\ 12956Create a new string object from the given encoded string.\n\ 12957encoding defaults to the current default string encoding.\n\ 12958errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'."); 12959 12960static PyObject *unicode_iter(PyObject *seq); 12961 12962PyTypeObject PyUnicode_Type = { 12963 PyVarObject_HEAD_INIT(&PyType_Type, 0) 12964 "str", /* tp_name */ 12965 sizeof(PyUnicodeObject), /* tp_size */ 12966 0, /* tp_itemsize */ 12967 /* Slots */ 12968 (destructor)unicode_dealloc, /* tp_dealloc */ 12969 0, /* tp_print */ 12970 0, /* tp_getattr */ 12971 0, /* tp_setattr */ 12972 0, /* tp_reserved */ 12973 unicode_repr, /* tp_repr */ 12974 &unicode_as_number, /* tp_as_number */ 12975 &unicode_as_sequence, /* tp_as_sequence */ 12976 &unicode_as_mapping, /* tp_as_mapping */ 12977 (hashfunc) unicode_hash, /* tp_hash*/ 12978 0, /* tp_call*/ 12979 (reprfunc) unicode_str, /* tp_str */ 12980 PyObject_GenericGetAttr, /* tp_getattro */ 12981 0, /* tp_setattro */ 12982 0, /* tp_as_buffer */ 12983 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE | 12984 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */ 12985 
unicode_doc, /* tp_doc */ 12986 0, /* tp_traverse */ 12987 0, /* tp_clear */ 12988 PyUnicode_RichCompare, /* tp_richcompare */ 12989 0, /* tp_weaklistoffset */ 12990 unicode_iter, /* tp_iter */ 12991 0, /* tp_iternext */ 12992 unicode_methods, /* tp_methods */ 12993 0, /* tp_members */ 12994 0, /* tp_getset */ 12995 &PyBaseObject_Type, /* tp_base */ 12996 0, /* tp_dict */ 12997 0, /* tp_descr_get */ 12998 0, /* tp_descr_set */ 12999 0, /* tp_dictoffset */ 13000 0, /* tp_init */ 13001 0, /* tp_alloc */ 13002 unicode_new, /* tp_new */ 13003 PyObject_Del, /* tp_free */ 13004}; 13005 13006/* Initialize the Unicode implementation */ 13007 13008void _PyUnicode_Init(void) 13009{ 13010 int i; 13011 13012 /* XXX - move this array to unicodectype.c ? */ 13013 Py_UCS2 linebreak[] = { 13014 0x000A, /* LINE FEED */ 13015 0x000D, /* CARRIAGE RETURN */ 13016 0x001C, /* FILE SEPARATOR */ 13017 0x001D, /* GROUP SEPARATOR */ 13018 0x001E, /* RECORD SEPARATOR */ 13019 0x0085, /* NEXT LINE */ 13020 0x2028, /* LINE SEPARATOR */ 13021 0x2029, /* PARAGRAPH SEPARATOR */ 13022 }; 13023 13024 /* Init the implementation */ 13025 unicode_empty = PyUnicode_New(0, 0); 13026 if (!unicode_empty) 13027 Py_FatalError("Can't create empty string"); 13028 13029 for (i = 0; i < 256; i++) 13030 unicode_latin1[i] = NULL; 13031 if (PyType_Ready(&PyUnicode_Type) < 0) 13032 Py_FatalError("Can't initialize 'unicode'"); 13033 13034 /* initialize the linebreak bloom filter */ 13035 bloom_linebreak = make_bloom_mask( 13036 PyUnicode_2BYTE_KIND, linebreak, 13037 Py_ARRAY_LENGTH(linebreak)); 13038 13039 PyType_Ready(&EncodingMapType); 13040} 13041 13042/* Finalize the Unicode implementation */ 13043 13044int 13045PyUnicode_ClearFreeList(void) 13046{ 13047 return 0; 13048} 13049 13050void 13051_PyUnicode_Fini(void) 13052{ 13053 int i; 13054 13055 Py_XDECREF(unicode_empty); 13056 unicode_empty = NULL; 13057 13058 for (i = 0; i < 256; i++) { 13059 if (unicode_latin1[i]) { 13060 Py_DECREF(unicode_latin1[i]); 13061 
unicode_latin1[i] = NULL; 13062 } 13063 } 13064 (void)PyUnicode_ClearFreeList(); 13065} 13066 13067void 13068PyUnicode_InternInPlace(PyObject **p) 13069{ 13070 register PyUnicodeObject *s = (PyUnicodeObject *)(*p); 13071 PyObject *t; 13072#ifdef Py_DEBUG 13073 assert(s != NULL); 13074 assert(_PyUnicode_CHECK(s)); 13075#else 13076 if (s == NULL || !PyUnicode_Check(s)) 13077 return; 13078#endif 13079 /* If it's a subclass, we don't really know what putting 13080 it in the interned dict might do. */ 13081 if (!PyUnicode_CheckExact(s)) 13082 return; 13083 if (PyUnicode_CHECK_INTERNED(s)) 13084 return; 13085 if (_PyUnicode_READY_REPLACE(p)) { 13086 assert(0 && "PyUnicode_READY fail in PyUnicode_InternInPlace"); 13087 return; 13088 } 13089 s = (PyUnicodeObject *)(*p); 13090 if (interned == NULL) { 13091 interned = PyDict_New(); 13092 if (interned == NULL) { 13093 PyErr_Clear(); /* Don't leave an exception */ 13094 return; 13095 } 13096 } 13097 /* It might be that the GetItem call fails even 13098 though the key is present in the dictionary, 13099 namely when this happens during a stack overflow. */ 13100 Py_ALLOW_RECURSION 13101 t = PyDict_GetItem(interned, (PyObject *)s); 13102 Py_END_ALLOW_RECURSION 13103 13104 if (t) { 13105 Py_INCREF(t); 13106 Py_DECREF(*p); 13107 *p = t; 13108 return; 13109 } 13110 13111 PyThreadState_GET()->recursion_critical = 1; 13112 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) { 13113 PyErr_Clear(); 13114 PyThreadState_GET()->recursion_critical = 0; 13115 return; 13116 } 13117 PyThreadState_GET()->recursion_critical = 0; 13118 /* The two references in interned are not counted by refcnt. 
13119 The deallocator will take care of this */ 13120 Py_REFCNT(s) -= 2; 13121 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL; 13122} 13123 13124void 13125PyUnicode_InternImmortal(PyObject **p) 13126{ 13127 PyUnicodeObject *u = (PyUnicodeObject *)*p; 13128 13129 PyUnicode_InternInPlace(p); 13130 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) { 13131 _PyUnicode_STATE(u).interned = SSTATE_INTERNED_IMMORTAL; 13132 Py_INCREF(*p); 13133 } 13134} 13135 13136PyObject * 13137PyUnicode_InternFromString(const char *cp) 13138{ 13139 PyObject *s = PyUnicode_FromString(cp); 13140 if (s == NULL) 13141 return NULL; 13142 PyUnicode_InternInPlace(&s); 13143 return s; 13144} 13145 13146void 13147_Py_ReleaseInternedUnicodeStrings(void) 13148{ 13149 PyObject *keys; 13150 PyUnicodeObject *s; 13151 Py_ssize_t i, n; 13152 Py_ssize_t immortal_size = 0, mortal_size = 0; 13153 13154 if (interned == NULL || !PyDict_Check(interned)) 13155 return; 13156 keys = PyDict_Keys(interned); 13157 if (keys == NULL || !PyList_Check(keys)) { 13158 PyErr_Clear(); 13159 return; 13160 } 13161 13162 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak 13163 detector, interned unicode strings are not forcibly deallocated; 13164 rather, we give them their stolen references back, and then clear 13165 and DECREF the interned dict. 
*/ 13166 13167 n = PyList_GET_SIZE(keys); 13168 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n", 13169 n); 13170 for (i = 0; i < n; i++) { 13171 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i); 13172 if (PyUnicode_READY(s) == -1) 13173 fprintf(stderr, "could not ready string\n"); 13174 switch (PyUnicode_CHECK_INTERNED(s)) { 13175 case SSTATE_NOT_INTERNED: 13176 /* XXX Shouldn't happen */ 13177 break; 13178 case SSTATE_INTERNED_IMMORTAL: 13179 Py_REFCNT(s) += 1; 13180 immortal_size += PyUnicode_GET_LENGTH(s); 13181 break; 13182 case SSTATE_INTERNED_MORTAL: 13183 Py_REFCNT(s) += 2; 13184 mortal_size += PyUnicode_GET_LENGTH(s); 13185 break; 13186 default: 13187 Py_FatalError("Inconsistent interned string state."); 13188 } 13189 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED; 13190 } 13191 fprintf(stderr, "total size of all interned strings: " 13192 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d " 13193 "mortal/immortal\n", mortal_size, immortal_size); 13194 Py_DECREF(keys); 13195 PyDict_Clear(interned); 13196 Py_DECREF(interned); 13197 interned = NULL; 13198} 13199 13200 13201/********************* Unicode Iterator **************************/ 13202 13203typedef struct { 13204 PyObject_HEAD 13205 Py_ssize_t it_index; 13206 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */ 13207} unicodeiterobject; 13208 13209static void 13210unicodeiter_dealloc(unicodeiterobject *it) 13211{ 13212 _PyObject_GC_UNTRACK(it); 13213 Py_XDECREF(it->it_seq); 13214 PyObject_GC_Del(it); 13215} 13216 13217static int 13218unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg) 13219{ 13220 Py_VISIT(it->it_seq); 13221 return 0; 13222} 13223 13224static PyObject * 13225unicodeiter_next(unicodeiterobject *it) 13226{ 13227 PyUnicodeObject *seq; 13228 PyObject *item; 13229 13230 assert(it != NULL); 13231 seq = it->it_seq; 13232 if (seq == NULL) 13233 return NULL; 13234 assert(_PyUnicode_CHECK(seq)); 13235 13236 if (it->it_index < 
PyUnicode_GET_LENGTH(seq)) { 13237 int kind = PyUnicode_KIND(seq); 13238 void *data = PyUnicode_DATA(seq); 13239 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index); 13240 item = PyUnicode_FromOrdinal(chr); 13241 if (item != NULL) 13242 ++it->it_index; 13243 return item; 13244 } 13245 13246 Py_DECREF(seq); 13247 it->it_seq = NULL; 13248 return NULL; 13249} 13250 13251static PyObject * 13252unicodeiter_len(unicodeiterobject *it) 13253{ 13254 Py_ssize_t len = 0; 13255 if (it->it_seq) 13256 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index; 13257 return PyLong_FromSsize_t(len); 13258} 13259 13260PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it))."); 13261 13262static PyMethodDef unicodeiter_methods[] = { 13263 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS, 13264 length_hint_doc}, 13265 {NULL, NULL} /* sentinel */ 13266}; 13267 13268PyTypeObject PyUnicodeIter_Type = { 13269 PyVarObject_HEAD_INIT(&PyType_Type, 0) 13270 "str_iterator", /* tp_name */ 13271 sizeof(unicodeiterobject), /* tp_basicsize */ 13272 0, /* tp_itemsize */ 13273 /* methods */ 13274 (destructor)unicodeiter_dealloc, /* tp_dealloc */ 13275 0, /* tp_print */ 13276 0, /* tp_getattr */ 13277 0, /* tp_setattr */ 13278 0, /* tp_reserved */ 13279 0, /* tp_repr */ 13280 0, /* tp_as_number */ 13281 0, /* tp_as_sequence */ 13282 0, /* tp_as_mapping */ 13283 0, /* tp_hash */ 13284 0, /* tp_call */ 13285 0, /* tp_str */ 13286 PyObject_GenericGetAttr, /* tp_getattro */ 13287 0, /* tp_setattro */ 13288 0, /* tp_as_buffer */ 13289 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */ 13290 0, /* tp_doc */ 13291 (traverseproc)unicodeiter_traverse, /* tp_traverse */ 13292 0, /* tp_clear */ 13293 0, /* tp_richcompare */ 13294 0, /* tp_weaklistoffset */ 13295 PyObject_SelfIter, /* tp_iter */ 13296 (iternextfunc)unicodeiter_next, /* tp_iternext */ 13297 unicodeiter_methods, /* tp_methods */ 13298 0, 13299}; 13300 13301static PyObject * 13302unicode_iter(PyObject 
*seq) 13303{ 13304 unicodeiterobject *it; 13305 13306 if (!PyUnicode_Check(seq)) { 13307 PyErr_BadInternalCall(); 13308 return NULL; 13309 } 13310 if (PyUnicode_READY(seq) == -1) 13311 return NULL; 13312 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type); 13313 if (it == NULL) 13314 return NULL; 13315 it->it_index = 0; 13316 Py_INCREF(seq); 13317 it->it_seq = (PyUnicodeObject *)seq; 13318 _PyObject_GC_TRACK(it); 13319 return (PyObject *)it; 13320} 13321 13322#define UNIOP(x) Py_UNICODE_##x 13323#define UNIOP_t Py_UNICODE 13324#include "uniops.h" 13325#undef UNIOP 13326#undef UNIOP_t 13327#define UNIOP(x) Py_UCS4_##x 13328#define UNIOP_t Py_UCS4 13329#include "uniops.h" 13330#undef UNIOP 13331#undef UNIOP_t 13332 13333Py_UNICODE* 13334PyUnicode_AsUnicodeCopy(PyObject *object) 13335{ 13336 PyUnicodeObject *unicode = (PyUnicodeObject *)object; 13337 Py_UNICODE *copy; 13338 Py_ssize_t size; 13339 13340 if (!PyUnicode_Check(unicode)) { 13341 PyErr_BadArgument(); 13342 return NULL; 13343 } 13344 /* Ensure we won't overflow the size. */ 13345 if (PyUnicode_GET_SIZE(unicode) > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) { 13346 PyErr_NoMemory(); 13347 return NULL; 13348 } 13349 size = PyUnicode_GET_SIZE(unicode) + 1; /* copy the nul character */ 13350 size *= sizeof(Py_UNICODE); 13351 copy = PyMem_Malloc(size); 13352 if (copy == NULL) { 13353 PyErr_NoMemory(); 13354 return NULL; 13355 } 13356 memcpy(copy, PyUnicode_AS_UNICODE(unicode), size); 13357 return copy; 13358} 13359 13360/* A _string module, to export formatter_parser and formatter_field_name_split 13361 to the string.Formatter class implemented in Python. 
*/ 13362 13363static PyMethodDef _string_methods[] = { 13364 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split, 13365 METH_O, PyDoc_STR("split the argument as a field name")}, 13366 {"formatter_parser", (PyCFunction) formatter_parser, 13367 METH_O, PyDoc_STR("parse the argument as a format string")}, 13368 {NULL, NULL} 13369}; 13370 13371static struct PyModuleDef _string_module = { 13372 PyModuleDef_HEAD_INIT, 13373 "_string", 13374 PyDoc_STR("string helper module"), 13375 0, 13376 _string_methods, 13377 NULL, 13378 NULL, 13379 NULL, 13380 NULL 13381}; 13382 13383PyMODINIT_FUNC 13384PyInit__string(void) 13385{ 13386 return PyModule_Create(&_string_module); 13387} 13388 13389 13390#ifdef __cplusplus 13391} 13392#endif 13393