unicodeobject.c revision 9e9d689d85e60193494603e65bdbac7717187058
1/* 2 3Unicode implementation based on original code by Fredrik Lundh, 4modified by Marc-Andre Lemburg <mal@lemburg.com>. 5 6Major speed upgrades to the method implementations at the Reykjavik 7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke. 8 9Copyright (c) Corporation for National Research Initiatives. 10 11-------------------------------------------------------------------- 12The original string type implementation is: 13 14 Copyright (c) 1999 by Secret Labs AB 15 Copyright (c) 1999 by Fredrik Lundh 16 17By obtaining, using, and/or copying this software and/or its 18associated documentation, you agree that you have read, understood, 19and will comply with the following terms and conditions: 20 21Permission to use, copy, modify, and distribute this software and its 22associated documentation for any purpose and without fee is hereby 23granted, provided that the above copyright notice appears in all 24copies, and that both that copyright notice and this permission notice 25appear in supporting documentation, and that the name of Secret Labs 26AB or the author not be used in advertising or publicity pertaining to 27distribution of the software without specific, written prior 28permission. 29 30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO 31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND 32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR 33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT 36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
37-------------------------------------------------------------------- 38 39*/ 40 41#define PY_SSIZE_T_CLEAN 42#include "Python.h" 43#include "ucnhash.h" 44 45#ifdef MS_WINDOWS 46#include <windows.h> 47#endif 48 49/* Limit for the Unicode object free list */ 50 51#define PyUnicode_MAXFREELIST 1024 52 53/* Limit for the Unicode object free list stay alive optimization. 54 55 The implementation will keep allocated Unicode memory intact for 56 all objects on the free list having a size less than this 57 limit. This reduces malloc() overhead for small Unicode objects. 58 59 At worst this will result in PyUnicode_MAXFREELIST * 60 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT + 61 malloc()-overhead) bytes of unused garbage. 62 63 Setting the limit to 0 effectively turns the feature off. 64 65 Note: This is an experimental feature ! If you get core dumps when 66 using Unicode objects, turn this feature off. 67 68*/ 69 70#define KEEPALIVE_SIZE_LIMIT 9 71 72/* Endianness switches; defaults to little endian */ 73 74#ifdef WORDS_BIGENDIAN 75# define BYTEORDER_IS_BIG_ENDIAN 76#else 77# define BYTEORDER_IS_LITTLE_ENDIAN 78#endif 79 80/* --- Globals ------------------------------------------------------------ 81 82 The globals are initialized by the _PyUnicode_Init() API and should 83 not be used before calling that API. 84 85*/ 86 87 88#ifdef __cplusplus 89extern "C" { 90#endif 91 92#ifdef Py_DEBUG 93# define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op) 94#else 95# define _PyUnicode_CHECK(op) PyUnicode_Check(op) 96#endif 97 98#define _PyUnicode_UTF8(op) \ 99 (((PyCompactUnicodeObject*)(op))->utf8) 100#define PyUnicode_UTF8(op) \ 101 (assert(_PyUnicode_CHECK(op)), \ 102 assert(PyUnicode_IS_READY(op)), \ 103 PyUnicode_IS_COMPACT_ASCII(op) ? 
\ 104 ((char*)((PyASCIIObject*)(op) + 1)) : \ 105 _PyUnicode_UTF8(op)) 106#define _PyUnicode_UTF8_LENGTH(op) \ 107 (((PyCompactUnicodeObject*)(op))->utf8_length) 108#define PyUnicode_UTF8_LENGTH(op) \ 109 (assert(_PyUnicode_CHECK(op)), \ 110 assert(PyUnicode_IS_READY(op)), \ 111 PyUnicode_IS_COMPACT_ASCII(op) ? \ 112 ((PyASCIIObject*)(op))->length : \ 113 _PyUnicode_UTF8_LENGTH(op)) 114#define _PyUnicode_WSTR(op) (((PyASCIIObject*)(op))->wstr) 115#define _PyUnicode_WSTR_LENGTH(op) (((PyCompactUnicodeObject*)(op))->wstr_length) 116#define _PyUnicode_LENGTH(op) (((PyASCIIObject *)(op))->length) 117#define _PyUnicode_STATE(op) (((PyASCIIObject *)(op))->state) 118#define _PyUnicode_HASH(op) (((PyASCIIObject *)(op))->hash) 119#define _PyUnicode_KIND(op) \ 120 (assert(_PyUnicode_CHECK(op)), \ 121 ((PyASCIIObject *)(op))->state.kind) 122#define _PyUnicode_GET_LENGTH(op) \ 123 (assert(_PyUnicode_CHECK(op)), \ 124 ((PyASCIIObject *)(op))->length) 125#define _PyUnicode_DATA_ANY(op) (((PyUnicodeObject*)(op))->data.any) 126 127#undef PyUnicode_READY 128#define PyUnicode_READY(op) \ 129 (assert(_PyUnicode_CHECK(op)), \ 130 (PyUnicode_IS_READY(op) ? \ 131 0 : _PyUnicode_Ready((PyObject *)(op)))) 132 133#define _PyUnicode_READY_REPLACE(p_obj) \ 134 (assert(_PyUnicode_CHECK(*p_obj)), \ 135 (PyUnicode_IS_READY(*p_obj) ? 
/* True if the wstr pointer of `op` is the very same pointer as its
   character data (PyUnicode_DATA), i.e. the wchar_t representation is
   shared with the data buffer rather than stored in a block of its own.

   The macro argument is used for both sides of the comparison.  The earlier
   definition read _PyUnicode_WSTR(unicode), which ignored `op` and silently
   captured whatever variable named `unicode` was in scope at the expansion
   site (or failed to compile when there was none). */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
182 When the interned string reaches a refcnt of 0 the string deallocation 183 function will delete the reference from this dictionary. 184 185 Another way to look at this is that to say that the actual reference 186 count of a string is: s->ob_refcnt + (s->state ? 2 : 0) 187*/ 188static PyObject *interned; 189 190/* The empty Unicode object is shared to improve performance. */ 191static PyObject *unicode_empty; 192 193/* Single character Unicode strings in the Latin-1 range are being 194 shared as well. */ 195static PyObject *unicode_latin1[256]; 196 197/* Fast detection of the most frequent whitespace characters */ 198const unsigned char _Py_ascii_whitespace[] = { 199 0, 0, 0, 0, 0, 0, 0, 0, 200/* case 0x0009: * CHARACTER TABULATION */ 201/* case 0x000A: * LINE FEED */ 202/* case 0x000B: * LINE TABULATION */ 203/* case 0x000C: * FORM FEED */ 204/* case 0x000D: * CARRIAGE RETURN */ 205 0, 1, 1, 1, 1, 1, 0, 0, 206 0, 0, 0, 0, 0, 0, 0, 0, 207/* case 0x001C: * FILE SEPARATOR */ 208/* case 0x001D: * GROUP SEPARATOR */ 209/* case 0x001E: * RECORD SEPARATOR */ 210/* case 0x001F: * UNIT SEPARATOR */ 211 0, 0, 0, 0, 1, 1, 1, 1, 212/* case 0x0020: * SPACE */ 213 1, 0, 0, 0, 0, 0, 0, 0, 214 0, 0, 0, 0, 0, 0, 0, 0, 215 0, 0, 0, 0, 0, 0, 0, 0, 216 0, 0, 0, 0, 0, 0, 0, 0, 217 218 0, 0, 0, 0, 0, 0, 0, 0, 219 0, 0, 0, 0, 0, 0, 0, 0, 220 0, 0, 0, 0, 0, 0, 0, 0, 221 0, 0, 0, 0, 0, 0, 0, 0, 222 0, 0, 0, 0, 0, 0, 0, 0, 223 0, 0, 0, 0, 0, 0, 0, 0, 224 0, 0, 0, 0, 0, 0, 0, 0, 225 0, 0, 0, 0, 0, 0, 0, 0 226}; 227 228/* forward */ 229static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length); 230static PyObject* get_latin1_char(unsigned char ch); 231 232static PyObject * 233unicode_encode_call_errorhandler(const char *errors, 234 PyObject **errorHandler,const char *encoding, const char *reason, 235 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject, 236 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos); 237 238static void 239raise_encode_exception(PyObject 
**exceptionObject, 240 const char *encoding, 241 const Py_UNICODE *unicode, Py_ssize_t size, 242 Py_ssize_t startpos, Py_ssize_t endpos, 243 const char *reason); 244 245/* Same for linebreaks */ 246static unsigned char ascii_linebreak[] = { 247 0, 0, 0, 0, 0, 0, 0, 0, 248/* 0x000A, * LINE FEED */ 249/* 0x000B, * LINE TABULATION */ 250/* 0x000C, * FORM FEED */ 251/* 0x000D, * CARRIAGE RETURN */ 252 0, 0, 1, 1, 1, 1, 0, 0, 253 0, 0, 0, 0, 0, 0, 0, 0, 254/* 0x001C, * FILE SEPARATOR */ 255/* 0x001D, * GROUP SEPARATOR */ 256/* 0x001E, * RECORD SEPARATOR */ 257 0, 0, 0, 0, 1, 1, 1, 0, 258 0, 0, 0, 0, 0, 0, 0, 0, 259 0, 0, 0, 0, 0, 0, 0, 0, 260 0, 0, 0, 0, 0, 0, 0, 0, 261 0, 0, 0, 0, 0, 0, 0, 0, 262 263 0, 0, 0, 0, 0, 0, 0, 0, 264 0, 0, 0, 0, 0, 0, 0, 0, 265 0, 0, 0, 0, 0, 0, 0, 0, 266 0, 0, 0, 0, 0, 0, 0, 0, 267 0, 0, 0, 0, 0, 0, 0, 0, 268 0, 0, 0, 0, 0, 0, 0, 0, 269 0, 0, 0, 0, 0, 0, 0, 0, 270 0, 0, 0, 0, 0, 0, 0, 0 271}; 272 273/* The max unicode value is always 0x10FFFF while using the PEP-393 API. 274 This function is kept for backward compatibility with the old API. */ 275Py_UNICODE 276PyUnicode_GetMax(void) 277{ 278#ifdef Py_UNICODE_WIDE 279 return 0x10FFFF; 280#else 281 /* This is actually an illegal character, so it should 282 not be passed to unichr. 
*/ 283 return 0xFFFF; 284#endif 285} 286 287#ifdef Py_DEBUG 288static int 289_PyUnicode_CheckConsistency(void *op) 290{ 291 PyASCIIObject *ascii; 292 unsigned int kind; 293 294 assert(PyUnicode_Check(op)); 295 296 ascii = (PyASCIIObject *)op; 297 kind = ascii->state.kind; 298 299 if (ascii->state.ascii == 1 && ascii->state.compact == 1) { 300 assert(kind == PyUnicode_1BYTE_KIND); 301 assert(ascii->state.ready == 1); 302 } 303 else if (ascii->state.compact == 1) { 304 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op; 305 void *data; 306 assert(kind == PyUnicode_1BYTE_KIND 307 || kind == PyUnicode_2BYTE_KIND 308 || kind == PyUnicode_4BYTE_KIND); 309 assert(ascii->state.ascii == 0); 310 assert(ascii->state.ready == 1); 311 data = compact + 1; 312 assert (compact->utf8 != data); 313 if ( 314#if SIZEOF_WCHAR_T == 2 315 kind == PyUnicode_2BYTE_KIND 316#else 317 kind == PyUnicode_4BYTE_KIND 318#endif 319 ) 320 assert(ascii->wstr == data); 321 else 322 assert(ascii->wstr != data); 323 } else { 324 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op; 325 PyUnicodeObject *unicode = (PyUnicodeObject *)op; 326 327 if (kind == PyUnicode_WCHAR_KIND) { 328 assert(ascii->state.compact == 0); 329 assert(ascii->state.ascii == 0); 330 assert(ascii->state.ready == 0); 331 assert(ascii->wstr != NULL); 332 assert(unicode->data.any == NULL); 333 assert(compact->utf8 == NULL); 334 assert(ascii->state.interned == SSTATE_NOT_INTERNED); 335 } 336 else { 337 assert(kind == PyUnicode_1BYTE_KIND 338 || kind == PyUnicode_2BYTE_KIND 339 || kind == PyUnicode_4BYTE_KIND); 340 assert(ascii->state.compact == 0); 341 assert(ascii->state.ready == 1); 342 assert(unicode->data.any != NULL); 343 if (ascii->state.ascii) 344 assert (compact->utf8 == unicode->data.any); 345 else 346 assert (compact->utf8 != unicode->data.any); 347 if ( 348#if SIZEOF_WCHAR_T == 2 349 kind == PyUnicode_2BYTE_KIND 350#else 351 kind == PyUnicode_4BYTE_KIND 352#endif 353 ) 354 assert(ascii->wstr == 
unicode->data.any); 355 else 356 assert(ascii->wstr != unicode->data.any); 357 } 358 } 359 return 1; 360} 361#endif 362 363/* --- Bloom Filters ----------------------------------------------------- */ 364 365/* stuff to implement simple "bloom filters" for Unicode characters. 366 to keep things simple, we use a single bitmask, using the least 5 367 bits from each unicode characters as the bit index. */ 368 369/* the linebreak mask is set up by Unicode_Init below */ 370 371#if LONG_BIT >= 128 372#define BLOOM_WIDTH 128 373#elif LONG_BIT >= 64 374#define BLOOM_WIDTH 64 375#elif LONG_BIT >= 32 376#define BLOOM_WIDTH 32 377#else 378#error "LONG_BIT is smaller than 32" 379#endif 380 381#define BLOOM_MASK unsigned long 382 383static BLOOM_MASK bloom_linebreak; 384 385#define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1))))) 386#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1))))) 387 388#define BLOOM_LINEBREAK(ch) \ 389 ((ch) < 128U ? ascii_linebreak[(ch)] : \ 390 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch))) 391 392Py_LOCAL_INLINE(BLOOM_MASK) 393make_bloom_mask(int kind, void* ptr, Py_ssize_t len) 394{ 395 /* calculate simple bloom-style bitmask for a given unicode string */ 396 397 BLOOM_MASK mask; 398 Py_ssize_t i; 399 400 mask = 0; 401 for (i = 0; i < len; i++) 402 BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i)); 403 404 return mask; 405} 406 407#define BLOOM_MEMBER(mask, chr, str) \ 408 (BLOOM(mask, chr) \ 409 && (PyUnicode_FindChar(str, chr, 0, PyUnicode_GET_LENGTH(str), 1) >= 0)) 410 411/* --- Unicode Object ----------------------------------------------------- */ 412 413static PyObject * 414fixup(PyUnicodeObject *self, Py_UCS4 (*fixfct)(PyUnicodeObject *s)); 415 416Py_LOCAL_INLINE(char *) findchar(void *s, int kind, 417 Py_ssize_t size, Py_UCS4 ch, 418 int direction) 419{ 420 /* like wcschr, but doesn't stop at NULL characters */ 421 Py_ssize_t i; 422 if (direction == 1) { 423 for(i = 0; i < size; i++) 424 if 
(PyUnicode_READ(kind, s, i) == ch) 425 return (char*)s + PyUnicode_KIND_SIZE(kind, i); 426 } 427 else { 428 for(i = size-1; i >= 0; i--) 429 if (PyUnicode_READ(kind, s, i) == ch) 430 return (char*)s + PyUnicode_KIND_SIZE(kind, i); 431 } 432 return NULL; 433} 434 435static PyObject* 436resize_compact(PyObject *unicode, Py_ssize_t length) 437{ 438 Py_ssize_t char_size; 439 Py_ssize_t struct_size; 440 Py_ssize_t new_size; 441 int share_wstr; 442 443 assert(PyUnicode_IS_READY(unicode)); 444 char_size = PyUnicode_CHARACTER_SIZE(unicode); 445 if (PyUnicode_IS_COMPACT_ASCII(unicode)) 446 struct_size = sizeof(PyASCIIObject); 447 else 448 struct_size = sizeof(PyCompactUnicodeObject); 449 share_wstr = _PyUnicode_SHARE_WSTR(unicode); 450 451 _Py_DEC_REFTOTAL; 452 _Py_ForgetReference(unicode); 453 454 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) { 455 PyErr_NoMemory(); 456 return NULL; 457 } 458 new_size = (struct_size + (length + 1) * char_size); 459 460 unicode = (PyObject *)PyObject_REALLOC((char *)unicode, new_size); 461 if (unicode == NULL) { 462 PyObject_Del(unicode); 463 PyErr_NoMemory(); 464 return NULL; 465 } 466 _Py_NewReference(unicode); 467 _PyUnicode_LENGTH(unicode) = length; 468 if (share_wstr) { 469 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode); 470 if (!PyUnicode_IS_COMPACT_ASCII(unicode)) 471 _PyUnicode_WSTR_LENGTH(unicode) = length; 472 } 473 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode), 474 length, 0); 475 return unicode; 476} 477 478static int 479resize_inplace(register PyUnicodeObject *unicode, Py_ssize_t length) 480{ 481 void *oldstr; 482 483 assert(!PyUnicode_IS_COMPACT(unicode)); 484 485 assert(Py_REFCNT(unicode) == 1); 486 _PyUnicode_DIRTY(unicode); 487 488 if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) 489 { 490 PyObject_DEL(_PyUnicode_UTF8(unicode)); 491 _PyUnicode_UTF8(unicode) = NULL; 492 } 493 494 if (PyUnicode_IS_READY(unicode)) { 495 Py_ssize_t char_size; 496 Py_ssize_t new_size; 497 int share_wstr, 
share_utf8; 498 void *data; 499 500 data = _PyUnicode_DATA_ANY(unicode); 501 assert(data != NULL); 502 char_size = PyUnicode_CHARACTER_SIZE(unicode); 503 share_wstr = _PyUnicode_SHARE_WSTR(unicode); 504 share_utf8 = _PyUnicode_SHARE_UTF8(unicode); 505 506 if (length > (PY_SSIZE_T_MAX / char_size - 1)) { 507 PyErr_NoMemory(); 508 return -1; 509 } 510 new_size = (length + 1) * char_size; 511 512 data = (PyObject *)PyObject_REALLOC(data, new_size); 513 if (data == NULL) { 514 PyErr_NoMemory(); 515 return -1; 516 } 517 _PyUnicode_DATA_ANY(unicode) = data; 518 if (share_wstr) { 519 _PyUnicode_WSTR(unicode) = data; 520 _PyUnicode_WSTR_LENGTH(unicode) = length; 521 } 522 if (share_utf8) { 523 _PyUnicode_UTF8(unicode) = data; 524 _PyUnicode_UTF8_LENGTH(unicode) = length; 525 } 526 _PyUnicode_LENGTH(unicode) = length; 527 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0); 528 if (share_wstr) 529 return 0; 530 } 531 if (_PyUnicode_WSTR(unicode) != NULL) { 532 assert(_PyUnicode_WSTR(unicode) != NULL); 533 534 oldstr = _PyUnicode_WSTR(unicode); 535 _PyUnicode_WSTR(unicode) = PyObject_REALLOC(_PyUnicode_WSTR(unicode), 536 sizeof(Py_UNICODE) * (length + 1)); 537 if (!_PyUnicode_WSTR(unicode)) { 538 _PyUnicode_WSTR(unicode) = (Py_UNICODE *)oldstr; 539 PyErr_NoMemory(); 540 return -1; 541 } 542 _PyUnicode_WSTR(unicode)[length] = 0; 543 _PyUnicode_WSTR_LENGTH(unicode) = length; 544 } 545 return 0; 546} 547 548static PyObject* 549resize_copy(PyObject *unicode, Py_ssize_t length) 550{ 551 Py_ssize_t copy_length; 552 if (PyUnicode_IS_COMPACT(unicode)) { 553 PyObject *copy; 554 assert(PyUnicode_IS_READY(unicode)); 555 556 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode)); 557 if (copy == NULL) 558 return NULL; 559 560 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode)); 561 if (PyUnicode_CopyCharacters(copy, 0, 562 unicode, 0, 563 copy_length) < 0) 564 { 565 Py_DECREF(copy); 566 return NULL; 567 } 568 return copy; 569 } 570 else { 571 PyUnicodeObject *w; 
572 assert(_PyUnicode_WSTR(unicode) != NULL); 573 assert(_PyUnicode_DATA_ANY(unicode) == NULL); 574 w = _PyUnicode_New(length); 575 if (w == NULL) 576 return NULL; 577 copy_length = _PyUnicode_WSTR_LENGTH(unicode); 578 copy_length = Py_MIN(copy_length, length); 579 Py_UNICODE_COPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode), 580 copy_length); 581 return (PyObject*)w; 582 } 583} 584 585/* We allocate one more byte to make sure the string is 586 Ux0000 terminated; some code (e.g. new_identifier) 587 relies on that. 588 589 XXX This allocator could further be enhanced by assuring that the 590 free list never reduces its size below 1. 591 592*/ 593 594#ifdef Py_DEBUG 595int unicode_old_new_calls = 0; 596#endif 597 598static PyUnicodeObject * 599_PyUnicode_New(Py_ssize_t length) 600{ 601 register PyUnicodeObject *unicode; 602 size_t new_size; 603 604 /* Optimization for empty strings */ 605 if (length == 0 && unicode_empty != NULL) { 606 Py_INCREF(unicode_empty); 607 return (PyUnicodeObject*)unicode_empty; 608 } 609 610 /* Ensure we won't overflow the size. */ 611 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) { 612 return (PyUnicodeObject *)PyErr_NoMemory(); 613 } 614 if (length < 0) { 615 PyErr_SetString(PyExc_SystemError, 616 "Negative size passed to _PyUnicode_New"); 617 return NULL; 618 } 619 620#ifdef Py_DEBUG 621 ++unicode_old_new_calls; 622#endif 623 624 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type); 625 if (unicode == NULL) 626 return NULL; 627 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1); 628 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size); 629 if (!_PyUnicode_WSTR(unicode)) { 630 PyErr_NoMemory(); 631 goto onError; 632 } 633 634 /* Initialize the first element to guard against cases where 635 * the caller fails before initializing str -- unicode_resize() 636 * reads str[0], and the Keep-Alive optimization can keep memory 637 * allocated for str alive across a call to unicode_dealloc(unicode). 
638 * We don't want unicode_resize to read uninitialized memory in 639 * that case. 640 */ 641 _PyUnicode_WSTR(unicode)[0] = 0; 642 _PyUnicode_WSTR(unicode)[length] = 0; 643 _PyUnicode_WSTR_LENGTH(unicode) = length; 644 _PyUnicode_HASH(unicode) = -1; 645 _PyUnicode_STATE(unicode).interned = 0; 646 _PyUnicode_STATE(unicode).kind = 0; 647 _PyUnicode_STATE(unicode).compact = 0; 648 _PyUnicode_STATE(unicode).ready = 0; 649 _PyUnicode_STATE(unicode).ascii = 0; 650 _PyUnicode_DATA_ANY(unicode) = NULL; 651 _PyUnicode_LENGTH(unicode) = 0; 652 _PyUnicode_UTF8(unicode) = NULL; 653 _PyUnicode_UTF8_LENGTH(unicode) = 0; 654 return unicode; 655 656 onError: 657 /* XXX UNREF/NEWREF interface should be more symmetrical */ 658 _Py_DEC_REFTOTAL; 659 _Py_ForgetReference((PyObject *)unicode); 660 PyObject_Del(unicode); 661 return NULL; 662} 663 664static const char* 665unicode_kind_name(PyObject *unicode) 666{ 667 /* don't check consistency: unicode_kind_name() is called from 668 _PyUnicode_Dump() */ 669 if (!PyUnicode_IS_COMPACT(unicode)) 670 { 671 if (!PyUnicode_IS_READY(unicode)) 672 return "wstr"; 673 switch(PyUnicode_KIND(unicode)) 674 { 675 case PyUnicode_1BYTE_KIND: 676 if (PyUnicode_IS_ASCII(unicode)) 677 return "legacy ascii"; 678 else 679 return "legacy latin1"; 680 case PyUnicode_2BYTE_KIND: 681 return "legacy UCS2"; 682 case PyUnicode_4BYTE_KIND: 683 return "legacy UCS4"; 684 default: 685 return "<legacy invalid kind>"; 686 } 687 } 688 assert(PyUnicode_IS_READY(unicode)); 689 switch(PyUnicode_KIND(unicode)) 690 { 691 case PyUnicode_1BYTE_KIND: 692 if (PyUnicode_IS_ASCII(unicode)) 693 return "ascii"; 694 else 695 return "latin1"; 696 case PyUnicode_2BYTE_KIND: 697 return "UCS2"; 698 case PyUnicode_4BYTE_KIND: 699 return "UCS4"; 700 default: 701 return "<invalid compact kind>"; 702 } 703} 704 705#ifdef Py_DEBUG 706int unicode_new_new_calls = 0; 707 708/* Functions wrapping macros for use in debugger */ 709char *_PyUnicode_utf8(void *unicode){ 710 return 
PyUnicode_UTF8(unicode); 711} 712 713void *_PyUnicode_compact_data(void *unicode) { 714 return _PyUnicode_COMPACT_DATA(unicode); 715} 716void *_PyUnicode_data(void *unicode){ 717 printf("obj %p\n", unicode); 718 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode)); 719 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode)); 720 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1))); 721 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1))); 722 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode)); 723 return PyUnicode_DATA(unicode); 724} 725 726void 727_PyUnicode_Dump(PyObject *op) 728{ 729 PyASCIIObject *ascii = (PyASCIIObject *)op; 730 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op; 731 PyUnicodeObject *unicode = (PyUnicodeObject *)op; 732 void *data; 733 printf("%s: len=%zu, ",unicode_kind_name(op), ascii->length); 734 if (ascii->state.compact) 735 data = (compact + 1); 736 else 737 data = unicode->data.any; 738 if (ascii->wstr == data) 739 printf("shared "); 740 printf("wstr=%p", ascii->wstr); 741 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) { 742 printf(" (%zu), ", compact->wstr_length); 743 if (!ascii->state.compact && compact->utf8 == unicode->data.any) 744 printf("shared "); 745 printf("utf8=%p (%zu)", compact->utf8, compact->utf8_length); 746 } 747 printf(", data=%p\n", data); 748} 749#endif 750 751PyObject * 752PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar) 753{ 754 PyObject *obj; 755 PyCompactUnicodeObject *unicode; 756 void *data; 757 int kind_state; 758 int is_sharing, is_ascii; 759 Py_ssize_t char_size; 760 Py_ssize_t struct_size; 761 762 /* Optimization for empty strings */ 763 if (size == 0 && unicode_empty != NULL) { 764 Py_INCREF(unicode_empty); 765 return unicode_empty; 766 } 767 768#ifdef Py_DEBUG 769 ++unicode_new_new_calls; 770#endif 771 772 is_ascii = 0; 773 is_sharing = 0; 774 struct_size = sizeof(PyCompactUnicodeObject); 775 if (maxchar < 128) { 776 
kind_state = PyUnicode_1BYTE_KIND; 777 char_size = 1; 778 is_ascii = 1; 779 struct_size = sizeof(PyASCIIObject); 780 } 781 else if (maxchar < 256) { 782 kind_state = PyUnicode_1BYTE_KIND; 783 char_size = 1; 784 } 785 else if (maxchar < 65536) { 786 kind_state = PyUnicode_2BYTE_KIND; 787 char_size = 2; 788 if (sizeof(wchar_t) == 2) 789 is_sharing = 1; 790 } 791 else { 792 kind_state = PyUnicode_4BYTE_KIND; 793 char_size = 4; 794 if (sizeof(wchar_t) == 4) 795 is_sharing = 1; 796 } 797 798 /* Ensure we won't overflow the size. */ 799 if (size < 0) { 800 PyErr_SetString(PyExc_SystemError, 801 "Negative size passed to PyUnicode_New"); 802 return NULL; 803 } 804 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) 805 return PyErr_NoMemory(); 806 807 /* Duplicated allocation code from _PyObject_New() instead of a call to 808 * PyObject_New() so we are able to allocate space for the object and 809 * it's data buffer. 810 */ 811 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size); 812 if (obj == NULL) 813 return PyErr_NoMemory(); 814 obj = PyObject_INIT(obj, &PyUnicode_Type); 815 if (obj == NULL) 816 return NULL; 817 818 unicode = (PyCompactUnicodeObject *)obj; 819 if (is_ascii) 820 data = ((PyASCIIObject*)obj) + 1; 821 else 822 data = unicode + 1; 823 _PyUnicode_LENGTH(unicode) = size; 824 _PyUnicode_HASH(unicode) = -1; 825 _PyUnicode_STATE(unicode).interned = 0; 826 _PyUnicode_STATE(unicode).kind = kind_state; 827 _PyUnicode_STATE(unicode).compact = 1; 828 _PyUnicode_STATE(unicode).ready = 1; 829 _PyUnicode_STATE(unicode).ascii = is_ascii; 830 if (is_ascii) { 831 ((char*)data)[size] = 0; 832 _PyUnicode_WSTR(unicode) = NULL; 833 } 834 else if (kind_state == PyUnicode_1BYTE_KIND) { 835 ((char*)data)[size] = 0; 836 _PyUnicode_WSTR(unicode) = NULL; 837 _PyUnicode_WSTR_LENGTH(unicode) = 0; 838 unicode->utf8 = NULL; 839 unicode->utf8_length = 0; 840 } 841 else { 842 unicode->utf8 = NULL; 843 unicode->utf8_length = 0; 844 if (kind_state == 
PyUnicode_2BYTE_KIND) 845 ((Py_UCS2*)data)[size] = 0; 846 else /* kind_state == PyUnicode_4BYTE_KIND */ 847 ((Py_UCS4*)data)[size] = 0; 848 if (is_sharing) { 849 _PyUnicode_WSTR_LENGTH(unicode) = size; 850 _PyUnicode_WSTR(unicode) = (wchar_t *)data; 851 } 852 else { 853 _PyUnicode_WSTR_LENGTH(unicode) = 0; 854 _PyUnicode_WSTR(unicode) = NULL; 855 } 856 } 857 return obj; 858} 859 860#if SIZEOF_WCHAR_T == 2 861/* Helper function to convert a 16-bits wchar_t representation to UCS4, this 862 will decode surrogate pairs, the other conversions are implemented as macros 863 for efficency. 864 865 This function assumes that unicode can hold one more code point than wstr 866 characters for a terminating null character. */ 867static void 868unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end, 869 PyUnicodeObject *unicode) 870{ 871 const wchar_t *iter; 872 Py_UCS4 *ucs4_out; 873 874 assert(unicode != NULL); 875 assert(_PyUnicode_CHECK(unicode)); 876 assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND); 877 ucs4_out = PyUnicode_4BYTE_DATA(unicode); 878 879 for (iter = begin; iter < end; ) { 880 assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) + 881 _PyUnicode_GET_LENGTH(unicode))); 882 if (*iter >= 0xD800 && *iter <= 0xDBFF 883 && (iter+1) < end && iter[1] >= 0xDC00 && iter[1] <= 0xDFFF) 884 { 885 *ucs4_out++ = (((iter[0] & 0x3FF)<<10) | (iter[1] & 0x3FF)) + 0x10000; 886 iter += 2; 887 } 888 else { 889 *ucs4_out++ = *iter; 890 iter++; 891 } 892 } 893 assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) + 894 _PyUnicode_GET_LENGTH(unicode))); 895 896} 897#endif 898 899static int 900_PyUnicode_Dirty(PyObject *unicode) 901{ 902 assert(_PyUnicode_CHECK(unicode)); 903 if (Py_REFCNT(unicode) != 1) { 904 PyErr_SetString(PyExc_SystemError, 905 "Cannot modify a string having more than 1 reference"); 906 return -1; 907 } 908 _PyUnicode_DIRTY(unicode); 909 return 0; 910} 911 912Py_ssize_t 913PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start, 914 PyObject 
*from, Py_ssize_t from_start, 915 Py_ssize_t how_many) 916{ 917 unsigned int from_kind, to_kind; 918 void *from_data, *to_data; 919 920 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) { 921 PyErr_BadInternalCall(); 922 return -1; 923 } 924 925 if (PyUnicode_READY(from)) 926 return -1; 927 if (PyUnicode_READY(to)) 928 return -1; 929 930 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many); 931 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) { 932 PyErr_Format(PyExc_SystemError, 933 "Cannot write %zi characters at %zi " 934 "in a string of %zi characters", 935 how_many, to_start, PyUnicode_GET_LENGTH(to)); 936 return -1; 937 } 938 if (how_many == 0) 939 return 0; 940 941 if (_PyUnicode_Dirty(to)) 942 return -1; 943 944 from_kind = PyUnicode_KIND(from); 945 from_data = PyUnicode_DATA(from); 946 to_kind = PyUnicode_KIND(to); 947 to_data = PyUnicode_DATA(to); 948 949 if (from_kind == to_kind 950 /* deny latin1 => ascii */ 951 && PyUnicode_MAX_CHAR_VALUE(to) >= PyUnicode_MAX_CHAR_VALUE(from)) 952 { 953 Py_MEMCPY((char*)to_data 954 + PyUnicode_KIND_SIZE(to_kind, to_start), 955 (char*)from_data 956 + PyUnicode_KIND_SIZE(from_kind, from_start), 957 PyUnicode_KIND_SIZE(to_kind, how_many)); 958 } 959 else if (from_kind == PyUnicode_1BYTE_KIND 960 && to_kind == PyUnicode_2BYTE_KIND) 961 { 962 _PyUnicode_CONVERT_BYTES( 963 Py_UCS1, Py_UCS2, 964 PyUnicode_1BYTE_DATA(from) + from_start, 965 PyUnicode_1BYTE_DATA(from) + from_start + how_many, 966 PyUnicode_2BYTE_DATA(to) + to_start 967 ); 968 } 969 else if (from_kind == PyUnicode_1BYTE_KIND 970 && to_kind == PyUnicode_4BYTE_KIND) 971 { 972 _PyUnicode_CONVERT_BYTES( 973 Py_UCS1, Py_UCS4, 974 PyUnicode_1BYTE_DATA(from) + from_start, 975 PyUnicode_1BYTE_DATA(from) + from_start + how_many, 976 PyUnicode_4BYTE_DATA(to) + to_start 977 ); 978 } 979 else if (from_kind == PyUnicode_2BYTE_KIND 980 && to_kind == PyUnicode_4BYTE_KIND) 981 { 982 _PyUnicode_CONVERT_BYTES( 983 Py_UCS2, Py_UCS4, 984 PyUnicode_2BYTE_DATA(from) + 
from_start, 985 PyUnicode_2BYTE_DATA(from) + from_start + how_many, 986 PyUnicode_4BYTE_DATA(to) + to_start 987 ); 988 } 989 else { 990 int invalid_kinds; 991 992 /* check if max_char(from substring) <= max_char(to) */ 993 if (from_kind > to_kind 994 /* latin1 => ascii */ 995 || (PyUnicode_IS_ASCII(to) 996 && to_kind == PyUnicode_1BYTE_KIND 997 && !PyUnicode_IS_ASCII(from))) 998 { 999 /* slow path to check for character overflow */ 1000 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to); 1001 Py_UCS4 ch, maxchar; 1002 Py_ssize_t i; 1003 1004 maxchar = 0; 1005 invalid_kinds = 0; 1006 for (i=0; i < how_many; i++) { 1007 ch = PyUnicode_READ(from_kind, from_data, from_start + i); 1008 if (ch > maxchar) { 1009 maxchar = ch; 1010 if (maxchar > to_maxchar) { 1011 invalid_kinds = 1; 1012 break; 1013 } 1014 } 1015 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch); 1016 } 1017 } 1018 else 1019 invalid_kinds = 1; 1020 if (invalid_kinds) { 1021 PyErr_Format(PyExc_SystemError, 1022 "Cannot copy %s characters " 1023 "into a string of %s characters", 1024 unicode_kind_name(from), 1025 unicode_kind_name(to)); 1026 return -1; 1027 } 1028 } 1029 return how_many; 1030} 1031 1032/* Find the maximum code point and count the number of surrogate pairs so a 1033 correct string length can be computed before converting a string to UCS4. 1034 This function counts single surrogates as a character and not as a pair. 1035 1036 Return 0 on success, or -1 on error. 
*/ 1037static int 1038find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end, 1039 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates) 1040{ 1041 const wchar_t *iter; 1042 1043 assert(num_surrogates != NULL && maxchar != NULL); 1044 if (num_surrogates == NULL || maxchar == NULL) { 1045 PyErr_SetString(PyExc_SystemError, 1046 "unexpected NULL arguments to " 1047 "PyUnicode_FindMaxCharAndNumSurrogatePairs"); 1048 return -1; 1049 } 1050 1051 *num_surrogates = 0; 1052 *maxchar = 0; 1053 1054 for (iter = begin; iter < end; ) { 1055 if (*iter > *maxchar) 1056 *maxchar = *iter; 1057#if SIZEOF_WCHAR_T == 2 1058 if (*iter >= 0xD800 && *iter <= 0xDBFF 1059 && (iter+1) < end && iter[1] >= 0xDC00 && iter[1] <= 0xDFFF) 1060 { 1061 Py_UCS4 surrogate_val; 1062 surrogate_val = (((iter[0] & 0x3FF)<<10) 1063 | (iter[1] & 0x3FF)) + 0x10000; 1064 ++(*num_surrogates); 1065 if (surrogate_val > *maxchar) 1066 *maxchar = surrogate_val; 1067 iter += 2; 1068 } 1069 else 1070 iter++; 1071#else 1072 iter++; 1073#endif 1074 } 1075 return 0; 1076} 1077 1078#ifdef Py_DEBUG 1079int unicode_ready_calls = 0; 1080#endif 1081 1082static int 1083unicode_ready(PyObject **p_obj, int replace) 1084{ 1085 PyUnicodeObject *unicode; 1086 wchar_t *end; 1087 Py_UCS4 maxchar = 0; 1088 Py_ssize_t num_surrogates; 1089#if SIZEOF_WCHAR_T == 2 1090 Py_ssize_t length_wo_surrogates; 1091#endif 1092 1093 assert(p_obj != NULL); 1094 unicode = (PyUnicodeObject *)*p_obj; 1095 1096 /* _PyUnicode_Ready() is only intented for old-style API usage where 1097 strings were created using _PyObject_New() and where no canonical 1098 representation (the str field) has been set yet aka strings 1099 which are not yet ready. 
*/ 1100 assert(_PyUnicode_CHECK(unicode)); 1101 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND); 1102 assert(_PyUnicode_WSTR(unicode) != NULL); 1103 assert(_PyUnicode_DATA_ANY(unicode) == NULL); 1104 assert(_PyUnicode_UTF8(unicode) == NULL); 1105 /* Actually, it should neither be interned nor be anything else: */ 1106 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED); 1107 1108#ifdef Py_DEBUG 1109 ++unicode_ready_calls; 1110#endif 1111 1112#ifdef Py_DEBUG 1113 assert(!replace || Py_REFCNT(unicode) == 1); 1114#else 1115 if (replace && Py_REFCNT(unicode) != 1) 1116 replace = 0; 1117#endif 1118 if (replace) { 1119 Py_ssize_t len = _PyUnicode_WSTR_LENGTH(unicode); 1120 wchar_t *wstr = _PyUnicode_WSTR(unicode); 1121 /* Optimization for empty strings */ 1122 if (len == 0) { 1123 Py_INCREF(unicode_empty); 1124 Py_DECREF(*p_obj); 1125 *p_obj = unicode_empty; 1126 return 0; 1127 } 1128 if (len == 1 && wstr[0] < 256) { 1129 PyObject *latin1_char = get_latin1_char((unsigned char)wstr[0]); 1130 if (latin1_char == NULL) 1131 return -1; 1132 Py_DECREF(*p_obj); 1133 *p_obj = latin1_char; 1134 return 0; 1135 } 1136 } 1137 1138 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode); 1139 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end, 1140 &maxchar, &num_surrogates) == -1) 1141 return -1; 1142 1143 if (maxchar < 256) { 1144 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1); 1145 if (!_PyUnicode_DATA_ANY(unicode)) { 1146 PyErr_NoMemory(); 1147 return -1; 1148 } 1149 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char, 1150 _PyUnicode_WSTR(unicode), end, 1151 PyUnicode_1BYTE_DATA(unicode)); 1152 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0'; 1153 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode); 1154 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND; 1155 if (maxchar < 128) { 1156 _PyUnicode_STATE(unicode).ascii = 1; 1157 _PyUnicode_UTF8(unicode) = 
_PyUnicode_DATA_ANY(unicode); 1158 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode); 1159 } 1160 else { 1161 _PyUnicode_STATE(unicode).ascii = 0; 1162 _PyUnicode_UTF8(unicode) = NULL; 1163 _PyUnicode_UTF8_LENGTH(unicode) = 0; 1164 } 1165 PyObject_FREE(_PyUnicode_WSTR(unicode)); 1166 _PyUnicode_WSTR(unicode) = NULL; 1167 _PyUnicode_WSTR_LENGTH(unicode) = 0; 1168 } 1169 /* In this case we might have to convert down from 4-byte native 1170 wchar_t to 2-byte unicode. */ 1171 else if (maxchar < 65536) { 1172 assert(num_surrogates == 0 && 1173 "FindMaxCharAndNumSurrogatePairs() messed up"); 1174 1175#if SIZEOF_WCHAR_T == 2 1176 /* We can share representations and are done. */ 1177 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode); 1178 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0'; 1179 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode); 1180 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND; 1181 _PyUnicode_UTF8(unicode) = NULL; 1182 _PyUnicode_UTF8_LENGTH(unicode) = 0; 1183#else 1184 /* sizeof(wchar_t) == 4 */ 1185 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC( 1186 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1)); 1187 if (!_PyUnicode_DATA_ANY(unicode)) { 1188 PyErr_NoMemory(); 1189 return -1; 1190 } 1191 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2, 1192 _PyUnicode_WSTR(unicode), end, 1193 PyUnicode_2BYTE_DATA(unicode)); 1194 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0'; 1195 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode); 1196 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND; 1197 _PyUnicode_UTF8(unicode) = NULL; 1198 _PyUnicode_UTF8_LENGTH(unicode) = 0; 1199 PyObject_FREE(_PyUnicode_WSTR(unicode)); 1200 _PyUnicode_WSTR(unicode) = NULL; 1201 _PyUnicode_WSTR_LENGTH(unicode) = 0; 1202#endif 1203 } 1204 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */ 1205 else { 1206#if SIZEOF_WCHAR_T == 2 1207 /* in case the native representation is 2-bytes, we need to 
allocate a 1208 new normalized 4-byte version. */ 1209 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates; 1210 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1)); 1211 if (!_PyUnicode_DATA_ANY(unicode)) { 1212 PyErr_NoMemory(); 1213 return -1; 1214 } 1215 _PyUnicode_LENGTH(unicode) = length_wo_surrogates; 1216 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND; 1217 _PyUnicode_UTF8(unicode) = NULL; 1218 _PyUnicode_UTF8_LENGTH(unicode) = 0; 1219 /* unicode_convert_wchar_to_ucs4() requires a ready string */ 1220 _PyUnicode_STATE(unicode).ready = 1; 1221 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode); 1222 PyObject_FREE(_PyUnicode_WSTR(unicode)); 1223 _PyUnicode_WSTR(unicode) = NULL; 1224 _PyUnicode_WSTR_LENGTH(unicode) = 0; 1225#else 1226 assert(num_surrogates == 0); 1227 1228 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode); 1229 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode); 1230 _PyUnicode_UTF8(unicode) = NULL; 1231 _PyUnicode_UTF8_LENGTH(unicode) = 0; 1232 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND; 1233#endif 1234 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0'; 1235 } 1236 _PyUnicode_STATE(unicode).ready = 1; 1237 return 0; 1238} 1239 1240int 1241_PyUnicode_ReadyReplace(PyObject **op) 1242{ 1243 return unicode_ready(op, 1); 1244} 1245 1246int 1247_PyUnicode_Ready(PyObject *op) 1248{ 1249 return unicode_ready(&op, 0); 1250} 1251 1252static void 1253unicode_dealloc(register PyUnicodeObject *unicode) 1254{ 1255 switch (PyUnicode_CHECK_INTERNED(unicode)) { 1256 case SSTATE_NOT_INTERNED: 1257 break; 1258 1259 case SSTATE_INTERNED_MORTAL: 1260 /* revive dead object temporarily for DelItem */ 1261 Py_REFCNT(unicode) = 3; 1262 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0) 1263 Py_FatalError( 1264 "deletion of interned string failed"); 1265 break; 1266 1267 case SSTATE_INTERNED_IMMORTAL: 1268 Py_FatalError("Immortal interned string 
died."); 1269 1270 default: 1271 Py_FatalError("Inconsistent interned string state."); 1272 } 1273 1274 if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) 1275 PyObject_DEL(_PyUnicode_WSTR(unicode)); 1276 if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) 1277 PyObject_DEL(_PyUnicode_UTF8(unicode)); 1278 1279 if (PyUnicode_IS_COMPACT(unicode)) { 1280 Py_TYPE(unicode)->tp_free((PyObject *)unicode); 1281 } 1282 else { 1283 if (_PyUnicode_DATA_ANY(unicode)) 1284 PyObject_DEL(_PyUnicode_DATA_ANY(unicode)); 1285 Py_TYPE(unicode)->tp_free((PyObject *)unicode); 1286 } 1287} 1288 1289static int 1290unicode_resizable(PyObject *unicode) 1291{ 1292 if (Py_REFCNT(unicode) != 1) 1293 return 0; 1294 if (PyUnicode_CHECK_INTERNED(unicode)) 1295 return 0; 1296 assert (unicode != unicode_empty); 1297#ifdef Py_DEBUG 1298 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND 1299 && PyUnicode_GET_LENGTH(unicode) == 1) 1300 { 1301 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0); 1302 if (ch < 256 && unicode_latin1[ch] == unicode) 1303 return 0; 1304 } 1305#endif 1306 return 1; 1307} 1308 1309static int 1310unicode_resize(PyObject **p_unicode, Py_ssize_t length) 1311{ 1312 PyObject *unicode; 1313 Py_ssize_t old_length; 1314 1315 assert(p_unicode != NULL); 1316 unicode = *p_unicode; 1317 1318 assert(unicode != NULL); 1319 assert(PyUnicode_Check(unicode)); 1320 assert(0 <= length); 1321 1322 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND) 1323 old_length = PyUnicode_WSTR_LENGTH(unicode); 1324 else 1325 old_length = PyUnicode_GET_LENGTH(unicode); 1326 if (old_length == length) 1327 return 0; 1328 1329 if (!unicode_resizable(unicode)) { 1330 PyObject *copy = resize_copy(unicode, length); 1331 if (copy == NULL) 1332 return -1; 1333 Py_DECREF(*p_unicode); 1334 *p_unicode = copy; 1335 return 0; 1336 } 1337 1338 if (PyUnicode_IS_COMPACT(unicode)) { 1339 *p_unicode = resize_compact(unicode, length); 1340 if (*p_unicode == NULL) 1341 return -1; 1342 return 0; 1343 } else 1344 return 
resize_inplace((PyUnicodeObject*)unicode, length); 1345} 1346 1347int 1348PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length) 1349{ 1350 PyObject *unicode; 1351 if (p_unicode == NULL) { 1352 PyErr_BadInternalCall(); 1353 return -1; 1354 } 1355 unicode = *p_unicode; 1356 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0 1357 || _PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) 1358 { 1359 PyErr_BadInternalCall(); 1360 return -1; 1361 } 1362 return unicode_resize(p_unicode, length); 1363} 1364 1365static PyObject* 1366get_latin1_char(unsigned char ch) 1367{ 1368 PyObject *unicode = unicode_latin1[ch]; 1369 if (!unicode) { 1370 unicode = PyUnicode_New(1, ch); 1371 if (!unicode) 1372 return NULL; 1373 PyUnicode_1BYTE_DATA(unicode)[0] = ch; 1374 unicode_latin1[ch] = unicode; 1375 } 1376 Py_INCREF(unicode); 1377 return unicode; 1378} 1379 1380PyObject * 1381PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size) 1382{ 1383 PyUnicodeObject *unicode; 1384 Py_UCS4 maxchar = 0; 1385 Py_ssize_t num_surrogates; 1386 1387 if (u == NULL) 1388 return (PyObject*)_PyUnicode_New(size); 1389 1390 /* If the Unicode data is known at construction time, we can apply 1391 some optimizations which share commonly used objects. 
*/ 1392 1393 /* Optimization for empty strings */ 1394 if (size == 0 && unicode_empty != NULL) { 1395 Py_INCREF(unicode_empty); 1396 return unicode_empty; 1397 } 1398 1399 /* Single character Unicode objects in the Latin-1 range are 1400 shared when using this constructor */ 1401 if (size == 1 && *u < 256) 1402 return get_latin1_char((unsigned char)*u); 1403 1404 /* If not empty and not single character, copy the Unicode data 1405 into the new object */ 1406 if (find_maxchar_surrogates(u, u + size, 1407 &maxchar, &num_surrogates) == -1) 1408 return NULL; 1409 1410 unicode = (PyUnicodeObject *) PyUnicode_New(size - num_surrogates, 1411 maxchar); 1412 if (!unicode) 1413 return NULL; 1414 1415 switch (PyUnicode_KIND(unicode)) { 1416 case PyUnicode_1BYTE_KIND: 1417 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char, 1418 u, u + size, PyUnicode_1BYTE_DATA(unicode)); 1419 break; 1420 case PyUnicode_2BYTE_KIND: 1421#if Py_UNICODE_SIZE == 2 1422 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2); 1423#else 1424 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2, 1425 u, u + size, PyUnicode_2BYTE_DATA(unicode)); 1426#endif 1427 break; 1428 case PyUnicode_4BYTE_KIND: 1429#if SIZEOF_WCHAR_T == 2 1430 /* This is the only case which has to process surrogates, thus 1431 a simple copy loop is not enough and we need a function. 
*/ 1432 unicode_convert_wchar_to_ucs4(u, u + size, unicode); 1433#else 1434 assert(num_surrogates == 0); 1435 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4); 1436#endif 1437 break; 1438 default: 1439 assert(0 && "Impossible state"); 1440 } 1441 1442 return (PyObject *)unicode; 1443} 1444 1445PyObject * 1446PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size) 1447{ 1448 PyUnicodeObject *unicode; 1449 1450 if (size < 0) { 1451 PyErr_SetString(PyExc_SystemError, 1452 "Negative size passed to PyUnicode_FromStringAndSize"); 1453 return NULL; 1454 } 1455 1456 /* If the Unicode data is known at construction time, we can apply 1457 some optimizations which share commonly used objects. 1458 Also, this means the input must be UTF-8, so fall back to the 1459 UTF-8 decoder at the end. */ 1460 if (u != NULL) { 1461 1462 /* Optimization for empty strings */ 1463 if (size == 0 && unicode_empty != NULL) { 1464 Py_INCREF(unicode_empty); 1465 return unicode_empty; 1466 } 1467 1468 /* Single characters are shared when using this constructor. 1469 Restrict to ASCII, since the input must be UTF-8. 
*/ 1470 if (size == 1 && Py_CHARMASK(*u) < 128) 1471 return get_latin1_char(Py_CHARMASK(*u)); 1472 1473 return PyUnicode_DecodeUTF8(u, size, NULL); 1474 } 1475 1476 unicode = _PyUnicode_New(size); 1477 if (!unicode) 1478 return NULL; 1479 1480 return (PyObject *)unicode; 1481} 1482 1483PyObject * 1484PyUnicode_FromString(const char *u) 1485{ 1486 size_t size = strlen(u); 1487 if (size > PY_SSIZE_T_MAX) { 1488 PyErr_SetString(PyExc_OverflowError, "input too long"); 1489 return NULL; 1490 } 1491 1492 return PyUnicode_FromStringAndSize(u, size); 1493} 1494 1495static PyObject* 1496_PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size) 1497{ 1498 PyObject *res; 1499 unsigned char max = 127; 1500 Py_ssize_t i; 1501 for (i = 0; i < size; i++) { 1502 if (u[i] & 0x80) { 1503 max = 255; 1504 break; 1505 } 1506 } 1507 res = PyUnicode_New(size, max); 1508 if (!res) 1509 return NULL; 1510 memcpy(PyUnicode_1BYTE_DATA(res), u, size); 1511 return res; 1512} 1513 1514static PyObject* 1515_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size) 1516{ 1517 PyObject *res; 1518 Py_UCS2 max = 0; 1519 Py_ssize_t i; 1520 for (i = 0; i < size; i++) 1521 if (u[i] > max) 1522 max = u[i]; 1523 res = PyUnicode_New(size, max); 1524 if (!res) 1525 return NULL; 1526 if (max >= 256) 1527 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size); 1528 else 1529 for (i = 0; i < size; i++) 1530 PyUnicode_1BYTE_DATA(res)[i] = (Py_UCS1)u[i]; 1531 return res; 1532} 1533 1534static PyObject* 1535_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size) 1536{ 1537 PyObject *res; 1538 Py_UCS4 max = 0; 1539 Py_ssize_t i; 1540 for (i = 0; i < size; i++) 1541 if (u[i] > max) 1542 max = u[i]; 1543 res = PyUnicode_New(size, max); 1544 if (!res) 1545 return NULL; 1546 if (max >= 0x10000) 1547 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size); 1548 else { 1549 int kind = PyUnicode_KIND(res); 1550 void *data = PyUnicode_DATA(res); 1551 for (i = 0; i < size; i++) 1552 PyUnicode_WRITE(kind, data, i, 
u[i]); 1553 } 1554 return res; 1555} 1556 1557PyObject* 1558PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size) 1559{ 1560 switch(kind) { 1561 case PyUnicode_1BYTE_KIND: 1562 return _PyUnicode_FromUCS1(buffer, size); 1563 case PyUnicode_2BYTE_KIND: 1564 return _PyUnicode_FromUCS2(buffer, size); 1565 case PyUnicode_4BYTE_KIND: 1566 return _PyUnicode_FromUCS4(buffer, size); 1567 } 1568 PyErr_SetString(PyExc_SystemError, "invalid kind"); 1569 return NULL; 1570} 1571 1572PyObject* 1573PyUnicode_Copy(PyObject *unicode) 1574{ 1575 Py_ssize_t size; 1576 PyObject *copy; 1577 void *data; 1578 1579 if (!PyUnicode_Check(unicode)) { 1580 PyErr_BadInternalCall(); 1581 return NULL; 1582 } 1583 if (PyUnicode_READY(unicode)) 1584 return NULL; 1585 1586 size = PyUnicode_GET_LENGTH(unicode); 1587 copy = PyUnicode_New(size, PyUnicode_MAX_CHAR_VALUE(unicode)); 1588 if (!copy) 1589 return NULL; 1590 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode)); 1591 1592 data = PyUnicode_DATA(unicode); 1593 switch (PyUnicode_KIND(unicode)) 1594 { 1595 case PyUnicode_1BYTE_KIND: 1596 memcpy(PyUnicode_1BYTE_DATA(copy), data, size); 1597 break; 1598 case PyUnicode_2BYTE_KIND: 1599 memcpy(PyUnicode_2BYTE_DATA(copy), data, sizeof(Py_UCS2) * size); 1600 break; 1601 case PyUnicode_4BYTE_KIND: 1602 memcpy(PyUnicode_4BYTE_DATA(copy), data, sizeof(Py_UCS4) * size); 1603 break; 1604 default: 1605 assert(0); 1606 break; 1607 } 1608 return copy; 1609} 1610 1611 1612/* Widen Unicode objects to larger buffers. Don't write terminating null 1613 character. Return NULL on error. 
*/ 1614 1615void* 1616_PyUnicode_AsKind(PyObject *s, unsigned int kind) 1617{ 1618 Py_ssize_t len; 1619 void *result; 1620 unsigned int skind; 1621 1622 if (PyUnicode_READY(s)) 1623 return NULL; 1624 1625 len = PyUnicode_GET_LENGTH(s); 1626 skind = PyUnicode_KIND(s); 1627 if (skind >= kind) { 1628 PyErr_SetString(PyExc_SystemError, "invalid widening attempt"); 1629 return NULL; 1630 } 1631 switch(kind) { 1632 case PyUnicode_2BYTE_KIND: 1633 result = PyMem_Malloc(len * sizeof(Py_UCS2)); 1634 if (!result) 1635 return PyErr_NoMemory(); 1636 assert(skind == PyUnicode_1BYTE_KIND); 1637 _PyUnicode_CONVERT_BYTES( 1638 Py_UCS1, Py_UCS2, 1639 PyUnicode_1BYTE_DATA(s), 1640 PyUnicode_1BYTE_DATA(s) + len, 1641 result); 1642 return result; 1643 case PyUnicode_4BYTE_KIND: 1644 result = PyMem_Malloc(len * sizeof(Py_UCS4)); 1645 if (!result) 1646 return PyErr_NoMemory(); 1647 if (skind == PyUnicode_2BYTE_KIND) { 1648 _PyUnicode_CONVERT_BYTES( 1649 Py_UCS2, Py_UCS4, 1650 PyUnicode_2BYTE_DATA(s), 1651 PyUnicode_2BYTE_DATA(s) + len, 1652 result); 1653 } 1654 else { 1655 assert(skind == PyUnicode_1BYTE_KIND); 1656 _PyUnicode_CONVERT_BYTES( 1657 Py_UCS1, Py_UCS4, 1658 PyUnicode_1BYTE_DATA(s), 1659 PyUnicode_1BYTE_DATA(s) + len, 1660 result); 1661 } 1662 return result; 1663 default: 1664 break; 1665 } 1666 PyErr_SetString(PyExc_SystemError, "invalid kind"); 1667 return NULL; 1668} 1669 1670static Py_UCS4* 1671as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize, 1672 int copy_null) 1673{ 1674 int kind; 1675 void *data; 1676 Py_ssize_t len, targetlen; 1677 if (PyUnicode_READY(string) == -1) 1678 return NULL; 1679 kind = PyUnicode_KIND(string); 1680 data = PyUnicode_DATA(string); 1681 len = PyUnicode_GET_LENGTH(string); 1682 targetlen = len; 1683 if (copy_null) 1684 targetlen++; 1685 if (!target) { 1686 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) { 1687 PyErr_NoMemory(); 1688 return NULL; 1689 } 1690 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4)); 1691 if (!target) { 
1692 PyErr_NoMemory(); 1693 return NULL; 1694 } 1695 } 1696 else { 1697 if (targetsize < targetlen) { 1698 PyErr_Format(PyExc_SystemError, 1699 "string is longer than the buffer"); 1700 if (copy_null && 0 < targetsize) 1701 target[0] = 0; 1702 return NULL; 1703 } 1704 } 1705 if (kind != PyUnicode_4BYTE_KIND) { 1706 Py_ssize_t i; 1707 for (i = 0; i < len; i++) 1708 target[i] = PyUnicode_READ(kind, data, i); 1709 } 1710 else 1711 Py_MEMCPY(target, data, len * sizeof(Py_UCS4)); 1712 if (copy_null) 1713 target[len] = 0; 1714 return target; 1715} 1716 1717Py_UCS4* 1718PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize, 1719 int copy_null) 1720{ 1721 if (target == NULL || targetsize < 1) { 1722 PyErr_BadInternalCall(); 1723 return NULL; 1724 } 1725 return as_ucs4(string, target, targetsize, copy_null); 1726} 1727 1728Py_UCS4* 1729PyUnicode_AsUCS4Copy(PyObject *string) 1730{ 1731 return as_ucs4(string, NULL, 0, 1); 1732} 1733 1734#ifdef HAVE_WCHAR_H 1735 1736PyObject * 1737PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size) 1738{ 1739 if (w == NULL) { 1740 if (size == 0) 1741 return PyUnicode_New(0, 0); 1742 PyErr_BadInternalCall(); 1743 return NULL; 1744 } 1745 1746 if (size == -1) { 1747 size = wcslen(w); 1748 } 1749 1750 return PyUnicode_FromUnicode(w, size); 1751} 1752 1753#endif /* HAVE_WCHAR_H */ 1754 1755static void 1756makefmt(char *fmt, int longflag, int longlongflag, int size_tflag, 1757 int zeropad, int width, int precision, char c) 1758{ 1759 *fmt++ = '%'; 1760 if (width) { 1761 if (zeropad) 1762 *fmt++ = '0'; 1763 fmt += sprintf(fmt, "%d", width); 1764 } 1765 if (precision) 1766 fmt += sprintf(fmt, ".%d", precision); 1767 if (longflag) 1768 *fmt++ = 'l'; 1769 else if (longlongflag) { 1770 /* longlongflag should only ever be nonzero on machines with 1771 HAVE_LONG_LONG defined */ 1772#ifdef HAVE_LONG_LONG 1773 char *f = PY_FORMAT_LONG_LONG; 1774 while (*f) 1775 *fmt++ = *f++; 1776#else 1777 /* we shouldn't ever get here */ 
1778 assert(0); 1779 *fmt++ = 'l'; 1780#endif 1781 } 1782 else if (size_tflag) { 1783 char *f = PY_FORMAT_SIZE_T; 1784 while (*f) 1785 *fmt++ = *f++; 1786 } 1787 *fmt++ = c; 1788 *fmt = '\0'; 1789} 1790 1791/* helper for PyUnicode_FromFormatV() */ 1792 1793static const char* 1794parse_format_flags(const char *f, 1795 int *p_width, int *p_precision, 1796 int *p_longflag, int *p_longlongflag, int *p_size_tflag) 1797{ 1798 int width, precision, longflag, longlongflag, size_tflag; 1799 1800 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */ 1801 f++; 1802 width = 0; 1803 while (Py_ISDIGIT((unsigned)*f)) 1804 width = (width*10) + *f++ - '0'; 1805 precision = 0; 1806 if (*f == '.') { 1807 f++; 1808 while (Py_ISDIGIT((unsigned)*f)) 1809 precision = (precision*10) + *f++ - '0'; 1810 if (*f == '%') { 1811 /* "%.3%s" => f points to "3" */ 1812 f--; 1813 } 1814 } 1815 if (*f == '\0') { 1816 /* bogus format "%.1" => go backward, f points to "1" */ 1817 f--; 1818 } 1819 if (p_width != NULL) 1820 *p_width = width; 1821 if (p_precision != NULL) 1822 *p_precision = precision; 1823 1824 /* Handle %ld, %lu, %lld and %llu. */ 1825 longflag = 0; 1826 longlongflag = 0; 1827 size_tflag = 0; 1828 1829 if (*f == 'l') { 1830 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') { 1831 longflag = 1; 1832 ++f; 1833 } 1834#ifdef HAVE_LONG_LONG 1835 else if (f[1] == 'l' && 1836 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) { 1837 longlongflag = 1; 1838 f += 2; 1839 } 1840#endif 1841 } 1842 /* handle the size_t flag. */ 1843 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) { 1844 size_tflag = 1; 1845 ++f; 1846 } 1847 if (p_longflag != NULL) 1848 *p_longflag = longflag; 1849 if (p_longlongflag != NULL) 1850 *p_longlongflag = longlongflag; 1851 if (p_size_tflag != NULL) 1852 *p_size_tflag = size_tflag; 1853 return f; 1854} 1855 1856/* maximum number of characters required for output of %ld. 
21 characters 1857 allows for 64-bit integers (in decimal) and an optional sign. */ 1858#define MAX_LONG_CHARS 21 1859/* maximum number of characters required for output of %lld. 1860 We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits, 1861 plus 1 for the sign. 53/22 is an upper bound for log10(256). */ 1862#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22) 1863 1864PyObject * 1865PyUnicode_FromFormatV(const char *format, va_list vargs) 1866{ 1867 va_list count; 1868 Py_ssize_t callcount = 0; 1869 PyObject **callresults = NULL; 1870 PyObject **callresult = NULL; 1871 Py_ssize_t n = 0; 1872 int width = 0; 1873 int precision = 0; 1874 int zeropad; 1875 const char* f; 1876 PyUnicodeObject *string; 1877 /* used by sprintf */ 1878 char fmt[61]; /* should be enough for %0width.precisionlld */ 1879 Py_UCS4 maxchar = 127; /* result is ASCII by default */ 1880 Py_UCS4 argmaxchar; 1881 Py_ssize_t numbersize = 0; 1882 char *numberresults = NULL; 1883 char *numberresult = NULL; 1884 Py_ssize_t i; 1885 int kind; 1886 void *data; 1887 1888 Py_VA_COPY(count, vargs); 1889 /* step 1: count the number of %S/%R/%A/%s format specifications 1890 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/ 1891 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the 1892 * result in an array) 1893 * also esimate a upper bound for all the number formats in the string, 1894 * numbers will be formated in step 3 and be keept in a '\0'-separated 1895 * buffer before putting everything together. */ 1896 for (f = format; *f; f++) { 1897 if (*f == '%') { 1898 int longlongflag; 1899 /* skip width or width.precision (eg. 
"1.2" of "%1.2f") */ 1900 f = parse_format_flags(f, &width, NULL, NULL, &longlongflag, NULL); 1901 if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V') 1902 ++callcount; 1903 1904 else if (*f == 'd' || *f=='u' || *f=='i' || *f=='x' || *f=='p') { 1905#ifdef HAVE_LONG_LONG 1906 if (longlongflag) { 1907 if (width < MAX_LONG_LONG_CHARS) 1908 width = MAX_LONG_LONG_CHARS; 1909 } 1910 else 1911#endif 1912 /* MAX_LONG_CHARS is enough to hold a 64-bit integer, 1913 including sign. Decimal takes the most space. This 1914 isn't enough for octal. If a width is specified we 1915 need more (which we allocate later). */ 1916 if (width < MAX_LONG_CHARS) 1917 width = MAX_LONG_CHARS; 1918 1919 /* account for the size + '\0' to separate numbers 1920 inside of the numberresults buffer */ 1921 numbersize += (width + 1); 1922 } 1923 } 1924 else if ((unsigned char)*f > 127) { 1925 PyErr_Format(PyExc_ValueError, 1926 "PyUnicode_FromFormatV() expects an ASCII-encoded format " 1927 "string, got a non-ASCII byte: 0x%02x", 1928 (unsigned char)*f); 1929 return NULL; 1930 } 1931 } 1932 /* step 2: allocate memory for the results of 1933 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */ 1934 if (callcount) { 1935 callresults = PyObject_Malloc(sizeof(PyObject *) * callcount); 1936 if (!callresults) { 1937 PyErr_NoMemory(); 1938 return NULL; 1939 } 1940 callresult = callresults; 1941 } 1942 /* step 2.5: allocate memory for the results of formating numbers */ 1943 if (numbersize) { 1944 numberresults = PyObject_Malloc(numbersize); 1945 if (!numberresults) { 1946 PyErr_NoMemory(); 1947 goto fail; 1948 } 1949 numberresult = numberresults; 1950 } 1951 1952 /* step 3: format numbers and figure out how large a buffer we need */ 1953 for (f = format; *f; f++) { 1954 if (*f == '%') { 1955 const char* p; 1956 int longflag; 1957 int longlongflag; 1958 int size_tflag; 1959 int numprinted; 1960 1961 p = f; 1962 zeropad = (f[1] == '0'); 1963 f = parse_format_flags(f, &width, &precision, 
1964 &longflag, &longlongflag, &size_tflag); 1965 switch (*f) { 1966 case 'c': 1967 { 1968 Py_UCS4 ordinal = va_arg(count, int); 1969 maxchar = Py_MAX(maxchar, ordinal); 1970 n++; 1971 break; 1972 } 1973 case '%': 1974 n++; 1975 break; 1976 case 'i': 1977 case 'd': 1978 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad, 1979 width, precision, *f); 1980 if (longflag) 1981 numprinted = sprintf(numberresult, fmt, 1982 va_arg(count, long)); 1983#ifdef HAVE_LONG_LONG 1984 else if (longlongflag) 1985 numprinted = sprintf(numberresult, fmt, 1986 va_arg(count, PY_LONG_LONG)); 1987#endif 1988 else if (size_tflag) 1989 numprinted = sprintf(numberresult, fmt, 1990 va_arg(count, Py_ssize_t)); 1991 else 1992 numprinted = sprintf(numberresult, fmt, 1993 va_arg(count, int)); 1994 n += numprinted; 1995 /* advance by +1 to skip over the '\0' */ 1996 numberresult += (numprinted + 1); 1997 assert(*(numberresult - 1) == '\0'); 1998 assert(*(numberresult - 2) != '\0'); 1999 assert(numprinted >= 0); 2000 assert(numberresult <= numberresults + numbersize); 2001 break; 2002 case 'u': 2003 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad, 2004 width, precision, 'u'); 2005 if (longflag) 2006 numprinted = sprintf(numberresult, fmt, 2007 va_arg(count, unsigned long)); 2008#ifdef HAVE_LONG_LONG 2009 else if (longlongflag) 2010 numprinted = sprintf(numberresult, fmt, 2011 va_arg(count, unsigned PY_LONG_LONG)); 2012#endif 2013 else if (size_tflag) 2014 numprinted = sprintf(numberresult, fmt, 2015 va_arg(count, size_t)); 2016 else 2017 numprinted = sprintf(numberresult, fmt, 2018 va_arg(count, unsigned int)); 2019 n += numprinted; 2020 numberresult += (numprinted + 1); 2021 assert(*(numberresult - 1) == '\0'); 2022 assert(*(numberresult - 2) != '\0'); 2023 assert(numprinted >= 0); 2024 assert(numberresult <= numberresults + numbersize); 2025 break; 2026 case 'x': 2027 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x'); 2028 numprinted = sprintf(numberresult, fmt, va_arg(count, 
int)); 2029 n += numprinted; 2030 numberresult += (numprinted + 1); 2031 assert(*(numberresult - 1) == '\0'); 2032 assert(*(numberresult - 2) != '\0'); 2033 assert(numprinted >= 0); 2034 assert(numberresult <= numberresults + numbersize); 2035 break; 2036 case 'p': 2037 numprinted = sprintf(numberresult, "%p", va_arg(count, void*)); 2038 /* %p is ill-defined: ensure leading 0x. */ 2039 if (numberresult[1] == 'X') 2040 numberresult[1] = 'x'; 2041 else if (numberresult[1] != 'x') { 2042 memmove(numberresult + 2, numberresult, 2043 strlen(numberresult) + 1); 2044 numberresult[0] = '0'; 2045 numberresult[1] = 'x'; 2046 numprinted += 2; 2047 } 2048 n += numprinted; 2049 numberresult += (numprinted + 1); 2050 assert(*(numberresult - 1) == '\0'); 2051 assert(*(numberresult - 2) != '\0'); 2052 assert(numprinted >= 0); 2053 assert(numberresult <= numberresults + numbersize); 2054 break; 2055 case 's': 2056 { 2057 /* UTF-8 */ 2058 const char *s = va_arg(count, const char*); 2059 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace"); 2060 if (!str) 2061 goto fail; 2062 /* since PyUnicode_DecodeUTF8 returns already flexible 2063 unicode objects, there is no need to call ready on them */ 2064 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str); 2065 maxchar = Py_MAX(maxchar, argmaxchar); 2066 n += PyUnicode_GET_LENGTH(str); 2067 /* Remember the str and switch to the next slot */ 2068 *callresult++ = str; 2069 break; 2070 } 2071 case 'U': 2072 { 2073 PyObject *obj = va_arg(count, PyObject *); 2074 assert(obj && _PyUnicode_CHECK(obj)); 2075 if (PyUnicode_READY(obj) == -1) 2076 goto fail; 2077 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj); 2078 maxchar = Py_MAX(maxchar, argmaxchar); 2079 n += PyUnicode_GET_LENGTH(obj); 2080 break; 2081 } 2082 case 'V': 2083 { 2084 PyObject *obj = va_arg(count, PyObject *); 2085 const char *str = va_arg(count, const char *); 2086 PyObject *str_obj; 2087 assert(obj || str); 2088 assert(!obj || _PyUnicode_CHECK(obj)); 2089 if (obj) { 2090 if 
(PyUnicode_READY(obj) == -1) 2091 goto fail; 2092 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj); 2093 maxchar = Py_MAX(maxchar, argmaxchar); 2094 n += PyUnicode_GET_LENGTH(obj); 2095 *callresult++ = NULL; 2096 } 2097 else { 2098 str_obj = PyUnicode_DecodeUTF8(str, strlen(str), "replace"); 2099 if (!str_obj) 2100 goto fail; 2101 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str_obj); 2102 maxchar = Py_MAX(maxchar, argmaxchar); 2103 n += PyUnicode_GET_LENGTH(str_obj); 2104 *callresult++ = str_obj; 2105 } 2106 break; 2107 } 2108 case 'S': 2109 { 2110 PyObject *obj = va_arg(count, PyObject *); 2111 PyObject *str; 2112 assert(obj); 2113 str = PyObject_Str(obj); 2114 if (!str || PyUnicode_READY(str) == -1) 2115 goto fail; 2116 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str); 2117 maxchar = Py_MAX(maxchar, argmaxchar); 2118 n += PyUnicode_GET_LENGTH(str); 2119 /* Remember the str and switch to the next slot */ 2120 *callresult++ = str; 2121 break; 2122 } 2123 case 'R': 2124 { 2125 PyObject *obj = va_arg(count, PyObject *); 2126 PyObject *repr; 2127 assert(obj); 2128 repr = PyObject_Repr(obj); 2129 if (!repr || PyUnicode_READY(repr) == -1) 2130 goto fail; 2131 argmaxchar = PyUnicode_MAX_CHAR_VALUE(repr); 2132 maxchar = Py_MAX(maxchar, argmaxchar); 2133 n += PyUnicode_GET_LENGTH(repr); 2134 /* Remember the repr and switch to the next slot */ 2135 *callresult++ = repr; 2136 break; 2137 } 2138 case 'A': 2139 { 2140 PyObject *obj = va_arg(count, PyObject *); 2141 PyObject *ascii; 2142 assert(obj); 2143 ascii = PyObject_ASCII(obj); 2144 if (!ascii || PyUnicode_READY(ascii) == -1) 2145 goto fail; 2146 argmaxchar = PyUnicode_MAX_CHAR_VALUE(ascii); 2147 maxchar = Py_MAX(maxchar, argmaxchar); 2148 n += PyUnicode_GET_LENGTH(ascii); 2149 /* Remember the repr and switch to the next slot */ 2150 *callresult++ = ascii; 2151 break; 2152 } 2153 default: 2154 /* if we stumble upon an unknown 2155 formatting code, copy the rest of 2156 the format string to the output 2157 string. 
(we cannot just skip the 2158 code, since there's no way to know 2159 what's in the argument list) */ 2160 n += strlen(p); 2161 goto expand; 2162 } 2163 } else 2164 n++; 2165 } 2166 expand: 2167 /* step 4: fill the buffer */ 2168 /* Since we've analyzed how much space we need, 2169 we don't have to resize the string. 2170 There can be no errors beyond this point. */ 2171 string = (PyUnicodeObject *)PyUnicode_New(n, maxchar); 2172 if (!string) 2173 goto fail; 2174 kind = PyUnicode_KIND(string); 2175 data = PyUnicode_DATA(string); 2176 callresult = callresults; 2177 numberresult = numberresults; 2178 2179 for (i = 0, f = format; *f; f++) { 2180 if (*f == '%') { 2181 const char* p; 2182 2183 p = f; 2184 f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL); 2185 /* checking for == because the last argument could be a empty 2186 string, which causes i to point to end, the assert at the end of 2187 the loop */ 2188 assert(i <= PyUnicode_GET_LENGTH(string)); 2189 2190 switch (*f) { 2191 case 'c': 2192 { 2193 const int ordinal = va_arg(vargs, int); 2194 PyUnicode_WRITE(kind, data, i++, ordinal); 2195 break; 2196 } 2197 case 'i': 2198 case 'd': 2199 case 'u': 2200 case 'x': 2201 case 'p': 2202 /* unused, since we already have the result */ 2203 if (*f == 'p') 2204 (void) va_arg(vargs, void *); 2205 else 2206 (void) va_arg(vargs, int); 2207 /* extract the result from numberresults and append. 
*/ 2208 for (; *numberresult; ++i, ++numberresult) 2209 PyUnicode_WRITE(kind, data, i, *numberresult); 2210 /* skip over the separating '\0' */ 2211 assert(*numberresult == '\0'); 2212 numberresult++; 2213 assert(numberresult <= numberresults + numbersize); 2214 break; 2215 case 's': 2216 { 2217 /* unused, since we already have the result */ 2218 Py_ssize_t size; 2219 (void) va_arg(vargs, char *); 2220 size = PyUnicode_GET_LENGTH(*callresult); 2221 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string)); 2222 if (PyUnicode_CopyCharacters((PyObject*)string, i, 2223 *callresult, 0, 2224 size) < 0) 2225 goto fail; 2226 i += size; 2227 /* We're done with the unicode()/repr() => forget it */ 2228 Py_DECREF(*callresult); 2229 /* switch to next unicode()/repr() result */ 2230 ++callresult; 2231 break; 2232 } 2233 case 'U': 2234 { 2235 PyObject *obj = va_arg(vargs, PyObject *); 2236 Py_ssize_t size; 2237 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string)); 2238 size = PyUnicode_GET_LENGTH(obj); 2239 if (PyUnicode_CopyCharacters((PyObject*)string, i, 2240 obj, 0, 2241 size) < 0) 2242 goto fail; 2243 i += size; 2244 break; 2245 } 2246 case 'V': 2247 { 2248 Py_ssize_t size; 2249 PyObject *obj = va_arg(vargs, PyObject *); 2250 va_arg(vargs, const char *); 2251 if (obj) { 2252 size = PyUnicode_GET_LENGTH(obj); 2253 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string)); 2254 if (PyUnicode_CopyCharacters((PyObject*)string, i, 2255 obj, 0, 2256 size) < 0) 2257 goto fail; 2258 i += size; 2259 } else { 2260 size = PyUnicode_GET_LENGTH(*callresult); 2261 assert(PyUnicode_KIND(*callresult) <= 2262 PyUnicode_KIND(string)); 2263 if (PyUnicode_CopyCharacters((PyObject*)string, i, 2264 *callresult, 2265 0, size) < 0) 2266 goto fail; 2267 i += size; 2268 Py_DECREF(*callresult); 2269 } 2270 ++callresult; 2271 break; 2272 } 2273 case 'S': 2274 case 'R': 2275 case 'A': 2276 { 2277 /* unused, since we already have the result */ 2278 (void) va_arg(vargs, PyObject *); 2279 
assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string)); 2280 if (PyUnicode_CopyCharacters((PyObject*)string, i, 2281 *callresult, 0, 2282 PyUnicode_GET_LENGTH(*callresult)) < 0) 2283 goto fail; 2284 i += PyUnicode_GET_LENGTH(*callresult); 2285 /* We're done with the unicode()/repr() => forget it */ 2286 Py_DECREF(*callresult); 2287 /* switch to next unicode()/repr() result */ 2288 ++callresult; 2289 break; 2290 } 2291 case '%': 2292 PyUnicode_WRITE(kind, data, i++, '%'); 2293 break; 2294 default: 2295 for (; *p; ++p, ++i) 2296 PyUnicode_WRITE(kind, data, i, *p); 2297 assert(i == PyUnicode_GET_LENGTH(string)); 2298 goto end; 2299 } 2300 } 2301 else { 2302 assert(i < PyUnicode_GET_LENGTH(string)); 2303 PyUnicode_WRITE(kind, data, i++, *f); 2304 } 2305 } 2306 assert(i == PyUnicode_GET_LENGTH(string)); 2307 2308 end: 2309 if (callresults) 2310 PyObject_Free(callresults); 2311 if (numberresults) 2312 PyObject_Free(numberresults); 2313 return (PyObject *)string; 2314 fail: 2315 if (callresults) { 2316 PyObject **callresult2 = callresults; 2317 while (callresult2 < callresult) { 2318 Py_XDECREF(*callresult2); 2319 ++callresult2; 2320 } 2321 PyObject_Free(callresults); 2322 } 2323 if (numberresults) 2324 PyObject_Free(numberresults); 2325 return NULL; 2326} 2327 2328PyObject * 2329PyUnicode_FromFormat(const char *format, ...) 2330{ 2331 PyObject* ret; 2332 va_list vargs; 2333 2334#ifdef HAVE_STDARG_PROTOTYPES 2335 va_start(vargs, format); 2336#else 2337 va_start(vargs); 2338#endif 2339 ret = PyUnicode_FromFormatV(format, vargs); 2340 va_end(vargs); 2341 return ret; 2342} 2343 2344#ifdef HAVE_WCHAR_H 2345 2346/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString(): 2347 convert a Unicode object to a wide character string. 2348 2349 - If w is NULL: return the number of wide characters (including the null 2350 character) required to convert the unicode object. Ignore size argument. 
2351 2352 - Otherwise: return the number of wide characters (excluding the null 2353 character) written into w. Write at most size wide characters (including 2354 the null character). */ 2355static Py_ssize_t 2356unicode_aswidechar(PyUnicodeObject *unicode, 2357 wchar_t *w, 2358 Py_ssize_t size) 2359{ 2360 Py_ssize_t res; 2361 const wchar_t *wstr; 2362 2363 wstr = PyUnicode_AsUnicodeAndSize((PyObject *)unicode, &res); 2364 if (wstr == NULL) 2365 return -1; 2366 2367 if (w != NULL) { 2368 if (size > res) 2369 size = res + 1; 2370 else 2371 res = size; 2372 Py_MEMCPY(w, wstr, size * sizeof(wchar_t)); 2373 return res; 2374 } 2375 else 2376 return res + 1; 2377} 2378 2379Py_ssize_t 2380PyUnicode_AsWideChar(PyObject *unicode, 2381 wchar_t *w, 2382 Py_ssize_t size) 2383{ 2384 if (unicode == NULL) { 2385 PyErr_BadInternalCall(); 2386 return -1; 2387 } 2388 return unicode_aswidechar((PyUnicodeObject*)unicode, w, size); 2389} 2390 2391wchar_t* 2392PyUnicode_AsWideCharString(PyObject *unicode, 2393 Py_ssize_t *size) 2394{ 2395 wchar_t* buffer; 2396 Py_ssize_t buflen; 2397 2398 if (unicode == NULL) { 2399 PyErr_BadInternalCall(); 2400 return NULL; 2401 } 2402 2403 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, NULL, 0); 2404 if (buflen == -1) 2405 return NULL; 2406 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) { 2407 PyErr_NoMemory(); 2408 return NULL; 2409 } 2410 2411 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t)); 2412 if (buffer == NULL) { 2413 PyErr_NoMemory(); 2414 return NULL; 2415 } 2416 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, buffer, buflen); 2417 if (buflen == -1) 2418 return NULL; 2419 if (size != NULL) 2420 *size = buflen; 2421 return buffer; 2422} 2423 2424#endif /* HAVE_WCHAR_H */ 2425 2426PyObject * 2427PyUnicode_FromOrdinal(int ordinal) 2428{ 2429 PyObject *v; 2430 if (ordinal < 0 || ordinal > 0x10ffff) { 2431 PyErr_SetString(PyExc_ValueError, 2432 "chr() arg not in range(0x110000)"); 2433 return NULL; 2434 } 2435 2436 if (ordinal < 
256) 2437 return get_latin1_char(ordinal); 2438 2439 v = PyUnicode_New(1, ordinal); 2440 if (v == NULL) 2441 return NULL; 2442 PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal); 2443 return v; 2444} 2445 2446PyObject * 2447PyUnicode_FromObject(register PyObject *obj) 2448{ 2449 /* XXX Perhaps we should make this API an alias of 2450 PyObject_Str() instead ?! */ 2451 if (PyUnicode_CheckExact(obj)) { 2452 if (PyUnicode_READY(obj)) 2453 return NULL; 2454 Py_INCREF(obj); 2455 return obj; 2456 } 2457 if (PyUnicode_Check(obj)) { 2458 /* For a Unicode subtype that's not a Unicode object, 2459 return a true Unicode object with the same data. */ 2460 return PyUnicode_Copy(obj); 2461 } 2462 PyErr_Format(PyExc_TypeError, 2463 "Can't convert '%.100s' object to str implicitly", 2464 Py_TYPE(obj)->tp_name); 2465 return NULL; 2466} 2467 2468PyObject * 2469PyUnicode_FromEncodedObject(register PyObject *obj, 2470 const char *encoding, 2471 const char *errors) 2472{ 2473 Py_buffer buffer; 2474 PyObject *v; 2475 2476 if (obj == NULL) { 2477 PyErr_BadInternalCall(); 2478 return NULL; 2479 } 2480 2481 /* Decoding bytes objects is the most common case and should be fast */ 2482 if (PyBytes_Check(obj)) { 2483 if (PyBytes_GET_SIZE(obj) == 0) { 2484 Py_INCREF(unicode_empty); 2485 v = unicode_empty; 2486 } 2487 else { 2488 v = PyUnicode_Decode( 2489 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj), 2490 encoding, errors); 2491 } 2492 return v; 2493 } 2494 2495 if (PyUnicode_Check(obj)) { 2496 PyErr_SetString(PyExc_TypeError, 2497 "decoding str is not supported"); 2498 return NULL; 2499 } 2500 2501 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */ 2502 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) { 2503 PyErr_Format(PyExc_TypeError, 2504 "coercing to str: need bytes, bytearray " 2505 "or buffer-like object, %.80s found", 2506 Py_TYPE(obj)->tp_name); 2507 return NULL; 2508 } 2509 2510 if (buffer.len == 0) { 2511 Py_INCREF(unicode_empty); 2512 v = 
unicode_empty; 2513 } 2514 else 2515 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors); 2516 2517 PyBuffer_Release(&buffer); 2518 return v; 2519} 2520 2521/* Convert encoding to lower case and replace '_' with '-' in order to 2522 catch e.g. UTF_8. Return 0 on error (encoding is longer than lower_len-1), 2523 1 on success. */ 2524static int 2525normalize_encoding(const char *encoding, 2526 char *lower, 2527 size_t lower_len) 2528{ 2529 const char *e; 2530 char *l; 2531 char *l_end; 2532 2533 e = encoding; 2534 l = lower; 2535 l_end = &lower[lower_len - 1]; 2536 while (*e) { 2537 if (l == l_end) 2538 return 0; 2539 if (Py_ISUPPER(*e)) { 2540 *l++ = Py_TOLOWER(*e++); 2541 } 2542 else if (*e == '_') { 2543 *l++ = '-'; 2544 e++; 2545 } 2546 else { 2547 *l++ = *e++; 2548 } 2549 } 2550 *l = '\0'; 2551 return 1; 2552} 2553 2554PyObject * 2555PyUnicode_Decode(const char *s, 2556 Py_ssize_t size, 2557 const char *encoding, 2558 const char *errors) 2559{ 2560 PyObject *buffer = NULL, *unicode; 2561 Py_buffer info; 2562 char lower[11]; /* Enough for any encoding shortcut */ 2563 2564 if (encoding == NULL) 2565 return PyUnicode_DecodeUTF8(s, size, errors); 2566 2567 /* Shortcuts for common default encodings */ 2568 if (normalize_encoding(encoding, lower, sizeof(lower))) { 2569 if ((strcmp(lower, "utf-8") == 0) || 2570 (strcmp(lower, "utf8") == 0)) 2571 return PyUnicode_DecodeUTF8(s, size, errors); 2572 else if ((strcmp(lower, "latin-1") == 0) || 2573 (strcmp(lower, "latin1") == 0) || 2574 (strcmp(lower, "iso-8859-1") == 0)) 2575 return PyUnicode_DecodeLatin1(s, size, errors); 2576#ifdef HAVE_MBCS 2577 else if (strcmp(lower, "mbcs") == 0) 2578 return PyUnicode_DecodeMBCS(s, size, errors); 2579#endif 2580 else if (strcmp(lower, "ascii") == 0) 2581 return PyUnicode_DecodeASCII(s, size, errors); 2582 else if (strcmp(lower, "utf-16") == 0) 2583 return PyUnicode_DecodeUTF16(s, size, errors, 0); 2584 else if (strcmp(lower, "utf-32") == 0) 2585 return 
PyUnicode_DecodeUTF32(s, size, errors, 0); 2586 } 2587 2588 /* Decode via the codec registry */ 2589 buffer = NULL; 2590 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0) 2591 goto onError; 2592 buffer = PyMemoryView_FromBuffer(&info); 2593 if (buffer == NULL) 2594 goto onError; 2595 unicode = PyCodec_Decode(buffer, encoding, errors); 2596 if (unicode == NULL) 2597 goto onError; 2598 if (!PyUnicode_Check(unicode)) { 2599 PyErr_Format(PyExc_TypeError, 2600 "decoder did not return a str object (type=%.400s)", 2601 Py_TYPE(unicode)->tp_name); 2602 Py_DECREF(unicode); 2603 goto onError; 2604 } 2605 Py_DECREF(buffer); 2606 if (_PyUnicode_READY_REPLACE(&unicode)) { 2607 Py_DECREF(unicode); 2608 return NULL; 2609 } 2610 return unicode; 2611 2612 onError: 2613 Py_XDECREF(buffer); 2614 return NULL; 2615} 2616 2617PyObject * 2618PyUnicode_AsDecodedObject(PyObject *unicode, 2619 const char *encoding, 2620 const char *errors) 2621{ 2622 PyObject *v; 2623 2624 if (!PyUnicode_Check(unicode)) { 2625 PyErr_BadArgument(); 2626 goto onError; 2627 } 2628 2629 if (encoding == NULL) 2630 encoding = PyUnicode_GetDefaultEncoding(); 2631 2632 /* Decode via the codec registry */ 2633 v = PyCodec_Decode(unicode, encoding, errors); 2634 if (v == NULL) 2635 goto onError; 2636 return v; 2637 2638 onError: 2639 return NULL; 2640} 2641 2642PyObject * 2643PyUnicode_AsDecodedUnicode(PyObject *unicode, 2644 const char *encoding, 2645 const char *errors) 2646{ 2647 PyObject *v; 2648 2649 if (!PyUnicode_Check(unicode)) { 2650 PyErr_BadArgument(); 2651 goto onError; 2652 } 2653 2654 if (encoding == NULL) 2655 encoding = PyUnicode_GetDefaultEncoding(); 2656 2657 /* Decode via the codec registry */ 2658 v = PyCodec_Decode(unicode, encoding, errors); 2659 if (v == NULL) 2660 goto onError; 2661 if (!PyUnicode_Check(v)) { 2662 PyErr_Format(PyExc_TypeError, 2663 "decoder did not return a str object (type=%.400s)", 2664 Py_TYPE(v)->tp_name); 2665 Py_DECREF(v); 2666 goto onError; 2667 
} 2668 return v; 2669 2670 onError: 2671 return NULL; 2672} 2673 2674PyObject * 2675PyUnicode_Encode(const Py_UNICODE *s, 2676 Py_ssize_t size, 2677 const char *encoding, 2678 const char *errors) 2679{ 2680 PyObject *v, *unicode; 2681 2682 unicode = PyUnicode_FromUnicode(s, size); 2683 if (unicode == NULL) 2684 return NULL; 2685 v = PyUnicode_AsEncodedString(unicode, encoding, errors); 2686 Py_DECREF(unicode); 2687 return v; 2688} 2689 2690PyObject * 2691PyUnicode_AsEncodedObject(PyObject *unicode, 2692 const char *encoding, 2693 const char *errors) 2694{ 2695 PyObject *v; 2696 2697 if (!PyUnicode_Check(unicode)) { 2698 PyErr_BadArgument(); 2699 goto onError; 2700 } 2701 2702 if (encoding == NULL) 2703 encoding = PyUnicode_GetDefaultEncoding(); 2704 2705 /* Encode via the codec registry */ 2706 v = PyCodec_Encode(unicode, encoding, errors); 2707 if (v == NULL) 2708 goto onError; 2709 return v; 2710 2711 onError: 2712 return NULL; 2713} 2714 2715PyObject * 2716PyUnicode_EncodeFSDefault(PyObject *unicode) 2717{ 2718#ifdef HAVE_MBCS 2719 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode), 2720 PyUnicode_GET_SIZE(unicode), 2721 NULL); 2722#elif defined(__APPLE__) 2723 return _PyUnicode_AsUTF8String(unicode, "surrogateescape"); 2724#else 2725 PyInterpreterState *interp = PyThreadState_GET()->interp; 2726 /* Bootstrap check: if the filesystem codec is implemented in Python, we 2727 cannot use it to encode and decode filenames before it is loaded. Load 2728 the Python codec requires to encode at least its own filename. Use the C 2729 version of the locale codec until the codec registry is initialized and 2730 the Python codec is loaded. 2731 2732 Py_FileSystemDefaultEncoding is shared between all interpreters, we 2733 cannot only rely on it: check also interp->fscodec_initialized for 2734 subinterpreters. 
*/ 2735 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) { 2736 return PyUnicode_AsEncodedString(unicode, 2737 Py_FileSystemDefaultEncoding, 2738 "surrogateescape"); 2739 } 2740 else { 2741 /* locale encoding with surrogateescape */ 2742 wchar_t *wchar; 2743 char *bytes; 2744 PyObject *bytes_obj; 2745 size_t error_pos; 2746 2747 wchar = PyUnicode_AsWideCharString(unicode, NULL); 2748 if (wchar == NULL) 2749 return NULL; 2750 bytes = _Py_wchar2char(wchar, &error_pos); 2751 if (bytes == NULL) { 2752 if (error_pos != (size_t)-1) { 2753 char *errmsg = strerror(errno); 2754 PyObject *exc = NULL; 2755 if (errmsg == NULL) 2756 errmsg = "Py_wchar2char() failed"; 2757 raise_encode_exception(&exc, 2758 "filesystemencoding", 2759 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode), 2760 error_pos, error_pos+1, 2761 errmsg); 2762 Py_XDECREF(exc); 2763 } 2764 else 2765 PyErr_NoMemory(); 2766 PyMem_Free(wchar); 2767 return NULL; 2768 } 2769 PyMem_Free(wchar); 2770 2771 bytes_obj = PyBytes_FromString(bytes); 2772 PyMem_Free(bytes); 2773 return bytes_obj; 2774 } 2775#endif 2776} 2777 2778PyObject * 2779PyUnicode_AsEncodedString(PyObject *unicode, 2780 const char *encoding, 2781 const char *errors) 2782{ 2783 PyObject *v; 2784 char lower[11]; /* Enough for any encoding shortcut */ 2785 2786 if (!PyUnicode_Check(unicode)) { 2787 PyErr_BadArgument(); 2788 return NULL; 2789 } 2790 2791 if (encoding == NULL) { 2792 if (errors == NULL || strcmp(errors, "strict") == 0) 2793 return _PyUnicode_AsUTF8String(unicode, NULL); 2794 else 2795 return _PyUnicode_AsUTF8String(unicode, errors); 2796 } 2797 2798 /* Shortcuts for common default encodings */ 2799 if (normalize_encoding(encoding, lower, sizeof(lower))) { 2800 if ((strcmp(lower, "utf-8") == 0) || 2801 (strcmp(lower, "utf8") == 0)) 2802 { 2803 if (errors == NULL || strcmp(errors, "strict") == 0) 2804 return _PyUnicode_AsUTF8String(unicode, NULL); 2805 else 2806 return _PyUnicode_AsUTF8String(unicode, errors); 2807 } 
2808 else if ((strcmp(lower, "latin-1") == 0) || 2809 (strcmp(lower, "latin1") == 0) || 2810 (strcmp(lower, "iso-8859-1") == 0)) 2811 return _PyUnicode_AsLatin1String(unicode, errors); 2812#ifdef HAVE_MBCS 2813 else if (strcmp(lower, "mbcs") == 0) 2814 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode), 2815 PyUnicode_GET_SIZE(unicode), 2816 errors); 2817#endif 2818 else if (strcmp(lower, "ascii") == 0) 2819 return _PyUnicode_AsASCIIString(unicode, errors); 2820 } 2821 2822 /* Encode via the codec registry */ 2823 v = PyCodec_Encode(unicode, encoding, errors); 2824 if (v == NULL) 2825 return NULL; 2826 2827 /* The normal path */ 2828 if (PyBytes_Check(v)) 2829 return v; 2830 2831 /* If the codec returns a buffer, raise a warning and convert to bytes */ 2832 if (PyByteArray_Check(v)) { 2833 int error; 2834 PyObject *b; 2835 2836 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1, 2837 "encoder %s returned bytearray instead of bytes", 2838 encoding); 2839 if (error) { 2840 Py_DECREF(v); 2841 return NULL; 2842 } 2843 2844 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v)); 2845 Py_DECREF(v); 2846 return b; 2847 } 2848 2849 PyErr_Format(PyExc_TypeError, 2850 "encoder did not return a bytes object (type=%.400s)", 2851 Py_TYPE(v)->tp_name); 2852 Py_DECREF(v); 2853 return NULL; 2854} 2855 2856PyObject * 2857PyUnicode_AsEncodedUnicode(PyObject *unicode, 2858 const char *encoding, 2859 const char *errors) 2860{ 2861 PyObject *v; 2862 2863 if (!PyUnicode_Check(unicode)) { 2864 PyErr_BadArgument(); 2865 goto onError; 2866 } 2867 2868 if (encoding == NULL) 2869 encoding = PyUnicode_GetDefaultEncoding(); 2870 2871 /* Encode via the codec registry */ 2872 v = PyCodec_Encode(unicode, encoding, errors); 2873 if (v == NULL) 2874 goto onError; 2875 if (!PyUnicode_Check(v)) { 2876 PyErr_Format(PyExc_TypeError, 2877 "encoder did not return an str object (type=%.400s)", 2878 Py_TYPE(v)->tp_name); 2879 Py_DECREF(v); 2880 goto onError; 2881 } 2882 return v; 2883 
2884 onError: 2885 return NULL; 2886} 2887 2888PyObject* 2889PyUnicode_DecodeFSDefault(const char *s) { 2890 Py_ssize_t size = (Py_ssize_t)strlen(s); 2891 return PyUnicode_DecodeFSDefaultAndSize(s, size); 2892} 2893 2894PyObject* 2895PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size) 2896{ 2897#ifdef HAVE_MBCS 2898 return PyUnicode_DecodeMBCS(s, size, NULL); 2899#elif defined(__APPLE__) 2900 return PyUnicode_DecodeUTF8(s, size, "surrogateescape"); 2901#else 2902 PyInterpreterState *interp = PyThreadState_GET()->interp; 2903 /* Bootstrap check: if the filesystem codec is implemented in Python, we 2904 cannot use it to encode and decode filenames before it is loaded. Load 2905 the Python codec requires to encode at least its own filename. Use the C 2906 version of the locale codec until the codec registry is initialized and 2907 the Python codec is loaded. 2908 2909 Py_FileSystemDefaultEncoding is shared between all interpreters, we 2910 cannot only rely on it: check also interp->fscodec_initialized for 2911 subinterpreters. 
*/ 2912 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) { 2913 return PyUnicode_Decode(s, size, 2914 Py_FileSystemDefaultEncoding, 2915 "surrogateescape"); 2916 } 2917 else { 2918 /* locale encoding with surrogateescape */ 2919 wchar_t *wchar; 2920 PyObject *unicode; 2921 size_t len; 2922 2923 if (s[size] != '\0' || size != strlen(s)) { 2924 PyErr_SetString(PyExc_TypeError, "embedded NUL character"); 2925 return NULL; 2926 } 2927 2928 wchar = _Py_char2wchar(s, &len); 2929 if (wchar == NULL) 2930 return PyErr_NoMemory(); 2931 2932 unicode = PyUnicode_FromWideChar(wchar, len); 2933 PyMem_Free(wchar); 2934 return unicode; 2935 } 2936#endif 2937} 2938 2939 2940int 2941PyUnicode_FSConverter(PyObject* arg, void* addr) 2942{ 2943 PyObject *output = NULL; 2944 Py_ssize_t size; 2945 void *data; 2946 if (arg == NULL) { 2947 Py_DECREF(*(PyObject**)addr); 2948 return 1; 2949 } 2950 if (PyBytes_Check(arg)) { 2951 output = arg; 2952 Py_INCREF(output); 2953 } 2954 else { 2955 arg = PyUnicode_FromObject(arg); 2956 if (!arg) 2957 return 0; 2958 output = PyUnicode_EncodeFSDefault(arg); 2959 Py_DECREF(arg); 2960 if (!output) 2961 return 0; 2962 if (!PyBytes_Check(output)) { 2963 Py_DECREF(output); 2964 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes"); 2965 return 0; 2966 } 2967 } 2968 size = PyBytes_GET_SIZE(output); 2969 data = PyBytes_AS_STRING(output); 2970 if (size != strlen(data)) { 2971 PyErr_SetString(PyExc_TypeError, "embedded NUL character"); 2972 Py_DECREF(output); 2973 return 0; 2974 } 2975 *(PyObject**)addr = output; 2976 return Py_CLEANUP_SUPPORTED; 2977} 2978 2979 2980int 2981PyUnicode_FSDecoder(PyObject* arg, void* addr) 2982{ 2983 PyObject *output = NULL; 2984 if (arg == NULL) { 2985 Py_DECREF(*(PyObject**)addr); 2986 return 1; 2987 } 2988 if (PyUnicode_Check(arg)) { 2989 if (PyUnicode_READY(arg)) 2990 return 0; 2991 output = arg; 2992 Py_INCREF(output); 2993 } 2994 else { 2995 arg = PyBytes_FromObject(arg); 2996 if (!arg) 2997 
return 0; 2998 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg), 2999 PyBytes_GET_SIZE(arg)); 3000 Py_DECREF(arg); 3001 if (!output) 3002 return 0; 3003 if (!PyUnicode_Check(output)) { 3004 Py_DECREF(output); 3005 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode"); 3006 return 0; 3007 } 3008 } 3009 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output), 3010 PyUnicode_GET_LENGTH(output), 0, 1)) { 3011 PyErr_SetString(PyExc_TypeError, "embedded NUL character"); 3012 Py_DECREF(output); 3013 return 0; 3014 } 3015 *(PyObject**)addr = output; 3016 return Py_CLEANUP_SUPPORTED; 3017} 3018 3019 3020char* 3021PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize) 3022{ 3023 PyObject *bytes; 3024 PyUnicodeObject *u = (PyUnicodeObject *)unicode; 3025 3026 if (!PyUnicode_Check(unicode)) { 3027 PyErr_BadArgument(); 3028 return NULL; 3029 } 3030 if (PyUnicode_READY(u) == -1) 3031 return NULL; 3032 3033 if (PyUnicode_UTF8(unicode) == NULL) { 3034 assert(!PyUnicode_IS_COMPACT_ASCII(unicode)); 3035 bytes = _PyUnicode_AsUTF8String(unicode, "strict"); 3036 if (bytes == NULL) 3037 return NULL; 3038 _PyUnicode_UTF8(u) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1); 3039 if (_PyUnicode_UTF8(u) == NULL) { 3040 Py_DECREF(bytes); 3041 return NULL; 3042 } 3043 _PyUnicode_UTF8_LENGTH(u) = PyBytes_GET_SIZE(bytes); 3044 Py_MEMCPY(_PyUnicode_UTF8(u), PyBytes_AS_STRING(bytes), _PyUnicode_UTF8_LENGTH(u) + 1); 3045 Py_DECREF(bytes); 3046 } 3047 3048 if (psize) 3049 *psize = PyUnicode_UTF8_LENGTH(unicode); 3050 return PyUnicode_UTF8(unicode); 3051} 3052 3053char* 3054PyUnicode_AsUTF8(PyObject *unicode) 3055{ 3056 return PyUnicode_AsUTF8AndSize(unicode, NULL); 3057} 3058 3059#ifdef Py_DEBUG 3060int unicode_as_unicode_calls = 0; 3061#endif 3062 3063 3064Py_UNICODE * 3065PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size) 3066{ 3067 PyUnicodeObject *u; 3068 const unsigned char *one_byte; 3069#if SIZEOF_WCHAR_T == 4 3070 const Py_UCS2 
*two_bytes; 3071#else 3072 const Py_UCS4 *four_bytes; 3073 const Py_UCS4 *ucs4_end; 3074 Py_ssize_t num_surrogates; 3075#endif 3076 wchar_t *w; 3077 wchar_t *wchar_end; 3078 3079 if (!PyUnicode_Check(unicode)) { 3080 PyErr_BadArgument(); 3081 return NULL; 3082 } 3083 u = (PyUnicodeObject*)unicode; 3084 if (_PyUnicode_WSTR(u) == NULL) { 3085 /* Non-ASCII compact unicode object */ 3086 assert(_PyUnicode_KIND(u) != 0); 3087 assert(PyUnicode_IS_READY(u)); 3088 3089#ifdef Py_DEBUG 3090 ++unicode_as_unicode_calls; 3091#endif 3092 3093 if (PyUnicode_KIND(u) == PyUnicode_4BYTE_KIND) { 3094#if SIZEOF_WCHAR_T == 2 3095 four_bytes = PyUnicode_4BYTE_DATA(u); 3096 ucs4_end = four_bytes + _PyUnicode_LENGTH(u); 3097 num_surrogates = 0; 3098 3099 for (; four_bytes < ucs4_end; ++four_bytes) { 3100 if (*four_bytes > 0xFFFF) 3101 ++num_surrogates; 3102 } 3103 3104 _PyUnicode_WSTR(u) = (wchar_t *) PyObject_MALLOC( 3105 sizeof(wchar_t) * (_PyUnicode_LENGTH(u) + 1 + num_surrogates)); 3106 if (!_PyUnicode_WSTR(u)) { 3107 PyErr_NoMemory(); 3108 return NULL; 3109 } 3110 _PyUnicode_WSTR_LENGTH(u) = _PyUnicode_LENGTH(u) + num_surrogates; 3111 3112 w = _PyUnicode_WSTR(u); 3113 wchar_end = w + _PyUnicode_WSTR_LENGTH(u); 3114 four_bytes = PyUnicode_4BYTE_DATA(u); 3115 for (; four_bytes < ucs4_end; ++four_bytes, ++w) { 3116 if (*four_bytes > 0xFFFF) { 3117 /* encode surrogate pair in this case */ 3118 *w++ = 0xD800 | ((*four_bytes - 0x10000) >> 10); 3119 *w = 0xDC00 | ((*four_bytes - 0x10000) & 0x3FF); 3120 } 3121 else 3122 *w = *four_bytes; 3123 3124 if (w > wchar_end) { 3125 assert(0 && "Miscalculated string end"); 3126 } 3127 } 3128 *w = 0; 3129#else 3130 /* sizeof(wchar_t) == 4 */ 3131 Py_FatalError("Impossible unicode object state, wstr and str " 3132 "should share memory already."); 3133 return NULL; 3134#endif 3135 } 3136 else { 3137 _PyUnicode_WSTR(u) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) * 3138 (_PyUnicode_LENGTH(u) + 1)); 3139 if (!_PyUnicode_WSTR(u)) { 3140 PyErr_NoMemory(); 
3141 return NULL; 3142 } 3143 if (!PyUnicode_IS_COMPACT_ASCII(u)) 3144 _PyUnicode_WSTR_LENGTH(u) = _PyUnicode_LENGTH(u); 3145 w = _PyUnicode_WSTR(u); 3146 wchar_end = w + _PyUnicode_LENGTH(u); 3147 3148 if (PyUnicode_KIND(u) == PyUnicode_1BYTE_KIND) { 3149 one_byte = PyUnicode_1BYTE_DATA(u); 3150 for (; w < wchar_end; ++one_byte, ++w) 3151 *w = *one_byte; 3152 /* null-terminate the wstr */ 3153 *w = 0; 3154 } 3155 else if (PyUnicode_KIND(u) == PyUnicode_2BYTE_KIND) { 3156#if SIZEOF_WCHAR_T == 4 3157 two_bytes = PyUnicode_2BYTE_DATA(u); 3158 for (; w < wchar_end; ++two_bytes, ++w) 3159 *w = *two_bytes; 3160 /* null-terminate the wstr */ 3161 *w = 0; 3162#else 3163 /* sizeof(wchar_t) == 2 */ 3164 PyObject_FREE(_PyUnicode_WSTR(u)); 3165 _PyUnicode_WSTR(u) = NULL; 3166 Py_FatalError("Impossible unicode object state, wstr " 3167 "and str should share memory already."); 3168 return NULL; 3169#endif 3170 } 3171 else { 3172 assert(0 && "This should never happen."); 3173 } 3174 } 3175 } 3176 if (size != NULL) 3177 *size = PyUnicode_WSTR_LENGTH(u); 3178 return _PyUnicode_WSTR(u); 3179} 3180 3181Py_UNICODE * 3182PyUnicode_AsUnicode(PyObject *unicode) 3183{ 3184 return PyUnicode_AsUnicodeAndSize(unicode, NULL); 3185} 3186 3187 3188Py_ssize_t 3189PyUnicode_GetSize(PyObject *unicode) 3190{ 3191 if (!PyUnicode_Check(unicode)) { 3192 PyErr_BadArgument(); 3193 goto onError; 3194 } 3195 return PyUnicode_GET_SIZE(unicode); 3196 3197 onError: 3198 return -1; 3199} 3200 3201Py_ssize_t 3202PyUnicode_GetLength(PyObject *unicode) 3203{ 3204 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) { 3205 PyErr_BadArgument(); 3206 return -1; 3207 } 3208 3209 return PyUnicode_GET_LENGTH(unicode); 3210} 3211 3212Py_UCS4 3213PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index) 3214{ 3215 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) { 3216 PyErr_BadArgument(); 3217 return (Py_UCS4)-1; 3218 } 3219 if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) { 3220 
PyErr_SetString(PyExc_IndexError, "string index out of range"); 3221 return (Py_UCS4)-1; 3222 } 3223 return PyUnicode_READ_CHAR(unicode, index); 3224} 3225 3226int 3227PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch) 3228{ 3229 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) { 3230 PyErr_BadArgument(); 3231 return -1; 3232 } 3233 if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) { 3234 PyErr_SetString(PyExc_IndexError, "string index out of range"); 3235 return -1; 3236 } 3237 if (_PyUnicode_Dirty(unicode)) 3238 return -1; 3239 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode), 3240 index, ch); 3241 return 0; 3242} 3243 3244const char * 3245PyUnicode_GetDefaultEncoding(void) 3246{ 3247 return "utf-8"; 3248} 3249 3250/* create or adjust a UnicodeDecodeError */ 3251static void 3252make_decode_exception(PyObject **exceptionObject, 3253 const char *encoding, 3254 const char *input, Py_ssize_t length, 3255 Py_ssize_t startpos, Py_ssize_t endpos, 3256 const char *reason) 3257{ 3258 if (*exceptionObject == NULL) { 3259 *exceptionObject = PyUnicodeDecodeError_Create( 3260 encoding, input, length, startpos, endpos, reason); 3261 } 3262 else { 3263 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos)) 3264 goto onError; 3265 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos)) 3266 goto onError; 3267 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason)) 3268 goto onError; 3269 } 3270 return; 3271 3272onError: 3273 Py_DECREF(*exceptionObject); 3274 *exceptionObject = NULL; 3275} 3276 3277/* error handling callback helper: 3278 build arguments, call the callback and check the arguments, 3279 if no exception occurred, copy the replacement to the output 3280 and adjust various state variables. 
3281 return 0 on success, -1 on error 3282*/ 3283 3284static int 3285unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler, 3286 const char *encoding, const char *reason, 3287 const char **input, const char **inend, Py_ssize_t *startinpos, 3288 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr, 3289 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr) 3290{ 3291 static char *argparse = "O!n;decoding error handler must return (str, int) tuple"; 3292 3293 PyObject *restuple = NULL; 3294 PyObject *repunicode = NULL; 3295 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output); 3296 Py_ssize_t insize; 3297 Py_ssize_t requiredsize; 3298 Py_ssize_t newpos; 3299 const Py_UNICODE *repptr; 3300 PyObject *inputobj = NULL; 3301 Py_ssize_t repsize; 3302 int res = -1; 3303 3304 if (*errorHandler == NULL) { 3305 *errorHandler = PyCodec_LookupError(errors); 3306 if (*errorHandler == NULL) 3307 goto onError; 3308 } 3309 3310 make_decode_exception(exceptionObject, 3311 encoding, 3312 *input, *inend - *input, 3313 *startinpos, *endinpos, 3314 reason); 3315 if (*exceptionObject == NULL) 3316 goto onError; 3317 3318 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL); 3319 if (restuple == NULL) 3320 goto onError; 3321 if (!PyTuple_Check(restuple)) { 3322 PyErr_SetString(PyExc_TypeError, &argparse[4]); 3323 goto onError; 3324 } 3325 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos)) 3326 goto onError; 3327 3328 /* Copy back the bytes variables, which might have been modified by the 3329 callback */ 3330 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject); 3331 if (!inputobj) 3332 goto onError; 3333 if (!PyBytes_Check(inputobj)) { 3334 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes"); 3335 } 3336 *input = PyBytes_AS_STRING(inputobj); 3337 insize = PyBytes_GET_SIZE(inputobj); 3338 *inend = *input + insize; 3339 /* we can DECREF safely, as the 
exception has another reference, 3340 so the object won't go away. */ 3341 Py_DECREF(inputobj); 3342 3343 if (newpos<0) 3344 newpos = insize+newpos; 3345 if (newpos<0 || newpos>insize) { 3346 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos); 3347 goto onError; 3348 } 3349 3350 /* need more space? (at least enough for what we 3351 have+the replacement+the rest of the string (starting 3352 at the new input position), so we won't have to check space 3353 when there are no errors in the rest of the string) */ 3354 repptr = PyUnicode_AS_UNICODE(repunicode); 3355 repsize = PyUnicode_GET_SIZE(repunicode); 3356 requiredsize = *outpos + repsize + insize-newpos; 3357 if (requiredsize > outsize) { 3358 if (requiredsize<2*outsize) 3359 requiredsize = 2*outsize; 3360 if (PyUnicode_Resize((PyObject**)output, requiredsize) < 0) 3361 goto onError; 3362 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos; 3363 } 3364 *endinpos = newpos; 3365 *inptr = *input + newpos; 3366 Py_UNICODE_COPY(*outptr, repptr, repsize); 3367 *outptr += repsize; 3368 *outpos += repsize; 3369 3370 /* we made it! */ 3371 res = 0; 3372 3373 onError: 3374 Py_XDECREF(restuple); 3375 return res; 3376} 3377 3378/* --- UTF-7 Codec -------------------------------------------------------- */ 3379 3380/* See RFC2152 for details. We encode conservatively and decode liberally. */ 3381 3382/* Three simple macros defining base-64. */ 3383 3384/* Is c a base-64 character? */ 3385 3386#define IS_BASE64(c) \ 3387 (((c) >= 'A' && (c) <= 'Z') || \ 3388 ((c) >= 'a' && (c) <= 'z') || \ 3389 ((c) >= '0' && (c) <= '9') || \ 3390 (c) == '+' || (c) == '/') 3391 3392/* given that c is a base-64 character, what is its base-64 value? */ 3393 3394#define FROM_BASE64(c) \ 3395 (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' : \ 3396 ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 : \ 3397 ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 : \ 3398 (c) == '+' ? 
62 : 63) 3399 3400/* What is the base-64 character of the bottom 6 bits of n? */ 3401 3402#define TO_BASE64(n) \ 3403 ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f]) 3404 3405/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be 3406 * decoded as itself. We are permissive on decoding; the only ASCII 3407 * byte not decoding to itself is the + which begins a base64 3408 * string. */ 3409 3410#define DECODE_DIRECT(c) \ 3411 ((c) <= 127 && (c) != '+') 3412 3413/* The UTF-7 encoder treats ASCII characters differently according to 3414 * whether they are Set D, Set O, Whitespace, or special (i.e. none of 3415 * the above). See RFC2152. This array identifies these different 3416 * sets: 3417 * 0 : "Set D" 3418 * alphanumeric and '(),-./:? 3419 * 1 : "Set O" 3420 * !"#$%&*;<=>@[]^_`{|} 3421 * 2 : "whitespace" 3422 * ht nl cr sp 3423 * 3 : special (must be base64 encoded) 3424 * everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127) 3425 */ 3426 3427static 3428char utf7_category[128] = { 3429/* nul soh stx etx eot enq ack bel bs ht nl vt np cr so si */ 3430 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3, 3431/* dle dc1 dc2 dc3 dc4 nak syn etb can em sub esc fs gs rs us */ 3432 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3433/* sp ! " # $ % & ' ( ) * + , - . / */ 3434 2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 0, 3435/* 0 1 2 3 4 5 6 7 8 9 : ; < = > ? */ 3436 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 3437/* @ A B C D E F G H I J K L M N O */ 3438 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3439/* P Q R S T U V W X Y Z [ \ ] ^ _ */ 3440 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1, 3441/* ` a b c d e f g h i j k l m n o */ 3442 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3443/* p q r s t u v w x y z { | } ~ del */ 3444 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3, 3445}; 3446 3447/* ENCODE_DIRECT: this character should be encoded as itself. 
The 3448 * answer depends on whether we are encoding set O as itself, and also 3449 * on whether we are encoding whitespace as itself. RFC2152 makes it 3450 * clear that the answers to these questions vary between 3451 * applications, so this code needs to be flexible. */ 3452 3453#define ENCODE_DIRECT(c, directO, directWS) \ 3454 ((c) < 128 && (c) > 0 && \ 3455 ((utf7_category[(c)] == 0) || \ 3456 (directWS && (utf7_category[(c)] == 2)) || \ 3457 (directO && (utf7_category[(c)] == 1)))) 3458 3459PyObject * 3460PyUnicode_DecodeUTF7(const char *s, 3461 Py_ssize_t size, 3462 const char *errors) 3463{ 3464 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL); 3465} 3466 3467/* The decoder. The only state we preserve is our read position, 3468 * i.e. how many characters we have consumed. So if we end in the 3469 * middle of a shift sequence we have to back off the read position 3470 * and the output to the beginning of the sequence, otherwise we lose 3471 * all the shift state (seen bits, number of bits seen, high 3472 * surrogate). 
*/ 3473 3474PyObject * 3475PyUnicode_DecodeUTF7Stateful(const char *s, 3476 Py_ssize_t size, 3477 const char *errors, 3478 Py_ssize_t *consumed) 3479{ 3480 const char *starts = s; 3481 Py_ssize_t startinpos; 3482 Py_ssize_t endinpos; 3483 Py_ssize_t outpos; 3484 const char *e; 3485 PyUnicodeObject *unicode; 3486 Py_UNICODE *p; 3487 const char *errmsg = ""; 3488 int inShift = 0; 3489 Py_UNICODE *shiftOutStart; 3490 unsigned int base64bits = 0; 3491 unsigned long base64buffer = 0; 3492 Py_UNICODE surrogate = 0; 3493 PyObject *errorHandler = NULL; 3494 PyObject *exc = NULL; 3495 3496 unicode = _PyUnicode_New(size); 3497 if (!unicode) 3498 return NULL; 3499 if (size == 0) { 3500 if (consumed) 3501 *consumed = 0; 3502 return (PyObject *)unicode; 3503 } 3504 3505 p = PyUnicode_AS_UNICODE(unicode); 3506 shiftOutStart = p; 3507 e = s + size; 3508 3509 while (s < e) { 3510 Py_UNICODE ch; 3511 restart: 3512 ch = (unsigned char) *s; 3513 3514 if (inShift) { /* in a base-64 section */ 3515 if (IS_BASE64(ch)) { /* consume a base-64 character */ 3516 base64buffer = (base64buffer << 6) | FROM_BASE64(ch); 3517 base64bits += 6; 3518 s++; 3519 if (base64bits >= 16) { 3520 /* we have enough bits for a UTF-16 value */ 3521 Py_UNICODE outCh = (Py_UNICODE) 3522 (base64buffer >> (base64bits-16)); 3523 base64bits -= 16; 3524 base64buffer &= (1 << base64bits) - 1; /* clear high bits */ 3525 if (surrogate) { 3526 /* expecting a second surrogate */ 3527 if (outCh >= 0xDC00 && outCh <= 0xDFFF) { 3528#ifdef Py_UNICODE_WIDE 3529 *p++ = (((surrogate & 0x3FF)<<10) 3530 | (outCh & 0x3FF)) + 0x10000; 3531#else 3532 *p++ = surrogate; 3533 *p++ = outCh; 3534#endif 3535 surrogate = 0; 3536 } 3537 else { 3538 surrogate = 0; 3539 errmsg = "second surrogate missing"; 3540 goto utf7Error; 3541 } 3542 } 3543 else if (outCh >= 0xD800 && outCh <= 0xDBFF) { 3544 /* first surrogate */ 3545 surrogate = outCh; 3546 } 3547 else if (outCh >= 0xDC00 && outCh <= 0xDFFF) { 3548 errmsg = "unexpected second surrogate"; 
3549 goto utf7Error; 3550 } 3551 else { 3552 *p++ = outCh; 3553 } 3554 } 3555 } 3556 else { /* now leaving a base-64 section */ 3557 inShift = 0; 3558 s++; 3559 if (surrogate) { 3560 errmsg = "second surrogate missing at end of shift sequence"; 3561 goto utf7Error; 3562 } 3563 if (base64bits > 0) { /* left-over bits */ 3564 if (base64bits >= 6) { 3565 /* We've seen at least one base-64 character */ 3566 errmsg = "partial character in shift sequence"; 3567 goto utf7Error; 3568 } 3569 else { 3570 /* Some bits remain; they should be zero */ 3571 if (base64buffer != 0) { 3572 errmsg = "non-zero padding bits in shift sequence"; 3573 goto utf7Error; 3574 } 3575 } 3576 } 3577 if (ch != '-') { 3578 /* '-' is absorbed; other terminating 3579 characters are preserved */ 3580 *p++ = ch; 3581 } 3582 } 3583 } 3584 else if ( ch == '+' ) { 3585 startinpos = s-starts; 3586 s++; /* consume '+' */ 3587 if (s < e && *s == '-') { /* '+-' encodes '+' */ 3588 s++; 3589 *p++ = '+'; 3590 } 3591 else { /* begin base64-encoded section */ 3592 inShift = 1; 3593 shiftOutStart = p; 3594 base64bits = 0; 3595 } 3596 } 3597 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */ 3598 *p++ = ch; 3599 s++; 3600 } 3601 else { 3602 startinpos = s-starts; 3603 s++; 3604 errmsg = "unexpected special character"; 3605 goto utf7Error; 3606 } 3607 continue; 3608utf7Error: 3609 outpos = p-PyUnicode_AS_UNICODE(unicode); 3610 endinpos = s-starts; 3611 if (unicode_decode_call_errorhandler( 3612 errors, &errorHandler, 3613 "utf7", errmsg, 3614 &starts, &e, &startinpos, &endinpos, &exc, &s, 3615 &unicode, &outpos, &p)) 3616 goto onError; 3617 } 3618 3619 /* end of string */ 3620 3621 if (inShift && !consumed) { /* in shift sequence, no more to follow */ 3622 /* if we're in an inconsistent state, that's an error */ 3623 if (surrogate || 3624 (base64bits >= 6) || 3625 (base64bits > 0 && base64buffer != 0)) { 3626 outpos = p-PyUnicode_AS_UNICODE(unicode); 3627 endinpos = size; 3628 if 
(unicode_decode_call_errorhandler( 3629 errors, &errorHandler, 3630 "utf7", "unterminated shift sequence", 3631 &starts, &e, &startinpos, &endinpos, &exc, &s, 3632 &unicode, &outpos, &p)) 3633 goto onError; 3634 if (s < e) 3635 goto restart; 3636 } 3637 } 3638 3639 /* return state */ 3640 if (consumed) { 3641 if (inShift) { 3642 p = shiftOutStart; /* back off output */ 3643 *consumed = startinpos; 3644 } 3645 else { 3646 *consumed = s-starts; 3647 } 3648 } 3649 3650 if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0) 3651 goto onError; 3652 3653 Py_XDECREF(errorHandler); 3654 Py_XDECREF(exc); 3655 if (_PyUnicode_READY_REPLACE(&unicode)) { 3656 Py_DECREF(unicode); 3657 return NULL; 3658 } 3659 return (PyObject *)unicode; 3660 3661 onError: 3662 Py_XDECREF(errorHandler); 3663 Py_XDECREF(exc); 3664 Py_DECREF(unicode); 3665 return NULL; 3666} 3667 3668 3669PyObject * 3670PyUnicode_EncodeUTF7(const Py_UNICODE *s, 3671 Py_ssize_t size, 3672 int base64SetO, 3673 int base64WhiteSpace, 3674 const char *errors) 3675{ 3676 PyObject *v; 3677 /* It might be possible to tighten this worst case */ 3678 Py_ssize_t allocated = 8 * size; 3679 int inShift = 0; 3680 Py_ssize_t i = 0; 3681 unsigned int base64bits = 0; 3682 unsigned long base64buffer = 0; 3683 char * out; 3684 char * start; 3685 3686 if (size == 0) 3687 return PyBytes_FromStringAndSize(NULL, 0); 3688 3689 if (allocated / 8 != size) 3690 return PyErr_NoMemory(); 3691 3692 v = PyBytes_FromStringAndSize(NULL, allocated); 3693 if (v == NULL) 3694 return NULL; 3695 3696 start = out = PyBytes_AS_STRING(v); 3697 for (;i < size; ++i) { 3698 Py_UNICODE ch = s[i]; 3699 3700 if (inShift) { 3701 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) { 3702 /* shifting out */ 3703 if (base64bits) { /* output remaining bits */ 3704 *out++ = TO_BASE64(base64buffer << (6-base64bits)); 3705 base64buffer = 0; 3706 base64bits = 0; 3707 } 3708 inShift = 0; 3709 /* Characters not in the BASE64 set implicitly 
unshift the sequence 3710 so no '-' is required, except if the character is itself a '-' */ 3711 if (IS_BASE64(ch) || ch == '-') { 3712 *out++ = '-'; 3713 } 3714 *out++ = (char) ch; 3715 } 3716 else { 3717 goto encode_char; 3718 } 3719 } 3720 else { /* not in a shift sequence */ 3721 if (ch == '+') { 3722 *out++ = '+'; 3723 *out++ = '-'; 3724 } 3725 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) { 3726 *out++ = (char) ch; 3727 } 3728 else { 3729 *out++ = '+'; 3730 inShift = 1; 3731 goto encode_char; 3732 } 3733 } 3734 continue; 3735encode_char: 3736#ifdef Py_UNICODE_WIDE 3737 if (ch >= 0x10000) { 3738 /* code first surrogate */ 3739 base64bits += 16; 3740 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10); 3741 while (base64bits >= 6) { 3742 *out++ = TO_BASE64(base64buffer >> (base64bits-6)); 3743 base64bits -= 6; 3744 } 3745 /* prepare second surrogate */ 3746 ch = 0xDC00 | ((ch-0x10000) & 0x3FF); 3747 } 3748#endif 3749 base64bits += 16; 3750 base64buffer = (base64buffer << 16) | ch; 3751 while (base64bits >= 6) { 3752 *out++ = TO_BASE64(base64buffer >> (base64bits-6)); 3753 base64bits -= 6; 3754 } 3755 } 3756 if (base64bits) 3757 *out++= TO_BASE64(base64buffer << (6-base64bits) ); 3758 if (inShift) 3759 *out++ = '-'; 3760 if (_PyBytes_Resize(&v, out - start) < 0) 3761 return NULL; 3762 return v; 3763} 3764 3765#undef IS_BASE64 3766#undef FROM_BASE64 3767#undef TO_BASE64 3768#undef DECODE_DIRECT 3769#undef ENCODE_DIRECT 3770 3771/* --- UTF-8 Codec -------------------------------------------------------- */ 3772 3773static 3774char utf8_code_length[256] = { 3775 /* Map UTF-8 encoded prefix byte to sequence length. Zero means 3776 illegal prefix. 
See RFC 3629 for details */ 3777 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */ 3778 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3779 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3780 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3781 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3782 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3783 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3784 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */ 3785 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */ 3786 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3787 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3788 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */ 3789 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */ 3790 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */ 3791 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */ 3792 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 /* F0-F4 + F5-FF */ 3793}; 3794 3795PyObject * 3796PyUnicode_DecodeUTF8(const char *s, 3797 Py_ssize_t size, 3798 const char *errors) 3799{ 3800 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL); 3801} 3802 3803/* Mask to check or force alignment of a pointer to C 'long' boundaries */ 3804#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1) 3805 3806/* Mask to quickly check whether a C 'long' contains a 3807 non-ASCII, UTF8-encoded char. */ 3808#if (SIZEOF_LONG == 8) 3809# define ASCII_CHAR_MASK 0x8080808080808080L 3810#elif (SIZEOF_LONG == 4) 3811# define ASCII_CHAR_MASK 0x80808080L 3812#else 3813# error C 'long' size should be either 4 or 8! 3814#endif 3815 3816/* Scans a UTF-8 string and returns the maximum character to be expected, 3817 the size of the decoded unicode string and if any major errors were 3818 encountered. 
3819 3820 This function does check basic UTF-8 sanity, it does however NOT CHECK 3821 if the string contains surrogates, and if all continuation bytes are 3822 within the correct ranges, these checks are performed in 3823 PyUnicode_DecodeUTF8Stateful. 3824 3825 If it sets has_errors to 1, it means the value of unicode_size and max_char 3826 will be bogus and you should not rely on useful information in them. 3827 */ 3828static Py_UCS4 3829utf8_max_char_size_and_has_errors(const char *s, Py_ssize_t string_size, 3830 Py_ssize_t *unicode_size, Py_ssize_t* consumed, 3831 int *has_errors) 3832{ 3833 Py_ssize_t n; 3834 Py_ssize_t char_count = 0; 3835 Py_UCS4 max_char = 127, new_max; 3836 Py_UCS4 upper_bound; 3837 const unsigned char *p = (const unsigned char *)s; 3838 const unsigned char *end = p + string_size; 3839 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK); 3840 int err = 0; 3841 3842 for (; p < end && !err; ++p, ++char_count) { 3843 /* Only check value if it's not a ASCII char... */ 3844 if (*p < 0x80) { 3845 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for 3846 an explanation. */ 3847 if (!((size_t) p & LONG_PTR_MASK)) { 3848 /* Help register allocation */ 3849 register const unsigned char *_p = p; 3850 while (_p < aligned_end) { 3851 unsigned long value = *(unsigned long *) _p; 3852 if (value & ASCII_CHAR_MASK) 3853 break; 3854 _p += SIZEOF_LONG; 3855 char_count += SIZEOF_LONG; 3856 } 3857 p = _p; 3858 if (p == end) 3859 break; 3860 } 3861 } 3862 if (*p >= 0x80) { 3863 n = utf8_code_length[*p]; 3864 new_max = max_char; 3865 switch (n) { 3866 /* invalid start byte */ 3867 case 0: 3868 err = 1; 3869 break; 3870 case 2: 3871 /* Code points between 0x00FF and 0x07FF inclusive. 
3872 Approximate the upper bound of the code point, 3873 if this flips over 255 we can be sure it will be more 3874 than 255 and the string will need 2 bytes per code coint, 3875 if it stays under or equal to 255, we can be sure 1 byte 3876 is enough. 3877 ((*p & 0b00011111) << 6) | 0b00111111 */ 3878 upper_bound = ((*p & 0x1F) << 6) | 0x3F; 3879 if (max_char < upper_bound) 3880 new_max = upper_bound; 3881 /* Ensure we track at least that we left ASCII space. */ 3882 if (new_max < 128) 3883 new_max = 128; 3884 break; 3885 case 3: 3886 /* Between 0x0FFF and 0xFFFF inclusive, so values are 3887 always > 255 and <= 65535 and will always need 2 bytes. */ 3888 if (max_char < 65535) 3889 new_max = 65535; 3890 break; 3891 case 4: 3892 /* Code point will be above 0xFFFF for sure in this case. */ 3893 new_max = 65537; 3894 break; 3895 /* Internal error, this should be caught by the first if */ 3896 case 1: 3897 default: 3898 assert(0 && "Impossible case in utf8_max_char_and_size"); 3899 err = 1; 3900 } 3901 /* Instead of number of overall bytes for this code point, 3902 n containts the number of following bytes: */ 3903 --n; 3904 /* Check if the follow up chars are all valid continuation bytes */ 3905 if (n >= 1) { 3906 const unsigned char *cont; 3907 if ((p + n) >= end) { 3908 if (consumed == 0) 3909 /* incomplete data, non-incremental decoding */ 3910 err = 1; 3911 break; 3912 } 3913 for (cont = p + 1; cont < (p + n); ++cont) { 3914 if ((*cont & 0xc0) != 0x80) { 3915 err = 1; 3916 break; 3917 } 3918 } 3919 p += n; 3920 } 3921 else 3922 err = 1; 3923 max_char = new_max; 3924 } 3925 } 3926 3927 if (unicode_size) 3928 *unicode_size = char_count; 3929 if (has_errors) 3930 *has_errors = err; 3931 return max_char; 3932} 3933 3934/* Similar to PyUnicode_WRITE but can also write into wstr field 3935 of the legacy unicode representation */ 3936#define WRITE_FLEXIBLE_OR_WSTR(kind, buf, index, value) \ 3937 do { \ 3938 const int k_ = (kind); \ 3939 if (k_ == PyUnicode_WCHAR_KIND) \ 
3940 ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value); \ 3941 else if (k_ == PyUnicode_1BYTE_KIND) \ 3942 ((unsigned char *)(buf))[(index)] = (unsigned char)(value); \ 3943 else if (k_ == PyUnicode_2BYTE_KIND) \ 3944 ((Py_UCS2 *)(buf))[(index)] = (Py_UCS2)(value); \ 3945 else \ 3946 ((Py_UCS4 *)(buf))[(index)] = (Py_UCS4)(value); \ 3947 } while (0) 3948 3949PyObject * 3950PyUnicode_DecodeUTF8Stateful(const char *s, 3951 Py_ssize_t size, 3952 const char *errors, 3953 Py_ssize_t *consumed) 3954{ 3955 const char *starts = s; 3956 int n; 3957 int k; 3958 Py_ssize_t startinpos; 3959 Py_ssize_t endinpos; 3960 const char *e, *aligned_end; 3961 PyUnicodeObject *unicode; 3962 const char *errmsg = ""; 3963 PyObject *errorHandler = NULL; 3964 PyObject *exc = NULL; 3965 Py_UCS4 maxchar = 0; 3966 Py_ssize_t unicode_size; 3967 Py_ssize_t i; 3968 int kind; 3969 void *data; 3970 int has_errors; 3971 Py_UNICODE *error_outptr; 3972#if SIZEOF_WCHAR_T == 2 3973 Py_ssize_t wchar_offset = 0; 3974#endif 3975 3976 if (size == 0) { 3977 if (consumed) 3978 *consumed = 0; 3979 return (PyObject *)PyUnicode_New(0, 0); 3980 } 3981 maxchar = utf8_max_char_size_and_has_errors(s, size, &unicode_size, 3982 consumed, &has_errors); 3983 if (has_errors) { 3984 unicode = _PyUnicode_New(size); 3985 if (!unicode) 3986 return NULL; 3987 kind = PyUnicode_WCHAR_KIND; 3988 data = PyUnicode_AS_UNICODE(unicode); 3989 assert(data != NULL); 3990 } 3991 else { 3992 unicode = (PyUnicodeObject *)PyUnicode_New(unicode_size, maxchar); 3993 if (!unicode) 3994 return NULL; 3995 /* When the string is ASCII only, just use memcpy and return. 3996 unicode_size may be != size if there is an incomplete UTF-8 3997 sequence at the end of the ASCII block. 
*/ 3998 if (maxchar < 128 && size == unicode_size) { 3999 Py_MEMCPY(PyUnicode_1BYTE_DATA(unicode), s, unicode_size); 4000 return (PyObject *)unicode; 4001 } 4002 kind = PyUnicode_KIND(unicode); 4003 data = PyUnicode_DATA(unicode); 4004 } 4005 /* Unpack UTF-8 encoded data */ 4006 i = 0; 4007 e = s + size; 4008 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK); 4009 4010 while (s < e) { 4011 Py_UCS4 ch = (unsigned char)*s; 4012 4013 if (ch < 0x80) { 4014 /* Fast path for runs of ASCII characters. Given that common UTF-8 4015 input will consist of an overwhelming majority of ASCII 4016 characters, we try to optimize for this case by checking 4017 as many characters as a C 'long' can contain. 4018 First, check if we can do an aligned read, as most CPUs have 4019 a penalty for unaligned reads. 4020 */ 4021 if (!((size_t) s & LONG_PTR_MASK)) { 4022 /* Help register allocation */ 4023 register const char *_s = s; 4024 register Py_ssize_t _i = i; 4025 while (_s < aligned_end) { 4026 /* Read a whole long at a time (either 4 or 8 bytes), 4027 and do a fast unrolled copy if it only contains ASCII 4028 characters. 
*/ 4029 unsigned long value = *(unsigned long *) _s; 4030 if (value & ASCII_CHAR_MASK) 4031 break; 4032 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+0, _s[0]); 4033 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+1, _s[1]); 4034 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+2, _s[2]); 4035 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+3, _s[3]); 4036#if (SIZEOF_LONG == 8) 4037 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+4, _s[4]); 4038 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+5, _s[5]); 4039 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+6, _s[6]); 4040 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+7, _s[7]); 4041#endif 4042 _s += SIZEOF_LONG; 4043 _i += SIZEOF_LONG; 4044 } 4045 s = _s; 4046 i = _i; 4047 if (s == e) 4048 break; 4049 ch = (unsigned char)*s; 4050 } 4051 } 4052 4053 if (ch < 0x80) { 4054 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch); 4055 s++; 4056 continue; 4057 } 4058 4059 n = utf8_code_length[ch]; 4060 4061 if (s + n > e) { 4062 if (consumed) 4063 break; 4064 else { 4065 errmsg = "unexpected end of data"; 4066 startinpos = s-starts; 4067 endinpos = startinpos+1; 4068 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++) 4069 endinpos++; 4070 goto utf8Error; 4071 } 4072 } 4073 4074 switch (n) { 4075 4076 case 0: 4077 errmsg = "invalid start byte"; 4078 startinpos = s-starts; 4079 endinpos = startinpos+1; 4080 goto utf8Error; 4081 4082 case 1: 4083 errmsg = "internal error"; 4084 startinpos = s-starts; 4085 endinpos = startinpos+1; 4086 goto utf8Error; 4087 4088 case 2: 4089 if ((s[1] & 0xc0) != 0x80) { 4090 errmsg = "invalid continuation byte"; 4091 startinpos = s-starts; 4092 endinpos = startinpos + 1; 4093 goto utf8Error; 4094 } 4095 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f); 4096 assert ((ch > 0x007F) && (ch <= 0x07FF)); 4097 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch); 4098 break; 4099 4100 case 3: 4101 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf 4102 will result in surrogates in range d800-dfff. Surrogates are 4103 not valid UTF-8 so they are rejected. 
4104 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf 4105 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */ 4106 if ((s[1] & 0xc0) != 0x80 || 4107 (s[2] & 0xc0) != 0x80 || 4108 ((unsigned char)s[0] == 0xE0 && 4109 (unsigned char)s[1] < 0xA0) || 4110 ((unsigned char)s[0] == 0xED && 4111 (unsigned char)s[1] > 0x9F)) { 4112 errmsg = "invalid continuation byte"; 4113 startinpos = s-starts; 4114 endinpos = startinpos + 1; 4115 4116 /* if s[1] first two bits are 1 and 0, then the invalid 4117 continuation byte is s[2], so increment endinpos by 1, 4118 if not, s[1] is invalid and endinpos doesn't need to 4119 be incremented. */ 4120 if ((s[1] & 0xC0) == 0x80) 4121 endinpos++; 4122 goto utf8Error; 4123 } 4124 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f); 4125 assert ((ch > 0x07FF) && (ch <= 0xFFFF)); 4126 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch); 4127 break; 4128 4129 case 4: 4130 if ((s[1] & 0xc0) != 0x80 || 4131 (s[2] & 0xc0) != 0x80 || 4132 (s[3] & 0xc0) != 0x80 || 4133 ((unsigned char)s[0] == 0xF0 && 4134 (unsigned char)s[1] < 0x90) || 4135 ((unsigned char)s[0] == 0xF4 && 4136 (unsigned char)s[1] > 0x8F)) { 4137 errmsg = "invalid continuation byte"; 4138 startinpos = s-starts; 4139 endinpos = startinpos + 1; 4140 if ((s[1] & 0xC0) == 0x80) { 4141 endinpos++; 4142 if ((s[2] & 0xC0) == 0x80) 4143 endinpos++; 4144 } 4145 goto utf8Error; 4146 } 4147 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) + 4148 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f); 4149 assert ((ch > 0xFFFF) && (ch <= 0x10ffff)); 4150 4151 /* If the string is flexible or we have native UCS-4, write 4152 directly.. 
*/ 4153 if (sizeof(Py_UNICODE) > 2 || kind != PyUnicode_WCHAR_KIND) 4154 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch); 4155 4156 else { 4157 /* compute and append the two surrogates: */ 4158 4159 /* translate from 10000..10FFFF to 0..FFFF */ 4160 ch -= 0x10000; 4161 4162 /* high surrogate = top 10 bits added to D800 */ 4163 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, 4164 (Py_UNICODE)(0xD800 + (ch >> 10))); 4165 4166 /* low surrogate = bottom 10 bits added to DC00 */ 4167 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, 4168 (Py_UNICODE)(0xDC00 + (ch & 0x03FF))); 4169 } 4170#if SIZEOF_WCHAR_T == 2 4171 wchar_offset++; 4172#endif 4173 break; 4174 } 4175 s += n; 4176 continue; 4177 4178 utf8Error: 4179 /* If this is not yet a resizable string, make it one.. */ 4180 if (kind != PyUnicode_WCHAR_KIND) { 4181 const Py_UNICODE *u; 4182 PyUnicodeObject *new_unicode = _PyUnicode_New(size); 4183 if (!new_unicode) 4184 goto onError; 4185 u = PyUnicode_AsUnicode((PyObject *)unicode); 4186 if (!u) 4187 goto onError; 4188#if SIZEOF_WCHAR_T == 2 4189 i += wchar_offset; 4190#endif 4191 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(new_unicode), u, i); 4192 Py_DECREF(unicode); 4193 unicode = new_unicode; 4194 kind = 0; 4195 data = PyUnicode_AS_UNICODE(new_unicode); 4196 assert(data != NULL); 4197 } 4198 error_outptr = PyUnicode_AS_UNICODE(unicode) + i; 4199 if (unicode_decode_call_errorhandler( 4200 errors, &errorHandler, 4201 "utf8", errmsg, 4202 &starts, &e, &startinpos, &endinpos, &exc, &s, 4203 &unicode, &i, &error_outptr)) 4204 goto onError; 4205 /* Update data because unicode_decode_call_errorhandler might have 4206 re-created or resized the unicode object. 
*/ 4207 data = PyUnicode_AS_UNICODE(unicode); 4208 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK); 4209 } 4210 /* Ensure the unicode_size calculation above was correct: */ 4211 assert(kind == PyUnicode_WCHAR_KIND || i == unicode_size); 4212 4213 if (consumed) 4214 *consumed = s-starts; 4215 4216 /* Adjust length and ready string when it contained errors and 4217 is of the old resizable kind. */ 4218 if (kind == PyUnicode_WCHAR_KIND) { 4219 if (PyUnicode_Resize((PyObject**)&unicode, i) < 0) 4220 goto onError; 4221 } 4222 4223 Py_XDECREF(errorHandler); 4224 Py_XDECREF(exc); 4225 if (_PyUnicode_READY_REPLACE(&unicode)) { 4226 Py_DECREF(unicode); 4227 return NULL; 4228 } 4229 return (PyObject *)unicode; 4230 4231 onError: 4232 Py_XDECREF(errorHandler); 4233 Py_XDECREF(exc); 4234 Py_DECREF(unicode); 4235 return NULL; 4236} 4237 4238#undef WRITE_FLEXIBLE_OR_WSTR 4239 4240#ifdef __APPLE__ 4241 4242/* Simplified UTF-8 decoder using surrogateescape error handler, 4243 used to decode the command line arguments on Mac OS X. 
*/ 4244 4245wchar_t* 4246_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size) 4247{ 4248 int n; 4249 const char *e; 4250 wchar_t *unicode, *p; 4251 4252 /* Note: size will always be longer than the resulting Unicode 4253 character count */ 4254 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) { 4255 PyErr_NoMemory(); 4256 return NULL; 4257 } 4258 unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t)); 4259 if (!unicode) 4260 return NULL; 4261 4262 /* Unpack UTF-8 encoded data */ 4263 p = unicode; 4264 e = s + size; 4265 while (s < e) { 4266 Py_UCS4 ch = (unsigned char)*s; 4267 4268 if (ch < 0x80) { 4269 *p++ = (wchar_t)ch; 4270 s++; 4271 continue; 4272 } 4273 4274 n = utf8_code_length[ch]; 4275 if (s + n > e) { 4276 goto surrogateescape; 4277 } 4278 4279 switch (n) { 4280 case 0: 4281 case 1: 4282 goto surrogateescape; 4283 4284 case 2: 4285 if ((s[1] & 0xc0) != 0x80) 4286 goto surrogateescape; 4287 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f); 4288 assert ((ch > 0x007F) && (ch <= 0x07FF)); 4289 *p++ = (wchar_t)ch; 4290 break; 4291 4292 case 3: 4293 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf 4294 will result in surrogates in range d800-dfff. Surrogates are 4295 not valid UTF-8 so they are rejected. 
4296 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf 4297 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */ 4298 if ((s[1] & 0xc0) != 0x80 || 4299 (s[2] & 0xc0) != 0x80 || 4300 ((unsigned char)s[0] == 0xE0 && 4301 (unsigned char)s[1] < 0xA0) || 4302 ((unsigned char)s[0] == 0xED && 4303 (unsigned char)s[1] > 0x9F)) { 4304 4305 goto surrogateescape; 4306 } 4307 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f); 4308 assert ((ch > 0x07FF) && (ch <= 0xFFFF)); 4309 *p++ = (wchar_t)ch; 4310 break; 4311 4312 case 4: 4313 if ((s[1] & 0xc0) != 0x80 || 4314 (s[2] & 0xc0) != 0x80 || 4315 (s[3] & 0xc0) != 0x80 || 4316 ((unsigned char)s[0] == 0xF0 && 4317 (unsigned char)s[1] < 0x90) || 4318 ((unsigned char)s[0] == 0xF4 && 4319 (unsigned char)s[1] > 0x8F)) { 4320 goto surrogateescape; 4321 } 4322 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) + 4323 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f); 4324 assert ((ch > 0xFFFF) && (ch <= 0x10ffff)); 4325 4326#if SIZEOF_WCHAR_T == 4 4327 *p++ = (wchar_t)ch; 4328#else 4329 /* compute and append the two surrogates: */ 4330 4331 /* translate from 10000..10FFFF to 0..FFFF */ 4332 ch -= 0x10000; 4333 4334 /* high surrogate = top 10 bits added to D800 */ 4335 *p++ = (wchar_t)(0xD800 + (ch >> 10)); 4336 4337 /* low surrogate = bottom 10 bits added to DC00 */ 4338 *p++ = (wchar_t)(0xDC00 + (ch & 0x03FF)); 4339#endif 4340 break; 4341 } 4342 s += n; 4343 continue; 4344 4345 surrogateescape: 4346 *p++ = 0xDC00 + ch; 4347 s++; 4348 } 4349 *p = L'\0'; 4350 return unicode; 4351} 4352 4353#endif /* __APPLE__ */ 4354 4355/* Primary internal function which creates utf8 encoded bytes objects. 4356 4357 Allocation strategy: if the string is short, convert into a stack buffer 4358 and allocate exactly as much space needed at the end. Else allocate the 4359 maximum possible needed (4 result bytes per Unicode character), and return 4360 the excess memory at the end. 
4361*/ 4362PyObject * 4363_PyUnicode_AsUTF8String(PyObject *obj, const char *errors) 4364{ 4365#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */ 4366 4367 Py_ssize_t i; /* index into s of next input byte */ 4368 PyObject *result; /* result string object */ 4369 char *p; /* next free byte in output buffer */ 4370 Py_ssize_t nallocated; /* number of result bytes allocated */ 4371 Py_ssize_t nneeded; /* number of result bytes needed */ 4372 char stackbuf[MAX_SHORT_UNICHARS * 4]; 4373 PyObject *errorHandler = NULL; 4374 PyObject *exc = NULL; 4375 int kind; 4376 void *data; 4377 Py_ssize_t size; 4378 PyUnicodeObject *unicode = (PyUnicodeObject *)obj; 4379#if SIZEOF_WCHAR_T == 2 4380 Py_ssize_t wchar_offset = 0; 4381#endif 4382 4383 if (!PyUnicode_Check(unicode)) { 4384 PyErr_BadArgument(); 4385 return NULL; 4386 } 4387 4388 if (PyUnicode_READY(unicode) == -1) 4389 return NULL; 4390 4391 if (PyUnicode_UTF8(unicode)) 4392 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode), 4393 PyUnicode_UTF8_LENGTH(unicode)); 4394 4395 kind = PyUnicode_KIND(unicode); 4396 data = PyUnicode_DATA(unicode); 4397 size = PyUnicode_GET_LENGTH(unicode); 4398 4399 assert(size >= 0); 4400 4401 if (size <= MAX_SHORT_UNICHARS) { 4402 /* Write into the stack buffer; nallocated can't overflow. 4403 * At the end, we'll allocate exactly as much heap space as it 4404 * turns out we need. 4405 */ 4406 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int); 4407 result = NULL; /* will allocate after we're done */ 4408 p = stackbuf; 4409 } 4410 else { 4411 /* Overallocate on the heap, and give the excess back at the end. */ 4412 nallocated = size * 4; 4413 if (nallocated / 4 != size) /* overflow! 
*/ 4414 return PyErr_NoMemory(); 4415 result = PyBytes_FromStringAndSize(NULL, nallocated); 4416 if (result == NULL) 4417 return NULL; 4418 p = PyBytes_AS_STRING(result); 4419 } 4420 4421 for (i = 0; i < size;) { 4422 Py_UCS4 ch = PyUnicode_READ(kind, data, i++); 4423 4424 if (ch < 0x80) 4425 /* Encode ASCII */ 4426 *p++ = (char) ch; 4427 4428 else if (ch < 0x0800) { 4429 /* Encode Latin-1 */ 4430 *p++ = (char)(0xc0 | (ch >> 6)); 4431 *p++ = (char)(0x80 | (ch & 0x3f)); 4432 } else if (0xD800 <= ch && ch <= 0xDFFF) { 4433 Py_ssize_t newpos; 4434 PyObject *rep; 4435 Py_ssize_t repsize, k, startpos; 4436 startpos = i-1; 4437#if SIZEOF_WCHAR_T == 2 4438 startpos += wchar_offset; 4439#endif 4440 rep = unicode_encode_call_errorhandler( 4441 errors, &errorHandler, "utf-8", "surrogates not allowed", 4442 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode), 4443 &exc, startpos, startpos+1, &newpos); 4444 if (!rep) 4445 goto error; 4446 4447 if (PyBytes_Check(rep)) 4448 repsize = PyBytes_GET_SIZE(rep); 4449 else 4450 repsize = PyUnicode_GET_SIZE(rep); 4451 4452 if (repsize > 4) { 4453 Py_ssize_t offset; 4454 4455 if (result == NULL) 4456 offset = p - stackbuf; 4457 else 4458 offset = p - PyBytes_AS_STRING(result); 4459 4460 if (nallocated > PY_SSIZE_T_MAX - repsize + 4) { 4461 /* integer overflow */ 4462 PyErr_NoMemory(); 4463 goto error; 4464 } 4465 nallocated += repsize - 4; 4466 if (result != NULL) { 4467 if (_PyBytes_Resize(&result, nallocated) < 0) 4468 goto error; 4469 } else { 4470 result = PyBytes_FromStringAndSize(NULL, nallocated); 4471 if (result == NULL) 4472 goto error; 4473 Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset); 4474 } 4475 p = PyBytes_AS_STRING(result) + offset; 4476 } 4477 4478 if (PyBytes_Check(rep)) { 4479 char *prep = PyBytes_AS_STRING(rep); 4480 for(k = repsize; k > 0; k--) 4481 *p++ = *prep++; 4482 } else /* rep is unicode */ { 4483 const Py_UNICODE *prep = PyUnicode_AS_UNICODE(rep); 4484 Py_UNICODE c; 4485 4486 for(k=0; 
k<repsize; k++) { 4487 c = prep[k]; 4488 if (0x80 <= c) { 4489 raise_encode_exception(&exc, "utf-8", 4490 PyUnicode_AS_UNICODE(unicode), 4491 size, i-1, i, 4492 "surrogates not allowed"); 4493 goto error; 4494 } 4495 *p++ = (char)prep[k]; 4496 } 4497 } 4498 Py_DECREF(rep); 4499 } else if (ch < 0x10000) { 4500 *p++ = (char)(0xe0 | (ch >> 12)); 4501 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); 4502 *p++ = (char)(0x80 | (ch & 0x3f)); 4503 } else /* ch >= 0x10000 */ { 4504 /* Encode UCS4 Unicode ordinals */ 4505 *p++ = (char)(0xf0 | (ch >> 18)); 4506 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f)); 4507 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); 4508 *p++ = (char)(0x80 | (ch & 0x3f)); 4509#if SIZEOF_WCHAR_T == 2 4510 wchar_offset++; 4511#endif 4512 } 4513 } 4514 4515 if (result == NULL) { 4516 /* This was stack allocated. */ 4517 nneeded = p - stackbuf; 4518 assert(nneeded <= nallocated); 4519 result = PyBytes_FromStringAndSize(stackbuf, nneeded); 4520 } 4521 else { 4522 /* Cut back to size actually needed. 
*/ 4523 nneeded = p - PyBytes_AS_STRING(result); 4524 assert(nneeded <= nallocated); 4525 _PyBytes_Resize(&result, nneeded); 4526 } 4527 4528 Py_XDECREF(errorHandler); 4529 Py_XDECREF(exc); 4530 return result; 4531 error: 4532 Py_XDECREF(errorHandler); 4533 Py_XDECREF(exc); 4534 Py_XDECREF(result); 4535 return NULL; 4536 4537#undef MAX_SHORT_UNICHARS 4538} 4539 4540PyObject * 4541PyUnicode_EncodeUTF8(const Py_UNICODE *s, 4542 Py_ssize_t size, 4543 const char *errors) 4544{ 4545 PyObject *v, *unicode; 4546 4547 unicode = PyUnicode_FromUnicode(s, size); 4548 if (unicode == NULL) 4549 return NULL; 4550 v = _PyUnicode_AsUTF8String(unicode, errors); 4551 Py_DECREF(unicode); 4552 return v; 4553} 4554 4555PyObject * 4556PyUnicode_AsUTF8String(PyObject *unicode) 4557{ 4558 return _PyUnicode_AsUTF8String(unicode, NULL); 4559} 4560 4561/* --- UTF-32 Codec ------------------------------------------------------- */ 4562 4563PyObject * 4564PyUnicode_DecodeUTF32(const char *s, 4565 Py_ssize_t size, 4566 const char *errors, 4567 int *byteorder) 4568{ 4569 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL); 4570} 4571 4572PyObject * 4573PyUnicode_DecodeUTF32Stateful(const char *s, 4574 Py_ssize_t size, 4575 const char *errors, 4576 int *byteorder, 4577 Py_ssize_t *consumed) 4578{ 4579 const char *starts = s; 4580 Py_ssize_t startinpos; 4581 Py_ssize_t endinpos; 4582 Py_ssize_t outpos; 4583 PyUnicodeObject *unicode; 4584 Py_UNICODE *p; 4585#ifndef Py_UNICODE_WIDE 4586 int pairs = 0; 4587 const unsigned char *qq; 4588#else 4589 const int pairs = 0; 4590#endif 4591 const unsigned char *q, *e; 4592 int bo = 0; /* assume native ordering by default */ 4593 const char *errmsg = ""; 4594 /* Offsets from q for retrieving bytes in the right order. 
*/ 4595#ifdef BYTEORDER_IS_LITTLE_ENDIAN 4596 int iorder[] = {0, 1, 2, 3}; 4597#else 4598 int iorder[] = {3, 2, 1, 0}; 4599#endif 4600 PyObject *errorHandler = NULL; 4601 PyObject *exc = NULL; 4602 4603 q = (unsigned char *)s; 4604 e = q + size; 4605 4606 if (byteorder) 4607 bo = *byteorder; 4608 4609 /* Check for BOM marks (U+FEFF) in the input and adjust current 4610 byte order setting accordingly. In native mode, the leading BOM 4611 mark is skipped, in all other modes, it is copied to the output 4612 stream as-is (giving a ZWNBSP character). */ 4613 if (bo == 0) { 4614 if (size >= 4) { 4615 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) | 4616 (q[iorder[1]] << 8) | q[iorder[0]]; 4617#ifdef BYTEORDER_IS_LITTLE_ENDIAN 4618 if (bom == 0x0000FEFF) { 4619 q += 4; 4620 bo = -1; 4621 } 4622 else if (bom == 0xFFFE0000) { 4623 q += 4; 4624 bo = 1; 4625 } 4626#else 4627 if (bom == 0x0000FEFF) { 4628 q += 4; 4629 bo = 1; 4630 } 4631 else if (bom == 0xFFFE0000) { 4632 q += 4; 4633 bo = -1; 4634 } 4635#endif 4636 } 4637 } 4638 4639 if (bo == -1) { 4640 /* force LE */ 4641 iorder[0] = 0; 4642 iorder[1] = 1; 4643 iorder[2] = 2; 4644 iorder[3] = 3; 4645 } 4646 else if (bo == 1) { 4647 /* force BE */ 4648 iorder[0] = 3; 4649 iorder[1] = 2; 4650 iorder[2] = 1; 4651 iorder[3] = 0; 4652 } 4653 4654 /* On narrow builds we split characters outside the BMP into two 4655 codepoints => count how much extra space we need. */ 4656#ifndef Py_UNICODE_WIDE 4657 for (qq = q; qq < e; qq += 4) 4658 if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0) 4659 pairs++; 4660#endif 4661 4662 /* This might be one to much, because of a BOM */ 4663 unicode = _PyUnicode_New((size+3)/4+pairs); 4664 if (!unicode) 4665 return NULL; 4666 if (size == 0) 4667 return (PyObject *)unicode; 4668 4669 /* Unpack UTF-32 encoded data */ 4670 p = PyUnicode_AS_UNICODE(unicode); 4671 4672 while (q < e) { 4673 Py_UCS4 ch; 4674 /* remaining bytes at the end? 
(size should be divisible by 4) */ 4675 if (e-q<4) { 4676 if (consumed) 4677 break; 4678 errmsg = "truncated data"; 4679 startinpos = ((const char *)q)-starts; 4680 endinpos = ((const char *)e)-starts; 4681 goto utf32Error; 4682 /* The remaining input chars are ignored if the callback 4683 chooses to skip the input */ 4684 } 4685 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) | 4686 (q[iorder[1]] << 8) | q[iorder[0]]; 4687 4688 if (ch >= 0x110000) 4689 { 4690 errmsg = "codepoint not in range(0x110000)"; 4691 startinpos = ((const char *)q)-starts; 4692 endinpos = startinpos+4; 4693 goto utf32Error; 4694 } 4695#ifndef Py_UNICODE_WIDE 4696 if (ch >= 0x10000) 4697 { 4698 *p++ = 0xD800 | ((ch-0x10000) >> 10); 4699 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF); 4700 } 4701 else 4702#endif 4703 *p++ = ch; 4704 q += 4; 4705 continue; 4706 utf32Error: 4707 outpos = p-PyUnicode_AS_UNICODE(unicode); 4708 if (unicode_decode_call_errorhandler( 4709 errors, &errorHandler, 4710 "utf32", errmsg, 4711 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q, 4712 &unicode, &outpos, &p)) 4713 goto onError; 4714 } 4715 4716 if (byteorder) 4717 *byteorder = bo; 4718 4719 if (consumed) 4720 *consumed = (const char *)q-starts; 4721 4722 /* Adjust length */ 4723 if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0) 4724 goto onError; 4725 4726 Py_XDECREF(errorHandler); 4727 Py_XDECREF(exc); 4728 if (_PyUnicode_READY_REPLACE(&unicode)) { 4729 Py_DECREF(unicode); 4730 return NULL; 4731 } 4732 return (PyObject *)unicode; 4733 4734 onError: 4735 Py_DECREF(unicode); 4736 Py_XDECREF(errorHandler); 4737 Py_XDECREF(exc); 4738 return NULL; 4739} 4740 4741PyObject * 4742PyUnicode_EncodeUTF32(const Py_UNICODE *s, 4743 Py_ssize_t size, 4744 const char *errors, 4745 int byteorder) 4746{ 4747 PyObject *v; 4748 unsigned char *p; 4749 Py_ssize_t nsize, bytesize; 4750#ifndef Py_UNICODE_WIDE 4751 Py_ssize_t i, pairs; 4752#else 4753 const int pairs = 0; 4754#endif 
4755 /* Offsets from p for storing byte pairs in the right order. */ 4756#ifdef BYTEORDER_IS_LITTLE_ENDIAN 4757 int iorder[] = {0, 1, 2, 3}; 4758#else 4759 int iorder[] = {3, 2, 1, 0}; 4760#endif 4761 4762#define STORECHAR(CH) \ 4763 do { \ 4764 p[iorder[3]] = ((CH) >> 24) & 0xff; \ 4765 p[iorder[2]] = ((CH) >> 16) & 0xff; \ 4766 p[iorder[1]] = ((CH) >> 8) & 0xff; \ 4767 p[iorder[0]] = (CH) & 0xff; \ 4768 p += 4; \ 4769 } while(0) 4770 4771 /* In narrow builds we can output surrogate pairs as one codepoint, 4772 so we need less space. */ 4773#ifndef Py_UNICODE_WIDE 4774 for (i = pairs = 0; i < size-1; i++) 4775 if (0xD800 <= s[i] && s[i] <= 0xDBFF && 4776 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF) 4777 pairs++; 4778#endif 4779 nsize = (size - pairs + (byteorder == 0)); 4780 bytesize = nsize * 4; 4781 if (bytesize / 4 != nsize) 4782 return PyErr_NoMemory(); 4783 v = PyBytes_FromStringAndSize(NULL, bytesize); 4784 if (v == NULL) 4785 return NULL; 4786 4787 p = (unsigned char *)PyBytes_AS_STRING(v); 4788 if (byteorder == 0) 4789 STORECHAR(0xFEFF); 4790 if (size == 0) 4791 goto done; 4792 4793 if (byteorder == -1) { 4794 /* force LE */ 4795 iorder[0] = 0; 4796 iorder[1] = 1; 4797 iorder[2] = 2; 4798 iorder[3] = 3; 4799 } 4800 else if (byteorder == 1) { 4801 /* force BE */ 4802 iorder[0] = 3; 4803 iorder[1] = 2; 4804 iorder[2] = 1; 4805 iorder[3] = 0; 4806 } 4807 4808 while (size-- > 0) { 4809 Py_UCS4 ch = *s++; 4810#ifndef Py_UNICODE_WIDE 4811 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) { 4812 Py_UCS4 ch2 = *s; 4813 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) { 4814 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000; 4815 s++; 4816 size--; 4817 } 4818 } 4819#endif 4820 STORECHAR(ch); 4821 } 4822 4823 done: 4824 return v; 4825#undef STORECHAR 4826} 4827 4828PyObject * 4829PyUnicode_AsUTF32String(PyObject *unicode) 4830{ 4831 if (!PyUnicode_Check(unicode)) { 4832 PyErr_BadArgument(); 4833 return NULL; 4834 } 4835 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode), 4836 
PyUnicode_GET_SIZE(unicode), 4837 NULL, 4838 0); 4839} 4840 4841/* --- UTF-16 Codec ------------------------------------------------------- */ 4842 4843PyObject * 4844PyUnicode_DecodeUTF16(const char *s, 4845 Py_ssize_t size, 4846 const char *errors, 4847 int *byteorder) 4848{ 4849 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL); 4850} 4851 4852/* Two masks for fast checking of whether a C 'long' may contain 4853 UTF16-encoded surrogate characters. This is an efficient heuristic, 4854 assuming that non-surrogate characters with a code point >= 0x8000 are 4855 rare in most input. 4856 FAST_CHAR_MASK is used when the input is in native byte ordering, 4857 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering. 4858*/ 4859#if (SIZEOF_LONG == 8) 4860# define FAST_CHAR_MASK 0x8000800080008000L 4861# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L 4862#elif (SIZEOF_LONG == 4) 4863# define FAST_CHAR_MASK 0x80008000L 4864# define SWAPPED_FAST_CHAR_MASK 0x00800080L 4865#else 4866# error C 'long' size should be either 4 or 8! 4867#endif 4868 4869PyObject * 4870PyUnicode_DecodeUTF16Stateful(const char *s, 4871 Py_ssize_t size, 4872 const char *errors, 4873 int *byteorder, 4874 Py_ssize_t *consumed) 4875{ 4876 const char *starts = s; 4877 Py_ssize_t startinpos; 4878 Py_ssize_t endinpos; 4879 Py_ssize_t outpos; 4880 PyUnicodeObject *unicode; 4881 Py_UNICODE *p; 4882 const unsigned char *q, *e, *aligned_end; 4883 int bo = 0; /* assume native ordering by default */ 4884 int native_ordering = 0; 4885 const char *errmsg = ""; 4886 /* Offsets from q for retrieving byte pairs in the right order. 
*/ 4887#ifdef BYTEORDER_IS_LITTLE_ENDIAN 4888 int ihi = 1, ilo = 0; 4889#else 4890 int ihi = 0, ilo = 1; 4891#endif 4892 PyObject *errorHandler = NULL; 4893 PyObject *exc = NULL; 4894 4895 /* Note: size will always be longer than the resulting Unicode 4896 character count */ 4897 unicode = _PyUnicode_New(size); 4898 if (!unicode) 4899 return NULL; 4900 if (size == 0) 4901 return (PyObject *)unicode; 4902 4903 /* Unpack UTF-16 encoded data */ 4904 p = PyUnicode_AS_UNICODE(unicode); 4905 q = (unsigned char *)s; 4906 e = q + size - 1; 4907 4908 if (byteorder) 4909 bo = *byteorder; 4910 4911 /* Check for BOM marks (U+FEFF) in the input and adjust current 4912 byte order setting accordingly. In native mode, the leading BOM 4913 mark is skipped, in all other modes, it is copied to the output 4914 stream as-is (giving a ZWNBSP character). */ 4915 if (bo == 0) { 4916 if (size >= 2) { 4917 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo]; 4918#ifdef BYTEORDER_IS_LITTLE_ENDIAN 4919 if (bom == 0xFEFF) { 4920 q += 2; 4921 bo = -1; 4922 } 4923 else if (bom == 0xFFFE) { 4924 q += 2; 4925 bo = 1; 4926 } 4927#else 4928 if (bom == 0xFEFF) { 4929 q += 2; 4930 bo = 1; 4931 } 4932 else if (bom == 0xFFFE) { 4933 q += 2; 4934 bo = -1; 4935 } 4936#endif 4937 } 4938 } 4939 4940 if (bo == -1) { 4941 /* force LE */ 4942 ihi = 1; 4943 ilo = 0; 4944 } 4945 else if (bo == 1) { 4946 /* force BE */ 4947 ihi = 0; 4948 ilo = 1; 4949 } 4950#ifdef BYTEORDER_IS_LITTLE_ENDIAN 4951 native_ordering = ilo < ihi; 4952#else 4953 native_ordering = ilo > ihi; 4954#endif 4955 4956 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK); 4957 while (q < e) { 4958 Py_UNICODE ch; 4959 /* First check for possible aligned read of a C 'long'. Unaligned 4960 reads are more expensive, better to defer to another iteration. */ 4961 if (!((size_t) q & LONG_PTR_MASK)) { 4962 /* Fast path for runs of non-surrogate chars. 
*/ 4963 register const unsigned char *_q = q; 4964 Py_UNICODE *_p = p; 4965 if (native_ordering) { 4966 /* Native ordering is simple: as long as the input cannot 4967 possibly contain a surrogate char, do an unrolled copy 4968 of several 16-bit code points to the target object. 4969 The non-surrogate check is done on several input bytes 4970 at a time (as many as a C 'long' can contain). */ 4971 while (_q < aligned_end) { 4972 unsigned long data = * (unsigned long *) _q; 4973 if (data & FAST_CHAR_MASK) 4974 break; 4975 _p[0] = ((unsigned short *) _q)[0]; 4976 _p[1] = ((unsigned short *) _q)[1]; 4977#if (SIZEOF_LONG == 8) 4978 _p[2] = ((unsigned short *) _q)[2]; 4979 _p[3] = ((unsigned short *) _q)[3]; 4980#endif 4981 _q += SIZEOF_LONG; 4982 _p += SIZEOF_LONG / 2; 4983 } 4984 } 4985 else { 4986 /* Byteswapped ordering is similar, but we must decompose 4987 the copy bytewise, and take care of zero'ing out the 4988 upper bytes if the target object is in 32-bit units 4989 (that is, in UCS-4 builds). */ 4990 while (_q < aligned_end) { 4991 unsigned long data = * (unsigned long *) _q; 4992 if (data & SWAPPED_FAST_CHAR_MASK) 4993 break; 4994 /* Zero upper bytes in UCS-4 builds */ 4995#if (Py_UNICODE_SIZE > 2) 4996 _p[0] = 0; 4997 _p[1] = 0; 4998#if (SIZEOF_LONG == 8) 4999 _p[2] = 0; 5000 _p[3] = 0; 5001#endif 5002#endif 5003 /* Issue #4916; UCS-4 builds on big endian machines must 5004 fill the two last bytes of each 4-byte unit. 
*/ 5005#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2) 5006# define OFF 2 5007#else 5008# define OFF 0 5009#endif 5010 ((unsigned char *) _p)[OFF + 1] = _q[0]; 5011 ((unsigned char *) _p)[OFF + 0] = _q[1]; 5012 ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2]; 5013 ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3]; 5014#if (SIZEOF_LONG == 8) 5015 ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4]; 5016 ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5]; 5017 ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6]; 5018 ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7]; 5019#endif 5020#undef OFF 5021 _q += SIZEOF_LONG; 5022 _p += SIZEOF_LONG / 2; 5023 } 5024 } 5025 p = _p; 5026 q = _q; 5027 if (q >= e) 5028 break; 5029 } 5030 ch = (q[ihi] << 8) | q[ilo]; 5031 5032 q += 2; 5033 5034 if (ch < 0xD800 || ch > 0xDFFF) { 5035 *p++ = ch; 5036 continue; 5037 } 5038 5039 /* UTF-16 code pair: */ 5040 if (q > e) { 5041 errmsg = "unexpected end of data"; 5042 startinpos = (((const char *)q) - 2) - starts; 5043 endinpos = ((const char *)e) + 1 - starts; 5044 goto utf16Error; 5045 } 5046 if (0xD800 <= ch && ch <= 0xDBFF) { 5047 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo]; 5048 q += 2; 5049 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) { 5050#ifndef Py_UNICODE_WIDE 5051 *p++ = ch; 5052 *p++ = ch2; 5053#else 5054 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000; 5055#endif 5056 continue; 5057 } 5058 else { 5059 errmsg = "illegal UTF-16 surrogate"; 5060 startinpos = (((const char *)q)-4)-starts; 5061 endinpos = startinpos+2; 5062 goto utf16Error; 5063 } 5064 5065 } 5066 errmsg = "illegal encoding"; 5067 startinpos = (((const char *)q)-2)-starts; 5068 endinpos = startinpos+2; 5069 /* Fall through to report the error */ 5070 5071 utf16Error: 5072 outpos = p - PyUnicode_AS_UNICODE(unicode); 5073 if (unicode_decode_call_errorhandler( 5074 errors, 5075 &errorHandler, 5076 "utf16", errmsg, 5077 &starts, 5078 (const char 
**)&e, 5079 &startinpos, 5080 &endinpos, 5081 &exc, 5082 (const char **)&q, 5083 &unicode, 5084 &outpos, 5085 &p)) 5086 goto onError; 5087 } 5088 /* remaining byte at the end? (size should be even) */ 5089 if (e == q) { 5090 if (!consumed) { 5091 errmsg = "truncated data"; 5092 startinpos = ((const char *)q) - starts; 5093 endinpos = ((const char *)e) + 1 - starts; 5094 outpos = p - PyUnicode_AS_UNICODE(unicode); 5095 if (unicode_decode_call_errorhandler( 5096 errors, 5097 &errorHandler, 5098 "utf16", errmsg, 5099 &starts, 5100 (const char **)&e, 5101 &startinpos, 5102 &endinpos, 5103 &exc, 5104 (const char **)&q, 5105 &unicode, 5106 &outpos, 5107 &p)) 5108 goto onError; 5109 /* The remaining input chars are ignored if the callback 5110 chooses to skip the input */ 5111 } 5112 } 5113 5114 if (byteorder) 5115 *byteorder = bo; 5116 5117 if (consumed) 5118 *consumed = (const char *)q-starts; 5119 5120 /* Adjust length */ 5121 if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0) 5122 goto onError; 5123 5124 Py_XDECREF(errorHandler); 5125 Py_XDECREF(exc); 5126 if (_PyUnicode_READY_REPLACE(&unicode)) { 5127 Py_DECREF(unicode); 5128 return NULL; 5129 } 5130 return (PyObject *)unicode; 5131 5132 onError: 5133 Py_DECREF(unicode); 5134 Py_XDECREF(errorHandler); 5135 Py_XDECREF(exc); 5136 return NULL; 5137} 5138 5139#undef FAST_CHAR_MASK 5140#undef SWAPPED_FAST_CHAR_MASK 5141 5142PyObject * 5143PyUnicode_EncodeUTF16(const Py_UNICODE *s, 5144 Py_ssize_t size, 5145 const char *errors, 5146 int byteorder) 5147{ 5148 PyObject *v; 5149 unsigned char *p; 5150 Py_ssize_t nsize, bytesize; 5151#ifdef Py_UNICODE_WIDE 5152 Py_ssize_t i, pairs; 5153#else 5154 const int pairs = 0; 5155#endif 5156 /* Offsets from p for storing byte pairs in the right order. 
*/ 5157#ifdef BYTEORDER_IS_LITTLE_ENDIAN 5158 int ihi = 1, ilo = 0; 5159#else 5160 int ihi = 0, ilo = 1; 5161#endif 5162 5163#define STORECHAR(CH) \ 5164 do { \ 5165 p[ihi] = ((CH) >> 8) & 0xff; \ 5166 p[ilo] = (CH) & 0xff; \ 5167 p += 2; \ 5168 } while(0) 5169 5170#ifdef Py_UNICODE_WIDE 5171 for (i = pairs = 0; i < size; i++) 5172 if (s[i] >= 0x10000) 5173 pairs++; 5174#endif 5175 /* 2 * (size + pairs + (byteorder == 0)) */ 5176 if (size > PY_SSIZE_T_MAX || 5177 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0)) 5178 return PyErr_NoMemory(); 5179 nsize = size + pairs + (byteorder == 0); 5180 bytesize = nsize * 2; 5181 if (bytesize / 2 != nsize) 5182 return PyErr_NoMemory(); 5183 v = PyBytes_FromStringAndSize(NULL, bytesize); 5184 if (v == NULL) 5185 return NULL; 5186 5187 p = (unsigned char *)PyBytes_AS_STRING(v); 5188 if (byteorder == 0) 5189 STORECHAR(0xFEFF); 5190 if (size == 0) 5191 goto done; 5192 5193 if (byteorder == -1) { 5194 /* force LE */ 5195 ihi = 1; 5196 ilo = 0; 5197 } 5198 else if (byteorder == 1) { 5199 /* force BE */ 5200 ihi = 0; 5201 ilo = 1; 5202 } 5203 5204 while (size-- > 0) { 5205 Py_UNICODE ch = *s++; 5206 Py_UNICODE ch2 = 0; 5207#ifdef Py_UNICODE_WIDE 5208 if (ch >= 0x10000) { 5209 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF); 5210 ch = 0xD800 | ((ch-0x10000) >> 10); 5211 } 5212#endif 5213 STORECHAR(ch); 5214 if (ch2) 5215 STORECHAR(ch2); 5216 } 5217 5218 done: 5219 return v; 5220#undef STORECHAR 5221} 5222 5223PyObject * 5224PyUnicode_AsUTF16String(PyObject *unicode) 5225{ 5226 if (!PyUnicode_Check(unicode)) { 5227 PyErr_BadArgument(); 5228 return NULL; 5229 } 5230 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode), 5231 PyUnicode_GET_SIZE(unicode), 5232 NULL, 5233 0); 5234} 5235 5236/* --- Unicode Escape Codec ----------------------------------------------- */ 5237 5238/* Helper function for PyUnicode_DecodeUnicodeEscape, determines 5239 if all the escapes in the string make it still a valid ASCII string. 
5240 Returns -1 if any escapes were found which cause the string to 5241 pop out of ASCII range. Otherwise returns the length of the 5242 required buffer to hold the string. 5243 */ 5244Py_ssize_t 5245length_of_escaped_ascii_string(const char *s, Py_ssize_t size) 5246{ 5247 const unsigned char *p = (const unsigned char *)s; 5248 const unsigned char *end = p + size; 5249 Py_ssize_t length = 0; 5250 5251 if (size < 0) 5252 return -1; 5253 5254 for (; p < end; ++p) { 5255 if (*p > 127) { 5256 /* Non-ASCII */ 5257 return -1; 5258 } 5259 else if (*p != '\\') { 5260 /* Normal character */ 5261 ++length; 5262 } 5263 else { 5264 /* Backslash-escape, check next char */ 5265 ++p; 5266 /* Escape sequence reaches till end of string or 5267 non-ASCII follow-up. */ 5268 if (p >= end || *p > 127) 5269 return -1; 5270 switch (*p) { 5271 case '\n': 5272 /* backslash + \n result in zero characters */ 5273 break; 5274 case '\\': case '\'': case '\"': 5275 case 'b': case 'f': case 't': 5276 case 'n': case 'r': case 'v': case 'a': 5277 ++length; 5278 break; 5279 case '0': case '1': case '2': case '3': 5280 case '4': case '5': case '6': case '7': 5281 case 'x': case 'u': case 'U': case 'N': 5282 /* these do not guarantee ASCII characters */ 5283 return -1; 5284 default: 5285 /* count the backslash + the other character */ 5286 length += 2; 5287 } 5288 } 5289 } 5290 return length; 5291} 5292 5293/* Similar to PyUnicode_WRITE but either write into wstr field 5294 or treat string as ASCII. 
*/ 5295#define WRITE_ASCII_OR_WSTR(kind, buf, index, value) \ 5296 do { \ 5297 if ((kind) != PyUnicode_WCHAR_KIND) \ 5298 ((unsigned char *)(buf))[(index)] = (unsigned char)(value); \ 5299 else \ 5300 ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value); \ 5301 } while (0) 5302 5303#define WRITE_WSTR(buf, index, value) \ 5304 assert(kind == PyUnicode_WCHAR_KIND), \ 5305 ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value) 5306 5307 5308static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL; 5309 5310PyObject * 5311PyUnicode_DecodeUnicodeEscape(const char *s, 5312 Py_ssize_t size, 5313 const char *errors) 5314{ 5315 const char *starts = s; 5316 Py_ssize_t startinpos; 5317 Py_ssize_t endinpos; 5318 int j; 5319 PyUnicodeObject *v; 5320 Py_UNICODE *p; 5321 const char *end; 5322 char* message; 5323 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */ 5324 PyObject *errorHandler = NULL; 5325 PyObject *exc = NULL; 5326 Py_ssize_t ascii_length; 5327 Py_ssize_t i; 5328 int kind; 5329 void *data; 5330 5331 ascii_length = length_of_escaped_ascii_string(s, size); 5332 5333 /* After length_of_escaped_ascii_string() there are two alternatives, 5334 either the string is pure ASCII with named escapes like \n, etc. 5335 and we determined it's exact size (common case) 5336 or it contains \x, \u, ... escape sequences. then we create a 5337 legacy wchar string and resize it at the end of this function. */ 5338 if (ascii_length >= 0) { 5339 v = (PyUnicodeObject *)PyUnicode_New(ascii_length, 127); 5340 if (!v) 5341 goto onError; 5342 assert(PyUnicode_KIND(v) == PyUnicode_1BYTE_KIND); 5343 kind = PyUnicode_1BYTE_KIND; 5344 data = PyUnicode_DATA(v); 5345 } 5346 else { 5347 /* Escaped strings will always be longer than the resulting 5348 Unicode string, so we start with size here and then reduce the 5349 length after conversion to the true value. 
5350 (but if the error callback returns a long replacement string 5351 we'll have to allocate more space) */ 5352 v = _PyUnicode_New(size); 5353 if (!v) 5354 goto onError; 5355 kind = PyUnicode_WCHAR_KIND; 5356 data = PyUnicode_AS_UNICODE(v); 5357 } 5358 5359 if (size == 0) 5360 return (PyObject *)v; 5361 i = 0; 5362 end = s + size; 5363 5364 while (s < end) { 5365 unsigned char c; 5366 Py_UNICODE x; 5367 int digits; 5368 5369 if (kind == PyUnicode_WCHAR_KIND) { 5370 assert(i < _PyUnicode_WSTR_LENGTH(v)); 5371 } 5372 else { 5373 /* The only case in which i == ascii_length is a backslash 5374 followed by a newline. */ 5375 assert(i <= ascii_length); 5376 } 5377 5378 /* Non-escape characters are interpreted as Unicode ordinals */ 5379 if (*s != '\\') { 5380 WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char) *s++); 5381 continue; 5382 } 5383 5384 startinpos = s-starts; 5385 /* \ - Escapes */ 5386 s++; 5387 c = *s++; 5388 if (s > end) 5389 c = '\0'; /* Invalid after \ */ 5390 5391 if (kind == PyUnicode_WCHAR_KIND) { 5392 assert(i < _PyUnicode_WSTR_LENGTH(v)); 5393 } 5394 else { 5395 /* The only case in which i == ascii_length is a backslash 5396 followed by a newline. 
*/ 5397 assert(i < ascii_length || (i == ascii_length && c == '\n')); 5398 } 5399 5400 switch (c) { 5401 5402 /* \x escapes */ 5403 case '\n': break; 5404 case '\\': WRITE_ASCII_OR_WSTR(kind, data, i++, '\\'); break; 5405 case '\'': WRITE_ASCII_OR_WSTR(kind, data, i++, '\''); break; 5406 case '\"': WRITE_ASCII_OR_WSTR(kind, data, i++, '\"'); break; 5407 case 'b': WRITE_ASCII_OR_WSTR(kind, data, i++, '\b'); break; 5408 /* FF */ 5409 case 'f': WRITE_ASCII_OR_WSTR(kind, data, i++, '\014'); break; 5410 case 't': WRITE_ASCII_OR_WSTR(kind, data, i++, '\t'); break; 5411 case 'n': WRITE_ASCII_OR_WSTR(kind, data, i++, '\n'); break; 5412 case 'r': WRITE_ASCII_OR_WSTR(kind, data, i++, '\r'); break; 5413 /* VT */ 5414 case 'v': WRITE_ASCII_OR_WSTR(kind, data, i++, '\013'); break; 5415 /* BEL, not classic C */ 5416 case 'a': WRITE_ASCII_OR_WSTR(kind, data, i++, '\007'); break; 5417 5418 /* \OOO (octal) escapes */ 5419 case '0': case '1': case '2': case '3': 5420 case '4': case '5': case '6': case '7': 5421 x = s[-1] - '0'; 5422 if (s < end && '0' <= *s && *s <= '7') { 5423 x = (x<<3) + *s++ - '0'; 5424 if (s < end && '0' <= *s && *s <= '7') 5425 x = (x<<3) + *s++ - '0'; 5426 } 5427 WRITE_WSTR(data, i++, x); 5428 break; 5429 5430 /* hex escapes */ 5431 /* \xXX */ 5432 case 'x': 5433 digits = 2; 5434 message = "truncated \\xXX escape"; 5435 goto hexescape; 5436 5437 /* \uXXXX */ 5438 case 'u': 5439 digits = 4; 5440 message = "truncated \\uXXXX escape"; 5441 goto hexescape; 5442 5443 /* \UXXXXXXXX */ 5444 case 'U': 5445 digits = 8; 5446 message = "truncated \\UXXXXXXXX escape"; 5447 hexescape: 5448 chr = 0; 5449 p = PyUnicode_AS_UNICODE(v) + i; 5450 if (s+digits>end) { 5451 endinpos = size; 5452 if (unicode_decode_call_errorhandler( 5453 errors, &errorHandler, 5454 "unicodeescape", "end of string in escape sequence", 5455 &starts, &end, &startinpos, &endinpos, &exc, &s, 5456 &v, &i, &p)) 5457 goto onError; 5458 data = PyUnicode_AS_UNICODE(v); 5459 goto nextByte; 5460 } 5461 for (j 
= 0; j < digits; ++j) { 5462 c = (unsigned char) s[j]; 5463 if (!Py_ISXDIGIT(c)) { 5464 endinpos = (s+j+1)-starts; 5465 p = PyUnicode_AS_UNICODE(v) + i; 5466 if (unicode_decode_call_errorhandler( 5467 errors, &errorHandler, 5468 "unicodeescape", message, 5469 &starts, &end, &startinpos, &endinpos, &exc, &s, 5470 &v, &i, &p)) 5471 goto onError; 5472 data = PyUnicode_AS_UNICODE(v); 5473 goto nextByte; 5474 } 5475 chr = (chr<<4) & ~0xF; 5476 if (c >= '0' && c <= '9') 5477 chr += c - '0'; 5478 else if (c >= 'a' && c <= 'f') 5479 chr += 10 + c - 'a'; 5480 else 5481 chr += 10 + c - 'A'; 5482 } 5483 s += j; 5484 if (chr == 0xffffffff && PyErr_Occurred()) 5485 /* _decoding_error will have already written into the 5486 target buffer. */ 5487 break; 5488 store: 5489 /* when we get here, chr is a 32-bit unicode character */ 5490 if (chr <= 0xffff) 5491 /* UCS-2 character */ 5492 WRITE_WSTR(data, i++, chr); 5493 else if (chr <= 0x10ffff) { 5494 /* UCS-4 character. Either store directly, or as 5495 surrogate pair. 
*/ 5496#ifdef Py_UNICODE_WIDE 5497 WRITE_WSTR(data, i++, chr); 5498#else 5499 chr -= 0x10000L; 5500 WRITE_WSTR(data, i++, 0xD800 + (Py_UNICODE) (chr >> 10)); 5501 WRITE_WSTR(data, i++, 0xDC00 + (Py_UNICODE) (chr & 0x03FF)); 5502#endif 5503 } else { 5504 endinpos = s-starts; 5505 p = PyUnicode_AS_UNICODE(v) + i; 5506 if (unicode_decode_call_errorhandler( 5507 errors, &errorHandler, 5508 "unicodeescape", "illegal Unicode character", 5509 &starts, &end, &startinpos, &endinpos, &exc, &s, 5510 &v, &i, &p)) 5511 goto onError; 5512 data = PyUnicode_AS_UNICODE(v); 5513 } 5514 break; 5515 5516 /* \N{name} */ 5517 case 'N': 5518 message = "malformed \\N character escape"; 5519 if (ucnhash_CAPI == NULL) { 5520 /* load the unicode data module */ 5521 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import( 5522 PyUnicodeData_CAPSULE_NAME, 1); 5523 if (ucnhash_CAPI == NULL) 5524 goto ucnhashError; 5525 } 5526 if (*s == '{') { 5527 const char *start = s+1; 5528 /* look for the closing brace */ 5529 while (*s != '}' && s < end) 5530 s++; 5531 if (s > start && s < end && *s == '}') { 5532 /* found a name. 
look it up in the unicode database */ 5533 message = "unknown Unicode character name"; 5534 s++; 5535 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), 5536 &chr)) 5537 goto store; 5538 } 5539 } 5540 endinpos = s-starts; 5541 p = PyUnicode_AS_UNICODE(v) + i; 5542 if (unicode_decode_call_errorhandler( 5543 errors, &errorHandler, 5544 "unicodeescape", message, 5545 &starts, &end, &startinpos, &endinpos, &exc, &s, 5546 &v, &i, &p)) 5547 goto onError; 5548 data = PyUnicode_AS_UNICODE(v); 5549 break; 5550 5551 default: 5552 if (s > end) { 5553 assert(kind == PyUnicode_WCHAR_KIND); 5554 message = "\\ at end of string"; 5555 s--; 5556 endinpos = s-starts; 5557 p = PyUnicode_AS_UNICODE(v) + i; 5558 if (unicode_decode_call_errorhandler( 5559 errors, &errorHandler, 5560 "unicodeescape", message, 5561 &starts, &end, &startinpos, &endinpos, &exc, &s, 5562 &v, &i, &p)) 5563 goto onError; 5564 data = PyUnicode_AS_UNICODE(v); 5565 } 5566 else { 5567 WRITE_ASCII_OR_WSTR(kind, data, i++, '\\'); 5568 WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char)s[-1]); 5569 } 5570 break; 5571 } 5572 nextByte: 5573 ; 5574 } 5575 /* Ensure the length prediction worked in case of ASCII strings */ 5576 assert(kind == PyUnicode_WCHAR_KIND || i == ascii_length); 5577 5578 if (kind == PyUnicode_WCHAR_KIND) 5579 { 5580 if (PyUnicode_Resize((PyObject**)&v, i) < 0) 5581 goto onError; 5582 } 5583 Py_XDECREF(errorHandler); 5584 Py_XDECREF(exc); 5585 if (_PyUnicode_READY_REPLACE(&v)) { 5586 Py_DECREF(v); 5587 return NULL; 5588 } 5589 return (PyObject *)v; 5590 5591 ucnhashError: 5592 PyErr_SetString( 5593 PyExc_UnicodeError, 5594 "\\N escapes not supported (can't load unicodedata module)" 5595 ); 5596 Py_XDECREF(v); 5597 Py_XDECREF(errorHandler); 5598 Py_XDECREF(exc); 5599 return NULL; 5600 5601 onError: 5602 Py_XDECREF(v); 5603 Py_XDECREF(errorHandler); 5604 Py_XDECREF(exc); 5605 return NULL; 5606} 5607 5608#undef WRITE_ASCII_OR_WSTR 5609#undef WRITE_WSTR 5610 5611/* Return a Unicode-Escape string 
version of the Unicode object. 5612 5613 If quotes is true, the string is enclosed in u"" or u'' quotes as 5614 appropriate. 5615 5616*/ 5617 5618static const char *hexdigits = "0123456789abcdef"; 5619 5620PyObject * 5621PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s, 5622 Py_ssize_t size) 5623{ 5624 PyObject *repr; 5625 char *p; 5626 5627#ifdef Py_UNICODE_WIDE 5628 const Py_ssize_t expandsize = 10; 5629#else 5630 const Py_ssize_t expandsize = 6; 5631#endif 5632 5633 /* XXX(nnorwitz): rather than over-allocating, it would be 5634 better to choose a different scheme. Perhaps scan the 5635 first N-chars of the string and allocate based on that size. 5636 */ 5637 /* Initial allocation is based on the longest-possible unichr 5638 escape. 5639 5640 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source 5641 unichr, so in this case it's the longest unichr escape. In 5642 narrow (UTF-16) builds this is five chars per source unichr 5643 since there are two unichrs in the surrogate pair, so in narrow 5644 (UTF-16) builds it's not the longest unichr escape. 5645 5646 In wide or narrow builds '\uxxxx' is 6 chars per source unichr, 5647 so in the narrow (UTF-16) build case it's the longest unichr 5648 escape. 
5649 */ 5650 5651 if (size == 0) 5652 return PyBytes_FromStringAndSize(NULL, 0); 5653 5654 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize) 5655 return PyErr_NoMemory(); 5656 5657 repr = PyBytes_FromStringAndSize(NULL, 5658 2 5659 + expandsize*size 5660 + 1); 5661 if (repr == NULL) 5662 return NULL; 5663 5664 p = PyBytes_AS_STRING(repr); 5665 5666 while (size-- > 0) { 5667 Py_UNICODE ch = *s++; 5668 5669 /* Escape backslashes */ 5670 if (ch == '\\') { 5671 *p++ = '\\'; 5672 *p++ = (char) ch; 5673 continue; 5674 } 5675 5676#ifdef Py_UNICODE_WIDE 5677 /* Map 21-bit characters to '\U00xxxxxx' */ 5678 else if (ch >= 0x10000) { 5679 *p++ = '\\'; 5680 *p++ = 'U'; 5681 *p++ = hexdigits[(ch >> 28) & 0x0000000F]; 5682 *p++ = hexdigits[(ch >> 24) & 0x0000000F]; 5683 *p++ = hexdigits[(ch >> 20) & 0x0000000F]; 5684 *p++ = hexdigits[(ch >> 16) & 0x0000000F]; 5685 *p++ = hexdigits[(ch >> 12) & 0x0000000F]; 5686 *p++ = hexdigits[(ch >> 8) & 0x0000000F]; 5687 *p++ = hexdigits[(ch >> 4) & 0x0000000F]; 5688 *p++ = hexdigits[ch & 0x0000000F]; 5689 continue; 5690 } 5691#else 5692 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */ 5693 else if (ch >= 0xD800 && ch < 0xDC00) { 5694 Py_UNICODE ch2; 5695 Py_UCS4 ucs; 5696 5697 ch2 = *s++; 5698 size--; 5699 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) { 5700 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000; 5701 *p++ = '\\'; 5702 *p++ = 'U'; 5703 *p++ = hexdigits[(ucs >> 28) & 0x0000000F]; 5704 *p++ = hexdigits[(ucs >> 24) & 0x0000000F]; 5705 *p++ = hexdigits[(ucs >> 20) & 0x0000000F]; 5706 *p++ = hexdigits[(ucs >> 16) & 0x0000000F]; 5707 *p++ = hexdigits[(ucs >> 12) & 0x0000000F]; 5708 *p++ = hexdigits[(ucs >> 8) & 0x0000000F]; 5709 *p++ = hexdigits[(ucs >> 4) & 0x0000000F]; 5710 *p++ = hexdigits[ucs & 0x0000000F]; 5711 continue; 5712 } 5713 /* Fall through: isolated surrogates are copied as-is */ 5714 s--; 5715 size++; 5716 } 5717#endif 5718 5719 /* Map 16-bit characters to '\uxxxx' */ 5720 if (ch >= 256) { 5721 *p++ = '\\'; 5722 *p++ = 
'u'; 5723 *p++ = hexdigits[(ch >> 12) & 0x000F]; 5724 *p++ = hexdigits[(ch >> 8) & 0x000F]; 5725 *p++ = hexdigits[(ch >> 4) & 0x000F]; 5726 *p++ = hexdigits[ch & 0x000F]; 5727 } 5728 5729 /* Map special whitespace to '\t', \n', '\r' */ 5730 else if (ch == '\t') { 5731 *p++ = '\\'; 5732 *p++ = 't'; 5733 } 5734 else if (ch == '\n') { 5735 *p++ = '\\'; 5736 *p++ = 'n'; 5737 } 5738 else if (ch == '\r') { 5739 *p++ = '\\'; 5740 *p++ = 'r'; 5741 } 5742 5743 /* Map non-printable US ASCII to '\xhh' */ 5744 else if (ch < ' ' || ch >= 0x7F) { 5745 *p++ = '\\'; 5746 *p++ = 'x'; 5747 *p++ = hexdigits[(ch >> 4) & 0x000F]; 5748 *p++ = hexdigits[ch & 0x000F]; 5749 } 5750 5751 /* Copy everything else as-is */ 5752 else 5753 *p++ = (char) ch; 5754 } 5755 5756 assert(p - PyBytes_AS_STRING(repr) > 0); 5757 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) 5758 return NULL; 5759 return repr; 5760} 5761 5762PyObject * 5763PyUnicode_AsUnicodeEscapeString(PyObject *unicode) 5764{ 5765 PyObject *s; 5766 if (!PyUnicode_Check(unicode)) { 5767 PyErr_BadArgument(); 5768 return NULL; 5769 } 5770 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode), 5771 PyUnicode_GET_SIZE(unicode)); 5772 return s; 5773} 5774 5775/* --- Raw Unicode Escape Codec ------------------------------------------- */ 5776 5777PyObject * 5778PyUnicode_DecodeRawUnicodeEscape(const char *s, 5779 Py_ssize_t size, 5780 const char *errors) 5781{ 5782 const char *starts = s; 5783 Py_ssize_t startinpos; 5784 Py_ssize_t endinpos; 5785 Py_ssize_t outpos; 5786 PyUnicodeObject *v; 5787 Py_UNICODE *p; 5788 const char *end; 5789 const char *bs; 5790 PyObject *errorHandler = NULL; 5791 PyObject *exc = NULL; 5792 5793 /* Escaped strings will always be longer than the resulting 5794 Unicode string, so we start with size here and then reduce the 5795 length after conversion to the true value. 
(But decoding error 5796 handler might have to resize the string) */ 5797 v = _PyUnicode_New(size); 5798 if (v == NULL) 5799 goto onError; 5800 if (size == 0) 5801 return (PyObject *)v; 5802 p = PyUnicode_AS_UNICODE(v); 5803 end = s + size; 5804 while (s < end) { 5805 unsigned char c; 5806 Py_UCS4 x; 5807 int i; 5808 int count; 5809 5810 /* Non-escape characters are interpreted as Unicode ordinals */ 5811 if (*s != '\\') { 5812 *p++ = (unsigned char)*s++; 5813 continue; 5814 } 5815 startinpos = s-starts; 5816 5817 /* \u-escapes are only interpreted iff the number of leading 5818 backslashes if odd */ 5819 bs = s; 5820 for (;s < end;) { 5821 if (*s != '\\') 5822 break; 5823 *p++ = (unsigned char)*s++; 5824 } 5825 if (((s - bs) & 1) == 0 || 5826 s >= end || 5827 (*s != 'u' && *s != 'U')) { 5828 continue; 5829 } 5830 p--; 5831 count = *s=='u' ? 4 : 8; 5832 s++; 5833 5834 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */ 5835 outpos = p-PyUnicode_AS_UNICODE(v); 5836 for (x = 0, i = 0; i < count; ++i, ++s) { 5837 c = (unsigned char)*s; 5838 if (!Py_ISXDIGIT(c)) { 5839 endinpos = s-starts; 5840 if (unicode_decode_call_errorhandler( 5841 errors, &errorHandler, 5842 "rawunicodeescape", "truncated \\uXXXX", 5843 &starts, &end, &startinpos, &endinpos, &exc, &s, 5844 &v, &outpos, &p)) 5845 goto onError; 5846 goto nextByte; 5847 } 5848 x = (x<<4) & ~0xF; 5849 if (c >= '0' && c <= '9') 5850 x += c - '0'; 5851 else if (c >= 'a' && c <= 'f') 5852 x += 10 + c - 'a'; 5853 else 5854 x += 10 + c - 'A'; 5855 } 5856 if (x <= 0xffff) 5857 /* UCS-2 character */ 5858 *p++ = (Py_UNICODE) x; 5859 else if (x <= 0x10ffff) { 5860 /* UCS-4 character. Either store directly, or as 5861 surrogate pair. 
*/ 5862#ifdef Py_UNICODE_WIDE 5863 *p++ = (Py_UNICODE) x; 5864#else 5865 x -= 0x10000L; 5866 *p++ = 0xD800 + (Py_UNICODE) (x >> 10); 5867 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF); 5868#endif 5869 } else { 5870 endinpos = s-starts; 5871 outpos = p-PyUnicode_AS_UNICODE(v); 5872 if (unicode_decode_call_errorhandler( 5873 errors, &errorHandler, 5874 "rawunicodeescape", "\\Uxxxxxxxx out of range", 5875 &starts, &end, &startinpos, &endinpos, &exc, &s, 5876 &v, &outpos, &p)) 5877 goto onError; 5878 } 5879 nextByte: 5880 ; 5881 } 5882 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0) 5883 goto onError; 5884 Py_XDECREF(errorHandler); 5885 Py_XDECREF(exc); 5886 if (_PyUnicode_READY_REPLACE(&v)) { 5887 Py_DECREF(v); 5888 return NULL; 5889 } 5890 return (PyObject *)v; 5891 5892 onError: 5893 Py_XDECREF(v); 5894 Py_XDECREF(errorHandler); 5895 Py_XDECREF(exc); 5896 return NULL; 5897} 5898 5899PyObject * 5900PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s, 5901 Py_ssize_t size) 5902{ 5903 PyObject *repr; 5904 char *p; 5905 char *q; 5906 5907#ifdef Py_UNICODE_WIDE 5908 const Py_ssize_t expandsize = 10; 5909#else 5910 const Py_ssize_t expandsize = 6; 5911#endif 5912 5913 if (size > PY_SSIZE_T_MAX / expandsize) 5914 return PyErr_NoMemory(); 5915 5916 repr = PyBytes_FromStringAndSize(NULL, expandsize * size); 5917 if (repr == NULL) 5918 return NULL; 5919 if (size == 0) 5920 return repr; 5921 5922 p = q = PyBytes_AS_STRING(repr); 5923 while (size-- > 0) { 5924 Py_UNICODE ch = *s++; 5925#ifdef Py_UNICODE_WIDE 5926 /* Map 32-bit characters to '\Uxxxxxxxx' */ 5927 if (ch >= 0x10000) { 5928 *p++ = '\\'; 5929 *p++ = 'U'; 5930 *p++ = hexdigits[(ch >> 28) & 0xf]; 5931 *p++ = hexdigits[(ch >> 24) & 0xf]; 5932 *p++ = hexdigits[(ch >> 20) & 0xf]; 5933 *p++ = hexdigits[(ch >> 16) & 0xf]; 5934 *p++ = hexdigits[(ch >> 12) & 0xf]; 5935 *p++ = hexdigits[(ch >> 8) & 0xf]; 5936 *p++ = hexdigits[(ch >> 4) & 0xf]; 5937 *p++ = hexdigits[ch & 15]; 5938 } 5939 else 5940#else 5941 
/* Map UTF-16 surrogate pairs to '\U00xxxxxx' */ 5942 if (ch >= 0xD800 && ch < 0xDC00) { 5943 Py_UNICODE ch2; 5944 Py_UCS4 ucs; 5945 5946 ch2 = *s++; 5947 size--; 5948 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) { 5949 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000; 5950 *p++ = '\\'; 5951 *p++ = 'U'; 5952 *p++ = hexdigits[(ucs >> 28) & 0xf]; 5953 *p++ = hexdigits[(ucs >> 24) & 0xf]; 5954 *p++ = hexdigits[(ucs >> 20) & 0xf]; 5955 *p++ = hexdigits[(ucs >> 16) & 0xf]; 5956 *p++ = hexdigits[(ucs >> 12) & 0xf]; 5957 *p++ = hexdigits[(ucs >> 8) & 0xf]; 5958 *p++ = hexdigits[(ucs >> 4) & 0xf]; 5959 *p++ = hexdigits[ucs & 0xf]; 5960 continue; 5961 } 5962 /* Fall through: isolated surrogates are copied as-is */ 5963 s--; 5964 size++; 5965 } 5966#endif 5967 /* Map 16-bit characters to '\uxxxx' */ 5968 if (ch >= 256) { 5969 *p++ = '\\'; 5970 *p++ = 'u'; 5971 *p++ = hexdigits[(ch >> 12) & 0xf]; 5972 *p++ = hexdigits[(ch >> 8) & 0xf]; 5973 *p++ = hexdigits[(ch >> 4) & 0xf]; 5974 *p++ = hexdigits[ch & 15]; 5975 } 5976 /* Copy everything else as-is */ 5977 else 5978 *p++ = (char) ch; 5979 } 5980 size = p - q; 5981 5982 assert(size > 0); 5983 if (_PyBytes_Resize(&repr, size) < 0) 5984 return NULL; 5985 return repr; 5986} 5987 5988PyObject * 5989PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode) 5990{ 5991 PyObject *s; 5992 if (!PyUnicode_Check(unicode)) { 5993 PyErr_BadArgument(); 5994 return NULL; 5995 } 5996 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode), 5997 PyUnicode_GET_SIZE(unicode)); 5998 5999 return s; 6000} 6001 6002/* --- Unicode Internal Codec ------------------------------------------- */ 6003 6004PyObject * 6005_PyUnicode_DecodeUnicodeInternal(const char *s, 6006 Py_ssize_t size, 6007 const char *errors) 6008{ 6009 const char *starts = s; 6010 Py_ssize_t startinpos; 6011 Py_ssize_t endinpos; 6012 Py_ssize_t outpos; 6013 PyUnicodeObject *v; 6014 Py_UNICODE *p; 6015 const char *end; 6016 const char *reason; 6017 PyObject *errorHandler = 
NULL; 6018 PyObject *exc = NULL; 6019 6020#ifdef Py_UNICODE_WIDE 6021 Py_UNICODE unimax = PyUnicode_GetMax(); 6022#endif 6023 6024 /* XXX overflow detection missing */ 6025 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE); 6026 if (v == NULL) 6027 goto onError; 6028 /* Intentionally PyUnicode_GET_SIZE instead of PyUnicode_GET_LENGTH 6029 as string was created with the old API. */ 6030 if (PyUnicode_GET_SIZE(v) == 0) 6031 return (PyObject *)v; 6032 p = PyUnicode_AS_UNICODE(v); 6033 end = s + size; 6034 6035 while (s < end) { 6036 memcpy(p, s, sizeof(Py_UNICODE)); 6037 /* We have to sanity check the raw data, otherwise doom looms for 6038 some malformed UCS-4 data. */ 6039 if ( 6040#ifdef Py_UNICODE_WIDE 6041 *p > unimax || *p < 0 || 6042#endif 6043 end-s < Py_UNICODE_SIZE 6044 ) 6045 { 6046 startinpos = s - starts; 6047 if (end-s < Py_UNICODE_SIZE) { 6048 endinpos = end-starts; 6049 reason = "truncated input"; 6050 } 6051 else { 6052 endinpos = s - starts + Py_UNICODE_SIZE; 6053 reason = "illegal code point (> 0x10FFFF)"; 6054 } 6055 outpos = p - PyUnicode_AS_UNICODE(v); 6056 if (unicode_decode_call_errorhandler( 6057 errors, &errorHandler, 6058 "unicode_internal", reason, 6059 &starts, &end, &startinpos, &endinpos, &exc, &s, 6060 &v, &outpos, &p)) { 6061 goto onError; 6062 } 6063 } 6064 else { 6065 p++; 6066 s += Py_UNICODE_SIZE; 6067 } 6068 } 6069 6070 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0) 6071 goto onError; 6072 Py_XDECREF(errorHandler); 6073 Py_XDECREF(exc); 6074 if (_PyUnicode_READY_REPLACE(&v)) { 6075 Py_DECREF(v); 6076 return NULL; 6077 } 6078 return (PyObject *)v; 6079 6080 onError: 6081 Py_XDECREF(v); 6082 Py_XDECREF(errorHandler); 6083 Py_XDECREF(exc); 6084 return NULL; 6085} 6086 6087/* --- Latin-1 Codec ------------------------------------------------------ */ 6088 6089PyObject * 6090PyUnicode_DecodeLatin1(const char *s, 6091 Py_ssize_t size, 6092 const char *errors) 6093{ 6094 /* Latin-1 is equivalent to the 
first 256 ordinals in Unicode. */ 6095 return _PyUnicode_FromUCS1((unsigned char*)s, size); 6096} 6097 6098/* create or adjust a UnicodeEncodeError */ 6099static void 6100make_encode_exception(PyObject **exceptionObject, 6101 const char *encoding, 6102 const Py_UNICODE *unicode, Py_ssize_t size, 6103 Py_ssize_t startpos, Py_ssize_t endpos, 6104 const char *reason) 6105{ 6106 if (*exceptionObject == NULL) { 6107 *exceptionObject = PyUnicodeEncodeError_Create( 6108 encoding, unicode, size, startpos, endpos, reason); 6109 } 6110 else { 6111 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos)) 6112 goto onError; 6113 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos)) 6114 goto onError; 6115 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason)) 6116 goto onError; 6117 return; 6118 onError: 6119 Py_DECREF(*exceptionObject); 6120 *exceptionObject = NULL; 6121 } 6122} 6123 6124/* raises a UnicodeEncodeError */ 6125static void 6126raise_encode_exception(PyObject **exceptionObject, 6127 const char *encoding, 6128 const Py_UNICODE *unicode, Py_ssize_t size, 6129 Py_ssize_t startpos, Py_ssize_t endpos, 6130 const char *reason) 6131{ 6132 make_encode_exception(exceptionObject, 6133 encoding, unicode, size, startpos, endpos, reason); 6134 if (*exceptionObject != NULL) 6135 PyCodec_StrictErrors(*exceptionObject); 6136} 6137 6138/* error handling callback helper: 6139 build arguments, call the callback and check the arguments, 6140 put the result into newpos and return the replacement string, which 6141 has to be freed by the caller */ 6142static PyObject * 6143unicode_encode_call_errorhandler(const char *errors, 6144 PyObject **errorHandler, 6145 const char *encoding, const char *reason, 6146 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject, 6147 Py_ssize_t startpos, Py_ssize_t endpos, 6148 Py_ssize_t *newpos) 6149{ 6150 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple"; 6151 6152 PyObject 
*restuple; 6153 PyObject *resunicode; 6154 6155 if (*errorHandler == NULL) { 6156 *errorHandler = PyCodec_LookupError(errors); 6157 if (*errorHandler == NULL) 6158 return NULL; 6159 } 6160 6161 make_encode_exception(exceptionObject, 6162 encoding, unicode, size, startpos, endpos, reason); 6163 if (*exceptionObject == NULL) 6164 return NULL; 6165 6166 restuple = PyObject_CallFunctionObjArgs( 6167 *errorHandler, *exceptionObject, NULL); 6168 if (restuple == NULL) 6169 return NULL; 6170 if (!PyTuple_Check(restuple)) { 6171 PyErr_SetString(PyExc_TypeError, &argparse[3]); 6172 Py_DECREF(restuple); 6173 return NULL; 6174 } 6175 if (!PyArg_ParseTuple(restuple, argparse, 6176 &resunicode, newpos)) { 6177 Py_DECREF(restuple); 6178 return NULL; 6179 } 6180 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) { 6181 PyErr_SetString(PyExc_TypeError, &argparse[3]); 6182 Py_DECREF(restuple); 6183 return NULL; 6184 } 6185 if (*newpos<0) 6186 *newpos = size+*newpos; 6187 if (*newpos<0 || *newpos>size) { 6188 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos); 6189 Py_DECREF(restuple); 6190 return NULL; 6191 } 6192 Py_INCREF(resunicode); 6193 Py_DECREF(restuple); 6194 return resunicode; 6195} 6196 6197static PyObject * 6198unicode_encode_ucs1(const Py_UNICODE *p, 6199 Py_ssize_t size, 6200 const char *errors, 6201 int limit) 6202{ 6203 /* output object */ 6204 PyObject *res; 6205 /* pointers to the beginning and end+1 of input */ 6206 const Py_UNICODE *startp = p; 6207 const Py_UNICODE *endp = p + size; 6208 /* pointer to the beginning of the unencodable characters */ 6209 /* const Py_UNICODE *badp = NULL; */ 6210 /* pointer into the output */ 6211 char *str; 6212 /* current output position */ 6213 Py_ssize_t ressize; 6214 const char *encoding = (limit == 256) ? "latin-1" : "ascii"; 6215 const char *reason = (limit == 256) ? 
"ordinal not in range(256)" : "ordinal not in range(128)"; 6216 PyObject *errorHandler = NULL; 6217 PyObject *exc = NULL; 6218 /* the following variable is used for caching string comparisons 6219 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */ 6220 int known_errorHandler = -1; 6221 6222 /* allocate enough for a simple encoding without 6223 replacements, if we need more, we'll resize */ 6224 if (size == 0) 6225 return PyBytes_FromStringAndSize(NULL, 0); 6226 res = PyBytes_FromStringAndSize(NULL, size); 6227 if (res == NULL) 6228 return NULL; 6229 str = PyBytes_AS_STRING(res); 6230 ressize = size; 6231 6232 while (p<endp) { 6233 Py_UNICODE c = *p; 6234 6235 /* can we encode this? */ 6236 if (c<limit) { 6237 /* no overflow check, because we know that the space is enough */ 6238 *str++ = (char)c; 6239 ++p; 6240 } 6241 else { 6242 Py_ssize_t unicodepos = p-startp; 6243 Py_ssize_t requiredsize; 6244 PyObject *repunicode; 6245 Py_ssize_t repsize; 6246 Py_ssize_t newpos; 6247 Py_ssize_t respos; 6248 Py_UNICODE *uni2; 6249 /* startpos for collecting unencodable chars */ 6250 const Py_UNICODE *collstart = p; 6251 const Py_UNICODE *collend = p; 6252 /* find all unecodable characters */ 6253 while ((collend < endp) && ((*collend)>=limit)) 6254 ++collend; 6255 /* cache callback name lookup (if not done yet, i.e. 
it's the first error) */ 6256 if (known_errorHandler==-1) { 6257 if ((errors==NULL) || (!strcmp(errors, "strict"))) 6258 known_errorHandler = 1; 6259 else if (!strcmp(errors, "replace")) 6260 known_errorHandler = 2; 6261 else if (!strcmp(errors, "ignore")) 6262 known_errorHandler = 3; 6263 else if (!strcmp(errors, "xmlcharrefreplace")) 6264 known_errorHandler = 4; 6265 else 6266 known_errorHandler = 0; 6267 } 6268 switch (known_errorHandler) { 6269 case 1: /* strict */ 6270 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason); 6271 goto onError; 6272 case 2: /* replace */ 6273 while (collstart++<collend) 6274 *str++ = '?'; /* fall through */ 6275 case 3: /* ignore */ 6276 p = collend; 6277 break; 6278 case 4: /* xmlcharrefreplace */ 6279 respos = str - PyBytes_AS_STRING(res); 6280 /* determine replacement size (temporarily (mis)uses p) */ 6281 for (p = collstart, repsize = 0; p < collend; ++p) { 6282 if (*p<10) 6283 repsize += 2+1+1; 6284 else if (*p<100) 6285 repsize += 2+2+1; 6286 else if (*p<1000) 6287 repsize += 2+3+1; 6288 else if (*p<10000) 6289 repsize += 2+4+1; 6290#ifndef Py_UNICODE_WIDE 6291 else 6292 repsize += 2+5+1; 6293#else 6294 else if (*p<100000) 6295 repsize += 2+5+1; 6296 else if (*p<1000000) 6297 repsize += 2+6+1; 6298 else 6299 repsize += 2+7+1; 6300#endif 6301 } 6302 requiredsize = respos+repsize+(endp-collend); 6303 if (requiredsize > ressize) { 6304 if (requiredsize<2*ressize) 6305 requiredsize = 2*ressize; 6306 if (_PyBytes_Resize(&res, requiredsize)) 6307 goto onError; 6308 str = PyBytes_AS_STRING(res) + respos; 6309 ressize = requiredsize; 6310 } 6311 /* generate replacement (temporarily (mis)uses p) */ 6312 for (p = collstart; p < collend; ++p) { 6313 str += sprintf(str, "&#%d;", (int)*p); 6314 } 6315 p = collend; 6316 break; 6317 default: 6318 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler, 6319 encoding, reason, startp, size, &exc, 6320 collstart-startp, collend-startp, 
&newpos); 6321 if (repunicode == NULL) 6322 goto onError; 6323 if (PyBytes_Check(repunicode)) { 6324 /* Directly copy bytes result to output. */ 6325 repsize = PyBytes_Size(repunicode); 6326 if (repsize > 1) { 6327 /* Make room for all additional bytes. */ 6328 respos = str - PyBytes_AS_STRING(res); 6329 if (_PyBytes_Resize(&res, ressize+repsize-1)) { 6330 Py_DECREF(repunicode); 6331 goto onError; 6332 } 6333 str = PyBytes_AS_STRING(res) + respos; 6334 ressize += repsize-1; 6335 } 6336 memcpy(str, PyBytes_AsString(repunicode), repsize); 6337 str += repsize; 6338 p = startp + newpos; 6339 Py_DECREF(repunicode); 6340 break; 6341 } 6342 /* need more space? (at least enough for what we 6343 have+the replacement+the rest of the string, so 6344 we won't have to check space for encodable characters) */ 6345 respos = str - PyBytes_AS_STRING(res); 6346 repsize = PyUnicode_GET_SIZE(repunicode); 6347 requiredsize = respos+repsize+(endp-collend); 6348 if (requiredsize > ressize) { 6349 if (requiredsize<2*ressize) 6350 requiredsize = 2*ressize; 6351 if (_PyBytes_Resize(&res, requiredsize)) { 6352 Py_DECREF(repunicode); 6353 goto onError; 6354 } 6355 str = PyBytes_AS_STRING(res) + respos; 6356 ressize = requiredsize; 6357 } 6358 /* check if there is anything unencodable in the replacement 6359 and copy it to the output */ 6360 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) { 6361 c = *uni2; 6362 if (c >= limit) { 6363 raise_encode_exception(&exc, encoding, startp, size, 6364 unicodepos, unicodepos+1, reason); 6365 Py_DECREF(repunicode); 6366 goto onError; 6367 } 6368 *str = (char)c; 6369 } 6370 p = startp + newpos; 6371 Py_DECREF(repunicode); 6372 } 6373 } 6374 } 6375 /* Resize if we allocated to much */ 6376 size = str - PyBytes_AS_STRING(res); 6377 if (size < ressize) { /* If this falls res will be NULL */ 6378 assert(size >= 0); 6379 if (_PyBytes_Resize(&res, size) < 0) 6380 goto onError; 6381 } 6382 6383 Py_XDECREF(errorHandler); 6384 
Py_XDECREF(exc); 6385 return res; 6386 6387 onError: 6388 Py_XDECREF(res); 6389 Py_XDECREF(errorHandler); 6390 Py_XDECREF(exc); 6391 return NULL; 6392} 6393 6394PyObject * 6395PyUnicode_EncodeLatin1(const Py_UNICODE *p, 6396 Py_ssize_t size, 6397 const char *errors) 6398{ 6399 return unicode_encode_ucs1(p, size, errors, 256); 6400} 6401 6402PyObject * 6403_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors) 6404{ 6405 if (!PyUnicode_Check(unicode)) { 6406 PyErr_BadArgument(); 6407 return NULL; 6408 } 6409 if (PyUnicode_READY(unicode) == -1) 6410 return NULL; 6411 /* Fast path: if it is a one-byte string, construct 6412 bytes object directly. */ 6413 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) 6414 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode), 6415 PyUnicode_GET_LENGTH(unicode)); 6416 /* Non-Latin-1 characters present. Defer to above function to 6417 raise the exception. */ 6418 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode), 6419 PyUnicode_GET_SIZE(unicode), 6420 errors); 6421} 6422 6423PyObject* 6424PyUnicode_AsLatin1String(PyObject *unicode) 6425{ 6426 return _PyUnicode_AsLatin1String(unicode, NULL); 6427} 6428 6429/* --- 7-bit ASCII Codec -------------------------------------------------- */ 6430 6431PyObject * 6432PyUnicode_DecodeASCII(const char *s, 6433 Py_ssize_t size, 6434 const char *errors) 6435{ 6436 const char *starts = s; 6437 PyUnicodeObject *v; 6438 Py_UNICODE *p; 6439 Py_ssize_t startinpos; 6440 Py_ssize_t endinpos; 6441 Py_ssize_t outpos; 6442 const char *e; 6443 unsigned char* d; 6444 PyObject *errorHandler = NULL; 6445 PyObject *exc = NULL; 6446 Py_ssize_t i; 6447 6448 /* ASCII is equivalent to the first 128 ordinals in Unicode. */ 6449 if (size == 1 && *(unsigned char*)s < 128) 6450 return PyUnicode_FromOrdinal(*(unsigned char*)s); 6451 6452 /* Fast path. Assume the input actually *is* ASCII, and allocate 6453 a single-block Unicode object with that assumption. 
If there is 6454 an error, drop the object and start over. */ 6455 v = (PyUnicodeObject*)PyUnicode_New(size, 127); 6456 if (v == NULL) 6457 goto onError; 6458 d = PyUnicode_1BYTE_DATA(v); 6459 for (i = 0; i < size; i++) { 6460 unsigned char ch = ((unsigned char*)s)[i]; 6461 if (ch < 128) 6462 d[i] = ch; 6463 else 6464 break; 6465 } 6466 if (i == size) 6467 return (PyObject*)v; 6468 Py_DECREF(v); /* start over */ 6469 6470 v = _PyUnicode_New(size); 6471 if (v == NULL) 6472 goto onError; 6473 if (size == 0) 6474 return (PyObject *)v; 6475 p = PyUnicode_AS_UNICODE(v); 6476 e = s + size; 6477 while (s < e) { 6478 register unsigned char c = (unsigned char)*s; 6479 if (c < 128) { 6480 *p++ = c; 6481 ++s; 6482 } 6483 else { 6484 startinpos = s-starts; 6485 endinpos = startinpos + 1; 6486 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v); 6487 if (unicode_decode_call_errorhandler( 6488 errors, &errorHandler, 6489 "ascii", "ordinal not in range(128)", 6490 &starts, &e, &startinpos, &endinpos, &exc, &s, 6491 &v, &outpos, &p)) 6492 goto onError; 6493 } 6494 } 6495 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v)) 6496 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0) 6497 goto onError; 6498 Py_XDECREF(errorHandler); 6499 Py_XDECREF(exc); 6500 if (_PyUnicode_READY_REPLACE(&v)) { 6501 Py_DECREF(v); 6502 return NULL; 6503 } 6504 return (PyObject *)v; 6505 6506 onError: 6507 Py_XDECREF(v); 6508 Py_XDECREF(errorHandler); 6509 Py_XDECREF(exc); 6510 return NULL; 6511} 6512 6513PyObject * 6514PyUnicode_EncodeASCII(const Py_UNICODE *p, 6515 Py_ssize_t size, 6516 const char *errors) 6517{ 6518 return unicode_encode_ucs1(p, size, errors, 128); 6519} 6520 6521PyObject * 6522_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors) 6523{ 6524 if (!PyUnicode_Check(unicode)) { 6525 PyErr_BadArgument(); 6526 return NULL; 6527 } 6528 if (PyUnicode_READY(unicode) == -1) 6529 return NULL; 6530 /* Fast path: if it is an ASCII-only string, construct bytes object 
6531 directly. Else defer to above function to raise the exception. */ 6532 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128) 6533 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode), 6534 PyUnicode_GET_LENGTH(unicode)); 6535 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode), 6536 PyUnicode_GET_SIZE(unicode), 6537 errors); 6538} 6539 6540PyObject * 6541PyUnicode_AsASCIIString(PyObject *unicode) 6542{ 6543 return _PyUnicode_AsASCIIString(unicode, NULL); 6544} 6545 6546#ifdef HAVE_MBCS 6547 6548/* --- MBCS codecs for Windows -------------------------------------------- */ 6549 6550#if SIZEOF_INT < SIZEOF_SIZE_T 6551#define NEED_RETRY 6552#endif 6553 6554/* XXX This code is limited to "true" double-byte encodings, as 6555 a) it assumes an incomplete character consists of a single byte, and 6556 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte 6557 encodings, see IsDBCSLeadByteEx documentation. */ 6558 6559static int 6560is_dbcs_lead_byte(const char *s, int offset) 6561{ 6562 const char *curr = s + offset; 6563 6564 if (IsDBCSLeadByte(*curr)) { 6565 const char *prev = CharPrev(s, curr); 6566 return (prev == curr) || !IsDBCSLeadByte(*prev) || (curr - prev == 2); 6567 } 6568 return 0; 6569} 6570 6571/* 6572 * Decode MBCS string into unicode object. If 'final' is set, converts 6573 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise. 
6574 */ 6575static int 6576decode_mbcs(PyUnicodeObject **v, 6577 const char *s, /* MBCS string */ 6578 int size, /* sizeof MBCS string */ 6579 int final, 6580 const char *errors) 6581{ 6582 Py_UNICODE *p; 6583 Py_ssize_t n; 6584 DWORD usize; 6585 DWORD flags; 6586 6587 assert(size >= 0); 6588 6589 /* check and handle 'errors' arg */ 6590 if (errors==NULL || strcmp(errors, "strict")==0) 6591 flags = MB_ERR_INVALID_CHARS; 6592 else if (strcmp(errors, "ignore")==0) 6593 flags = 0; 6594 else { 6595 PyErr_Format(PyExc_ValueError, 6596 "mbcs encoding does not support errors='%s'", 6597 errors); 6598 return -1; 6599 } 6600 6601 /* Skip trailing lead-byte unless 'final' is set */ 6602 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1)) 6603 --size; 6604 6605 /* First get the size of the result */ 6606 if (size > 0) { 6607 usize = MultiByteToWideChar(CP_ACP, flags, s, size, NULL, 0); 6608 if (usize==0) 6609 goto mbcs_decode_error; 6610 } else 6611 usize = 0; 6612 6613 if (*v == NULL) { 6614 /* Create unicode object */ 6615 *v = _PyUnicode_New(usize); 6616 if (*v == NULL) 6617 return -1; 6618 n = 0; 6619 } 6620 else { 6621 /* Extend unicode object */ 6622 n = PyUnicode_GET_SIZE(*v); 6623 if (PyUnicode_Resize((PyObject**)v, n + usize) < 0) 6624 return -1; 6625 } 6626 6627 /* Do the conversion */ 6628 if (usize > 0) { 6629 p = PyUnicode_AS_UNICODE(*v) + n; 6630 if (0 == MultiByteToWideChar(CP_ACP, flags, s, size, p, usize)) { 6631 goto mbcs_decode_error; 6632 } 6633 } 6634 return size; 6635 6636mbcs_decode_error: 6637 /* If the last error was ERROR_NO_UNICODE_TRANSLATION, then 6638 we raise a UnicodeDecodeError - else it is a 'generic' 6639 windows error 6640 */ 6641 if (GetLastError()==ERROR_NO_UNICODE_TRANSLATION) { 6642 /* Ideally, we should get reason from FormatMessage - this 6643 is the Windows 2000 English version of the message 6644 */ 6645 PyObject *exc = NULL; 6646 const char *reason = "No mapping for the Unicode character exists " 6647 "in the target 
multi-byte code page."; 6648 make_decode_exception(&exc, "mbcs", s, size, 0, 0, reason); 6649 if (exc != NULL) { 6650 PyCodec_StrictErrors(exc); 6651 Py_DECREF(exc); 6652 } 6653 } else { 6654 PyErr_SetFromWindowsErrWithFilename(0, NULL); 6655 } 6656 return -1; 6657} 6658 6659PyObject * 6660PyUnicode_DecodeMBCSStateful(const char *s, 6661 Py_ssize_t size, 6662 const char *errors, 6663 Py_ssize_t *consumed) 6664{ 6665 PyUnicodeObject *v = NULL; 6666 int done; 6667 6668 if (consumed) 6669 *consumed = 0; 6670 6671#ifdef NEED_RETRY 6672 retry: 6673 if (size > INT_MAX) 6674 done = decode_mbcs(&v, s, INT_MAX, 0, errors); 6675 else 6676#endif 6677 done = decode_mbcs(&v, s, (int)size, !consumed, errors); 6678 6679 if (done < 0) { 6680 Py_XDECREF(v); 6681 return NULL; 6682 } 6683 6684 if (consumed) 6685 *consumed += done; 6686 6687#ifdef NEED_RETRY 6688 if (size > INT_MAX) { 6689 s += done; 6690 size -= done; 6691 goto retry; 6692 } 6693#endif 6694 if (_PyUnicode_READY_REPLACE(&v)) { 6695 Py_DECREF(v); 6696 return NULL; 6697 } 6698 return (PyObject *)v; 6699} 6700 6701PyObject * 6702PyUnicode_DecodeMBCS(const char *s, 6703 Py_ssize_t size, 6704 const char *errors) 6705{ 6706 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL); 6707} 6708 6709/* 6710 * Convert unicode into string object (MBCS). 6711 * Returns 0 if succeed, -1 otherwise. 
6712 */ 6713static int 6714encode_mbcs(PyObject **repr, 6715 const Py_UNICODE *p, /* unicode */ 6716 int size, /* size of unicode */ 6717 const char* errors) 6718{ 6719 BOOL usedDefaultChar = FALSE; 6720 BOOL *pusedDefaultChar; 6721 int mbcssize; 6722 Py_ssize_t n; 6723 PyObject *exc = NULL; 6724 DWORD flags; 6725 6726 assert(size >= 0); 6727 6728 /* check and handle 'errors' arg */ 6729 if (errors==NULL || strcmp(errors, "strict")==0) { 6730 flags = WC_NO_BEST_FIT_CHARS; 6731 pusedDefaultChar = &usedDefaultChar; 6732 } else if (strcmp(errors, "replace")==0) { 6733 flags = 0; 6734 pusedDefaultChar = NULL; 6735 } else { 6736 PyErr_Format(PyExc_ValueError, 6737 "mbcs encoding does not support errors='%s'", 6738 errors); 6739 return -1; 6740 } 6741 6742 /* First get the size of the result */ 6743 if (size > 0) { 6744 mbcssize = WideCharToMultiByte(CP_ACP, flags, p, size, NULL, 0, 6745 NULL, pusedDefaultChar); 6746 if (mbcssize == 0) { 6747 PyErr_SetFromWindowsErrWithFilename(0, NULL); 6748 return -1; 6749 } 6750 /* If we used a default char, then we failed! 
*/ 6751 if (pusedDefaultChar && *pusedDefaultChar) 6752 goto mbcs_encode_error; 6753 } else { 6754 mbcssize = 0; 6755 } 6756 6757 if (*repr == NULL) { 6758 /* Create string object */ 6759 *repr = PyBytes_FromStringAndSize(NULL, mbcssize); 6760 if (*repr == NULL) 6761 return -1; 6762 n = 0; 6763 } 6764 else { 6765 /* Extend string object */ 6766 n = PyBytes_Size(*repr); 6767 if (_PyBytes_Resize(repr, n + mbcssize) < 0) 6768 return -1; 6769 } 6770 6771 /* Do the conversion */ 6772 if (size > 0) { 6773 char *s = PyBytes_AS_STRING(*repr) + n; 6774 if (0 == WideCharToMultiByte(CP_ACP, flags, p, size, s, mbcssize, 6775 NULL, pusedDefaultChar)) { 6776 PyErr_SetFromWindowsErrWithFilename(0, NULL); 6777 return -1; 6778 } 6779 if (pusedDefaultChar && *pusedDefaultChar) 6780 goto mbcs_encode_error; 6781 } 6782 return 0; 6783 6784mbcs_encode_error: 6785 raise_encode_exception(&exc, "mbcs", p, size, 0, 0, "invalid character"); 6786 Py_XDECREF(exc); 6787 return -1; 6788} 6789 6790PyObject * 6791PyUnicode_EncodeMBCS(const Py_UNICODE *p, 6792 Py_ssize_t size, 6793 const char *errors) 6794{ 6795 PyObject *repr = NULL; 6796 int ret; 6797 6798#ifdef NEED_RETRY 6799 retry: 6800 if (size > INT_MAX) 6801 ret = encode_mbcs(&repr, p, INT_MAX, errors); 6802 else 6803#endif 6804 ret = encode_mbcs(&repr, p, (int)size, errors); 6805 6806 if (ret < 0) { 6807 Py_XDECREF(repr); 6808 return NULL; 6809 } 6810 6811#ifdef NEED_RETRY 6812 if (size > INT_MAX) { 6813 p += INT_MAX; 6814 size -= INT_MAX; 6815 goto retry; 6816 } 6817#endif 6818 6819 return repr; 6820} 6821 6822PyObject * 6823PyUnicode_AsMBCSString(PyObject *unicode) 6824{ 6825 if (!PyUnicode_Check(unicode)) { 6826 PyErr_BadArgument(); 6827 return NULL; 6828 } 6829 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode), 6830 PyUnicode_GET_SIZE(unicode), 6831 NULL); 6832} 6833 6834#undef NEED_RETRY 6835 6836#endif /* HAVE_MBCS */ 6837 6838/* --- Character Mapping Codec -------------------------------------------- */ 6839 6840PyObject * 
6841PyUnicode_DecodeCharmap(const char *s, 6842 Py_ssize_t size, 6843 PyObject *mapping, 6844 const char *errors) 6845{ 6846 const char *starts = s; 6847 Py_ssize_t startinpos; 6848 Py_ssize_t endinpos; 6849 Py_ssize_t outpos; 6850 const char *e; 6851 PyUnicodeObject *v; 6852 Py_UNICODE *p; 6853 Py_ssize_t extrachars = 0; 6854 PyObject *errorHandler = NULL; 6855 PyObject *exc = NULL; 6856 Py_UNICODE *mapstring = NULL; 6857 Py_ssize_t maplen = 0; 6858 6859 /* Default to Latin-1 */ 6860 if (mapping == NULL) 6861 return PyUnicode_DecodeLatin1(s, size, errors); 6862 6863 v = _PyUnicode_New(size); 6864 if (v == NULL) 6865 goto onError; 6866 if (size == 0) 6867 return (PyObject *)v; 6868 p = PyUnicode_AS_UNICODE(v); 6869 e = s + size; 6870 if (PyUnicode_CheckExact(mapping)) { 6871 mapstring = PyUnicode_AS_UNICODE(mapping); 6872 maplen = PyUnicode_GET_SIZE(mapping); 6873 while (s < e) { 6874 unsigned char ch = *s; 6875 Py_UNICODE x = 0xfffe; /* illegal value */ 6876 6877 if (ch < maplen) 6878 x = mapstring[ch]; 6879 6880 if (x == 0xfffe) { 6881 /* undefined mapping */ 6882 outpos = p-PyUnicode_AS_UNICODE(v); 6883 startinpos = s-starts; 6884 endinpos = startinpos+1; 6885 if (unicode_decode_call_errorhandler( 6886 errors, &errorHandler, 6887 "charmap", "character maps to <undefined>", 6888 &starts, &e, &startinpos, &endinpos, &exc, &s, 6889 &v, &outpos, &p)) { 6890 goto onError; 6891 } 6892 continue; 6893 } 6894 *p++ = x; 6895 ++s; 6896 } 6897 } 6898 else { 6899 while (s < e) { 6900 unsigned char ch = *s; 6901 PyObject *w, *x; 6902 6903 /* Get mapping (char ordinal -> integer, Unicode char or None) */ 6904 w = PyLong_FromLong((long)ch); 6905 if (w == NULL) 6906 goto onError; 6907 x = PyObject_GetItem(mapping, w); 6908 Py_DECREF(w); 6909 if (x == NULL) { 6910 if (PyErr_ExceptionMatches(PyExc_LookupError)) { 6911 /* No mapping found means: mapping is undefined. 
*/ 6912 PyErr_Clear(); 6913 x = Py_None; 6914 Py_INCREF(x); 6915 } else 6916 goto onError; 6917 } 6918 6919 /* Apply mapping */ 6920 if (PyLong_Check(x)) { 6921 long value = PyLong_AS_LONG(x); 6922 if (value < 0 || value > 65535) { 6923 PyErr_SetString(PyExc_TypeError, 6924 "character mapping must be in range(65536)"); 6925 Py_DECREF(x); 6926 goto onError; 6927 } 6928 *p++ = (Py_UNICODE)value; 6929 } 6930 else if (x == Py_None) { 6931 /* undefined mapping */ 6932 outpos = p-PyUnicode_AS_UNICODE(v); 6933 startinpos = s-starts; 6934 endinpos = startinpos+1; 6935 if (unicode_decode_call_errorhandler( 6936 errors, &errorHandler, 6937 "charmap", "character maps to <undefined>", 6938 &starts, &e, &startinpos, &endinpos, &exc, &s, 6939 &v, &outpos, &p)) { 6940 Py_DECREF(x); 6941 goto onError; 6942 } 6943 Py_DECREF(x); 6944 continue; 6945 } 6946 else if (PyUnicode_Check(x)) { 6947 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x); 6948 6949 if (targetsize == 1) 6950 /* 1-1 mapping */ 6951 *p++ = *PyUnicode_AS_UNICODE(x); 6952 6953 else if (targetsize > 1) { 6954 /* 1-n mapping */ 6955 if (targetsize > extrachars) { 6956 /* resize first */ 6957 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v); 6958 Py_ssize_t needed = (targetsize - extrachars) + \ 6959 (targetsize << 2); 6960 extrachars += needed; 6961 /* XXX overflow detection missing */ 6962 if (PyUnicode_Resize((PyObject**)&v, 6963 PyUnicode_GET_SIZE(v) + needed) < 0) { 6964 Py_DECREF(x); 6965 goto onError; 6966 } 6967 p = PyUnicode_AS_UNICODE(v) + oldpos; 6968 } 6969 Py_UNICODE_COPY(p, 6970 PyUnicode_AS_UNICODE(x), 6971 targetsize); 6972 p += targetsize; 6973 extrachars -= targetsize; 6974 } 6975 /* 1-0 mapping: skip the character */ 6976 } 6977 else { 6978 /* wrong return value */ 6979 PyErr_SetString(PyExc_TypeError, 6980 "character mapping must return integer, None or str"); 6981 Py_DECREF(x); 6982 goto onError; 6983 } 6984 Py_DECREF(x); 6985 ++s; 6986 } 6987 } 6988 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v)) 
6989 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0) 6990 goto onError; 6991 Py_XDECREF(errorHandler); 6992 Py_XDECREF(exc); 6993 if (_PyUnicode_READY_REPLACE(&v)) { 6994 Py_DECREF(v); 6995 return NULL; 6996 } 6997 return (PyObject *)v; 6998 6999 onError: 7000 Py_XDECREF(errorHandler); 7001 Py_XDECREF(exc); 7002 Py_XDECREF(v); 7003 return NULL; 7004} 7005 7006/* Charmap encoding: the lookup table */ 7007 7008struct encoding_map { 7009 PyObject_HEAD 7010 unsigned char level1[32]; 7011 int count2, count3; 7012 unsigned char level23[1]; 7013}; 7014 7015static PyObject* 7016encoding_map_size(PyObject *obj, PyObject* args) 7017{ 7018 struct encoding_map *map = (struct encoding_map*)obj; 7019 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 + 7020 128*map->count3); 7021} 7022 7023static PyMethodDef encoding_map_methods[] = { 7024 {"size", encoding_map_size, METH_NOARGS, 7025 PyDoc_STR("Return the size (in bytes) of this object") }, 7026 { 0 } 7027}; 7028 7029static void 7030encoding_map_dealloc(PyObject* o) 7031{ 7032 PyObject_FREE(o); 7033} 7034 7035static PyTypeObject EncodingMapType = { 7036 PyVarObject_HEAD_INIT(NULL, 0) 7037 "EncodingMap", /*tp_name*/ 7038 sizeof(struct encoding_map), /*tp_basicsize*/ 7039 0, /*tp_itemsize*/ 7040 /* methods */ 7041 encoding_map_dealloc, /*tp_dealloc*/ 7042 0, /*tp_print*/ 7043 0, /*tp_getattr*/ 7044 0, /*tp_setattr*/ 7045 0, /*tp_reserved*/ 7046 0, /*tp_repr*/ 7047 0, /*tp_as_number*/ 7048 0, /*tp_as_sequence*/ 7049 0, /*tp_as_mapping*/ 7050 0, /*tp_hash*/ 7051 0, /*tp_call*/ 7052 0, /*tp_str*/ 7053 0, /*tp_getattro*/ 7054 0, /*tp_setattro*/ 7055 0, /*tp_as_buffer*/ 7056 Py_TPFLAGS_DEFAULT, /*tp_flags*/ 7057 0, /*tp_doc*/ 7058 0, /*tp_traverse*/ 7059 0, /*tp_clear*/ 7060 0, /*tp_richcompare*/ 7061 0, /*tp_weaklistoffset*/ 7062 0, /*tp_iter*/ 7063 0, /*tp_iternext*/ 7064 encoding_map_methods, /*tp_methods*/ 7065 0, /*tp_members*/ 7066 0, /*tp_getset*/ 7067 0, /*tp_base*/ 7068 0, /*tp_dict*/ 7069 0, 
/*tp_descr_get*/ 7070 0, /*tp_descr_set*/ 7071 0, /*tp_dictoffset*/ 7072 0, /*tp_init*/ 7073 0, /*tp_alloc*/ 7074 0, /*tp_new*/ 7075 0, /*tp_free*/ 7076 0, /*tp_is_gc*/ 7077}; 7078 7079PyObject* 7080PyUnicode_BuildEncodingMap(PyObject* string) 7081{ 7082 PyObject *result; 7083 struct encoding_map *mresult; 7084 int i; 7085 int need_dict = 0; 7086 unsigned char level1[32]; 7087 unsigned char level2[512]; 7088 unsigned char *mlevel1, *mlevel2, *mlevel3; 7089 int count2 = 0, count3 = 0; 7090 int kind; 7091 void *data; 7092 Py_UCS4 ch; 7093 7094 if (!PyUnicode_Check(string) || PyUnicode_GET_LENGTH(string) != 256) { 7095 PyErr_BadArgument(); 7096 return NULL; 7097 } 7098 kind = PyUnicode_KIND(string); 7099 data = PyUnicode_DATA(string); 7100 memset(level1, 0xFF, sizeof level1); 7101 memset(level2, 0xFF, sizeof level2); 7102 7103 /* If there isn't a one-to-one mapping of NULL to \0, 7104 or if there are non-BMP characters, we need to use 7105 a mapping dictionary. */ 7106 if (PyUnicode_READ(kind, data, 0) != 0) 7107 need_dict = 1; 7108 for (i = 1; i < 256; i++) { 7109 int l1, l2; 7110 ch = PyUnicode_READ(kind, data, i); 7111 if (ch == 0 || ch > 0xFFFF) { 7112 need_dict = 1; 7113 break; 7114 } 7115 if (ch == 0xFFFE) 7116 /* unmapped character */ 7117 continue; 7118 l1 = ch >> 11; 7119 l2 = ch >> 7; 7120 if (level1[l1] == 0xFF) 7121 level1[l1] = count2++; 7122 if (level2[l2] == 0xFF) 7123 level2[l2] = count3++; 7124 } 7125 7126 if (count2 >= 0xFF || count3 >= 0xFF) 7127 need_dict = 1; 7128 7129 if (need_dict) { 7130 PyObject *result = PyDict_New(); 7131 PyObject *key, *value; 7132 if (!result) 7133 return NULL; 7134 for (i = 0; i < 256; i++) { 7135 key = PyLong_FromLong(PyUnicode_READ(kind, data, i)); 7136 value = PyLong_FromLong(i); 7137 if (!key || !value) 7138 goto failed1; 7139 if (PyDict_SetItem(result, key, value) == -1) 7140 goto failed1; 7141 Py_DECREF(key); 7142 Py_DECREF(value); 7143 } 7144 return result; 7145 failed1: 7146 Py_XDECREF(key); 7147 
Py_XDECREF(value); 7148 Py_DECREF(result); 7149 return NULL; 7150 } 7151 7152 /* Create a three-level trie */ 7153 result = PyObject_MALLOC(sizeof(struct encoding_map) + 7154 16*count2 + 128*count3 - 1); 7155 if (!result) 7156 return PyErr_NoMemory(); 7157 PyObject_Init(result, &EncodingMapType); 7158 mresult = (struct encoding_map*)result; 7159 mresult->count2 = count2; 7160 mresult->count3 = count3; 7161 mlevel1 = mresult->level1; 7162 mlevel2 = mresult->level23; 7163 mlevel3 = mresult->level23 + 16*count2; 7164 memcpy(mlevel1, level1, 32); 7165 memset(mlevel2, 0xFF, 16*count2); 7166 memset(mlevel3, 0, 128*count3); 7167 count3 = 0; 7168 for (i = 1; i < 256; i++) { 7169 int o1, o2, o3, i2, i3; 7170 if (PyUnicode_READ(kind, data, i) == 0xFFFE) 7171 /* unmapped character */ 7172 continue; 7173 o1 = PyUnicode_READ(kind, data, i)>>11; 7174 o2 = (PyUnicode_READ(kind, data, i)>>7) & 0xF; 7175 i2 = 16*mlevel1[o1] + o2; 7176 if (mlevel2[i2] == 0xFF) 7177 mlevel2[i2] = count3++; 7178 o3 = PyUnicode_READ(kind, data, i) & 0x7F; 7179 i3 = 128*mlevel2[i2] + o3; 7180 mlevel3[i3] = i; 7181 } 7182 return result; 7183} 7184 7185static int 7186encoding_map_lookup(Py_UNICODE c, PyObject *mapping) 7187{ 7188 struct encoding_map *map = (struct encoding_map*)mapping; 7189 int l1 = c>>11; 7190 int l2 = (c>>7) & 0xF; 7191 int l3 = c & 0x7F; 7192 int i; 7193 7194#ifdef Py_UNICODE_WIDE 7195 if (c > 0xFFFF) { 7196 return -1; 7197 } 7198#endif 7199 if (c == 0) 7200 return 0; 7201 /* level 1*/ 7202 i = map->level1[l1]; 7203 if (i == 0xFF) { 7204 return -1; 7205 } 7206 /* level 2*/ 7207 i = map->level23[16*i+l2]; 7208 if (i == 0xFF) { 7209 return -1; 7210 } 7211 /* level 3 */ 7212 i = map->level23[16*map->count2 + 128*i + l3]; 7213 if (i == 0) { 7214 return -1; 7215 } 7216 return i; 7217} 7218 7219/* Lookup the character ch in the mapping. If the character 7220 can't be found, Py_None is returned (or NULL, if another 7221 error occurred). 
*/ 7222static PyObject * 7223charmapencode_lookup(Py_UNICODE c, PyObject *mapping) 7224{ 7225 PyObject *w = PyLong_FromLong((long)c); 7226 PyObject *x; 7227 7228 if (w == NULL) 7229 return NULL; 7230 x = PyObject_GetItem(mapping, w); 7231 Py_DECREF(w); 7232 if (x == NULL) { 7233 if (PyErr_ExceptionMatches(PyExc_LookupError)) { 7234 /* No mapping found means: mapping is undefined. */ 7235 PyErr_Clear(); 7236 x = Py_None; 7237 Py_INCREF(x); 7238 return x; 7239 } else 7240 return NULL; 7241 } 7242 else if (x == Py_None) 7243 return x; 7244 else if (PyLong_Check(x)) { 7245 long value = PyLong_AS_LONG(x); 7246 if (value < 0 || value > 255) { 7247 PyErr_SetString(PyExc_TypeError, 7248 "character mapping must be in range(256)"); 7249 Py_DECREF(x); 7250 return NULL; 7251 } 7252 return x; 7253 } 7254 else if (PyBytes_Check(x)) 7255 return x; 7256 else { 7257 /* wrong return value */ 7258 PyErr_Format(PyExc_TypeError, 7259 "character mapping must return integer, bytes or None, not %.400s", 7260 x->ob_type->tp_name); 7261 Py_DECREF(x); 7262 return NULL; 7263 } 7264} 7265 7266static int 7267charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize) 7268{ 7269 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj); 7270 /* exponentially overallocate to minimize reallocations */ 7271 if (requiredsize < 2*outsize) 7272 requiredsize = 2*outsize; 7273 if (_PyBytes_Resize(outobj, requiredsize)) 7274 return -1; 7275 return 0; 7276} 7277 7278typedef enum charmapencode_result { 7279 enc_SUCCESS, enc_FAILED, enc_EXCEPTION 7280} charmapencode_result; 7281/* lookup the character, put the result in the output string and adjust 7282 various state variables. Resize the output bytes object if not enough 7283 space is available. Return a new reference to the object that 7284 was put in the output buffer, or Py_None, if the mapping was undefined 7285 (in which case no character was written) or NULL, if a 7286 reallocation error occurred. 
The caller must decref the result */ 7287static charmapencode_result 7288charmapencode_output(Py_UNICODE c, PyObject *mapping, 7289 PyObject **outobj, Py_ssize_t *outpos) 7290{ 7291 PyObject *rep; 7292 char *outstart; 7293 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj); 7294 7295 if (Py_TYPE(mapping) == &EncodingMapType) { 7296 int res = encoding_map_lookup(c, mapping); 7297 Py_ssize_t requiredsize = *outpos+1; 7298 if (res == -1) 7299 return enc_FAILED; 7300 if (outsize<requiredsize) 7301 if (charmapencode_resize(outobj, outpos, requiredsize)) 7302 return enc_EXCEPTION; 7303 outstart = PyBytes_AS_STRING(*outobj); 7304 outstart[(*outpos)++] = (char)res; 7305 return enc_SUCCESS; 7306 } 7307 7308 rep = charmapencode_lookup(c, mapping); 7309 if (rep==NULL) 7310 return enc_EXCEPTION; 7311 else if (rep==Py_None) { 7312 Py_DECREF(rep); 7313 return enc_FAILED; 7314 } else { 7315 if (PyLong_Check(rep)) { 7316 Py_ssize_t requiredsize = *outpos+1; 7317 if (outsize<requiredsize) 7318 if (charmapencode_resize(outobj, outpos, requiredsize)) { 7319 Py_DECREF(rep); 7320 return enc_EXCEPTION; 7321 } 7322 outstart = PyBytes_AS_STRING(*outobj); 7323 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep); 7324 } 7325 else { 7326 const char *repchars = PyBytes_AS_STRING(rep); 7327 Py_ssize_t repsize = PyBytes_GET_SIZE(rep); 7328 Py_ssize_t requiredsize = *outpos+repsize; 7329 if (outsize<requiredsize) 7330 if (charmapencode_resize(outobj, outpos, requiredsize)) { 7331 Py_DECREF(rep); 7332 return enc_EXCEPTION; 7333 } 7334 outstart = PyBytes_AS_STRING(*outobj); 7335 memcpy(outstart + *outpos, repchars, repsize); 7336 *outpos += repsize; 7337 } 7338 } 7339 Py_DECREF(rep); 7340 return enc_SUCCESS; 7341} 7342 7343/* handle an error in PyUnicode_EncodeCharmap 7344 Return 0 on success, -1 on error */ 7345static int 7346charmap_encoding_error( 7347 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping, 7348 PyObject **exceptionObject, 7349 int *known_errorHandler, PyObject 
**errorHandler, const char *errors, 7350 PyObject **res, Py_ssize_t *respos) 7351{ 7352 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */ 7353 Py_ssize_t repsize; 7354 Py_ssize_t newpos; 7355 Py_UNICODE *uni2; 7356 /* startpos for collecting unencodable chars */ 7357 Py_ssize_t collstartpos = *inpos; 7358 Py_ssize_t collendpos = *inpos+1; 7359 Py_ssize_t collpos; 7360 char *encoding = "charmap"; 7361 char *reason = "character maps to <undefined>"; 7362 charmapencode_result x; 7363 7364 /* find all unencodable characters */ 7365 while (collendpos < size) { 7366 PyObject *rep; 7367 if (Py_TYPE(mapping) == &EncodingMapType) { 7368 int res = encoding_map_lookup(p[collendpos], mapping); 7369 if (res != -1) 7370 break; 7371 ++collendpos; 7372 continue; 7373 } 7374 7375 rep = charmapencode_lookup(p[collendpos], mapping); 7376 if (rep==NULL) 7377 return -1; 7378 else if (rep!=Py_None) { 7379 Py_DECREF(rep); 7380 break; 7381 } 7382 Py_DECREF(rep); 7383 ++collendpos; 7384 } 7385 /* cache callback name lookup 7386 * (if not done yet, i.e. 
it's the first error) */ 7387 if (*known_errorHandler==-1) { 7388 if ((errors==NULL) || (!strcmp(errors, "strict"))) 7389 *known_errorHandler = 1; 7390 else if (!strcmp(errors, "replace")) 7391 *known_errorHandler = 2; 7392 else if (!strcmp(errors, "ignore")) 7393 *known_errorHandler = 3; 7394 else if (!strcmp(errors, "xmlcharrefreplace")) 7395 *known_errorHandler = 4; 7396 else 7397 *known_errorHandler = 0; 7398 } 7399 switch (*known_errorHandler) { 7400 case 1: /* strict */ 7401 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason); 7402 return -1; 7403 case 2: /* replace */ 7404 for (collpos = collstartpos; collpos<collendpos; ++collpos) { 7405 x = charmapencode_output('?', mapping, res, respos); 7406 if (x==enc_EXCEPTION) { 7407 return -1; 7408 } 7409 else if (x==enc_FAILED) { 7410 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason); 7411 return -1; 7412 } 7413 } 7414 /* fall through */ 7415 case 3: /* ignore */ 7416 *inpos = collendpos; 7417 break; 7418 case 4: /* xmlcharrefreplace */ 7419 /* generate replacement (temporarily (mis)uses p) */ 7420 for (collpos = collstartpos; collpos < collendpos; ++collpos) { 7421 char buffer[2+29+1+1]; 7422 char *cp; 7423 sprintf(buffer, "&#%d;", (int)p[collpos]); 7424 for (cp = buffer; *cp; ++cp) { 7425 x = charmapencode_output(*cp, mapping, res, respos); 7426 if (x==enc_EXCEPTION) 7427 return -1; 7428 else if (x==enc_FAILED) { 7429 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason); 7430 return -1; 7431 } 7432 } 7433 } 7434 *inpos = collendpos; 7435 break; 7436 default: 7437 repunicode = unicode_encode_call_errorhandler(errors, errorHandler, 7438 encoding, reason, p, size, exceptionObject, 7439 collstartpos, collendpos, &newpos); 7440 if (repunicode == NULL) 7441 return -1; 7442 if (PyBytes_Check(repunicode)) { 7443 /* Directly copy bytes result to output. 
*/ 7444 Py_ssize_t outsize = PyBytes_Size(*res); 7445 Py_ssize_t requiredsize; 7446 repsize = PyBytes_Size(repunicode); 7447 requiredsize = *respos + repsize; 7448 if (requiredsize > outsize) 7449 /* Make room for all additional bytes. */ 7450 if (charmapencode_resize(res, respos, requiredsize)) { 7451 Py_DECREF(repunicode); 7452 return -1; 7453 } 7454 memcpy(PyBytes_AsString(*res) + *respos, 7455 PyBytes_AsString(repunicode), repsize); 7456 *respos += repsize; 7457 *inpos = newpos; 7458 Py_DECREF(repunicode); 7459 break; 7460 } 7461 /* generate replacement */ 7462 repsize = PyUnicode_GET_SIZE(repunicode); 7463 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) { 7464 x = charmapencode_output(*uni2, mapping, res, respos); 7465 if (x==enc_EXCEPTION) { 7466 return -1; 7467 } 7468 else if (x==enc_FAILED) { 7469 Py_DECREF(repunicode); 7470 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason); 7471 return -1; 7472 } 7473 } 7474 *inpos = newpos; 7475 Py_DECREF(repunicode); 7476 } 7477 return 0; 7478} 7479 7480PyObject * 7481PyUnicode_EncodeCharmap(const Py_UNICODE *p, 7482 Py_ssize_t size, 7483 PyObject *mapping, 7484 const char *errors) 7485{ 7486 /* output object */ 7487 PyObject *res = NULL; 7488 /* current input position */ 7489 Py_ssize_t inpos = 0; 7490 /* current output position */ 7491 Py_ssize_t respos = 0; 7492 PyObject *errorHandler = NULL; 7493 PyObject *exc = NULL; 7494 /* the following variable is used for caching string comparisons 7495 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 7496 * 3=ignore, 4=xmlcharrefreplace */ 7497 int known_errorHandler = -1; 7498 7499 /* Default to Latin-1 */ 7500 if (mapping == NULL) 7501 return PyUnicode_EncodeLatin1(p, size, errors); 7502 7503 /* allocate enough for a simple encoding without 7504 replacements, if we need more, we'll resize */ 7505 res = PyBytes_FromStringAndSize(NULL, size); 7506 if (res == NULL) 7507 goto onError; 7508 if (size == 0) 7509 
return res; 7510 7511 while (inpos<size) { 7512 /* try to encode it */ 7513 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos); 7514 if (x==enc_EXCEPTION) /* error */ 7515 goto onError; 7516 if (x==enc_FAILED) { /* unencodable character */ 7517 if (charmap_encoding_error(p, size, &inpos, mapping, 7518 &exc, 7519 &known_errorHandler, &errorHandler, errors, 7520 &res, &respos)) { 7521 goto onError; 7522 } 7523 } 7524 else 7525 /* done with this character => adjust input position */ 7526 ++inpos; 7527 } 7528 7529 /* Resize if we allocated to much */ 7530 if (respos<PyBytes_GET_SIZE(res)) 7531 if (_PyBytes_Resize(&res, respos) < 0) 7532 goto onError; 7533 7534 Py_XDECREF(exc); 7535 Py_XDECREF(errorHandler); 7536 return res; 7537 7538 onError: 7539 Py_XDECREF(res); 7540 Py_XDECREF(exc); 7541 Py_XDECREF(errorHandler); 7542 return NULL; 7543} 7544 7545PyObject * 7546PyUnicode_AsCharmapString(PyObject *unicode, 7547 PyObject *mapping) 7548{ 7549 if (!PyUnicode_Check(unicode) || mapping == NULL) { 7550 PyErr_BadArgument(); 7551 return NULL; 7552 } 7553 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode), 7554 PyUnicode_GET_SIZE(unicode), 7555 mapping, 7556 NULL); 7557} 7558 7559/* create or adjust a UnicodeTranslateError */ 7560static void 7561make_translate_exception(PyObject **exceptionObject, 7562 PyObject *unicode, 7563 Py_ssize_t startpos, Py_ssize_t endpos, 7564 const char *reason) 7565{ 7566 if (*exceptionObject == NULL) { 7567 *exceptionObject = _PyUnicodeTranslateError_Create( 7568 unicode, startpos, endpos, reason); 7569 } 7570 else { 7571 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos)) 7572 goto onError; 7573 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos)) 7574 goto onError; 7575 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason)) 7576 goto onError; 7577 return; 7578 onError: 7579 Py_DECREF(*exceptionObject); 7580 *exceptionObject = NULL; 7581 } 7582} 7583 7584/* raises a 
UnicodeTranslateError */ 7585static void 7586raise_translate_exception(PyObject **exceptionObject, 7587 PyObject *unicode, 7588 Py_ssize_t startpos, Py_ssize_t endpos, 7589 const char *reason) 7590{ 7591 make_translate_exception(exceptionObject, 7592 unicode, startpos, endpos, reason); 7593 if (*exceptionObject != NULL) 7594 PyCodec_StrictErrors(*exceptionObject); 7595} 7596 7597/* error handling callback helper: 7598 build arguments, call the callback and check the arguments, 7599 put the result into newpos and return the replacement string, which 7600 has to be freed by the caller */ 7601static PyObject * 7602unicode_translate_call_errorhandler(const char *errors, 7603 PyObject **errorHandler, 7604 const char *reason, 7605 PyObject *unicode, PyObject **exceptionObject, 7606 Py_ssize_t startpos, Py_ssize_t endpos, 7607 Py_ssize_t *newpos) 7608{ 7609 static char *argparse = "O!n;translating error handler must return (str, int) tuple"; 7610 7611 Py_ssize_t i_newpos; 7612 PyObject *restuple; 7613 PyObject *resunicode; 7614 7615 if (*errorHandler == NULL) { 7616 *errorHandler = PyCodec_LookupError(errors); 7617 if (*errorHandler == NULL) 7618 return NULL; 7619 } 7620 7621 make_translate_exception(exceptionObject, 7622 unicode, startpos, endpos, reason); 7623 if (*exceptionObject == NULL) 7624 return NULL; 7625 7626 restuple = PyObject_CallFunctionObjArgs( 7627 *errorHandler, *exceptionObject, NULL); 7628 if (restuple == NULL) 7629 return NULL; 7630 if (!PyTuple_Check(restuple)) { 7631 PyErr_SetString(PyExc_TypeError, &argparse[4]); 7632 Py_DECREF(restuple); 7633 return NULL; 7634 } 7635 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, 7636 &resunicode, &i_newpos)) { 7637 Py_DECREF(restuple); 7638 return NULL; 7639 } 7640 if (i_newpos<0) 7641 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos; 7642 else 7643 *newpos = i_newpos; 7644 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) { 7645 PyErr_Format(PyExc_IndexError, "position %zd from error handler out 
of bounds", *newpos); 7646 Py_DECREF(restuple); 7647 return NULL; 7648 } 7649 Py_INCREF(resunicode); 7650 Py_DECREF(restuple); 7651 return resunicode; 7652} 7653 7654/* Lookup the character ch in the mapping and put the result in result, 7655 which must be decrefed by the caller. 7656 Return 0 on success, -1 on error */ 7657static int 7658charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result) 7659{ 7660 PyObject *w = PyLong_FromLong((long)c); 7661 PyObject *x; 7662 7663 if (w == NULL) 7664 return -1; 7665 x = PyObject_GetItem(mapping, w); 7666 Py_DECREF(w); 7667 if (x == NULL) { 7668 if (PyErr_ExceptionMatches(PyExc_LookupError)) { 7669 /* No mapping found means: use 1:1 mapping. */ 7670 PyErr_Clear(); 7671 *result = NULL; 7672 return 0; 7673 } else 7674 return -1; 7675 } 7676 else if (x == Py_None) { 7677 *result = x; 7678 return 0; 7679 } 7680 else if (PyLong_Check(x)) { 7681 long value = PyLong_AS_LONG(x); 7682 long max = PyUnicode_GetMax(); 7683 if (value < 0 || value > max) { 7684 PyErr_Format(PyExc_TypeError, 7685 "character mapping must be in range(0x%x)", max+1); 7686 Py_DECREF(x); 7687 return -1; 7688 } 7689 *result = x; 7690 return 0; 7691 } 7692 else if (PyUnicode_Check(x)) { 7693 *result = x; 7694 return 0; 7695 } 7696 else { 7697 /* wrong return value */ 7698 PyErr_SetString(PyExc_TypeError, 7699 "character mapping must return integer, None or str"); 7700 Py_DECREF(x); 7701 return -1; 7702 } 7703} 7704/* ensure that *outobj is at least requiredsize characters long, 7705 if not reallocate and adjust various state variables. 
7706 Return 0 on success, -1 on error */ 7707static int 7708charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize, 7709 Py_ssize_t requiredsize) 7710{ 7711 Py_ssize_t oldsize = *psize; 7712 if (requiredsize > oldsize) { 7713 /* exponentially overallocate to minimize reallocations */ 7714 if (requiredsize < 2 * oldsize) 7715 requiredsize = 2 * oldsize; 7716 *outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4)); 7717 if (*outobj == 0) 7718 return -1; 7719 *psize = requiredsize; 7720 } 7721 return 0; 7722} 7723/* lookup the character, put the result in the output string and adjust 7724 various state variables. Return a new reference to the object that 7725 was put in the output buffer in *result, or Py_None, if the mapping was 7726 undefined (in which case no character was written). 7727 The called must decref result. 7728 Return 0 on success, -1 on error. */ 7729static int 7730charmaptranslate_output(PyObject *input, Py_ssize_t ipos, 7731 PyObject *mapping, Py_UCS4 **output, 7732 Py_ssize_t *osize, Py_ssize_t *opos, 7733 PyObject **res) 7734{ 7735 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos); 7736 if (charmaptranslate_lookup(curinp, mapping, res)) 7737 return -1; 7738 if (*res==NULL) { 7739 /* not found => default to 1:1 mapping */ 7740 (*output)[(*opos)++] = curinp; 7741 } 7742 else if (*res==Py_None) 7743 ; 7744 else if (PyLong_Check(*res)) { 7745 /* no overflow check, because we know that the space is enough */ 7746 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res); 7747 } 7748 else if (PyUnicode_Check(*res)) { 7749 Py_ssize_t repsize; 7750 if (PyUnicode_READY(*res) == -1) 7751 return -1; 7752 repsize = PyUnicode_GET_LENGTH(*res); 7753 if (repsize==1) { 7754 /* no overflow check, because we know that the space is enough */ 7755 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0); 7756 } 7757 else if (repsize!=0) { 7758 /* more than one character */ 7759 Py_ssize_t requiredsize = *opos + 7760 (PyUnicode_GET_LENGTH(input) - ipos) + 7761 
repsize - 1; 7762 Py_ssize_t i; 7763 if (charmaptranslate_makespace(output, osize, requiredsize)) 7764 return -1; 7765 for(i = 0; i < repsize; i++) 7766 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i); 7767 } 7768 } 7769 else 7770 return -1; 7771 return 0; 7772} 7773 7774PyObject * 7775_PyUnicode_TranslateCharmap(PyObject *input, 7776 PyObject *mapping, 7777 const char *errors) 7778{ 7779 /* input object */ 7780 char *idata; 7781 Py_ssize_t size, i; 7782 int kind; 7783 /* output buffer */ 7784 Py_UCS4 *output = NULL; 7785 Py_ssize_t osize; 7786 PyObject *res; 7787 /* current output position */ 7788 Py_ssize_t opos; 7789 char *reason = "character maps to <undefined>"; 7790 PyObject *errorHandler = NULL; 7791 PyObject *exc = NULL; 7792 /* the following variable is used for caching string comparisons 7793 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 7794 * 3=ignore, 4=xmlcharrefreplace */ 7795 int known_errorHandler = -1; 7796 7797 if (mapping == NULL) { 7798 PyErr_BadArgument(); 7799 return NULL; 7800 } 7801 7802 if (PyUnicode_READY(input) == -1) 7803 return NULL; 7804 idata = (char*)PyUnicode_DATA(input); 7805 kind = PyUnicode_KIND(input); 7806 size = PyUnicode_GET_LENGTH(input); 7807 i = 0; 7808 7809 if (size == 0) { 7810 Py_INCREF(input); 7811 return input; 7812 } 7813 7814 /* allocate enough for a simple 1:1 translation without 7815 replacements, if we need more, we'll resize */ 7816 osize = size; 7817 output = PyMem_Malloc(osize * sizeof(Py_UCS4)); 7818 opos = 0; 7819 if (output == NULL) { 7820 PyErr_NoMemory(); 7821 goto onError; 7822 } 7823 7824 while (i<size) { 7825 /* try to encode it */ 7826 PyObject *x = NULL; 7827 if (charmaptranslate_output(input, i, mapping, 7828 &output, &osize, &opos, &x)) { 7829 Py_XDECREF(x); 7830 goto onError; 7831 } 7832 Py_XDECREF(x); 7833 if (x!=Py_None) /* it worked => adjust input pointer */ 7834 ++i; 7835 else { /* untranslatable character */ 7836 PyObject *repunicode = NULL; /* initialize to prevent gcc warning 
*/ 7837 Py_ssize_t repsize; 7838 Py_ssize_t newpos; 7839 Py_ssize_t uni2; 7840 /* startpos for collecting untranslatable chars */ 7841 Py_ssize_t collstart = i; 7842 Py_ssize_t collend = i+1; 7843 Py_ssize_t coll; 7844 7845 /* find all untranslatable characters */ 7846 while (collend < size) { 7847 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x)) 7848 goto onError; 7849 Py_XDECREF(x); 7850 if (x!=Py_None) 7851 break; 7852 ++collend; 7853 } 7854 /* cache callback name lookup 7855 * (if not done yet, i.e. it's the first error) */ 7856 if (known_errorHandler==-1) { 7857 if ((errors==NULL) || (!strcmp(errors, "strict"))) 7858 known_errorHandler = 1; 7859 else if (!strcmp(errors, "replace")) 7860 known_errorHandler = 2; 7861 else if (!strcmp(errors, "ignore")) 7862 known_errorHandler = 3; 7863 else if (!strcmp(errors, "xmlcharrefreplace")) 7864 known_errorHandler = 4; 7865 else 7866 known_errorHandler = 0; 7867 } 7868 switch (known_errorHandler) { 7869 case 1: /* strict */ 7870 raise_translate_exception(&exc, input, collstart, 7871 collend, reason); 7872 goto onError; 7873 case 2: /* replace */ 7874 /* No need to check for space, this is a 1:1 replacement */ 7875 for (coll = collstart; coll<collend; coll++) 7876 output[opos++] = '?'; 7877 /* fall through */ 7878 case 3: /* ignore */ 7879 i = collend; 7880 break; 7881 case 4: /* xmlcharrefreplace */ 7882 /* generate replacement (temporarily (mis)uses i) */ 7883 for (i = collstart; i < collend; ++i) { 7884 char buffer[2+29+1+1]; 7885 char *cp; 7886 sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i)); 7887 if (charmaptranslate_makespace(&output, &osize, 7888 opos+strlen(buffer)+(size-collend))) 7889 goto onError; 7890 for (cp = buffer; *cp; ++cp) 7891 output[opos++] = *cp; 7892 } 7893 i = collend; 7894 break; 7895 default: 7896 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler, 7897 reason, input, &exc, 7898 collstart, collend, &newpos); 7899 if (repunicode == NULL || 
_PyUnicode_READY_REPLACE(&repunicode)) 7900 goto onError; 7901 /* generate replacement */ 7902 repsize = PyUnicode_GET_LENGTH(repunicode); 7903 if (charmaptranslate_makespace(&output, &osize, 7904 opos+repsize+(size-collend))) { 7905 Py_DECREF(repunicode); 7906 goto onError; 7907 } 7908 for (uni2 = 0; repsize-->0; ++uni2) 7909 output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2); 7910 i = newpos; 7911 Py_DECREF(repunicode); 7912 } 7913 } 7914 } 7915 res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos); 7916 if (!res) 7917 goto onError; 7918 PyMem_Free(output); 7919 Py_XDECREF(exc); 7920 Py_XDECREF(errorHandler); 7921 return res; 7922 7923 onError: 7924 PyMem_Free(output); 7925 Py_XDECREF(exc); 7926 Py_XDECREF(errorHandler); 7927 return NULL; 7928} 7929 7930/* Deprecated. Use PyUnicode_Translate instead. */ 7931PyObject * 7932PyUnicode_TranslateCharmap(const Py_UNICODE *p, 7933 Py_ssize_t size, 7934 PyObject *mapping, 7935 const char *errors) 7936{ 7937 PyObject *unicode = PyUnicode_FromUnicode(p, size); 7938 if (!unicode) 7939 return NULL; 7940 return _PyUnicode_TranslateCharmap(unicode, mapping, errors); 7941} 7942 7943PyObject * 7944PyUnicode_Translate(PyObject *str, 7945 PyObject *mapping, 7946 const char *errors) 7947{ 7948 PyObject *result; 7949 7950 str = PyUnicode_FromObject(str); 7951 if (str == NULL) 7952 goto onError; 7953 result = _PyUnicode_TranslateCharmap(str, mapping, errors); 7954 Py_DECREF(str); 7955 return result; 7956 7957 onError: 7958 Py_XDECREF(str); 7959 return NULL; 7960} 7961 7962static Py_UCS4 7963fix_decimal_and_space_to_ascii(PyUnicodeObject *self) 7964{ 7965 /* No need to call PyUnicode_READY(self) because this function is only 7966 called as a callback from fixup() which does it already. 
*/ 7967 const Py_ssize_t len = PyUnicode_GET_LENGTH(self); 7968 const int kind = PyUnicode_KIND(self); 7969 void *data = PyUnicode_DATA(self); 7970 Py_UCS4 maxchar = 0, ch, fixed; 7971 Py_ssize_t i; 7972 7973 for (i = 0; i < len; ++i) { 7974 ch = PyUnicode_READ(kind, data, i); 7975 fixed = 0; 7976 if (ch > 127) { 7977 if (Py_UNICODE_ISSPACE(ch)) 7978 fixed = ' '; 7979 else { 7980 const int decimal = Py_UNICODE_TODECIMAL(ch); 7981 if (decimal >= 0) 7982 fixed = '0' + decimal; 7983 } 7984 if (fixed != 0) { 7985 if (fixed > maxchar) 7986 maxchar = fixed; 7987 PyUnicode_WRITE(kind, data, i, fixed); 7988 } 7989 else if (ch > maxchar) 7990 maxchar = ch; 7991 } 7992 else if (ch > maxchar) 7993 maxchar = ch; 7994 } 7995 7996 return maxchar; 7997} 7998 7999PyObject * 8000_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode) 8001{ 8002 if (!PyUnicode_Check(unicode)) { 8003 PyErr_BadInternalCall(); 8004 return NULL; 8005 } 8006 if (PyUnicode_READY(unicode) == -1) 8007 return NULL; 8008 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) { 8009 /* If the string is already ASCII, just return the same string */ 8010 Py_INCREF(unicode); 8011 return unicode; 8012 } 8013 return fixup((PyUnicodeObject *)unicode, fix_decimal_and_space_to_ascii); 8014} 8015 8016PyObject * 8017PyUnicode_TransformDecimalToASCII(Py_UNICODE *s, 8018 Py_ssize_t length) 8019{ 8020 PyObject *result; 8021 Py_UNICODE *p; /* write pointer into result */ 8022 Py_ssize_t i; 8023 /* Copy to a new string */ 8024 result = (PyObject *)_PyUnicode_New(length); 8025 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(result), s, length); 8026 if (result == NULL) 8027 return result; 8028 p = PyUnicode_AS_UNICODE(result); 8029 /* Iterate over code points */ 8030 for (i = 0; i < length; i++) { 8031 Py_UNICODE ch =s[i]; 8032 if (ch > 127) { 8033 int decimal = Py_UNICODE_TODECIMAL(ch); 8034 if (decimal >= 0) 8035 p[i] = '0' + decimal; 8036 } 8037 } 8038 if (PyUnicode_READY((PyUnicodeObject*)result) == -1) { 8039 Py_DECREF(result); 8040 
return NULL; 8041 } 8042 return result; 8043} 8044/* --- Decimal Encoder ---------------------------------------------------- */ 8045 8046int 8047PyUnicode_EncodeDecimal(Py_UNICODE *s, 8048 Py_ssize_t length, 8049 char *output, 8050 const char *errors) 8051{ 8052 Py_UNICODE *p, *end; 8053 PyObject *errorHandler = NULL; 8054 PyObject *exc = NULL; 8055 const char *encoding = "decimal"; 8056 const char *reason = "invalid decimal Unicode string"; 8057 /* the following variable is used for caching string comparisons 8058 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */ 8059 int known_errorHandler = -1; 8060 8061 if (output == NULL) { 8062 PyErr_BadArgument(); 8063 return -1; 8064 } 8065 8066 p = s; 8067 end = s + length; 8068 while (p < end) { 8069 register Py_UNICODE ch = *p; 8070 int decimal; 8071 PyObject *repunicode; 8072 Py_ssize_t repsize; 8073 Py_ssize_t newpos; 8074 Py_UNICODE *uni2; 8075 Py_UNICODE *collstart; 8076 Py_UNICODE *collend; 8077 8078 if (Py_UNICODE_ISSPACE(ch)) { 8079 *output++ = ' '; 8080 ++p; 8081 continue; 8082 } 8083 decimal = Py_UNICODE_TODECIMAL(ch); 8084 if (decimal >= 0) { 8085 *output++ = '0' + decimal; 8086 ++p; 8087 continue; 8088 } 8089 if (0 < ch && ch < 256) { 8090 *output++ = (char)ch; 8091 ++p; 8092 continue; 8093 } 8094 /* All other characters are considered unencodable */ 8095 collstart = p; 8096 collend = p+1; 8097 while (collend < end) { 8098 if ((0 < *collend && *collend < 256) || 8099 !Py_UNICODE_ISSPACE(*collend) || 8100 Py_UNICODE_TODECIMAL(*collend)) 8101 break; 8102 } 8103 /* cache callback name lookup 8104 * (if not done yet, i.e. 
it's the first error) */ 8105 if (known_errorHandler==-1) { 8106 if ((errors==NULL) || (!strcmp(errors, "strict"))) 8107 known_errorHandler = 1; 8108 else if (!strcmp(errors, "replace")) 8109 known_errorHandler = 2; 8110 else if (!strcmp(errors, "ignore")) 8111 known_errorHandler = 3; 8112 else if (!strcmp(errors, "xmlcharrefreplace")) 8113 known_errorHandler = 4; 8114 else 8115 known_errorHandler = 0; 8116 } 8117 switch (known_errorHandler) { 8118 case 1: /* strict */ 8119 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason); 8120 goto onError; 8121 case 2: /* replace */ 8122 for (p = collstart; p < collend; ++p) 8123 *output++ = '?'; 8124 /* fall through */ 8125 case 3: /* ignore */ 8126 p = collend; 8127 break; 8128 case 4: /* xmlcharrefreplace */ 8129 /* generate replacement (temporarily (mis)uses p) */ 8130 for (p = collstart; p < collend; ++p) 8131 output += sprintf(output, "&#%d;", (int)*p); 8132 p = collend; 8133 break; 8134 default: 8135 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler, 8136 encoding, reason, s, length, &exc, 8137 collstart-s, collend-s, &newpos); 8138 if (repunicode == NULL) 8139 goto onError; 8140 if (!PyUnicode_Check(repunicode)) { 8141 /* Byte results not supported, since they have no decimal property. 
*/ 8142 PyErr_SetString(PyExc_TypeError, "error handler should return unicode"); 8143 Py_DECREF(repunicode); 8144 goto onError; 8145 } 8146 /* generate replacement */ 8147 repsize = PyUnicode_GET_SIZE(repunicode); 8148 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) { 8149 Py_UNICODE ch = *uni2; 8150 if (Py_UNICODE_ISSPACE(ch)) 8151 *output++ = ' '; 8152 else { 8153 decimal = Py_UNICODE_TODECIMAL(ch); 8154 if (decimal >= 0) 8155 *output++ = '0' + decimal; 8156 else if (0 < ch && ch < 256) 8157 *output++ = (char)ch; 8158 else { 8159 Py_DECREF(repunicode); 8160 raise_encode_exception(&exc, encoding, 8161 s, length, collstart-s, collend-s, reason); 8162 goto onError; 8163 } 8164 } 8165 } 8166 p = s + newpos; 8167 Py_DECREF(repunicode); 8168 } 8169 } 8170 /* 0-terminate the output string */ 8171 *output++ = '\0'; 8172 Py_XDECREF(exc); 8173 Py_XDECREF(errorHandler); 8174 return 0; 8175 8176 onError: 8177 Py_XDECREF(exc); 8178 Py_XDECREF(errorHandler); 8179 return -1; 8180} 8181 8182/* --- Helpers ------------------------------------------------------------ */ 8183 8184#include "stringlib/ucs1lib.h" 8185#include "stringlib/fastsearch.h" 8186#include "stringlib/partition.h" 8187#include "stringlib/split.h" 8188#include "stringlib/count.h" 8189#include "stringlib/find.h" 8190#include "stringlib/localeutil.h" 8191#include "stringlib/undef.h" 8192 8193#include "stringlib/ucs2lib.h" 8194#include "stringlib/fastsearch.h" 8195#include "stringlib/partition.h" 8196#include "stringlib/split.h" 8197#include "stringlib/count.h" 8198#include "stringlib/find.h" 8199#include "stringlib/localeutil.h" 8200#include "stringlib/undef.h" 8201 8202#include "stringlib/ucs4lib.h" 8203#include "stringlib/fastsearch.h" 8204#include "stringlib/partition.h" 8205#include "stringlib/split.h" 8206#include "stringlib/count.h" 8207#include "stringlib/find.h" 8208#include "stringlib/localeutil.h" 8209#include "stringlib/undef.h" 8210 8211static Py_ssize_t 8212any_find_slice(Py_ssize_t 
Py_LOCAL_CALLBACK(ucs1)(const Py_UCS1*, Py_ssize_t,
                                              const Py_UCS1*, Py_ssize_t,
                                              Py_ssize_t, Py_ssize_t),
               Py_ssize_t Py_LOCAL_CALLBACK(ucs2)(const Py_UCS2*, Py_ssize_t,
                                              const Py_UCS2*, Py_ssize_t,
                                              Py_ssize_t, Py_ssize_t),
               Py_ssize_t Py_LOCAL_CALLBACK(ucs4)(const Py_UCS4*, Py_ssize_t,
                                              const Py_UCS4*, Py_ssize_t,
                                              Py_ssize_t, Py_ssize_t),
               PyObject* s1, PyObject* s2,
               Py_ssize_t start,
               Py_ssize_t end)
{
    /* Dispatch a slice search to the callback matching the wider of the two
       string kinds.  The narrower string is first converted to that kind in
       a temporary buffer, which is released before returning.  Returns the
       callback's result, or -2 when a conversion fails. */
    const int kind1 = PyUnicode_KIND(s1);
    const int kind2 = PyUnicode_KIND(s2);
    const int kind = (kind1 > kind2) ? kind1 : kind2;
    const int widened1 = (kind1 != kind);
    const int widened2 = (kind2 != kind);
    void *buf1, *buf2;
    Py_ssize_t len1, len2, result;

    buf1 = widened1 ? _PyUnicode_AsKind(s1, kind) : PyUnicode_DATA(s1);
    if (!buf1)
        return -2;
    buf2 = widened2 ? _PyUnicode_AsKind(s2, kind) : PyUnicode_DATA(s2);
    if (!buf2) {
        if (widened1)
            PyMem_Free(buf1);
        return -2;
    }
    len1 = PyUnicode_GET_LENGTH(s1);
    len2 = PyUnicode_GET_LENGTH(s2);

    switch (kind) {
    case PyUnicode_1BYTE_KIND:
        result = ucs1(buf1, len1, buf2, len2, start, end);
        break;
    case PyUnicode_2BYTE_KIND:
        result = ucs2(buf1, len1, buf2, len2, start, end);
        break;
    case PyUnicode_4BYTE_KIND:
        result = ucs4(buf1, len1, buf2, len2, start, end);
        break;
    default:
        assert(0);
        result = -2;
    }

    /* Only buffers produced by the conversion are owned here. */
    if (widened1)
        PyMem_Free(buf1);
    if (widened2)
        PyMem_Free(buf2);

    return result;
}

Py_ssize_t
_PyUnicode_InsertThousandsGrouping(int kind, void *data,
                                   Py_ssize_t n_buffer,
                                   void *digits, Py_ssize_t n_digits,
                                   Py_ssize_t min_width,
                                   const char *grouping,
                                   const char *thousands_sep)
{
    switch(kind) {
    case PyUnicode_1BYTE_KIND:
        return _PyUnicode_ucs1_InsertThousandsGrouping(
            (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
            min_width, grouping, thousands_sep);
8282 case PyUnicode_2BYTE_KIND: 8283 return _PyUnicode_ucs2_InsertThousandsGrouping( 8284 (Py_UCS2*)data, n_buffer, (Py_UCS2*)digits, n_digits, 8285 min_width, grouping, thousands_sep); 8286 case PyUnicode_4BYTE_KIND: 8287 return _PyUnicode_ucs4_InsertThousandsGrouping( 8288 (Py_UCS4*)data, n_buffer, (Py_UCS4*)digits, n_digits, 8289 min_width, grouping, thousands_sep); 8290 } 8291 assert(0); 8292 return -1; 8293} 8294 8295 8296#include "stringlib/unicodedefs.h" 8297#include "stringlib/fastsearch.h" 8298 8299#include "stringlib/count.h" 8300#include "stringlib/find.h" 8301 8302/* helper macro to fixup start/end slice values */ 8303#define ADJUST_INDICES(start, end, len) \ 8304 if (end > len) \ 8305 end = len; \ 8306 else if (end < 0) { \ 8307 end += len; \ 8308 if (end < 0) \ 8309 end = 0; \ 8310 } \ 8311 if (start < 0) { \ 8312 start += len; \ 8313 if (start < 0) \ 8314 start = 0; \ 8315 } 8316 8317Py_ssize_t 8318PyUnicode_Count(PyObject *str, 8319 PyObject *substr, 8320 Py_ssize_t start, 8321 Py_ssize_t end) 8322{ 8323 Py_ssize_t result; 8324 PyUnicodeObject* str_obj; 8325 PyUnicodeObject* sub_obj; 8326 int kind1, kind2, kind; 8327 void *buf1 = NULL, *buf2 = NULL; 8328 Py_ssize_t len1, len2; 8329 8330 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str); 8331 if (!str_obj || PyUnicode_READY(str_obj) == -1) 8332 return -1; 8333 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr); 8334 if (!sub_obj || PyUnicode_READY(sub_obj) == -1) { 8335 Py_DECREF(str_obj); 8336 return -1; 8337 } 8338 8339 kind1 = PyUnicode_KIND(str_obj); 8340 kind2 = PyUnicode_KIND(sub_obj); 8341 kind = kind1 > kind2 ? 
kind1 : kind2; 8342 buf1 = PyUnicode_DATA(str_obj); 8343 if (kind1 != kind) 8344 buf1 = _PyUnicode_AsKind((PyObject*)str_obj, kind); 8345 if (!buf1) 8346 goto onError; 8347 buf2 = PyUnicode_DATA(sub_obj); 8348 if (kind2 != kind) 8349 buf2 = _PyUnicode_AsKind((PyObject*)sub_obj, kind); 8350 if (!buf2) 8351 goto onError; 8352 len1 = PyUnicode_GET_LENGTH(str_obj); 8353 len2 = PyUnicode_GET_LENGTH(sub_obj); 8354 8355 ADJUST_INDICES(start, end, len1); 8356 switch(kind) { 8357 case PyUnicode_1BYTE_KIND: 8358 result = ucs1lib_count( 8359 ((Py_UCS1*)buf1) + start, end - start, 8360 buf2, len2, PY_SSIZE_T_MAX 8361 ); 8362 break; 8363 case PyUnicode_2BYTE_KIND: 8364 result = ucs2lib_count( 8365 ((Py_UCS2*)buf1) + start, end - start, 8366 buf2, len2, PY_SSIZE_T_MAX 8367 ); 8368 break; 8369 case PyUnicode_4BYTE_KIND: 8370 result = ucs4lib_count( 8371 ((Py_UCS4*)buf1) + start, end - start, 8372 buf2, len2, PY_SSIZE_T_MAX 8373 ); 8374 break; 8375 default: 8376 assert(0); result = 0; 8377 } 8378 8379 Py_DECREF(sub_obj); 8380 Py_DECREF(str_obj); 8381 8382 if (kind1 != kind) 8383 PyMem_Free(buf1); 8384 if (kind2 != kind) 8385 PyMem_Free(buf2); 8386 8387 return result; 8388 onError: 8389 Py_DECREF(sub_obj); 8390 Py_DECREF(str_obj); 8391 if (kind1 != kind && buf1) 8392 PyMem_Free(buf1); 8393 if (kind2 != kind && buf2) 8394 PyMem_Free(buf2); 8395 return -1; 8396} 8397 8398Py_ssize_t 8399PyUnicode_Find(PyObject *str, 8400 PyObject *sub, 8401 Py_ssize_t start, 8402 Py_ssize_t end, 8403 int direction) 8404{ 8405 Py_ssize_t result; 8406 8407 str = PyUnicode_FromObject(str); 8408 if (!str || PyUnicode_READY(str) == -1) 8409 return -2; 8410 sub = PyUnicode_FromObject(sub); 8411 if (!sub || PyUnicode_READY(sub) == -1) { 8412 Py_DECREF(str); 8413 return -2; 8414 } 8415 8416 if (direction > 0) 8417 result = any_find_slice( 8418 ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice, 8419 str, sub, start, end 8420 ); 8421 else 8422 result = any_find_slice( 8423 ucs1lib_rfind_slice, 
ucs2lib_rfind_slice, ucs4lib_rfind_slice, 8424 str, sub, start, end 8425 ); 8426 8427 Py_DECREF(str); 8428 Py_DECREF(sub); 8429 8430 return result; 8431} 8432 8433Py_ssize_t 8434PyUnicode_FindChar(PyObject *str, Py_UCS4 ch, 8435 Py_ssize_t start, Py_ssize_t end, 8436 int direction) 8437{ 8438 char *result; 8439 int kind; 8440 if (PyUnicode_READY(str) == -1) 8441 return -2; 8442 if (start < 0 || end < 0) { 8443 PyErr_SetString(PyExc_IndexError, "string index out of range"); 8444 return -2; 8445 } 8446 if (end > PyUnicode_GET_LENGTH(str)) 8447 end = PyUnicode_GET_LENGTH(str); 8448 kind = PyUnicode_KIND(str); 8449 result = findchar(PyUnicode_1BYTE_DATA(str) 8450 + PyUnicode_KIND_SIZE(kind, start), 8451 kind, 8452 end-start, ch, direction); 8453 if (!result) 8454 return -1; 8455 return (result-(char*)PyUnicode_DATA(str)) >> (kind-1); 8456} 8457 8458static int 8459tailmatch(PyUnicodeObject *self, 8460 PyUnicodeObject *substring, 8461 Py_ssize_t start, 8462 Py_ssize_t end, 8463 int direction) 8464{ 8465 int kind_self; 8466 int kind_sub; 8467 void *data_self; 8468 void *data_sub; 8469 Py_ssize_t offset; 8470 Py_ssize_t i; 8471 Py_ssize_t end_sub; 8472 8473 if (PyUnicode_READY(self) == -1 || 8474 PyUnicode_READY(substring) == -1) 8475 return 0; 8476 8477 if (PyUnicode_GET_LENGTH(substring) == 0) 8478 return 1; 8479 8480 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self)); 8481 end -= PyUnicode_GET_LENGTH(substring); 8482 if (end < start) 8483 return 0; 8484 8485 kind_self = PyUnicode_KIND(self); 8486 data_self = PyUnicode_DATA(self); 8487 kind_sub = PyUnicode_KIND(substring); 8488 data_sub = PyUnicode_DATA(substring); 8489 end_sub = PyUnicode_GET_LENGTH(substring) - 1; 8490 8491 if (direction > 0) 8492 offset = end; 8493 else 8494 offset = start; 8495 8496 if (PyUnicode_READ(kind_self, data_self, offset) == 8497 PyUnicode_READ(kind_sub, data_sub, 0) && 8498 PyUnicode_READ(kind_self, data_self, offset + end_sub) == 8499 PyUnicode_READ(kind_sub, data_sub, end_sub)) { 8500 
/* If both are of the same kind, memcmp is sufficient */ 8501 if (kind_self == kind_sub) { 8502 return ! memcmp((char *)data_self + 8503 (offset * PyUnicode_CHARACTER_SIZE(substring)), 8504 data_sub, 8505 PyUnicode_GET_LENGTH(substring) * 8506 PyUnicode_CHARACTER_SIZE(substring)); 8507 } 8508 /* otherwise we have to compare each character by first accesing it */ 8509 else { 8510 /* We do not need to compare 0 and len(substring)-1 because 8511 the if statement above ensured already that they are equal 8512 when we end up here. */ 8513 // TODO: honor direction and do a forward or backwards search 8514 for (i = 1; i < end_sub; ++i) { 8515 if (PyUnicode_READ(kind_self, data_self, offset + i) != 8516 PyUnicode_READ(kind_sub, data_sub, i)) 8517 return 0; 8518 } 8519 return 1; 8520 } 8521 } 8522 8523 return 0; 8524} 8525 8526Py_ssize_t 8527PyUnicode_Tailmatch(PyObject *str, 8528 PyObject *substr, 8529 Py_ssize_t start, 8530 Py_ssize_t end, 8531 int direction) 8532{ 8533 Py_ssize_t result; 8534 8535 str = PyUnicode_FromObject(str); 8536 if (str == NULL) 8537 return -1; 8538 substr = PyUnicode_FromObject(substr); 8539 if (substr == NULL) { 8540 Py_DECREF(str); 8541 return -1; 8542 } 8543 8544 result = tailmatch((PyUnicodeObject *)str, 8545 (PyUnicodeObject *)substr, 8546 start, end, direction); 8547 Py_DECREF(str); 8548 Py_DECREF(substr); 8549 return result; 8550} 8551 8552/* Apply fixfct filter to the Unicode object self and return a 8553 reference to the modified object */ 8554 8555static PyObject * 8556fixup(PyUnicodeObject *self, 8557 Py_UCS4 (*fixfct)(PyUnicodeObject *s)) 8558{ 8559 PyObject *u; 8560 Py_UCS4 maxchar_old, maxchar_new = 0; 8561 8562 if (PyUnicode_READY(self) == -1) 8563 return NULL; 8564 maxchar_old = PyUnicode_MAX_CHAR_VALUE(self); 8565 u = PyUnicode_New(PyUnicode_GET_LENGTH(self), 8566 maxchar_old); 8567 if (u == NULL) 8568 return NULL; 8569 8570 Py_MEMCPY(PyUnicode_1BYTE_DATA(u), PyUnicode_1BYTE_DATA(self), 8571 PyUnicode_GET_LENGTH(u) * 
PyUnicode_CHARACTER_SIZE(u)); 8572 8573 /* fix functions return the new maximum character in a string, 8574 if the kind of the resulting unicode object does not change, 8575 everything is fine. Otherwise we need to change the string kind 8576 and re-run the fix function. */ 8577 maxchar_new = fixfct((PyUnicodeObject*)u); 8578 if (maxchar_new == 0) 8579 /* do nothing, keep maxchar_new at 0 which means no changes. */; 8580 else if (maxchar_new <= 127) 8581 maxchar_new = 127; 8582 else if (maxchar_new <= 255) 8583 maxchar_new = 255; 8584 else if (maxchar_new <= 65535) 8585 maxchar_new = 65535; 8586 else 8587 maxchar_new = 1114111; /* 0x10ffff */ 8588 8589 if (!maxchar_new && PyUnicode_CheckExact(self)) { 8590 /* fixfct should return TRUE if it modified the buffer. If 8591 FALSE, return a reference to the original buffer instead 8592 (to save space, not time) */ 8593 Py_INCREF(self); 8594 Py_DECREF(u); 8595 return (PyObject*) self; 8596 } 8597 else if (maxchar_new == maxchar_old) { 8598 return u; 8599 } 8600 else { 8601 /* In case the maximum character changed, we need to 8602 convert the string to the new category. */ 8603 PyObject *v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new); 8604 if (v == NULL) { 8605 Py_DECREF(u); 8606 return NULL; 8607 } 8608 if (maxchar_new > maxchar_old) { 8609 /* If the maxchar increased so that the kind changed, not all 8610 characters are representable anymore and we need to fix the 8611 string again. This only happens in very few cases. 
*/ 8612 if (PyUnicode_CopyCharacters(v, 0, 8613 (PyObject*)self, 0, 8614 PyUnicode_GET_LENGTH(self)) < 0) 8615 { 8616 Py_DECREF(u); 8617 return NULL; 8618 } 8619 maxchar_old = fixfct((PyUnicodeObject*)v); 8620 assert(maxchar_old > 0 && maxchar_old <= maxchar_new); 8621 } 8622 else { 8623 if (PyUnicode_CopyCharacters(v, 0, 8624 u, 0, 8625 PyUnicode_GET_LENGTH(self)) < 0) 8626 { 8627 Py_DECREF(u); 8628 return NULL; 8629 } 8630 } 8631 8632 Py_DECREF(u); 8633 return v; 8634 } 8635} 8636 8637static Py_UCS4 8638fixupper(PyUnicodeObject *self) 8639{ 8640 /* No need to call PyUnicode_READY(self) because this function is only 8641 called as a callback from fixup() which does it already. */ 8642 const Py_ssize_t len = PyUnicode_GET_LENGTH(self); 8643 const int kind = PyUnicode_KIND(self); 8644 void *data = PyUnicode_DATA(self); 8645 int touched = 0; 8646 Py_UCS4 maxchar = 0; 8647 Py_ssize_t i; 8648 8649 for (i = 0; i < len; ++i) { 8650 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); 8651 const Py_UCS4 up = Py_UNICODE_TOUPPER(ch); 8652 if (up != ch) { 8653 if (up > maxchar) 8654 maxchar = up; 8655 PyUnicode_WRITE(kind, data, i, up); 8656 touched = 1; 8657 } 8658 else if (ch > maxchar) 8659 maxchar = ch; 8660 } 8661 8662 if (touched) 8663 return maxchar; 8664 else 8665 return 0; 8666} 8667 8668static Py_UCS4 8669fixlower(PyUnicodeObject *self) 8670{ 8671 /* No need to call PyUnicode_READY(self) because fixup() which does it. 
*/ 8672 const Py_ssize_t len = PyUnicode_GET_LENGTH(self); 8673 const int kind = PyUnicode_KIND(self); 8674 void *data = PyUnicode_DATA(self); 8675 int touched = 0; 8676 Py_UCS4 maxchar = 0; 8677 Py_ssize_t i; 8678 8679 for(i = 0; i < len; ++i) { 8680 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); 8681 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch); 8682 if (lo != ch) { 8683 if (lo > maxchar) 8684 maxchar = lo; 8685 PyUnicode_WRITE(kind, data, i, lo); 8686 touched = 1; 8687 } 8688 else if (ch > maxchar) 8689 maxchar = ch; 8690 } 8691 8692 if (touched) 8693 return maxchar; 8694 else 8695 return 0; 8696} 8697 8698static Py_UCS4 8699fixswapcase(PyUnicodeObject *self) 8700{ 8701 /* No need to call PyUnicode_READY(self) because fixup() which does it. */ 8702 const Py_ssize_t len = PyUnicode_GET_LENGTH(self); 8703 const int kind = PyUnicode_KIND(self); 8704 void *data = PyUnicode_DATA(self); 8705 int touched = 0; 8706 Py_UCS4 maxchar = 0; 8707 Py_ssize_t i; 8708 8709 for(i = 0; i < len; ++i) { 8710 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); 8711 Py_UCS4 nu = 0; 8712 8713 if (Py_UNICODE_ISUPPER(ch)) 8714 nu = Py_UNICODE_TOLOWER(ch); 8715 else if (Py_UNICODE_ISLOWER(ch)) 8716 nu = Py_UNICODE_TOUPPER(ch); 8717 8718 if (nu != 0) { 8719 if (nu > maxchar) 8720 maxchar = nu; 8721 PyUnicode_WRITE(kind, data, i, nu); 8722 touched = 1; 8723 } 8724 else if (ch > maxchar) 8725 maxchar = ch; 8726 } 8727 8728 if (touched) 8729 return maxchar; 8730 else 8731 return 0; 8732} 8733 8734static Py_UCS4 8735fixcapitalize(PyUnicodeObject *self) 8736{ 8737 /* No need to call PyUnicode_READY(self) because fixup() which does it. 
*/ 8738 const Py_ssize_t len = PyUnicode_GET_LENGTH(self); 8739 const int kind = PyUnicode_KIND(self); 8740 void *data = PyUnicode_DATA(self); 8741 int touched = 0; 8742 Py_UCS4 maxchar = 0; 8743 Py_ssize_t i = 0; 8744 Py_UCS4 ch; 8745 8746 if (len == 0) 8747 return 0; 8748 8749 ch = PyUnicode_READ(kind, data, i); 8750 if (!Py_UNICODE_ISUPPER(ch)) { 8751 maxchar = Py_UNICODE_TOUPPER(ch); 8752 PyUnicode_WRITE(kind, data, i, maxchar); 8753 touched = 1; 8754 } 8755 ++i; 8756 for(; i < len; ++i) { 8757 ch = PyUnicode_READ(kind, data, i); 8758 if (!Py_UNICODE_ISLOWER(ch)) { 8759 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch); 8760 if (lo > maxchar) 8761 maxchar = lo; 8762 PyUnicode_WRITE(kind, data, i, lo); 8763 touched = 1; 8764 } 8765 else if (ch > maxchar) 8766 maxchar = ch; 8767 } 8768 8769 if (touched) 8770 return maxchar; 8771 else 8772 return 0; 8773} 8774 8775static Py_UCS4 8776fixtitle(PyUnicodeObject *self) 8777{ 8778 /* No need to call PyUnicode_READY(self) because fixup() which does it. 
*/ 8779 const Py_ssize_t len = PyUnicode_GET_LENGTH(self); 8780 const int kind = PyUnicode_KIND(self); 8781 void *data = PyUnicode_DATA(self); 8782 Py_UCS4 maxchar = 0; 8783 Py_ssize_t i = 0; 8784 int previous_is_cased; 8785 8786 /* Shortcut for single character strings */ 8787 if (len == 1) { 8788 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); 8789 const Py_UCS4 ti = Py_UNICODE_TOTITLE(ch); 8790 if (ti != ch) { 8791 PyUnicode_WRITE(kind, data, i, ti); 8792 return ti; 8793 } 8794 else 8795 return 0; 8796 } 8797 previous_is_cased = 0; 8798 for(; i < len; ++i) { 8799 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); 8800 Py_UCS4 nu; 8801 8802 if (previous_is_cased) 8803 nu = Py_UNICODE_TOLOWER(ch); 8804 else 8805 nu = Py_UNICODE_TOTITLE(ch); 8806 8807 if (nu > maxchar) 8808 maxchar = nu; 8809 PyUnicode_WRITE(kind, data, i, nu); 8810 8811 if (Py_UNICODE_ISLOWER(ch) || 8812 Py_UNICODE_ISUPPER(ch) || 8813 Py_UNICODE_ISTITLE(ch)) 8814 previous_is_cased = 1; 8815 else 8816 previous_is_cased = 0; 8817 } 8818 return maxchar; 8819} 8820 8821PyObject * 8822PyUnicode_Join(PyObject *separator, PyObject *seq) 8823{ 8824 PyObject *sep = NULL; 8825 Py_ssize_t seplen = 1; 8826 PyObject *res = NULL; /* the result */ 8827 PyObject *fseq; /* PySequence_Fast(seq) */ 8828 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */ 8829 PyObject **items; 8830 PyObject *item; 8831 Py_ssize_t sz, i, res_offset; 8832 Py_UCS4 maxchar = 0; 8833 Py_UCS4 item_maxchar; 8834 8835 fseq = PySequence_Fast(seq, ""); 8836 if (fseq == NULL) { 8837 return NULL; 8838 } 8839 8840 /* NOTE: the following code can't call back into Python code, 8841 * so we are sure that fseq won't be mutated. 8842 */ 8843 8844 seqlen = PySequence_Fast_GET_SIZE(fseq); 8845 /* If empty sequence, return u"". */ 8846 if (seqlen == 0) { 8847 res = PyUnicode_New(0, 0); 8848 goto Done; 8849 } 8850 items = PySequence_Fast_ITEMS(fseq); 8851 /* If singleton sequence with an exact Unicode, return that. 
*/ 8852 if (seqlen == 1) { 8853 item = items[0]; 8854 if (PyUnicode_CheckExact(item)) { 8855 Py_INCREF(item); 8856 res = item; 8857 goto Done; 8858 } 8859 } 8860 else { 8861 /* Set up sep and seplen */ 8862 if (separator == NULL) { 8863 /* fall back to a blank space separator */ 8864 sep = PyUnicode_FromOrdinal(' '); 8865 if (!sep) 8866 goto onError; 8867 } 8868 else { 8869 if (!PyUnicode_Check(separator)) { 8870 PyErr_Format(PyExc_TypeError, 8871 "separator: expected str instance," 8872 " %.80s found", 8873 Py_TYPE(separator)->tp_name); 8874 goto onError; 8875 } 8876 if (PyUnicode_READY(separator)) 8877 goto onError; 8878 sep = separator; 8879 seplen = PyUnicode_GET_LENGTH(separator); 8880 maxchar = PyUnicode_MAX_CHAR_VALUE(separator); 8881 /* inc refcount to keep this code path symetric with the 8882 above case of a blank separator */ 8883 Py_INCREF(sep); 8884 } 8885 } 8886 8887 /* There are at least two things to join, or else we have a subclass 8888 * of str in the sequence. 8889 * Do a pre-pass to figure out the total amount of space we'll 8890 * need (sz), and see whether all argument are strings. 8891 */ 8892 sz = 0; 8893 for (i = 0; i < seqlen; i++) { 8894 const Py_ssize_t old_sz = sz; 8895 item = items[i]; 8896 if (!PyUnicode_Check(item)) { 8897 PyErr_Format(PyExc_TypeError, 8898 "sequence item %zd: expected str instance," 8899 " %.80s found", 8900 i, Py_TYPE(item)->tp_name); 8901 goto onError; 8902 } 8903 if (PyUnicode_READY(item) == -1) 8904 goto onError; 8905 sz += PyUnicode_GET_LENGTH(item); 8906 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item); 8907 if (item_maxchar > maxchar) 8908 maxchar = item_maxchar; 8909 if (i != 0) 8910 sz += seplen; 8911 if (sz < old_sz || sz > PY_SSIZE_T_MAX) { 8912 PyErr_SetString(PyExc_OverflowError, 8913 "join() result is too long for a Python string"); 8914 goto onError; 8915 } 8916 } 8917 8918 res = PyUnicode_New(sz, maxchar); 8919 if (res == NULL) 8920 goto onError; 8921 8922 /* Catenate everything. 
*/ 8923 for (i = 0, res_offset = 0; i < seqlen; ++i) { 8924 Py_ssize_t itemlen, copied; 8925 item = items[i]; 8926 /* Copy item, and maybe the separator. */ 8927 if (i && seplen != 0) { 8928 copied = PyUnicode_CopyCharacters(res, res_offset, 8929 sep, 0, seplen); 8930 if (copied < 0) 8931 goto onError; 8932#ifdef Py_DEBUG 8933 res_offset += copied; 8934#else 8935 res_offset += seplen; 8936#endif 8937 } 8938 itemlen = PyUnicode_GET_LENGTH(item); 8939 if (itemlen != 0) { 8940 copied = PyUnicode_CopyCharacters(res, res_offset, 8941 item, 0, itemlen); 8942 if (copied < 0) 8943 goto onError; 8944#ifdef Py_DEBUG 8945 res_offset += copied; 8946#else 8947 res_offset += itemlen; 8948#endif 8949 } 8950 } 8951 assert(res_offset == PyUnicode_GET_LENGTH(res)); 8952 8953 Done: 8954 Py_DECREF(fseq); 8955 Py_XDECREF(sep); 8956 return res; 8957 8958 onError: 8959 Py_DECREF(fseq); 8960 Py_XDECREF(sep); 8961 Py_XDECREF(res); 8962 return NULL; 8963} 8964 8965#define FILL(kind, data, value, start, length) \ 8966 do { \ 8967 Py_ssize_t i_ = 0; \ 8968 assert(kind != PyUnicode_WCHAR_KIND); \ 8969 switch ((kind)) { \ 8970 case PyUnicode_1BYTE_KIND: { \ 8971 unsigned char * to_ = (unsigned char *)((data)) + (start); \ 8972 memset(to_, (unsigned char)value, length); \ 8973 break; \ 8974 } \ 8975 case PyUnicode_2BYTE_KIND: { \ 8976 Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \ 8977 for (; i_ < (length); ++i_, ++to_) *to_ = (value); \ 8978 break; \ 8979 } \ 8980 default: { \ 8981 Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \ 8982 for (; i_ < (length); ++i_, ++to_) *to_ = (value); \ 8983 break; \ 8984 } \ 8985 } \ 8986 } while (0) 8987 8988static PyUnicodeObject * 8989pad(PyUnicodeObject *self, 8990 Py_ssize_t left, 8991 Py_ssize_t right, 8992 Py_UCS4 fill) 8993{ 8994 PyObject *u; 8995 Py_UCS4 maxchar; 8996 int kind; 8997 void *data; 8998 8999 if (left < 0) 9000 left = 0; 9001 if (right < 0) 9002 right = 0; 9003 9004 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) { 9005 
Py_INCREF(self); 9006 return self; 9007 } 9008 9009 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) || 9010 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) { 9011 PyErr_SetString(PyExc_OverflowError, "padded string is too long"); 9012 return NULL; 9013 } 9014 maxchar = PyUnicode_MAX_CHAR_VALUE(self); 9015 if (fill > maxchar) 9016 maxchar = fill; 9017 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar); 9018 if (!u) 9019 return NULL; 9020 9021 kind = PyUnicode_KIND(u); 9022 data = PyUnicode_DATA(u); 9023 if (left) 9024 FILL(kind, data, fill, 0, left); 9025 if (right) 9026 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right); 9027 if (PyUnicode_CopyCharacters(u, left, 9028 (PyObject*)self, 0, 9029 _PyUnicode_LENGTH(self)) < 0) 9030 { 9031 Py_DECREF(u); 9032 return NULL; 9033 } 9034 9035 return (PyUnicodeObject*)u; 9036} 9037#undef FILL 9038 9039PyObject * 9040PyUnicode_Splitlines(PyObject *string, int keepends) 9041{ 9042 PyObject *list; 9043 9044 string = PyUnicode_FromObject(string); 9045 if (string == NULL || PyUnicode_READY(string) == -1) 9046 return NULL; 9047 9048 switch(PyUnicode_KIND(string)) { 9049 case PyUnicode_1BYTE_KIND: 9050 list = ucs1lib_splitlines( 9051 (PyObject*) string, PyUnicode_1BYTE_DATA(string), 9052 PyUnicode_GET_LENGTH(string), keepends); 9053 break; 9054 case PyUnicode_2BYTE_KIND: 9055 list = ucs2lib_splitlines( 9056 (PyObject*) string, PyUnicode_2BYTE_DATA(string), 9057 PyUnicode_GET_LENGTH(string), keepends); 9058 break; 9059 case PyUnicode_4BYTE_KIND: 9060 list = ucs4lib_splitlines( 9061 (PyObject*) string, PyUnicode_4BYTE_DATA(string), 9062 PyUnicode_GET_LENGTH(string), keepends); 9063 break; 9064 default: 9065 assert(0); 9066 list = 0; 9067 } 9068 Py_DECREF(string); 9069 return list; 9070} 9071 9072static PyObject * 9073split(PyUnicodeObject *self, 9074 PyUnicodeObject *substring, 9075 Py_ssize_t maxcount) 9076{ 9077 int kind1, kind2, kind; 9078 void *buf1, *buf2; 9079 Py_ssize_t len1, len2; 9080 
PyObject* out; 9081 9082 if (maxcount < 0) 9083 maxcount = PY_SSIZE_T_MAX; 9084 9085 if (PyUnicode_READY(self) == -1) 9086 return NULL; 9087 9088 if (substring == NULL) 9089 switch(PyUnicode_KIND(self)) { 9090 case PyUnicode_1BYTE_KIND: 9091 return ucs1lib_split_whitespace( 9092 (PyObject*) self, PyUnicode_1BYTE_DATA(self), 9093 PyUnicode_GET_LENGTH(self), maxcount 9094 ); 9095 case PyUnicode_2BYTE_KIND: 9096 return ucs2lib_split_whitespace( 9097 (PyObject*) self, PyUnicode_2BYTE_DATA(self), 9098 PyUnicode_GET_LENGTH(self), maxcount 9099 ); 9100 case PyUnicode_4BYTE_KIND: 9101 return ucs4lib_split_whitespace( 9102 (PyObject*) self, PyUnicode_4BYTE_DATA(self), 9103 PyUnicode_GET_LENGTH(self), maxcount 9104 ); 9105 default: 9106 assert(0); 9107 return NULL; 9108 } 9109 9110 if (PyUnicode_READY(substring) == -1) 9111 return NULL; 9112 9113 kind1 = PyUnicode_KIND(self); 9114 kind2 = PyUnicode_KIND(substring); 9115 kind = kind1 > kind2 ? kind1 : kind2; 9116 buf1 = PyUnicode_DATA(self); 9117 buf2 = PyUnicode_DATA(substring); 9118 if (kind1 != kind) 9119 buf1 = _PyUnicode_AsKind((PyObject*)self, kind); 9120 if (!buf1) 9121 return NULL; 9122 if (kind2 != kind) 9123 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind); 9124 if (!buf2) { 9125 if (kind1 != kind) PyMem_Free(buf1); 9126 return NULL; 9127 } 9128 len1 = PyUnicode_GET_LENGTH(self); 9129 len2 = PyUnicode_GET_LENGTH(substring); 9130 9131 switch(kind) { 9132 case PyUnicode_1BYTE_KIND: 9133 out = ucs1lib_split( 9134 (PyObject*) self, buf1, len1, buf2, len2, maxcount); 9135 break; 9136 case PyUnicode_2BYTE_KIND: 9137 out = ucs2lib_split( 9138 (PyObject*) self, buf1, len1, buf2, len2, maxcount); 9139 break; 9140 case PyUnicode_4BYTE_KIND: 9141 out = ucs4lib_split( 9142 (PyObject*) self, buf1, len1, buf2, len2, maxcount); 9143 break; 9144 default: 9145 out = NULL; 9146 } 9147 if (kind1 != kind) 9148 PyMem_Free(buf1); 9149 if (kind2 != kind) 9150 PyMem_Free(buf2); 9151 return out; 9152} 9153 9154static PyObject * 
9155rsplit(PyUnicodeObject *self, 9156 PyUnicodeObject *substring, 9157 Py_ssize_t maxcount) 9158{ 9159 int kind1, kind2, kind; 9160 void *buf1, *buf2; 9161 Py_ssize_t len1, len2; 9162 PyObject* out; 9163 9164 if (maxcount < 0) 9165 maxcount = PY_SSIZE_T_MAX; 9166 9167 if (PyUnicode_READY(self) == -1) 9168 return NULL; 9169 9170 if (substring == NULL) 9171 switch(PyUnicode_KIND(self)) { 9172 case PyUnicode_1BYTE_KIND: 9173 return ucs1lib_rsplit_whitespace( 9174 (PyObject*) self, PyUnicode_1BYTE_DATA(self), 9175 PyUnicode_GET_LENGTH(self), maxcount 9176 ); 9177 case PyUnicode_2BYTE_KIND: 9178 return ucs2lib_rsplit_whitespace( 9179 (PyObject*) self, PyUnicode_2BYTE_DATA(self), 9180 PyUnicode_GET_LENGTH(self), maxcount 9181 ); 9182 case PyUnicode_4BYTE_KIND: 9183 return ucs4lib_rsplit_whitespace( 9184 (PyObject*) self, PyUnicode_4BYTE_DATA(self), 9185 PyUnicode_GET_LENGTH(self), maxcount 9186 ); 9187 default: 9188 assert(0); 9189 return NULL; 9190 } 9191 9192 if (PyUnicode_READY(substring) == -1) 9193 return NULL; 9194 9195 kind1 = PyUnicode_KIND(self); 9196 kind2 = PyUnicode_KIND(substring); 9197 kind = kind1 > kind2 ? 
kind1 : kind2; 9198 buf1 = PyUnicode_DATA(self); 9199 buf2 = PyUnicode_DATA(substring); 9200 if (kind1 != kind) 9201 buf1 = _PyUnicode_AsKind((PyObject*)self, kind); 9202 if (!buf1) 9203 return NULL; 9204 if (kind2 != kind) 9205 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind); 9206 if (!buf2) { 9207 if (kind1 != kind) PyMem_Free(buf1); 9208 return NULL; 9209 } 9210 len1 = PyUnicode_GET_LENGTH(self); 9211 len2 = PyUnicode_GET_LENGTH(substring); 9212 9213 switch(kind) { 9214 case PyUnicode_1BYTE_KIND: 9215 out = ucs1lib_rsplit( 9216 (PyObject*) self, buf1, len1, buf2, len2, maxcount); 9217 break; 9218 case PyUnicode_2BYTE_KIND: 9219 out = ucs2lib_rsplit( 9220 (PyObject*) self, buf1, len1, buf2, len2, maxcount); 9221 break; 9222 case PyUnicode_4BYTE_KIND: 9223 out = ucs4lib_rsplit( 9224 (PyObject*) self, buf1, len1, buf2, len2, maxcount); 9225 break; 9226 default: 9227 out = NULL; 9228 } 9229 if (kind1 != kind) 9230 PyMem_Free(buf1); 9231 if (kind2 != kind) 9232 PyMem_Free(buf2); 9233 return out; 9234} 9235 9236static Py_ssize_t 9237anylib_find(int kind, void *buf1, Py_ssize_t len1, 9238 void *buf2, Py_ssize_t len2, Py_ssize_t offset) 9239{ 9240 switch(kind) { 9241 case PyUnicode_1BYTE_KIND: 9242 return ucs1lib_find(buf1, len1, buf2, len2, offset); 9243 case PyUnicode_2BYTE_KIND: 9244 return ucs2lib_find(buf1, len1, buf2, len2, offset); 9245 case PyUnicode_4BYTE_KIND: 9246 return ucs4lib_find(buf1, len1, buf2, len2, offset); 9247 } 9248 assert(0); 9249 return -1; 9250} 9251 9252static Py_ssize_t 9253anylib_count(int kind, void* sbuf, Py_ssize_t slen, 9254 void *buf1, Py_ssize_t len1, Py_ssize_t maxcount) 9255{ 9256 switch(kind) { 9257 case PyUnicode_1BYTE_KIND: 9258 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount); 9259 case PyUnicode_2BYTE_KIND: 9260 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount); 9261 case PyUnicode_4BYTE_KIND: 9262 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount); 9263 } 9264 assert(0); 9265 return 0; 9266} 9267 9268static 
PyObject * 9269replace(PyObject *self, PyObject *str1, 9270 PyObject *str2, Py_ssize_t maxcount) 9271{ 9272 PyObject *u; 9273 char *sbuf = PyUnicode_DATA(self); 9274 char *buf1 = PyUnicode_DATA(str1); 9275 char *buf2 = PyUnicode_DATA(str2); 9276 int srelease = 0, release1 = 0, release2 = 0; 9277 int skind = PyUnicode_KIND(self); 9278 int kind1 = PyUnicode_KIND(str1); 9279 int kind2 = PyUnicode_KIND(str2); 9280 Py_ssize_t slen = PyUnicode_GET_LENGTH(self); 9281 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1); 9282 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2); 9283 9284 if (maxcount < 0) 9285 maxcount = PY_SSIZE_T_MAX; 9286 else if (maxcount == 0 || slen == 0) 9287 goto nothing; 9288 9289 if (skind < kind1) 9290 /* substring too wide to be present */ 9291 goto nothing; 9292 9293 if (len1 == len2) { 9294 Py_ssize_t i; 9295 /* same length */ 9296 if (len1 == 0) 9297 goto nothing; 9298 if (len1 == 1) { 9299 /* replace characters */ 9300 Py_UCS4 u1, u2, maxchar; 9301 int mayshrink, rkind; 9302 u1 = PyUnicode_READ_CHAR(str1, 0); 9303 if (!findchar(sbuf, PyUnicode_KIND(self), 9304 slen, u1, 1)) 9305 goto nothing; 9306 u2 = PyUnicode_READ_CHAR(str2, 0); 9307 maxchar = PyUnicode_MAX_CHAR_VALUE(self); 9308 /* Replacing u1 with u2 may cause a maxchar reduction in the 9309 result string. 
*/ 9310 mayshrink = maxchar > 127; 9311 if (u2 > maxchar) { 9312 maxchar = u2; 9313 mayshrink = 0; 9314 } 9315 u = PyUnicode_New(slen, maxchar); 9316 if (!u) 9317 goto error; 9318 if (PyUnicode_CopyCharacters(u, 0, 9319 (PyObject*)self, 0, slen) < 0) 9320 { 9321 Py_DECREF(u); 9322 return NULL; 9323 } 9324 rkind = PyUnicode_KIND(u); 9325 for (i = 0; i < PyUnicode_GET_LENGTH(u); i++) 9326 if (PyUnicode_READ(rkind, PyUnicode_DATA(u), i) == u1) { 9327 if (--maxcount < 0) 9328 break; 9329 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), i, u2); 9330 } 9331 if (mayshrink) { 9332 PyObject *tmp = u; 9333 u = PyUnicode_FromKindAndData(rkind, PyUnicode_DATA(tmp), 9334 PyUnicode_GET_LENGTH(tmp)); 9335 Py_DECREF(tmp); 9336 } 9337 } else { 9338 int rkind = skind; 9339 char *res; 9340 if (kind1 < rkind) { 9341 /* widen substring */ 9342 buf1 = _PyUnicode_AsKind(str1, rkind); 9343 if (!buf1) goto error; 9344 release1 = 1; 9345 } 9346 i = anylib_find(rkind, sbuf, slen, buf1, len1, 0); 9347 if (i < 0) 9348 goto nothing; 9349 if (rkind > kind2) { 9350 /* widen replacement */ 9351 buf2 = _PyUnicode_AsKind(str2, rkind); 9352 if (!buf2) goto error; 9353 release2 = 1; 9354 } 9355 else if (rkind < kind2) { 9356 /* widen self and buf1 */ 9357 rkind = kind2; 9358 if (release1) PyMem_Free(buf1); 9359 sbuf = _PyUnicode_AsKind(self, rkind); 9360 if (!sbuf) goto error; 9361 srelease = 1; 9362 buf1 = _PyUnicode_AsKind(str1, rkind); 9363 if (!buf1) goto error; 9364 release1 = 1; 9365 } 9366 res = PyMem_Malloc(PyUnicode_KIND_SIZE(rkind, slen)); 9367 if (!res) { 9368 PyErr_NoMemory(); 9369 goto error; 9370 } 9371 memcpy(res, sbuf, PyUnicode_KIND_SIZE(rkind, slen)); 9372 /* change everything in-place, starting with this one */ 9373 memcpy(res + PyUnicode_KIND_SIZE(rkind, i), 9374 buf2, 9375 PyUnicode_KIND_SIZE(rkind, len2)); 9376 i += len1; 9377 9378 while ( --maxcount > 0) { 9379 i = anylib_find(rkind, sbuf+PyUnicode_KIND_SIZE(rkind, i), 9380 slen-i, 9381 buf1, len1, i); 9382 if (i == -1) 9383 break; 
9384 memcpy(res + PyUnicode_KIND_SIZE(rkind, i), 9385 buf2, 9386 PyUnicode_KIND_SIZE(rkind, len2)); 9387 i += len1; 9388 } 9389 9390 u = PyUnicode_FromKindAndData(rkind, res, slen); 9391 PyMem_Free(res); 9392 if (!u) goto error; 9393 } 9394 } else { 9395 9396 Py_ssize_t n, i, j, ires; 9397 Py_ssize_t product, new_size; 9398 int rkind = skind; 9399 char *res; 9400 9401 if (kind1 < rkind) { 9402 buf1 = _PyUnicode_AsKind(str1, rkind); 9403 if (!buf1) goto error; 9404 release1 = 1; 9405 } 9406 n = anylib_count(rkind, sbuf, slen, buf1, len1, maxcount); 9407 if (n == 0) 9408 goto nothing; 9409 if (kind2 < rkind) { 9410 buf2 = _PyUnicode_AsKind(str2, rkind); 9411 if (!buf2) goto error; 9412 release2 = 1; 9413 } 9414 else if (kind2 > rkind) { 9415 rkind = kind2; 9416 sbuf = _PyUnicode_AsKind(self, rkind); 9417 if (!sbuf) goto error; 9418 srelease = 1; 9419 if (release1) PyMem_Free(buf1); 9420 buf1 = _PyUnicode_AsKind(str1, rkind); 9421 if (!buf1) goto error; 9422 release1 = 1; 9423 } 9424 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) - 9425 PyUnicode_GET_LENGTH(str1))); */ 9426 product = n * (len2-len1); 9427 if ((product / (len2-len1)) != n) { 9428 PyErr_SetString(PyExc_OverflowError, 9429 "replace string is too long"); 9430 goto error; 9431 } 9432 new_size = slen + product; 9433 if (new_size < 0 || new_size > (PY_SSIZE_T_MAX >> (rkind-1))) { 9434 PyErr_SetString(PyExc_OverflowError, 9435 "replace string is too long"); 9436 goto error; 9437 } 9438 res = PyMem_Malloc(PyUnicode_KIND_SIZE(rkind, new_size)); 9439 if (!res) 9440 goto error; 9441 ires = i = 0; 9442 if (len1 > 0) { 9443 while (n-- > 0) { 9444 /* look for next match */ 9445 j = anylib_find(rkind, 9446 sbuf + PyUnicode_KIND_SIZE(rkind, i), 9447 slen-i, buf1, len1, i); 9448 if (j == -1) 9449 break; 9450 else if (j > i) { 9451 /* copy unchanged part [i:j] */ 9452 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires), 9453 sbuf + PyUnicode_KIND_SIZE(rkind, i), 9454 PyUnicode_KIND_SIZE(rkind, 
j-i)); 9455 ires += j - i; 9456 } 9457 /* copy substitution string */ 9458 if (len2 > 0) { 9459 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires), 9460 buf2, 9461 PyUnicode_KIND_SIZE(rkind, len2)); 9462 ires += len2; 9463 } 9464 i = j + len1; 9465 } 9466 if (i < slen) 9467 /* copy tail [i:] */ 9468 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires), 9469 sbuf + PyUnicode_KIND_SIZE(rkind, i), 9470 PyUnicode_KIND_SIZE(rkind, slen-i)); 9471 } else { 9472 /* interleave */ 9473 while (n > 0) { 9474 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires), 9475 buf2, 9476 PyUnicode_KIND_SIZE(rkind, len2)); 9477 ires += len2; 9478 if (--n <= 0) 9479 break; 9480 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires), 9481 sbuf + PyUnicode_KIND_SIZE(rkind, i), 9482 PyUnicode_KIND_SIZE(rkind, 1)); 9483 ires++; 9484 i++; 9485 } 9486 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires), 9487 sbuf + PyUnicode_KIND_SIZE(rkind, i), 9488 PyUnicode_KIND_SIZE(rkind, slen-i)); 9489 } 9490 u = PyUnicode_FromKindAndData(rkind, res, new_size); 9491 PyMem_Free(res); 9492 } 9493 if (srelease) 9494 PyMem_FREE(sbuf); 9495 if (release1) 9496 PyMem_FREE(buf1); 9497 if (release2) 9498 PyMem_FREE(buf2); 9499 return u; 9500 9501 nothing: 9502 /* nothing to replace; return original string (when possible) */ 9503 if (srelease) 9504 PyMem_FREE(sbuf); 9505 if (release1) 9506 PyMem_FREE(buf1); 9507 if (release2) 9508 PyMem_FREE(buf2); 9509 if (PyUnicode_CheckExact(self)) { 9510 Py_INCREF(self); 9511 return (PyObject *) self; 9512 } 9513 return PyUnicode_Copy(self); 9514 error: 9515 if (srelease && sbuf) 9516 PyMem_FREE(sbuf); 9517 if (release1 && buf1) 9518 PyMem_FREE(buf1); 9519 if (release2 && buf2) 9520 PyMem_FREE(buf2); 9521 return NULL; 9522} 9523 9524/* --- Unicode Object Methods --------------------------------------------- */ 9525 9526PyDoc_STRVAR(title__doc__, 9527 "S.title() -> str\n\ 9528\n\ 9529Return a titlecased version of S, i.e. 
words start with title case\n\ 9530characters, all remaining cased characters have lower case."); 9531 9532static PyObject* 9533unicode_title(PyUnicodeObject *self) 9534{ 9535 return fixup(self, fixtitle); 9536} 9537 9538PyDoc_STRVAR(capitalize__doc__, 9539 "S.capitalize() -> str\n\ 9540\n\ 9541Return a capitalized version of S, i.e. make the first character\n\ 9542have upper case and the rest lower case."); 9543 9544static PyObject* 9545unicode_capitalize(PyUnicodeObject *self) 9546{ 9547 return fixup(self, fixcapitalize); 9548} 9549 9550#if 0 9551PyDoc_STRVAR(capwords__doc__, 9552 "S.capwords() -> str\n\ 9553\n\ 9554Apply .capitalize() to all words in S and return the result with\n\ 9555normalized whitespace (all whitespace strings are replaced by ' ')."); 9556 9557static PyObject* 9558unicode_capwords(PyUnicodeObject *self) 9559{ 9560 PyObject *list; 9561 PyObject *item; 9562 Py_ssize_t i; 9563 9564 /* Split into words */ 9565 list = split(self, NULL, -1); 9566 if (!list) 9567 return NULL; 9568 9569 /* Capitalize each word */ 9570 for (i = 0; i < PyList_GET_SIZE(list); i++) { 9571 item = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, i), 9572 fixcapitalize); 9573 if (item == NULL) 9574 goto onError; 9575 Py_DECREF(PyList_GET_ITEM(list, i)); 9576 PyList_SET_ITEM(list, i, item); 9577 } 9578 9579 /* Join the words to form a new string */ 9580 item = PyUnicode_Join(NULL, list); 9581 9582 onError: 9583 Py_DECREF(list); 9584 return (PyObject *)item; 9585} 9586#endif 9587 9588/* Argument converter. 
Coerces to a single unicode character */ 9589 9590static int 9591convert_uc(PyObject *obj, void *addr) 9592{ 9593 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr; 9594 PyObject *uniobj; 9595 9596 uniobj = PyUnicode_FromObject(obj); 9597 if (uniobj == NULL) { 9598 PyErr_SetString(PyExc_TypeError, 9599 "The fill character cannot be converted to Unicode"); 9600 return 0; 9601 } 9602 if (PyUnicode_GET_LENGTH(uniobj) != 1) { 9603 PyErr_SetString(PyExc_TypeError, 9604 "The fill character must be exactly one character long"); 9605 Py_DECREF(uniobj); 9606 return 0; 9607 } 9608 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0); 9609 Py_DECREF(uniobj); 9610 return 1; 9611} 9612 9613PyDoc_STRVAR(center__doc__, 9614 "S.center(width[, fillchar]) -> str\n\ 9615\n\ 9616Return S centered in a string of length width. Padding is\n\ 9617done using the specified fill character (default is a space)"); 9618 9619static PyObject * 9620unicode_center(PyUnicodeObject *self, PyObject *args) 9621{ 9622 Py_ssize_t marg, left; 9623 Py_ssize_t width; 9624 Py_UCS4 fillchar = ' '; 9625 9626 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar)) 9627 return NULL; 9628 9629 if (PyUnicode_READY(self) == -1) 9630 return NULL; 9631 9632 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) { 9633 Py_INCREF(self); 9634 return (PyObject*) self; 9635 } 9636 9637 marg = width - _PyUnicode_LENGTH(self); 9638 left = marg / 2 + (marg & width & 1); 9639 9640 return (PyObject*) pad(self, left, marg - left, fillchar); 9641} 9642 9643#if 0 9644 9645/* This code should go into some future Unicode collation support 9646 module. The basic comparison should compare ordinals on a naive 9647 basis (this is what Java does and thus Jython too). 
*/ 9648 9649/* speedy UTF-16 code point order comparison */ 9650/* gleaned from: */ 9651/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */ 9652 9653static short utf16Fixup[32] = 9654{ 9655 0, 0, 0, 0, 0, 0, 0, 0, 9656 0, 0, 0, 0, 0, 0, 0, 0, 9657 0, 0, 0, 0, 0, 0, 0, 0, 9658 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800 9659}; 9660 9661static int 9662unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2) 9663{ 9664 Py_ssize_t len1, len2; 9665 9666 Py_UNICODE *s1 = str1->str; 9667 Py_UNICODE *s2 = str2->str; 9668 9669 len1 = str1->_base._base.length; 9670 len2 = str2->_base._base.length; 9671 9672 while (len1 > 0 && len2 > 0) { 9673 Py_UNICODE c1, c2; 9674 9675 c1 = *s1++; 9676 c2 = *s2++; 9677 9678 if (c1 > (1<<11) * 26) 9679 c1 += utf16Fixup[c1>>11]; 9680 if (c2 > (1<<11) * 26) 9681 c2 += utf16Fixup[c2>>11]; 9682 /* now c1 and c2 are in UTF-32-compatible order */ 9683 9684 if (c1 != c2) 9685 return (c1 < c2) ? -1 : 1; 9686 9687 len1--; len2--; 9688 } 9689 9690 return (len1 < len2) ? -1 : (len1 != len2); 9691} 9692 9693#else 9694 9695/* This function assumes that str1 and str2 are readied by the caller. */ 9696 9697static int 9698unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2) 9699{ 9700 int kind1, kind2; 9701 void *data1, *data2; 9702 Py_ssize_t len1, len2, i; 9703 9704 kind1 = PyUnicode_KIND(str1); 9705 kind2 = PyUnicode_KIND(str2); 9706 data1 = PyUnicode_DATA(str1); 9707 data2 = PyUnicode_DATA(str2); 9708 len1 = PyUnicode_GET_LENGTH(str1); 9709 len2 = PyUnicode_GET_LENGTH(str2); 9710 9711 for (i = 0; i < len1 && i < len2; ++i) { 9712 Py_UCS4 c1, c2; 9713 c1 = PyUnicode_READ(kind1, data1, i); 9714 c2 = PyUnicode_READ(kind2, data2, i); 9715 9716 if (c1 != c2) 9717 return (c1 < c2) ? -1 : 1; 9718 } 9719 9720 return (len1 < len2) ? 
-1 : (len1 != len2); 9721} 9722 9723#endif 9724 9725int 9726PyUnicode_Compare(PyObject *left, PyObject *right) 9727{ 9728 if (PyUnicode_Check(left) && PyUnicode_Check(right)) { 9729 if (PyUnicode_READY(left) == -1 || 9730 PyUnicode_READY(right) == -1) 9731 return -1; 9732 return unicode_compare((PyUnicodeObject *)left, 9733 (PyUnicodeObject *)right); 9734 } 9735 PyErr_Format(PyExc_TypeError, 9736 "Can't compare %.100s and %.100s", 9737 left->ob_type->tp_name, 9738 right->ob_type->tp_name); 9739 return -1; 9740} 9741 9742int 9743PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str) 9744{ 9745 Py_ssize_t i; 9746 int kind; 9747 void *data; 9748 Py_UCS4 chr; 9749 9750 assert(_PyUnicode_CHECK(uni)); 9751 if (PyUnicode_READY(uni) == -1) 9752 return -1; 9753 kind = PyUnicode_KIND(uni); 9754 data = PyUnicode_DATA(uni); 9755 /* Compare Unicode string and source character set string */ 9756 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++) 9757 if (chr != str[i]) 9758 return (chr < (unsigned char)(str[i])) ? -1 : 1; 9759 /* This check keeps Python strings that end in '\0' from comparing equal 9760 to C strings identical up to that point. */ 9761 if (PyUnicode_GET_LENGTH(uni) != i || chr) 9762 return 1; /* uni is longer */ 9763 if (str[i]) 9764 return -1; /* str is longer */ 9765 return 0; 9766} 9767 9768 9769#define TEST_COND(cond) \ 9770 ((cond) ? 
Py_True : Py_False) 9771 9772PyObject * 9773PyUnicode_RichCompare(PyObject *left, PyObject *right, int op) 9774{ 9775 int result; 9776 9777 if (PyUnicode_Check(left) && PyUnicode_Check(right)) { 9778 PyObject *v; 9779 if (PyUnicode_READY(left) == -1 || 9780 PyUnicode_READY(right) == -1) 9781 return NULL; 9782 if (PyUnicode_GET_LENGTH(left) != PyUnicode_GET_LENGTH(right) || 9783 PyUnicode_KIND(left) != PyUnicode_KIND(right)) { 9784 if (op == Py_EQ) { 9785 Py_INCREF(Py_False); 9786 return Py_False; 9787 } 9788 if (op == Py_NE) { 9789 Py_INCREF(Py_True); 9790 return Py_True; 9791 } 9792 } 9793 if (left == right) 9794 result = 0; 9795 else 9796 result = unicode_compare((PyUnicodeObject *)left, 9797 (PyUnicodeObject *)right); 9798 9799 /* Convert the return value to a Boolean */ 9800 switch (op) { 9801 case Py_EQ: 9802 v = TEST_COND(result == 0); 9803 break; 9804 case Py_NE: 9805 v = TEST_COND(result != 0); 9806 break; 9807 case Py_LE: 9808 v = TEST_COND(result <= 0); 9809 break; 9810 case Py_GE: 9811 v = TEST_COND(result >= 0); 9812 break; 9813 case Py_LT: 9814 v = TEST_COND(result == -1); 9815 break; 9816 case Py_GT: 9817 v = TEST_COND(result == 1); 9818 break; 9819 default: 9820 PyErr_BadArgument(); 9821 return NULL; 9822 } 9823 Py_INCREF(v); 9824 return v; 9825 } 9826 9827 Py_RETURN_NOTIMPLEMENTED; 9828} 9829 9830int 9831PyUnicode_Contains(PyObject *container, PyObject *element) 9832{ 9833 PyObject *str, *sub; 9834 int kind1, kind2, kind; 9835 void *buf1, *buf2; 9836 Py_ssize_t len1, len2; 9837 int result; 9838 9839 /* Coerce the two arguments */ 9840 sub = PyUnicode_FromObject(element); 9841 if (!sub) { 9842 PyErr_Format(PyExc_TypeError, 9843 "'in <string>' requires string as left operand, not %s", 9844 element->ob_type->tp_name); 9845 return -1; 9846 } 9847 if (PyUnicode_READY(sub) == -1) 9848 return -1; 9849 9850 str = PyUnicode_FromObject(container); 9851 if (!str || PyUnicode_READY(str) == -1) { 9852 Py_DECREF(sub); 9853 return -1; 9854 } 9855 9856 kind1 = 
PyUnicode_KIND(str); 9857 kind2 = PyUnicode_KIND(sub); 9858 kind = kind1 > kind2 ? kind1 : kind2; 9859 buf1 = PyUnicode_DATA(str); 9860 buf2 = PyUnicode_DATA(sub); 9861 if (kind1 != kind) 9862 buf1 = _PyUnicode_AsKind((PyObject*)str, kind); 9863 if (!buf1) { 9864 Py_DECREF(sub); 9865 return -1; 9866 } 9867 if (kind2 != kind) 9868 buf2 = _PyUnicode_AsKind((PyObject*)sub, kind); 9869 if (!buf2) { 9870 Py_DECREF(sub); 9871 if (kind1 != kind) PyMem_Free(buf1); 9872 return -1; 9873 } 9874 len1 = PyUnicode_GET_LENGTH(str); 9875 len2 = PyUnicode_GET_LENGTH(sub); 9876 9877 switch(kind) { 9878 case PyUnicode_1BYTE_KIND: 9879 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1; 9880 break; 9881 case PyUnicode_2BYTE_KIND: 9882 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1; 9883 break; 9884 case PyUnicode_4BYTE_KIND: 9885 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1; 9886 break; 9887 default: 9888 result = -1; 9889 assert(0); 9890 } 9891 9892 Py_DECREF(str); 9893 Py_DECREF(sub); 9894 9895 if (kind1 != kind) 9896 PyMem_Free(buf1); 9897 if (kind2 != kind) 9898 PyMem_Free(buf2); 9899 9900 return result; 9901} 9902 9903/* Concat to string or Unicode object giving a new Unicode object. 
*/ 9904 9905PyObject * 9906PyUnicode_Concat(PyObject *left, PyObject *right) 9907{ 9908 PyObject *u = NULL, *v = NULL, *w; 9909 Py_UCS4 maxchar; 9910 9911 /* Coerce the two arguments */ 9912 u = PyUnicode_FromObject(left); 9913 if (u == NULL) 9914 goto onError; 9915 v = PyUnicode_FromObject(right); 9916 if (v == NULL) 9917 goto onError; 9918 9919 /* Shortcuts */ 9920 if (v == unicode_empty) { 9921 Py_DECREF(v); 9922 return u; 9923 } 9924 if (u == unicode_empty) { 9925 Py_DECREF(u); 9926 return v; 9927 } 9928 9929 maxchar = PyUnicode_MAX_CHAR_VALUE(u); 9930 maxchar = Py_MAX(maxchar, PyUnicode_MAX_CHAR_VALUE(v)); 9931 9932 /* Concat the two Unicode strings */ 9933 w = PyUnicode_New( 9934 PyUnicode_GET_LENGTH(u) + PyUnicode_GET_LENGTH(v), 9935 maxchar); 9936 if (w == NULL) 9937 goto onError; 9938 if (PyUnicode_CopyCharacters(w, 0, u, 0, PyUnicode_GET_LENGTH(u)) < 0) 9939 goto onError; 9940 if (PyUnicode_CopyCharacters(w, PyUnicode_GET_LENGTH(u), 9941 v, 0, 9942 PyUnicode_GET_LENGTH(v)) < 0) 9943 goto onError; 9944 Py_DECREF(u); 9945 Py_DECREF(v); 9946 return w; 9947 9948 onError: 9949 Py_XDECREF(u); 9950 Py_XDECREF(v); 9951 return NULL; 9952} 9953 9954void 9955PyUnicode_Append(PyObject **p_left, PyObject *right) 9956{ 9957 PyObject *left, *res; 9958 9959 if (p_left == NULL) { 9960 if (!PyErr_Occurred()) 9961 PyErr_BadInternalCall(); 9962 return; 9963 } 9964 left = *p_left; 9965 if (right == NULL || !PyUnicode_Check(left)) { 9966 if (!PyErr_Occurred()) 9967 PyErr_BadInternalCall(); 9968 goto error; 9969 } 9970 9971 if (PyUnicode_CheckExact(left) && left != unicode_empty 9972 && PyUnicode_CheckExact(right) && right != unicode_empty 9973 && unicode_resizable(left) 9974 && (_PyUnicode_KIND(right) <= _PyUnicode_KIND(left) 9975 || _PyUnicode_WSTR(left) != NULL)) 9976 { 9977 Py_ssize_t left_len, right_len, new_len; 9978#ifdef Py_DEBUG 9979 Py_ssize_t copied; 9980#endif 9981 9982 if (PyUnicode_READY(left)) 9983 goto error; 9984 if (PyUnicode_READY(right)) 9985 goto error; 
9986 9987 /* FIXME: support ascii+latin1, PyASCIIObject => PyCompactUnicodeObject */ 9988 if (PyUnicode_MAX_CHAR_VALUE(right) <= PyUnicode_MAX_CHAR_VALUE(left)) 9989 { 9990 left_len = PyUnicode_GET_LENGTH(left); 9991 right_len = PyUnicode_GET_LENGTH(right); 9992 if (left_len > PY_SSIZE_T_MAX - right_len) { 9993 PyErr_SetString(PyExc_OverflowError, 9994 "strings are too large to concat"); 9995 goto error; 9996 } 9997 new_len = left_len + right_len; 9998 9999 /* Now we own the last reference to 'left', so we can resize it 10000 * in-place. 10001 */ 10002 if (unicode_resize(&left, new_len) != 0) { 10003 /* XXX if _PyUnicode_Resize() fails, 'left' has been 10004 * deallocated so it cannot be put back into 10005 * 'variable'. The MemoryError is raised when there 10006 * is no value in 'variable', which might (very 10007 * remotely) be a cause of incompatibilities. 10008 */ 10009 goto error; 10010 } 10011 /* copy 'right' into the newly allocated area of 'left' */ 10012#ifdef Py_DEBUG 10013 copied = PyUnicode_CopyCharacters(left, left_len, 10014 right, 0, 10015 right_len); 10016 assert(0 <= copied); 10017#else 10018 PyUnicode_CopyCharacters(left, left_len, right, 0, right_len); 10019#endif 10020 *p_left = left; 10021 return; 10022 } 10023 } 10024 10025 res = PyUnicode_Concat(left, right); 10026 if (res == NULL) 10027 goto error; 10028 Py_DECREF(left); 10029 *p_left = res; 10030 return; 10031 10032error: 10033 Py_DECREF(*p_left); 10034 *p_left = NULL; 10035} 10036 10037void 10038PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right) 10039{ 10040 PyUnicode_Append(pleft, right); 10041 Py_XDECREF(right); 10042} 10043 10044PyDoc_STRVAR(count__doc__, 10045 "S.count(sub[, start[, end]]) -> int\n\ 10046\n\ 10047Return the number of non-overlapping occurrences of substring sub in\n\ 10048string S[start:end]. 
Optional arguments start and end are\n\ 10049interpreted as in slice notation."); 10050 10051static PyObject * 10052unicode_count(PyUnicodeObject *self, PyObject *args) 10053{ 10054 PyUnicodeObject *substring; 10055 Py_ssize_t start = 0; 10056 Py_ssize_t end = PY_SSIZE_T_MAX; 10057 PyObject *result; 10058 int kind1, kind2, kind; 10059 void *buf1, *buf2; 10060 Py_ssize_t len1, len2, iresult; 10061 10062 if (!stringlib_parse_args_finds_unicode("count", args, &substring, 10063 &start, &end)) 10064 return NULL; 10065 10066 kind1 = PyUnicode_KIND(self); 10067 kind2 = PyUnicode_KIND(substring); 10068 kind = kind1 > kind2 ? kind1 : kind2; 10069 buf1 = PyUnicode_DATA(self); 10070 buf2 = PyUnicode_DATA(substring); 10071 if (kind1 != kind) 10072 buf1 = _PyUnicode_AsKind((PyObject*)self, kind); 10073 if (!buf1) { 10074 Py_DECREF(substring); 10075 return NULL; 10076 } 10077 if (kind2 != kind) 10078 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind); 10079 if (!buf2) { 10080 Py_DECREF(substring); 10081 if (kind1 != kind) PyMem_Free(buf1); 10082 return NULL; 10083 } 10084 len1 = PyUnicode_GET_LENGTH(self); 10085 len2 = PyUnicode_GET_LENGTH(substring); 10086 10087 ADJUST_INDICES(start, end, len1); 10088 switch(kind) { 10089 case PyUnicode_1BYTE_KIND: 10090 iresult = ucs1lib_count( 10091 ((Py_UCS1*)buf1) + start, end - start, 10092 buf2, len2, PY_SSIZE_T_MAX 10093 ); 10094 break; 10095 case PyUnicode_2BYTE_KIND: 10096 iresult = ucs2lib_count( 10097 ((Py_UCS2*)buf1) + start, end - start, 10098 buf2, len2, PY_SSIZE_T_MAX 10099 ); 10100 break; 10101 case PyUnicode_4BYTE_KIND: 10102 iresult = ucs4lib_count( 10103 ((Py_UCS4*)buf1) + start, end - start, 10104 buf2, len2, PY_SSIZE_T_MAX 10105 ); 10106 break; 10107 default: 10108 assert(0); iresult = 0; 10109 } 10110 10111 result = PyLong_FromSsize_t(iresult); 10112 10113 if (kind1 != kind) 10114 PyMem_Free(buf1); 10115 if (kind2 != kind) 10116 PyMem_Free(buf2); 10117 10118 Py_DECREF(substring); 10119 10120 return result; 10121} 10122 
10123PyDoc_STRVAR(encode__doc__, 10124 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\ 10125\n\ 10126Encode S using the codec registered for encoding. Default encoding\n\ 10127is 'utf-8'. errors may be given to set a different error\n\ 10128handling scheme. Default is 'strict' meaning that encoding errors raise\n\ 10129a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\ 10130'xmlcharrefreplace' as well as any other name registered with\n\ 10131codecs.register_error that can handle UnicodeEncodeErrors."); 10132 10133static PyObject * 10134unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs) 10135{ 10136 static char *kwlist[] = {"encoding", "errors", 0}; 10137 char *encoding = NULL; 10138 char *errors = NULL; 10139 10140 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode", 10141 kwlist, &encoding, &errors)) 10142 return NULL; 10143 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors); 10144} 10145 10146PyDoc_STRVAR(expandtabs__doc__, 10147 "S.expandtabs([tabsize]) -> str\n\ 10148\n\ 10149Return a copy of S where all tab characters are expanded using spaces.\n\ 10150If tabsize is not given, a tab size of 8 characters is assumed."); 10151 10152static PyObject* 10153unicode_expandtabs(PyUnicodeObject *self, PyObject *args) 10154{ 10155 Py_UNICODE *e; 10156 Py_UNICODE *p; 10157 Py_UNICODE *q; 10158 Py_UNICODE *qe; 10159 Py_ssize_t i, j, incr, wstr_length; 10160 PyUnicodeObject *u; 10161 int tabsize = 8; 10162 10163 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize)) 10164 return NULL; 10165 10166 if (PyUnicode_AsUnicodeAndSize((PyObject *)self, &wstr_length) == NULL) 10167 return NULL; 10168 10169 /* First pass: determine size of output string */ 10170 i = 0; /* chars up to and including most recent \n or \r */ 10171 j = 0; /* chars since most recent \n or \r (use in tab calculations) */ 10172 e = _PyUnicode_WSTR(self) + wstr_length; /* end of input */ 10173 for (p = _PyUnicode_WSTR(self); p < 
e; p++) 10174 if (*p == '\t') { 10175 if (tabsize > 0) { 10176 incr = tabsize - (j % tabsize); /* cannot overflow */ 10177 if (j > PY_SSIZE_T_MAX - incr) 10178 goto overflow1; 10179 j += incr; 10180 } 10181 } 10182 else { 10183 if (j > PY_SSIZE_T_MAX - 1) 10184 goto overflow1; 10185 j++; 10186 if (*p == '\n' || *p == '\r') { 10187 if (i > PY_SSIZE_T_MAX - j) 10188 goto overflow1; 10189 i += j; 10190 j = 0; 10191 } 10192 } 10193 10194 if (i > PY_SSIZE_T_MAX - j) 10195 goto overflow1; 10196 10197 /* Second pass: create output string and fill it */ 10198 u = _PyUnicode_New(i + j); 10199 if (!u) 10200 return NULL; 10201 10202 j = 0; /* same as in first pass */ 10203 q = _PyUnicode_WSTR(u); /* next output char */ 10204 qe = _PyUnicode_WSTR(u) + PyUnicode_GET_SIZE(u); /* end of output */ 10205 10206 for (p = _PyUnicode_WSTR(self); p < e; p++) 10207 if (*p == '\t') { 10208 if (tabsize > 0) { 10209 i = tabsize - (j % tabsize); 10210 j += i; 10211 while (i--) { 10212 if (q >= qe) 10213 goto overflow2; 10214 *q++ = ' '; 10215 } 10216 } 10217 } 10218 else { 10219 if (q >= qe) 10220 goto overflow2; 10221 *q++ = *p; 10222 j++; 10223 if (*p == '\n' || *p == '\r') 10224 j = 0; 10225 } 10226 10227 if (_PyUnicode_READY_REPLACE(&u)) { 10228 Py_DECREF(u); 10229 return NULL; 10230 } 10231 return (PyObject*) u; 10232 10233 overflow2: 10234 Py_DECREF(u); 10235 overflow1: 10236 PyErr_SetString(PyExc_OverflowError, "new string is too long"); 10237 return NULL; 10238} 10239 10240PyDoc_STRVAR(find__doc__, 10241 "S.find(sub[, start[, end]]) -> int\n\ 10242\n\ 10243Return the lowest index in S where substring sub is found,\n\ 10244such that sub is contained within S[start:end]. 
Optional\n\ 10245arguments start and end are interpreted as in slice notation.\n\ 10246\n\ 10247Return -1 on failure."); 10248 10249static PyObject * 10250unicode_find(PyObject *self, PyObject *args) 10251{ 10252 PyUnicodeObject *substring; 10253 Py_ssize_t start; 10254 Py_ssize_t end; 10255 Py_ssize_t result; 10256 10257 if (!stringlib_parse_args_finds_unicode("find", args, &substring, 10258 &start, &end)) 10259 return NULL; 10260 10261 if (PyUnicode_READY(self) == -1) 10262 return NULL; 10263 if (PyUnicode_READY(substring) == -1) 10264 return NULL; 10265 10266 result = any_find_slice( 10267 ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice, 10268 self, (PyObject*)substring, start, end 10269 ); 10270 10271 Py_DECREF(substring); 10272 10273 if (result == -2) 10274 return NULL; 10275 10276 return PyLong_FromSsize_t(result); 10277} 10278 10279static PyObject * 10280unicode_getitem(PyObject *self, Py_ssize_t index) 10281{ 10282 Py_UCS4 ch = PyUnicode_ReadChar(self, index); 10283 if (ch == (Py_UCS4)-1) 10284 return NULL; 10285 return PyUnicode_FromOrdinal(ch); 10286} 10287 10288/* Believe it or not, this produces the same value for ASCII strings 10289 as bytes_hash(). */ 10290static Py_hash_t 10291unicode_hash(PyUnicodeObject *self) 10292{ 10293 Py_ssize_t len; 10294 Py_uhash_t x; 10295 10296 if (_PyUnicode_HASH(self) != -1) 10297 return _PyUnicode_HASH(self); 10298 if (PyUnicode_READY(self) == -1) 10299 return -1; 10300 len = PyUnicode_GET_LENGTH(self); 10301 10302 /* The hash function as a macro, gets expanded three times below. 
*/ 10303#define HASH(P) \ 10304 x = (Py_uhash_t)*P << 7; \ 10305 while (--len >= 0) \ 10306 x = (1000003*x) ^ (Py_uhash_t)*P++; 10307 10308 switch (PyUnicode_KIND(self)) { 10309 case PyUnicode_1BYTE_KIND: { 10310 const unsigned char *c = PyUnicode_1BYTE_DATA(self); 10311 HASH(c); 10312 break; 10313 } 10314 case PyUnicode_2BYTE_KIND: { 10315 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self); 10316 HASH(s); 10317 break; 10318 } 10319 default: { 10320 Py_UCS4 *l; 10321 assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND && 10322 "Impossible switch case in unicode_hash"); 10323 l = PyUnicode_4BYTE_DATA(self); 10324 HASH(l); 10325 break; 10326 } 10327 } 10328 x ^= (Py_uhash_t)PyUnicode_GET_LENGTH(self); 10329 10330 if (x == -1) 10331 x = -2; 10332 _PyUnicode_HASH(self) = x; 10333 return x; 10334} 10335#undef HASH 10336 10337PyDoc_STRVAR(index__doc__, 10338 "S.index(sub[, start[, end]]) -> int\n\ 10339\n\ 10340Like S.find() but raise ValueError when the substring is not found."); 10341 10342static PyObject * 10343unicode_index(PyObject *self, PyObject *args) 10344{ 10345 Py_ssize_t result; 10346 PyUnicodeObject *substring; 10347 Py_ssize_t start; 10348 Py_ssize_t end; 10349 10350 if (!stringlib_parse_args_finds_unicode("index", args, &substring, 10351 &start, &end)) 10352 return NULL; 10353 10354 if (PyUnicode_READY(self) == -1) 10355 return NULL; 10356 if (PyUnicode_READY(substring) == -1) 10357 return NULL; 10358 10359 result = any_find_slice( 10360 ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice, 10361 self, (PyObject*)substring, start, end 10362 ); 10363 10364 Py_DECREF(substring); 10365 10366 if (result == -2) 10367 return NULL; 10368 10369 if (result < 0) { 10370 PyErr_SetString(PyExc_ValueError, "substring not found"); 10371 return NULL; 10372 } 10373 10374 return PyLong_FromSsize_t(result); 10375} 10376 10377PyDoc_STRVAR(islower__doc__, 10378 "S.islower() -> bool\n\ 10379\n\ 10380Return True if all cased characters in S are lowercase and there is\n\ 
10381at least one cased character in S, False otherwise."); 10382 10383static PyObject* 10384unicode_islower(PyUnicodeObject *self) 10385{ 10386 Py_ssize_t i, length; 10387 int kind; 10388 void *data; 10389 int cased; 10390 10391 if (PyUnicode_READY(self) == -1) 10392 return NULL; 10393 length = PyUnicode_GET_LENGTH(self); 10394 kind = PyUnicode_KIND(self); 10395 data = PyUnicode_DATA(self); 10396 10397 /* Shortcut for single character strings */ 10398 if (length == 1) 10399 return PyBool_FromLong( 10400 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0))); 10401 10402 /* Special case for empty strings */ 10403 if (length == 0) 10404 return PyBool_FromLong(0); 10405 10406 cased = 0; 10407 for (i = 0; i < length; i++) { 10408 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); 10409 10410 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) 10411 return PyBool_FromLong(0); 10412 else if (!cased && Py_UNICODE_ISLOWER(ch)) 10413 cased = 1; 10414 } 10415 return PyBool_FromLong(cased); 10416} 10417 10418PyDoc_STRVAR(isupper__doc__, 10419 "S.isupper() -> bool\n\ 10420\n\ 10421Return True if all cased characters in S are uppercase and there is\n\ 10422at least one cased character in S, False otherwise."); 10423 10424static PyObject* 10425unicode_isupper(PyUnicodeObject *self) 10426{ 10427 Py_ssize_t i, length; 10428 int kind; 10429 void *data; 10430 int cased; 10431 10432 if (PyUnicode_READY(self) == -1) 10433 return NULL; 10434 length = PyUnicode_GET_LENGTH(self); 10435 kind = PyUnicode_KIND(self); 10436 data = PyUnicode_DATA(self); 10437 10438 /* Shortcut for single character strings */ 10439 if (length == 1) 10440 return PyBool_FromLong( 10441 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0); 10442 10443 /* Special case for empty strings */ 10444 if (length == 0) 10445 return PyBool_FromLong(0); 10446 10447 cased = 0; 10448 for (i = 0; i < length; i++) { 10449 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); 10450 10451 if (Py_UNICODE_ISLOWER(ch) || 
Py_UNICODE_ISTITLE(ch)) 10452 return PyBool_FromLong(0); 10453 else if (!cased && Py_UNICODE_ISUPPER(ch)) 10454 cased = 1; 10455 } 10456 return PyBool_FromLong(cased); 10457} 10458 10459PyDoc_STRVAR(istitle__doc__, 10460 "S.istitle() -> bool\n\ 10461\n\ 10462Return True if S is a titlecased string and there is at least one\n\ 10463character in S, i.e. upper- and titlecase characters may only\n\ 10464follow uncased characters and lowercase characters only cased ones.\n\ 10465Return False otherwise."); 10466 10467static PyObject* 10468unicode_istitle(PyUnicodeObject *self) 10469{ 10470 Py_ssize_t i, length; 10471 int kind; 10472 void *data; 10473 int cased, previous_is_cased; 10474 10475 if (PyUnicode_READY(self) == -1) 10476 return NULL; 10477 length = PyUnicode_GET_LENGTH(self); 10478 kind = PyUnicode_KIND(self); 10479 data = PyUnicode_DATA(self); 10480 10481 /* Shortcut for single character strings */ 10482 if (length == 1) { 10483 Py_UCS4 ch = PyUnicode_READ(kind, data, 0); 10484 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) || 10485 (Py_UNICODE_ISUPPER(ch) != 0)); 10486 } 10487 10488 /* Special case for empty strings */ 10489 if (length == 0) 10490 return PyBool_FromLong(0); 10491 10492 cased = 0; 10493 previous_is_cased = 0; 10494 for (i = 0; i < length; i++) { 10495 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); 10496 10497 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) { 10498 if (previous_is_cased) 10499 return PyBool_FromLong(0); 10500 previous_is_cased = 1; 10501 cased = 1; 10502 } 10503 else if (Py_UNICODE_ISLOWER(ch)) { 10504 if (!previous_is_cased) 10505 return PyBool_FromLong(0); 10506 previous_is_cased = 1; 10507 cased = 1; 10508 } 10509 else 10510 previous_is_cased = 0; 10511 } 10512 return PyBool_FromLong(cased); 10513} 10514 10515PyDoc_STRVAR(isspace__doc__, 10516 "S.isspace() -> bool\n\ 10517\n\ 10518Return True if all characters in S are whitespace\n\ 10519and there is at least one character in S, False otherwise."); 10520 
10521static PyObject* 10522unicode_isspace(PyUnicodeObject *self) 10523{ 10524 Py_ssize_t i, length; 10525 int kind; 10526 void *data; 10527 10528 if (PyUnicode_READY(self) == -1) 10529 return NULL; 10530 length = PyUnicode_GET_LENGTH(self); 10531 kind = PyUnicode_KIND(self); 10532 data = PyUnicode_DATA(self); 10533 10534 /* Shortcut for single character strings */ 10535 if (length == 1) 10536 return PyBool_FromLong( 10537 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0))); 10538 10539 /* Special case for empty strings */ 10540 if (length == 0) 10541 return PyBool_FromLong(0); 10542 10543 for (i = 0; i < length; i++) { 10544 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); 10545 if (!Py_UNICODE_ISSPACE(ch)) 10546 return PyBool_FromLong(0); 10547 } 10548 return PyBool_FromLong(1); 10549} 10550 10551PyDoc_STRVAR(isalpha__doc__, 10552 "S.isalpha() -> bool\n\ 10553\n\ 10554Return True if all characters in S are alphabetic\n\ 10555and there is at least one character in S, False otherwise."); 10556 10557static PyObject* 10558unicode_isalpha(PyUnicodeObject *self) 10559{ 10560 Py_ssize_t i, length; 10561 int kind; 10562 void *data; 10563 10564 if (PyUnicode_READY(self) == -1) 10565 return NULL; 10566 length = PyUnicode_GET_LENGTH(self); 10567 kind = PyUnicode_KIND(self); 10568 data = PyUnicode_DATA(self); 10569 10570 /* Shortcut for single character strings */ 10571 if (length == 1) 10572 return PyBool_FromLong( 10573 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0))); 10574 10575 /* Special case for empty strings */ 10576 if (length == 0) 10577 return PyBool_FromLong(0); 10578 10579 for (i = 0; i < length; i++) { 10580 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i))) 10581 return PyBool_FromLong(0); 10582 } 10583 return PyBool_FromLong(1); 10584} 10585 10586PyDoc_STRVAR(isalnum__doc__, 10587 "S.isalnum() -> bool\n\ 10588\n\ 10589Return True if all characters in S are alphanumeric\n\ 10590and there is at least one character in S, False otherwise."); 10591 
10592static PyObject* 10593unicode_isalnum(PyUnicodeObject *self) 10594{ 10595 int kind; 10596 void *data; 10597 Py_ssize_t len, i; 10598 10599 if (PyUnicode_READY(self) == -1) 10600 return NULL; 10601 10602 kind = PyUnicode_KIND(self); 10603 data = PyUnicode_DATA(self); 10604 len = PyUnicode_GET_LENGTH(self); 10605 10606 /* Shortcut for single character strings */ 10607 if (len == 1) { 10608 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0); 10609 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch)); 10610 } 10611 10612 /* Special case for empty strings */ 10613 if (len == 0) 10614 return PyBool_FromLong(0); 10615 10616 for (i = 0; i < len; i++) { 10617 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); 10618 if (!Py_UNICODE_ISALNUM(ch)) 10619 return PyBool_FromLong(0); 10620 } 10621 return PyBool_FromLong(1); 10622} 10623 10624PyDoc_STRVAR(isdecimal__doc__, 10625 "S.isdecimal() -> bool\n\ 10626\n\ 10627Return True if there are only decimal characters in S,\n\ 10628False otherwise."); 10629 10630static PyObject* 10631unicode_isdecimal(PyUnicodeObject *self) 10632{ 10633 Py_ssize_t i, length; 10634 int kind; 10635 void *data; 10636 10637 if (PyUnicode_READY(self) == -1) 10638 return NULL; 10639 length = PyUnicode_GET_LENGTH(self); 10640 kind = PyUnicode_KIND(self); 10641 data = PyUnicode_DATA(self); 10642 10643 /* Shortcut for single character strings */ 10644 if (length == 1) 10645 return PyBool_FromLong( 10646 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0))); 10647 10648 /* Special case for empty strings */ 10649 if (length == 0) 10650 return PyBool_FromLong(0); 10651 10652 for (i = 0; i < length; i++) { 10653 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i))) 10654 return PyBool_FromLong(0); 10655 } 10656 return PyBool_FromLong(1); 10657} 10658 10659PyDoc_STRVAR(isdigit__doc__, 10660 "S.isdigit() -> bool\n\ 10661\n\ 10662Return True if all characters in S are digits\n\ 10663and there is at least one character in S, False otherwise."); 10664 10665static 
PyObject* 10666unicode_isdigit(PyUnicodeObject *self) 10667{ 10668 Py_ssize_t i, length; 10669 int kind; 10670 void *data; 10671 10672 if (PyUnicode_READY(self) == -1) 10673 return NULL; 10674 length = PyUnicode_GET_LENGTH(self); 10675 kind = PyUnicode_KIND(self); 10676 data = PyUnicode_DATA(self); 10677 10678 /* Shortcut for single character strings */ 10679 if (length == 1) { 10680 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0); 10681 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch)); 10682 } 10683 10684 /* Special case for empty strings */ 10685 if (length == 0) 10686 return PyBool_FromLong(0); 10687 10688 for (i = 0; i < length; i++) { 10689 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i))) 10690 return PyBool_FromLong(0); 10691 } 10692 return PyBool_FromLong(1); 10693} 10694 10695PyDoc_STRVAR(isnumeric__doc__, 10696 "S.isnumeric() -> bool\n\ 10697\n\ 10698Return True if there are only numeric characters in S,\n\ 10699False otherwise."); 10700 10701static PyObject* 10702unicode_isnumeric(PyUnicodeObject *self) 10703{ 10704 Py_ssize_t i, length; 10705 int kind; 10706 void *data; 10707 10708 if (PyUnicode_READY(self) == -1) 10709 return NULL; 10710 length = PyUnicode_GET_LENGTH(self); 10711 kind = PyUnicode_KIND(self); 10712 data = PyUnicode_DATA(self); 10713 10714 /* Shortcut for single character strings */ 10715 if (length == 1) 10716 return PyBool_FromLong( 10717 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0))); 10718 10719 /* Special case for empty strings */ 10720 if (length == 0) 10721 return PyBool_FromLong(0); 10722 10723 for (i = 0; i < length; i++) { 10724 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i))) 10725 return PyBool_FromLong(0); 10726 } 10727 return PyBool_FromLong(1); 10728} 10729 10730int 10731PyUnicode_IsIdentifier(PyObject *self) 10732{ 10733 int kind; 10734 void *data; 10735 Py_ssize_t i; 10736 Py_UCS4 first; 10737 10738 if (PyUnicode_READY(self) == -1) { 10739 Py_FatalError("identifier not ready"); 10740 return 0; 10741 } 
10742 10743 /* Special case for empty strings */ 10744 if (PyUnicode_GET_LENGTH(self) == 0) 10745 return 0; 10746 kind = PyUnicode_KIND(self); 10747 data = PyUnicode_DATA(self); 10748 10749 /* PEP 3131 says that the first character must be in 10750 XID_Start and subsequent characters in XID_Continue, 10751 and for the ASCII range, the 2.x rules apply (i.e 10752 start with letters and underscore, continue with 10753 letters, digits, underscore). However, given the current 10754 definition of XID_Start and XID_Continue, it is sufficient 10755 to check just for these, except that _ must be allowed 10756 as starting an identifier. */ 10757 first = PyUnicode_READ(kind, data, 0); 10758 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */) 10759 return 0; 10760 10761 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++) 10762 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i))) 10763 return 0; 10764 return 1; 10765} 10766 10767PyDoc_STRVAR(isidentifier__doc__, 10768 "S.isidentifier() -> bool\n\ 10769\n\ 10770Return True if S is a valid identifier according\n\ 10771to the language definition."); 10772 10773static PyObject* 10774unicode_isidentifier(PyObject *self) 10775{ 10776 return PyBool_FromLong(PyUnicode_IsIdentifier(self)); 10777} 10778 10779PyDoc_STRVAR(isprintable__doc__, 10780 "S.isprintable() -> bool\n\ 10781\n\ 10782Return True if all characters in S are considered\n\ 10783printable in repr() or S is empty, False otherwise."); 10784 10785static PyObject* 10786unicode_isprintable(PyObject *self) 10787{ 10788 Py_ssize_t i, length; 10789 int kind; 10790 void *data; 10791 10792 if (PyUnicode_READY(self) == -1) 10793 return NULL; 10794 length = PyUnicode_GET_LENGTH(self); 10795 kind = PyUnicode_KIND(self); 10796 data = PyUnicode_DATA(self); 10797 10798 /* Shortcut for single character strings */ 10799 if (length == 1) 10800 return PyBool_FromLong( 10801 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0))); 10802 10803 for (i = 0; i < length; i++) { 
10804 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) { 10805 Py_RETURN_FALSE; 10806 } 10807 } 10808 Py_RETURN_TRUE; 10809} 10810 10811PyDoc_STRVAR(join__doc__, 10812 "S.join(iterable) -> str\n\ 10813\n\ 10814Return a string which is the concatenation of the strings in the\n\ 10815iterable. The separator between elements is S."); 10816 10817static PyObject* 10818unicode_join(PyObject *self, PyObject *data) 10819{ 10820 return PyUnicode_Join(self, data); 10821} 10822 10823static Py_ssize_t 10824unicode_length(PyUnicodeObject *self) 10825{ 10826 if (PyUnicode_READY(self) == -1) 10827 return -1; 10828 return PyUnicode_GET_LENGTH(self); 10829} 10830 10831PyDoc_STRVAR(ljust__doc__, 10832 "S.ljust(width[, fillchar]) -> str\n\ 10833\n\ 10834Return S left-justified in a Unicode string of length width. Padding is\n\ 10835done using the specified fill character (default is a space)."); 10836 10837static PyObject * 10838unicode_ljust(PyUnicodeObject *self, PyObject *args) 10839{ 10840 Py_ssize_t width; 10841 Py_UCS4 fillchar = ' '; 10842 10843 if (PyUnicode_READY(self) == -1) 10844 return NULL; 10845 10846 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar)) 10847 return NULL; 10848 10849 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) { 10850 Py_INCREF(self); 10851 return (PyObject*) self; 10852 } 10853 10854 return (PyObject*) pad(self, 0, width - _PyUnicode_LENGTH(self), fillchar); 10855} 10856 10857PyDoc_STRVAR(lower__doc__, 10858 "S.lower() -> str\n\ 10859\n\ 10860Return a copy of the string S converted to lowercase."); 10861 10862static PyObject* 10863unicode_lower(PyUnicodeObject *self) 10864{ 10865 return fixup(self, fixlower); 10866} 10867 10868#define LEFTSTRIP 0 10869#define RIGHTSTRIP 1 10870#define BOTHSTRIP 2 10871 10872/* Arrays indexed by above */ 10873static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"}; 10874 10875#define STRIPNAME(i) (stripformat[i]+3) 10876 10877/* externally visible 
for str.strip(unicode) */ 10878PyObject * 10879_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj) 10880{ 10881 void *data; 10882 int kind; 10883 Py_ssize_t i, j, len; 10884 BLOOM_MASK sepmask; 10885 10886 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1) 10887 return NULL; 10888 10889 kind = PyUnicode_KIND(self); 10890 data = PyUnicode_DATA(self); 10891 len = PyUnicode_GET_LENGTH(self); 10892 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj), 10893 PyUnicode_DATA(sepobj), 10894 PyUnicode_GET_LENGTH(sepobj)); 10895 10896 i = 0; 10897 if (striptype != RIGHTSTRIP) { 10898 while (i < len && 10899 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) { 10900 i++; 10901 } 10902 } 10903 10904 j = len; 10905 if (striptype != LEFTSTRIP) { 10906 do { 10907 j--; 10908 } while (j >= i && 10909 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj)); 10910 j++; 10911 } 10912 10913 return PyUnicode_Substring((PyObject*)self, i, j); 10914} 10915 10916PyObject* 10917PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end) 10918{ 10919 unsigned char *data; 10920 int kind; 10921 Py_ssize_t length; 10922 10923 if (PyUnicode_READY(self) == -1) 10924 return NULL; 10925 10926 end = Py_MIN(end, PyUnicode_GET_LENGTH(self)); 10927 10928 if (start == 0 && end == PyUnicode_GET_LENGTH(self)) 10929 { 10930 if (PyUnicode_CheckExact(self)) { 10931 Py_INCREF(self); 10932 return self; 10933 } 10934 else 10935 return PyUnicode_Copy(self); 10936 } 10937 10938 length = end - start; 10939 if (length == 1) 10940 return unicode_getitem(self, start); 10941 10942 if (start < 0 || end < 0) { 10943 PyErr_SetString(PyExc_IndexError, "string index out of range"); 10944 return NULL; 10945 } 10946 10947 kind = PyUnicode_KIND(self); 10948 data = PyUnicode_1BYTE_DATA(self); 10949 return PyUnicode_FromKindAndData(kind, 10950 data + PyUnicode_KIND_SIZE(kind, start), 10951 length); 10952} 10953 10954static PyObject * 10955do_strip(PyUnicodeObject 
*self, int striptype) 10956{ 10957 int kind; 10958 void *data; 10959 Py_ssize_t len, i, j; 10960 10961 if (PyUnicode_READY(self) == -1) 10962 return NULL; 10963 10964 kind = PyUnicode_KIND(self); 10965 data = PyUnicode_DATA(self); 10966 len = PyUnicode_GET_LENGTH(self); 10967 10968 i = 0; 10969 if (striptype != RIGHTSTRIP) { 10970 while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) { 10971 i++; 10972 } 10973 } 10974 10975 j = len; 10976 if (striptype != LEFTSTRIP) { 10977 do { 10978 j--; 10979 } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j))); 10980 j++; 10981 } 10982 10983 return PyUnicode_Substring((PyObject*)self, i, j); 10984} 10985 10986 10987static PyObject * 10988do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args) 10989{ 10990 PyObject *sep = NULL; 10991 10992 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep)) 10993 return NULL; 10994 10995 if (sep != NULL && sep != Py_None) { 10996 if (PyUnicode_Check(sep)) 10997 return _PyUnicode_XStrip(self, striptype, sep); 10998 else { 10999 PyErr_Format(PyExc_TypeError, 11000 "%s arg must be None or str", 11001 STRIPNAME(striptype)); 11002 return NULL; 11003 } 11004 } 11005 11006 return do_strip(self, striptype); 11007} 11008 11009 11010PyDoc_STRVAR(strip__doc__, 11011 "S.strip([chars]) -> str\n\ 11012\n\ 11013Return a copy of the string S with leading and trailing\n\ 11014whitespace removed.\n\ 11015If chars is given and not None, remove characters in chars instead."); 11016 11017static PyObject * 11018unicode_strip(PyUnicodeObject *self, PyObject *args) 11019{ 11020 if (PyTuple_GET_SIZE(args) == 0) 11021 return do_strip(self, BOTHSTRIP); /* Common case */ 11022 else 11023 return do_argstrip(self, BOTHSTRIP, args); 11024} 11025 11026 11027PyDoc_STRVAR(lstrip__doc__, 11028 "S.lstrip([chars]) -> str\n\ 11029\n\ 11030Return a copy of the string S with leading whitespace removed.\n\ 11031If chars is given and not None, remove characters in chars 
instead."); 11032 11033static PyObject * 11034unicode_lstrip(PyUnicodeObject *self, PyObject *args) 11035{ 11036 if (PyTuple_GET_SIZE(args) == 0) 11037 return do_strip(self, LEFTSTRIP); /* Common case */ 11038 else 11039 return do_argstrip(self, LEFTSTRIP, args); 11040} 11041 11042 11043PyDoc_STRVAR(rstrip__doc__, 11044 "S.rstrip([chars]) -> str\n\ 11045\n\ 11046Return a copy of the string S with trailing whitespace removed.\n\ 11047If chars is given and not None, remove characters in chars instead."); 11048 11049static PyObject * 11050unicode_rstrip(PyUnicodeObject *self, PyObject *args) 11051{ 11052 if (PyTuple_GET_SIZE(args) == 0) 11053 return do_strip(self, RIGHTSTRIP); /* Common case */ 11054 else 11055 return do_argstrip(self, RIGHTSTRIP, args); 11056} 11057 11058 11059static PyObject* 11060unicode_repeat(PyUnicodeObject *str, Py_ssize_t len) 11061{ 11062 PyUnicodeObject *u; 11063 Py_ssize_t nchars, n; 11064 11065 if (len < 1) { 11066 Py_INCREF(unicode_empty); 11067 return unicode_empty; 11068 } 11069 11070 if (len == 1 && PyUnicode_CheckExact(str)) { 11071 /* no repeat, return original string */ 11072 Py_INCREF(str); 11073 return (PyObject*) str; 11074 } 11075 11076 if (PyUnicode_READY(str) == -1) 11077 return NULL; 11078 11079 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) { 11080 PyErr_SetString(PyExc_OverflowError, 11081 "repeated string is too long"); 11082 return NULL; 11083 } 11084 nchars = len * PyUnicode_GET_LENGTH(str); 11085 11086 u = (PyUnicodeObject *)PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str)); 11087 if (!u) 11088 return NULL; 11089 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str)); 11090 11091 if (PyUnicode_GET_LENGTH(str) == 1) { 11092 const int kind = PyUnicode_KIND(str); 11093 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0); 11094 void *to = PyUnicode_DATA(u); 11095 if (kind == PyUnicode_1BYTE_KIND) 11096 memset(to, (unsigned char)fill_char, len); 11097 else { 11098 for (n = 0; n < len; ++n) 11099 
PyUnicode_WRITE(kind, to, n, fill_char); 11100 } 11101 } 11102 else { 11103 /* number of characters copied this far */ 11104 Py_ssize_t done = PyUnicode_GET_LENGTH(str); 11105 const Py_ssize_t char_size = PyUnicode_CHARACTER_SIZE(str); 11106 char *to = (char *) PyUnicode_DATA(u); 11107 Py_MEMCPY(to, PyUnicode_DATA(str), 11108 PyUnicode_GET_LENGTH(str) * char_size); 11109 while (done < nchars) { 11110 n = (done <= nchars-done) ? done : nchars-done; 11111 Py_MEMCPY(to + (done * char_size), to, n * char_size); 11112 done += n; 11113 } 11114 } 11115 11116 return (PyObject*) u; 11117} 11118 11119PyObject * 11120PyUnicode_Replace(PyObject *obj, 11121 PyObject *subobj, 11122 PyObject *replobj, 11123 Py_ssize_t maxcount) 11124{ 11125 PyObject *self; 11126 PyObject *str1; 11127 PyObject *str2; 11128 PyObject *result; 11129 11130 self = PyUnicode_FromObject(obj); 11131 if (self == NULL || PyUnicode_READY(self) == -1) 11132 return NULL; 11133 str1 = PyUnicode_FromObject(subobj); 11134 if (str1 == NULL || PyUnicode_READY(str1) == -1) { 11135 Py_DECREF(self); 11136 return NULL; 11137 } 11138 str2 = PyUnicode_FromObject(replobj); 11139 if (str2 == NULL || PyUnicode_READY(str2)) { 11140 Py_DECREF(self); 11141 Py_DECREF(str1); 11142 return NULL; 11143 } 11144 result = replace(self, str1, str2, maxcount); 11145 Py_DECREF(self); 11146 Py_DECREF(str1); 11147 Py_DECREF(str2); 11148 return result; 11149} 11150 11151PyDoc_STRVAR(replace__doc__, 11152 "S.replace(old, new[, count]) -> str\n\ 11153\n\ 11154Return a copy of S with all occurrences of substring\n\ 11155old replaced by new. 
If the optional argument count is\n\ 11156given, only the first count occurrences are replaced."); 11157 11158static PyObject* 11159unicode_replace(PyObject *self, PyObject *args) 11160{ 11161 PyObject *str1; 11162 PyObject *str2; 11163 Py_ssize_t maxcount = -1; 11164 PyObject *result; 11165 11166 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount)) 11167 return NULL; 11168 if (!PyUnicode_READY(self) == -1) 11169 return NULL; 11170 str1 = PyUnicode_FromObject(str1); 11171 if (str1 == NULL || PyUnicode_READY(str1) == -1) 11172 return NULL; 11173 str2 = PyUnicode_FromObject(str2); 11174 if (str2 == NULL || PyUnicode_READY(str2) == -1) { 11175 Py_DECREF(str1); 11176 return NULL; 11177 } 11178 11179 result = replace(self, str1, str2, maxcount); 11180 11181 Py_DECREF(str1); 11182 Py_DECREF(str2); 11183 return result; 11184} 11185 11186static PyObject * 11187unicode_repr(PyObject *unicode) 11188{ 11189 PyObject *repr; 11190 Py_ssize_t isize; 11191 Py_ssize_t osize, squote, dquote, i, o; 11192 Py_UCS4 max, quote; 11193 int ikind, okind; 11194 void *idata, *odata; 11195 11196 if (PyUnicode_READY(unicode) == -1) 11197 return NULL; 11198 11199 isize = PyUnicode_GET_LENGTH(unicode); 11200 idata = PyUnicode_DATA(unicode); 11201 11202 /* Compute length of output, quote characters, and 11203 maximum character */ 11204 osize = 2; /* quotes */ 11205 max = 127; 11206 squote = dquote = 0; 11207 ikind = PyUnicode_KIND(unicode); 11208 for (i = 0; i < isize; i++) { 11209 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i); 11210 switch (ch) { 11211 case '\'': squote++; osize++; break; 11212 case '"': dquote++; osize++; break; 11213 case '\\': case '\t': case '\r': case '\n': 11214 osize += 2; break; 11215 default: 11216 /* Fast-path ASCII */ 11217 if (ch < ' ' || ch == 0x7f) 11218 osize += 4; /* \xHH */ 11219 else if (ch < 0x7f) 11220 osize++; 11221 else if (Py_UNICODE_ISPRINTABLE(ch)) { 11222 osize++; 11223 max = ch > max ? 
ch : max; 11224 } 11225 else if (ch < 0x100) 11226 osize += 4; /* \xHH */ 11227 else if (ch < 0x10000) 11228 osize += 6; /* \uHHHH */ 11229 else 11230 osize += 10; /* \uHHHHHHHH */ 11231 } 11232 } 11233 11234 quote = '\''; 11235 if (squote) { 11236 if (dquote) 11237 /* Both squote and dquote present. Use squote, 11238 and escape them */ 11239 osize += squote; 11240 else 11241 quote = '"'; 11242 } 11243 11244 repr = PyUnicode_New(osize, max); 11245 if (repr == NULL) 11246 return NULL; 11247 okind = PyUnicode_KIND(repr); 11248 odata = PyUnicode_DATA(repr); 11249 11250 PyUnicode_WRITE(okind, odata, 0, quote); 11251 PyUnicode_WRITE(okind, odata, osize-1, quote); 11252 11253 for (i = 0, o = 1; i < isize; i++) { 11254 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i); 11255 11256 /* Escape quotes and backslashes */ 11257 if ((ch == quote) || (ch == '\\')) { 11258 PyUnicode_WRITE(okind, odata, o++, '\\'); 11259 PyUnicode_WRITE(okind, odata, o++, ch); 11260 continue; 11261 } 11262 11263 /* Map special whitespace to '\t', \n', '\r' */ 11264 if (ch == '\t') { 11265 PyUnicode_WRITE(okind, odata, o++, '\\'); 11266 PyUnicode_WRITE(okind, odata, o++, 't'); 11267 } 11268 else if (ch == '\n') { 11269 PyUnicode_WRITE(okind, odata, o++, '\\'); 11270 PyUnicode_WRITE(okind, odata, o++, 'n'); 11271 } 11272 else if (ch == '\r') { 11273 PyUnicode_WRITE(okind, odata, o++, '\\'); 11274 PyUnicode_WRITE(okind, odata, o++, 'r'); 11275 } 11276 11277 /* Map non-printable US ASCII to '\xhh' */ 11278 else if (ch < ' ' || ch == 0x7F) { 11279 PyUnicode_WRITE(okind, odata, o++, '\\'); 11280 PyUnicode_WRITE(okind, odata, o++, 'x'); 11281 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0x000F]); 11282 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0x000F]); 11283 } 11284 11285 /* Copy ASCII characters as-is */ 11286 else if (ch < 0x7F) { 11287 PyUnicode_WRITE(okind, odata, o++, ch); 11288 } 11289 11290 /* Non-ASCII characters */ 11291 else { 11292 /* Map Unicode whitespace and control 
characters 11293 (categories Z* and C* except ASCII space) 11294 */ 11295 if (!Py_UNICODE_ISPRINTABLE(ch)) { 11296 /* Map 8-bit characters to '\xhh' */ 11297 if (ch <= 0xff) { 11298 PyUnicode_WRITE(okind, odata, o++, '\\'); 11299 PyUnicode_WRITE(okind, odata, o++, 'x'); 11300 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0x000F]); 11301 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0x000F]); 11302 } 11303 /* Map 21-bit characters to '\U00xxxxxx' */ 11304 else if (ch >= 0x10000) { 11305 PyUnicode_WRITE(okind, odata, o++, '\\'); 11306 PyUnicode_WRITE(okind, odata, o++, 'U'); 11307 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 28) & 0xF]); 11308 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 24) & 0xF]); 11309 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 20) & 0xF]); 11310 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 16) & 0xF]); 11311 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 12) & 0xF]); 11312 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 8) & 0xF]); 11313 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0xF]); 11314 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0xF]); 11315 } 11316 /* Map 16-bit characters to '\uxxxx' */ 11317 else { 11318 PyUnicode_WRITE(okind, odata, o++, '\\'); 11319 PyUnicode_WRITE(okind, odata, o++, 'u'); 11320 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 12) & 0xF]); 11321 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 8) & 0xF]); 11322 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0xF]); 11323 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0xF]); 11324 } 11325 } 11326 /* Copy characters as-is */ 11327 else { 11328 PyUnicode_WRITE(okind, odata, o++, ch); 11329 } 11330 } 11331 } 11332 /* Closing quote already added at the beginning */ 11333 return repr; 11334} 11335 11336PyDoc_STRVAR(rfind__doc__, 11337 "S.rfind(sub[, start[, end]]) -> int\n\ 11338\n\ 11339Return the highest index in S where substring sub is found,\n\ 11340such that sub is contained 
within S[start:end]. Optional\n\ 11341arguments start and end are interpreted as in slice notation.\n\ 11342\n\ 11343Return -1 on failure."); 11344 11345static PyObject * 11346unicode_rfind(PyObject *self, PyObject *args) 11347{ 11348 PyUnicodeObject *substring; 11349 Py_ssize_t start; 11350 Py_ssize_t end; 11351 Py_ssize_t result; 11352 11353 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring, 11354 &start, &end)) 11355 return NULL; 11356 11357 if (PyUnicode_READY(self) == -1) 11358 return NULL; 11359 if (PyUnicode_READY(substring) == -1) 11360 return NULL; 11361 11362 result = any_find_slice( 11363 ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice, 11364 self, (PyObject*)substring, start, end 11365 ); 11366 11367 Py_DECREF(substring); 11368 11369 if (result == -2) 11370 return NULL; 11371 11372 return PyLong_FromSsize_t(result); 11373} 11374 11375PyDoc_STRVAR(rindex__doc__, 11376 "S.rindex(sub[, start[, end]]) -> int\n\ 11377\n\ 11378Like S.rfind() but raise ValueError when the substring is not found."); 11379 11380static PyObject * 11381unicode_rindex(PyObject *self, PyObject *args) 11382{ 11383 PyUnicodeObject *substring; 11384 Py_ssize_t start; 11385 Py_ssize_t end; 11386 Py_ssize_t result; 11387 11388 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring, 11389 &start, &end)) 11390 return NULL; 11391 11392 if (PyUnicode_READY(self) == -1) 11393 return NULL; 11394 if (PyUnicode_READY(substring) == -1) 11395 return NULL; 11396 11397 result = any_find_slice( 11398 ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice, 11399 self, (PyObject*)substring, start, end 11400 ); 11401 11402 Py_DECREF(substring); 11403 11404 if (result == -2) 11405 return NULL; 11406 11407 if (result < 0) { 11408 PyErr_SetString(PyExc_ValueError, "substring not found"); 11409 return NULL; 11410 } 11411 11412 return PyLong_FromSsize_t(result); 11413} 11414 11415PyDoc_STRVAR(rjust__doc__, 11416 "S.rjust(width[, fillchar]) -> str\n\ 11417\n\ 
11418Return S right-justified in a string of length width. Padding is\n\ 11419done using the specified fill character (default is a space)."); 11420 11421static PyObject * 11422unicode_rjust(PyUnicodeObject *self, PyObject *args) 11423{ 11424 Py_ssize_t width; 11425 Py_UCS4 fillchar = ' '; 11426 11427 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar)) 11428 return NULL; 11429 11430 if (PyUnicode_READY(self) == -1) 11431 return NULL; 11432 11433 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) { 11434 Py_INCREF(self); 11435 return (PyObject*) self; 11436 } 11437 11438 return (PyObject*) pad(self, width - _PyUnicode_LENGTH(self), 0, fillchar); 11439} 11440 11441PyObject * 11442PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit) 11443{ 11444 PyObject *result; 11445 11446 s = PyUnicode_FromObject(s); 11447 if (s == NULL) 11448 return NULL; 11449 if (sep != NULL) { 11450 sep = PyUnicode_FromObject(sep); 11451 if (sep == NULL) { 11452 Py_DECREF(s); 11453 return NULL; 11454 } 11455 } 11456 11457 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit); 11458 11459 Py_DECREF(s); 11460 Py_XDECREF(sep); 11461 return result; 11462} 11463 11464PyDoc_STRVAR(split__doc__, 11465 "S.split([sep[, maxsplit]]) -> list of strings\n\ 11466\n\ 11467Return a list of the words in S, using sep as the\n\ 11468delimiter string. If maxsplit is given, at most maxsplit\n\ 11469splits are done. 
If sep is not specified or is None, any\n\ 11470whitespace string is a separator and empty strings are\n\ 11471removed from the result."); 11472 11473static PyObject* 11474unicode_split(PyUnicodeObject *self, PyObject *args) 11475{ 11476 PyObject *substring = Py_None; 11477 Py_ssize_t maxcount = -1; 11478 11479 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount)) 11480 return NULL; 11481 11482 if (substring == Py_None) 11483 return split(self, NULL, maxcount); 11484 else if (PyUnicode_Check(substring)) 11485 return split(self, (PyUnicodeObject *)substring, maxcount); 11486 else 11487 return PyUnicode_Split((PyObject *)self, substring, maxcount); 11488} 11489 11490PyObject * 11491PyUnicode_Partition(PyObject *str_in, PyObject *sep_in) 11492{ 11493 PyObject* str_obj; 11494 PyObject* sep_obj; 11495 PyObject* out; 11496 int kind1, kind2, kind; 11497 void *buf1 = NULL, *buf2 = NULL; 11498 Py_ssize_t len1, len2; 11499 11500 str_obj = PyUnicode_FromObject(str_in); 11501 if (!str_obj || PyUnicode_READY(str_obj) == -1) 11502 return NULL; 11503 sep_obj = PyUnicode_FromObject(sep_in); 11504 if (!sep_obj || PyUnicode_READY(sep_obj) == -1) { 11505 Py_DECREF(str_obj); 11506 return NULL; 11507 } 11508 11509 kind1 = PyUnicode_KIND(str_in); 11510 kind2 = PyUnicode_KIND(sep_obj); 11511 kind = kind1 > kind2 ? 
kind1 : kind2; 11512 buf1 = PyUnicode_DATA(str_in); 11513 if (kind1 != kind) 11514 buf1 = _PyUnicode_AsKind(str_in, kind); 11515 if (!buf1) 11516 goto onError; 11517 buf2 = PyUnicode_DATA(sep_obj); 11518 if (kind2 != kind) 11519 buf2 = _PyUnicode_AsKind(sep_obj, kind); 11520 if (!buf2) 11521 goto onError; 11522 len1 = PyUnicode_GET_LENGTH(str_obj); 11523 len2 = PyUnicode_GET_LENGTH(sep_obj); 11524 11525 switch(PyUnicode_KIND(str_in)) { 11526 case PyUnicode_1BYTE_KIND: 11527 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2); 11528 break; 11529 case PyUnicode_2BYTE_KIND: 11530 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2); 11531 break; 11532 case PyUnicode_4BYTE_KIND: 11533 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2); 11534 break; 11535 default: 11536 assert(0); 11537 out = 0; 11538 } 11539 11540 Py_DECREF(sep_obj); 11541 Py_DECREF(str_obj); 11542 if (kind1 != kind) 11543 PyMem_Free(buf1); 11544 if (kind2 != kind) 11545 PyMem_Free(buf2); 11546 11547 return out; 11548 onError: 11549 Py_DECREF(sep_obj); 11550 Py_DECREF(str_obj); 11551 if (kind1 != kind && buf1) 11552 PyMem_Free(buf1); 11553 if (kind2 != kind && buf2) 11554 PyMem_Free(buf2); 11555 return NULL; 11556} 11557 11558 11559PyObject * 11560PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in) 11561{ 11562 PyObject* str_obj; 11563 PyObject* sep_obj; 11564 PyObject* out; 11565 int kind1, kind2, kind; 11566 void *buf1 = NULL, *buf2 = NULL; 11567 Py_ssize_t len1, len2; 11568 11569 str_obj = PyUnicode_FromObject(str_in); 11570 if (!str_obj) 11571 return NULL; 11572 sep_obj = PyUnicode_FromObject(sep_in); 11573 if (!sep_obj) { 11574 Py_DECREF(str_obj); 11575 return NULL; 11576 } 11577 11578 kind1 = PyUnicode_KIND(str_in); 11579 kind2 = PyUnicode_KIND(sep_obj); 11580 kind = Py_MAX(kind1, kind2); 11581 buf1 = PyUnicode_DATA(str_in); 11582 if (kind1 != kind) 11583 buf1 = _PyUnicode_AsKind(str_in, kind); 11584 if (!buf1) 11585 goto onError; 11586 buf2 = 
PyUnicode_DATA(sep_obj); 11587 if (kind2 != kind) 11588 buf2 = _PyUnicode_AsKind(sep_obj, kind); 11589 if (!buf2) 11590 goto onError; 11591 len1 = PyUnicode_GET_LENGTH(str_obj); 11592 len2 = PyUnicode_GET_LENGTH(sep_obj); 11593 11594 switch(PyUnicode_KIND(str_in)) { 11595 case PyUnicode_1BYTE_KIND: 11596 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2); 11597 break; 11598 case PyUnicode_2BYTE_KIND: 11599 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2); 11600 break; 11601 case PyUnicode_4BYTE_KIND: 11602 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2); 11603 break; 11604 default: 11605 assert(0); 11606 out = 0; 11607 } 11608 11609 Py_DECREF(sep_obj); 11610 Py_DECREF(str_obj); 11611 if (kind1 != kind) 11612 PyMem_Free(buf1); 11613 if (kind2 != kind) 11614 PyMem_Free(buf2); 11615 11616 return out; 11617 onError: 11618 Py_DECREF(sep_obj); 11619 Py_DECREF(str_obj); 11620 if (kind1 != kind && buf1) 11621 PyMem_Free(buf1); 11622 if (kind2 != kind && buf2) 11623 PyMem_Free(buf2); 11624 return NULL; 11625} 11626 11627PyDoc_STRVAR(partition__doc__, 11628 "S.partition(sep) -> (head, sep, tail)\n\ 11629\n\ 11630Search for the separator sep in S, and return the part before it,\n\ 11631the separator itself, and the part after it. If the separator is not\n\ 11632found, return S and two empty strings."); 11633 11634static PyObject* 11635unicode_partition(PyUnicodeObject *self, PyObject *separator) 11636{ 11637 return PyUnicode_Partition((PyObject *)self, separator); 11638} 11639 11640PyDoc_STRVAR(rpartition__doc__, 11641 "S.rpartition(sep) -> (head, sep, tail)\n\ 11642\n\ 11643Search for the separator sep in S, starting at the end of S, and return\n\ 11644the part before it, the separator itself, and the part after it. 
If the\n\ 11645separator is not found, return two empty strings and S."); 11646 11647static PyObject* 11648unicode_rpartition(PyUnicodeObject *self, PyObject *separator) 11649{ 11650 return PyUnicode_RPartition((PyObject *)self, separator); 11651} 11652 11653PyObject * 11654PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit) 11655{ 11656 PyObject *result; 11657 11658 s = PyUnicode_FromObject(s); 11659 if (s == NULL) 11660 return NULL; 11661 if (sep != NULL) { 11662 sep = PyUnicode_FromObject(sep); 11663 if (sep == NULL) { 11664 Py_DECREF(s); 11665 return NULL; 11666 } 11667 } 11668 11669 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit); 11670 11671 Py_DECREF(s); 11672 Py_XDECREF(sep); 11673 return result; 11674} 11675 11676PyDoc_STRVAR(rsplit__doc__, 11677 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\ 11678\n\ 11679Return a list of the words in S, using sep as the\n\ 11680delimiter string, starting at the end of the string and\n\ 11681working to the front. If maxsplit is given, at most maxsplit\n\ 11682splits are done. 
If sep is not specified, any whitespace string\n\ 11683is a separator."); 11684 11685static PyObject* 11686unicode_rsplit(PyUnicodeObject *self, PyObject *args) 11687{ 11688 PyObject *substring = Py_None; 11689 Py_ssize_t maxcount = -1; 11690 11691 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount)) 11692 return NULL; 11693 11694 if (substring == Py_None) 11695 return rsplit(self, NULL, maxcount); 11696 else if (PyUnicode_Check(substring)) 11697 return rsplit(self, (PyUnicodeObject *)substring, maxcount); 11698 else 11699 return PyUnicode_RSplit((PyObject *)self, substring, maxcount); 11700} 11701 11702PyDoc_STRVAR(splitlines__doc__, 11703 "S.splitlines([keepends]) -> list of strings\n\ 11704\n\ 11705Return a list of the lines in S, breaking at line boundaries.\n\ 11706Line breaks are not included in the resulting list unless keepends\n\ 11707is given and true."); 11708 11709static PyObject* 11710unicode_splitlines(PyUnicodeObject *self, PyObject *args, PyObject *kwds) 11711{ 11712 static char *kwlist[] = {"keepends", 0}; 11713 int keepends = 0; 11714 11715 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines", 11716 kwlist, &keepends)) 11717 return NULL; 11718 11719 return PyUnicode_Splitlines((PyObject *)self, keepends); 11720} 11721 11722static 11723PyObject *unicode_str(PyObject *self) 11724{ 11725 if (PyUnicode_CheckExact(self)) { 11726 Py_INCREF(self); 11727 return self; 11728 } else 11729 /* Subtype -- return genuine unicode string with the same value. 
*/ 11730 return PyUnicode_Copy(self); 11731} 11732 11733PyDoc_STRVAR(swapcase__doc__, 11734 "S.swapcase() -> str\n\ 11735\n\ 11736Return a copy of S with uppercase characters converted to lowercase\n\ 11737and vice versa."); 11738 11739static PyObject* 11740unicode_swapcase(PyUnicodeObject *self) 11741{ 11742 return fixup(self, fixswapcase); 11743} 11744 11745PyDoc_STRVAR(maketrans__doc__, 11746 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\ 11747\n\ 11748Return a translation table usable for str.translate().\n\ 11749If there is only one argument, it must be a dictionary mapping Unicode\n\ 11750ordinals (integers) or characters to Unicode ordinals, strings or None.\n\ 11751Character keys will be then converted to ordinals.\n\ 11752If there are two arguments, they must be strings of equal length, and\n\ 11753in the resulting dictionary, each character in x will be mapped to the\n\ 11754character at the same position in y. If there is a third argument, it\n\ 11755must be a string, whose characters will be mapped to None in the result."); 11756 11757static PyObject* 11758unicode_maketrans(PyUnicodeObject *null, PyObject *args) 11759{ 11760 PyObject *x, *y = NULL, *z = NULL; 11761 PyObject *new = NULL, *key, *value; 11762 Py_ssize_t i = 0; 11763 int res; 11764 11765 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z)) 11766 return NULL; 11767 new = PyDict_New(); 11768 if (!new) 11769 return NULL; 11770 if (y != NULL) { 11771 int x_kind, y_kind, z_kind; 11772 void *x_data, *y_data, *z_data; 11773 11774 /* x must be a string too, of equal length */ 11775 if (!PyUnicode_Check(x)) { 11776 PyErr_SetString(PyExc_TypeError, "first maketrans argument must " 11777 "be a string if there is a second argument"); 11778 goto err; 11779 } 11780 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) { 11781 PyErr_SetString(PyExc_ValueError, "the first two maketrans " 11782 "arguments must have equal length"); 11783 goto err; 11784 } 11785 /* create entries for 
translating chars in x to those in y */ 11786 x_kind = PyUnicode_KIND(x); 11787 y_kind = PyUnicode_KIND(y); 11788 x_data = PyUnicode_DATA(x); 11789 y_data = PyUnicode_DATA(y); 11790 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) { 11791 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i)); 11792 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i)); 11793 if (!key || !value) 11794 goto err; 11795 res = PyDict_SetItem(new, key, value); 11796 Py_DECREF(key); 11797 Py_DECREF(value); 11798 if (res < 0) 11799 goto err; 11800 } 11801 /* create entries for deleting chars in z */ 11802 if (z != NULL) { 11803 z_kind = PyUnicode_KIND(z); 11804 z_data = PyUnicode_DATA(z); 11805 for (i = 0; i < PyUnicode_GET_SIZE(z); i++) { 11806 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i)); 11807 if (!key) 11808 goto err; 11809 res = PyDict_SetItem(new, key, Py_None); 11810 Py_DECREF(key); 11811 if (res < 0) 11812 goto err; 11813 } 11814 } 11815 } else { 11816 int kind; 11817 void *data; 11818 11819 /* x must be a dict */ 11820 if (!PyDict_CheckExact(x)) { 11821 PyErr_SetString(PyExc_TypeError, "if you give only one argument " 11822 "to maketrans it must be a dict"); 11823 goto err; 11824 } 11825 /* copy entries into the new dict, converting string keys to int keys */ 11826 while (PyDict_Next(x, &i, &key, &value)) { 11827 if (PyUnicode_Check(key)) { 11828 /* convert string keys to integer keys */ 11829 PyObject *newkey; 11830 if (PyUnicode_GET_SIZE(key) != 1) { 11831 PyErr_SetString(PyExc_ValueError, "string keys in translate " 11832 "table must be of length 1"); 11833 goto err; 11834 } 11835 kind = PyUnicode_KIND(key); 11836 data = PyUnicode_DATA(key); 11837 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0)); 11838 if (!newkey) 11839 goto err; 11840 res = PyDict_SetItem(new, newkey, value); 11841 Py_DECREF(newkey); 11842 if (res < 0) 11843 goto err; 11844 } else if (PyLong_Check(key)) { 11845 /* just keep integer keys */ 11846 if (PyDict_SetItem(new, key, value) < 0) 
11847 goto err; 11848 } else { 11849 PyErr_SetString(PyExc_TypeError, "keys in translate table must " 11850 "be strings or integers"); 11851 goto err; 11852 } 11853 } 11854 } 11855 return new; 11856 err: 11857 Py_DECREF(new); 11858 return NULL; 11859} 11860 11861PyDoc_STRVAR(translate__doc__, 11862 "S.translate(table) -> str\n\ 11863\n\ 11864Return a copy of the string S, where all characters have been mapped\n\ 11865through the given translation table, which must be a mapping of\n\ 11866Unicode ordinals to Unicode ordinals, strings, or None.\n\ 11867Unmapped characters are left untouched. Characters mapped to None\n\ 11868are deleted."); 11869 11870static PyObject* 11871unicode_translate(PyObject *self, PyObject *table) 11872{ 11873 return _PyUnicode_TranslateCharmap(self, table, "ignore"); 11874} 11875 11876PyDoc_STRVAR(upper__doc__, 11877 "S.upper() -> str\n\ 11878\n\ 11879Return a copy of S converted to uppercase."); 11880 11881static PyObject* 11882unicode_upper(PyUnicodeObject *self) 11883{ 11884 return fixup(self, fixupper); 11885} 11886 11887PyDoc_STRVAR(zfill__doc__, 11888 "S.zfill(width) -> str\n\ 11889\n\ 11890Pad a numeric string S with zeros on the left, to fill a field\n\ 11891of the specified width. 
The string S is never truncated."); 11892 11893static PyObject * 11894unicode_zfill(PyUnicodeObject *self, PyObject *args) 11895{ 11896 Py_ssize_t fill; 11897 PyUnicodeObject *u; 11898 Py_ssize_t width; 11899 int kind; 11900 void *data; 11901 Py_UCS4 chr; 11902 11903 if (PyUnicode_READY(self) == -1) 11904 return NULL; 11905 11906 if (!PyArg_ParseTuple(args, "n:zfill", &width)) 11907 return NULL; 11908 11909 if (PyUnicode_GET_LENGTH(self) >= width) { 11910 if (PyUnicode_CheckExact(self)) { 11911 Py_INCREF(self); 11912 return (PyObject*) self; 11913 } 11914 else 11915 return PyUnicode_Copy((PyObject*)self); 11916 } 11917 11918 fill = width - _PyUnicode_LENGTH(self); 11919 11920 u = pad(self, fill, 0, '0'); 11921 11922 if (u == NULL) 11923 return NULL; 11924 11925 kind = PyUnicode_KIND(u); 11926 data = PyUnicode_DATA(u); 11927 chr = PyUnicode_READ(kind, data, fill); 11928 11929 if (chr == '+' || chr == '-') { 11930 /* move sign to beginning of string */ 11931 PyUnicode_WRITE(kind, data, 0, chr); 11932 PyUnicode_WRITE(kind, data, fill, '0'); 11933 } 11934 11935 return (PyObject*) u; 11936} 11937 11938#if 0 11939static PyObject * 11940unicode__decimal2ascii(PyObject *self) 11941{ 11942 return PyUnicode_TransformDecimalAndSpaceToASCII(self); 11943} 11944#endif 11945 11946PyDoc_STRVAR(startswith__doc__, 11947 "S.startswith(prefix[, start[, end]]) -> bool\n\ 11948\n\ 11949Return True if S starts with the specified prefix, False otherwise.\n\ 11950With optional start, test S beginning at that position.\n\ 11951With optional end, stop comparing S at that position.\n\ 11952prefix can also be a tuple of strings to try."); 11953 11954static PyObject * 11955unicode_startswith(PyUnicodeObject *self, 11956 PyObject *args) 11957{ 11958 PyObject *subobj; 11959 PyUnicodeObject *substring; 11960 Py_ssize_t start = 0; 11961 Py_ssize_t end = PY_SSIZE_T_MAX; 11962 int result; 11963 11964 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end)) 11965 return NULL; 11966 
if (PyTuple_Check(subobj)) { 11967 Py_ssize_t i; 11968 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) { 11969 substring = (PyUnicodeObject *)PyUnicode_FromObject( 11970 PyTuple_GET_ITEM(subobj, i)); 11971 if (substring == NULL) 11972 return NULL; 11973 result = tailmatch(self, substring, start, end, -1); 11974 Py_DECREF(substring); 11975 if (result) { 11976 Py_RETURN_TRUE; 11977 } 11978 } 11979 /* nothing matched */ 11980 Py_RETURN_FALSE; 11981 } 11982 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj); 11983 if (substring == NULL) { 11984 if (PyErr_ExceptionMatches(PyExc_TypeError)) 11985 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or " 11986 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name); 11987 return NULL; 11988 } 11989 result = tailmatch(self, substring, start, end, -1); 11990 Py_DECREF(substring); 11991 return PyBool_FromLong(result); 11992} 11993 11994 11995PyDoc_STRVAR(endswith__doc__, 11996 "S.endswith(suffix[, start[, end]]) -> bool\n\ 11997\n\ 11998Return True if S ends with the specified suffix, False otherwise.\n\ 11999With optional start, test S beginning at that position.\n\ 12000With optional end, stop comparing S at that position.\n\ 12001suffix can also be a tuple of strings to try."); 12002 12003static PyObject * 12004unicode_endswith(PyUnicodeObject *self, 12005 PyObject *args) 12006{ 12007 PyObject *subobj; 12008 PyUnicodeObject *substring; 12009 Py_ssize_t start = 0; 12010 Py_ssize_t end = PY_SSIZE_T_MAX; 12011 int result; 12012 12013 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end)) 12014 return NULL; 12015 if (PyTuple_Check(subobj)) { 12016 Py_ssize_t i; 12017 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) { 12018 substring = (PyUnicodeObject *)PyUnicode_FromObject( 12019 PyTuple_GET_ITEM(subobj, i)); 12020 if (substring == NULL) 12021 return NULL; 12022 result = tailmatch(self, substring, start, end, +1); 12023 Py_DECREF(substring); 12024 if (result) { 12025 Py_RETURN_TRUE; 12026 } 
12027 } 12028 Py_RETURN_FALSE; 12029 } 12030 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj); 12031 if (substring == NULL) { 12032 if (PyErr_ExceptionMatches(PyExc_TypeError)) 12033 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or " 12034 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name); 12035 return NULL; 12036 } 12037 result = tailmatch(self, substring, start, end, +1); 12038 Py_DECREF(substring); 12039 return PyBool_FromLong(result); 12040} 12041 12042#include "stringlib/unicode_format.h" 12043 12044PyDoc_STRVAR(format__doc__, 12045 "S.format(*args, **kwargs) -> str\n\ 12046\n\ 12047Return a formatted version of S, using substitutions from args and kwargs.\n\ 12048The substitutions are identified by braces ('{' and '}')."); 12049 12050PyDoc_STRVAR(format_map__doc__, 12051 "S.format_map(mapping) -> str\n\ 12052\n\ 12053Return a formatted version of S, using substitutions from mapping.\n\ 12054The substitutions are identified by braces ('{' and '}')."); 12055 12056static PyObject * 12057unicode__format__(PyObject* self, PyObject* args) 12058{ 12059 PyObject *format_spec; 12060 12061 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec)) 12062 return NULL; 12063 12064 return _PyUnicode_FormatAdvanced(self, format_spec, 0, 12065 PyUnicode_GET_LENGTH(format_spec)); 12066} 12067 12068PyDoc_STRVAR(p_format__doc__, 12069 "S.__format__(format_spec) -> str\n\ 12070\n\ 12071Return a formatted version of S as described by format_spec."); 12072 12073static PyObject * 12074unicode__sizeof__(PyUnicodeObject *v) 12075{ 12076 Py_ssize_t size; 12077 12078 /* If it's a compact object, account for base structure + 12079 character data. 
*/ 12080 if (PyUnicode_IS_COMPACT_ASCII(v)) 12081 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1; 12082 else if (PyUnicode_IS_COMPACT(v)) 12083 size = sizeof(PyCompactUnicodeObject) + 12084 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_CHARACTER_SIZE(v); 12085 else { 12086 /* If it is a two-block object, account for base object, and 12087 for character block if present. */ 12088 size = sizeof(PyUnicodeObject); 12089 if (_PyUnicode_DATA_ANY(v)) 12090 size += (PyUnicode_GET_LENGTH(v) + 1) * 12091 PyUnicode_CHARACTER_SIZE(v); 12092 } 12093 /* If the wstr pointer is present, account for it unless it is shared 12094 with the data pointer. Check if the data is not shared. */ 12095 if (_PyUnicode_HAS_WSTR_MEMORY(v)) 12096 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t); 12097 if (_PyUnicode_HAS_UTF8_MEMORY(v)) 12098 size += PyUnicode_UTF8_LENGTH(v) + 1; 12099 12100 return PyLong_FromSsize_t(size); 12101} 12102 12103PyDoc_STRVAR(sizeof__doc__, 12104 "S.__sizeof__() -> size of S in memory, in bytes"); 12105 12106static PyObject * 12107unicode_getnewargs(PyObject *v) 12108{ 12109 PyObject *copy = PyUnicode_Copy(v); 12110 if (!copy) 12111 return NULL; 12112 return Py_BuildValue("(N)", copy); 12113} 12114 12115static PyMethodDef unicode_methods[] = { 12116 12117 /* Order is according to common usage: often used methods should 12118 appear first, since lookup is done sequentially. 
*/ 12119 12120 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__}, 12121 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__}, 12122 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__}, 12123 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__}, 12124 {"join", (PyCFunction) unicode_join, METH_O, join__doc__}, 12125 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__}, 12126 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__}, 12127 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__}, 12128 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__}, 12129 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__}, 12130 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__}, 12131 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__}, 12132 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__}, 12133 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__}, 12134 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__}, 12135 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__}, 12136 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__}, 12137 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__}, 12138 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__}, 12139 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__}, 12140 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__}, 12141 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__}, 12142 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__}, 12143 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__}, 12144 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__}, 12145 {"upper", 
(PyCFunction) unicode_upper, METH_NOARGS, upper__doc__}, 12146 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__}, 12147 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__}, 12148 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__}, 12149 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__}, 12150 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__}, 12151 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__}, 12152 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__}, 12153 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__}, 12154 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__}, 12155 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__}, 12156 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__}, 12157 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__}, 12158 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__}, 12159 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__}, 12160 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__}, 12161 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__}, 12162 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__}, 12163 {"maketrans", (PyCFunction) unicode_maketrans, 12164 METH_VARARGS | METH_STATIC, maketrans__doc__}, 12165 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__}, 12166#if 0 12167 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__}, 12168#endif 12169 12170#if 0 12171 /* These methods are just used for debugging the implementation. 
*/ 12172 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS}, 12173#endif 12174 12175 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS}, 12176 {NULL, NULL} 12177}; 12178 12179static PyObject * 12180unicode_mod(PyObject *v, PyObject *w) 12181{ 12182 if (!PyUnicode_Check(v)) 12183 Py_RETURN_NOTIMPLEMENTED; 12184 return PyUnicode_Format(v, w); 12185} 12186 12187static PyNumberMethods unicode_as_number = { 12188 0, /*nb_add*/ 12189 0, /*nb_subtract*/ 12190 0, /*nb_multiply*/ 12191 unicode_mod, /*nb_remainder*/ 12192}; 12193 12194static PySequenceMethods unicode_as_sequence = { 12195 (lenfunc) unicode_length, /* sq_length */ 12196 PyUnicode_Concat, /* sq_concat */ 12197 (ssizeargfunc) unicode_repeat, /* sq_repeat */ 12198 (ssizeargfunc) unicode_getitem, /* sq_item */ 12199 0, /* sq_slice */ 12200 0, /* sq_ass_item */ 12201 0, /* sq_ass_slice */ 12202 PyUnicode_Contains, /* sq_contains */ 12203}; 12204 12205static PyObject* 12206unicode_subscript(PyUnicodeObject* self, PyObject* item) 12207{ 12208 if (PyUnicode_READY(self) == -1) 12209 return NULL; 12210 12211 if (PyIndex_Check(item)) { 12212 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError); 12213 if (i == -1 && PyErr_Occurred()) 12214 return NULL; 12215 if (i < 0) 12216 i += PyUnicode_GET_LENGTH(self); 12217 return unicode_getitem((PyObject*)self, i); 12218 } else if (PySlice_Check(item)) { 12219 Py_ssize_t start, stop, step, slicelength, cur, i; 12220 const Py_UNICODE* source_buf; 12221 Py_UNICODE* result_buf; 12222 PyObject* result; 12223 12224 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self), 12225 &start, &stop, &step, &slicelength) < 0) { 12226 return NULL; 12227 } 12228 12229 if (slicelength <= 0) { 12230 return PyUnicode_New(0, 0); 12231 } else if (start == 0 && step == 1 && 12232 slicelength == PyUnicode_GET_LENGTH(self) && 12233 PyUnicode_CheckExact(self)) { 12234 Py_INCREF(self); 12235 return (PyObject *)self; 12236 } else if (step == 1) { 12237 return 
PyUnicode_Substring((PyObject*)self, 12238 start, start + slicelength); 12239 } else { 12240 source_buf = PyUnicode_AS_UNICODE((PyObject*)self); 12241 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength* 12242 sizeof(Py_UNICODE)); 12243 12244 if (result_buf == NULL) 12245 return PyErr_NoMemory(); 12246 12247 for (cur = start, i = 0; i < slicelength; cur += step, i++) { 12248 result_buf[i] = source_buf[cur]; 12249 } 12250 12251 result = PyUnicode_FromUnicode(result_buf, slicelength); 12252 PyObject_FREE(result_buf); 12253 return result; 12254 } 12255 } else { 12256 PyErr_SetString(PyExc_TypeError, "string indices must be integers"); 12257 return NULL; 12258 } 12259} 12260 12261static PyMappingMethods unicode_as_mapping = { 12262 (lenfunc)unicode_length, /* mp_length */ 12263 (binaryfunc)unicode_subscript, /* mp_subscript */ 12264 (objobjargproc)0, /* mp_ass_subscript */ 12265}; 12266 12267 12268/* Helpers for PyUnicode_Format() */ 12269 12270static PyObject * 12271getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx) 12272{ 12273 Py_ssize_t argidx = *p_argidx; 12274 if (argidx < arglen) { 12275 (*p_argidx)++; 12276 if (arglen < 0) 12277 return args; 12278 else 12279 return PyTuple_GetItem(args, argidx); 12280 } 12281 PyErr_SetString(PyExc_TypeError, 12282 "not enough arguments for format string"); 12283 return NULL; 12284} 12285 12286/* Returns a new reference to a PyUnicode object, or NULL on failure. */ 12287 12288static PyObject * 12289formatfloat(PyObject *v, int flags, int prec, int type) 12290{ 12291 char *p; 12292 PyObject *result; 12293 double x; 12294 12295 x = PyFloat_AsDouble(v); 12296 if (x == -1.0 && PyErr_Occurred()) 12297 return NULL; 12298 12299 if (prec < 0) 12300 prec = 6; 12301 12302 p = PyOS_double_to_string(x, type, prec, 12303 (flags & F_ALT) ? 
Py_DTSF_ALT : 0, NULL); 12304 if (p == NULL) 12305 return NULL; 12306 result = PyUnicode_DecodeASCII(p, strlen(p), NULL); 12307 PyMem_Free(p); 12308 return result; 12309} 12310 12311static PyObject* 12312formatlong(PyObject *val, int flags, int prec, int type) 12313{ 12314 char *buf; 12315 int len; 12316 PyObject *str; /* temporary string object. */ 12317 PyObject *result; 12318 12319 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len); 12320 if (!str) 12321 return NULL; 12322 result = PyUnicode_DecodeASCII(buf, len, NULL); 12323 Py_DECREF(str); 12324 return result; 12325} 12326 12327static int 12328formatchar(Py_UCS4 *buf, 12329 size_t buflen, 12330 PyObject *v) 12331{ 12332 /* presume that the buffer is at least 3 characters long */ 12333 if (PyUnicode_Check(v)) { 12334 if (PyUnicode_GET_LENGTH(v) == 1) { 12335 buf[0] = PyUnicode_READ_CHAR(v, 0); 12336 buf[1] = '\0'; 12337 return 1; 12338 } 12339 goto onError; 12340 } 12341 else { 12342 /* Integer input truncated to a character */ 12343 long x; 12344 x = PyLong_AsLong(v); 12345 if (x == -1 && PyErr_Occurred()) 12346 goto onError; 12347 12348 if (x < 0 || x > 0x10ffff) { 12349 PyErr_SetString(PyExc_OverflowError, 12350 "%c arg not in range(0x110000)"); 12351 return -1; 12352 } 12353 12354 buf[0] = (Py_UCS4) x; 12355 buf[1] = '\0'; 12356 return 1; 12357 } 12358 12359 onError: 12360 PyErr_SetString(PyExc_TypeError, 12361 "%c requires int or char"); 12362 return -1; 12363} 12364 12365/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...) 12366 FORMATBUFLEN is the length of the buffer in which chars are formatted. 
12367*/ 12368#define FORMATBUFLEN (size_t)10 12369 12370PyObject * 12371PyUnicode_Format(PyObject *format, PyObject *args) 12372{ 12373 void *fmt; 12374 int fmtkind; 12375 PyObject *result; 12376 Py_UCS4 *res, *res0; 12377 Py_UCS4 max; 12378 int kind; 12379 Py_ssize_t fmtcnt, fmtpos, rescnt, reslen, arglen, argidx; 12380 int args_owned = 0; 12381 PyObject *dict = NULL; 12382 PyUnicodeObject *uformat; 12383 12384 if (format == NULL || args == NULL) { 12385 PyErr_BadInternalCall(); 12386 return NULL; 12387 } 12388 uformat = (PyUnicodeObject*)PyUnicode_FromObject(format); 12389 if (uformat == NULL || PyUnicode_READY(uformat) == -1) 12390 return NULL; 12391 fmt = PyUnicode_DATA(uformat); 12392 fmtkind = PyUnicode_KIND(uformat); 12393 fmtcnt = PyUnicode_GET_LENGTH(uformat); 12394 fmtpos = 0; 12395 12396 reslen = rescnt = fmtcnt + 100; 12397 res = res0 = PyMem_Malloc(reslen * sizeof(Py_UCS4)); 12398 if (res0 == NULL) { 12399 PyErr_NoMemory(); 12400 goto onError; 12401 } 12402 12403 if (PyTuple_Check(args)) { 12404 arglen = PyTuple_Size(args); 12405 argidx = 0; 12406 } 12407 else { 12408 arglen = -1; 12409 argidx = -2; 12410 } 12411 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) && 12412 !PyUnicode_Check(args)) 12413 dict = args; 12414 12415 while (--fmtcnt >= 0) { 12416 if (PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') { 12417 if (--rescnt < 0) { 12418 rescnt = fmtcnt + 100; 12419 reslen += rescnt; 12420 res0 = PyMem_Realloc(res0, reslen*sizeof(Py_UCS4)); 12421 if (res0 == NULL){ 12422 PyErr_NoMemory(); 12423 goto onError; 12424 } 12425 res = res0 + reslen - rescnt; 12426 --rescnt; 12427 } 12428 *res++ = PyUnicode_READ(fmtkind, fmt, fmtpos++); 12429 } 12430 else { 12431 /* Got a format specifier */ 12432 int flags = 0; 12433 Py_ssize_t width = -1; 12434 int prec = -1; 12435 Py_UCS4 c = '\0'; 12436 Py_UCS4 fill; 12437 int isnumok; 12438 PyObject *v = NULL; 12439 PyObject *temp = NULL; 12440 void *pbuf; 12441 Py_ssize_t pindex; 12442 Py_UNICODE sign; 12443 
Py_ssize_t len, len1; 12444 Py_UCS4 formatbuf[FORMATBUFLEN]; /* For formatchar() */ 12445 12446 fmtpos++; 12447 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(') { 12448 Py_ssize_t keystart; 12449 Py_ssize_t keylen; 12450 PyObject *key; 12451 int pcount = 1; 12452 12453 if (dict == NULL) { 12454 PyErr_SetString(PyExc_TypeError, 12455 "format requires a mapping"); 12456 goto onError; 12457 } 12458 ++fmtpos; 12459 --fmtcnt; 12460 keystart = fmtpos; 12461 /* Skip over balanced parentheses */ 12462 while (pcount > 0 && --fmtcnt >= 0) { 12463 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == ')') 12464 --pcount; 12465 else if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(') 12466 ++pcount; 12467 fmtpos++; 12468 } 12469 keylen = fmtpos - keystart - 1; 12470 if (fmtcnt < 0 || pcount > 0) { 12471 PyErr_SetString(PyExc_ValueError, 12472 "incomplete format key"); 12473 goto onError; 12474 } 12475 key = PyUnicode_Substring((PyObject*)uformat, 12476 keystart, keystart + keylen); 12477 if (key == NULL) 12478 goto onError; 12479 if (args_owned) { 12480 Py_DECREF(args); 12481 args_owned = 0; 12482 } 12483 args = PyObject_GetItem(dict, key); 12484 Py_DECREF(key); 12485 if (args == NULL) { 12486 goto onError; 12487 } 12488 args_owned = 1; 12489 arglen = -1; 12490 argidx = -2; 12491 } 12492 while (--fmtcnt >= 0) { 12493 switch (c = PyUnicode_READ(fmtkind, fmt, fmtpos++)) { 12494 case '-': flags |= F_LJUST; continue; 12495 case '+': flags |= F_SIGN; continue; 12496 case ' ': flags |= F_BLANK; continue; 12497 case '#': flags |= F_ALT; continue; 12498 case '0': flags |= F_ZERO; continue; 12499 } 12500 break; 12501 } 12502 if (c == '*') { 12503 v = getnextarg(args, arglen, &argidx); 12504 if (v == NULL) 12505 goto onError; 12506 if (!PyLong_Check(v)) { 12507 PyErr_SetString(PyExc_TypeError, 12508 "* wants int"); 12509 goto onError; 12510 } 12511 width = PyLong_AsLong(v); 12512 if (width == -1 && PyErr_Occurred()) 12513 goto onError; 12514 if (width < 0) { 12515 flags |= F_LJUST; 12516 width = 
-width; 12517 } 12518 if (--fmtcnt >= 0) 12519 c = PyUnicode_READ(fmtkind, fmt, fmtpos++); 12520 } 12521 else if (c >= '0' && c <= '9') { 12522 width = c - '0'; 12523 while (--fmtcnt >= 0) { 12524 c = PyUnicode_READ(fmtkind, fmt, fmtpos++); 12525 if (c < '0' || c > '9') 12526 break; 12527 if ((width*10) / 10 != width) { 12528 PyErr_SetString(PyExc_ValueError, 12529 "width too big"); 12530 goto onError; 12531 } 12532 width = width*10 + (c - '0'); 12533 } 12534 } 12535 if (c == '.') { 12536 prec = 0; 12537 if (--fmtcnt >= 0) 12538 c = PyUnicode_READ(fmtkind, fmt, fmtpos++); 12539 if (c == '*') { 12540 v = getnextarg(args, arglen, &argidx); 12541 if (v == NULL) 12542 goto onError; 12543 if (!PyLong_Check(v)) { 12544 PyErr_SetString(PyExc_TypeError, 12545 "* wants int"); 12546 goto onError; 12547 } 12548 prec = PyLong_AsLong(v); 12549 if (prec == -1 && PyErr_Occurred()) 12550 goto onError; 12551 if (prec < 0) 12552 prec = 0; 12553 if (--fmtcnt >= 0) 12554 c = PyUnicode_READ(fmtkind, fmt, fmtpos++); 12555 } 12556 else if (c >= '0' && c <= '9') { 12557 prec = c - '0'; 12558 while (--fmtcnt >= 0) { 12559 c = PyUnicode_READ(fmtkind, fmt, fmtpos++); 12560 if (c < '0' || c > '9') 12561 break; 12562 if ((prec*10) / 10 != prec) { 12563 PyErr_SetString(PyExc_ValueError, 12564 "prec too big"); 12565 goto onError; 12566 } 12567 prec = prec*10 + (c - '0'); 12568 } 12569 } 12570 } /* prec */ 12571 if (fmtcnt >= 0) { 12572 if (c == 'h' || c == 'l' || c == 'L') { 12573 if (--fmtcnt >= 0) 12574 c = PyUnicode_READ(fmtkind, fmt, fmtpos++); 12575 } 12576 } 12577 if (fmtcnt < 0) { 12578 PyErr_SetString(PyExc_ValueError, 12579 "incomplete format"); 12580 goto onError; 12581 } 12582 if (c != '%') { 12583 v = getnextarg(args, arglen, &argidx); 12584 if (v == NULL) 12585 goto onError; 12586 } 12587 sign = 0; 12588 fill = ' '; 12589 switch (c) { 12590 12591 case '%': 12592 pbuf = formatbuf; 12593 kind = PyUnicode_4BYTE_KIND; 12594 /* presume that buffer length is at least 1 */ 12595 
PyUnicode_WRITE(kind, pbuf, 0, '%'); 12596 len = 1; 12597 break; 12598 12599 case 's': 12600 case 'r': 12601 case 'a': 12602 if (PyUnicode_CheckExact(v) && c == 's') { 12603 temp = v; 12604 Py_INCREF(temp); 12605 } 12606 else { 12607 if (c == 's') 12608 temp = PyObject_Str(v); 12609 else if (c == 'r') 12610 temp = PyObject_Repr(v); 12611 else 12612 temp = PyObject_ASCII(v); 12613 if (temp == NULL) 12614 goto onError; 12615 if (PyUnicode_Check(temp)) 12616 /* nothing to do */; 12617 else { 12618 Py_DECREF(temp); 12619 PyErr_SetString(PyExc_TypeError, 12620 "%s argument has non-string str()"); 12621 goto onError; 12622 } 12623 } 12624 if (PyUnicode_READY(temp) == -1) { 12625 Py_CLEAR(temp); 12626 goto onError; 12627 } 12628 pbuf = PyUnicode_DATA(temp); 12629 kind = PyUnicode_KIND(temp); 12630 len = PyUnicode_GET_LENGTH(temp); 12631 if (prec >= 0 && len > prec) 12632 len = prec; 12633 break; 12634 12635 case 'i': 12636 case 'd': 12637 case 'u': 12638 case 'o': 12639 case 'x': 12640 case 'X': 12641 isnumok = 0; 12642 if (PyNumber_Check(v)) { 12643 PyObject *iobj=NULL; 12644 12645 if (PyLong_Check(v)) { 12646 iobj = v; 12647 Py_INCREF(iobj); 12648 } 12649 else { 12650 iobj = PyNumber_Long(v); 12651 } 12652 if (iobj!=NULL) { 12653 if (PyLong_Check(iobj)) { 12654 isnumok = 1; 12655 temp = formatlong(iobj, flags, prec, (c == 'i'? 
'd': c)); 12656 Py_DECREF(iobj); 12657 if (!temp) 12658 goto onError; 12659 if (PyUnicode_READY(temp) == -1) { 12660 Py_CLEAR(temp); 12661 goto onError; 12662 } 12663 pbuf = PyUnicode_DATA(temp); 12664 kind = PyUnicode_KIND(temp); 12665 len = PyUnicode_GET_LENGTH(temp); 12666 sign = 1; 12667 } 12668 else { 12669 Py_DECREF(iobj); 12670 } 12671 } 12672 } 12673 if (!isnumok) { 12674 PyErr_Format(PyExc_TypeError, 12675 "%%%c format: a number is required, " 12676 "not %.200s", (char)c, Py_TYPE(v)->tp_name); 12677 goto onError; 12678 } 12679 if (flags & F_ZERO) 12680 fill = '0'; 12681 break; 12682 12683 case 'e': 12684 case 'E': 12685 case 'f': 12686 case 'F': 12687 case 'g': 12688 case 'G': 12689 temp = formatfloat(v, flags, prec, c); 12690 if (!temp) 12691 goto onError; 12692 if (PyUnicode_READY(temp) == -1) { 12693 Py_CLEAR(temp); 12694 goto onError; 12695 } 12696 pbuf = PyUnicode_DATA(temp); 12697 kind = PyUnicode_KIND(temp); 12698 len = PyUnicode_GET_LENGTH(temp); 12699 sign = 1; 12700 if (flags & F_ZERO) 12701 fill = '0'; 12702 break; 12703 12704 case 'c': 12705 pbuf = formatbuf; 12706 kind = PyUnicode_4BYTE_KIND; 12707 len = formatchar(pbuf, Py_ARRAY_LENGTH(formatbuf), v); 12708 if (len < 0) 12709 goto onError; 12710 break; 12711 12712 default: 12713 PyErr_Format(PyExc_ValueError, 12714 "unsupported format character '%c' (0x%x) " 12715 "at index %zd", 12716 (31<=c && c<=126) ? (char)c : '?', 12717 (int)c, 12718 fmtpos - 1); 12719 goto onError; 12720 } 12721 /* pbuf is initialized here. 
*/ 12722 pindex = 0; 12723 if (sign) { 12724 if (PyUnicode_READ(kind, pbuf, pindex) == '-' || 12725 PyUnicode_READ(kind, pbuf, pindex) == '+') { 12726 sign = PyUnicode_READ(kind, pbuf, pindex++); 12727 len--; 12728 } 12729 else if (flags & F_SIGN) 12730 sign = '+'; 12731 else if (flags & F_BLANK) 12732 sign = ' '; 12733 else 12734 sign = 0; 12735 } 12736 if (width < len) 12737 width = len; 12738 if (rescnt - (sign != 0) < width) { 12739 reslen -= rescnt; 12740 rescnt = width + fmtcnt + 100; 12741 reslen += rescnt; 12742 if (reslen < 0) { 12743 Py_XDECREF(temp); 12744 PyErr_NoMemory(); 12745 goto onError; 12746 } 12747 res0 = PyMem_Realloc(res0, reslen*sizeof(Py_UCS4)); 12748 if (res0 == 0) { 12749 PyErr_NoMemory(); 12750 Py_XDECREF(temp); 12751 goto onError; 12752 } 12753 res = res0 + reslen - rescnt; 12754 } 12755 if (sign) { 12756 if (fill != ' ') 12757 *res++ = sign; 12758 rescnt--; 12759 if (width > len) 12760 width--; 12761 } 12762 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) { 12763 assert(PyUnicode_READ(kind, pbuf, pindex) == '0'); 12764 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c); 12765 if (fill != ' ') { 12766 *res++ = PyUnicode_READ(kind, pbuf, pindex++); 12767 *res++ = PyUnicode_READ(kind, pbuf, pindex++); 12768 } 12769 rescnt -= 2; 12770 width -= 2; 12771 if (width < 0) 12772 width = 0; 12773 len -= 2; 12774 } 12775 if (width > len && !(flags & F_LJUST)) { 12776 do { 12777 --rescnt; 12778 *res++ = fill; 12779 } while (--width > len); 12780 } 12781 if (fill == ' ') { 12782 if (sign) 12783 *res++ = sign; 12784 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) { 12785 assert(PyUnicode_READ(kind, pbuf, pindex) == '0'); 12786 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c); 12787 *res++ = PyUnicode_READ(kind, pbuf, pindex++); 12788 *res++ = PyUnicode_READ(kind, pbuf, pindex++); 12789 } 12790 } 12791 /* Copy all characters, preserving len */ 12792 len1 = len; 12793 while (len1--) { 12794 *res++ = PyUnicode_READ(kind, pbuf, 
pindex++); 12795 rescnt--; 12796 } 12797 while (--width >= len) { 12798 --rescnt; 12799 *res++ = ' '; 12800 } 12801 if (dict && (argidx < arglen) && c != '%') { 12802 PyErr_SetString(PyExc_TypeError, 12803 "not all arguments converted during string formatting"); 12804 Py_XDECREF(temp); 12805 goto onError; 12806 } 12807 Py_XDECREF(temp); 12808 } /* '%' */ 12809 } /* until end */ 12810 if (argidx < arglen && !dict) { 12811 PyErr_SetString(PyExc_TypeError, 12812 "not all arguments converted during string formatting"); 12813 goto onError; 12814 } 12815 12816 12817 for (max=0, res = res0; res < res0+reslen-rescnt; res++) 12818 if (*res > max) 12819 max = *res; 12820 result = PyUnicode_New(reslen - rescnt, max); 12821 if (!result) 12822 goto onError; 12823 kind = PyUnicode_KIND(result); 12824 for (res = res0; res < res0+reslen-rescnt; res++) 12825 PyUnicode_WRITE(kind, PyUnicode_DATA(result), res-res0, *res); 12826 PyMem_Free(res0); 12827 if (args_owned) { 12828 Py_DECREF(args); 12829 } 12830 Py_DECREF(uformat); 12831 return (PyObject *)result; 12832 12833 onError: 12834 PyMem_Free(res0); 12835 Py_DECREF(uformat); 12836 if (args_owned) { 12837 Py_DECREF(args); 12838 } 12839 return NULL; 12840} 12841 12842static PyObject * 12843unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds); 12844 12845static PyObject * 12846unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds) 12847{ 12848 PyObject *x = NULL; 12849 static char *kwlist[] = {"object", "encoding", "errors", 0}; 12850 char *encoding = NULL; 12851 char *errors = NULL; 12852 12853 if (type != &PyUnicode_Type) 12854 return unicode_subtype_new(type, args, kwds); 12855 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str", 12856 kwlist, &x, &encoding, &errors)) 12857 return NULL; 12858 if (x == NULL) 12859 return (PyObject *)PyUnicode_New(0, 0); 12860 if (encoding == NULL && errors == NULL) 12861 return PyObject_Str(x); 12862 else 12863 return PyUnicode_FromEncodedObject(x, encoding, errors); 
12864} 12865 12866static PyObject * 12867unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds) 12868{ 12869 PyUnicodeObject *unicode, *self; 12870 Py_ssize_t length, char_size; 12871 int share_wstr, share_utf8; 12872 unsigned int kind; 12873 void *data; 12874 12875 assert(PyType_IsSubtype(type, &PyUnicode_Type)); 12876 12877 unicode = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds); 12878 if (unicode == NULL) 12879 return NULL; 12880 assert(_PyUnicode_CHECK(unicode)); 12881 if (_PyUnicode_READY_REPLACE(&unicode)) 12882 return NULL; 12883 12884 self = (PyUnicodeObject *) type->tp_alloc(type, 0); 12885 if (self == NULL) { 12886 Py_DECREF(unicode); 12887 return NULL; 12888 } 12889 kind = PyUnicode_KIND(unicode); 12890 length = PyUnicode_GET_LENGTH(unicode); 12891 12892 _PyUnicode_LENGTH(self) = length; 12893 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode); 12894 _PyUnicode_STATE(self).interned = 0; 12895 _PyUnicode_STATE(self).kind = kind; 12896 _PyUnicode_STATE(self).compact = 0; 12897 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii; 12898 _PyUnicode_STATE(self).ready = 1; 12899 _PyUnicode_WSTR(self) = NULL; 12900 _PyUnicode_UTF8_LENGTH(self) = 0; 12901 _PyUnicode_UTF8(self) = NULL; 12902 _PyUnicode_WSTR_LENGTH(self) = 0; 12903 _PyUnicode_DATA_ANY(self) = NULL; 12904 12905 share_utf8 = 0; 12906 share_wstr = 0; 12907 if (kind == PyUnicode_1BYTE_KIND) { 12908 char_size = 1; 12909 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128) 12910 share_utf8 = 1; 12911 } 12912 else if (kind == PyUnicode_2BYTE_KIND) { 12913 char_size = 2; 12914 if (sizeof(wchar_t) == 2) 12915 share_wstr = 1; 12916 } 12917 else { 12918 assert(kind == PyUnicode_4BYTE_KIND); 12919 char_size = 4; 12920 if (sizeof(wchar_t) == 4) 12921 share_wstr = 1; 12922 } 12923 12924 /* Ensure we won't overflow the length. 
*/ 12925 if (length > (PY_SSIZE_T_MAX / char_size - 1)) { 12926 PyErr_NoMemory(); 12927 goto onError; 12928 } 12929 data = PyObject_MALLOC((length + 1) * char_size); 12930 if (data == NULL) { 12931 PyErr_NoMemory(); 12932 goto onError; 12933 } 12934 12935 _PyUnicode_DATA_ANY(self) = data; 12936 if (share_utf8) { 12937 _PyUnicode_UTF8_LENGTH(self) = length; 12938 _PyUnicode_UTF8(self) = data; 12939 } 12940 if (share_wstr) { 12941 _PyUnicode_WSTR_LENGTH(self) = length; 12942 _PyUnicode_WSTR(self) = (wchar_t *)data; 12943 } 12944 12945 Py_MEMCPY(data, PyUnicode_DATA(unicode), 12946 PyUnicode_KIND_SIZE(kind, length + 1)); 12947 Py_DECREF(unicode); 12948 return (PyObject *)self; 12949 12950onError: 12951 Py_DECREF(unicode); 12952 Py_DECREF(self); 12953 return NULL; 12954} 12955 12956PyDoc_STRVAR(unicode_doc, 12957 "str(string[, encoding[, errors]]) -> str\n\ 12958\n\ 12959Create a new string object from the given encoded string.\n\ 12960encoding defaults to the current default string encoding.\n\ 12961errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'."); 12962 12963static PyObject *unicode_iter(PyObject *seq); 12964 12965PyTypeObject PyUnicode_Type = { 12966 PyVarObject_HEAD_INIT(&PyType_Type, 0) 12967 "str", /* tp_name */ 12968 sizeof(PyUnicodeObject), /* tp_size */ 12969 0, /* tp_itemsize */ 12970 /* Slots */ 12971 (destructor)unicode_dealloc, /* tp_dealloc */ 12972 0, /* tp_print */ 12973 0, /* tp_getattr */ 12974 0, /* tp_setattr */ 12975 0, /* tp_reserved */ 12976 unicode_repr, /* tp_repr */ 12977 &unicode_as_number, /* tp_as_number */ 12978 &unicode_as_sequence, /* tp_as_sequence */ 12979 &unicode_as_mapping, /* tp_as_mapping */ 12980 (hashfunc) unicode_hash, /* tp_hash*/ 12981 0, /* tp_call*/ 12982 (reprfunc) unicode_str, /* tp_str */ 12983 PyObject_GenericGetAttr, /* tp_getattro */ 12984 0, /* tp_setattro */ 12985 0, /* tp_as_buffer */ 12986 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE | 12987 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */ 12988 
unicode_doc, /* tp_doc */ 12989 0, /* tp_traverse */ 12990 0, /* tp_clear */ 12991 PyUnicode_RichCompare, /* tp_richcompare */ 12992 0, /* tp_weaklistoffset */ 12993 unicode_iter, /* tp_iter */ 12994 0, /* tp_iternext */ 12995 unicode_methods, /* tp_methods */ 12996 0, /* tp_members */ 12997 0, /* tp_getset */ 12998 &PyBaseObject_Type, /* tp_base */ 12999 0, /* tp_dict */ 13000 0, /* tp_descr_get */ 13001 0, /* tp_descr_set */ 13002 0, /* tp_dictoffset */ 13003 0, /* tp_init */ 13004 0, /* tp_alloc */ 13005 unicode_new, /* tp_new */ 13006 PyObject_Del, /* tp_free */ 13007}; 13008 13009/* Initialize the Unicode implementation */ 13010 13011void _PyUnicode_Init(void) 13012{ 13013 int i; 13014 13015 /* XXX - move this array to unicodectype.c ? */ 13016 Py_UCS2 linebreak[] = { 13017 0x000A, /* LINE FEED */ 13018 0x000D, /* CARRIAGE RETURN */ 13019 0x001C, /* FILE SEPARATOR */ 13020 0x001D, /* GROUP SEPARATOR */ 13021 0x001E, /* RECORD SEPARATOR */ 13022 0x0085, /* NEXT LINE */ 13023 0x2028, /* LINE SEPARATOR */ 13024 0x2029, /* PARAGRAPH SEPARATOR */ 13025 }; 13026 13027 /* Init the implementation */ 13028 unicode_empty = PyUnicode_New(0, 0); 13029 if (!unicode_empty) 13030 Py_FatalError("Can't create empty string"); 13031 13032 for (i = 0; i < 256; i++) 13033 unicode_latin1[i] = NULL; 13034 if (PyType_Ready(&PyUnicode_Type) < 0) 13035 Py_FatalError("Can't initialize 'unicode'"); 13036 13037 /* initialize the linebreak bloom filter */ 13038 bloom_linebreak = make_bloom_mask( 13039 PyUnicode_2BYTE_KIND, linebreak, 13040 Py_ARRAY_LENGTH(linebreak)); 13041 13042 PyType_Ready(&EncodingMapType); 13043} 13044 13045/* Finalize the Unicode implementation */ 13046 13047int 13048PyUnicode_ClearFreeList(void) 13049{ 13050 return 0; 13051} 13052 13053void 13054_PyUnicode_Fini(void) 13055{ 13056 int i; 13057 13058 Py_XDECREF(unicode_empty); 13059 unicode_empty = NULL; 13060 13061 for (i = 0; i < 256; i++) { 13062 if (unicode_latin1[i]) { 13063 Py_DECREF(unicode_latin1[i]); 13064 
unicode_latin1[i] = NULL; 13065 } 13066 } 13067 (void)PyUnicode_ClearFreeList(); 13068} 13069 13070void 13071PyUnicode_InternInPlace(PyObject **p) 13072{ 13073 register PyUnicodeObject *s = (PyUnicodeObject *)(*p); 13074 PyObject *t; 13075#ifdef Py_DEBUG 13076 assert(s != NULL); 13077 assert(_PyUnicode_CHECK(s)); 13078#else 13079 if (s == NULL || !PyUnicode_Check(s)) 13080 return; 13081#endif 13082 /* If it's a subclass, we don't really know what putting 13083 it in the interned dict might do. */ 13084 if (!PyUnicode_CheckExact(s)) 13085 return; 13086 if (PyUnicode_CHECK_INTERNED(s)) 13087 return; 13088 if (_PyUnicode_READY_REPLACE(p)) { 13089 assert(0 && "PyUnicode_READY fail in PyUnicode_InternInPlace"); 13090 return; 13091 } 13092 s = (PyUnicodeObject *)(*p); 13093 if (interned == NULL) { 13094 interned = PyDict_New(); 13095 if (interned == NULL) { 13096 PyErr_Clear(); /* Don't leave an exception */ 13097 return; 13098 } 13099 } 13100 /* It might be that the GetItem call fails even 13101 though the key is present in the dictionary, 13102 namely when this happens during a stack overflow. */ 13103 Py_ALLOW_RECURSION 13104 t = PyDict_GetItem(interned, (PyObject *)s); 13105 Py_END_ALLOW_RECURSION 13106 13107 if (t) { 13108 Py_INCREF(t); 13109 Py_DECREF(*p); 13110 *p = t; 13111 return; 13112 } 13113 13114 PyThreadState_GET()->recursion_critical = 1; 13115 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) { 13116 PyErr_Clear(); 13117 PyThreadState_GET()->recursion_critical = 0; 13118 return; 13119 } 13120 PyThreadState_GET()->recursion_critical = 0; 13121 /* The two references in interned are not counted by refcnt. 
13122 The deallocator will take care of this */ 13123 Py_REFCNT(s) -= 2; 13124 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL; 13125} 13126 13127void 13128PyUnicode_InternImmortal(PyObject **p) 13129{ 13130 PyUnicodeObject *u = (PyUnicodeObject *)*p; 13131 13132 PyUnicode_InternInPlace(p); 13133 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) { 13134 _PyUnicode_STATE(u).interned = SSTATE_INTERNED_IMMORTAL; 13135 Py_INCREF(*p); 13136 } 13137} 13138 13139PyObject * 13140PyUnicode_InternFromString(const char *cp) 13141{ 13142 PyObject *s = PyUnicode_FromString(cp); 13143 if (s == NULL) 13144 return NULL; 13145 PyUnicode_InternInPlace(&s); 13146 return s; 13147} 13148 13149void 13150_Py_ReleaseInternedUnicodeStrings(void) 13151{ 13152 PyObject *keys; 13153 PyUnicodeObject *s; 13154 Py_ssize_t i, n; 13155 Py_ssize_t immortal_size = 0, mortal_size = 0; 13156 13157 if (interned == NULL || !PyDict_Check(interned)) 13158 return; 13159 keys = PyDict_Keys(interned); 13160 if (keys == NULL || !PyList_Check(keys)) { 13161 PyErr_Clear(); 13162 return; 13163 } 13164 13165 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak 13166 detector, interned unicode strings are not forcibly deallocated; 13167 rather, we give them their stolen references back, and then clear 13168 and DECREF the interned dict. 
*/ 13169 13170 n = PyList_GET_SIZE(keys); 13171 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n", 13172 n); 13173 for (i = 0; i < n; i++) { 13174 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i); 13175 if (PyUnicode_READY(s) == -1) 13176 fprintf(stderr, "could not ready string\n"); 13177 switch (PyUnicode_CHECK_INTERNED(s)) { 13178 case SSTATE_NOT_INTERNED: 13179 /* XXX Shouldn't happen */ 13180 break; 13181 case SSTATE_INTERNED_IMMORTAL: 13182 Py_REFCNT(s) += 1; 13183 immortal_size += PyUnicode_GET_LENGTH(s); 13184 break; 13185 case SSTATE_INTERNED_MORTAL: 13186 Py_REFCNT(s) += 2; 13187 mortal_size += PyUnicode_GET_LENGTH(s); 13188 break; 13189 default: 13190 Py_FatalError("Inconsistent interned string state."); 13191 } 13192 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED; 13193 } 13194 fprintf(stderr, "total size of all interned strings: " 13195 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d " 13196 "mortal/immortal\n", mortal_size, immortal_size); 13197 Py_DECREF(keys); 13198 PyDict_Clear(interned); 13199 Py_DECREF(interned); 13200 interned = NULL; 13201} 13202 13203 13204/********************* Unicode Iterator **************************/ 13205 13206typedef struct { 13207 PyObject_HEAD 13208 Py_ssize_t it_index; 13209 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */ 13210} unicodeiterobject; 13211 13212static void 13213unicodeiter_dealloc(unicodeiterobject *it) 13214{ 13215 _PyObject_GC_UNTRACK(it); 13216 Py_XDECREF(it->it_seq); 13217 PyObject_GC_Del(it); 13218} 13219 13220static int 13221unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg) 13222{ 13223 Py_VISIT(it->it_seq); 13224 return 0; 13225} 13226 13227static PyObject * 13228unicodeiter_next(unicodeiterobject *it) 13229{ 13230 PyUnicodeObject *seq; 13231 PyObject *item; 13232 13233 assert(it != NULL); 13234 seq = it->it_seq; 13235 if (seq == NULL) 13236 return NULL; 13237 assert(_PyUnicode_CHECK(seq)); 13238 13239 if (it->it_index < 
PyUnicode_GET_LENGTH(seq)) { 13240 int kind = PyUnicode_KIND(seq); 13241 void *data = PyUnicode_DATA(seq); 13242 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index); 13243 item = PyUnicode_FromOrdinal(chr); 13244 if (item != NULL) 13245 ++it->it_index; 13246 return item; 13247 } 13248 13249 Py_DECREF(seq); 13250 it->it_seq = NULL; 13251 return NULL; 13252} 13253 13254static PyObject * 13255unicodeiter_len(unicodeiterobject *it) 13256{ 13257 Py_ssize_t len = 0; 13258 if (it->it_seq) 13259 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index; 13260 return PyLong_FromSsize_t(len); 13261} 13262 13263PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it))."); 13264 13265static PyMethodDef unicodeiter_methods[] = { 13266 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS, 13267 length_hint_doc}, 13268 {NULL, NULL} /* sentinel */ 13269}; 13270 13271PyTypeObject PyUnicodeIter_Type = { 13272 PyVarObject_HEAD_INIT(&PyType_Type, 0) 13273 "str_iterator", /* tp_name */ 13274 sizeof(unicodeiterobject), /* tp_basicsize */ 13275 0, /* tp_itemsize */ 13276 /* methods */ 13277 (destructor)unicodeiter_dealloc, /* tp_dealloc */ 13278 0, /* tp_print */ 13279 0, /* tp_getattr */ 13280 0, /* tp_setattr */ 13281 0, /* tp_reserved */ 13282 0, /* tp_repr */ 13283 0, /* tp_as_number */ 13284 0, /* tp_as_sequence */ 13285 0, /* tp_as_mapping */ 13286 0, /* tp_hash */ 13287 0, /* tp_call */ 13288 0, /* tp_str */ 13289 PyObject_GenericGetAttr, /* tp_getattro */ 13290 0, /* tp_setattro */ 13291 0, /* tp_as_buffer */ 13292 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */ 13293 0, /* tp_doc */ 13294 (traverseproc)unicodeiter_traverse, /* tp_traverse */ 13295 0, /* tp_clear */ 13296 0, /* tp_richcompare */ 13297 0, /* tp_weaklistoffset */ 13298 PyObject_SelfIter, /* tp_iter */ 13299 (iternextfunc)unicodeiter_next, /* tp_iternext */ 13300 unicodeiter_methods, /* tp_methods */ 13301 0, 13302}; 13303 13304static PyObject * 13305unicode_iter(PyObject 
*seq) 13306{ 13307 unicodeiterobject *it; 13308 13309 if (!PyUnicode_Check(seq)) { 13310 PyErr_BadInternalCall(); 13311 return NULL; 13312 } 13313 if (PyUnicode_READY(seq) == -1) 13314 return NULL; 13315 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type); 13316 if (it == NULL) 13317 return NULL; 13318 it->it_index = 0; 13319 Py_INCREF(seq); 13320 it->it_seq = (PyUnicodeObject *)seq; 13321 _PyObject_GC_TRACK(it); 13322 return (PyObject *)it; 13323} 13324 13325#define UNIOP(x) Py_UNICODE_##x 13326#define UNIOP_t Py_UNICODE 13327#include "uniops.h" 13328#undef UNIOP 13329#undef UNIOP_t 13330#define UNIOP(x) Py_UCS4_##x 13331#define UNIOP_t Py_UCS4 13332#include "uniops.h" 13333#undef UNIOP 13334#undef UNIOP_t 13335 13336Py_UNICODE* 13337PyUnicode_AsUnicodeCopy(PyObject *object) 13338{ 13339 PyUnicodeObject *unicode = (PyUnicodeObject *)object; 13340 Py_UNICODE *copy; 13341 Py_ssize_t size; 13342 13343 if (!PyUnicode_Check(unicode)) { 13344 PyErr_BadArgument(); 13345 return NULL; 13346 } 13347 /* Ensure we won't overflow the size. */ 13348 if (PyUnicode_GET_SIZE(unicode) > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) { 13349 PyErr_NoMemory(); 13350 return NULL; 13351 } 13352 size = PyUnicode_GET_SIZE(unicode) + 1; /* copy the nul character */ 13353 size *= sizeof(Py_UNICODE); 13354 copy = PyMem_Malloc(size); 13355 if (copy == NULL) { 13356 PyErr_NoMemory(); 13357 return NULL; 13358 } 13359 memcpy(copy, PyUnicode_AS_UNICODE(unicode), size); 13360 return copy; 13361} 13362 13363/* A _string module, to export formatter_parser and formatter_field_name_split 13364 to the string.Formatter class implemented in Python. 
*/ 13365 13366static PyMethodDef _string_methods[] = { 13367 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split, 13368 METH_O, PyDoc_STR("split the argument as a field name")}, 13369 {"formatter_parser", (PyCFunction) formatter_parser, 13370 METH_O, PyDoc_STR("parse the argument as a format string")}, 13371 {NULL, NULL} 13372}; 13373 13374static struct PyModuleDef _string_module = { 13375 PyModuleDef_HEAD_INIT, 13376 "_string", 13377 PyDoc_STR("string helper module"), 13378 0, 13379 _string_methods, 13380 NULL, 13381 NULL, 13382 NULL, 13383 NULL 13384}; 13385 13386PyMODINIT_FUNC 13387PyInit__string(void) 13388{ 13389 return PyModule_Create(&_string_module); 13390} 13391 13392 13393#ifdef __cplusplus 13394} 13395#endif 13396