unicodeobject.c revision fb9ea8c57eeab6837c830613524c1250488baed1
1/* 2 3Unicode implementation based on original code by Fredrik Lundh, 4modified by Marc-Andre Lemburg <mal@lemburg.com>. 5 6Major speed upgrades to the method implementations at the Reykjavik 7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke. 8 9Copyright (c) Corporation for National Research Initiatives. 10 11-------------------------------------------------------------------- 12The original string type implementation is: 13 14 Copyright (c) 1999 by Secret Labs AB 15 Copyright (c) 1999 by Fredrik Lundh 16 17By obtaining, using, and/or copying this software and/or its 18associated documentation, you agree that you have read, understood, 19and will comply with the following terms and conditions: 20 21Permission to use, copy, modify, and distribute this software and its 22associated documentation for any purpose and without fee is hereby 23granted, provided that the above copyright notice appears in all 24copies, and that both that copyright notice and this permission notice 25appear in supporting documentation, and that the name of Secret Labs 26AB or the author not be used in advertising or publicity pertaining to 27distribution of the software without specific, written prior 28permission. 29 30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO 31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND 32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR 33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT 36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
37-------------------------------------------------------------------- 38 39*/ 40 41#define PY_SSIZE_T_CLEAN 42#include "Python.h" 43#include "ucnhash.h" 44 45#ifdef MS_WINDOWS 46#include <windows.h> 47#endif 48 49#ifdef Py_DEBUG 50# define DONT_MAKE_RESULT_READY 51#endif 52 53/* Limit for the Unicode object free list */ 54 55#define PyUnicode_MAXFREELIST 1024 56 57/* Limit for the Unicode object free list stay alive optimization. 58 59 The implementation will keep allocated Unicode memory intact for 60 all objects on the free list having a size less than this 61 limit. This reduces malloc() overhead for small Unicode objects. 62 63 At worst this will result in PyUnicode_MAXFREELIST * 64 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT + 65 malloc()-overhead) bytes of unused garbage. 66 67 Setting the limit to 0 effectively turns the feature off. 68 69 Note: This is an experimental feature ! If you get core dumps when 70 using Unicode objects, turn this feature off. 71 72*/ 73 74#define KEEPALIVE_SIZE_LIMIT 9 75 76/* Endianness switches; defaults to little endian */ 77 78#ifdef WORDS_BIGENDIAN 79# define BYTEORDER_IS_BIG_ENDIAN 80#else 81# define BYTEORDER_IS_LITTLE_ENDIAN 82#endif 83 84/* --- Globals ------------------------------------------------------------ 85 86 The globals are initialized by the _PyUnicode_Init() API and should 87 not be used before calling that API. 88 89*/ 90 91 92#ifdef __cplusplus 93extern "C" { 94#endif 95 96#ifdef Py_DEBUG 97# define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0) 98#else 99# define _PyUnicode_CHECK(op) PyUnicode_Check(op) 100#endif 101 102#define _PyUnicode_UTF8(op) \ 103 (((PyCompactUnicodeObject*)(op))->utf8) 104#define PyUnicode_UTF8(op) \ 105 (assert(_PyUnicode_CHECK(op)), \ 106 assert(PyUnicode_IS_READY(op)), \ 107 PyUnicode_IS_COMPACT_ASCII(op) ? 
\ 108 ((char*)((PyASCIIObject*)(op) + 1)) : \ 109 _PyUnicode_UTF8(op)) 110#define _PyUnicode_UTF8_LENGTH(op) \ 111 (((PyCompactUnicodeObject*)(op))->utf8_length) 112#define PyUnicode_UTF8_LENGTH(op) \ 113 (assert(_PyUnicode_CHECK(op)), \ 114 assert(PyUnicode_IS_READY(op)), \ 115 PyUnicode_IS_COMPACT_ASCII(op) ? \ 116 ((PyASCIIObject*)(op))->length : \ 117 _PyUnicode_UTF8_LENGTH(op)) 118#define _PyUnicode_WSTR(op) \ 119 (((PyASCIIObject*)(op))->wstr) 120#define _PyUnicode_WSTR_LENGTH(op) \ 121 (((PyCompactUnicodeObject*)(op))->wstr_length) 122#define _PyUnicode_LENGTH(op) \ 123 (((PyASCIIObject *)(op))->length) 124#define _PyUnicode_STATE(op) \ 125 (((PyASCIIObject *)(op))->state) 126#define _PyUnicode_HASH(op) \ 127 (((PyASCIIObject *)(op))->hash) 128#define _PyUnicode_KIND(op) \ 129 (assert(_PyUnicode_CHECK(op)), \ 130 ((PyASCIIObject *)(op))->state.kind) 131#define _PyUnicode_GET_LENGTH(op) \ 132 (assert(_PyUnicode_CHECK(op)), \ 133 ((PyASCIIObject *)(op))->length) 134#define _PyUnicode_DATA_ANY(op) \ 135 (((PyUnicodeObject*)(op))->data.any) 136 137#undef PyUnicode_READY 138#define PyUnicode_READY(op) \ 139 (assert(_PyUnicode_CHECK(op)), \ 140 (PyUnicode_IS_READY(op) ? \ 141 0 : \ 142 _PyUnicode_Ready((PyObject *)(op)))) 143 144#define _PyUnicode_READY_REPLACE(p_obj) \ 145 (assert(_PyUnicode_CHECK(*p_obj)), \ 146 (PyUnicode_IS_READY(*p_obj) ? 
\ 147 0 : _PyUnicode_ReadyReplace((PyObject **)(p_obj)))) 148 149#define _PyUnicode_SHARE_UTF8(op) \ 150 (assert(_PyUnicode_CHECK(op)), \ 151 assert(!PyUnicode_IS_COMPACT_ASCII(op)), \ 152 (_PyUnicode_UTF8(op) == PyUnicode_DATA(op))) 153#define _PyUnicode_SHARE_WSTR(op) \ 154 (assert(_PyUnicode_CHECK(op)), \ 155 (_PyUnicode_WSTR(unicode) == PyUnicode_DATA(op))) 156 157/* true if the Unicode object has an allocated UTF-8 memory block 158 (not shared with other data) */ 159#define _PyUnicode_HAS_UTF8_MEMORY(op) \ 160 (assert(_PyUnicode_CHECK(op)), \ 161 (!PyUnicode_IS_COMPACT_ASCII(op) \ 162 && _PyUnicode_UTF8(op) \ 163 && _PyUnicode_UTF8(op) != PyUnicode_DATA(op))) 164 165/* true if the Unicode object has an allocated wstr memory block 166 (not shared with other data) */ 167#define _PyUnicode_HAS_WSTR_MEMORY(op) \ 168 (assert(_PyUnicode_CHECK(op)), \ 169 (_PyUnicode_WSTR(op) && \ 170 (!PyUnicode_IS_READY(op) || \ 171 _PyUnicode_WSTR(op) != PyUnicode_DATA(op)))) 172 173/* Generic helper macro to convert characters of different types. 174 from_type and to_type have to be valid type names, begin and end 175 are pointers to the source characters which should be of type 176 "from_type *". to is a pointer of type "to_type *" and points to the 177 buffer where the result characters are written to. */ 178#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \ 179 do { \ 180 const from_type *iter_; to_type *to_; \ 181 for (iter_ = (begin), to_ = (to_type *)(to); \ 182 iter_ < (end); \ 183 ++iter_, ++to_) { \ 184 *to_ = (to_type)*iter_; \ 185 } \ 186 } while (0) 187 188/* The Unicode string has been modified: reset the hash */ 189#define _PyUnicode_DIRTY(op) do { _PyUnicode_HASH(op) = -1; } while (0) 190 191/* This dictionary holds all interned unicode strings. Note that references 192 to strings in this dictionary are *not* counted in the string's ob_refcnt. 
193 When the interned string reaches a refcnt of 0 the string deallocation 194 function will delete the reference from this dictionary. 195 196 Another way to look at this is that to say that the actual reference 197 count of a string is: s->ob_refcnt + (s->state ? 2 : 0) 198*/ 199static PyObject *interned; 200 201/* The empty Unicode object is shared to improve performance. */ 202static PyObject *unicode_empty; 203 204/* Single character Unicode strings in the Latin-1 range are being 205 shared as well. */ 206static PyObject *unicode_latin1[256]; 207 208/* Fast detection of the most frequent whitespace characters */ 209const unsigned char _Py_ascii_whitespace[] = { 210 0, 0, 0, 0, 0, 0, 0, 0, 211/* case 0x0009: * CHARACTER TABULATION */ 212/* case 0x000A: * LINE FEED */ 213/* case 0x000B: * LINE TABULATION */ 214/* case 0x000C: * FORM FEED */ 215/* case 0x000D: * CARRIAGE RETURN */ 216 0, 1, 1, 1, 1, 1, 0, 0, 217 0, 0, 0, 0, 0, 0, 0, 0, 218/* case 0x001C: * FILE SEPARATOR */ 219/* case 0x001D: * GROUP SEPARATOR */ 220/* case 0x001E: * RECORD SEPARATOR */ 221/* case 0x001F: * UNIT SEPARATOR */ 222 0, 0, 0, 0, 1, 1, 1, 1, 223/* case 0x0020: * SPACE */ 224 1, 0, 0, 0, 0, 0, 0, 0, 225 0, 0, 0, 0, 0, 0, 0, 0, 226 0, 0, 0, 0, 0, 0, 0, 0, 227 0, 0, 0, 0, 0, 0, 0, 0, 228 229 0, 0, 0, 0, 0, 0, 0, 0, 230 0, 0, 0, 0, 0, 0, 0, 0, 231 0, 0, 0, 0, 0, 0, 0, 0, 232 0, 0, 0, 0, 0, 0, 0, 0, 233 0, 0, 0, 0, 0, 0, 0, 0, 234 0, 0, 0, 0, 0, 0, 0, 0, 235 0, 0, 0, 0, 0, 0, 0, 0, 236 0, 0, 0, 0, 0, 0, 0, 0 237}; 238 239/* forward */ 240static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length); 241static PyObject* get_latin1_char(unsigned char ch); 242static void copy_characters( 243 PyObject *to, Py_ssize_t to_start, 244 PyObject *from, Py_ssize_t from_start, 245 Py_ssize_t how_many); 246static int unicode_is_singleton(PyObject *unicode); 247 248static PyObject * 249unicode_encode_call_errorhandler(const char *errors, 250 PyObject **errorHandler,const char *encoding, const char *reason, 
251 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject, 252 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos); 253 254static void 255raise_encode_exception(PyObject **exceptionObject, 256 const char *encoding, 257 const Py_UNICODE *unicode, Py_ssize_t size, 258 Py_ssize_t startpos, Py_ssize_t endpos, 259 const char *reason); 260 261/* Same for linebreaks */ 262static unsigned char ascii_linebreak[] = { 263 0, 0, 0, 0, 0, 0, 0, 0, 264/* 0x000A, * LINE FEED */ 265/* 0x000B, * LINE TABULATION */ 266/* 0x000C, * FORM FEED */ 267/* 0x000D, * CARRIAGE RETURN */ 268 0, 0, 1, 1, 1, 1, 0, 0, 269 0, 0, 0, 0, 0, 0, 0, 0, 270/* 0x001C, * FILE SEPARATOR */ 271/* 0x001D, * GROUP SEPARATOR */ 272/* 0x001E, * RECORD SEPARATOR */ 273 0, 0, 0, 0, 1, 1, 1, 0, 274 0, 0, 0, 0, 0, 0, 0, 0, 275 0, 0, 0, 0, 0, 0, 0, 0, 276 0, 0, 0, 0, 0, 0, 0, 0, 277 0, 0, 0, 0, 0, 0, 0, 0, 278 279 0, 0, 0, 0, 0, 0, 0, 0, 280 0, 0, 0, 0, 0, 0, 0, 0, 281 0, 0, 0, 0, 0, 0, 0, 0, 282 0, 0, 0, 0, 0, 0, 0, 0, 283 0, 0, 0, 0, 0, 0, 0, 0, 284 0, 0, 0, 0, 0, 0, 0, 0, 285 0, 0, 0, 0, 0, 0, 0, 0, 286 0, 0, 0, 0, 0, 0, 0, 0 287}; 288 289/* The max unicode value is always 0x10FFFF while using the PEP-393 API. 290 This function is kept for backward compatibility with the old API. */ 291Py_UNICODE 292PyUnicode_GetMax(void) 293{ 294#ifdef Py_UNICODE_WIDE 295 return 0x10FFFF; 296#else 297 /* This is actually an illegal character, so it should 298 not be passed to unichr. 
*/ 299 return 0xFFFF; 300#endif 301} 302 303#ifdef Py_DEBUG 304int 305/* FIXME: use PyObject* type for op */ 306_PyUnicode_CheckConsistency(void *op, int check_content) 307{ 308 PyASCIIObject *ascii; 309 unsigned int kind; 310 311 assert(PyUnicode_Check(op)); 312 313 ascii = (PyASCIIObject *)op; 314 kind = ascii->state.kind; 315 316 if (ascii->state.ascii == 1 && ascii->state.compact == 1) { 317 assert(kind == PyUnicode_1BYTE_KIND); 318 assert(ascii->state.ready == 1); 319 } 320 else { 321 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op; 322 void *data; 323 324 if (ascii->state.compact == 1) { 325 data = compact + 1; 326 assert(kind == PyUnicode_1BYTE_KIND 327 || kind == PyUnicode_2BYTE_KIND 328 || kind == PyUnicode_4BYTE_KIND); 329 assert(ascii->state.ascii == 0); 330 assert(ascii->state.ready == 1); 331 assert (compact->utf8 != data); 332 } else { 333 PyUnicodeObject *unicode = (PyUnicodeObject *)op; 334 335 data = unicode->data.any; 336 if (kind == PyUnicode_WCHAR_KIND) { 337 assert(ascii->state.compact == 0); 338 assert(ascii->state.ascii == 0); 339 assert(ascii->state.ready == 0); 340 assert(ascii->wstr != NULL); 341 assert(data == NULL); 342 assert(compact->utf8 == NULL); 343 assert(ascii->state.interned == SSTATE_NOT_INTERNED); 344 } 345 else { 346 assert(kind == PyUnicode_1BYTE_KIND 347 || kind == PyUnicode_2BYTE_KIND 348 || kind == PyUnicode_4BYTE_KIND); 349 assert(ascii->state.compact == 0); 350 assert(ascii->state.ready == 1); 351 assert(data != NULL); 352 if (ascii->state.ascii) { 353 assert (compact->utf8 == data); 354 assert (compact->utf8_length == ascii->length); 355 } 356 else 357 assert (compact->utf8 != data); 358 } 359 } 360 if (kind != PyUnicode_WCHAR_KIND) { 361 if ( 362#if SIZEOF_WCHAR_T == 2 363 kind == PyUnicode_2BYTE_KIND 364#else 365 kind == PyUnicode_4BYTE_KIND 366#endif 367 ) 368 { 369 assert(ascii->wstr == data); 370 assert(compact->wstr_length == ascii->length); 371 } else 372 assert(ascii->wstr != data); 373 } 374 375 
if (compact->utf8 == NULL) 376 assert(compact->utf8_length == 0); 377 if (ascii->wstr == NULL) 378 assert(compact->wstr_length == 0); 379 } 380 /* check that the best kind is used */ 381 if (check_content && kind != PyUnicode_WCHAR_KIND) 382 { 383 Py_ssize_t i; 384 Py_UCS4 maxchar = 0; 385 void *data = PyUnicode_DATA(ascii); 386 for (i=0; i < ascii->length; i++) 387 { 388 Py_UCS4 ch = PyUnicode_READ(kind, data, i); 389 if (ch > maxchar) 390 maxchar = ch; 391 } 392 if (kind == PyUnicode_1BYTE_KIND) { 393 if (ascii->state.ascii == 0) 394 assert(maxchar >= 128); 395 else 396 assert(maxchar < 128); 397 } 398 else if (kind == PyUnicode_2BYTE_KIND) 399 assert(maxchar >= 0x100); 400 else 401 assert(maxchar >= 0x10000); 402 } 403 if (check_content && !unicode_is_singleton((PyObject*)ascii)) 404 assert(ascii->hash == -1); 405 return 1; 406} 407#endif 408 409/* --- Bloom Filters ----------------------------------------------------- */ 410 411/* stuff to implement simple "bloom filters" for Unicode characters. 412 to keep things simple, we use a single bitmask, using the least 5 413 bits from each unicode characters as the bit index. */ 414 415/* the linebreak mask is set up by Unicode_Init below */ 416 417#if LONG_BIT >= 128 418#define BLOOM_WIDTH 128 419#elif LONG_BIT >= 64 420#define BLOOM_WIDTH 64 421#elif LONG_BIT >= 32 422#define BLOOM_WIDTH 32 423#else 424#error "LONG_BIT is smaller than 32" 425#endif 426 427#define BLOOM_MASK unsigned long 428 429static BLOOM_MASK bloom_linebreak; 430 431#define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1))))) 432#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1))))) 433 434#define BLOOM_LINEBREAK(ch) \ 435 ((ch) < 128U ? 
ascii_linebreak[(ch)] : \ 436 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch))) 437 438Py_LOCAL_INLINE(BLOOM_MASK) 439make_bloom_mask(int kind, void* ptr, Py_ssize_t len) 440{ 441 /* calculate simple bloom-style bitmask for a given unicode string */ 442 443 BLOOM_MASK mask; 444 Py_ssize_t i; 445 446 mask = 0; 447 for (i = 0; i < len; i++) 448 BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i)); 449 450 return mask; 451} 452 453#define BLOOM_MEMBER(mask, chr, str) \ 454 (BLOOM(mask, chr) \ 455 && (PyUnicode_FindChar(str, chr, 0, PyUnicode_GET_LENGTH(str), 1) >= 0)) 456 457/* --- Unicode Object ----------------------------------------------------- */ 458 459static PyObject * 460fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s)); 461 462Py_LOCAL_INLINE(char *) findchar(void *s, int kind, 463 Py_ssize_t size, Py_UCS4 ch, 464 int direction) 465{ 466 /* like wcschr, but doesn't stop at NULL characters */ 467 Py_ssize_t i; 468 if (direction == 1) { 469 for(i = 0; i < size; i++) 470 if (PyUnicode_READ(kind, s, i) == ch) 471 return (char*)s + PyUnicode_KIND_SIZE(kind, i); 472 } 473 else { 474 for(i = size-1; i >= 0; i--) 475 if (PyUnicode_READ(kind, s, i) == ch) 476 return (char*)s + PyUnicode_KIND_SIZE(kind, i); 477 } 478 return NULL; 479} 480 481static PyObject* 482resize_compact(PyObject *unicode, Py_ssize_t length) 483{ 484 Py_ssize_t char_size; 485 Py_ssize_t struct_size; 486 Py_ssize_t new_size; 487 int share_wstr; 488 489 assert(PyUnicode_IS_READY(unicode)); 490 char_size = PyUnicode_CHARACTER_SIZE(unicode); 491 if (PyUnicode_IS_COMPACT_ASCII(unicode)) 492 struct_size = sizeof(PyASCIIObject); 493 else 494 struct_size = sizeof(PyCompactUnicodeObject); 495 share_wstr = _PyUnicode_SHARE_WSTR(unicode); 496 497 _Py_DEC_REFTOTAL; 498 _Py_ForgetReference(unicode); 499 500 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) { 501 PyErr_NoMemory(); 502 return NULL; 503 } 504 new_size = (struct_size + (length + 1) * char_size); 505 506 unicode = (PyObject 
*)PyObject_REALLOC((char *)unicode, new_size); 507 if (unicode == NULL) { 508 PyObject_Del(unicode); 509 PyErr_NoMemory(); 510 return NULL; 511 } 512 _Py_NewReference(unicode); 513 _PyUnicode_LENGTH(unicode) = length; 514 if (share_wstr) { 515 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode); 516 if (!PyUnicode_IS_COMPACT_ASCII(unicode)) 517 _PyUnicode_WSTR_LENGTH(unicode) = length; 518 } 519 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode), 520 length, 0); 521 return unicode; 522} 523 524static int 525resize_inplace(PyUnicodeObject *unicode, Py_ssize_t length) 526{ 527 wchar_t *wstr; 528 assert(!PyUnicode_IS_COMPACT(unicode)); 529 assert(Py_REFCNT(unicode) == 1); 530 531 _PyUnicode_DIRTY(unicode); 532 533 if (PyUnicode_IS_READY(unicode)) { 534 Py_ssize_t char_size; 535 Py_ssize_t new_size; 536 int share_wstr, share_utf8; 537 void *data; 538 539 data = _PyUnicode_DATA_ANY(unicode); 540 assert(data != NULL); 541 char_size = PyUnicode_CHARACTER_SIZE(unicode); 542 share_wstr = _PyUnicode_SHARE_WSTR(unicode); 543 share_utf8 = _PyUnicode_SHARE_UTF8(unicode); 544 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode)) 545 { 546 PyObject_DEL(_PyUnicode_UTF8(unicode)); 547 _PyUnicode_UTF8(unicode) = NULL; 548 _PyUnicode_UTF8_LENGTH(unicode) = 0; 549 } 550 551 if (length > (PY_SSIZE_T_MAX / char_size - 1)) { 552 PyErr_NoMemory(); 553 return -1; 554 } 555 new_size = (length + 1) * char_size; 556 557 data = (PyObject *)PyObject_REALLOC(data, new_size); 558 if (data == NULL) { 559 PyErr_NoMemory(); 560 return -1; 561 } 562 _PyUnicode_DATA_ANY(unicode) = data; 563 if (share_wstr) { 564 _PyUnicode_WSTR(unicode) = data; 565 _PyUnicode_WSTR_LENGTH(unicode) = length; 566 } 567 if (share_utf8) { 568 _PyUnicode_UTF8(unicode) = data; 569 _PyUnicode_UTF8_LENGTH(unicode) = length; 570 } 571 _PyUnicode_LENGTH(unicode) = length; 572 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0); 573 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) { 574 
assert(_PyUnicode_CheckConsistency(unicode, 0)); 575 return 0; 576 } 577 } 578 assert(_PyUnicode_WSTR(unicode) != NULL); 579 580 /* check for integer overflow */ 581 if (length > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) { 582 PyErr_NoMemory(); 583 return -1; 584 } 585 wstr = _PyUnicode_WSTR(unicode); 586 wstr = PyObject_REALLOC(wstr, sizeof(wchar_t) * (length + 1)); 587 if (!wstr) { 588 PyErr_NoMemory(); 589 return -1; 590 } 591 _PyUnicode_WSTR(unicode) = wstr; 592 _PyUnicode_WSTR(unicode)[length] = 0; 593 _PyUnicode_WSTR_LENGTH(unicode) = length; 594 assert(_PyUnicode_CheckConsistency(unicode, 0)); 595 return 0; 596} 597 598static PyObject* 599resize_copy(PyObject *unicode, Py_ssize_t length) 600{ 601 Py_ssize_t copy_length; 602 if (PyUnicode_IS_COMPACT(unicode)) { 603 PyObject *copy; 604 assert(PyUnicode_IS_READY(unicode)); 605 606 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode)); 607 if (copy == NULL) 608 return NULL; 609 610 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode)); 611 copy_characters(copy, 0, unicode, 0, copy_length); 612 return copy; 613 } 614 else { 615 PyUnicodeObject *w; 616 assert(_PyUnicode_WSTR(unicode) != NULL); 617 assert(_PyUnicode_DATA_ANY(unicode) == NULL); 618 w = _PyUnicode_New(length); 619 if (w == NULL) 620 return NULL; 621 copy_length = _PyUnicode_WSTR_LENGTH(unicode); 622 copy_length = Py_MIN(copy_length, length); 623 Py_UNICODE_COPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode), 624 copy_length); 625 return (PyObject*)w; 626 } 627} 628 629/* We allocate one more byte to make sure the string is 630 Ux0000 terminated; some code (e.g. new_identifier) 631 relies on that. 632 633 XXX This allocator could further be enhanced by assuring that the 634 free list never reduces its size below 1. 
635 636*/ 637 638#ifdef Py_DEBUG 639int unicode_old_new_calls = 0; 640#endif 641 642static PyUnicodeObject * 643_PyUnicode_New(Py_ssize_t length) 644{ 645 register PyUnicodeObject *unicode; 646 size_t new_size; 647 648 /* Optimization for empty strings */ 649 if (length == 0 && unicode_empty != NULL) { 650 Py_INCREF(unicode_empty); 651 return (PyUnicodeObject*)unicode_empty; 652 } 653 654 /* Ensure we won't overflow the size. */ 655 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) { 656 return (PyUnicodeObject *)PyErr_NoMemory(); 657 } 658 if (length < 0) { 659 PyErr_SetString(PyExc_SystemError, 660 "Negative size passed to _PyUnicode_New"); 661 return NULL; 662 } 663 664#ifdef Py_DEBUG 665 ++unicode_old_new_calls; 666#endif 667 668 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type); 669 if (unicode == NULL) 670 return NULL; 671 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1); 672 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size); 673 if (!_PyUnicode_WSTR(unicode)) { 674 PyErr_NoMemory(); 675 goto onError; 676 } 677 678 /* Initialize the first element to guard against cases where 679 * the caller fails before initializing str -- unicode_resize() 680 * reads str[0], and the Keep-Alive optimization can keep memory 681 * allocated for str alive across a call to unicode_dealloc(unicode). 682 * We don't want unicode_resize to read uninitialized memory in 683 * that case. 
684 */ 685 _PyUnicode_WSTR(unicode)[0] = 0; 686 _PyUnicode_WSTR(unicode)[length] = 0; 687 _PyUnicode_WSTR_LENGTH(unicode) = length; 688 _PyUnicode_HASH(unicode) = -1; 689 _PyUnicode_STATE(unicode).interned = 0; 690 _PyUnicode_STATE(unicode).kind = 0; 691 _PyUnicode_STATE(unicode).compact = 0; 692 _PyUnicode_STATE(unicode).ready = 0; 693 _PyUnicode_STATE(unicode).ascii = 0; 694 _PyUnicode_DATA_ANY(unicode) = NULL; 695 _PyUnicode_LENGTH(unicode) = 0; 696 _PyUnicode_UTF8(unicode) = NULL; 697 _PyUnicode_UTF8_LENGTH(unicode) = 0; 698 return unicode; 699 700 onError: 701 /* XXX UNREF/NEWREF interface should be more symmetrical */ 702 _Py_DEC_REFTOTAL; 703 _Py_ForgetReference((PyObject *)unicode); 704 PyObject_Del(unicode); 705 return NULL; 706} 707 708static const char* 709unicode_kind_name(PyObject *unicode) 710{ 711 /* don't check consistency: unicode_kind_name() is called from 712 _PyUnicode_Dump() */ 713 if (!PyUnicode_IS_COMPACT(unicode)) 714 { 715 if (!PyUnicode_IS_READY(unicode)) 716 return "wstr"; 717 switch(PyUnicode_KIND(unicode)) 718 { 719 case PyUnicode_1BYTE_KIND: 720 if (PyUnicode_IS_ASCII(unicode)) 721 return "legacy ascii"; 722 else 723 return "legacy latin1"; 724 case PyUnicode_2BYTE_KIND: 725 return "legacy UCS2"; 726 case PyUnicode_4BYTE_KIND: 727 return "legacy UCS4"; 728 default: 729 return "<legacy invalid kind>"; 730 } 731 } 732 assert(PyUnicode_IS_READY(unicode)); 733 switch(PyUnicode_KIND(unicode)) 734 { 735 case PyUnicode_1BYTE_KIND: 736 if (PyUnicode_IS_ASCII(unicode)) 737 return "ascii"; 738 else 739 return "latin1"; 740 case PyUnicode_2BYTE_KIND: 741 return "UCS2"; 742 case PyUnicode_4BYTE_KIND: 743 return "UCS4"; 744 default: 745 return "<invalid compact kind>"; 746 } 747} 748 749#ifdef Py_DEBUG 750int unicode_new_new_calls = 0; 751 752/* Functions wrapping macros for use in debugger */ 753char *_PyUnicode_utf8(void *unicode){ 754 return PyUnicode_UTF8(unicode); 755} 756 757void *_PyUnicode_compact_data(void *unicode) { 758 return 
_PyUnicode_COMPACT_DATA(unicode); 759} 760void *_PyUnicode_data(void *unicode){ 761 printf("obj %p\n", unicode); 762 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode)); 763 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode)); 764 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1))); 765 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1))); 766 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode)); 767 return PyUnicode_DATA(unicode); 768} 769 770void 771_PyUnicode_Dump(PyObject *op) 772{ 773 PyASCIIObject *ascii = (PyASCIIObject *)op; 774 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op; 775 PyUnicodeObject *unicode = (PyUnicodeObject *)op; 776 void *data; 777 printf("%s: len=%zu, ",unicode_kind_name(op), ascii->length); 778 if (ascii->state.compact) 779 data = (compact + 1); 780 else 781 data = unicode->data.any; 782 if (ascii->wstr == data) 783 printf("shared "); 784 printf("wstr=%p", ascii->wstr); 785 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) { 786 printf(" (%zu), ", compact->wstr_length); 787 if (!ascii->state.compact && compact->utf8 == unicode->data.any) 788 printf("shared "); 789 printf("utf8=%p (%zu)", compact->utf8, compact->utf8_length); 790 } 791 printf(", data=%p\n", data); 792} 793#endif 794 795PyObject * 796PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar) 797{ 798 PyObject *obj; 799 PyCompactUnicodeObject *unicode; 800 void *data; 801 int kind_state; 802 int is_sharing, is_ascii; 803 Py_ssize_t char_size; 804 Py_ssize_t struct_size; 805 806 /* Optimization for empty strings */ 807 if (size == 0 && unicode_empty != NULL) { 808 Py_INCREF(unicode_empty); 809 return unicode_empty; 810 } 811 812#ifdef Py_DEBUG 813 ++unicode_new_new_calls; 814#endif 815 816 is_ascii = 0; 817 is_sharing = 0; 818 struct_size = sizeof(PyCompactUnicodeObject); 819 if (maxchar < 128) { 820 kind_state = PyUnicode_1BYTE_KIND; 821 char_size = 1; 822 is_ascii = 1; 823 struct_size = 
sizeof(PyASCIIObject); 824 } 825 else if (maxchar < 256) { 826 kind_state = PyUnicode_1BYTE_KIND; 827 char_size = 1; 828 } 829 else if (maxchar < 65536) { 830 kind_state = PyUnicode_2BYTE_KIND; 831 char_size = 2; 832 if (sizeof(wchar_t) == 2) 833 is_sharing = 1; 834 } 835 else { 836 kind_state = PyUnicode_4BYTE_KIND; 837 char_size = 4; 838 if (sizeof(wchar_t) == 4) 839 is_sharing = 1; 840 } 841 842 /* Ensure we won't overflow the size. */ 843 if (size < 0) { 844 PyErr_SetString(PyExc_SystemError, 845 "Negative size passed to PyUnicode_New"); 846 return NULL; 847 } 848 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) 849 return PyErr_NoMemory(); 850 851 /* Duplicated allocation code from _PyObject_New() instead of a call to 852 * PyObject_New() so we are able to allocate space for the object and 853 * it's data buffer. 854 */ 855 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size); 856 if (obj == NULL) 857 return PyErr_NoMemory(); 858 obj = PyObject_INIT(obj, &PyUnicode_Type); 859 if (obj == NULL) 860 return NULL; 861 862 unicode = (PyCompactUnicodeObject *)obj; 863 if (is_ascii) 864 data = ((PyASCIIObject*)obj) + 1; 865 else 866 data = unicode + 1; 867 _PyUnicode_LENGTH(unicode) = size; 868 _PyUnicode_HASH(unicode) = -1; 869 _PyUnicode_STATE(unicode).interned = 0; 870 _PyUnicode_STATE(unicode).kind = kind_state; 871 _PyUnicode_STATE(unicode).compact = 1; 872 _PyUnicode_STATE(unicode).ready = 1; 873 _PyUnicode_STATE(unicode).ascii = is_ascii; 874 if (is_ascii) { 875 ((char*)data)[size] = 0; 876 _PyUnicode_WSTR(unicode) = NULL; 877 } 878 else if (kind_state == PyUnicode_1BYTE_KIND) { 879 ((char*)data)[size] = 0; 880 _PyUnicode_WSTR(unicode) = NULL; 881 _PyUnicode_WSTR_LENGTH(unicode) = 0; 882 unicode->utf8 = NULL; 883 unicode->utf8_length = 0; 884 } 885 else { 886 unicode->utf8 = NULL; 887 unicode->utf8_length = 0; 888 if (kind_state == PyUnicode_2BYTE_KIND) 889 ((Py_UCS2*)data)[size] = 0; 890 else /* kind_state == 
PyUnicode_4BYTE_KIND */ 891 ((Py_UCS4*)data)[size] = 0; 892 if (is_sharing) { 893 _PyUnicode_WSTR_LENGTH(unicode) = size; 894 _PyUnicode_WSTR(unicode) = (wchar_t *)data; 895 } 896 else { 897 _PyUnicode_WSTR_LENGTH(unicode) = 0; 898 _PyUnicode_WSTR(unicode) = NULL; 899 } 900 } 901 assert(_PyUnicode_CheckConsistency(unicode, 0)); 902 return obj; 903} 904 905#if SIZEOF_WCHAR_T == 2 906/* Helper function to convert a 16-bits wchar_t representation to UCS4, this 907 will decode surrogate pairs, the other conversions are implemented as macros 908 for efficiency. 909 910 This function assumes that unicode can hold one more code point than wstr 911 characters for a terminating null character. */ 912static void 913unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end, 914 PyUnicodeObject *unicode) 915{ 916 const wchar_t *iter; 917 Py_UCS4 *ucs4_out; 918 919 assert(unicode != NULL); 920 assert(_PyUnicode_CHECK(unicode)); 921 assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND); 922 ucs4_out = PyUnicode_4BYTE_DATA(unicode); 923 924 for (iter = begin; iter < end; ) { 925 assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) + 926 _PyUnicode_GET_LENGTH(unicode))); 927 if (*iter >= 0xD800 && *iter <= 0xDBFF 928 && (iter+1) < end && iter[1] >= 0xDC00 && iter[1] <= 0xDFFF) 929 { 930 *ucs4_out++ = (((iter[0] & 0x3FF)<<10) | (iter[1] & 0x3FF)) + 0x10000; 931 iter += 2; 932 } 933 else { 934 *ucs4_out++ = *iter; 935 iter++; 936 } 937 } 938 assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) + 939 _PyUnicode_GET_LENGTH(unicode))); 940 941} 942#endif 943 944static int 945_PyUnicode_Dirty(PyObject *unicode) 946{ 947 assert(_PyUnicode_CHECK(unicode)); 948 if (Py_REFCNT(unicode) != 1) { 949 PyErr_SetString(PyExc_SystemError, 950 "Cannot modify a string having more than 1 reference"); 951 return -1; 952 } 953 _PyUnicode_DIRTY(unicode); 954 return 0; 955} 956 957static int 958_copy_characters(PyObject *to, Py_ssize_t to_start, 959 PyObject *from, Py_ssize_t from_start, 960 
Py_ssize_t how_many, int check_maxchar) 961{ 962 unsigned int from_kind, to_kind; 963 void *from_data, *to_data; 964 int fast; 965 966 assert(PyUnicode_Check(from)); 967 assert(PyUnicode_Check(to)); 968 assert(PyUnicode_IS_READY(from)); 969 assert(PyUnicode_IS_READY(to)); 970 971 assert(PyUnicode_GET_LENGTH(from) >= how_many); 972 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to)); 973 assert(0 <= how_many); 974 975 if (how_many == 0) 976 return 0; 977 978 from_kind = PyUnicode_KIND(from); 979 from_data = PyUnicode_DATA(from); 980 to_kind = PyUnicode_KIND(to); 981 to_data = PyUnicode_DATA(to); 982 983#ifdef Py_DEBUG 984 if (!check_maxchar 985 && (from_kind > to_kind 986 || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))) 987 { 988 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to); 989 Py_UCS4 ch; 990 Py_ssize_t i; 991 for (i=0; i < how_many; i++) { 992 ch = PyUnicode_READ(from_kind, from_data, from_start + i); 993 assert(ch <= to_maxchar); 994 } 995 } 996#endif 997 fast = (from_kind == to_kind); 998 if (check_maxchar 999 && (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))) 1000 { 1001 /* deny latin1 => ascii */ 1002 fast = 0; 1003 } 1004 1005 if (fast) { 1006 Py_MEMCPY((char*)to_data 1007 + PyUnicode_KIND_SIZE(to_kind, to_start), 1008 (char*)from_data 1009 + PyUnicode_KIND_SIZE(from_kind, from_start), 1010 PyUnicode_KIND_SIZE(to_kind, how_many)); 1011 } 1012 else if (from_kind == PyUnicode_1BYTE_KIND 1013 && to_kind == PyUnicode_2BYTE_KIND) 1014 { 1015 _PyUnicode_CONVERT_BYTES( 1016 Py_UCS1, Py_UCS2, 1017 PyUnicode_1BYTE_DATA(from) + from_start, 1018 PyUnicode_1BYTE_DATA(from) + from_start + how_many, 1019 PyUnicode_2BYTE_DATA(to) + to_start 1020 ); 1021 } 1022 else if (from_kind == PyUnicode_1BYTE_KIND 1023 && to_kind == PyUnicode_4BYTE_KIND) 1024 { 1025 _PyUnicode_CONVERT_BYTES( 1026 Py_UCS1, Py_UCS4, 1027 PyUnicode_1BYTE_DATA(from) + from_start, 1028 PyUnicode_1BYTE_DATA(from) + from_start + how_many, 1029 PyUnicode_4BYTE_DATA(to) + 
to_start 1030 ); 1031 } 1032 else if (from_kind == PyUnicode_2BYTE_KIND 1033 && to_kind == PyUnicode_4BYTE_KIND) 1034 { 1035 _PyUnicode_CONVERT_BYTES( 1036 Py_UCS2, Py_UCS4, 1037 PyUnicode_2BYTE_DATA(from) + from_start, 1038 PyUnicode_2BYTE_DATA(from) + from_start + how_many, 1039 PyUnicode_4BYTE_DATA(to) + to_start 1040 ); 1041 } 1042 else { 1043 /* check if max_char(from substring) <= max_char(to) */ 1044 if (from_kind > to_kind 1045 /* latin1 => ascii */ 1046 || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))) 1047 { 1048 /* slow path to check for character overflow */ 1049 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to); 1050 Py_UCS4 ch; 1051 Py_ssize_t i; 1052 1053 for (i=0; i < how_many; i++) { 1054 ch = PyUnicode_READ(from_kind, from_data, from_start + i); 1055 if (check_maxchar) { 1056 if (ch > to_maxchar) 1057 return 1; 1058 } 1059 else { 1060 assert(ch <= to_maxchar); 1061 } 1062 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch); 1063 } 1064 } 1065 else { 1066 return -1; 1067 } 1068 } 1069 return 0; 1070} 1071 1072static void 1073copy_characters(PyObject *to, Py_ssize_t to_start, 1074 PyObject *from, Py_ssize_t from_start, 1075 Py_ssize_t how_many) 1076{ 1077 (void)_copy_characters(to, to_start, from, from_start, how_many, 0); 1078} 1079 1080Py_ssize_t 1081PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start, 1082 PyObject *from, Py_ssize_t from_start, 1083 Py_ssize_t how_many) 1084{ 1085 int err; 1086 1087 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) { 1088 PyErr_BadInternalCall(); 1089 return -1; 1090 } 1091 1092 if (PyUnicode_READY(from)) 1093 return -1; 1094 if (PyUnicode_READY(to)) 1095 return -1; 1096 1097 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many); 1098 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) { 1099 PyErr_Format(PyExc_SystemError, 1100 "Cannot write %zi characters at %zi " 1101 "in a string of %zi characters", 1102 how_many, to_start, PyUnicode_GET_LENGTH(to)); 1103 return -1; 1104 } 1105 1106 
if (how_many == 0) 1107 return 0; 1108 1109 if (_PyUnicode_Dirty(to)) 1110 return -1; 1111 1112 err = _copy_characters(to, to_start, from, from_start, how_many, 1); 1113 if (err) { 1114 PyErr_Format(PyExc_SystemError, 1115 "Cannot copy %s characters " 1116 "into a string of %s characters", 1117 unicode_kind_name(from), 1118 unicode_kind_name(to)); 1119 return -1; 1120 } 1121 return how_many; 1122} 1123 1124/* Find the maximum code point and count the number of surrogate pairs so a 1125 correct string length can be computed before converting a string to UCS4. 1126 This function counts single surrogates as a character and not as a pair. 1127 1128 Return 0 on success, or -1 on error. */ 1129static int 1130find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end, 1131 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates) 1132{ 1133 const wchar_t *iter; 1134 1135 assert(num_surrogates != NULL && maxchar != NULL); 1136 *num_surrogates = 0; 1137 *maxchar = 0; 1138 1139 for (iter = begin; iter < end; ) { 1140 if (*iter > *maxchar) { 1141 *maxchar = *iter; 1142#if SIZEOF_WCHAR_T != 2 1143 if (*maxchar >= 0x10000) 1144 return 0; 1145#endif 1146 } 1147#if SIZEOF_WCHAR_T == 2 1148 if (*iter >= 0xD800 && *iter <= 0xDBFF 1149 && (iter+1) < end && iter[1] >= 0xDC00 && iter[1] <= 0xDFFF) 1150 { 1151 Py_UCS4 surrogate_val; 1152 surrogate_val = (((iter[0] & 0x3FF)<<10) 1153 | (iter[1] & 0x3FF)) + 0x10000; 1154 ++(*num_surrogates); 1155 if (surrogate_val > *maxchar) 1156 *maxchar = surrogate_val; 1157 iter += 2; 1158 } 1159 else 1160 iter++; 1161#else 1162 iter++; 1163#endif 1164 } 1165 return 0; 1166} 1167 1168#ifdef Py_DEBUG 1169int unicode_ready_calls = 0; 1170#endif 1171 1172static int 1173unicode_ready(PyObject **p_obj, int replace) 1174{ 1175 PyUnicodeObject *unicode; 1176 wchar_t *end; 1177 Py_UCS4 maxchar = 0; 1178 Py_ssize_t num_surrogates; 1179#if SIZEOF_WCHAR_T == 2 1180 Py_ssize_t length_wo_surrogates; 1181#endif 1182 1183 assert(p_obj != NULL); 1184 unicode = 
(PyUnicodeObject *)*p_obj; 1185 1186 /* _PyUnicode_Ready() is only intended for old-style API usage where 1187 strings were created using _PyObject_New() and where no canonical 1188 representation (the str field) has been set yet aka strings 1189 which are not yet ready. */ 1190 assert(_PyUnicode_CHECK(unicode)); 1191 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND); 1192 assert(_PyUnicode_WSTR(unicode) != NULL); 1193 assert(_PyUnicode_DATA_ANY(unicode) == NULL); 1194 assert(_PyUnicode_UTF8(unicode) == NULL); 1195 /* Actually, it should neither be interned nor be anything else: */ 1196 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED); 1197 1198#ifdef Py_DEBUG 1199 ++unicode_ready_calls; 1200#endif 1201 1202#ifdef Py_DEBUG 1203 assert(!replace || Py_REFCNT(unicode) == 1); 1204#else 1205 if (replace && Py_REFCNT(unicode) != 1) 1206 replace = 0; 1207#endif 1208 if (replace) { 1209 Py_ssize_t len = _PyUnicode_WSTR_LENGTH(unicode); 1210 wchar_t *wstr = _PyUnicode_WSTR(unicode); 1211 /* Optimization for empty strings */ 1212 if (len == 0) { 1213 Py_INCREF(unicode_empty); 1214 Py_DECREF(*p_obj); 1215 *p_obj = unicode_empty; 1216 return 0; 1217 } 1218 if (len == 1 && wstr[0] < 256) { 1219 PyObject *latin1_char = get_latin1_char((unsigned char)wstr[0]); 1220 if (latin1_char == NULL) 1221 return -1; 1222 Py_DECREF(*p_obj); 1223 *p_obj = latin1_char; 1224 return 0; 1225 } 1226 } 1227 1228 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode); 1229 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end, 1230 &maxchar, &num_surrogates) == -1) 1231 return -1; 1232 1233 if (maxchar < 256) { 1234 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1); 1235 if (!_PyUnicode_DATA_ANY(unicode)) { 1236 PyErr_NoMemory(); 1237 return -1; 1238 } 1239 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char, 1240 _PyUnicode_WSTR(unicode), end, 1241 PyUnicode_1BYTE_DATA(unicode)); 1242 
PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0'; 1243 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode); 1244 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND; 1245 if (maxchar < 128) { 1246 _PyUnicode_STATE(unicode).ascii = 1; 1247 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode); 1248 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode); 1249 } 1250 else { 1251 _PyUnicode_STATE(unicode).ascii = 0; 1252 _PyUnicode_UTF8(unicode) = NULL; 1253 _PyUnicode_UTF8_LENGTH(unicode) = 0; 1254 } 1255 PyObject_FREE(_PyUnicode_WSTR(unicode)); 1256 _PyUnicode_WSTR(unicode) = NULL; 1257 _PyUnicode_WSTR_LENGTH(unicode) = 0; 1258 } 1259 /* In this case we might have to convert down from 4-byte native 1260 wchar_t to 2-byte unicode. */ 1261 else if (maxchar < 65536) { 1262 assert(num_surrogates == 0 && 1263 "FindMaxCharAndNumSurrogatePairs() messed up"); 1264 1265#if SIZEOF_WCHAR_T == 2 1266 /* We can share representations and are done. */ 1267 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode); 1268 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0'; 1269 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode); 1270 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND; 1271 _PyUnicode_UTF8(unicode) = NULL; 1272 _PyUnicode_UTF8_LENGTH(unicode) = 0; 1273#else 1274 /* sizeof(wchar_t) == 4 */ 1275 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC( 1276 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1)); 1277 if (!_PyUnicode_DATA_ANY(unicode)) { 1278 PyErr_NoMemory(); 1279 return -1; 1280 } 1281 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2, 1282 _PyUnicode_WSTR(unicode), end, 1283 PyUnicode_2BYTE_DATA(unicode)); 1284 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0'; 1285 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode); 1286 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND; 1287 _PyUnicode_UTF8(unicode) = NULL; 1288 _PyUnicode_UTF8_LENGTH(unicode) = 0; 1289 
PyObject_FREE(_PyUnicode_WSTR(unicode)); 1290 _PyUnicode_WSTR(unicode) = NULL; 1291 _PyUnicode_WSTR_LENGTH(unicode) = 0; 1292#endif 1293 } 1294 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */ 1295 else { 1296#if SIZEOF_WCHAR_T == 2 1297 /* in case the native representation is 2-bytes, we need to allocate a 1298 new normalized 4-byte version. */ 1299 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates; 1300 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1)); 1301 if (!_PyUnicode_DATA_ANY(unicode)) { 1302 PyErr_NoMemory(); 1303 return -1; 1304 } 1305 _PyUnicode_LENGTH(unicode) = length_wo_surrogates; 1306 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND; 1307 _PyUnicode_UTF8(unicode) = NULL; 1308 _PyUnicode_UTF8_LENGTH(unicode) = 0; 1309 /* unicode_convert_wchar_to_ucs4() requires a ready string */ 1310 _PyUnicode_STATE(unicode).ready = 1; 1311 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode); 1312 PyObject_FREE(_PyUnicode_WSTR(unicode)); 1313 _PyUnicode_WSTR(unicode) = NULL; 1314 _PyUnicode_WSTR_LENGTH(unicode) = 0; 1315#else 1316 assert(num_surrogates == 0); 1317 1318 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode); 1319 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode); 1320 _PyUnicode_UTF8(unicode) = NULL; 1321 _PyUnicode_UTF8_LENGTH(unicode) = 0; 1322 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND; 1323#endif 1324 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0'; 1325 } 1326 _PyUnicode_STATE(unicode).ready = 1; 1327 assert(_PyUnicode_CheckConsistency(unicode, 1)); 1328 return 0; 1329} 1330 1331int 1332_PyUnicode_ReadyReplace(PyObject **op) 1333{ 1334 return unicode_ready(op, 1); 1335} 1336 1337int 1338_PyUnicode_Ready(PyObject *op) 1339{ 1340 return unicode_ready(&op, 0); 1341} 1342 1343static void 1344unicode_dealloc(register PyUnicodeObject *unicode) 1345{ 1346 switch (PyUnicode_CHECK_INTERNED(unicode)) { 1347 case 
SSTATE_NOT_INTERNED: 1348 break; 1349 1350 case SSTATE_INTERNED_MORTAL: 1351 /* revive dead object temporarily for DelItem */ 1352 Py_REFCNT(unicode) = 3; 1353 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0) 1354 Py_FatalError( 1355 "deletion of interned string failed"); 1356 break; 1357 1358 case SSTATE_INTERNED_IMMORTAL: 1359 Py_FatalError("Immortal interned string died."); 1360 1361 default: 1362 Py_FatalError("Inconsistent interned string state."); 1363 } 1364 1365 if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) 1366 PyObject_DEL(_PyUnicode_WSTR(unicode)); 1367 if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) 1368 PyObject_DEL(_PyUnicode_UTF8(unicode)); 1369 1370 if (PyUnicode_IS_COMPACT(unicode)) { 1371 Py_TYPE(unicode)->tp_free((PyObject *)unicode); 1372 } 1373 else { 1374 if (_PyUnicode_DATA_ANY(unicode)) 1375 PyObject_DEL(_PyUnicode_DATA_ANY(unicode)); 1376 Py_TYPE(unicode)->tp_free((PyObject *)unicode); 1377 } 1378} 1379 1380#ifdef Py_DEBUG 1381static int 1382unicode_is_singleton(PyObject *unicode) 1383{ 1384 PyASCIIObject *ascii = (PyASCIIObject *)unicode; 1385 if (unicode == unicode_empty) 1386 return 1; 1387 if (ascii->state.kind != PyUnicode_WCHAR_KIND && ascii->length == 1) 1388 { 1389 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0); 1390 if (ch < 256 && unicode_latin1[ch] == unicode) 1391 return 1; 1392 } 1393 return 0; 1394} 1395#endif 1396 1397static int 1398unicode_resizable(PyObject *unicode) 1399{ 1400 if (Py_REFCNT(unicode) != 1) 1401 return 0; 1402 if (PyUnicode_CHECK_INTERNED(unicode)) 1403 return 0; 1404#ifdef Py_DEBUG 1405 /* singleton refcount is greater than 1 */ 1406 assert(!unicode_is_singleton(unicode)); 1407#endif 1408 return 1; 1409} 1410 1411static int 1412unicode_resize(PyObject **p_unicode, Py_ssize_t length) 1413{ 1414 PyObject *unicode; 1415 Py_ssize_t old_length; 1416 1417 assert(p_unicode != NULL); 1418 unicode = *p_unicode; 1419 1420 assert(unicode != NULL); 1421 assert(PyUnicode_Check(unicode)); 1422 assert(0 <= length); 1423 1424 if 
(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND) 1425 old_length = PyUnicode_WSTR_LENGTH(unicode); 1426 else 1427 old_length = PyUnicode_GET_LENGTH(unicode); 1428 if (old_length == length) 1429 return 0; 1430 1431 if (!unicode_resizable(unicode)) { 1432 PyObject *copy = resize_copy(unicode, length); 1433 if (copy == NULL) 1434 return -1; 1435 Py_DECREF(*p_unicode); 1436 *p_unicode = copy; 1437 return 0; 1438 } 1439 1440 if (PyUnicode_IS_COMPACT(unicode)) { 1441 *p_unicode = resize_compact(unicode, length); 1442 if (*p_unicode == NULL) 1443 return -1; 1444 assert(_PyUnicode_CheckConsistency(*p_unicode, 0)); 1445 return 0; 1446 } 1447 return resize_inplace((PyUnicodeObject*)unicode, length); 1448} 1449 1450int 1451PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length) 1452{ 1453 PyObject *unicode; 1454 if (p_unicode == NULL) { 1455 PyErr_BadInternalCall(); 1456 return -1; 1457 } 1458 unicode = *p_unicode; 1459 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0 1460 || _PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) 1461 { 1462 PyErr_BadInternalCall(); 1463 return -1; 1464 } 1465 return unicode_resize(p_unicode, length); 1466} 1467 1468static PyObject* 1469get_latin1_char(unsigned char ch) 1470{ 1471 PyObject *unicode = unicode_latin1[ch]; 1472 if (!unicode) { 1473 unicode = PyUnicode_New(1, ch); 1474 if (!unicode) 1475 return NULL; 1476 PyUnicode_1BYTE_DATA(unicode)[0] = ch; 1477 assert(_PyUnicode_CheckConsistency(unicode, 1)); 1478 unicode_latin1[ch] = unicode; 1479 } 1480 Py_INCREF(unicode); 1481 return unicode; 1482} 1483 1484PyObject * 1485PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size) 1486{ 1487 PyUnicodeObject *unicode; 1488 Py_UCS4 maxchar = 0; 1489 Py_ssize_t num_surrogates; 1490 1491 if (u == NULL) 1492 return (PyObject*)_PyUnicode_New(size); 1493 1494 /* If the Unicode data is known at construction time, we can apply 1495 some optimizations which share commonly used objects. 
*/ 1496 1497 /* Optimization for empty strings */ 1498 if (size == 0 && unicode_empty != NULL) { 1499 Py_INCREF(unicode_empty); 1500 return unicode_empty; 1501 } 1502 1503 /* Single character Unicode objects in the Latin-1 range are 1504 shared when using this constructor */ 1505 if (size == 1 && *u < 256) 1506 return get_latin1_char((unsigned char)*u); 1507 1508 /* If not empty and not single character, copy the Unicode data 1509 into the new object */ 1510 if (find_maxchar_surrogates(u, u + size, 1511 &maxchar, &num_surrogates) == -1) 1512 return NULL; 1513 1514 unicode = (PyUnicodeObject *) PyUnicode_New(size - num_surrogates, 1515 maxchar); 1516 if (!unicode) 1517 return NULL; 1518 1519 switch (PyUnicode_KIND(unicode)) { 1520 case PyUnicode_1BYTE_KIND: 1521 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char, 1522 u, u + size, PyUnicode_1BYTE_DATA(unicode)); 1523 break; 1524 case PyUnicode_2BYTE_KIND: 1525#if Py_UNICODE_SIZE == 2 1526 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2); 1527#else 1528 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2, 1529 u, u + size, PyUnicode_2BYTE_DATA(unicode)); 1530#endif 1531 break; 1532 case PyUnicode_4BYTE_KIND: 1533#if SIZEOF_WCHAR_T == 2 1534 /* This is the only case which has to process surrogates, thus 1535 a simple copy loop is not enough and we need a function. 
*/ 1536 unicode_convert_wchar_to_ucs4(u, u + size, unicode); 1537#else 1538 assert(num_surrogates == 0); 1539 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4); 1540#endif 1541 break; 1542 default: 1543 assert(0 && "Impossible state"); 1544 } 1545 1546 assert(_PyUnicode_CheckConsistency(unicode, 1)); 1547 return (PyObject *)unicode; 1548} 1549 1550PyObject * 1551PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size) 1552{ 1553 PyUnicodeObject *unicode; 1554 1555 if (size < 0) { 1556 PyErr_SetString(PyExc_SystemError, 1557 "Negative size passed to PyUnicode_FromStringAndSize"); 1558 return NULL; 1559 } 1560 1561 /* If the Unicode data is known at construction time, we can apply 1562 some optimizations which share commonly used objects. 1563 Also, this means the input must be UTF-8, so fall back to the 1564 UTF-8 decoder at the end. */ 1565 if (u != NULL) { 1566 1567 /* Optimization for empty strings */ 1568 if (size == 0 && unicode_empty != NULL) { 1569 Py_INCREF(unicode_empty); 1570 return unicode_empty; 1571 } 1572 1573 /* Single characters are shared when using this constructor. 1574 Restrict to ASCII, since the input must be UTF-8. 
*/ 1575 if (size == 1 && Py_CHARMASK(*u) < 128) 1576 return get_latin1_char(Py_CHARMASK(*u)); 1577 1578 return PyUnicode_DecodeUTF8(u, size, NULL); 1579 } 1580 1581 unicode = _PyUnicode_New(size); 1582 if (!unicode) 1583 return NULL; 1584 1585 return (PyObject *)unicode; 1586} 1587 1588PyObject * 1589PyUnicode_FromString(const char *u) 1590{ 1591 size_t size = strlen(u); 1592 if (size > PY_SSIZE_T_MAX) { 1593 PyErr_SetString(PyExc_OverflowError, "input too long"); 1594 return NULL; 1595 } 1596 1597 return PyUnicode_FromStringAndSize(u, size); 1598} 1599 1600static PyObject* 1601unicode_fromascii(const unsigned char* s, Py_ssize_t size) 1602{ 1603 PyObject *res; 1604#ifdef Py_DEBUG 1605 const unsigned char *p; 1606 const unsigned char *end = s + size; 1607 for (p=s; p < end; p++) { 1608 assert(*p < 128); 1609 } 1610#endif 1611 res = PyUnicode_New(size, 127); 1612 if (!res) 1613 return NULL; 1614 memcpy(PyUnicode_1BYTE_DATA(res), s, size); 1615 return res; 1616} 1617 1618static Py_UCS4 1619kind_maxchar_limit(unsigned int kind) 1620{ 1621 switch(kind) { 1622 case PyUnicode_1BYTE_KIND: 1623 return 0x80; 1624 case PyUnicode_2BYTE_KIND: 1625 return 0x100; 1626 case PyUnicode_4BYTE_KIND: 1627 return 0x10000; 1628 default: 1629 assert(0 && "invalid kind"); 1630 return 0x10ffff; 1631 } 1632} 1633 1634static PyObject* 1635_PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size) 1636{ 1637 PyObject *res; 1638 unsigned char max_char = 127; 1639 Py_ssize_t i; 1640 1641 assert(size >= 0); 1642 for (i = 0; i < size; i++) { 1643 if (u[i] & 0x80) { 1644 max_char = 255; 1645 break; 1646 } 1647 } 1648 res = PyUnicode_New(size, max_char); 1649 if (!res) 1650 return NULL; 1651 memcpy(PyUnicode_1BYTE_DATA(res), u, size); 1652 assert(_PyUnicode_CheckConsistency(res, 1)); 1653 return res; 1654} 1655 1656static PyObject* 1657_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size) 1658{ 1659 PyObject *res; 1660 Py_UCS2 max_char = 0; 1661 Py_ssize_t i; 1662 1663 assert(size >= 0); 1664 
for (i = 0; i < size; i++) { 1665 if (u[i] > max_char) { 1666 max_char = u[i]; 1667 if (max_char >= 256) 1668 break; 1669 } 1670 } 1671 res = PyUnicode_New(size, max_char); 1672 if (!res) 1673 return NULL; 1674 if (max_char >= 256) 1675 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size); 1676 else 1677 for (i = 0; i < size; i++) 1678 PyUnicode_1BYTE_DATA(res)[i] = (Py_UCS1)u[i]; 1679 assert(_PyUnicode_CheckConsistency(res, 1)); 1680 return res; 1681} 1682 1683static PyObject* 1684_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size) 1685{ 1686 PyObject *res; 1687 Py_UCS4 max_char = 0; 1688 Py_ssize_t i; 1689 1690 assert(size >= 0); 1691 for (i = 0; i < size; i++) { 1692 if (u[i] > max_char) { 1693 max_char = u[i]; 1694 if (max_char >= 0x10000) 1695 break; 1696 } 1697 } 1698 res = PyUnicode_New(size, max_char); 1699 if (!res) 1700 return NULL; 1701 if (max_char >= 0x10000) 1702 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size); 1703 else { 1704 int kind = PyUnicode_KIND(res); 1705 void *data = PyUnicode_DATA(res); 1706 for (i = 0; i < size; i++) 1707 PyUnicode_WRITE(kind, data, i, u[i]); 1708 } 1709 assert(_PyUnicode_CheckConsistency(res, 1)); 1710 return res; 1711} 1712 1713PyObject* 1714PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size) 1715{ 1716 switch(kind) { 1717 case PyUnicode_1BYTE_KIND: 1718 return _PyUnicode_FromUCS1(buffer, size); 1719 case PyUnicode_2BYTE_KIND: 1720 return _PyUnicode_FromUCS2(buffer, size); 1721 case PyUnicode_4BYTE_KIND: 1722 return _PyUnicode_FromUCS4(buffer, size); 1723 default: 1724 assert(0 && "invalid kind"); 1725 PyErr_SetString(PyExc_SystemError, "invalid kind"); 1726 return NULL; 1727 } 1728} 1729 1730PyObject* 1731PyUnicode_Copy(PyObject *unicode) 1732{ 1733 Py_ssize_t size; 1734 PyObject *copy; 1735 void *data; 1736 1737 if (!PyUnicode_Check(unicode)) { 1738 PyErr_BadInternalCall(); 1739 return NULL; 1740 } 1741 if (PyUnicode_READY(unicode)) 1742 return NULL; 1743 1744 size = 
PyUnicode_GET_LENGTH(unicode); 1745 copy = PyUnicode_New(size, PyUnicode_MAX_CHAR_VALUE(unicode)); 1746 if (!copy) 1747 return NULL; 1748 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode)); 1749 1750 data = PyUnicode_DATA(unicode); 1751 switch (PyUnicode_KIND(unicode)) 1752 { 1753 case PyUnicode_1BYTE_KIND: 1754 memcpy(PyUnicode_1BYTE_DATA(copy), data, size); 1755 break; 1756 case PyUnicode_2BYTE_KIND: 1757 memcpy(PyUnicode_2BYTE_DATA(copy), data, sizeof(Py_UCS2) * size); 1758 break; 1759 case PyUnicode_4BYTE_KIND: 1760 memcpy(PyUnicode_4BYTE_DATA(copy), data, sizeof(Py_UCS4) * size); 1761 break; 1762 default: 1763 assert(0); 1764 break; 1765 } 1766 assert(_PyUnicode_CheckConsistency(copy, 1)); 1767 return copy; 1768} 1769 1770 1771/* Widen Unicode objects to larger buffers. Don't write terminating null 1772 character. Return NULL on error. */ 1773 1774void* 1775_PyUnicode_AsKind(PyObject *s, unsigned int kind) 1776{ 1777 Py_ssize_t len; 1778 void *result; 1779 unsigned int skind; 1780 1781 if (PyUnicode_READY(s)) 1782 return NULL; 1783 1784 len = PyUnicode_GET_LENGTH(s); 1785 skind = PyUnicode_KIND(s); 1786 if (skind >= kind) { 1787 PyErr_SetString(PyExc_SystemError, "invalid widening attempt"); 1788 return NULL; 1789 } 1790 switch(kind) { 1791 case PyUnicode_2BYTE_KIND: 1792 result = PyMem_Malloc(len * sizeof(Py_UCS2)); 1793 if (!result) 1794 return PyErr_NoMemory(); 1795 assert(skind == PyUnicode_1BYTE_KIND); 1796 _PyUnicode_CONVERT_BYTES( 1797 Py_UCS1, Py_UCS2, 1798 PyUnicode_1BYTE_DATA(s), 1799 PyUnicode_1BYTE_DATA(s) + len, 1800 result); 1801 return result; 1802 case PyUnicode_4BYTE_KIND: 1803 result = PyMem_Malloc(len * sizeof(Py_UCS4)); 1804 if (!result) 1805 return PyErr_NoMemory(); 1806 if (skind == PyUnicode_2BYTE_KIND) { 1807 _PyUnicode_CONVERT_BYTES( 1808 Py_UCS2, Py_UCS4, 1809 PyUnicode_2BYTE_DATA(s), 1810 PyUnicode_2BYTE_DATA(s) + len, 1811 result); 1812 } 1813 else { 1814 assert(skind == PyUnicode_1BYTE_KIND); 1815 _PyUnicode_CONVERT_BYTES( 
1816 Py_UCS1, Py_UCS4, 1817 PyUnicode_1BYTE_DATA(s), 1818 PyUnicode_1BYTE_DATA(s) + len, 1819 result); 1820 } 1821 return result; 1822 default: 1823 break; 1824 } 1825 PyErr_SetString(PyExc_SystemError, "invalid kind"); 1826 return NULL; 1827} 1828 1829static Py_UCS4* 1830as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize, 1831 int copy_null) 1832{ 1833 int kind; 1834 void *data; 1835 Py_ssize_t len, targetlen; 1836 if (PyUnicode_READY(string) == -1) 1837 return NULL; 1838 kind = PyUnicode_KIND(string); 1839 data = PyUnicode_DATA(string); 1840 len = PyUnicode_GET_LENGTH(string); 1841 targetlen = len; 1842 if (copy_null) 1843 targetlen++; 1844 if (!target) { 1845 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) { 1846 PyErr_NoMemory(); 1847 return NULL; 1848 } 1849 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4)); 1850 if (!target) { 1851 PyErr_NoMemory(); 1852 return NULL; 1853 } 1854 } 1855 else { 1856 if (targetsize < targetlen) { 1857 PyErr_Format(PyExc_SystemError, 1858 "string is longer than the buffer"); 1859 if (copy_null && 0 < targetsize) 1860 target[0] = 0; 1861 return NULL; 1862 } 1863 } 1864 if (kind != PyUnicode_4BYTE_KIND) { 1865 Py_ssize_t i; 1866 for (i = 0; i < len; i++) 1867 target[i] = PyUnicode_READ(kind, data, i); 1868 } 1869 else 1870 Py_MEMCPY(target, data, len * sizeof(Py_UCS4)); 1871 if (copy_null) 1872 target[len] = 0; 1873 return target; 1874} 1875 1876Py_UCS4* 1877PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize, 1878 int copy_null) 1879{ 1880 if (target == NULL || targetsize < 1) { 1881 PyErr_BadInternalCall(); 1882 return NULL; 1883 } 1884 return as_ucs4(string, target, targetsize, copy_null); 1885} 1886 1887Py_UCS4* 1888PyUnicode_AsUCS4Copy(PyObject *string) 1889{ 1890 return as_ucs4(string, NULL, 0, 1); 1891} 1892 1893#ifdef HAVE_WCHAR_H 1894 1895PyObject * 1896PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size) 1897{ 1898 if (w == NULL) { 1899 if (size == 0) 1900 return 
PyUnicode_New(0, 0); 1901 PyErr_BadInternalCall(); 1902 return NULL; 1903 } 1904 1905 if (size == -1) { 1906 size = wcslen(w); 1907 } 1908 1909 return PyUnicode_FromUnicode(w, size); 1910} 1911 1912#endif /* HAVE_WCHAR_H */ 1913 1914static void 1915makefmt(char *fmt, int longflag, int longlongflag, int size_tflag, 1916 int zeropad, int width, int precision, char c) 1917{ 1918 *fmt++ = '%'; 1919 if (width) { 1920 if (zeropad) 1921 *fmt++ = '0'; 1922 fmt += sprintf(fmt, "%d", width); 1923 } 1924 if (precision) 1925 fmt += sprintf(fmt, ".%d", precision); 1926 if (longflag) 1927 *fmt++ = 'l'; 1928 else if (longlongflag) { 1929 /* longlongflag should only ever be nonzero on machines with 1930 HAVE_LONG_LONG defined */ 1931#ifdef HAVE_LONG_LONG 1932 char *f = PY_FORMAT_LONG_LONG; 1933 while (*f) 1934 *fmt++ = *f++; 1935#else 1936 /* we shouldn't ever get here */ 1937 assert(0); 1938 *fmt++ = 'l'; 1939#endif 1940 } 1941 else if (size_tflag) { 1942 char *f = PY_FORMAT_SIZE_T; 1943 while (*f) 1944 *fmt++ = *f++; 1945 } 1946 *fmt++ = c; 1947 *fmt = '\0'; 1948} 1949 1950/* helper for PyUnicode_FromFormatV() */ 1951 1952static const char* 1953parse_format_flags(const char *f, 1954 int *p_width, int *p_precision, 1955 int *p_longflag, int *p_longlongflag, int *p_size_tflag) 1956{ 1957 int width, precision, longflag, longlongflag, size_tflag; 1958 1959 /* parse the width.precision part, e.g. 
"%2.5s" => width=2, precision=5 */ 1960 f++; 1961 width = 0; 1962 while (Py_ISDIGIT((unsigned)*f)) 1963 width = (width*10) + *f++ - '0'; 1964 precision = 0; 1965 if (*f == '.') { 1966 f++; 1967 while (Py_ISDIGIT((unsigned)*f)) 1968 precision = (precision*10) + *f++ - '0'; 1969 if (*f == '%') { 1970 /* "%.3%s" => f points to "3" */ 1971 f--; 1972 } 1973 } 1974 if (*f == '\0') { 1975 /* bogus format "%.1" => go backward, f points to "1" */ 1976 f--; 1977 } 1978 if (p_width != NULL) 1979 *p_width = width; 1980 if (p_precision != NULL) 1981 *p_precision = precision; 1982 1983 /* Handle %ld, %lu, %lld and %llu. */ 1984 longflag = 0; 1985 longlongflag = 0; 1986 size_tflag = 0; 1987 1988 if (*f == 'l') { 1989 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') { 1990 longflag = 1; 1991 ++f; 1992 } 1993#ifdef HAVE_LONG_LONG 1994 else if (f[1] == 'l' && 1995 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) { 1996 longlongflag = 1; 1997 f += 2; 1998 } 1999#endif 2000 } 2001 /* handle the size_t flag. */ 2002 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) { 2003 size_tflag = 1; 2004 ++f; 2005 } 2006 if (p_longflag != NULL) 2007 *p_longflag = longflag; 2008 if (p_longlongflag != NULL) 2009 *p_longlongflag = longlongflag; 2010 if (p_size_tflag != NULL) 2011 *p_size_tflag = size_tflag; 2012 return f; 2013} 2014 2015/* maximum number of characters required for output of %ld. 21 characters 2016 allows for 64-bit integers (in decimal) and an optional sign. */ 2017#define MAX_LONG_CHARS 21 2018/* maximum number of characters required for output of %lld. 2019 We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits, 2020 plus 1 for the sign. 53/22 is an upper bound for log10(256). 
*/ 2021#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22) 2022 2023PyObject * 2024PyUnicode_FromFormatV(const char *format, va_list vargs) 2025{ 2026 va_list count; 2027 Py_ssize_t callcount = 0; 2028 PyObject **callresults = NULL; 2029 PyObject **callresult = NULL; 2030 Py_ssize_t n = 0; 2031 int width = 0; 2032 int precision = 0; 2033 int zeropad; 2034 const char* f; 2035 PyObject *string; 2036 /* used by sprintf */ 2037 char fmt[61]; /* should be enough for %0width.precisionlld */ 2038 Py_UCS4 maxchar = 127; /* result is ASCII by default */ 2039 Py_UCS4 argmaxchar; 2040 Py_ssize_t numbersize = 0; 2041 char *numberresults = NULL; 2042 char *numberresult = NULL; 2043 Py_ssize_t i; 2044 int kind; 2045 void *data; 2046 2047 Py_VA_COPY(count, vargs); 2048 /* step 1: count the number of %S/%R/%A/%s format specifications 2049 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/ 2050 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the 2051 * result in an array) 2052 * also estimate a upper bound for all the number formats in the string, 2053 * numbers will be formatted in step 3 and be kept in a '\0'-separated 2054 * buffer before putting everything together. */ 2055 for (f = format; *f; f++) { 2056 if (*f == '%') { 2057 int longlongflag; 2058 /* skip width or width.precision (eg. "1.2" of "%1.2f") */ 2059 f = parse_format_flags(f, &width, NULL, NULL, &longlongflag, NULL); 2060 if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V') 2061 ++callcount; 2062 2063 else if (*f == 'd' || *f=='u' || *f=='i' || *f=='x' || *f=='p') { 2064#ifdef HAVE_LONG_LONG 2065 if (longlongflag) { 2066 if (width < MAX_LONG_LONG_CHARS) 2067 width = MAX_LONG_LONG_CHARS; 2068 } 2069 else 2070#endif 2071 /* MAX_LONG_CHARS is enough to hold a 64-bit integer, 2072 including sign. Decimal takes the most space. This 2073 isn't enough for octal. If a width is specified we 2074 need more (which we allocate later). 
*/ 2075 if (width < MAX_LONG_CHARS) 2076 width = MAX_LONG_CHARS; 2077 2078 /* account for the size + '\0' to separate numbers 2079 inside of the numberresults buffer */ 2080 numbersize += (width + 1); 2081 } 2082 } 2083 else if ((unsigned char)*f > 127) { 2084 PyErr_Format(PyExc_ValueError, 2085 "PyUnicode_FromFormatV() expects an ASCII-encoded format " 2086 "string, got a non-ASCII byte: 0x%02x", 2087 (unsigned char)*f); 2088 return NULL; 2089 } 2090 } 2091 /* step 2: allocate memory for the results of 2092 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */ 2093 if (callcount) { 2094 callresults = PyObject_Malloc(sizeof(PyObject *) * callcount); 2095 if (!callresults) { 2096 PyErr_NoMemory(); 2097 return NULL; 2098 } 2099 callresult = callresults; 2100 } 2101 /* step 2.5: allocate memory for the results of formating numbers */ 2102 if (numbersize) { 2103 numberresults = PyObject_Malloc(numbersize); 2104 if (!numberresults) { 2105 PyErr_NoMemory(); 2106 goto fail; 2107 } 2108 numberresult = numberresults; 2109 } 2110 2111 /* step 3: format numbers and figure out how large a buffer we need */ 2112 for (f = format; *f; f++) { 2113 if (*f == '%') { 2114 const char* p; 2115 int longflag; 2116 int longlongflag; 2117 int size_tflag; 2118 int numprinted; 2119 2120 p = f; 2121 zeropad = (f[1] == '0'); 2122 f = parse_format_flags(f, &width, &precision, 2123 &longflag, &longlongflag, &size_tflag); 2124 switch (*f) { 2125 case 'c': 2126 { 2127 Py_UCS4 ordinal = va_arg(count, int); 2128 maxchar = Py_MAX(maxchar, ordinal); 2129 n++; 2130 break; 2131 } 2132 case '%': 2133 n++; 2134 break; 2135 case 'i': 2136 case 'd': 2137 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad, 2138 width, precision, *f); 2139 if (longflag) 2140 numprinted = sprintf(numberresult, fmt, 2141 va_arg(count, long)); 2142#ifdef HAVE_LONG_LONG 2143 else if (longlongflag) 2144 numprinted = sprintf(numberresult, fmt, 2145 va_arg(count, PY_LONG_LONG)); 2146#endif 2147 else if (size_tflag) 
2148 numprinted = sprintf(numberresult, fmt, 2149 va_arg(count, Py_ssize_t)); 2150 else 2151 numprinted = sprintf(numberresult, fmt, 2152 va_arg(count, int)); 2153 n += numprinted; 2154 /* advance by +1 to skip over the '\0' */ 2155 numberresult += (numprinted + 1); 2156 assert(*(numberresult - 1) == '\0'); 2157 assert(*(numberresult - 2) != '\0'); 2158 assert(numprinted >= 0); 2159 assert(numberresult <= numberresults + numbersize); 2160 break; 2161 case 'u': 2162 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad, 2163 width, precision, 'u'); 2164 if (longflag) 2165 numprinted = sprintf(numberresult, fmt, 2166 va_arg(count, unsigned long)); 2167#ifdef HAVE_LONG_LONG 2168 else if (longlongflag) 2169 numprinted = sprintf(numberresult, fmt, 2170 va_arg(count, unsigned PY_LONG_LONG)); 2171#endif 2172 else if (size_tflag) 2173 numprinted = sprintf(numberresult, fmt, 2174 va_arg(count, size_t)); 2175 else 2176 numprinted = sprintf(numberresult, fmt, 2177 va_arg(count, unsigned int)); 2178 n += numprinted; 2179 numberresult += (numprinted + 1); 2180 assert(*(numberresult - 1) == '\0'); 2181 assert(*(numberresult - 2) != '\0'); 2182 assert(numprinted >= 0); 2183 assert(numberresult <= numberresults + numbersize); 2184 break; 2185 case 'x': 2186 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x'); 2187 numprinted = sprintf(numberresult, fmt, va_arg(count, int)); 2188 n += numprinted; 2189 numberresult += (numprinted + 1); 2190 assert(*(numberresult - 1) == '\0'); 2191 assert(*(numberresult - 2) != '\0'); 2192 assert(numprinted >= 0); 2193 assert(numberresult <= numberresults + numbersize); 2194 break; 2195 case 'p': 2196 numprinted = sprintf(numberresult, "%p", va_arg(count, void*)); 2197 /* %p is ill-defined: ensure leading 0x. 
*/ 2198 if (numberresult[1] == 'X') 2199 numberresult[1] = 'x'; 2200 else if (numberresult[1] != 'x') { 2201 memmove(numberresult + 2, numberresult, 2202 strlen(numberresult) + 1); 2203 numberresult[0] = '0'; 2204 numberresult[1] = 'x'; 2205 numprinted += 2; 2206 } 2207 n += numprinted; 2208 numberresult += (numprinted + 1); 2209 assert(*(numberresult - 1) == '\0'); 2210 assert(*(numberresult - 2) != '\0'); 2211 assert(numprinted >= 0); 2212 assert(numberresult <= numberresults + numbersize); 2213 break; 2214 case 's': 2215 { 2216 /* UTF-8 */ 2217 const char *s = va_arg(count, const char*); 2218 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace"); 2219 if (!str) 2220 goto fail; 2221 /* since PyUnicode_DecodeUTF8 returns already flexible 2222 unicode objects, there is no need to call ready on them */ 2223 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str); 2224 maxchar = Py_MAX(maxchar, argmaxchar); 2225 n += PyUnicode_GET_LENGTH(str); 2226 /* Remember the str and switch to the next slot */ 2227 *callresult++ = str; 2228 break; 2229 } 2230 case 'U': 2231 { 2232 PyObject *obj = va_arg(count, PyObject *); 2233 assert(obj && _PyUnicode_CHECK(obj)); 2234 if (PyUnicode_READY(obj) == -1) 2235 goto fail; 2236 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj); 2237 maxchar = Py_MAX(maxchar, argmaxchar); 2238 n += PyUnicode_GET_LENGTH(obj); 2239 break; 2240 } 2241 case 'V': 2242 { 2243 PyObject *obj = va_arg(count, PyObject *); 2244 const char *str = va_arg(count, const char *); 2245 PyObject *str_obj; 2246 assert(obj || str); 2247 assert(!obj || _PyUnicode_CHECK(obj)); 2248 if (obj) { 2249 if (PyUnicode_READY(obj) == -1) 2250 goto fail; 2251 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj); 2252 maxchar = Py_MAX(maxchar, argmaxchar); 2253 n += PyUnicode_GET_LENGTH(obj); 2254 *callresult++ = NULL; 2255 } 2256 else { 2257 str_obj = PyUnicode_DecodeUTF8(str, strlen(str), "replace"); 2258 if (!str_obj) 2259 goto fail; 2260 if (PyUnicode_READY(str_obj)) { 2261 Py_DECREF(str_obj); 2262 goto 
fail; 2263 } 2264 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str_obj); 2265 maxchar = Py_MAX(maxchar, argmaxchar); 2266 n += PyUnicode_GET_LENGTH(str_obj); 2267 *callresult++ = str_obj; 2268 } 2269 break; 2270 } 2271 case 'S': 2272 { 2273 PyObject *obj = va_arg(count, PyObject *); 2274 PyObject *str; 2275 assert(obj); 2276 str = PyObject_Str(obj); 2277 if (!str || PyUnicode_READY(str) == -1) 2278 goto fail; 2279 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str); 2280 maxchar = Py_MAX(maxchar, argmaxchar); 2281 n += PyUnicode_GET_LENGTH(str); 2282 /* Remember the str and switch to the next slot */ 2283 *callresult++ = str; 2284 break; 2285 } 2286 case 'R': 2287 { 2288 PyObject *obj = va_arg(count, PyObject *); 2289 PyObject *repr; 2290 assert(obj); 2291 repr = PyObject_Repr(obj); 2292 if (!repr || PyUnicode_READY(repr) == -1) 2293 goto fail; 2294 argmaxchar = PyUnicode_MAX_CHAR_VALUE(repr); 2295 maxchar = Py_MAX(maxchar, argmaxchar); 2296 n += PyUnicode_GET_LENGTH(repr); 2297 /* Remember the repr and switch to the next slot */ 2298 *callresult++ = repr; 2299 break; 2300 } 2301 case 'A': 2302 { 2303 PyObject *obj = va_arg(count, PyObject *); 2304 PyObject *ascii; 2305 assert(obj); 2306 ascii = PyObject_ASCII(obj); 2307 if (!ascii || PyUnicode_READY(ascii) == -1) 2308 goto fail; 2309 argmaxchar = PyUnicode_MAX_CHAR_VALUE(ascii); 2310 maxchar = Py_MAX(maxchar, argmaxchar); 2311 n += PyUnicode_GET_LENGTH(ascii); 2312 /* Remember the repr and switch to the next slot */ 2313 *callresult++ = ascii; 2314 break; 2315 } 2316 default: 2317 /* if we stumble upon an unknown 2318 formatting code, copy the rest of 2319 the format string to the output 2320 string. (we cannot just skip the 2321 code, since there's no way to know 2322 what's in the argument list) */ 2323 n += strlen(p); 2324 goto expand; 2325 } 2326 } else 2327 n++; 2328 } 2329 expand: 2330 /* step 4: fill the buffer */ 2331 /* Since we've analyzed how much space we need, 2332 we don't have to resize the string. 
2333 There can be no errors beyond this point. */ 2334 string = PyUnicode_New(n, maxchar); 2335 if (!string) 2336 goto fail; 2337 kind = PyUnicode_KIND(string); 2338 data = PyUnicode_DATA(string); 2339 callresult = callresults; 2340 numberresult = numberresults; 2341 2342 for (i = 0, f = format; *f; f++) { 2343 if (*f == '%') { 2344 const char* p; 2345 2346 p = f; 2347 f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL); 2348 /* checking for == because the last argument could be a empty 2349 string, which causes i to point to end, the assert at the end of 2350 the loop */ 2351 assert(i <= PyUnicode_GET_LENGTH(string)); 2352 2353 switch (*f) { 2354 case 'c': 2355 { 2356 const int ordinal = va_arg(vargs, int); 2357 PyUnicode_WRITE(kind, data, i++, ordinal); 2358 break; 2359 } 2360 case 'i': 2361 case 'd': 2362 case 'u': 2363 case 'x': 2364 case 'p': 2365 /* unused, since we already have the result */ 2366 if (*f == 'p') 2367 (void) va_arg(vargs, void *); 2368 else 2369 (void) va_arg(vargs, int); 2370 /* extract the result from numberresults and append. 
*/ 2371 for (; *numberresult; ++i, ++numberresult) 2372 PyUnicode_WRITE(kind, data, i, *numberresult); 2373 /* skip over the separating '\0' */ 2374 assert(*numberresult == '\0'); 2375 numberresult++; 2376 assert(numberresult <= numberresults + numbersize); 2377 break; 2378 case 's': 2379 { 2380 /* unused, since we already have the result */ 2381 Py_ssize_t size; 2382 (void) va_arg(vargs, char *); 2383 size = PyUnicode_GET_LENGTH(*callresult); 2384 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string)); 2385 copy_characters(string, i, *callresult, 0, size); 2386 i += size; 2387 /* We're done with the unicode()/repr() => forget it */ 2388 Py_DECREF(*callresult); 2389 /* switch to next unicode()/repr() result */ 2390 ++callresult; 2391 break; 2392 } 2393 case 'U': 2394 { 2395 PyObject *obj = va_arg(vargs, PyObject *); 2396 Py_ssize_t size; 2397 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string)); 2398 size = PyUnicode_GET_LENGTH(obj); 2399 copy_characters(string, i, obj, 0, size); 2400 i += size; 2401 break; 2402 } 2403 case 'V': 2404 { 2405 Py_ssize_t size; 2406 PyObject *obj = va_arg(vargs, PyObject *); 2407 va_arg(vargs, const char *); 2408 if (obj) { 2409 size = PyUnicode_GET_LENGTH(obj); 2410 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string)); 2411 copy_characters(string, i, obj, 0, size); 2412 i += size; 2413 } else { 2414 size = PyUnicode_GET_LENGTH(*callresult); 2415 assert(PyUnicode_KIND(*callresult) <= 2416 PyUnicode_KIND(string)); 2417 copy_characters(string, i, *callresult, 0, size); 2418 i += size; 2419 Py_DECREF(*callresult); 2420 } 2421 ++callresult; 2422 break; 2423 } 2424 case 'S': 2425 case 'R': 2426 case 'A': 2427 { 2428 Py_ssize_t size = PyUnicode_GET_LENGTH(*callresult); 2429 /* unused, since we already have the result */ 2430 (void) va_arg(vargs, PyObject *); 2431 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string)); 2432 copy_characters(string, i, *callresult, 0, size); 2433 i += size; 2434 /* We're done with the 
unicode()/repr() => forget it */ 2435 Py_DECREF(*callresult); 2436 /* switch to next unicode()/repr() result */ 2437 ++callresult; 2438 break; 2439 } 2440 case '%': 2441 PyUnicode_WRITE(kind, data, i++, '%'); 2442 break; 2443 default: 2444 for (; *p; ++p, ++i) 2445 PyUnicode_WRITE(kind, data, i, *p); 2446 assert(i == PyUnicode_GET_LENGTH(string)); 2447 goto end; 2448 } 2449 } 2450 else { 2451 assert(i < PyUnicode_GET_LENGTH(string)); 2452 PyUnicode_WRITE(kind, data, i++, *f); 2453 } 2454 } 2455 assert(i == PyUnicode_GET_LENGTH(string)); 2456 2457 end: 2458 if (callresults) 2459 PyObject_Free(callresults); 2460 if (numberresults) 2461 PyObject_Free(numberresults); 2462 assert(_PyUnicode_CheckConsistency(string, 1)); 2463 return (PyObject *)string; 2464 fail: 2465 if (callresults) { 2466 PyObject **callresult2 = callresults; 2467 while (callresult2 < callresult) { 2468 Py_XDECREF(*callresult2); 2469 ++callresult2; 2470 } 2471 PyObject_Free(callresults); 2472 } 2473 if (numberresults) 2474 PyObject_Free(numberresults); 2475 return NULL; 2476} 2477 2478PyObject * 2479PyUnicode_FromFormat(const char *format, ...) 2480{ 2481 PyObject* ret; 2482 va_list vargs; 2483 2484#ifdef HAVE_STDARG_PROTOTYPES 2485 va_start(vargs, format); 2486#else 2487 va_start(vargs); 2488#endif 2489 ret = PyUnicode_FromFormatV(format, vargs); 2490 va_end(vargs); 2491 return ret; 2492} 2493 2494#ifdef HAVE_WCHAR_H 2495 2496/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString(): 2497 convert a Unicode object to a wide character string. 2498 2499 - If w is NULL: return the number of wide characters (including the null 2500 character) required to convert the unicode object. Ignore size argument. 2501 2502 - Otherwise: return the number of wide characters (excluding the null 2503 character) written into w. Write at most size wide characters (including 2504 the null character). 
*/ 2505static Py_ssize_t 2506unicode_aswidechar(PyUnicodeObject *unicode, 2507 wchar_t *w, 2508 Py_ssize_t size) 2509{ 2510 Py_ssize_t res; 2511 const wchar_t *wstr; 2512 2513 wstr = PyUnicode_AsUnicodeAndSize((PyObject *)unicode, &res); 2514 if (wstr == NULL) 2515 return -1; 2516 2517 if (w != NULL) { 2518 if (size > res) 2519 size = res + 1; 2520 else 2521 res = size; 2522 Py_MEMCPY(w, wstr, size * sizeof(wchar_t)); 2523 return res; 2524 } 2525 else 2526 return res + 1; 2527} 2528 2529Py_ssize_t 2530PyUnicode_AsWideChar(PyObject *unicode, 2531 wchar_t *w, 2532 Py_ssize_t size) 2533{ 2534 if (unicode == NULL) { 2535 PyErr_BadInternalCall(); 2536 return -1; 2537 } 2538 return unicode_aswidechar((PyUnicodeObject*)unicode, w, size); 2539} 2540 2541wchar_t* 2542PyUnicode_AsWideCharString(PyObject *unicode, 2543 Py_ssize_t *size) 2544{ 2545 wchar_t* buffer; 2546 Py_ssize_t buflen; 2547 2548 if (unicode == NULL) { 2549 PyErr_BadInternalCall(); 2550 return NULL; 2551 } 2552 2553 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, NULL, 0); 2554 if (buflen == -1) 2555 return NULL; 2556 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) { 2557 PyErr_NoMemory(); 2558 return NULL; 2559 } 2560 2561 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t)); 2562 if (buffer == NULL) { 2563 PyErr_NoMemory(); 2564 return NULL; 2565 } 2566 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, buffer, buflen); 2567 if (buflen == -1) 2568 return NULL; 2569 if (size != NULL) 2570 *size = buflen; 2571 return buffer; 2572} 2573 2574#endif /* HAVE_WCHAR_H */ 2575 2576PyObject * 2577PyUnicode_FromOrdinal(int ordinal) 2578{ 2579 PyObject *v; 2580 if (ordinal < 0 || ordinal > 0x10ffff) { 2581 PyErr_SetString(PyExc_ValueError, 2582 "chr() arg not in range(0x110000)"); 2583 return NULL; 2584 } 2585 2586 if (ordinal < 256) 2587 return get_latin1_char(ordinal); 2588 2589 v = PyUnicode_New(1, ordinal); 2590 if (v == NULL) 2591 return NULL; 2592 PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, 
ordinal); 2593 assert(_PyUnicode_CheckConsistency(v, 1)); 2594 return v; 2595} 2596 2597PyObject * 2598PyUnicode_FromObject(register PyObject *obj) 2599{ 2600 /* XXX Perhaps we should make this API an alias of 2601 PyObject_Str() instead ?! */ 2602 if (PyUnicode_CheckExact(obj)) { 2603 if (PyUnicode_READY(obj)) 2604 return NULL; 2605 Py_INCREF(obj); 2606 return obj; 2607 } 2608 if (PyUnicode_Check(obj)) { 2609 /* For a Unicode subtype that's not a Unicode object, 2610 return a true Unicode object with the same data. */ 2611 return PyUnicode_Copy(obj); 2612 } 2613 PyErr_Format(PyExc_TypeError, 2614 "Can't convert '%.100s' object to str implicitly", 2615 Py_TYPE(obj)->tp_name); 2616 return NULL; 2617} 2618 2619PyObject * 2620PyUnicode_FromEncodedObject(register PyObject *obj, 2621 const char *encoding, 2622 const char *errors) 2623{ 2624 Py_buffer buffer; 2625 PyObject *v; 2626 2627 if (obj == NULL) { 2628 PyErr_BadInternalCall(); 2629 return NULL; 2630 } 2631 2632 /* Decoding bytes objects is the most common case and should be fast */ 2633 if (PyBytes_Check(obj)) { 2634 if (PyBytes_GET_SIZE(obj) == 0) { 2635 Py_INCREF(unicode_empty); 2636 v = unicode_empty; 2637 } 2638 else { 2639 v = PyUnicode_Decode( 2640 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj), 2641 encoding, errors); 2642 } 2643 return v; 2644 } 2645 2646 if (PyUnicode_Check(obj)) { 2647 PyErr_SetString(PyExc_TypeError, 2648 "decoding str is not supported"); 2649 return NULL; 2650 } 2651 2652 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */ 2653 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) { 2654 PyErr_Format(PyExc_TypeError, 2655 "coercing to str: need bytes, bytearray " 2656 "or buffer-like object, %.80s found", 2657 Py_TYPE(obj)->tp_name); 2658 return NULL; 2659 } 2660 2661 if (buffer.len == 0) { 2662 Py_INCREF(unicode_empty); 2663 v = unicode_empty; 2664 } 2665 else 2666 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors); 2667 2668 
PyBuffer_Release(&buffer); 2669 return v; 2670} 2671 2672/* Convert encoding to lower case and replace '_' with '-' in order to 2673 catch e.g. UTF_8. Return 0 on error (encoding is longer than lower_len-1), 2674 1 on success. */ 2675static int 2676normalize_encoding(const char *encoding, 2677 char *lower, 2678 size_t lower_len) 2679{ 2680 const char *e; 2681 char *l; 2682 char *l_end; 2683 2684 e = encoding; 2685 l = lower; 2686 l_end = &lower[lower_len - 1]; 2687 while (*e) { 2688 if (l == l_end) 2689 return 0; 2690 if (Py_ISUPPER(*e)) { 2691 *l++ = Py_TOLOWER(*e++); 2692 } 2693 else if (*e == '_') { 2694 *l++ = '-'; 2695 e++; 2696 } 2697 else { 2698 *l++ = *e++; 2699 } 2700 } 2701 *l = '\0'; 2702 return 1; 2703} 2704 2705PyObject * 2706PyUnicode_Decode(const char *s, 2707 Py_ssize_t size, 2708 const char *encoding, 2709 const char *errors) 2710{ 2711 PyObject *buffer = NULL, *unicode; 2712 Py_buffer info; 2713 char lower[11]; /* Enough for any encoding shortcut */ 2714 2715 if (encoding == NULL) 2716 return PyUnicode_DecodeUTF8(s, size, errors); 2717 2718 /* Shortcuts for common default encodings */ 2719 if (normalize_encoding(encoding, lower, sizeof(lower))) { 2720 if ((strcmp(lower, "utf-8") == 0) || 2721 (strcmp(lower, "utf8") == 0)) 2722 return PyUnicode_DecodeUTF8(s, size, errors); 2723 else if ((strcmp(lower, "latin-1") == 0) || 2724 (strcmp(lower, "latin1") == 0) || 2725 (strcmp(lower, "iso-8859-1") == 0)) 2726 return PyUnicode_DecodeLatin1(s, size, errors); 2727#ifdef HAVE_MBCS 2728 else if (strcmp(lower, "mbcs") == 0) 2729 return PyUnicode_DecodeMBCS(s, size, errors); 2730#endif 2731 else if (strcmp(lower, "ascii") == 0) 2732 return PyUnicode_DecodeASCII(s, size, errors); 2733 else if (strcmp(lower, "utf-16") == 0) 2734 return PyUnicode_DecodeUTF16(s, size, errors, 0); 2735 else if (strcmp(lower, "utf-32") == 0) 2736 return PyUnicode_DecodeUTF32(s, size, errors, 0); 2737 } 2738 2739 /* Decode via the codec registry */ 2740 buffer = NULL; 2741 if 
(PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0) 2742 goto onError; 2743 buffer = PyMemoryView_FromBuffer(&info); 2744 if (buffer == NULL) 2745 goto onError; 2746 unicode = PyCodec_Decode(buffer, encoding, errors); 2747 if (unicode == NULL) 2748 goto onError; 2749 if (!PyUnicode_Check(unicode)) { 2750 PyErr_Format(PyExc_TypeError, 2751 "decoder did not return a str object (type=%.400s)", 2752 Py_TYPE(unicode)->tp_name); 2753 Py_DECREF(unicode); 2754 goto onError; 2755 } 2756 Py_DECREF(buffer); 2757#ifndef DONT_MAKE_RESULT_READY 2758 if (_PyUnicode_READY_REPLACE(&unicode)) { 2759 Py_DECREF(unicode); 2760 return NULL; 2761 } 2762#endif 2763 assert(_PyUnicode_CheckConsistency(unicode, 1)); 2764 return unicode; 2765 2766 onError: 2767 Py_XDECREF(buffer); 2768 return NULL; 2769} 2770 2771PyObject * 2772PyUnicode_AsDecodedObject(PyObject *unicode, 2773 const char *encoding, 2774 const char *errors) 2775{ 2776 PyObject *v; 2777 2778 if (!PyUnicode_Check(unicode)) { 2779 PyErr_BadArgument(); 2780 goto onError; 2781 } 2782 2783 if (encoding == NULL) 2784 encoding = PyUnicode_GetDefaultEncoding(); 2785 2786 /* Decode via the codec registry */ 2787 v = PyCodec_Decode(unicode, encoding, errors); 2788 if (v == NULL) 2789 goto onError; 2790 assert(_PyUnicode_CheckConsistency(v, 1)); 2791 return v; 2792 2793 onError: 2794 return NULL; 2795} 2796 2797PyObject * 2798PyUnicode_AsDecodedUnicode(PyObject *unicode, 2799 const char *encoding, 2800 const char *errors) 2801{ 2802 PyObject *v; 2803 2804 if (!PyUnicode_Check(unicode)) { 2805 PyErr_BadArgument(); 2806 goto onError; 2807 } 2808 2809 if (encoding == NULL) 2810 encoding = PyUnicode_GetDefaultEncoding(); 2811 2812 /* Decode via the codec registry */ 2813 v = PyCodec_Decode(unicode, encoding, errors); 2814 if (v == NULL) 2815 goto onError; 2816 if (!PyUnicode_Check(v)) { 2817 PyErr_Format(PyExc_TypeError, 2818 "decoder did not return a str object (type=%.400s)", 2819 Py_TYPE(v)->tp_name); 2820 Py_DECREF(v); 
2821 goto onError; 2822 } 2823 assert(_PyUnicode_CheckConsistency(v, 1)); 2824 return v; 2825 2826 onError: 2827 return NULL; 2828} 2829 2830PyObject * 2831PyUnicode_Encode(const Py_UNICODE *s, 2832 Py_ssize_t size, 2833 const char *encoding, 2834 const char *errors) 2835{ 2836 PyObject *v, *unicode; 2837 2838 unicode = PyUnicode_FromUnicode(s, size); 2839 if (unicode == NULL) 2840 return NULL; 2841 v = PyUnicode_AsEncodedString(unicode, encoding, errors); 2842 Py_DECREF(unicode); 2843 return v; 2844} 2845 2846PyObject * 2847PyUnicode_AsEncodedObject(PyObject *unicode, 2848 const char *encoding, 2849 const char *errors) 2850{ 2851 PyObject *v; 2852 2853 if (!PyUnicode_Check(unicode)) { 2854 PyErr_BadArgument(); 2855 goto onError; 2856 } 2857 2858 if (encoding == NULL) 2859 encoding = PyUnicode_GetDefaultEncoding(); 2860 2861 /* Encode via the codec registry */ 2862 v = PyCodec_Encode(unicode, encoding, errors); 2863 if (v == NULL) 2864 goto onError; 2865 return v; 2866 2867 onError: 2868 return NULL; 2869} 2870 2871PyObject * 2872PyUnicode_EncodeFSDefault(PyObject *unicode) 2873{ 2874#ifdef HAVE_MBCS 2875 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode), 2876 PyUnicode_GET_SIZE(unicode), 2877 NULL); 2878#elif defined(__APPLE__) 2879 return _PyUnicode_AsUTF8String(unicode, "surrogateescape"); 2880#else 2881 PyInterpreterState *interp = PyThreadState_GET()->interp; 2882 /* Bootstrap check: if the filesystem codec is implemented in Python, we 2883 cannot use it to encode and decode filenames before it is loaded. Load 2884 the Python codec requires to encode at least its own filename. Use the C 2885 version of the locale codec until the codec registry is initialized and 2886 the Python codec is loaded. 2887 2888 Py_FileSystemDefaultEncoding is shared between all interpreters, we 2889 cannot only rely on it: check also interp->fscodec_initialized for 2890 subinterpreters. 
*/ 2891 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) { 2892 return PyUnicode_AsEncodedString(unicode, 2893 Py_FileSystemDefaultEncoding, 2894 "surrogateescape"); 2895 } 2896 else { 2897 /* locale encoding with surrogateescape */ 2898 wchar_t *wchar; 2899 char *bytes; 2900 PyObject *bytes_obj; 2901 size_t error_pos; 2902 2903 wchar = PyUnicode_AsWideCharString(unicode, NULL); 2904 if (wchar == NULL) 2905 return NULL; 2906 bytes = _Py_wchar2char(wchar, &error_pos); 2907 if (bytes == NULL) { 2908 if (error_pos != (size_t)-1) { 2909 char *errmsg = strerror(errno); 2910 PyObject *exc = NULL; 2911 if (errmsg == NULL) 2912 errmsg = "Py_wchar2char() failed"; 2913 raise_encode_exception(&exc, 2914 "filesystemencoding", 2915 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode), 2916 error_pos, error_pos+1, 2917 errmsg); 2918 Py_XDECREF(exc); 2919 } 2920 else 2921 PyErr_NoMemory(); 2922 PyMem_Free(wchar); 2923 return NULL; 2924 } 2925 PyMem_Free(wchar); 2926 2927 bytes_obj = PyBytes_FromString(bytes); 2928 PyMem_Free(bytes); 2929 return bytes_obj; 2930 } 2931#endif 2932} 2933 2934PyObject * 2935PyUnicode_AsEncodedString(PyObject *unicode, 2936 const char *encoding, 2937 const char *errors) 2938{ 2939 PyObject *v; 2940 char lower[11]; /* Enough for any encoding shortcut */ 2941 2942 if (!PyUnicode_Check(unicode)) { 2943 PyErr_BadArgument(); 2944 return NULL; 2945 } 2946 2947 if (encoding == NULL) { 2948 if (errors == NULL || strcmp(errors, "strict") == 0) 2949 return _PyUnicode_AsUTF8String(unicode, NULL); 2950 else 2951 return _PyUnicode_AsUTF8String(unicode, errors); 2952 } 2953 2954 /* Shortcuts for common default encodings */ 2955 if (normalize_encoding(encoding, lower, sizeof(lower))) { 2956 if ((strcmp(lower, "utf-8") == 0) || 2957 (strcmp(lower, "utf8") == 0)) 2958 { 2959 if (errors == NULL || strcmp(errors, "strict") == 0) 2960 return _PyUnicode_AsUTF8String(unicode, NULL); 2961 else 2962 return _PyUnicode_AsUTF8String(unicode, errors); 2963 } 
2964 else if ((strcmp(lower, "latin-1") == 0) || 2965 (strcmp(lower, "latin1") == 0) || 2966 (strcmp(lower, "iso-8859-1") == 0)) 2967 return _PyUnicode_AsLatin1String(unicode, errors); 2968#ifdef HAVE_MBCS 2969 else if (strcmp(lower, "mbcs") == 0) 2970 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode), 2971 PyUnicode_GET_SIZE(unicode), 2972 errors); 2973#endif 2974 else if (strcmp(lower, "ascii") == 0) 2975 return _PyUnicode_AsASCIIString(unicode, errors); 2976 } 2977 2978 /* Encode via the codec registry */ 2979 v = PyCodec_Encode(unicode, encoding, errors); 2980 if (v == NULL) 2981 return NULL; 2982 2983 /* The normal path */ 2984 if (PyBytes_Check(v)) 2985 return v; 2986 2987 /* If the codec returns a buffer, raise a warning and convert to bytes */ 2988 if (PyByteArray_Check(v)) { 2989 int error; 2990 PyObject *b; 2991 2992 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1, 2993 "encoder %s returned bytearray instead of bytes", 2994 encoding); 2995 if (error) { 2996 Py_DECREF(v); 2997 return NULL; 2998 } 2999 3000 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v)); 3001 Py_DECREF(v); 3002 return b; 3003 } 3004 3005 PyErr_Format(PyExc_TypeError, 3006 "encoder did not return a bytes object (type=%.400s)", 3007 Py_TYPE(v)->tp_name); 3008 Py_DECREF(v); 3009 return NULL; 3010} 3011 3012PyObject * 3013PyUnicode_AsEncodedUnicode(PyObject *unicode, 3014 const char *encoding, 3015 const char *errors) 3016{ 3017 PyObject *v; 3018 3019 if (!PyUnicode_Check(unicode)) { 3020 PyErr_BadArgument(); 3021 goto onError; 3022 } 3023 3024 if (encoding == NULL) 3025 encoding = PyUnicode_GetDefaultEncoding(); 3026 3027 /* Encode via the codec registry */ 3028 v = PyCodec_Encode(unicode, encoding, errors); 3029 if (v == NULL) 3030 goto onError; 3031 if (!PyUnicode_Check(v)) { 3032 PyErr_Format(PyExc_TypeError, 3033 "encoder did not return an str object (type=%.400s)", 3034 Py_TYPE(v)->tp_name); 3035 Py_DECREF(v); 3036 goto onError; 3037 } 3038 return v; 3039 
3040 onError: 3041 return NULL; 3042} 3043 3044PyObject* 3045PyUnicode_DecodeFSDefault(const char *s) { 3046 Py_ssize_t size = (Py_ssize_t)strlen(s); 3047 return PyUnicode_DecodeFSDefaultAndSize(s, size); 3048} 3049 3050PyObject* 3051PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size) 3052{ 3053#ifdef HAVE_MBCS 3054 return PyUnicode_DecodeMBCS(s, size, NULL); 3055#elif defined(__APPLE__) 3056 return PyUnicode_DecodeUTF8(s, size, "surrogateescape"); 3057#else 3058 PyInterpreterState *interp = PyThreadState_GET()->interp; 3059 /* Bootstrap check: if the filesystem codec is implemented in Python, we 3060 cannot use it to encode and decode filenames before it is loaded. Load 3061 the Python codec requires to encode at least its own filename. Use the C 3062 version of the locale codec until the codec registry is initialized and 3063 the Python codec is loaded. 3064 3065 Py_FileSystemDefaultEncoding is shared between all interpreters, we 3066 cannot only rely on it: check also interp->fscodec_initialized for 3067 subinterpreters. 
*/ 3068 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) { 3069 return PyUnicode_Decode(s, size, 3070 Py_FileSystemDefaultEncoding, 3071 "surrogateescape"); 3072 } 3073 else { 3074 /* locale encoding with surrogateescape */ 3075 wchar_t *wchar; 3076 PyObject *unicode; 3077 size_t len; 3078 3079 if (s[size] != '\0' || size != strlen(s)) { 3080 PyErr_SetString(PyExc_TypeError, "embedded NUL character"); 3081 return NULL; 3082 } 3083 3084 wchar = _Py_char2wchar(s, &len); 3085 if (wchar == NULL) 3086 return PyErr_NoMemory(); 3087 3088 unicode = PyUnicode_FromWideChar(wchar, len); 3089 PyMem_Free(wchar); 3090 return unicode; 3091 } 3092#endif 3093} 3094 3095 3096int 3097PyUnicode_FSConverter(PyObject* arg, void* addr) 3098{ 3099 PyObject *output = NULL; 3100 Py_ssize_t size; 3101 void *data; 3102 if (arg == NULL) { 3103 Py_DECREF(*(PyObject**)addr); 3104 return 1; 3105 } 3106 if (PyBytes_Check(arg)) { 3107 output = arg; 3108 Py_INCREF(output); 3109 } 3110 else { 3111 arg = PyUnicode_FromObject(arg); 3112 if (!arg) 3113 return 0; 3114 output = PyUnicode_EncodeFSDefault(arg); 3115 Py_DECREF(arg); 3116 if (!output) 3117 return 0; 3118 if (!PyBytes_Check(output)) { 3119 Py_DECREF(output); 3120 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes"); 3121 return 0; 3122 } 3123 } 3124 size = PyBytes_GET_SIZE(output); 3125 data = PyBytes_AS_STRING(output); 3126 if (size != strlen(data)) { 3127 PyErr_SetString(PyExc_TypeError, "embedded NUL character"); 3128 Py_DECREF(output); 3129 return 0; 3130 } 3131 *(PyObject**)addr = output; 3132 return Py_CLEANUP_SUPPORTED; 3133} 3134 3135 3136int 3137PyUnicode_FSDecoder(PyObject* arg, void* addr) 3138{ 3139 PyObject *output = NULL; 3140 if (arg == NULL) { 3141 Py_DECREF(*(PyObject**)addr); 3142 return 1; 3143 } 3144 if (PyUnicode_Check(arg)) { 3145 if (PyUnicode_READY(arg)) 3146 return 0; 3147 output = arg; 3148 Py_INCREF(output); 3149 } 3150 else { 3151 arg = PyBytes_FromObject(arg); 3152 if (!arg) 3153 
return 0; 3154 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg), 3155 PyBytes_GET_SIZE(arg)); 3156 Py_DECREF(arg); 3157 if (!output) 3158 return 0; 3159 if (!PyUnicode_Check(output)) { 3160 Py_DECREF(output); 3161 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode"); 3162 return 0; 3163 } 3164 } 3165 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output), 3166 PyUnicode_GET_LENGTH(output), 0, 1)) { 3167 PyErr_SetString(PyExc_TypeError, "embedded NUL character"); 3168 Py_DECREF(output); 3169 return 0; 3170 } 3171 *(PyObject**)addr = output; 3172 return Py_CLEANUP_SUPPORTED; 3173} 3174 3175 3176char* 3177PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize) 3178{ 3179 PyObject *bytes; 3180 PyUnicodeObject *u = (PyUnicodeObject *)unicode; 3181 3182 if (!PyUnicode_Check(unicode)) { 3183 PyErr_BadArgument(); 3184 return NULL; 3185 } 3186 if (PyUnicode_READY(u) == -1) 3187 return NULL; 3188 3189 if (PyUnicode_UTF8(unicode) == NULL) { 3190 assert(!PyUnicode_IS_COMPACT_ASCII(unicode)); 3191 bytes = _PyUnicode_AsUTF8String(unicode, "strict"); 3192 if (bytes == NULL) 3193 return NULL; 3194 _PyUnicode_UTF8(u) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1); 3195 if (_PyUnicode_UTF8(u) == NULL) { 3196 Py_DECREF(bytes); 3197 return NULL; 3198 } 3199 _PyUnicode_UTF8_LENGTH(u) = PyBytes_GET_SIZE(bytes); 3200 Py_MEMCPY(_PyUnicode_UTF8(u), PyBytes_AS_STRING(bytes), _PyUnicode_UTF8_LENGTH(u) + 1); 3201 Py_DECREF(bytes); 3202 } 3203 3204 if (psize) 3205 *psize = PyUnicode_UTF8_LENGTH(unicode); 3206 return PyUnicode_UTF8(unicode); 3207} 3208 3209char* 3210PyUnicode_AsUTF8(PyObject *unicode) 3211{ 3212 return PyUnicode_AsUTF8AndSize(unicode, NULL); 3213} 3214 3215#ifdef Py_DEBUG 3216int unicode_as_unicode_calls = 0; 3217#endif 3218 3219 3220Py_UNICODE * 3221PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size) 3222{ 3223 PyUnicodeObject *u; 3224 const unsigned char *one_byte; 3225#if SIZEOF_WCHAR_T == 4 3226 const Py_UCS2 
*two_bytes; 3227#else 3228 const Py_UCS4 *four_bytes; 3229 const Py_UCS4 *ucs4_end; 3230 Py_ssize_t num_surrogates; 3231#endif 3232 wchar_t *w; 3233 wchar_t *wchar_end; 3234 3235 if (!PyUnicode_Check(unicode)) { 3236 PyErr_BadArgument(); 3237 return NULL; 3238 } 3239 u = (PyUnicodeObject*)unicode; 3240 if (_PyUnicode_WSTR(u) == NULL) { 3241 /* Non-ASCII compact unicode object */ 3242 assert(_PyUnicode_KIND(u) != 0); 3243 assert(PyUnicode_IS_READY(u)); 3244 3245#ifdef Py_DEBUG 3246 ++unicode_as_unicode_calls; 3247#endif 3248 3249 if (PyUnicode_KIND(u) == PyUnicode_4BYTE_KIND) { 3250#if SIZEOF_WCHAR_T == 2 3251 four_bytes = PyUnicode_4BYTE_DATA(u); 3252 ucs4_end = four_bytes + _PyUnicode_LENGTH(u); 3253 num_surrogates = 0; 3254 3255 for (; four_bytes < ucs4_end; ++four_bytes) { 3256 if (*four_bytes > 0xFFFF) 3257 ++num_surrogates; 3258 } 3259 3260 _PyUnicode_WSTR(u) = (wchar_t *) PyObject_MALLOC( 3261 sizeof(wchar_t) * (_PyUnicode_LENGTH(u) + 1 + num_surrogates)); 3262 if (!_PyUnicode_WSTR(u)) { 3263 PyErr_NoMemory(); 3264 return NULL; 3265 } 3266 _PyUnicode_WSTR_LENGTH(u) = _PyUnicode_LENGTH(u) + num_surrogates; 3267 3268 w = _PyUnicode_WSTR(u); 3269 wchar_end = w + _PyUnicode_WSTR_LENGTH(u); 3270 four_bytes = PyUnicode_4BYTE_DATA(u); 3271 for (; four_bytes < ucs4_end; ++four_bytes, ++w) { 3272 if (*four_bytes > 0xFFFF) { 3273 /* encode surrogate pair in this case */ 3274 *w++ = 0xD800 | ((*four_bytes - 0x10000) >> 10); 3275 *w = 0xDC00 | ((*four_bytes - 0x10000) & 0x3FF); 3276 } 3277 else 3278 *w = *four_bytes; 3279 3280 if (w > wchar_end) { 3281 assert(0 && "Miscalculated string end"); 3282 } 3283 } 3284 *w = 0; 3285#else 3286 /* sizeof(wchar_t) == 4 */ 3287 Py_FatalError("Impossible unicode object state, wstr and str " 3288 "should share memory already."); 3289 return NULL; 3290#endif 3291 } 3292 else { 3293 _PyUnicode_WSTR(u) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) * 3294 (_PyUnicode_LENGTH(u) + 1)); 3295 if (!_PyUnicode_WSTR(u)) { 3296 PyErr_NoMemory(); 
3297 return NULL; 3298 } 3299 if (!PyUnicode_IS_COMPACT_ASCII(u)) 3300 _PyUnicode_WSTR_LENGTH(u) = _PyUnicode_LENGTH(u); 3301 w = _PyUnicode_WSTR(u); 3302 wchar_end = w + _PyUnicode_LENGTH(u); 3303 3304 if (PyUnicode_KIND(u) == PyUnicode_1BYTE_KIND) { 3305 one_byte = PyUnicode_1BYTE_DATA(u); 3306 for (; w < wchar_end; ++one_byte, ++w) 3307 *w = *one_byte; 3308 /* null-terminate the wstr */ 3309 *w = 0; 3310 } 3311 else if (PyUnicode_KIND(u) == PyUnicode_2BYTE_KIND) { 3312#if SIZEOF_WCHAR_T == 4 3313 two_bytes = PyUnicode_2BYTE_DATA(u); 3314 for (; w < wchar_end; ++two_bytes, ++w) 3315 *w = *two_bytes; 3316 /* null-terminate the wstr */ 3317 *w = 0; 3318#else 3319 /* sizeof(wchar_t) == 2 */ 3320 PyObject_FREE(_PyUnicode_WSTR(u)); 3321 _PyUnicode_WSTR(u) = NULL; 3322 Py_FatalError("Impossible unicode object state, wstr " 3323 "and str should share memory already."); 3324 return NULL; 3325#endif 3326 } 3327 else { 3328 assert(0 && "This should never happen."); 3329 } 3330 } 3331 } 3332 if (size != NULL) 3333 *size = PyUnicode_WSTR_LENGTH(u); 3334 return _PyUnicode_WSTR(u); 3335} 3336 3337Py_UNICODE * 3338PyUnicode_AsUnicode(PyObject *unicode) 3339{ 3340 return PyUnicode_AsUnicodeAndSize(unicode, NULL); 3341} 3342 3343 3344Py_ssize_t 3345PyUnicode_GetSize(PyObject *unicode) 3346{ 3347 if (!PyUnicode_Check(unicode)) { 3348 PyErr_BadArgument(); 3349 goto onError; 3350 } 3351 return PyUnicode_GET_SIZE(unicode); 3352 3353 onError: 3354 return -1; 3355} 3356 3357Py_ssize_t 3358PyUnicode_GetLength(PyObject *unicode) 3359{ 3360 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) { 3361 PyErr_BadArgument(); 3362 return -1; 3363 } 3364 3365 return PyUnicode_GET_LENGTH(unicode); 3366} 3367 3368Py_UCS4 3369PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index) 3370{ 3371 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) { 3372 PyErr_BadArgument(); 3373 return (Py_UCS4)-1; 3374 } 3375 if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) { 3376 
PyErr_SetString(PyExc_IndexError, "string index out of range"); 3377 return (Py_UCS4)-1; 3378 } 3379 return PyUnicode_READ_CHAR(unicode, index); 3380} 3381 3382int 3383PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch) 3384{ 3385 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) { 3386 PyErr_BadArgument(); 3387 return -1; 3388 } 3389 if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) { 3390 PyErr_SetString(PyExc_IndexError, "string index out of range"); 3391 return -1; 3392 } 3393 if (_PyUnicode_Dirty(unicode)) 3394 return -1; 3395 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode), 3396 index, ch); 3397 return 0; 3398} 3399 3400const char * 3401PyUnicode_GetDefaultEncoding(void) 3402{ 3403 return "utf-8"; 3404} 3405 3406/* create or adjust a UnicodeDecodeError */ 3407static void 3408make_decode_exception(PyObject **exceptionObject, 3409 const char *encoding, 3410 const char *input, Py_ssize_t length, 3411 Py_ssize_t startpos, Py_ssize_t endpos, 3412 const char *reason) 3413{ 3414 if (*exceptionObject == NULL) { 3415 *exceptionObject = PyUnicodeDecodeError_Create( 3416 encoding, input, length, startpos, endpos, reason); 3417 } 3418 else { 3419 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos)) 3420 goto onError; 3421 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos)) 3422 goto onError; 3423 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason)) 3424 goto onError; 3425 } 3426 return; 3427 3428onError: 3429 Py_DECREF(*exceptionObject); 3430 *exceptionObject = NULL; 3431} 3432 3433/* error handling callback helper: 3434 build arguments, call the callback and check the arguments, 3435 if no exception occurred, copy the replacement to the output 3436 and adjust various state variables. 
3437 return 0 on success, -1 on error 3438*/ 3439 3440static int 3441unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler, 3442 const char *encoding, const char *reason, 3443 const char **input, const char **inend, Py_ssize_t *startinpos, 3444 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr, 3445 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr) 3446{ 3447 static char *argparse = "O!n;decoding error handler must return (str, int) tuple"; 3448 3449 PyObject *restuple = NULL; 3450 PyObject *repunicode = NULL; 3451 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output); 3452 Py_ssize_t insize; 3453 Py_ssize_t requiredsize; 3454 Py_ssize_t newpos; 3455 const Py_UNICODE *repptr; 3456 PyObject *inputobj = NULL; 3457 Py_ssize_t repsize; 3458 int res = -1; 3459 3460 if (*errorHandler == NULL) { 3461 *errorHandler = PyCodec_LookupError(errors); 3462 if (*errorHandler == NULL) 3463 goto onError; 3464 } 3465 3466 make_decode_exception(exceptionObject, 3467 encoding, 3468 *input, *inend - *input, 3469 *startinpos, *endinpos, 3470 reason); 3471 if (*exceptionObject == NULL) 3472 goto onError; 3473 3474 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL); 3475 if (restuple == NULL) 3476 goto onError; 3477 if (!PyTuple_Check(restuple)) { 3478 PyErr_SetString(PyExc_TypeError, &argparse[4]); 3479 goto onError; 3480 } 3481 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos)) 3482 goto onError; 3483 3484 /* Copy back the bytes variables, which might have been modified by the 3485 callback */ 3486 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject); 3487 if (!inputobj) 3488 goto onError; 3489 if (!PyBytes_Check(inputobj)) { 3490 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes"); 3491 } 3492 *input = PyBytes_AS_STRING(inputobj); 3493 insize = PyBytes_GET_SIZE(inputobj); 3494 *inend = *input + insize; 3495 /* we can DECREF safely, as the 
exception has another reference, 3496 so the object won't go away. */ 3497 Py_DECREF(inputobj); 3498 3499 if (newpos<0) 3500 newpos = insize+newpos; 3501 if (newpos<0 || newpos>insize) { 3502 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos); 3503 goto onError; 3504 } 3505 3506 /* need more space? (at least enough for what we 3507 have+the replacement+the rest of the string (starting 3508 at the new input position), so we won't have to check space 3509 when there are no errors in the rest of the string) */ 3510 repptr = PyUnicode_AS_UNICODE(repunicode); 3511 repsize = PyUnicode_GET_SIZE(repunicode); 3512 requiredsize = *outpos + repsize + insize-newpos; 3513 if (requiredsize > outsize) { 3514 if (requiredsize<2*outsize) 3515 requiredsize = 2*outsize; 3516 if (PyUnicode_Resize((PyObject**)output, requiredsize) < 0) 3517 goto onError; 3518 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos; 3519 } 3520 *endinpos = newpos; 3521 *inptr = *input + newpos; 3522 Py_UNICODE_COPY(*outptr, repptr, repsize); 3523 *outptr += repsize; 3524 *outpos += repsize; 3525 3526 /* we made it! */ 3527 res = 0; 3528 3529 onError: 3530 Py_XDECREF(restuple); 3531 return res; 3532} 3533 3534/* --- UTF-7 Codec -------------------------------------------------------- */ 3535 3536/* See RFC2152 for details. We encode conservatively and decode liberally. */ 3537 3538/* Three simple macros defining base-64. */ 3539 3540/* Is c a base-64 character? */ 3541 3542#define IS_BASE64(c) \ 3543 (((c) >= 'A' && (c) <= 'Z') || \ 3544 ((c) >= 'a' && (c) <= 'z') || \ 3545 ((c) >= '0' && (c) <= '9') || \ 3546 (c) == '+' || (c) == '/') 3547 3548/* given that c is a base-64 character, what is its base-64 value? */ 3549 3550#define FROM_BASE64(c) \ 3551 (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' : \ 3552 ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 : \ 3553 ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 : \ 3554 (c) == '+' ? 
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is only ever read, so it is declared const.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};
/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c        : the character to test; only values 1..127 can be direct
 * directO  : non-zero if "Set O" characters (category 1) are direct
 * directWS : non-zero if whitespace (category 2) is direct
 *
 * Every argument is parenthesized in the expansion so that compound
 * expressions such as `a || b` are evaluated as a unit.  Note that `c`
 * is evaluated more than once. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
*/ 3629 3630PyObject * 3631PyUnicode_DecodeUTF7Stateful(const char *s, 3632 Py_ssize_t size, 3633 const char *errors, 3634 Py_ssize_t *consumed) 3635{ 3636 const char *starts = s; 3637 Py_ssize_t startinpos; 3638 Py_ssize_t endinpos; 3639 Py_ssize_t outpos; 3640 const char *e; 3641 PyUnicodeObject *unicode; 3642 Py_UNICODE *p; 3643 const char *errmsg = ""; 3644 int inShift = 0; 3645 Py_UNICODE *shiftOutStart; 3646 unsigned int base64bits = 0; 3647 unsigned long base64buffer = 0; 3648 Py_UNICODE surrogate = 0; 3649 PyObject *errorHandler = NULL; 3650 PyObject *exc = NULL; 3651 3652 unicode = _PyUnicode_New(size); 3653 if (!unicode) 3654 return NULL; 3655 if (size == 0) { 3656 if (consumed) 3657 *consumed = 0; 3658 return (PyObject *)unicode; 3659 } 3660 3661 p = PyUnicode_AS_UNICODE(unicode); 3662 shiftOutStart = p; 3663 e = s + size; 3664 3665 while (s < e) { 3666 Py_UNICODE ch; 3667 restart: 3668 ch = (unsigned char) *s; 3669 3670 if (inShift) { /* in a base-64 section */ 3671 if (IS_BASE64(ch)) { /* consume a base-64 character */ 3672 base64buffer = (base64buffer << 6) | FROM_BASE64(ch); 3673 base64bits += 6; 3674 s++; 3675 if (base64bits >= 16) { 3676 /* we have enough bits for a UTF-16 value */ 3677 Py_UNICODE outCh = (Py_UNICODE) 3678 (base64buffer >> (base64bits-16)); 3679 base64bits -= 16; 3680 base64buffer &= (1 << base64bits) - 1; /* clear high bits */ 3681 if (surrogate) { 3682 /* expecting a second surrogate */ 3683 if (outCh >= 0xDC00 && outCh <= 0xDFFF) { 3684#ifdef Py_UNICODE_WIDE 3685 *p++ = (((surrogate & 0x3FF)<<10) 3686 | (outCh & 0x3FF)) + 0x10000; 3687#else 3688 *p++ = surrogate; 3689 *p++ = outCh; 3690#endif 3691 surrogate = 0; 3692 } 3693 else { 3694 surrogate = 0; 3695 errmsg = "second surrogate missing"; 3696 goto utf7Error; 3697 } 3698 } 3699 else if (outCh >= 0xD800 && outCh <= 0xDBFF) { 3700 /* first surrogate */ 3701 surrogate = outCh; 3702 } 3703 else if (outCh >= 0xDC00 && outCh <= 0xDFFF) { 3704 errmsg = "unexpected second surrogate"; 
3705 goto utf7Error; 3706 } 3707 else { 3708 *p++ = outCh; 3709 } 3710 } 3711 } 3712 else { /* now leaving a base-64 section */ 3713 inShift = 0; 3714 s++; 3715 if (surrogate) { 3716 errmsg = "second surrogate missing at end of shift sequence"; 3717 goto utf7Error; 3718 } 3719 if (base64bits > 0) { /* left-over bits */ 3720 if (base64bits >= 6) { 3721 /* We've seen at least one base-64 character */ 3722 errmsg = "partial character in shift sequence"; 3723 goto utf7Error; 3724 } 3725 else { 3726 /* Some bits remain; they should be zero */ 3727 if (base64buffer != 0) { 3728 errmsg = "non-zero padding bits in shift sequence"; 3729 goto utf7Error; 3730 } 3731 } 3732 } 3733 if (ch != '-') { 3734 /* '-' is absorbed; other terminating 3735 characters are preserved */ 3736 *p++ = ch; 3737 } 3738 } 3739 } 3740 else if ( ch == '+' ) { 3741 startinpos = s-starts; 3742 s++; /* consume '+' */ 3743 if (s < e && *s == '-') { /* '+-' encodes '+' */ 3744 s++; 3745 *p++ = '+'; 3746 } 3747 else { /* begin base64-encoded section */ 3748 inShift = 1; 3749 shiftOutStart = p; 3750 base64bits = 0; 3751 } 3752 } 3753 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */ 3754 *p++ = ch; 3755 s++; 3756 } 3757 else { 3758 startinpos = s-starts; 3759 s++; 3760 errmsg = "unexpected special character"; 3761 goto utf7Error; 3762 } 3763 continue; 3764utf7Error: 3765 outpos = p-PyUnicode_AS_UNICODE(unicode); 3766 endinpos = s-starts; 3767 if (unicode_decode_call_errorhandler( 3768 errors, &errorHandler, 3769 "utf7", errmsg, 3770 &starts, &e, &startinpos, &endinpos, &exc, &s, 3771 &unicode, &outpos, &p)) 3772 goto onError; 3773 } 3774 3775 /* end of string */ 3776 3777 if (inShift && !consumed) { /* in shift sequence, no more to follow */ 3778 /* if we're in an inconsistent state, that's an error */ 3779 if (surrogate || 3780 (base64bits >= 6) || 3781 (base64bits > 0 && base64buffer != 0)) { 3782 outpos = p-PyUnicode_AS_UNICODE(unicode); 3783 endinpos = size; 3784 if 
(unicode_decode_call_errorhandler( 3785 errors, &errorHandler, 3786 "utf7", "unterminated shift sequence", 3787 &starts, &e, &startinpos, &endinpos, &exc, &s, 3788 &unicode, &outpos, &p)) 3789 goto onError; 3790 if (s < e) 3791 goto restart; 3792 } 3793 } 3794 3795 /* return state */ 3796 if (consumed) { 3797 if (inShift) { 3798 p = shiftOutStart; /* back off output */ 3799 *consumed = startinpos; 3800 } 3801 else { 3802 *consumed = s-starts; 3803 } 3804 } 3805 3806 if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0) 3807 goto onError; 3808 3809 Py_XDECREF(errorHandler); 3810 Py_XDECREF(exc); 3811#ifndef DONT_MAKE_RESULT_READY 3812 if (_PyUnicode_READY_REPLACE(&unicode)) { 3813 Py_DECREF(unicode); 3814 return NULL; 3815 } 3816#endif 3817 assert(_PyUnicode_CheckConsistency(unicode, 1)); 3818 return (PyObject *)unicode; 3819 3820 onError: 3821 Py_XDECREF(errorHandler); 3822 Py_XDECREF(exc); 3823 Py_DECREF(unicode); 3824 return NULL; 3825} 3826 3827 3828PyObject * 3829PyUnicode_EncodeUTF7(const Py_UNICODE *s, 3830 Py_ssize_t size, 3831 int base64SetO, 3832 int base64WhiteSpace, 3833 const char *errors) 3834{ 3835 PyObject *v; 3836 /* It might be possible to tighten this worst case */ 3837 Py_ssize_t allocated = 8 * size; 3838 int inShift = 0; 3839 Py_ssize_t i = 0; 3840 unsigned int base64bits = 0; 3841 unsigned long base64buffer = 0; 3842 char * out; 3843 char * start; 3844 3845 if (size == 0) 3846 return PyBytes_FromStringAndSize(NULL, 0); 3847 3848 if (allocated / 8 != size) 3849 return PyErr_NoMemory(); 3850 3851 v = PyBytes_FromStringAndSize(NULL, allocated); 3852 if (v == NULL) 3853 return NULL; 3854 3855 start = out = PyBytes_AS_STRING(v); 3856 for (;i < size; ++i) { 3857 Py_UNICODE ch = s[i]; 3858 3859 if (inShift) { 3860 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) { 3861 /* shifting out */ 3862 if (base64bits) { /* output remaining bits */ 3863 *out++ = TO_BASE64(base64buffer << (6-base64bits)); 3864 base64buffer = 0; 
/* Lookup table indexed by the first byte of a UTF-8 sequence.  It is only
   ever read, so it is declared const. */
static
const char utf8_code_length[256] = {
    /* Map UTF-8 encoded prefix byte to sequence length.  Zero means
       illegal prefix.  See RFC 3629 for details */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-F4 + F5-FF */
};
3978 3979 This function does check basic UTF-8 sanity, it does however NOT CHECK 3980 if the string contains surrogates, and if all continuation bytes are 3981 within the correct ranges, these checks are performed in 3982 PyUnicode_DecodeUTF8Stateful. 3983 3984 If it sets has_errors to 1, it means the value of unicode_size and max_char 3985 will be bogus and you should not rely on useful information in them. 3986 */ 3987static Py_UCS4 3988utf8_max_char_size_and_has_errors(const char *s, Py_ssize_t string_size, 3989 Py_ssize_t *unicode_size, Py_ssize_t* consumed, 3990 int *has_errors) 3991{ 3992 Py_ssize_t n; 3993 Py_ssize_t char_count = 0; 3994 Py_UCS4 max_char = 127, new_max; 3995 Py_UCS4 upper_bound; 3996 const unsigned char *p = (const unsigned char *)s; 3997 const unsigned char *end = p + string_size; 3998 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK); 3999 int err = 0; 4000 4001 for (; p < end && !err; ++p, ++char_count) { 4002 /* Only check value if it's not a ASCII char... */ 4003 if (*p < 0x80) { 4004 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for 4005 an explanation. */ 4006 if (!((size_t) p & LONG_PTR_MASK)) { 4007 /* Help register allocation */ 4008 register const unsigned char *_p = p; 4009 while (_p < aligned_end) { 4010 unsigned long value = *(unsigned long *) _p; 4011 if (value & ASCII_CHAR_MASK) 4012 break; 4013 _p += SIZEOF_LONG; 4014 char_count += SIZEOF_LONG; 4015 } 4016 p = _p; 4017 if (p == end) 4018 break; 4019 } 4020 } 4021 if (*p >= 0x80) { 4022 n = utf8_code_length[*p]; 4023 new_max = max_char; 4024 switch (n) { 4025 /* invalid start byte */ 4026 case 0: 4027 err = 1; 4028 break; 4029 case 2: 4030 /* Code points between 0x00FF and 0x07FF inclusive. 
4031 Approximate the upper bound of the code point, 4032 if this flips over 255 we can be sure it will be more 4033 than 255 and the string will need 2 bytes per code coint, 4034 if it stays under or equal to 255, we can be sure 1 byte 4035 is enough. 4036 ((*p & 0b00011111) << 6) | 0b00111111 */ 4037 upper_bound = ((*p & 0x1F) << 6) | 0x3F; 4038 if (max_char < upper_bound) 4039 new_max = upper_bound; 4040 /* Ensure we track at least that we left ASCII space. */ 4041 if (new_max < 128) 4042 new_max = 128; 4043 break; 4044 case 3: 4045 /* Between 0x0FFF and 0xFFFF inclusive, so values are 4046 always > 255 and <= 65535 and will always need 2 bytes. */ 4047 if (max_char < 65535) 4048 new_max = 65535; 4049 break; 4050 case 4: 4051 /* Code point will be above 0xFFFF for sure in this case. */ 4052 new_max = 65537; 4053 break; 4054 /* Internal error, this should be caught by the first if */ 4055 case 1: 4056 default: 4057 assert(0 && "Impossible case in utf8_max_char_and_size"); 4058 err = 1; 4059 } 4060 /* Instead of number of overall bytes for this code point, 4061 n contains the number of following bytes: */ 4062 --n; 4063 /* Check if the follow up chars are all valid continuation bytes */ 4064 if (n >= 1) { 4065 const unsigned char *cont; 4066 if ((p + n) >= end) { 4067 if (consumed == 0) 4068 /* incomplete data, non-incremental decoding */ 4069 err = 1; 4070 break; 4071 } 4072 for (cont = p + 1; cont < (p + n); ++cont) { 4073 if ((*cont & 0xc0) != 0x80) { 4074 err = 1; 4075 break; 4076 } 4077 } 4078 p += n; 4079 } 4080 else 4081 err = 1; 4082 max_char = new_max; 4083 } 4084 } 4085 4086 if (unicode_size) 4087 *unicode_size = char_count; 4088 if (has_errors) 4089 *has_errors = err; 4090 return max_char; 4091} 4092 4093/* Similar to PyUnicode_WRITE but can also write into wstr field 4094 of the legacy unicode representation */ 4095#define WRITE_FLEXIBLE_OR_WSTR(kind, buf, index, value) \ 4096 do { \ 4097 const int k_ = (kind); \ 4098 if (k_ == PyUnicode_WCHAR_KIND) \ 
4099 ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value); \ 4100 else if (k_ == PyUnicode_1BYTE_KIND) \ 4101 ((unsigned char *)(buf))[(index)] = (unsigned char)(value); \ 4102 else if (k_ == PyUnicode_2BYTE_KIND) \ 4103 ((Py_UCS2 *)(buf))[(index)] = (Py_UCS2)(value); \ 4104 else \ 4105 ((Py_UCS4 *)(buf))[(index)] = (Py_UCS4)(value); \ 4106 } while (0) 4107 4108PyObject * 4109PyUnicode_DecodeUTF8Stateful(const char *s, 4110 Py_ssize_t size, 4111 const char *errors, 4112 Py_ssize_t *consumed) 4113{ 4114 const char *starts = s; 4115 int n; 4116 int k; 4117 Py_ssize_t startinpos; 4118 Py_ssize_t endinpos; 4119 const char *e, *aligned_end; 4120 PyUnicodeObject *unicode; 4121 const char *errmsg = ""; 4122 PyObject *errorHandler = NULL; 4123 PyObject *exc = NULL; 4124 Py_UCS4 maxchar = 0; 4125 Py_ssize_t unicode_size; 4126 Py_ssize_t i; 4127 int kind; 4128 void *data; 4129 int has_errors; 4130 Py_UNICODE *error_outptr; 4131#if SIZEOF_WCHAR_T == 2 4132 Py_ssize_t wchar_offset = 0; 4133#endif 4134 4135 if (size == 0) { 4136 if (consumed) 4137 *consumed = 0; 4138 return (PyObject *)PyUnicode_New(0, 0); 4139 } 4140 maxchar = utf8_max_char_size_and_has_errors(s, size, &unicode_size, 4141 consumed, &has_errors); 4142 if (has_errors) { 4143 unicode = _PyUnicode_New(size); 4144 if (!unicode) 4145 return NULL; 4146 kind = PyUnicode_WCHAR_KIND; 4147 data = PyUnicode_AS_UNICODE(unicode); 4148 assert(data != NULL); 4149 } 4150 else { 4151 unicode = (PyUnicodeObject *)PyUnicode_New(unicode_size, maxchar); 4152 if (!unicode) 4153 return NULL; 4154 /* When the string is ASCII only, just use memcpy and return. 4155 unicode_size may be != size if there is an incomplete UTF-8 4156 sequence at the end of the ASCII block. 
*/ 4157 if (maxchar < 128 && size == unicode_size) { 4158 Py_MEMCPY(PyUnicode_1BYTE_DATA(unicode), s, unicode_size); 4159 return (PyObject *)unicode; 4160 } 4161 kind = PyUnicode_KIND(unicode); 4162 data = PyUnicode_DATA(unicode); 4163 } 4164 /* Unpack UTF-8 encoded data */ 4165 i = 0; 4166 e = s + size; 4167 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK); 4168 4169 while (s < e) { 4170 Py_UCS4 ch = (unsigned char)*s; 4171 4172 if (ch < 0x80) { 4173 /* Fast path for runs of ASCII characters. Given that common UTF-8 4174 input will consist of an overwhelming majority of ASCII 4175 characters, we try to optimize for this case by checking 4176 as many characters as a C 'long' can contain. 4177 First, check if we can do an aligned read, as most CPUs have 4178 a penalty for unaligned reads. 4179 */ 4180 if (!((size_t) s & LONG_PTR_MASK)) { 4181 /* Help register allocation */ 4182 register const char *_s = s; 4183 register Py_ssize_t _i = i; 4184 while (_s < aligned_end) { 4185 /* Read a whole long at a time (either 4 or 8 bytes), 4186 and do a fast unrolled copy if it only contains ASCII 4187 characters. 
*/ 4188 unsigned long value = *(unsigned long *) _s; 4189 if (value & ASCII_CHAR_MASK) 4190 break; 4191 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+0, _s[0]); 4192 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+1, _s[1]); 4193 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+2, _s[2]); 4194 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+3, _s[3]); 4195#if (SIZEOF_LONG == 8) 4196 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+4, _s[4]); 4197 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+5, _s[5]); 4198 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+6, _s[6]); 4199 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+7, _s[7]); 4200#endif 4201 _s += SIZEOF_LONG; 4202 _i += SIZEOF_LONG; 4203 } 4204 s = _s; 4205 i = _i; 4206 if (s == e) 4207 break; 4208 ch = (unsigned char)*s; 4209 } 4210 } 4211 4212 if (ch < 0x80) { 4213 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch); 4214 s++; 4215 continue; 4216 } 4217 4218 n = utf8_code_length[ch]; 4219 4220 if (s + n > e) { 4221 if (consumed) 4222 break; 4223 else { 4224 errmsg = "unexpected end of data"; 4225 startinpos = s-starts; 4226 endinpos = startinpos+1; 4227 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++) 4228 endinpos++; 4229 goto utf8Error; 4230 } 4231 } 4232 4233 switch (n) { 4234 4235 case 0: 4236 errmsg = "invalid start byte"; 4237 startinpos = s-starts; 4238 endinpos = startinpos+1; 4239 goto utf8Error; 4240 4241 case 1: 4242 errmsg = "internal error"; 4243 startinpos = s-starts; 4244 endinpos = startinpos+1; 4245 goto utf8Error; 4246 4247 case 2: 4248 if ((s[1] & 0xc0) != 0x80) { 4249 errmsg = "invalid continuation byte"; 4250 startinpos = s-starts; 4251 endinpos = startinpos + 1; 4252 goto utf8Error; 4253 } 4254 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f); 4255 assert ((ch > 0x007F) && (ch <= 0x07FF)); 4256 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch); 4257 break; 4258 4259 case 3: 4260 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf 4261 will result in surrogates in range d800-dfff. Surrogates are 4262 not valid UTF-8 so they are rejected. 
4263 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf 4264 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */ 4265 if ((s[1] & 0xc0) != 0x80 || 4266 (s[2] & 0xc0) != 0x80 || 4267 ((unsigned char)s[0] == 0xE0 && 4268 (unsigned char)s[1] < 0xA0) || 4269 ((unsigned char)s[0] == 0xED && 4270 (unsigned char)s[1] > 0x9F)) { 4271 errmsg = "invalid continuation byte"; 4272 startinpos = s-starts; 4273 endinpos = startinpos + 1; 4274 4275 /* if s[1] first two bits are 1 and 0, then the invalid 4276 continuation byte is s[2], so increment endinpos by 1, 4277 if not, s[1] is invalid and endinpos doesn't need to 4278 be incremented. */ 4279 if ((s[1] & 0xC0) == 0x80) 4280 endinpos++; 4281 goto utf8Error; 4282 } 4283 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f); 4284 assert ((ch > 0x07FF) && (ch <= 0xFFFF)); 4285 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch); 4286 break; 4287 4288 case 4: 4289 if ((s[1] & 0xc0) != 0x80 || 4290 (s[2] & 0xc0) != 0x80 || 4291 (s[3] & 0xc0) != 0x80 || 4292 ((unsigned char)s[0] == 0xF0 && 4293 (unsigned char)s[1] < 0x90) || 4294 ((unsigned char)s[0] == 0xF4 && 4295 (unsigned char)s[1] > 0x8F)) { 4296 errmsg = "invalid continuation byte"; 4297 startinpos = s-starts; 4298 endinpos = startinpos + 1; 4299 if ((s[1] & 0xC0) == 0x80) { 4300 endinpos++; 4301 if ((s[2] & 0xC0) == 0x80) 4302 endinpos++; 4303 } 4304 goto utf8Error; 4305 } 4306 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) + 4307 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f); 4308 assert ((ch > 0xFFFF) && (ch <= 0x10ffff)); 4309 4310 /* If the string is flexible or we have native UCS-4, write 4311 directly.. 
*/ 4312 if (sizeof(Py_UNICODE) > 2 || kind != PyUnicode_WCHAR_KIND) 4313 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch); 4314 4315 else { 4316 /* compute and append the two surrogates: */ 4317 4318 /* translate from 10000..10FFFF to 0..FFFF */ 4319 ch -= 0x10000; 4320 4321 /* high surrogate = top 10 bits added to D800 */ 4322 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, 4323 (Py_UNICODE)(0xD800 + (ch >> 10))); 4324 4325 /* low surrogate = bottom 10 bits added to DC00 */ 4326 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, 4327 (Py_UNICODE)(0xDC00 + (ch & 0x03FF))); 4328 } 4329#if SIZEOF_WCHAR_T == 2 4330 wchar_offset++; 4331#endif 4332 break; 4333 } 4334 s += n; 4335 continue; 4336 4337 utf8Error: 4338 /* If this is not yet a resizable string, make it one.. */ 4339 if (kind != PyUnicode_WCHAR_KIND) { 4340 const Py_UNICODE *u; 4341 PyUnicodeObject *new_unicode = _PyUnicode_New(size); 4342 if (!new_unicode) 4343 goto onError; 4344 u = PyUnicode_AsUnicode((PyObject *)unicode); 4345 if (!u) 4346 goto onError; 4347#if SIZEOF_WCHAR_T == 2 4348 i += wchar_offset; 4349#endif 4350 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(new_unicode), u, i); 4351 Py_DECREF(unicode); 4352 unicode = new_unicode; 4353 kind = 0; 4354 data = PyUnicode_AS_UNICODE(new_unicode); 4355 assert(data != NULL); 4356 } 4357 error_outptr = PyUnicode_AS_UNICODE(unicode) + i; 4358 if (unicode_decode_call_errorhandler( 4359 errors, &errorHandler, 4360 "utf8", errmsg, 4361 &starts, &e, &startinpos, &endinpos, &exc, &s, 4362 &unicode, &i, &error_outptr)) 4363 goto onError; 4364 /* Update data because unicode_decode_call_errorhandler might have 4365 re-created or resized the unicode object. 
*/ 4366 data = PyUnicode_AS_UNICODE(unicode); 4367 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK); 4368 } 4369 /* Ensure the unicode_size calculation above was correct: */ 4370 assert(kind == PyUnicode_WCHAR_KIND || i == unicode_size); 4371 4372 if (consumed) 4373 *consumed = s-starts; 4374 4375 /* Adjust length and ready string when it contained errors and 4376 is of the old resizable kind. */ 4377 if (kind == PyUnicode_WCHAR_KIND) { 4378 if (PyUnicode_Resize((PyObject**)&unicode, i) < 0) 4379 goto onError; 4380 } 4381 4382 Py_XDECREF(errorHandler); 4383 Py_XDECREF(exc); 4384#ifndef DONT_MAKE_RESULT_READY 4385 if (_PyUnicode_READY_REPLACE(&unicode)) { 4386 Py_DECREF(unicode); 4387 return NULL; 4388 } 4389#endif 4390 assert(_PyUnicode_CheckConsistency(unicode, 1)); 4391 return (PyObject *)unicode; 4392 4393 onError: 4394 Py_XDECREF(errorHandler); 4395 Py_XDECREF(exc); 4396 Py_DECREF(unicode); 4397 return NULL; 4398} 4399 4400#undef WRITE_FLEXIBLE_OR_WSTR 4401 4402#ifdef __APPLE__ 4403 4404/* Simplified UTF-8 decoder using surrogateescape error handler, 4405 used to decode the command line arguments on Mac OS X. 
*/ 4406 4407wchar_t* 4408_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size) 4409{ 4410 int n; 4411 const char *e; 4412 wchar_t *unicode, *p; 4413 4414 /* Note: size will always be longer than the resulting Unicode 4415 character count */ 4416 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) { 4417 PyErr_NoMemory(); 4418 return NULL; 4419 } 4420 unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t)); 4421 if (!unicode) 4422 return NULL; 4423 4424 /* Unpack UTF-8 encoded data */ 4425 p = unicode; 4426 e = s + size; 4427 while (s < e) { 4428 Py_UCS4 ch = (unsigned char)*s; 4429 4430 if (ch < 0x80) { 4431 *p++ = (wchar_t)ch; 4432 s++; 4433 continue; 4434 } 4435 4436 n = utf8_code_length[ch]; 4437 if (s + n > e) { 4438 goto surrogateescape; 4439 } 4440 4441 switch (n) { 4442 case 0: 4443 case 1: 4444 goto surrogateescape; 4445 4446 case 2: 4447 if ((s[1] & 0xc0) != 0x80) 4448 goto surrogateescape; 4449 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f); 4450 assert ((ch > 0x007F) && (ch <= 0x07FF)); 4451 *p++ = (wchar_t)ch; 4452 break; 4453 4454 case 3: 4455 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf 4456 will result in surrogates in range d800-dfff. Surrogates are 4457 not valid UTF-8 so they are rejected. 
4458 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf 4459 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */ 4460 if ((s[1] & 0xc0) != 0x80 || 4461 (s[2] & 0xc0) != 0x80 || 4462 ((unsigned char)s[0] == 0xE0 && 4463 (unsigned char)s[1] < 0xA0) || 4464 ((unsigned char)s[0] == 0xED && 4465 (unsigned char)s[1] > 0x9F)) { 4466 4467 goto surrogateescape; 4468 } 4469 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f); 4470 assert ((ch > 0x07FF) && (ch <= 0xFFFF)); 4471 *p++ = (wchar_t)ch; 4472 break; 4473 4474 case 4: 4475 if ((s[1] & 0xc0) != 0x80 || 4476 (s[2] & 0xc0) != 0x80 || 4477 (s[3] & 0xc0) != 0x80 || 4478 ((unsigned char)s[0] == 0xF0 && 4479 (unsigned char)s[1] < 0x90) || 4480 ((unsigned char)s[0] == 0xF4 && 4481 (unsigned char)s[1] > 0x8F)) { 4482 goto surrogateescape; 4483 } 4484 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) + 4485 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f); 4486 assert ((ch > 0xFFFF) && (ch <= 0x10ffff)); 4487 4488#if SIZEOF_WCHAR_T == 4 4489 *p++ = (wchar_t)ch; 4490#else 4491 /* compute and append the two surrogates: */ 4492 4493 /* translate from 10000..10FFFF to 0..FFFF */ 4494 ch -= 0x10000; 4495 4496 /* high surrogate = top 10 bits added to D800 */ 4497 *p++ = (wchar_t)(0xD800 + (ch >> 10)); 4498 4499 /* low surrogate = bottom 10 bits added to DC00 */ 4500 *p++ = (wchar_t)(0xDC00 + (ch & 0x03FF)); 4501#endif 4502 break; 4503 } 4504 s += n; 4505 continue; 4506 4507 surrogateescape: 4508 *p++ = 0xDC00 + ch; 4509 s++; 4510 } 4511 *p = L'\0'; 4512 return unicode; 4513} 4514 4515#endif /* __APPLE__ */ 4516 4517/* Primary internal function which creates utf8 encoded bytes objects. 4518 4519 Allocation strategy: if the string is short, convert into a stack buffer 4520 and allocate exactly as much space needed at the end. Else allocate the 4521 maximum possible needed (4 result bytes per Unicode character), and return 4522 the excess memory at the end. 
4523*/ 4524PyObject * 4525_PyUnicode_AsUTF8String(PyObject *obj, const char *errors) 4526{ 4527#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */ 4528 4529 Py_ssize_t i; /* index into s of next input byte */ 4530 PyObject *result; /* result string object */ 4531 char *p; /* next free byte in output buffer */ 4532 Py_ssize_t nallocated; /* number of result bytes allocated */ 4533 Py_ssize_t nneeded; /* number of result bytes needed */ 4534 char stackbuf[MAX_SHORT_UNICHARS * 4]; 4535 PyObject *errorHandler = NULL; 4536 PyObject *exc = NULL; 4537 int kind; 4538 void *data; 4539 Py_ssize_t size; 4540 PyUnicodeObject *unicode = (PyUnicodeObject *)obj; 4541#if SIZEOF_WCHAR_T == 2 4542 Py_ssize_t wchar_offset = 0; 4543#endif 4544 4545 if (!PyUnicode_Check(unicode)) { 4546 PyErr_BadArgument(); 4547 return NULL; 4548 } 4549 4550 if (PyUnicode_READY(unicode) == -1) 4551 return NULL; 4552 4553 if (PyUnicode_UTF8(unicode)) 4554 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode), 4555 PyUnicode_UTF8_LENGTH(unicode)); 4556 4557 kind = PyUnicode_KIND(unicode); 4558 data = PyUnicode_DATA(unicode); 4559 size = PyUnicode_GET_LENGTH(unicode); 4560 4561 assert(size >= 0); 4562 4563 if (size <= MAX_SHORT_UNICHARS) { 4564 /* Write into the stack buffer; nallocated can't overflow. 4565 * At the end, we'll allocate exactly as much heap space as it 4566 * turns out we need. 4567 */ 4568 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int); 4569 result = NULL; /* will allocate after we're done */ 4570 p = stackbuf; 4571 } 4572 else { 4573 /* Overallocate on the heap, and give the excess back at the end. */ 4574 nallocated = size * 4; 4575 if (nallocated / 4 != size) /* overflow! 
*/ 4576 return PyErr_NoMemory(); 4577 result = PyBytes_FromStringAndSize(NULL, nallocated); 4578 if (result == NULL) 4579 return NULL; 4580 p = PyBytes_AS_STRING(result); 4581 } 4582 4583 for (i = 0; i < size;) { 4584 Py_UCS4 ch = PyUnicode_READ(kind, data, i++); 4585 4586 if (ch < 0x80) 4587 /* Encode ASCII */ 4588 *p++ = (char) ch; 4589 4590 else if (ch < 0x0800) { 4591 /* Encode Latin-1 */ 4592 *p++ = (char)(0xc0 | (ch >> 6)); 4593 *p++ = (char)(0x80 | (ch & 0x3f)); 4594 } else if (0xD800 <= ch && ch <= 0xDFFF) { 4595 Py_ssize_t newpos; 4596 PyObject *rep; 4597 Py_ssize_t repsize, k, startpos; 4598 startpos = i-1; 4599#if SIZEOF_WCHAR_T == 2 4600 startpos += wchar_offset; 4601#endif 4602 rep = unicode_encode_call_errorhandler( 4603 errors, &errorHandler, "utf-8", "surrogates not allowed", 4604 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode), 4605 &exc, startpos, startpos+1, &newpos); 4606 if (!rep) 4607 goto error; 4608 4609 if (PyBytes_Check(rep)) 4610 repsize = PyBytes_GET_SIZE(rep); 4611 else 4612 repsize = PyUnicode_GET_SIZE(rep); 4613 4614 if (repsize > 4) { 4615 Py_ssize_t offset; 4616 4617 if (result == NULL) 4618 offset = p - stackbuf; 4619 else 4620 offset = p - PyBytes_AS_STRING(result); 4621 4622 if (nallocated > PY_SSIZE_T_MAX - repsize + 4) { 4623 /* integer overflow */ 4624 PyErr_NoMemory(); 4625 goto error; 4626 } 4627 nallocated += repsize - 4; 4628 if (result != NULL) { 4629 if (_PyBytes_Resize(&result, nallocated) < 0) 4630 goto error; 4631 } else { 4632 result = PyBytes_FromStringAndSize(NULL, nallocated); 4633 if (result == NULL) 4634 goto error; 4635 Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset); 4636 } 4637 p = PyBytes_AS_STRING(result) + offset; 4638 } 4639 4640 if (PyBytes_Check(rep)) { 4641 char *prep = PyBytes_AS_STRING(rep); 4642 for(k = repsize; k > 0; k--) 4643 *p++ = *prep++; 4644 } else /* rep is unicode */ { 4645 const Py_UNICODE *prep = PyUnicode_AS_UNICODE(rep); 4646 Py_UNICODE c; 4647 4648 for(k=0; 
k<repsize; k++) { 4649 c = prep[k]; 4650 if (0x80 <= c) { 4651 raise_encode_exception(&exc, "utf-8", 4652 PyUnicode_AS_UNICODE(unicode), 4653 size, i-1, i, 4654 "surrogates not allowed"); 4655 goto error; 4656 } 4657 *p++ = (char)prep[k]; 4658 } 4659 } 4660 Py_DECREF(rep); 4661 } else if (ch < 0x10000) { 4662 *p++ = (char)(0xe0 | (ch >> 12)); 4663 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); 4664 *p++ = (char)(0x80 | (ch & 0x3f)); 4665 } else /* ch >= 0x10000 */ { 4666 /* Encode UCS4 Unicode ordinals */ 4667 *p++ = (char)(0xf0 | (ch >> 18)); 4668 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f)); 4669 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); 4670 *p++ = (char)(0x80 | (ch & 0x3f)); 4671#if SIZEOF_WCHAR_T == 2 4672 wchar_offset++; 4673#endif 4674 } 4675 } 4676 4677 if (result == NULL) { 4678 /* This was stack allocated. */ 4679 nneeded = p - stackbuf; 4680 assert(nneeded <= nallocated); 4681 result = PyBytes_FromStringAndSize(stackbuf, nneeded); 4682 } 4683 else { 4684 /* Cut back to size actually needed. 
*/ 4685 nneeded = p - PyBytes_AS_STRING(result); 4686 assert(nneeded <= nallocated); 4687 _PyBytes_Resize(&result, nneeded); 4688 } 4689 4690 Py_XDECREF(errorHandler); 4691 Py_XDECREF(exc); 4692 return result; 4693 error: 4694 Py_XDECREF(errorHandler); 4695 Py_XDECREF(exc); 4696 Py_XDECREF(result); 4697 return NULL; 4698 4699#undef MAX_SHORT_UNICHARS 4700} 4701 4702PyObject * 4703PyUnicode_EncodeUTF8(const Py_UNICODE *s, 4704 Py_ssize_t size, 4705 const char *errors) 4706{ 4707 PyObject *v, *unicode; 4708 4709 unicode = PyUnicode_FromUnicode(s, size); 4710 if (unicode == NULL) 4711 return NULL; 4712 v = _PyUnicode_AsUTF8String(unicode, errors); 4713 Py_DECREF(unicode); 4714 return v; 4715} 4716 4717PyObject * 4718PyUnicode_AsUTF8String(PyObject *unicode) 4719{ 4720 return _PyUnicode_AsUTF8String(unicode, NULL); 4721} 4722 4723/* --- UTF-32 Codec ------------------------------------------------------- */ 4724 4725PyObject * 4726PyUnicode_DecodeUTF32(const char *s, 4727 Py_ssize_t size, 4728 const char *errors, 4729 int *byteorder) 4730{ 4731 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL); 4732} 4733 4734PyObject * 4735PyUnicode_DecodeUTF32Stateful(const char *s, 4736 Py_ssize_t size, 4737 const char *errors, 4738 int *byteorder, 4739 Py_ssize_t *consumed) 4740{ 4741 const char *starts = s; 4742 Py_ssize_t startinpos; 4743 Py_ssize_t endinpos; 4744 Py_ssize_t outpos; 4745 PyUnicodeObject *unicode; 4746 Py_UNICODE *p; 4747#ifndef Py_UNICODE_WIDE 4748 int pairs = 0; 4749 const unsigned char *qq; 4750#else 4751 const int pairs = 0; 4752#endif 4753 const unsigned char *q, *e; 4754 int bo = 0; /* assume native ordering by default */ 4755 const char *errmsg = ""; 4756 /* Offsets from q for retrieving bytes in the right order. 
*/ 4757#ifdef BYTEORDER_IS_LITTLE_ENDIAN 4758 int iorder[] = {0, 1, 2, 3}; 4759#else 4760 int iorder[] = {3, 2, 1, 0}; 4761#endif 4762 PyObject *errorHandler = NULL; 4763 PyObject *exc = NULL; 4764 4765 q = (unsigned char *)s; 4766 e = q + size; 4767 4768 if (byteorder) 4769 bo = *byteorder; 4770 4771 /* Check for BOM marks (U+FEFF) in the input and adjust current 4772 byte order setting accordingly. In native mode, the leading BOM 4773 mark is skipped, in all other modes, it is copied to the output 4774 stream as-is (giving a ZWNBSP character). */ 4775 if (bo == 0) { 4776 if (size >= 4) { 4777 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) | 4778 (q[iorder[1]] << 8) | q[iorder[0]]; 4779#ifdef BYTEORDER_IS_LITTLE_ENDIAN 4780 if (bom == 0x0000FEFF) { 4781 q += 4; 4782 bo = -1; 4783 } 4784 else if (bom == 0xFFFE0000) { 4785 q += 4; 4786 bo = 1; 4787 } 4788#else 4789 if (bom == 0x0000FEFF) { 4790 q += 4; 4791 bo = 1; 4792 } 4793 else if (bom == 0xFFFE0000) { 4794 q += 4; 4795 bo = -1; 4796 } 4797#endif 4798 } 4799 } 4800 4801 if (bo == -1) { 4802 /* force LE */ 4803 iorder[0] = 0; 4804 iorder[1] = 1; 4805 iorder[2] = 2; 4806 iorder[3] = 3; 4807 } 4808 else if (bo == 1) { 4809 /* force BE */ 4810 iorder[0] = 3; 4811 iorder[1] = 2; 4812 iorder[2] = 1; 4813 iorder[3] = 0; 4814 } 4815 4816 /* On narrow builds we split characters outside the BMP into two 4817 codepoints => count how much extra space we need. */ 4818#ifndef Py_UNICODE_WIDE 4819 for (qq = q; qq < e; qq += 4) 4820 if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0) 4821 pairs++; 4822#endif 4823 4824 /* This might be one to much, because of a BOM */ 4825 unicode = _PyUnicode_New((size+3)/4+pairs); 4826 if (!unicode) 4827 return NULL; 4828 if (size == 0) 4829 return (PyObject *)unicode; 4830 4831 /* Unpack UTF-32 encoded data */ 4832 p = PyUnicode_AS_UNICODE(unicode); 4833 4834 while (q < e) { 4835 Py_UCS4 ch; 4836 /* remaining bytes at the end? 
(size should be divisible by 4) */ 4837 if (e-q<4) { 4838 if (consumed) 4839 break; 4840 errmsg = "truncated data"; 4841 startinpos = ((const char *)q)-starts; 4842 endinpos = ((const char *)e)-starts; 4843 goto utf32Error; 4844 /* The remaining input chars are ignored if the callback 4845 chooses to skip the input */ 4846 } 4847 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) | 4848 (q[iorder[1]] << 8) | q[iorder[0]]; 4849 4850 if (ch >= 0x110000) 4851 { 4852 errmsg = "codepoint not in range(0x110000)"; 4853 startinpos = ((const char *)q)-starts; 4854 endinpos = startinpos+4; 4855 goto utf32Error; 4856 } 4857#ifndef Py_UNICODE_WIDE 4858 if (ch >= 0x10000) 4859 { 4860 *p++ = 0xD800 | ((ch-0x10000) >> 10); 4861 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF); 4862 } 4863 else 4864#endif 4865 *p++ = ch; 4866 q += 4; 4867 continue; 4868 utf32Error: 4869 outpos = p-PyUnicode_AS_UNICODE(unicode); 4870 if (unicode_decode_call_errorhandler( 4871 errors, &errorHandler, 4872 "utf32", errmsg, 4873 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q, 4874 &unicode, &outpos, &p)) 4875 goto onError; 4876 } 4877 4878 if (byteorder) 4879 *byteorder = bo; 4880 4881 if (consumed) 4882 *consumed = (const char *)q-starts; 4883 4884 /* Adjust length */ 4885 if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0) 4886 goto onError; 4887 4888 Py_XDECREF(errorHandler); 4889 Py_XDECREF(exc); 4890#ifndef DONT_MAKE_RESULT_READY 4891 if (_PyUnicode_READY_REPLACE(&unicode)) { 4892 Py_DECREF(unicode); 4893 return NULL; 4894 } 4895#endif 4896 assert(_PyUnicode_CheckConsistency(unicode, 1)); 4897 return (PyObject *)unicode; 4898 4899 onError: 4900 Py_DECREF(unicode); 4901 Py_XDECREF(errorHandler); 4902 Py_XDECREF(exc); 4903 return NULL; 4904} 4905 4906PyObject * 4907PyUnicode_EncodeUTF32(const Py_UNICODE *s, 4908 Py_ssize_t size, 4909 const char *errors, 4910 int byteorder) 4911{ 4912 PyObject *v; 4913 unsigned char *p; 4914 Py_ssize_t nsize, bytesize; 
4915#ifndef Py_UNICODE_WIDE 4916 Py_ssize_t i, pairs; 4917#else 4918 const int pairs = 0; 4919#endif 4920 /* Offsets from p for storing byte pairs in the right order. */ 4921#ifdef BYTEORDER_IS_LITTLE_ENDIAN 4922 int iorder[] = {0, 1, 2, 3}; 4923#else 4924 int iorder[] = {3, 2, 1, 0}; 4925#endif 4926 4927#define STORECHAR(CH) \ 4928 do { \ 4929 p[iorder[3]] = ((CH) >> 24) & 0xff; \ 4930 p[iorder[2]] = ((CH) >> 16) & 0xff; \ 4931 p[iorder[1]] = ((CH) >> 8) & 0xff; \ 4932 p[iorder[0]] = (CH) & 0xff; \ 4933 p += 4; \ 4934 } while(0) 4935 4936 /* In narrow builds we can output surrogate pairs as one codepoint, 4937 so we need less space. */ 4938#ifndef Py_UNICODE_WIDE 4939 for (i = pairs = 0; i < size-1; i++) 4940 if (0xD800 <= s[i] && s[i] <= 0xDBFF && 4941 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF) 4942 pairs++; 4943#endif 4944 nsize = (size - pairs + (byteorder == 0)); 4945 bytesize = nsize * 4; 4946 if (bytesize / 4 != nsize) 4947 return PyErr_NoMemory(); 4948 v = PyBytes_FromStringAndSize(NULL, bytesize); 4949 if (v == NULL) 4950 return NULL; 4951 4952 p = (unsigned char *)PyBytes_AS_STRING(v); 4953 if (byteorder == 0) 4954 STORECHAR(0xFEFF); 4955 if (size == 0) 4956 goto done; 4957 4958 if (byteorder == -1) { 4959 /* force LE */ 4960 iorder[0] = 0; 4961 iorder[1] = 1; 4962 iorder[2] = 2; 4963 iorder[3] = 3; 4964 } 4965 else if (byteorder == 1) { 4966 /* force BE */ 4967 iorder[0] = 3; 4968 iorder[1] = 2; 4969 iorder[2] = 1; 4970 iorder[3] = 0; 4971 } 4972 4973 while (size-- > 0) { 4974 Py_UCS4 ch = *s++; 4975#ifndef Py_UNICODE_WIDE 4976 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) { 4977 Py_UCS4 ch2 = *s; 4978 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) { 4979 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000; 4980 s++; 4981 size--; 4982 } 4983 } 4984#endif 4985 STORECHAR(ch); 4986 } 4987 4988 done: 4989 return v; 4990#undef STORECHAR 4991} 4992 4993PyObject * 4994PyUnicode_AsUTF32String(PyObject *unicode) 4995{ 4996 if (!PyUnicode_Check(unicode)) { 4997 
PyErr_BadArgument(); 4998 return NULL; 4999 } 5000 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode), 5001 PyUnicode_GET_SIZE(unicode), 5002 NULL, 5003 0); 5004} 5005 5006/* --- UTF-16 Codec ------------------------------------------------------- */ 5007 5008PyObject * 5009PyUnicode_DecodeUTF16(const char *s, 5010 Py_ssize_t size, 5011 const char *errors, 5012 int *byteorder) 5013{ 5014 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL); 5015} 5016 5017/* Two masks for fast checking of whether a C 'long' may contain 5018 UTF16-encoded surrogate characters. This is an efficient heuristic, 5019 assuming that non-surrogate characters with a code point >= 0x8000 are 5020 rare in most input. 5021 FAST_CHAR_MASK is used when the input is in native byte ordering, 5022 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering. 5023*/ 5024#if (SIZEOF_LONG == 8) 5025# define FAST_CHAR_MASK 0x8000800080008000L 5026# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L 5027#elif (SIZEOF_LONG == 4) 5028# define FAST_CHAR_MASK 0x80008000L 5029# define SWAPPED_FAST_CHAR_MASK 0x00800080L 5030#else 5031# error C 'long' size should be either 4 or 8! 5032#endif 5033 5034PyObject * 5035PyUnicode_DecodeUTF16Stateful(const char *s, 5036 Py_ssize_t size, 5037 const char *errors, 5038 int *byteorder, 5039 Py_ssize_t *consumed) 5040{ 5041 const char *starts = s; 5042 Py_ssize_t startinpos; 5043 Py_ssize_t endinpos; 5044 Py_ssize_t outpos; 5045 PyUnicodeObject *unicode; 5046 Py_UNICODE *p; 5047 const unsigned char *q, *e, *aligned_end; 5048 int bo = 0; /* assume native ordering by default */ 5049 int native_ordering = 0; 5050 const char *errmsg = ""; 5051 /* Offsets from q for retrieving byte pairs in the right order. 
*/ 5052#ifdef BYTEORDER_IS_LITTLE_ENDIAN 5053 int ihi = 1, ilo = 0; 5054#else 5055 int ihi = 0, ilo = 1; 5056#endif 5057 PyObject *errorHandler = NULL; 5058 PyObject *exc = NULL; 5059 5060 /* Note: size will always be longer than the resulting Unicode 5061 character count */ 5062 unicode = _PyUnicode_New(size); 5063 if (!unicode) 5064 return NULL; 5065 if (size == 0) 5066 return (PyObject *)unicode; 5067 5068 /* Unpack UTF-16 encoded data */ 5069 p = PyUnicode_AS_UNICODE(unicode); 5070 q = (unsigned char *)s; 5071 e = q + size - 1; 5072 5073 if (byteorder) 5074 bo = *byteorder; 5075 5076 /* Check for BOM marks (U+FEFF) in the input and adjust current 5077 byte order setting accordingly. In native mode, the leading BOM 5078 mark is skipped, in all other modes, it is copied to the output 5079 stream as-is (giving a ZWNBSP character). */ 5080 if (bo == 0) { 5081 if (size >= 2) { 5082 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo]; 5083#ifdef BYTEORDER_IS_LITTLE_ENDIAN 5084 if (bom == 0xFEFF) { 5085 q += 2; 5086 bo = -1; 5087 } 5088 else if (bom == 0xFFFE) { 5089 q += 2; 5090 bo = 1; 5091 } 5092#else 5093 if (bom == 0xFEFF) { 5094 q += 2; 5095 bo = 1; 5096 } 5097 else if (bom == 0xFFFE) { 5098 q += 2; 5099 bo = -1; 5100 } 5101#endif 5102 } 5103 } 5104 5105 if (bo == -1) { 5106 /* force LE */ 5107 ihi = 1; 5108 ilo = 0; 5109 } 5110 else if (bo == 1) { 5111 /* force BE */ 5112 ihi = 0; 5113 ilo = 1; 5114 } 5115#ifdef BYTEORDER_IS_LITTLE_ENDIAN 5116 native_ordering = ilo < ihi; 5117#else 5118 native_ordering = ilo > ihi; 5119#endif 5120 5121 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK); 5122 while (q < e) { 5123 Py_UNICODE ch; 5124 /* First check for possible aligned read of a C 'long'. Unaligned 5125 reads are more expensive, better to defer to another iteration. */ 5126 if (!((size_t) q & LONG_PTR_MASK)) { 5127 /* Fast path for runs of non-surrogate chars. 
*/ 5128 register const unsigned char *_q = q; 5129 Py_UNICODE *_p = p; 5130 if (native_ordering) { 5131 /* Native ordering is simple: as long as the input cannot 5132 possibly contain a surrogate char, do an unrolled copy 5133 of several 16-bit code points to the target object. 5134 The non-surrogate check is done on several input bytes 5135 at a time (as many as a C 'long' can contain). */ 5136 while (_q < aligned_end) { 5137 unsigned long data = * (unsigned long *) _q; 5138 if (data & FAST_CHAR_MASK) 5139 break; 5140 _p[0] = ((unsigned short *) _q)[0]; 5141 _p[1] = ((unsigned short *) _q)[1]; 5142#if (SIZEOF_LONG == 8) 5143 _p[2] = ((unsigned short *) _q)[2]; 5144 _p[3] = ((unsigned short *) _q)[3]; 5145#endif 5146 _q += SIZEOF_LONG; 5147 _p += SIZEOF_LONG / 2; 5148 } 5149 } 5150 else { 5151 /* Byteswapped ordering is similar, but we must decompose 5152 the copy bytewise, and take care of zero'ing out the 5153 upper bytes if the target object is in 32-bit units 5154 (that is, in UCS-4 builds). */ 5155 while (_q < aligned_end) { 5156 unsigned long data = * (unsigned long *) _q; 5157 if (data & SWAPPED_FAST_CHAR_MASK) 5158 break; 5159 /* Zero upper bytes in UCS-4 builds */ 5160#if (Py_UNICODE_SIZE > 2) 5161 _p[0] = 0; 5162 _p[1] = 0; 5163#if (SIZEOF_LONG == 8) 5164 _p[2] = 0; 5165 _p[3] = 0; 5166#endif 5167#endif 5168 /* Issue #4916; UCS-4 builds on big endian machines must 5169 fill the two last bytes of each 4-byte unit. 
*/ 5170#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2) 5171# define OFF 2 5172#else 5173# define OFF 0 5174#endif 5175 ((unsigned char *) _p)[OFF + 1] = _q[0]; 5176 ((unsigned char *) _p)[OFF + 0] = _q[1]; 5177 ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2]; 5178 ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3]; 5179#if (SIZEOF_LONG == 8) 5180 ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4]; 5181 ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5]; 5182 ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6]; 5183 ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7]; 5184#endif 5185#undef OFF 5186 _q += SIZEOF_LONG; 5187 _p += SIZEOF_LONG / 2; 5188 } 5189 } 5190 p = _p; 5191 q = _q; 5192 if (q >= e) 5193 break; 5194 } 5195 ch = (q[ihi] << 8) | q[ilo]; 5196 5197 q += 2; 5198 5199 if (ch < 0xD800 || ch > 0xDFFF) { 5200 *p++ = ch; 5201 continue; 5202 } 5203 5204 /* UTF-16 code pair: */ 5205 if (q > e) { 5206 errmsg = "unexpected end of data"; 5207 startinpos = (((const char *)q) - 2) - starts; 5208 endinpos = ((const char *)e) + 1 - starts; 5209 goto utf16Error; 5210 } 5211 if (0xD800 <= ch && ch <= 0xDBFF) { 5212 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo]; 5213 q += 2; 5214 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) { 5215#ifndef Py_UNICODE_WIDE 5216 *p++ = ch; 5217 *p++ = ch2; 5218#else 5219 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000; 5220#endif 5221 continue; 5222 } 5223 else { 5224 errmsg = "illegal UTF-16 surrogate"; 5225 startinpos = (((const char *)q)-4)-starts; 5226 endinpos = startinpos+2; 5227 goto utf16Error; 5228 } 5229 5230 } 5231 errmsg = "illegal encoding"; 5232 startinpos = (((const char *)q)-2)-starts; 5233 endinpos = startinpos+2; 5234 /* Fall through to report the error */ 5235 5236 utf16Error: 5237 outpos = p - PyUnicode_AS_UNICODE(unicode); 5238 if (unicode_decode_call_errorhandler( 5239 errors, 5240 &errorHandler, 5241 "utf16", errmsg, 5242 &starts, 5243 (const char 
**)&e, 5244 &startinpos, 5245 &endinpos, 5246 &exc, 5247 (const char **)&q, 5248 &unicode, 5249 &outpos, 5250 &p)) 5251 goto onError; 5252 } 5253 /* remaining byte at the end? (size should be even) */ 5254 if (e == q) { 5255 if (!consumed) { 5256 errmsg = "truncated data"; 5257 startinpos = ((const char *)q) - starts; 5258 endinpos = ((const char *)e) + 1 - starts; 5259 outpos = p - PyUnicode_AS_UNICODE(unicode); 5260 if (unicode_decode_call_errorhandler( 5261 errors, 5262 &errorHandler, 5263 "utf16", errmsg, 5264 &starts, 5265 (const char **)&e, 5266 &startinpos, 5267 &endinpos, 5268 &exc, 5269 (const char **)&q, 5270 &unicode, 5271 &outpos, 5272 &p)) 5273 goto onError; 5274 /* The remaining input chars are ignored if the callback 5275 chooses to skip the input */ 5276 } 5277 } 5278 5279 if (byteorder) 5280 *byteorder = bo; 5281 5282 if (consumed) 5283 *consumed = (const char *)q-starts; 5284 5285 /* Adjust length */ 5286 if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0) 5287 goto onError; 5288 5289 Py_XDECREF(errorHandler); 5290 Py_XDECREF(exc); 5291#ifndef DONT_MAKE_RESULT_READY 5292 if (_PyUnicode_READY_REPLACE(&unicode)) { 5293 Py_DECREF(unicode); 5294 return NULL; 5295 } 5296#endif 5297 assert(_PyUnicode_CheckConsistency(unicode, 1)); 5298 return (PyObject *)unicode; 5299 5300 onError: 5301 Py_DECREF(unicode); 5302 Py_XDECREF(errorHandler); 5303 Py_XDECREF(exc); 5304 return NULL; 5305} 5306 5307#undef FAST_CHAR_MASK 5308#undef SWAPPED_FAST_CHAR_MASK 5309 5310PyObject * 5311PyUnicode_EncodeUTF16(const Py_UNICODE *s, 5312 Py_ssize_t size, 5313 const char *errors, 5314 int byteorder) 5315{ 5316 PyObject *v; 5317 unsigned char *p; 5318 Py_ssize_t nsize, bytesize; 5319#ifdef Py_UNICODE_WIDE 5320 Py_ssize_t i, pairs; 5321#else 5322 const int pairs = 0; 5323#endif 5324 /* Offsets from p for storing byte pairs in the right order. 
*/ 5325#ifdef BYTEORDER_IS_LITTLE_ENDIAN 5326 int ihi = 1, ilo = 0; 5327#else 5328 int ihi = 0, ilo = 1; 5329#endif 5330 5331#define STORECHAR(CH) \ 5332 do { \ 5333 p[ihi] = ((CH) >> 8) & 0xff; \ 5334 p[ilo] = (CH) & 0xff; \ 5335 p += 2; \ 5336 } while(0) 5337 5338#ifdef Py_UNICODE_WIDE 5339 for (i = pairs = 0; i < size; i++) 5340 if (s[i] >= 0x10000) 5341 pairs++; 5342#endif 5343 /* 2 * (size + pairs + (byteorder == 0)) */ 5344 if (size > PY_SSIZE_T_MAX || 5345 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0)) 5346 return PyErr_NoMemory(); 5347 nsize = size + pairs + (byteorder == 0); 5348 bytesize = nsize * 2; 5349 if (bytesize / 2 != nsize) 5350 return PyErr_NoMemory(); 5351 v = PyBytes_FromStringAndSize(NULL, bytesize); 5352 if (v == NULL) 5353 return NULL; 5354 5355 p = (unsigned char *)PyBytes_AS_STRING(v); 5356 if (byteorder == 0) 5357 STORECHAR(0xFEFF); 5358 if (size == 0) 5359 goto done; 5360 5361 if (byteorder == -1) { 5362 /* force LE */ 5363 ihi = 1; 5364 ilo = 0; 5365 } 5366 else if (byteorder == 1) { 5367 /* force BE */ 5368 ihi = 0; 5369 ilo = 1; 5370 } 5371 5372 while (size-- > 0) { 5373 Py_UNICODE ch = *s++; 5374 Py_UNICODE ch2 = 0; 5375#ifdef Py_UNICODE_WIDE 5376 if (ch >= 0x10000) { 5377 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF); 5378 ch = 0xD800 | ((ch-0x10000) >> 10); 5379 } 5380#endif 5381 STORECHAR(ch); 5382 if (ch2) 5383 STORECHAR(ch2); 5384 } 5385 5386 done: 5387 return v; 5388#undef STORECHAR 5389} 5390 5391PyObject * 5392PyUnicode_AsUTF16String(PyObject *unicode) 5393{ 5394 if (!PyUnicode_Check(unicode)) { 5395 PyErr_BadArgument(); 5396 return NULL; 5397 } 5398 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode), 5399 PyUnicode_GET_SIZE(unicode), 5400 NULL, 5401 0); 5402} 5403 5404/* --- Unicode Escape Codec ----------------------------------------------- */ 5405 5406/* Helper function for PyUnicode_DecodeUnicodeEscape, determines 5407 if all the escapes in the string make it still a valid ASCII string. 
5408 Returns -1 if any escapes were found which cause the string to 5409 pop out of ASCII range. Otherwise returns the length of the 5410 required buffer to hold the string. 5411 */ 5412Py_ssize_t 5413length_of_escaped_ascii_string(const char *s, Py_ssize_t size) 5414{ 5415 const unsigned char *p = (const unsigned char *)s; 5416 const unsigned char *end = p + size; 5417 Py_ssize_t length = 0; 5418 5419 if (size < 0) 5420 return -1; 5421 5422 for (; p < end; ++p) { 5423 if (*p > 127) { 5424 /* Non-ASCII */ 5425 return -1; 5426 } 5427 else if (*p != '\\') { 5428 /* Normal character */ 5429 ++length; 5430 } 5431 else { 5432 /* Backslash-escape, check next char */ 5433 ++p; 5434 /* Escape sequence reaches till end of string or 5435 non-ASCII follow-up. */ 5436 if (p >= end || *p > 127) 5437 return -1; 5438 switch (*p) { 5439 case '\n': 5440 /* backslash + \n result in zero characters */ 5441 break; 5442 case '\\': case '\'': case '\"': 5443 case 'b': case 'f': case 't': 5444 case 'n': case 'r': case 'v': case 'a': 5445 ++length; 5446 break; 5447 case '0': case '1': case '2': case '3': 5448 case '4': case '5': case '6': case '7': 5449 case 'x': case 'u': case 'U': case 'N': 5450 /* these do not guarantee ASCII characters */ 5451 return -1; 5452 default: 5453 /* count the backslash + the other character */ 5454 length += 2; 5455 } 5456 } 5457 } 5458 return length; 5459} 5460 5461/* Similar to PyUnicode_WRITE but either write into wstr field 5462 or treat string as ASCII. 
*/ 5463#define WRITE_ASCII_OR_WSTR(kind, buf, index, value) \ 5464 do { \ 5465 if ((kind) != PyUnicode_WCHAR_KIND) \ 5466 ((unsigned char *)(buf))[(index)] = (unsigned char)(value); \ 5467 else \ 5468 ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value); \ 5469 } while (0) 5470 5471#define WRITE_WSTR(buf, index, value) \ 5472 assert(kind == PyUnicode_WCHAR_KIND), \ 5473 ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value) 5474 5475 5476static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL; 5477 5478PyObject * 5479PyUnicode_DecodeUnicodeEscape(const char *s, 5480 Py_ssize_t size, 5481 const char *errors) 5482{ 5483 const char *starts = s; 5484 Py_ssize_t startinpos; 5485 Py_ssize_t endinpos; 5486 int j; 5487 PyUnicodeObject *v; 5488 Py_UNICODE *p; 5489 const char *end; 5490 char* message; 5491 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */ 5492 PyObject *errorHandler = NULL; 5493 PyObject *exc = NULL; 5494 Py_ssize_t ascii_length; 5495 Py_ssize_t i; 5496 int kind; 5497 void *data; 5498 5499 ascii_length = length_of_escaped_ascii_string(s, size); 5500 5501 /* After length_of_escaped_ascii_string() there are two alternatives, 5502 either the string is pure ASCII with named escapes like \n, etc. 5503 and we determined it's exact size (common case) 5504 or it contains \x, \u, ... escape sequences. then we create a 5505 legacy wchar string and resize it at the end of this function. */ 5506 if (ascii_length >= 0) { 5507 v = (PyUnicodeObject *)PyUnicode_New(ascii_length, 127); 5508 if (!v) 5509 goto onError; 5510 assert(PyUnicode_KIND(v) == PyUnicode_1BYTE_KIND); 5511 kind = PyUnicode_1BYTE_KIND; 5512 data = PyUnicode_DATA(v); 5513 } 5514 else { 5515 /* Escaped strings will always be longer than the resulting 5516 Unicode string, so we start with size here and then reduce the 5517 length after conversion to the true value. 
5518 (but if the error callback returns a long replacement string 5519 we'll have to allocate more space) */ 5520 v = _PyUnicode_New(size); 5521 if (!v) 5522 goto onError; 5523 kind = PyUnicode_WCHAR_KIND; 5524 data = PyUnicode_AS_UNICODE(v); 5525 } 5526 5527 if (size == 0) 5528 return (PyObject *)v; 5529 i = 0; 5530 end = s + size; 5531 5532 while (s < end) { 5533 unsigned char c; 5534 Py_UNICODE x; 5535 int digits; 5536 5537 if (kind == PyUnicode_WCHAR_KIND) { 5538 assert(i < _PyUnicode_WSTR_LENGTH(v)); 5539 } 5540 else { 5541 /* The only case in which i == ascii_length is a backslash 5542 followed by a newline. */ 5543 assert(i <= ascii_length); 5544 } 5545 5546 /* Non-escape characters are interpreted as Unicode ordinals */ 5547 if (*s != '\\') { 5548 WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char) *s++); 5549 continue; 5550 } 5551 5552 startinpos = s-starts; 5553 /* \ - Escapes */ 5554 s++; 5555 c = *s++; 5556 if (s > end) 5557 c = '\0'; /* Invalid after \ */ 5558 5559 if (kind == PyUnicode_WCHAR_KIND) { 5560 assert(i < _PyUnicode_WSTR_LENGTH(v)); 5561 } 5562 else { 5563 /* The only case in which i == ascii_length is a backslash 5564 followed by a newline. 
*/ 5565 assert(i < ascii_length || (i == ascii_length && c == '\n')); 5566 } 5567 5568 switch (c) { 5569 5570 /* \x escapes */ 5571 case '\n': break; 5572 case '\\': WRITE_ASCII_OR_WSTR(kind, data, i++, '\\'); break; 5573 case '\'': WRITE_ASCII_OR_WSTR(kind, data, i++, '\''); break; 5574 case '\"': WRITE_ASCII_OR_WSTR(kind, data, i++, '\"'); break; 5575 case 'b': WRITE_ASCII_OR_WSTR(kind, data, i++, '\b'); break; 5576 /* FF */ 5577 case 'f': WRITE_ASCII_OR_WSTR(kind, data, i++, '\014'); break; 5578 case 't': WRITE_ASCII_OR_WSTR(kind, data, i++, '\t'); break; 5579 case 'n': WRITE_ASCII_OR_WSTR(kind, data, i++, '\n'); break; 5580 case 'r': WRITE_ASCII_OR_WSTR(kind, data, i++, '\r'); break; 5581 /* VT */ 5582 case 'v': WRITE_ASCII_OR_WSTR(kind, data, i++, '\013'); break; 5583 /* BEL, not classic C */ 5584 case 'a': WRITE_ASCII_OR_WSTR(kind, data, i++, '\007'); break; 5585 5586 /* \OOO (octal) escapes */ 5587 case '0': case '1': case '2': case '3': 5588 case '4': case '5': case '6': case '7': 5589 x = s[-1] - '0'; 5590 if (s < end && '0' <= *s && *s <= '7') { 5591 x = (x<<3) + *s++ - '0'; 5592 if (s < end && '0' <= *s && *s <= '7') 5593 x = (x<<3) + *s++ - '0'; 5594 } 5595 WRITE_WSTR(data, i++, x); 5596 break; 5597 5598 /* hex escapes */ 5599 /* \xXX */ 5600 case 'x': 5601 digits = 2; 5602 message = "truncated \\xXX escape"; 5603 goto hexescape; 5604 5605 /* \uXXXX */ 5606 case 'u': 5607 digits = 4; 5608 message = "truncated \\uXXXX escape"; 5609 goto hexescape; 5610 5611 /* \UXXXXXXXX */ 5612 case 'U': 5613 digits = 8; 5614 message = "truncated \\UXXXXXXXX escape"; 5615 hexescape: 5616 chr = 0; 5617 p = PyUnicode_AS_UNICODE(v) + i; 5618 if (s+digits>end) { 5619 endinpos = size; 5620 if (unicode_decode_call_errorhandler( 5621 errors, &errorHandler, 5622 "unicodeescape", "end of string in escape sequence", 5623 &starts, &end, &startinpos, &endinpos, &exc, &s, 5624 &v, &i, &p)) 5625 goto onError; 5626 data = PyUnicode_AS_UNICODE(v); 5627 goto nextByte; 5628 } 5629 for (j 
= 0; j < digits; ++j) { 5630 c = (unsigned char) s[j]; 5631 if (!Py_ISXDIGIT(c)) { 5632 endinpos = (s+j+1)-starts; 5633 p = PyUnicode_AS_UNICODE(v) + i; 5634 if (unicode_decode_call_errorhandler( 5635 errors, &errorHandler, 5636 "unicodeescape", message, 5637 &starts, &end, &startinpos, &endinpos, &exc, &s, 5638 &v, &i, &p)) 5639 goto onError; 5640 data = PyUnicode_AS_UNICODE(v); 5641 goto nextByte; 5642 } 5643 chr = (chr<<4) & ~0xF; 5644 if (c >= '0' && c <= '9') 5645 chr += c - '0'; 5646 else if (c >= 'a' && c <= 'f') 5647 chr += 10 + c - 'a'; 5648 else 5649 chr += 10 + c - 'A'; 5650 } 5651 s += j; 5652 if (chr == 0xffffffff && PyErr_Occurred()) 5653 /* _decoding_error will have already written into the 5654 target buffer. */ 5655 break; 5656 store: 5657 /* when we get here, chr is a 32-bit unicode character */ 5658 if (chr <= 0xffff) 5659 /* UCS-2 character */ 5660 WRITE_WSTR(data, i++, chr); 5661 else if (chr <= 0x10ffff) { 5662 /* UCS-4 character. Either store directly, or as 5663 surrogate pair. 
*/ 5664#ifdef Py_UNICODE_WIDE 5665 WRITE_WSTR(data, i++, chr); 5666#else 5667 chr -= 0x10000L; 5668 WRITE_WSTR(data, i++, 0xD800 + (Py_UNICODE) (chr >> 10)); 5669 WRITE_WSTR(data, i++, 0xDC00 + (Py_UNICODE) (chr & 0x03FF)); 5670#endif 5671 } else { 5672 endinpos = s-starts; 5673 p = PyUnicode_AS_UNICODE(v) + i; 5674 if (unicode_decode_call_errorhandler( 5675 errors, &errorHandler, 5676 "unicodeescape", "illegal Unicode character", 5677 &starts, &end, &startinpos, &endinpos, &exc, &s, 5678 &v, &i, &p)) 5679 goto onError; 5680 data = PyUnicode_AS_UNICODE(v); 5681 } 5682 break; 5683 5684 /* \N{name} */ 5685 case 'N': 5686 message = "malformed \\N character escape"; 5687 if (ucnhash_CAPI == NULL) { 5688 /* load the unicode data module */ 5689 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import( 5690 PyUnicodeData_CAPSULE_NAME, 1); 5691 if (ucnhash_CAPI == NULL) 5692 goto ucnhashError; 5693 } 5694 if (*s == '{') { 5695 const char *start = s+1; 5696 /* look for the closing brace */ 5697 while (*s != '}' && s < end) 5698 s++; 5699 if (s > start && s < end && *s == '}') { 5700 /* found a name. 
look it up in the unicode database */ 5701 message = "unknown Unicode character name"; 5702 s++; 5703 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), 5704 &chr)) 5705 goto store; 5706 } 5707 } 5708 endinpos = s-starts; 5709 p = PyUnicode_AS_UNICODE(v) + i; 5710 if (unicode_decode_call_errorhandler( 5711 errors, &errorHandler, 5712 "unicodeescape", message, 5713 &starts, &end, &startinpos, &endinpos, &exc, &s, 5714 &v, &i, &p)) 5715 goto onError; 5716 data = PyUnicode_AS_UNICODE(v); 5717 break; 5718 5719 default: 5720 if (s > end) { 5721 assert(kind == PyUnicode_WCHAR_KIND); 5722 message = "\\ at end of string"; 5723 s--; 5724 endinpos = s-starts; 5725 p = PyUnicode_AS_UNICODE(v) + i; 5726 if (unicode_decode_call_errorhandler( 5727 errors, &errorHandler, 5728 "unicodeescape", message, 5729 &starts, &end, &startinpos, &endinpos, &exc, &s, 5730 &v, &i, &p)) 5731 goto onError; 5732 data = PyUnicode_AS_UNICODE(v); 5733 } 5734 else { 5735 WRITE_ASCII_OR_WSTR(kind, data, i++, '\\'); 5736 WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char)s[-1]); 5737 } 5738 break; 5739 } 5740 nextByte: 5741 ; 5742 } 5743 /* Ensure the length prediction worked in case of ASCII strings */ 5744 assert(kind == PyUnicode_WCHAR_KIND || i == ascii_length); 5745 5746 if (kind == PyUnicode_WCHAR_KIND) 5747 { 5748 if (PyUnicode_Resize((PyObject**)&v, i) < 0) 5749 goto onError; 5750 } 5751 Py_XDECREF(errorHandler); 5752 Py_XDECREF(exc); 5753#ifndef DONT_MAKE_RESULT_READY 5754 if (_PyUnicode_READY_REPLACE(&v)) { 5755 Py_DECREF(v); 5756 return NULL; 5757 } 5758#endif 5759 assert(_PyUnicode_CheckConsistency(v, 1)); 5760 return (PyObject *)v; 5761 5762 ucnhashError: 5763 PyErr_SetString( 5764 PyExc_UnicodeError, 5765 "\\N escapes not supported (can't load unicodedata module)" 5766 ); 5767 Py_XDECREF(v); 5768 Py_XDECREF(errorHandler); 5769 Py_XDECREF(exc); 5770 return NULL; 5771 5772 onError: 5773 Py_XDECREF(v); 5774 Py_XDECREF(errorHandler); 5775 Py_XDECREF(exc); 5776 return NULL; 5777} 5778 
5779#undef WRITE_ASCII_OR_WSTR 5780#undef WRITE_WSTR 5781 5782/* Return a Unicode-Escape string version of the Unicode object. 5783 5784 If quotes is true, the string is enclosed in u"" or u'' quotes as 5785 appropriate. 5786 5787*/ 5788 5789static const char *hexdigits = "0123456789abcdef"; 5790 5791PyObject * 5792PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s, 5793 Py_ssize_t size) 5794{ 5795 PyObject *repr; 5796 char *p; 5797 5798#ifdef Py_UNICODE_WIDE 5799 const Py_ssize_t expandsize = 10; 5800#else 5801 const Py_ssize_t expandsize = 6; 5802#endif 5803 5804 /* XXX(nnorwitz): rather than over-allocating, it would be 5805 better to choose a different scheme. Perhaps scan the 5806 first N-chars of the string and allocate based on that size. 5807 */ 5808 /* Initial allocation is based on the longest-possible unichr 5809 escape. 5810 5811 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source 5812 unichr, so in this case it's the longest unichr escape. In 5813 narrow (UTF-16) builds this is five chars per source unichr 5814 since there are two unichrs in the surrogate pair, so in narrow 5815 (UTF-16) builds it's not the longest unichr escape. 5816 5817 In wide or narrow builds '\uxxxx' is 6 chars per source unichr, 5818 so in the narrow (UTF-16) build case it's the longest unichr 5819 escape. 
5820 */ 5821 5822 if (size == 0) 5823 return PyBytes_FromStringAndSize(NULL, 0); 5824 5825 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize) 5826 return PyErr_NoMemory(); 5827 5828 repr = PyBytes_FromStringAndSize(NULL, 5829 2 5830 + expandsize*size 5831 + 1); 5832 if (repr == NULL) 5833 return NULL; 5834 5835 p = PyBytes_AS_STRING(repr); 5836 5837 while (size-- > 0) { 5838 Py_UNICODE ch = *s++; 5839 5840 /* Escape backslashes */ 5841 if (ch == '\\') { 5842 *p++ = '\\'; 5843 *p++ = (char) ch; 5844 continue; 5845 } 5846 5847#ifdef Py_UNICODE_WIDE 5848 /* Map 21-bit characters to '\U00xxxxxx' */ 5849 else if (ch >= 0x10000) { 5850 *p++ = '\\'; 5851 *p++ = 'U'; 5852 *p++ = hexdigits[(ch >> 28) & 0x0000000F]; 5853 *p++ = hexdigits[(ch >> 24) & 0x0000000F]; 5854 *p++ = hexdigits[(ch >> 20) & 0x0000000F]; 5855 *p++ = hexdigits[(ch >> 16) & 0x0000000F]; 5856 *p++ = hexdigits[(ch >> 12) & 0x0000000F]; 5857 *p++ = hexdigits[(ch >> 8) & 0x0000000F]; 5858 *p++ = hexdigits[(ch >> 4) & 0x0000000F]; 5859 *p++ = hexdigits[ch & 0x0000000F]; 5860 continue; 5861 } 5862#else 5863 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */ 5864 else if (ch >= 0xD800 && ch < 0xDC00) { 5865 Py_UNICODE ch2; 5866 Py_UCS4 ucs; 5867 5868 ch2 = *s++; 5869 size--; 5870 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) { 5871 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000; 5872 *p++ = '\\'; 5873 *p++ = 'U'; 5874 *p++ = hexdigits[(ucs >> 28) & 0x0000000F]; 5875 *p++ = hexdigits[(ucs >> 24) & 0x0000000F]; 5876 *p++ = hexdigits[(ucs >> 20) & 0x0000000F]; 5877 *p++ = hexdigits[(ucs >> 16) & 0x0000000F]; 5878 *p++ = hexdigits[(ucs >> 12) & 0x0000000F]; 5879 *p++ = hexdigits[(ucs >> 8) & 0x0000000F]; 5880 *p++ = hexdigits[(ucs >> 4) & 0x0000000F]; 5881 *p++ = hexdigits[ucs & 0x0000000F]; 5882 continue; 5883 } 5884 /* Fall through: isolated surrogates are copied as-is */ 5885 s--; 5886 size++; 5887 } 5888#endif 5889 5890 /* Map 16-bit characters to '\uxxxx' */ 5891 if (ch >= 256) { 5892 *p++ = '\\'; 5893 *p++ = 
'u'; 5894 *p++ = hexdigits[(ch >> 12) & 0x000F]; 5895 *p++ = hexdigits[(ch >> 8) & 0x000F]; 5896 *p++ = hexdigits[(ch >> 4) & 0x000F]; 5897 *p++ = hexdigits[ch & 0x000F]; 5898 } 5899 5900 /* Map special whitespace to '\t', \n', '\r' */ 5901 else if (ch == '\t') { 5902 *p++ = '\\'; 5903 *p++ = 't'; 5904 } 5905 else if (ch == '\n') { 5906 *p++ = '\\'; 5907 *p++ = 'n'; 5908 } 5909 else if (ch == '\r') { 5910 *p++ = '\\'; 5911 *p++ = 'r'; 5912 } 5913 5914 /* Map non-printable US ASCII to '\xhh' */ 5915 else if (ch < ' ' || ch >= 0x7F) { 5916 *p++ = '\\'; 5917 *p++ = 'x'; 5918 *p++ = hexdigits[(ch >> 4) & 0x000F]; 5919 *p++ = hexdigits[ch & 0x000F]; 5920 } 5921 5922 /* Copy everything else as-is */ 5923 else 5924 *p++ = (char) ch; 5925 } 5926 5927 assert(p - PyBytes_AS_STRING(repr) > 0); 5928 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) 5929 return NULL; 5930 return repr; 5931} 5932 5933PyObject * 5934PyUnicode_AsUnicodeEscapeString(PyObject *unicode) 5935{ 5936 PyObject *s; 5937 if (!PyUnicode_Check(unicode)) { 5938 PyErr_BadArgument(); 5939 return NULL; 5940 } 5941 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode), 5942 PyUnicode_GET_SIZE(unicode)); 5943 return s; 5944} 5945 5946/* --- Raw Unicode Escape Codec ------------------------------------------- */ 5947 5948PyObject * 5949PyUnicode_DecodeRawUnicodeEscape(const char *s, 5950 Py_ssize_t size, 5951 const char *errors) 5952{ 5953 const char *starts = s; 5954 Py_ssize_t startinpos; 5955 Py_ssize_t endinpos; 5956 Py_ssize_t outpos; 5957 PyUnicodeObject *v; 5958 Py_UNICODE *p; 5959 const char *end; 5960 const char *bs; 5961 PyObject *errorHandler = NULL; 5962 PyObject *exc = NULL; 5963 5964 /* Escaped strings will always be longer than the resulting 5965 Unicode string, so we start with size here and then reduce the 5966 length after conversion to the true value. 
(But decoding error 5967 handler might have to resize the string) */ 5968 v = _PyUnicode_New(size); 5969 if (v == NULL) 5970 goto onError; 5971 if (size == 0) 5972 return (PyObject *)v; 5973 p = PyUnicode_AS_UNICODE(v); 5974 end = s + size; 5975 while (s < end) { 5976 unsigned char c; 5977 Py_UCS4 x; 5978 int i; 5979 int count; 5980 5981 /* Non-escape characters are interpreted as Unicode ordinals */ 5982 if (*s != '\\') { 5983 *p++ = (unsigned char)*s++; 5984 continue; 5985 } 5986 startinpos = s-starts; 5987 5988 /* \u-escapes are only interpreted iff the number of leading 5989 backslashes if odd */ 5990 bs = s; 5991 for (;s < end;) { 5992 if (*s != '\\') 5993 break; 5994 *p++ = (unsigned char)*s++; 5995 } 5996 if (((s - bs) & 1) == 0 || 5997 s >= end || 5998 (*s != 'u' && *s != 'U')) { 5999 continue; 6000 } 6001 p--; 6002 count = *s=='u' ? 4 : 8; 6003 s++; 6004 6005 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */ 6006 outpos = p-PyUnicode_AS_UNICODE(v); 6007 for (x = 0, i = 0; i < count; ++i, ++s) { 6008 c = (unsigned char)*s; 6009 if (!Py_ISXDIGIT(c)) { 6010 endinpos = s-starts; 6011 if (unicode_decode_call_errorhandler( 6012 errors, &errorHandler, 6013 "rawunicodeescape", "truncated \\uXXXX", 6014 &starts, &end, &startinpos, &endinpos, &exc, &s, 6015 &v, &outpos, &p)) 6016 goto onError; 6017 goto nextByte; 6018 } 6019 x = (x<<4) & ~0xF; 6020 if (c >= '0' && c <= '9') 6021 x += c - '0'; 6022 else if (c >= 'a' && c <= 'f') 6023 x += 10 + c - 'a'; 6024 else 6025 x += 10 + c - 'A'; 6026 } 6027 if (x <= 0xffff) 6028 /* UCS-2 character */ 6029 *p++ = (Py_UNICODE) x; 6030 else if (x <= 0x10ffff) { 6031 /* UCS-4 character. Either store directly, or as 6032 surrogate pair. 
*/ 6033#ifdef Py_UNICODE_WIDE 6034 *p++ = (Py_UNICODE) x; 6035#else 6036 x -= 0x10000L; 6037 *p++ = 0xD800 + (Py_UNICODE) (x >> 10); 6038 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF); 6039#endif 6040 } else { 6041 endinpos = s-starts; 6042 outpos = p-PyUnicode_AS_UNICODE(v); 6043 if (unicode_decode_call_errorhandler( 6044 errors, &errorHandler, 6045 "rawunicodeescape", "\\Uxxxxxxxx out of range", 6046 &starts, &end, &startinpos, &endinpos, &exc, &s, 6047 &v, &outpos, &p)) 6048 goto onError; 6049 } 6050 nextByte: 6051 ; 6052 } 6053 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0) 6054 goto onError; 6055 Py_XDECREF(errorHandler); 6056 Py_XDECREF(exc); 6057#ifndef DONT_MAKE_RESULT_READY 6058 if (_PyUnicode_READY_REPLACE(&v)) { 6059 Py_DECREF(v); 6060 return NULL; 6061 } 6062#endif 6063 assert(_PyUnicode_CheckConsistency(v, 1)); 6064 return (PyObject *)v; 6065 6066 onError: 6067 Py_XDECREF(v); 6068 Py_XDECREF(errorHandler); 6069 Py_XDECREF(exc); 6070 return NULL; 6071} 6072 6073PyObject * 6074PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s, 6075 Py_ssize_t size) 6076{ 6077 PyObject *repr; 6078 char *p; 6079 char *q; 6080 6081#ifdef Py_UNICODE_WIDE 6082 const Py_ssize_t expandsize = 10; 6083#else 6084 const Py_ssize_t expandsize = 6; 6085#endif 6086 6087 if (size > PY_SSIZE_T_MAX / expandsize) 6088 return PyErr_NoMemory(); 6089 6090 repr = PyBytes_FromStringAndSize(NULL, expandsize * size); 6091 if (repr == NULL) 6092 return NULL; 6093 if (size == 0) 6094 return repr; 6095 6096 p = q = PyBytes_AS_STRING(repr); 6097 while (size-- > 0) { 6098 Py_UNICODE ch = *s++; 6099#ifdef Py_UNICODE_WIDE 6100 /* Map 32-bit characters to '\Uxxxxxxxx' */ 6101 if (ch >= 0x10000) { 6102 *p++ = '\\'; 6103 *p++ = 'U'; 6104 *p++ = hexdigits[(ch >> 28) & 0xf]; 6105 *p++ = hexdigits[(ch >> 24) & 0xf]; 6106 *p++ = hexdigits[(ch >> 20) & 0xf]; 6107 *p++ = hexdigits[(ch >> 16) & 0xf]; 6108 *p++ = hexdigits[(ch >> 12) & 0xf]; 6109 *p++ = hexdigits[(ch >> 8) & 0xf]; 6110 *p++ 
= hexdigits[(ch >> 4) & 0xf]; 6111 *p++ = hexdigits[ch & 15]; 6112 } 6113 else 6114#else 6115 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */ 6116 if (ch >= 0xD800 && ch < 0xDC00) { 6117 Py_UNICODE ch2; 6118 Py_UCS4 ucs; 6119 6120 ch2 = *s++; 6121 size--; 6122 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) { 6123 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000; 6124 *p++ = '\\'; 6125 *p++ = 'U'; 6126 *p++ = hexdigits[(ucs >> 28) & 0xf]; 6127 *p++ = hexdigits[(ucs >> 24) & 0xf]; 6128 *p++ = hexdigits[(ucs >> 20) & 0xf]; 6129 *p++ = hexdigits[(ucs >> 16) & 0xf]; 6130 *p++ = hexdigits[(ucs >> 12) & 0xf]; 6131 *p++ = hexdigits[(ucs >> 8) & 0xf]; 6132 *p++ = hexdigits[(ucs >> 4) & 0xf]; 6133 *p++ = hexdigits[ucs & 0xf]; 6134 continue; 6135 } 6136 /* Fall through: isolated surrogates are copied as-is */ 6137 s--; 6138 size++; 6139 } 6140#endif 6141 /* Map 16-bit characters to '\uxxxx' */ 6142 if (ch >= 256) { 6143 *p++ = '\\'; 6144 *p++ = 'u'; 6145 *p++ = hexdigits[(ch >> 12) & 0xf]; 6146 *p++ = hexdigits[(ch >> 8) & 0xf]; 6147 *p++ = hexdigits[(ch >> 4) & 0xf]; 6148 *p++ = hexdigits[ch & 15]; 6149 } 6150 /* Copy everything else as-is */ 6151 else 6152 *p++ = (char) ch; 6153 } 6154 size = p - q; 6155 6156 assert(size > 0); 6157 if (_PyBytes_Resize(&repr, size) < 0) 6158 return NULL; 6159 return repr; 6160} 6161 6162PyObject * 6163PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode) 6164{ 6165 PyObject *s; 6166 if (!PyUnicode_Check(unicode)) { 6167 PyErr_BadArgument(); 6168 return NULL; 6169 } 6170 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode), 6171 PyUnicode_GET_SIZE(unicode)); 6172 6173 return s; 6174} 6175 6176/* --- Unicode Internal Codec ------------------------------------------- */ 6177 6178PyObject * 6179_PyUnicode_DecodeUnicodeInternal(const char *s, 6180 Py_ssize_t size, 6181 const char *errors) 6182{ 6183 const char *starts = s; 6184 Py_ssize_t startinpos; 6185 Py_ssize_t endinpos; 6186 Py_ssize_t outpos; 6187 PyUnicodeObject *v; 6188 
Py_UNICODE *p; 6189 const char *end; 6190 const char *reason; 6191 PyObject *errorHandler = NULL; 6192 PyObject *exc = NULL; 6193 6194#ifdef Py_UNICODE_WIDE 6195 Py_UNICODE unimax = PyUnicode_GetMax(); 6196#endif 6197 6198 /* XXX overflow detection missing */ 6199 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE); 6200 if (v == NULL) 6201 goto onError; 6202 /* Intentionally PyUnicode_GET_SIZE instead of PyUnicode_GET_LENGTH 6203 as string was created with the old API. */ 6204 if (PyUnicode_GET_SIZE(v) == 0) 6205 return (PyObject *)v; 6206 p = PyUnicode_AS_UNICODE(v); 6207 end = s + size; 6208 6209 while (s < end) { 6210 memcpy(p, s, sizeof(Py_UNICODE)); 6211 /* We have to sanity check the raw data, otherwise doom looms for 6212 some malformed UCS-4 data. */ 6213 if ( 6214#ifdef Py_UNICODE_WIDE 6215 *p > unimax || *p < 0 || 6216#endif 6217 end-s < Py_UNICODE_SIZE 6218 ) 6219 { 6220 startinpos = s - starts; 6221 if (end-s < Py_UNICODE_SIZE) { 6222 endinpos = end-starts; 6223 reason = "truncated input"; 6224 } 6225 else { 6226 endinpos = s - starts + Py_UNICODE_SIZE; 6227 reason = "illegal code point (> 0x10FFFF)"; 6228 } 6229 outpos = p - PyUnicode_AS_UNICODE(v); 6230 if (unicode_decode_call_errorhandler( 6231 errors, &errorHandler, 6232 "unicode_internal", reason, 6233 &starts, &end, &startinpos, &endinpos, &exc, &s, 6234 &v, &outpos, &p)) { 6235 goto onError; 6236 } 6237 } 6238 else { 6239 p++; 6240 s += Py_UNICODE_SIZE; 6241 } 6242 } 6243 6244 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0) 6245 goto onError; 6246 Py_XDECREF(errorHandler); 6247 Py_XDECREF(exc); 6248#ifndef DONT_MAKE_RESULT_READY 6249 if (_PyUnicode_READY_REPLACE(&v)) { 6250 Py_DECREF(v); 6251 return NULL; 6252 } 6253#endif 6254 assert(_PyUnicode_CheckConsistency(v, 1)); 6255 return (PyObject *)v; 6256 6257 onError: 6258 Py_XDECREF(v); 6259 Py_XDECREF(errorHandler); 6260 Py_XDECREF(exc); 6261 return NULL; 6262} 6263 6264/* --- Latin-1 Codec 
------------------------------------------------------ */ 6265 6266PyObject * 6267PyUnicode_DecodeLatin1(const char *s, 6268 Py_ssize_t size, 6269 const char *errors) 6270{ 6271 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */ 6272 return _PyUnicode_FromUCS1((unsigned char*)s, size); 6273} 6274 6275/* create or adjust a UnicodeEncodeError */ 6276static void 6277make_encode_exception(PyObject **exceptionObject, 6278 const char *encoding, 6279 const Py_UNICODE *unicode, Py_ssize_t size, 6280 Py_ssize_t startpos, Py_ssize_t endpos, 6281 const char *reason) 6282{ 6283 if (*exceptionObject == NULL) { 6284 *exceptionObject = PyUnicodeEncodeError_Create( 6285 encoding, unicode, size, startpos, endpos, reason); 6286 } 6287 else { 6288 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos)) 6289 goto onError; 6290 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos)) 6291 goto onError; 6292 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason)) 6293 goto onError; 6294 return; 6295 onError: 6296 Py_DECREF(*exceptionObject); 6297 *exceptionObject = NULL; 6298 } 6299} 6300 6301/* raises a UnicodeEncodeError */ 6302static void 6303raise_encode_exception(PyObject **exceptionObject, 6304 const char *encoding, 6305 const Py_UNICODE *unicode, Py_ssize_t size, 6306 Py_ssize_t startpos, Py_ssize_t endpos, 6307 const char *reason) 6308{ 6309 make_encode_exception(exceptionObject, 6310 encoding, unicode, size, startpos, endpos, reason); 6311 if (*exceptionObject != NULL) 6312 PyCodec_StrictErrors(*exceptionObject); 6313} 6314 6315/* error handling callback helper: 6316 build arguments, call the callback and check the arguments, 6317 put the result into newpos and return the replacement string, which 6318 has to be freed by the caller */ 6319static PyObject * 6320unicode_encode_call_errorhandler(const char *errors, 6321 PyObject **errorHandler, 6322 const char *encoding, const char *reason, 6323 const Py_UNICODE *unicode, Py_ssize_t size, PyObject 
**exceptionObject, 6324 Py_ssize_t startpos, Py_ssize_t endpos, 6325 Py_ssize_t *newpos) 6326{ 6327 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple"; 6328 6329 PyObject *restuple; 6330 PyObject *resunicode; 6331 6332 if (*errorHandler == NULL) { 6333 *errorHandler = PyCodec_LookupError(errors); 6334 if (*errorHandler == NULL) 6335 return NULL; 6336 } 6337 6338 make_encode_exception(exceptionObject, 6339 encoding, unicode, size, startpos, endpos, reason); 6340 if (*exceptionObject == NULL) 6341 return NULL; 6342 6343 restuple = PyObject_CallFunctionObjArgs( 6344 *errorHandler, *exceptionObject, NULL); 6345 if (restuple == NULL) 6346 return NULL; 6347 if (!PyTuple_Check(restuple)) { 6348 PyErr_SetString(PyExc_TypeError, &argparse[3]); 6349 Py_DECREF(restuple); 6350 return NULL; 6351 } 6352 if (!PyArg_ParseTuple(restuple, argparse, 6353 &resunicode, newpos)) { 6354 Py_DECREF(restuple); 6355 return NULL; 6356 } 6357 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) { 6358 PyErr_SetString(PyExc_TypeError, &argparse[3]); 6359 Py_DECREF(restuple); 6360 return NULL; 6361 } 6362 if (*newpos<0) 6363 *newpos = size+*newpos; 6364 if (*newpos<0 || *newpos>size) { 6365 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos); 6366 Py_DECREF(restuple); 6367 return NULL; 6368 } 6369 Py_INCREF(resunicode); 6370 Py_DECREF(restuple); 6371 return resunicode; 6372} 6373 6374static PyObject * 6375unicode_encode_ucs1(const Py_UNICODE *p, 6376 Py_ssize_t size, 6377 const char *errors, 6378 int limit) 6379{ 6380 /* output object */ 6381 PyObject *res; 6382 /* pointers to the beginning and end+1 of input */ 6383 const Py_UNICODE *startp = p; 6384 const Py_UNICODE *endp = p + size; 6385 /* pointer to the beginning of the unencodable characters */ 6386 /* const Py_UNICODE *badp = NULL; */ 6387 /* pointer into the output */ 6388 char *str; 6389 /* current output position */ 6390 Py_ssize_t ressize; 6391 const 
char *encoding = (limit == 256) ? "latin-1" : "ascii"; 6392 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)"; 6393 PyObject *errorHandler = NULL; 6394 PyObject *exc = NULL; 6395 /* the following variable is used for caching string comparisons 6396 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */ 6397 int known_errorHandler = -1; 6398 6399 /* allocate enough for a simple encoding without 6400 replacements, if we need more, we'll resize */ 6401 if (size == 0) 6402 return PyBytes_FromStringAndSize(NULL, 0); 6403 res = PyBytes_FromStringAndSize(NULL, size); 6404 if (res == NULL) 6405 return NULL; 6406 str = PyBytes_AS_STRING(res); 6407 ressize = size; 6408 6409 while (p<endp) { 6410 Py_UNICODE c = *p; 6411 6412 /* can we encode this? */ 6413 if (c<limit) { 6414 /* no overflow check, because we know that the space is enough */ 6415 *str++ = (char)c; 6416 ++p; 6417 } 6418 else { 6419 Py_ssize_t unicodepos = p-startp; 6420 Py_ssize_t requiredsize; 6421 PyObject *repunicode; 6422 Py_ssize_t repsize; 6423 Py_ssize_t newpos; 6424 Py_ssize_t respos; 6425 Py_UNICODE *uni2; 6426 /* startpos for collecting unencodable chars */ 6427 const Py_UNICODE *collstart = p; 6428 const Py_UNICODE *collend = p; 6429 /* find all unecodable characters */ 6430 while ((collend < endp) && ((*collend)>=limit)) 6431 ++collend; 6432 /* cache callback name lookup (if not done yet, i.e. 
it's the first error) */ 6433 if (known_errorHandler==-1) { 6434 if ((errors==NULL) || (!strcmp(errors, "strict"))) 6435 known_errorHandler = 1; 6436 else if (!strcmp(errors, "replace")) 6437 known_errorHandler = 2; 6438 else if (!strcmp(errors, "ignore")) 6439 known_errorHandler = 3; 6440 else if (!strcmp(errors, "xmlcharrefreplace")) 6441 known_errorHandler = 4; 6442 else 6443 known_errorHandler = 0; 6444 } 6445 switch (known_errorHandler) { 6446 case 1: /* strict */ 6447 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason); 6448 goto onError; 6449 case 2: /* replace */ 6450 while (collstart++<collend) 6451 *str++ = '?'; /* fall through */ 6452 case 3: /* ignore */ 6453 p = collend; 6454 break; 6455 case 4: /* xmlcharrefreplace */ 6456 respos = str - PyBytes_AS_STRING(res); 6457 /* determine replacement size (temporarily (mis)uses p) */ 6458 for (p = collstart, repsize = 0; p < collend; ++p) { 6459 if (*p<10) 6460 repsize += 2+1+1; 6461 else if (*p<100) 6462 repsize += 2+2+1; 6463 else if (*p<1000) 6464 repsize += 2+3+1; 6465 else if (*p<10000) 6466 repsize += 2+4+1; 6467#ifndef Py_UNICODE_WIDE 6468 else 6469 repsize += 2+5+1; 6470#else 6471 else if (*p<100000) 6472 repsize += 2+5+1; 6473 else if (*p<1000000) 6474 repsize += 2+6+1; 6475 else 6476 repsize += 2+7+1; 6477#endif 6478 } 6479 requiredsize = respos+repsize+(endp-collend); 6480 if (requiredsize > ressize) { 6481 if (requiredsize<2*ressize) 6482 requiredsize = 2*ressize; 6483 if (_PyBytes_Resize(&res, requiredsize)) 6484 goto onError; 6485 str = PyBytes_AS_STRING(res) + respos; 6486 ressize = requiredsize; 6487 } 6488 /* generate replacement (temporarily (mis)uses p) */ 6489 for (p = collstart; p < collend; ++p) { 6490 str += sprintf(str, "&#%d;", (int)*p); 6491 } 6492 p = collend; 6493 break; 6494 default: 6495 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler, 6496 encoding, reason, startp, size, &exc, 6497 collstart-startp, collend-startp, 
&newpos); 6498 if (repunicode == NULL) 6499 goto onError; 6500 if (PyBytes_Check(repunicode)) { 6501 /* Directly copy bytes result to output. */ 6502 repsize = PyBytes_Size(repunicode); 6503 if (repsize > 1) { 6504 /* Make room for all additional bytes. */ 6505 respos = str - PyBytes_AS_STRING(res); 6506 if (_PyBytes_Resize(&res, ressize+repsize-1)) { 6507 Py_DECREF(repunicode); 6508 goto onError; 6509 } 6510 str = PyBytes_AS_STRING(res) + respos; 6511 ressize += repsize-1; 6512 } 6513 memcpy(str, PyBytes_AsString(repunicode), repsize); 6514 str += repsize; 6515 p = startp + newpos; 6516 Py_DECREF(repunicode); 6517 break; 6518 } 6519 /* need more space? (at least enough for what we 6520 have+the replacement+the rest of the string, so 6521 we won't have to check space for encodable characters) */ 6522 respos = str - PyBytes_AS_STRING(res); 6523 repsize = PyUnicode_GET_SIZE(repunicode); 6524 requiredsize = respos+repsize+(endp-collend); 6525 if (requiredsize > ressize) { 6526 if (requiredsize<2*ressize) 6527 requiredsize = 2*ressize; 6528 if (_PyBytes_Resize(&res, requiredsize)) { 6529 Py_DECREF(repunicode); 6530 goto onError; 6531 } 6532 str = PyBytes_AS_STRING(res) + respos; 6533 ressize = requiredsize; 6534 } 6535 /* check if there is anything unencodable in the replacement 6536 and copy it to the output */ 6537 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) { 6538 c = *uni2; 6539 if (c >= limit) { 6540 raise_encode_exception(&exc, encoding, startp, size, 6541 unicodepos, unicodepos+1, reason); 6542 Py_DECREF(repunicode); 6543 goto onError; 6544 } 6545 *str = (char)c; 6546 } 6547 p = startp + newpos; 6548 Py_DECREF(repunicode); 6549 } 6550 } 6551 } 6552 /* Resize if we allocated to much */ 6553 size = str - PyBytes_AS_STRING(res); 6554 if (size < ressize) { /* If this falls res will be NULL */ 6555 assert(size >= 0); 6556 if (_PyBytes_Resize(&res, size) < 0) 6557 goto onError; 6558 } 6559 6560 Py_XDECREF(errorHandler); 6561 
Py_XDECREF(exc); 6562 return res; 6563 6564 onError: 6565 Py_XDECREF(res); 6566 Py_XDECREF(errorHandler); 6567 Py_XDECREF(exc); 6568 return NULL; 6569} 6570 6571PyObject * 6572PyUnicode_EncodeLatin1(const Py_UNICODE *p, 6573 Py_ssize_t size, 6574 const char *errors) 6575{ 6576 return unicode_encode_ucs1(p, size, errors, 256); 6577} 6578 6579PyObject * 6580_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors) 6581{ 6582 if (!PyUnicode_Check(unicode)) { 6583 PyErr_BadArgument(); 6584 return NULL; 6585 } 6586 if (PyUnicode_READY(unicode) == -1) 6587 return NULL; 6588 /* Fast path: if it is a one-byte string, construct 6589 bytes object directly. */ 6590 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) 6591 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode), 6592 PyUnicode_GET_LENGTH(unicode)); 6593 /* Non-Latin-1 characters present. Defer to above function to 6594 raise the exception. */ 6595 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode), 6596 PyUnicode_GET_SIZE(unicode), 6597 errors); 6598} 6599 6600PyObject* 6601PyUnicode_AsLatin1String(PyObject *unicode) 6602{ 6603 return _PyUnicode_AsLatin1String(unicode, NULL); 6604} 6605 6606/* --- 7-bit ASCII Codec -------------------------------------------------- */ 6607 6608PyObject * 6609PyUnicode_DecodeASCII(const char *s, 6610 Py_ssize_t size, 6611 const char *errors) 6612{ 6613 const char *starts = s; 6614 PyUnicodeObject *v; 6615 Py_UNICODE *u; 6616 Py_ssize_t startinpos; 6617 Py_ssize_t endinpos; 6618 Py_ssize_t outpos; 6619 const char *e; 6620 int has_error; 6621 const unsigned char *p = (const unsigned char *)s; 6622 const unsigned char *end = p + size; 6623 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK); 6624 PyObject *errorHandler = NULL; 6625 PyObject *exc = NULL; 6626 6627 /* ASCII is equivalent to the first 128 ordinals in Unicode. 
*/ 6628 if (size == 1 && (unsigned char)s[0] < 128) 6629 return get_latin1_char((unsigned char)s[0]); 6630 6631 has_error = 0; 6632 while (p < end && !has_error) { 6633 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for 6634 an explanation. */ 6635 if (!((size_t) p & LONG_PTR_MASK)) { 6636 /* Help register allocation */ 6637 register const unsigned char *_p = p; 6638 while (_p < aligned_end) { 6639 unsigned long value = *(unsigned long *) _p; 6640 if (value & ASCII_CHAR_MASK) { 6641 has_error = 1; 6642 break; 6643 } 6644 _p += SIZEOF_LONG; 6645 } 6646 if (_p == end) 6647 break; 6648 if (has_error) 6649 break; 6650 p = _p; 6651 } 6652 if (*p & 0x80) { 6653 has_error = 1; 6654 break; 6655 } 6656 else { 6657 ++p; 6658 } 6659 } 6660 if (!has_error) 6661 return unicode_fromascii((const unsigned char *)s, size); 6662 6663 v = _PyUnicode_New(size); 6664 if (v == NULL) 6665 goto onError; 6666 if (size == 0) 6667 return (PyObject *)v; 6668 u = PyUnicode_AS_UNICODE(v); 6669 e = s + size; 6670 while (s < e) { 6671 register unsigned char c = (unsigned char)*s; 6672 if (c < 128) { 6673 *u++ = c; 6674 ++s; 6675 } 6676 else { 6677 startinpos = s-starts; 6678 endinpos = startinpos + 1; 6679 outpos = u - (Py_UNICODE *)PyUnicode_AS_UNICODE(v); 6680 if (unicode_decode_call_errorhandler( 6681 errors, &errorHandler, 6682 "ascii", "ordinal not in range(128)", 6683 &starts, &e, &startinpos, &endinpos, &exc, &s, 6684 &v, &outpos, &u)) 6685 goto onError; 6686 } 6687 } 6688 if (u - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v)) 6689 if (PyUnicode_Resize((PyObject**)&v, u - PyUnicode_AS_UNICODE(v)) < 0) 6690 goto onError; 6691 Py_XDECREF(errorHandler); 6692 Py_XDECREF(exc); 6693#ifndef DONT_MAKE_RESULT_READY 6694 if (_PyUnicode_READY_REPLACE(&v)) { 6695 Py_DECREF(v); 6696 return NULL; 6697 } 6698#endif 6699 assert(_PyUnicode_CheckConsistency(v, 1)); 6700 return (PyObject *)v; 6701 6702 onError: 6703 Py_XDECREF(v); 6704 Py_XDECREF(errorHandler); 6705 Py_XDECREF(exc); 6706 return 
NULL; 6707} 6708 6709PyObject * 6710PyUnicode_EncodeASCII(const Py_UNICODE *p, 6711 Py_ssize_t size, 6712 const char *errors) 6713{ 6714 return unicode_encode_ucs1(p, size, errors, 128); 6715} 6716 6717PyObject * 6718_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors) 6719{ 6720 if (!PyUnicode_Check(unicode)) { 6721 PyErr_BadArgument(); 6722 return NULL; 6723 } 6724 if (PyUnicode_READY(unicode) == -1) 6725 return NULL; 6726 /* Fast path: if it is an ASCII-only string, construct bytes object 6727 directly. Else defer to above function to raise the exception. */ 6728 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128) 6729 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode), 6730 PyUnicode_GET_LENGTH(unicode)); 6731 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode), 6732 PyUnicode_GET_SIZE(unicode), 6733 errors); 6734} 6735 6736PyObject * 6737PyUnicode_AsASCIIString(PyObject *unicode) 6738{ 6739 return _PyUnicode_AsASCIIString(unicode, NULL); 6740} 6741 6742#ifdef HAVE_MBCS 6743 6744/* --- MBCS codecs for Windows -------------------------------------------- */ 6745 6746#if SIZEOF_INT < SIZEOF_SIZE_T 6747#define NEED_RETRY 6748#endif 6749 6750/* XXX This code is limited to "true" double-byte encodings, as 6751 a) it assumes an incomplete character consists of a single byte, and 6752 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte 6753 encodings, see IsDBCSLeadByteEx documentation. */ 6754 6755static int 6756is_dbcs_lead_byte(const char *s, int offset) 6757{ 6758 const char *curr = s + offset; 6759 6760 if (IsDBCSLeadByte(*curr)) { 6761 const char *prev = CharPrev(s, curr); 6762 return (prev == curr) || !IsDBCSLeadByte(*prev) || (curr - prev == 2); 6763 } 6764 return 0; 6765} 6766 6767/* 6768 * Decode MBCS string into unicode object. If 'final' is set, converts 6769 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise. 
6770 */ 6771static int 6772decode_mbcs(PyUnicodeObject **v, 6773 const char *s, /* MBCS string */ 6774 int size, /* sizeof MBCS string */ 6775 int final, 6776 const char *errors) 6777{ 6778 Py_UNICODE *p; 6779 Py_ssize_t n; 6780 DWORD usize; 6781 DWORD flags; 6782 6783 assert(size >= 0); 6784 6785 /* check and handle 'errors' arg */ 6786 if (errors==NULL || strcmp(errors, "strict")==0) 6787 flags = MB_ERR_INVALID_CHARS; 6788 else if (strcmp(errors, "ignore")==0) 6789 flags = 0; 6790 else { 6791 PyErr_Format(PyExc_ValueError, 6792 "mbcs encoding does not support errors='%s'", 6793 errors); 6794 return -1; 6795 } 6796 6797 /* Skip trailing lead-byte unless 'final' is set */ 6798 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1)) 6799 --size; 6800 6801 /* First get the size of the result */ 6802 if (size > 0) { 6803 usize = MultiByteToWideChar(CP_ACP, flags, s, size, NULL, 0); 6804 if (usize==0) 6805 goto mbcs_decode_error; 6806 } else 6807 usize = 0; 6808 6809 if (*v == NULL) { 6810 /* Create unicode object */ 6811 *v = _PyUnicode_New(usize); 6812 if (*v == NULL) 6813 return -1; 6814 n = 0; 6815 } 6816 else { 6817 /* Extend unicode object */ 6818 n = PyUnicode_GET_SIZE(*v); 6819 if (PyUnicode_Resize((PyObject**)v, n + usize) < 0) 6820 return -1; 6821 } 6822 6823 /* Do the conversion */ 6824 if (usize > 0) { 6825 p = PyUnicode_AS_UNICODE(*v) + n; 6826 if (0 == MultiByteToWideChar(CP_ACP, flags, s, size, p, usize)) { 6827 goto mbcs_decode_error; 6828 } 6829 } 6830 return size; 6831 6832mbcs_decode_error: 6833 /* If the last error was ERROR_NO_UNICODE_TRANSLATION, then 6834 we raise a UnicodeDecodeError - else it is a 'generic' 6835 windows error 6836 */ 6837 if (GetLastError()==ERROR_NO_UNICODE_TRANSLATION) { 6838 /* Ideally, we should get reason from FormatMessage - this 6839 is the Windows 2000 English version of the message 6840 */ 6841 PyObject *exc = NULL; 6842 const char *reason = "No mapping for the Unicode character exists " 6843 "in the target 
multi-byte code page."; 6844 make_decode_exception(&exc, "mbcs", s, size, 0, 0, reason); 6845 if (exc != NULL) { 6846 PyCodec_StrictErrors(exc); 6847 Py_DECREF(exc); 6848 } 6849 } else { 6850 PyErr_SetFromWindowsErrWithFilename(0, NULL); 6851 } 6852 return -1; 6853} 6854 6855PyObject * 6856PyUnicode_DecodeMBCSStateful(const char *s, 6857 Py_ssize_t size, 6858 const char *errors, 6859 Py_ssize_t *consumed) 6860{ 6861 PyUnicodeObject *v = NULL; 6862 int done; 6863 6864 if (consumed) 6865 *consumed = 0; 6866 6867#ifdef NEED_RETRY 6868 retry: 6869 if (size > INT_MAX) 6870 done = decode_mbcs(&v, s, INT_MAX, 0, errors); 6871 else 6872#endif 6873 done = decode_mbcs(&v, s, (int)size, !consumed, errors); 6874 6875 if (done < 0) { 6876 Py_XDECREF(v); 6877 return NULL; 6878 } 6879 6880 if (consumed) 6881 *consumed += done; 6882 6883#ifdef NEED_RETRY 6884 if (size > INT_MAX) { 6885 s += done; 6886 size -= done; 6887 goto retry; 6888 } 6889#endif 6890#ifndef DONT_MAKE_RESULT_READY 6891 if (_PyUnicode_READY_REPLACE(&v)) { 6892 Py_DECREF(v); 6893 return NULL; 6894 } 6895#endif 6896 assert(_PyUnicode_CheckConsistency(v, 1)); 6897 return (PyObject *)v; 6898} 6899 6900PyObject * 6901PyUnicode_DecodeMBCS(const char *s, 6902 Py_ssize_t size, 6903 const char *errors) 6904{ 6905 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL); 6906} 6907 6908/* 6909 * Convert unicode into string object (MBCS). 6910 * Returns 0 if succeed, -1 otherwise. 
6911 */ 6912static int 6913encode_mbcs(PyObject **repr, 6914 const Py_UNICODE *p, /* unicode */ 6915 int size, /* size of unicode */ 6916 const char* errors) 6917{ 6918 BOOL usedDefaultChar = FALSE; 6919 BOOL *pusedDefaultChar; 6920 int mbcssize; 6921 Py_ssize_t n; 6922 PyObject *exc = NULL; 6923 DWORD flags; 6924 6925 assert(size >= 0); 6926 6927 /* check and handle 'errors' arg */ 6928 if (errors==NULL || strcmp(errors, "strict")==0) { 6929 flags = WC_NO_BEST_FIT_CHARS; 6930 pusedDefaultChar = &usedDefaultChar; 6931 } else if (strcmp(errors, "replace")==0) { 6932 flags = 0; 6933 pusedDefaultChar = NULL; 6934 } else { 6935 PyErr_Format(PyExc_ValueError, 6936 "mbcs encoding does not support errors='%s'", 6937 errors); 6938 return -1; 6939 } 6940 6941 /* First get the size of the result */ 6942 if (size > 0) { 6943 mbcssize = WideCharToMultiByte(CP_ACP, flags, p, size, NULL, 0, 6944 NULL, pusedDefaultChar); 6945 if (mbcssize == 0) { 6946 PyErr_SetFromWindowsErrWithFilename(0, NULL); 6947 return -1; 6948 } 6949 /* If we used a default char, then we failed! 
*/ 6950 if (pusedDefaultChar && *pusedDefaultChar) 6951 goto mbcs_encode_error; 6952 } else { 6953 mbcssize = 0; 6954 } 6955 6956 if (*repr == NULL) { 6957 /* Create string object */ 6958 *repr = PyBytes_FromStringAndSize(NULL, mbcssize); 6959 if (*repr == NULL) 6960 return -1; 6961 n = 0; 6962 } 6963 else { 6964 /* Extend string object */ 6965 n = PyBytes_Size(*repr); 6966 if (_PyBytes_Resize(repr, n + mbcssize) < 0) 6967 return -1; 6968 } 6969 6970 /* Do the conversion */ 6971 if (size > 0) { 6972 char *s = PyBytes_AS_STRING(*repr) + n; 6973 if (0 == WideCharToMultiByte(CP_ACP, flags, p, size, s, mbcssize, 6974 NULL, pusedDefaultChar)) { 6975 PyErr_SetFromWindowsErrWithFilename(0, NULL); 6976 return -1; 6977 } 6978 if (pusedDefaultChar && *pusedDefaultChar) 6979 goto mbcs_encode_error; 6980 } 6981 return 0; 6982 6983mbcs_encode_error: 6984 raise_encode_exception(&exc, "mbcs", p, size, 0, 0, "invalid character"); 6985 Py_XDECREF(exc); 6986 return -1; 6987} 6988 6989PyObject * 6990PyUnicode_EncodeMBCS(const Py_UNICODE *p, 6991 Py_ssize_t size, 6992 const char *errors) 6993{ 6994 PyObject *repr = NULL; 6995 int ret; 6996 6997#ifdef NEED_RETRY 6998 retry: 6999 if (size > INT_MAX) 7000 ret = encode_mbcs(&repr, p, INT_MAX, errors); 7001 else 7002#endif 7003 ret = encode_mbcs(&repr, p, (int)size, errors); 7004 7005 if (ret < 0) { 7006 Py_XDECREF(repr); 7007 return NULL; 7008 } 7009 7010#ifdef NEED_RETRY 7011 if (size > INT_MAX) { 7012 p += INT_MAX; 7013 size -= INT_MAX; 7014 goto retry; 7015 } 7016#endif 7017 7018 return repr; 7019} 7020 7021PyObject * 7022PyUnicode_AsMBCSString(PyObject *unicode) 7023{ 7024 if (!PyUnicode_Check(unicode)) { 7025 PyErr_BadArgument(); 7026 return NULL; 7027 } 7028 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode), 7029 PyUnicode_GET_SIZE(unicode), 7030 NULL); 7031} 7032 7033#undef NEED_RETRY 7034 7035#endif /* HAVE_MBCS */ 7036 7037/* --- Character Mapping Codec -------------------------------------------- */ 7038 7039PyObject * 
7040PyUnicode_DecodeCharmap(const char *s, 7041 Py_ssize_t size, 7042 PyObject *mapping, 7043 const char *errors) 7044{ 7045 const char *starts = s; 7046 Py_ssize_t startinpos; 7047 Py_ssize_t endinpos; 7048 Py_ssize_t outpos; 7049 const char *e; 7050 PyUnicodeObject *v; 7051 Py_UNICODE *p; 7052 Py_ssize_t extrachars = 0; 7053 PyObject *errorHandler = NULL; 7054 PyObject *exc = NULL; 7055 Py_UNICODE *mapstring = NULL; 7056 Py_ssize_t maplen = 0; 7057 7058 /* Default to Latin-1 */ 7059 if (mapping == NULL) 7060 return PyUnicode_DecodeLatin1(s, size, errors); 7061 7062 v = _PyUnicode_New(size); 7063 if (v == NULL) 7064 goto onError; 7065 if (size == 0) 7066 return (PyObject *)v; 7067 p = PyUnicode_AS_UNICODE(v); 7068 e = s + size; 7069 if (PyUnicode_CheckExact(mapping)) { 7070 mapstring = PyUnicode_AS_UNICODE(mapping); 7071 maplen = PyUnicode_GET_SIZE(mapping); 7072 while (s < e) { 7073 unsigned char ch = *s; 7074 Py_UNICODE x = 0xfffe; /* illegal value */ 7075 7076 if (ch < maplen) 7077 x = mapstring[ch]; 7078 7079 if (x == 0xfffe) { 7080 /* undefined mapping */ 7081 outpos = p-PyUnicode_AS_UNICODE(v); 7082 startinpos = s-starts; 7083 endinpos = startinpos+1; 7084 if (unicode_decode_call_errorhandler( 7085 errors, &errorHandler, 7086 "charmap", "character maps to <undefined>", 7087 &starts, &e, &startinpos, &endinpos, &exc, &s, 7088 &v, &outpos, &p)) { 7089 goto onError; 7090 } 7091 continue; 7092 } 7093 *p++ = x; 7094 ++s; 7095 } 7096 } 7097 else { 7098 while (s < e) { 7099 unsigned char ch = *s; 7100 PyObject *w, *x; 7101 7102 /* Get mapping (char ordinal -> integer, Unicode char or None) */ 7103 w = PyLong_FromLong((long)ch); 7104 if (w == NULL) 7105 goto onError; 7106 x = PyObject_GetItem(mapping, w); 7107 Py_DECREF(w); 7108 if (x == NULL) { 7109 if (PyErr_ExceptionMatches(PyExc_LookupError)) { 7110 /* No mapping found means: mapping is undefined. 
*/ 7111 PyErr_Clear(); 7112 x = Py_None; 7113 Py_INCREF(x); 7114 } else 7115 goto onError; 7116 } 7117 7118 /* Apply mapping */ 7119 if (PyLong_Check(x)) { 7120 long value = PyLong_AS_LONG(x); 7121 if (value < 0 || value > 65535) { 7122 PyErr_SetString(PyExc_TypeError, 7123 "character mapping must be in range(65536)"); 7124 Py_DECREF(x); 7125 goto onError; 7126 } 7127 *p++ = (Py_UNICODE)value; 7128 } 7129 else if (x == Py_None) { 7130 /* undefined mapping */ 7131 outpos = p-PyUnicode_AS_UNICODE(v); 7132 startinpos = s-starts; 7133 endinpos = startinpos+1; 7134 if (unicode_decode_call_errorhandler( 7135 errors, &errorHandler, 7136 "charmap", "character maps to <undefined>", 7137 &starts, &e, &startinpos, &endinpos, &exc, &s, 7138 &v, &outpos, &p)) { 7139 Py_DECREF(x); 7140 goto onError; 7141 } 7142 Py_DECREF(x); 7143 continue; 7144 } 7145 else if (PyUnicode_Check(x)) { 7146 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x); 7147 7148 if (targetsize == 1) 7149 /* 1-1 mapping */ 7150 *p++ = *PyUnicode_AS_UNICODE(x); 7151 7152 else if (targetsize > 1) { 7153 /* 1-n mapping */ 7154 if (targetsize > extrachars) { 7155 /* resize first */ 7156 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v); 7157 Py_ssize_t needed = (targetsize - extrachars) + \ 7158 (targetsize << 2); 7159 extrachars += needed; 7160 /* XXX overflow detection missing */ 7161 if (PyUnicode_Resize((PyObject**)&v, 7162 PyUnicode_GET_SIZE(v) + needed) < 0) { 7163 Py_DECREF(x); 7164 goto onError; 7165 } 7166 p = PyUnicode_AS_UNICODE(v) + oldpos; 7167 } 7168 Py_UNICODE_COPY(p, 7169 PyUnicode_AS_UNICODE(x), 7170 targetsize); 7171 p += targetsize; 7172 extrachars -= targetsize; 7173 } 7174 /* 1-0 mapping: skip the character */ 7175 } 7176 else { 7177 /* wrong return value */ 7178 PyErr_SetString(PyExc_TypeError, 7179 "character mapping must return integer, None or str"); 7180 Py_DECREF(x); 7181 goto onError; 7182 } 7183 Py_DECREF(x); 7184 ++s; 7185 } 7186 } 7187 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v)) 
7188 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0) 7189 goto onError; 7190 Py_XDECREF(errorHandler); 7191 Py_XDECREF(exc); 7192#ifndef DONT_MAKE_RESULT_READY 7193 if (_PyUnicode_READY_REPLACE(&v)) { 7194 Py_DECREF(v); 7195 return NULL; 7196 } 7197#endif 7198 assert(_PyUnicode_CheckConsistency(v, 1)); 7199 return (PyObject *)v; 7200 7201 onError: 7202 Py_XDECREF(errorHandler); 7203 Py_XDECREF(exc); 7204 Py_XDECREF(v); 7205 return NULL; 7206} 7207 7208/* Charmap encoding: the lookup table */ 7209 7210struct encoding_map { 7211 PyObject_HEAD 7212 unsigned char level1[32]; 7213 int count2, count3; 7214 unsigned char level23[1]; 7215}; 7216 7217static PyObject* 7218encoding_map_size(PyObject *obj, PyObject* args) 7219{ 7220 struct encoding_map *map = (struct encoding_map*)obj; 7221 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 + 7222 128*map->count3); 7223} 7224 7225static PyMethodDef encoding_map_methods[] = { 7226 {"size", encoding_map_size, METH_NOARGS, 7227 PyDoc_STR("Return the size (in bytes) of this object") }, 7228 { 0 } 7229}; 7230 7231static void 7232encoding_map_dealloc(PyObject* o) 7233{ 7234 PyObject_FREE(o); 7235} 7236 7237static PyTypeObject EncodingMapType = { 7238 PyVarObject_HEAD_INIT(NULL, 0) 7239 "EncodingMap", /*tp_name*/ 7240 sizeof(struct encoding_map), /*tp_basicsize*/ 7241 0, /*tp_itemsize*/ 7242 /* methods */ 7243 encoding_map_dealloc, /*tp_dealloc*/ 7244 0, /*tp_print*/ 7245 0, /*tp_getattr*/ 7246 0, /*tp_setattr*/ 7247 0, /*tp_reserved*/ 7248 0, /*tp_repr*/ 7249 0, /*tp_as_number*/ 7250 0, /*tp_as_sequence*/ 7251 0, /*tp_as_mapping*/ 7252 0, /*tp_hash*/ 7253 0, /*tp_call*/ 7254 0, /*tp_str*/ 7255 0, /*tp_getattro*/ 7256 0, /*tp_setattro*/ 7257 0, /*tp_as_buffer*/ 7258 Py_TPFLAGS_DEFAULT, /*tp_flags*/ 7259 0, /*tp_doc*/ 7260 0, /*tp_traverse*/ 7261 0, /*tp_clear*/ 7262 0, /*tp_richcompare*/ 7263 0, /*tp_weaklistoffset*/ 7264 0, /*tp_iter*/ 7265 0, /*tp_iternext*/ 7266 encoding_map_methods, /*tp_methods*/ 
7267 0, /*tp_members*/ 7268 0, /*tp_getset*/ 7269 0, /*tp_base*/ 7270 0, /*tp_dict*/ 7271 0, /*tp_descr_get*/ 7272 0, /*tp_descr_set*/ 7273 0, /*tp_dictoffset*/ 7274 0, /*tp_init*/ 7275 0, /*tp_alloc*/ 7276 0, /*tp_new*/ 7277 0, /*tp_free*/ 7278 0, /*tp_is_gc*/ 7279}; 7280 7281PyObject* 7282PyUnicode_BuildEncodingMap(PyObject* string) 7283{ 7284 PyObject *result; 7285 struct encoding_map *mresult; 7286 int i; 7287 int need_dict = 0; 7288 unsigned char level1[32]; 7289 unsigned char level2[512]; 7290 unsigned char *mlevel1, *mlevel2, *mlevel3; 7291 int count2 = 0, count3 = 0; 7292 int kind; 7293 void *data; 7294 Py_UCS4 ch; 7295 7296 if (!PyUnicode_Check(string) || PyUnicode_GET_LENGTH(string) != 256) { 7297 PyErr_BadArgument(); 7298 return NULL; 7299 } 7300 kind = PyUnicode_KIND(string); 7301 data = PyUnicode_DATA(string); 7302 memset(level1, 0xFF, sizeof level1); 7303 memset(level2, 0xFF, sizeof level2); 7304 7305 /* If there isn't a one-to-one mapping of NULL to \0, 7306 or if there are non-BMP characters, we need to use 7307 a mapping dictionary. 
*/ 7308 if (PyUnicode_READ(kind, data, 0) != 0) 7309 need_dict = 1; 7310 for (i = 1; i < 256; i++) { 7311 int l1, l2; 7312 ch = PyUnicode_READ(kind, data, i); 7313 if (ch == 0 || ch > 0xFFFF) { 7314 need_dict = 1; 7315 break; 7316 } 7317 if (ch == 0xFFFE) 7318 /* unmapped character */ 7319 continue; 7320 l1 = ch >> 11; 7321 l2 = ch >> 7; 7322 if (level1[l1] == 0xFF) 7323 level1[l1] = count2++; 7324 if (level2[l2] == 0xFF) 7325 level2[l2] = count3++; 7326 } 7327 7328 if (count2 >= 0xFF || count3 >= 0xFF) 7329 need_dict = 1; 7330 7331 if (need_dict) { 7332 PyObject *result = PyDict_New(); 7333 PyObject *key, *value; 7334 if (!result) 7335 return NULL; 7336 for (i = 0; i < 256; i++) { 7337 key = PyLong_FromLong(PyUnicode_READ(kind, data, i)); 7338 value = PyLong_FromLong(i); 7339 if (!key || !value) 7340 goto failed1; 7341 if (PyDict_SetItem(result, key, value) == -1) 7342 goto failed1; 7343 Py_DECREF(key); 7344 Py_DECREF(value); 7345 } 7346 return result; 7347 failed1: 7348 Py_XDECREF(key); 7349 Py_XDECREF(value); 7350 Py_DECREF(result); 7351 return NULL; 7352 } 7353 7354 /* Create a three-level trie */ 7355 result = PyObject_MALLOC(sizeof(struct encoding_map) + 7356 16*count2 + 128*count3 - 1); 7357 if (!result) 7358 return PyErr_NoMemory(); 7359 PyObject_Init(result, &EncodingMapType); 7360 mresult = (struct encoding_map*)result; 7361 mresult->count2 = count2; 7362 mresult->count3 = count3; 7363 mlevel1 = mresult->level1; 7364 mlevel2 = mresult->level23; 7365 mlevel3 = mresult->level23 + 16*count2; 7366 memcpy(mlevel1, level1, 32); 7367 memset(mlevel2, 0xFF, 16*count2); 7368 memset(mlevel3, 0, 128*count3); 7369 count3 = 0; 7370 for (i = 1; i < 256; i++) { 7371 int o1, o2, o3, i2, i3; 7372 if (PyUnicode_READ(kind, data, i) == 0xFFFE) 7373 /* unmapped character */ 7374 continue; 7375 o1 = PyUnicode_READ(kind, data, i)>>11; 7376 o2 = (PyUnicode_READ(kind, data, i)>>7) & 0xF; 7377 i2 = 16*mlevel1[o1] + o2; 7378 if (mlevel2[i2] == 0xFF) 7379 mlevel2[i2] = count3++; 7380 
o3 = PyUnicode_READ(kind, data, i) & 0x7F; 7381 i3 = 128*mlevel2[i2] + o3; 7382 mlevel3[i3] = i; 7383 } 7384 return result; 7385} 7386 7387static int 7388encoding_map_lookup(Py_UNICODE c, PyObject *mapping) 7389{ 7390 struct encoding_map *map = (struct encoding_map*)mapping; 7391 int l1 = c>>11; 7392 int l2 = (c>>7) & 0xF; 7393 int l3 = c & 0x7F; 7394 int i; 7395 7396#ifdef Py_UNICODE_WIDE 7397 if (c > 0xFFFF) { 7398 return -1; 7399 } 7400#endif 7401 if (c == 0) 7402 return 0; 7403 /* level 1*/ 7404 i = map->level1[l1]; 7405 if (i == 0xFF) { 7406 return -1; 7407 } 7408 /* level 2*/ 7409 i = map->level23[16*i+l2]; 7410 if (i == 0xFF) { 7411 return -1; 7412 } 7413 /* level 3 */ 7414 i = map->level23[16*map->count2 + 128*i + l3]; 7415 if (i == 0) { 7416 return -1; 7417 } 7418 return i; 7419} 7420 7421/* Lookup the character ch in the mapping. If the character 7422 can't be found, Py_None is returned (or NULL, if another 7423 error occurred). */ 7424static PyObject * 7425charmapencode_lookup(Py_UNICODE c, PyObject *mapping) 7426{ 7427 PyObject *w = PyLong_FromLong((long)c); 7428 PyObject *x; 7429 7430 if (w == NULL) 7431 return NULL; 7432 x = PyObject_GetItem(mapping, w); 7433 Py_DECREF(w); 7434 if (x == NULL) { 7435 if (PyErr_ExceptionMatches(PyExc_LookupError)) { 7436 /* No mapping found means: mapping is undefined. 
*/ 7437 PyErr_Clear(); 7438 x = Py_None; 7439 Py_INCREF(x); 7440 return x; 7441 } else 7442 return NULL; 7443 } 7444 else if (x == Py_None) 7445 return x; 7446 else if (PyLong_Check(x)) { 7447 long value = PyLong_AS_LONG(x); 7448 if (value < 0 || value > 255) { 7449 PyErr_SetString(PyExc_TypeError, 7450 "character mapping must be in range(256)"); 7451 Py_DECREF(x); 7452 return NULL; 7453 } 7454 return x; 7455 } 7456 else if (PyBytes_Check(x)) 7457 return x; 7458 else { 7459 /* wrong return value */ 7460 PyErr_Format(PyExc_TypeError, 7461 "character mapping must return integer, bytes or None, not %.400s", 7462 x->ob_type->tp_name); 7463 Py_DECREF(x); 7464 return NULL; 7465 } 7466} 7467 7468static int 7469charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize) 7470{ 7471 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj); 7472 /* exponentially overallocate to minimize reallocations */ 7473 if (requiredsize < 2*outsize) 7474 requiredsize = 2*outsize; 7475 if (_PyBytes_Resize(outobj, requiredsize)) 7476 return -1; 7477 return 0; 7478} 7479 7480typedef enum charmapencode_result { 7481 enc_SUCCESS, enc_FAILED, enc_EXCEPTION 7482} charmapencode_result; 7483/* lookup the character, put the result in the output string and adjust 7484 various state variables. Resize the output bytes object if not enough 7485 space is available. Return a new reference to the object that 7486 was put in the output buffer, or Py_None, if the mapping was undefined 7487 (in which case no character was written) or NULL, if a 7488 reallocation error occurred. 
The caller must decref the result */ 7489static charmapencode_result 7490charmapencode_output(Py_UNICODE c, PyObject *mapping, 7491 PyObject **outobj, Py_ssize_t *outpos) 7492{ 7493 PyObject *rep; 7494 char *outstart; 7495 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj); 7496 7497 if (Py_TYPE(mapping) == &EncodingMapType) { 7498 int res = encoding_map_lookup(c, mapping); 7499 Py_ssize_t requiredsize = *outpos+1; 7500 if (res == -1) 7501 return enc_FAILED; 7502 if (outsize<requiredsize) 7503 if (charmapencode_resize(outobj, outpos, requiredsize)) 7504 return enc_EXCEPTION; 7505 outstart = PyBytes_AS_STRING(*outobj); 7506 outstart[(*outpos)++] = (char)res; 7507 return enc_SUCCESS; 7508 } 7509 7510 rep = charmapencode_lookup(c, mapping); 7511 if (rep==NULL) 7512 return enc_EXCEPTION; 7513 else if (rep==Py_None) { 7514 Py_DECREF(rep); 7515 return enc_FAILED; 7516 } else { 7517 if (PyLong_Check(rep)) { 7518 Py_ssize_t requiredsize = *outpos+1; 7519 if (outsize<requiredsize) 7520 if (charmapencode_resize(outobj, outpos, requiredsize)) { 7521 Py_DECREF(rep); 7522 return enc_EXCEPTION; 7523 } 7524 outstart = PyBytes_AS_STRING(*outobj); 7525 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep); 7526 } 7527 else { 7528 const char *repchars = PyBytes_AS_STRING(rep); 7529 Py_ssize_t repsize = PyBytes_GET_SIZE(rep); 7530 Py_ssize_t requiredsize = *outpos+repsize; 7531 if (outsize<requiredsize) 7532 if (charmapencode_resize(outobj, outpos, requiredsize)) { 7533 Py_DECREF(rep); 7534 return enc_EXCEPTION; 7535 } 7536 outstart = PyBytes_AS_STRING(*outobj); 7537 memcpy(outstart + *outpos, repchars, repsize); 7538 *outpos += repsize; 7539 } 7540 } 7541 Py_DECREF(rep); 7542 return enc_SUCCESS; 7543} 7544 7545/* handle an error in PyUnicode_EncodeCharmap 7546 Return 0 on success, -1 on error */ 7547static int 7548charmap_encoding_error( 7549 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping, 7550 PyObject **exceptionObject, 7551 int *known_errorHandler, PyObject 
**errorHandler, const char *errors, 7552 PyObject **res, Py_ssize_t *respos) 7553{ 7554 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */ 7555 Py_ssize_t repsize; 7556 Py_ssize_t newpos; 7557 Py_UNICODE *uni2; 7558 /* startpos for collecting unencodable chars */ 7559 Py_ssize_t collstartpos = *inpos; 7560 Py_ssize_t collendpos = *inpos+1; 7561 Py_ssize_t collpos; 7562 char *encoding = "charmap"; 7563 char *reason = "character maps to <undefined>"; 7564 charmapencode_result x; 7565 7566 /* find all unencodable characters */ 7567 while (collendpos < size) { 7568 PyObject *rep; 7569 if (Py_TYPE(mapping) == &EncodingMapType) { 7570 int res = encoding_map_lookup(p[collendpos], mapping); 7571 if (res != -1) 7572 break; 7573 ++collendpos; 7574 continue; 7575 } 7576 7577 rep = charmapencode_lookup(p[collendpos], mapping); 7578 if (rep==NULL) 7579 return -1; 7580 else if (rep!=Py_None) { 7581 Py_DECREF(rep); 7582 break; 7583 } 7584 Py_DECREF(rep); 7585 ++collendpos; 7586 } 7587 /* cache callback name lookup 7588 * (if not done yet, i.e. 
it's the first error) */ 7589 if (*known_errorHandler==-1) { 7590 if ((errors==NULL) || (!strcmp(errors, "strict"))) 7591 *known_errorHandler = 1; 7592 else if (!strcmp(errors, "replace")) 7593 *known_errorHandler = 2; 7594 else if (!strcmp(errors, "ignore")) 7595 *known_errorHandler = 3; 7596 else if (!strcmp(errors, "xmlcharrefreplace")) 7597 *known_errorHandler = 4; 7598 else 7599 *known_errorHandler = 0; 7600 } 7601 switch (*known_errorHandler) { 7602 case 1: /* strict */ 7603 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason); 7604 return -1; 7605 case 2: /* replace */ 7606 for (collpos = collstartpos; collpos<collendpos; ++collpos) { 7607 x = charmapencode_output('?', mapping, res, respos); 7608 if (x==enc_EXCEPTION) { 7609 return -1; 7610 } 7611 else if (x==enc_FAILED) { 7612 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason); 7613 return -1; 7614 } 7615 } 7616 /* fall through */ 7617 case 3: /* ignore */ 7618 *inpos = collendpos; 7619 break; 7620 case 4: /* xmlcharrefreplace */ 7621 /* generate replacement (temporarily (mis)uses p) */ 7622 for (collpos = collstartpos; collpos < collendpos; ++collpos) { 7623 char buffer[2+29+1+1]; 7624 char *cp; 7625 sprintf(buffer, "&#%d;", (int)p[collpos]); 7626 for (cp = buffer; *cp; ++cp) { 7627 x = charmapencode_output(*cp, mapping, res, respos); 7628 if (x==enc_EXCEPTION) 7629 return -1; 7630 else if (x==enc_FAILED) { 7631 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason); 7632 return -1; 7633 } 7634 } 7635 } 7636 *inpos = collendpos; 7637 break; 7638 default: 7639 repunicode = unicode_encode_call_errorhandler(errors, errorHandler, 7640 encoding, reason, p, size, exceptionObject, 7641 collstartpos, collendpos, &newpos); 7642 if (repunicode == NULL) 7643 return -1; 7644 if (PyBytes_Check(repunicode)) { 7645 /* Directly copy bytes result to output. 
*/ 7646 Py_ssize_t outsize = PyBytes_Size(*res); 7647 Py_ssize_t requiredsize; 7648 repsize = PyBytes_Size(repunicode); 7649 requiredsize = *respos + repsize; 7650 if (requiredsize > outsize) 7651 /* Make room for all additional bytes. */ 7652 if (charmapencode_resize(res, respos, requiredsize)) { 7653 Py_DECREF(repunicode); 7654 return -1; 7655 } 7656 memcpy(PyBytes_AsString(*res) + *respos, 7657 PyBytes_AsString(repunicode), repsize); 7658 *respos += repsize; 7659 *inpos = newpos; 7660 Py_DECREF(repunicode); 7661 break; 7662 } 7663 /* generate replacement */ 7664 repsize = PyUnicode_GET_SIZE(repunicode); 7665 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) { 7666 x = charmapencode_output(*uni2, mapping, res, respos); 7667 if (x==enc_EXCEPTION) { 7668 return -1; 7669 } 7670 else if (x==enc_FAILED) { 7671 Py_DECREF(repunicode); 7672 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason); 7673 return -1; 7674 } 7675 } 7676 *inpos = newpos; 7677 Py_DECREF(repunicode); 7678 } 7679 return 0; 7680} 7681 7682PyObject * 7683PyUnicode_EncodeCharmap(const Py_UNICODE *p, 7684 Py_ssize_t size, 7685 PyObject *mapping, 7686 const char *errors) 7687{ 7688 /* output object */ 7689 PyObject *res = NULL; 7690 /* current input position */ 7691 Py_ssize_t inpos = 0; 7692 /* current output position */ 7693 Py_ssize_t respos = 0; 7694 PyObject *errorHandler = NULL; 7695 PyObject *exc = NULL; 7696 /* the following variable is used for caching string comparisons 7697 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 7698 * 3=ignore, 4=xmlcharrefreplace */ 7699 int known_errorHandler = -1; 7700 7701 /* Default to Latin-1 */ 7702 if (mapping == NULL) 7703 return PyUnicode_EncodeLatin1(p, size, errors); 7704 7705 /* allocate enough for a simple encoding without 7706 replacements, if we need more, we'll resize */ 7707 res = PyBytes_FromStringAndSize(NULL, size); 7708 if (res == NULL) 7709 goto onError; 7710 if (size == 0) 7711 
return res; 7712 7713 while (inpos<size) { 7714 /* try to encode it */ 7715 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos); 7716 if (x==enc_EXCEPTION) /* error */ 7717 goto onError; 7718 if (x==enc_FAILED) { /* unencodable character */ 7719 if (charmap_encoding_error(p, size, &inpos, mapping, 7720 &exc, 7721 &known_errorHandler, &errorHandler, errors, 7722 &res, &respos)) { 7723 goto onError; 7724 } 7725 } 7726 else 7727 /* done with this character => adjust input position */ 7728 ++inpos; 7729 } 7730 7731 /* Resize if we allocated to much */ 7732 if (respos<PyBytes_GET_SIZE(res)) 7733 if (_PyBytes_Resize(&res, respos) < 0) 7734 goto onError; 7735 7736 Py_XDECREF(exc); 7737 Py_XDECREF(errorHandler); 7738 return res; 7739 7740 onError: 7741 Py_XDECREF(res); 7742 Py_XDECREF(exc); 7743 Py_XDECREF(errorHandler); 7744 return NULL; 7745} 7746 7747PyObject * 7748PyUnicode_AsCharmapString(PyObject *unicode, 7749 PyObject *mapping) 7750{ 7751 if (!PyUnicode_Check(unicode) || mapping == NULL) { 7752 PyErr_BadArgument(); 7753 return NULL; 7754 } 7755 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode), 7756 PyUnicode_GET_SIZE(unicode), 7757 mapping, 7758 NULL); 7759} 7760 7761/* create or adjust a UnicodeTranslateError */ 7762static void 7763make_translate_exception(PyObject **exceptionObject, 7764 PyObject *unicode, 7765 Py_ssize_t startpos, Py_ssize_t endpos, 7766 const char *reason) 7767{ 7768 if (*exceptionObject == NULL) { 7769 *exceptionObject = _PyUnicodeTranslateError_Create( 7770 unicode, startpos, endpos, reason); 7771 } 7772 else { 7773 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos)) 7774 goto onError; 7775 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos)) 7776 goto onError; 7777 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason)) 7778 goto onError; 7779 return; 7780 onError: 7781 Py_DECREF(*exceptionObject); 7782 *exceptionObject = NULL; 7783 } 7784} 7785 7786/* raises a 
UnicodeTranslateError */ 7787static void 7788raise_translate_exception(PyObject **exceptionObject, 7789 PyObject *unicode, 7790 Py_ssize_t startpos, Py_ssize_t endpos, 7791 const char *reason) 7792{ 7793 make_translate_exception(exceptionObject, 7794 unicode, startpos, endpos, reason); 7795 if (*exceptionObject != NULL) 7796 PyCodec_StrictErrors(*exceptionObject); 7797} 7798 7799/* error handling callback helper: 7800 build arguments, call the callback and check the arguments, 7801 put the result into newpos and return the replacement string, which 7802 has to be freed by the caller */ 7803static PyObject * 7804unicode_translate_call_errorhandler(const char *errors, 7805 PyObject **errorHandler, 7806 const char *reason, 7807 PyObject *unicode, PyObject **exceptionObject, 7808 Py_ssize_t startpos, Py_ssize_t endpos, 7809 Py_ssize_t *newpos) 7810{ 7811 static char *argparse = "O!n;translating error handler must return (str, int) tuple"; 7812 7813 Py_ssize_t i_newpos; 7814 PyObject *restuple; 7815 PyObject *resunicode; 7816 7817 if (*errorHandler == NULL) { 7818 *errorHandler = PyCodec_LookupError(errors); 7819 if (*errorHandler == NULL) 7820 return NULL; 7821 } 7822 7823 make_translate_exception(exceptionObject, 7824 unicode, startpos, endpos, reason); 7825 if (*exceptionObject == NULL) 7826 return NULL; 7827 7828 restuple = PyObject_CallFunctionObjArgs( 7829 *errorHandler, *exceptionObject, NULL); 7830 if (restuple == NULL) 7831 return NULL; 7832 if (!PyTuple_Check(restuple)) { 7833 PyErr_SetString(PyExc_TypeError, &argparse[4]); 7834 Py_DECREF(restuple); 7835 return NULL; 7836 } 7837 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, 7838 &resunicode, &i_newpos)) { 7839 Py_DECREF(restuple); 7840 return NULL; 7841 } 7842 if (i_newpos<0) 7843 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos; 7844 else 7845 *newpos = i_newpos; 7846 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) { 7847 PyErr_Format(PyExc_IndexError, "position %zd from error handler out 
of bounds", *newpos); 7848 Py_DECREF(restuple); 7849 return NULL; 7850 } 7851 Py_INCREF(resunicode); 7852 Py_DECREF(restuple); 7853 return resunicode; 7854} 7855 7856/* Lookup the character ch in the mapping and put the result in result, 7857 which must be decrefed by the caller. 7858 Return 0 on success, -1 on error */ 7859static int 7860charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result) 7861{ 7862 PyObject *w = PyLong_FromLong((long)c); 7863 PyObject *x; 7864 7865 if (w == NULL) 7866 return -1; 7867 x = PyObject_GetItem(mapping, w); 7868 Py_DECREF(w); 7869 if (x == NULL) { 7870 if (PyErr_ExceptionMatches(PyExc_LookupError)) { 7871 /* No mapping found means: use 1:1 mapping. */ 7872 PyErr_Clear(); 7873 *result = NULL; 7874 return 0; 7875 } else 7876 return -1; 7877 } 7878 else if (x == Py_None) { 7879 *result = x; 7880 return 0; 7881 } 7882 else if (PyLong_Check(x)) { 7883 long value = PyLong_AS_LONG(x); 7884 long max = PyUnicode_GetMax(); 7885 if (value < 0 || value > max) { 7886 PyErr_Format(PyExc_TypeError, 7887 "character mapping must be in range(0x%x)", max+1); 7888 Py_DECREF(x); 7889 return -1; 7890 } 7891 *result = x; 7892 return 0; 7893 } 7894 else if (PyUnicode_Check(x)) { 7895 *result = x; 7896 return 0; 7897 } 7898 else { 7899 /* wrong return value */ 7900 PyErr_SetString(PyExc_TypeError, 7901 "character mapping must return integer, None or str"); 7902 Py_DECREF(x); 7903 return -1; 7904 } 7905} 7906/* ensure that *outobj is at least requiredsize characters long, 7907 if not reallocate and adjust various state variables. 
7908 Return 0 on success, -1 on error */ 7909static int 7910charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize, 7911 Py_ssize_t requiredsize) 7912{ 7913 Py_ssize_t oldsize = *psize; 7914 if (requiredsize > oldsize) { 7915 /* exponentially overallocate to minimize reallocations */ 7916 if (requiredsize < 2 * oldsize) 7917 requiredsize = 2 * oldsize; 7918 *outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4)); 7919 if (*outobj == 0) 7920 return -1; 7921 *psize = requiredsize; 7922 } 7923 return 0; 7924} 7925/* lookup the character, put the result in the output string and adjust 7926 various state variables. Return a new reference to the object that 7927 was put in the output buffer in *result, or Py_None, if the mapping was 7928 undefined (in which case no character was written). 7929 The called must decref result. 7930 Return 0 on success, -1 on error. */ 7931static int 7932charmaptranslate_output(PyObject *input, Py_ssize_t ipos, 7933 PyObject *mapping, Py_UCS4 **output, 7934 Py_ssize_t *osize, Py_ssize_t *opos, 7935 PyObject **res) 7936{ 7937 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos); 7938 if (charmaptranslate_lookup(curinp, mapping, res)) 7939 return -1; 7940 if (*res==NULL) { 7941 /* not found => default to 1:1 mapping */ 7942 (*output)[(*opos)++] = curinp; 7943 } 7944 else if (*res==Py_None) 7945 ; 7946 else if (PyLong_Check(*res)) { 7947 /* no overflow check, because we know that the space is enough */ 7948 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res); 7949 } 7950 else if (PyUnicode_Check(*res)) { 7951 Py_ssize_t repsize; 7952 if (PyUnicode_READY(*res) == -1) 7953 return -1; 7954 repsize = PyUnicode_GET_LENGTH(*res); 7955 if (repsize==1) { 7956 /* no overflow check, because we know that the space is enough */ 7957 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0); 7958 } 7959 else if (repsize!=0) { 7960 /* more than one character */ 7961 Py_ssize_t requiredsize = *opos + 7962 (PyUnicode_GET_LENGTH(input) - ipos) + 7963 
repsize - 1; 7964 Py_ssize_t i; 7965 if (charmaptranslate_makespace(output, osize, requiredsize)) 7966 return -1; 7967 for(i = 0; i < repsize; i++) 7968 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i); 7969 } 7970 } 7971 else 7972 return -1; 7973 return 0; 7974} 7975 7976PyObject * 7977_PyUnicode_TranslateCharmap(PyObject *input, 7978 PyObject *mapping, 7979 const char *errors) 7980{ 7981 /* input object */ 7982 char *idata; 7983 Py_ssize_t size, i; 7984 int kind; 7985 /* output buffer */ 7986 Py_UCS4 *output = NULL; 7987 Py_ssize_t osize; 7988 PyObject *res; 7989 /* current output position */ 7990 Py_ssize_t opos; 7991 char *reason = "character maps to <undefined>"; 7992 PyObject *errorHandler = NULL; 7993 PyObject *exc = NULL; 7994 /* the following variable is used for caching string comparisons 7995 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 7996 * 3=ignore, 4=xmlcharrefreplace */ 7997 int known_errorHandler = -1; 7998 7999 if (mapping == NULL) { 8000 PyErr_BadArgument(); 8001 return NULL; 8002 } 8003 8004 if (PyUnicode_READY(input) == -1) 8005 return NULL; 8006 idata = (char*)PyUnicode_DATA(input); 8007 kind = PyUnicode_KIND(input); 8008 size = PyUnicode_GET_LENGTH(input); 8009 i = 0; 8010 8011 if (size == 0) { 8012 Py_INCREF(input); 8013 return input; 8014 } 8015 8016 /* allocate enough for a simple 1:1 translation without 8017 replacements, if we need more, we'll resize */ 8018 osize = size; 8019 output = PyMem_Malloc(osize * sizeof(Py_UCS4)); 8020 opos = 0; 8021 if (output == NULL) { 8022 PyErr_NoMemory(); 8023 goto onError; 8024 } 8025 8026 while (i<size) { 8027 /* try to encode it */ 8028 PyObject *x = NULL; 8029 if (charmaptranslate_output(input, i, mapping, 8030 &output, &osize, &opos, &x)) { 8031 Py_XDECREF(x); 8032 goto onError; 8033 } 8034 Py_XDECREF(x); 8035 if (x!=Py_None) /* it worked => adjust input pointer */ 8036 ++i; 8037 else { /* untranslatable character */ 8038 PyObject *repunicode = NULL; /* initialize to prevent gcc warning 
*/ 8039 Py_ssize_t repsize; 8040 Py_ssize_t newpos; 8041 Py_ssize_t uni2; 8042 /* startpos for collecting untranslatable chars */ 8043 Py_ssize_t collstart = i; 8044 Py_ssize_t collend = i+1; 8045 Py_ssize_t coll; 8046 8047 /* find all untranslatable characters */ 8048 while (collend < size) { 8049 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x)) 8050 goto onError; 8051 Py_XDECREF(x); 8052 if (x!=Py_None) 8053 break; 8054 ++collend; 8055 } 8056 /* cache callback name lookup 8057 * (if not done yet, i.e. it's the first error) */ 8058 if (known_errorHandler==-1) { 8059 if ((errors==NULL) || (!strcmp(errors, "strict"))) 8060 known_errorHandler = 1; 8061 else if (!strcmp(errors, "replace")) 8062 known_errorHandler = 2; 8063 else if (!strcmp(errors, "ignore")) 8064 known_errorHandler = 3; 8065 else if (!strcmp(errors, "xmlcharrefreplace")) 8066 known_errorHandler = 4; 8067 else 8068 known_errorHandler = 0; 8069 } 8070 switch (known_errorHandler) { 8071 case 1: /* strict */ 8072 raise_translate_exception(&exc, input, collstart, 8073 collend, reason); 8074 goto onError; 8075 case 2: /* replace */ 8076 /* No need to check for space, this is a 1:1 replacement */ 8077 for (coll = collstart; coll<collend; coll++) 8078 output[opos++] = '?'; 8079 /* fall through */ 8080 case 3: /* ignore */ 8081 i = collend; 8082 break; 8083 case 4: /* xmlcharrefreplace */ 8084 /* generate replacement (temporarily (mis)uses i) */ 8085 for (i = collstart; i < collend; ++i) { 8086 char buffer[2+29+1+1]; 8087 char *cp; 8088 sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i)); 8089 if (charmaptranslate_makespace(&output, &osize, 8090 opos+strlen(buffer)+(size-collend))) 8091 goto onError; 8092 for (cp = buffer; *cp; ++cp) 8093 output[opos++] = *cp; 8094 } 8095 i = collend; 8096 break; 8097 default: 8098 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler, 8099 reason, input, &exc, 8100 collstart, collend, &newpos); 8101 if (repunicode == NULL || 
_PyUnicode_READY_REPLACE(&repunicode)) 8102 goto onError; 8103 /* generate replacement */ 8104 repsize = PyUnicode_GET_LENGTH(repunicode); 8105 if (charmaptranslate_makespace(&output, &osize, 8106 opos+repsize+(size-collend))) { 8107 Py_DECREF(repunicode); 8108 goto onError; 8109 } 8110 for (uni2 = 0; repsize-->0; ++uni2) 8111 output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2); 8112 i = newpos; 8113 Py_DECREF(repunicode); 8114 } 8115 } 8116 } 8117 res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos); 8118 if (!res) 8119 goto onError; 8120 PyMem_Free(output); 8121 Py_XDECREF(exc); 8122 Py_XDECREF(errorHandler); 8123 return res; 8124 8125 onError: 8126 PyMem_Free(output); 8127 Py_XDECREF(exc); 8128 Py_XDECREF(errorHandler); 8129 return NULL; 8130} 8131 8132/* Deprecated. Use PyUnicode_Translate instead. */ 8133PyObject * 8134PyUnicode_TranslateCharmap(const Py_UNICODE *p, 8135 Py_ssize_t size, 8136 PyObject *mapping, 8137 const char *errors) 8138{ 8139 PyObject *unicode = PyUnicode_FromUnicode(p, size); 8140 if (!unicode) 8141 return NULL; 8142 return _PyUnicode_TranslateCharmap(unicode, mapping, errors); 8143} 8144 8145PyObject * 8146PyUnicode_Translate(PyObject *str, 8147 PyObject *mapping, 8148 const char *errors) 8149{ 8150 PyObject *result; 8151 8152 str = PyUnicode_FromObject(str); 8153 if (str == NULL) 8154 goto onError; 8155 result = _PyUnicode_TranslateCharmap(str, mapping, errors); 8156 Py_DECREF(str); 8157 return result; 8158 8159 onError: 8160 Py_XDECREF(str); 8161 return NULL; 8162} 8163 8164static Py_UCS4 8165fix_decimal_and_space_to_ascii(PyObject *self) 8166{ 8167 /* No need to call PyUnicode_READY(self) because this function is only 8168 called as a callback from fixup() which does it already. 
*/ 8169 const Py_ssize_t len = PyUnicode_GET_LENGTH(self); 8170 const int kind = PyUnicode_KIND(self); 8171 void *data = PyUnicode_DATA(self); 8172 Py_UCS4 maxchar = 0, ch, fixed; 8173 Py_ssize_t i; 8174 8175 for (i = 0; i < len; ++i) { 8176 ch = PyUnicode_READ(kind, data, i); 8177 fixed = 0; 8178 if (ch > 127) { 8179 if (Py_UNICODE_ISSPACE(ch)) 8180 fixed = ' '; 8181 else { 8182 const int decimal = Py_UNICODE_TODECIMAL(ch); 8183 if (decimal >= 0) 8184 fixed = '0' + decimal; 8185 } 8186 if (fixed != 0) { 8187 if (fixed > maxchar) 8188 maxchar = fixed; 8189 PyUnicode_WRITE(kind, data, i, fixed); 8190 } 8191 else if (ch > maxchar) 8192 maxchar = ch; 8193 } 8194 else if (ch > maxchar) 8195 maxchar = ch; 8196 } 8197 8198 return maxchar; 8199} 8200 8201PyObject * 8202_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode) 8203{ 8204 if (!PyUnicode_Check(unicode)) { 8205 PyErr_BadInternalCall(); 8206 return NULL; 8207 } 8208 if (PyUnicode_READY(unicode) == -1) 8209 return NULL; 8210 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) { 8211 /* If the string is already ASCII, just return the same string */ 8212 Py_INCREF(unicode); 8213 return unicode; 8214 } 8215 return fixup(unicode, fix_decimal_and_space_to_ascii); 8216} 8217 8218PyObject * 8219PyUnicode_TransformDecimalToASCII(Py_UNICODE *s, 8220 Py_ssize_t length) 8221{ 8222 PyObject *result; 8223 Py_UNICODE *p; /* write pointer into result */ 8224 Py_ssize_t i; 8225 /* Copy to a new string */ 8226 result = (PyObject *)_PyUnicode_New(length); 8227 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(result), s, length); 8228 if (result == NULL) 8229 return result; 8230 p = PyUnicode_AS_UNICODE(result); 8231 /* Iterate over code points */ 8232 for (i = 0; i < length; i++) { 8233 Py_UNICODE ch =s[i]; 8234 if (ch > 127) { 8235 int decimal = Py_UNICODE_TODECIMAL(ch); 8236 if (decimal >= 0) 8237 p[i] = '0' + decimal; 8238 } 8239 } 8240#ifndef DONT_MAKE_RESULT_READY 8241 if (_PyUnicode_READY_REPLACE(&result)) { 8242 Py_DECREF(result); 8243 
return NULL; 8244 } 8245#endif 8246 assert(_PyUnicode_CheckConsistency(result, 1)); 8247 return result; 8248} 8249/* --- Decimal Encoder ---------------------------------------------------- */ 8250 8251int 8252PyUnicode_EncodeDecimal(Py_UNICODE *s, 8253 Py_ssize_t length, 8254 char *output, 8255 const char *errors) 8256{ 8257 Py_UNICODE *p, *end; 8258 PyObject *errorHandler = NULL; 8259 PyObject *exc = NULL; 8260 const char *encoding = "decimal"; 8261 const char *reason = "invalid decimal Unicode string"; 8262 /* the following variable is used for caching string comparisons 8263 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */ 8264 int known_errorHandler = -1; 8265 8266 if (output == NULL) { 8267 PyErr_BadArgument(); 8268 return -1; 8269 } 8270 8271 p = s; 8272 end = s + length; 8273 while (p < end) { 8274 register Py_UNICODE ch = *p; 8275 int decimal; 8276 PyObject *repunicode; 8277 Py_ssize_t repsize; 8278 Py_ssize_t newpos; 8279 Py_UNICODE *uni2; 8280 Py_UNICODE *collstart; 8281 Py_UNICODE *collend; 8282 8283 if (Py_UNICODE_ISSPACE(ch)) { 8284 *output++ = ' '; 8285 ++p; 8286 continue; 8287 } 8288 decimal = Py_UNICODE_TODECIMAL(ch); 8289 if (decimal >= 0) { 8290 *output++ = '0' + decimal; 8291 ++p; 8292 continue; 8293 } 8294 if (0 < ch && ch < 256) { 8295 *output++ = (char)ch; 8296 ++p; 8297 continue; 8298 } 8299 /* All other characters are considered unencodable */ 8300 collstart = p; 8301 collend = p+1; 8302 while (collend < end) { 8303 if ((0 < *collend && *collend < 256) || 8304 !Py_UNICODE_ISSPACE(*collend) || 8305 Py_UNICODE_TODECIMAL(*collend)) 8306 break; 8307 } 8308 /* cache callback name lookup 8309 * (if not done yet, i.e. 
it's the first error) */ 8310 if (known_errorHandler==-1) { 8311 if ((errors==NULL) || (!strcmp(errors, "strict"))) 8312 known_errorHandler = 1; 8313 else if (!strcmp(errors, "replace")) 8314 known_errorHandler = 2; 8315 else if (!strcmp(errors, "ignore")) 8316 known_errorHandler = 3; 8317 else if (!strcmp(errors, "xmlcharrefreplace")) 8318 known_errorHandler = 4; 8319 else 8320 known_errorHandler = 0; 8321 } 8322 switch (known_errorHandler) { 8323 case 1: /* strict */ 8324 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason); 8325 goto onError; 8326 case 2: /* replace */ 8327 for (p = collstart; p < collend; ++p) 8328 *output++ = '?'; 8329 /* fall through */ 8330 case 3: /* ignore */ 8331 p = collend; 8332 break; 8333 case 4: /* xmlcharrefreplace */ 8334 /* generate replacement (temporarily (mis)uses p) */ 8335 for (p = collstart; p < collend; ++p) 8336 output += sprintf(output, "&#%d;", (int)*p); 8337 p = collend; 8338 break; 8339 default: 8340 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler, 8341 encoding, reason, s, length, &exc, 8342 collstart-s, collend-s, &newpos); 8343 if (repunicode == NULL) 8344 goto onError; 8345 if (!PyUnicode_Check(repunicode)) { 8346 /* Byte results not supported, since they have no decimal property. 
*/ 8347 PyErr_SetString(PyExc_TypeError, "error handler should return unicode"); 8348 Py_DECREF(repunicode); 8349 goto onError; 8350 } 8351 /* generate replacement */ 8352 repsize = PyUnicode_GET_SIZE(repunicode); 8353 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) { 8354 Py_UNICODE ch = *uni2; 8355 if (Py_UNICODE_ISSPACE(ch)) 8356 *output++ = ' '; 8357 else { 8358 decimal = Py_UNICODE_TODECIMAL(ch); 8359 if (decimal >= 0) 8360 *output++ = '0' + decimal; 8361 else if (0 < ch && ch < 256) 8362 *output++ = (char)ch; 8363 else { 8364 Py_DECREF(repunicode); 8365 raise_encode_exception(&exc, encoding, 8366 s, length, collstart-s, collend-s, reason); 8367 goto onError; 8368 } 8369 } 8370 } 8371 p = s + newpos; 8372 Py_DECREF(repunicode); 8373 } 8374 } 8375 /* 0-terminate the output string */ 8376 *output++ = '\0'; 8377 Py_XDECREF(exc); 8378 Py_XDECREF(errorHandler); 8379 return 0; 8380 8381 onError: 8382 Py_XDECREF(exc); 8383 Py_XDECREF(errorHandler); 8384 return -1; 8385} 8386 8387/* --- Helpers ------------------------------------------------------------ */ 8388 8389#include "stringlib/asciilib.h" 8390#include "stringlib/fastsearch.h" 8391#include "stringlib/partition.h" 8392#include "stringlib/split.h" 8393#include "stringlib/count.h" 8394#include "stringlib/find.h" 8395#include "stringlib/localeutil.h" 8396#include "stringlib/undef.h" 8397 8398#include "stringlib/ucs1lib.h" 8399#include "stringlib/fastsearch.h" 8400#include "stringlib/partition.h" 8401#include "stringlib/split.h" 8402#include "stringlib/count.h" 8403#include "stringlib/find.h" 8404#include "stringlib/localeutil.h" 8405#include "stringlib/undef.h" 8406 8407#include "stringlib/ucs2lib.h" 8408#include "stringlib/fastsearch.h" 8409#include "stringlib/partition.h" 8410#include "stringlib/split.h" 8411#include "stringlib/count.h" 8412#include "stringlib/find.h" 8413#include "stringlib/localeutil.h" 8414#include "stringlib/undef.h" 8415 8416#include "stringlib/ucs4lib.h" 8417#include 
"stringlib/fastsearch.h" 8418#include "stringlib/partition.h" 8419#include "stringlib/split.h" 8420#include "stringlib/count.h" 8421#include "stringlib/find.h" 8422#include "stringlib/localeutil.h" 8423#include "stringlib/undef.h" 8424 8425static Py_ssize_t 8426any_find_slice(Py_ssize_t Py_LOCAL_CALLBACK(ascii)(const Py_UCS1*, Py_ssize_t, 8427 const Py_UCS1*, Py_ssize_t, 8428 Py_ssize_t, Py_ssize_t), 8429 Py_ssize_t Py_LOCAL_CALLBACK(ucs1)(const Py_UCS1*, Py_ssize_t, 8430 const Py_UCS1*, Py_ssize_t, 8431 Py_ssize_t, Py_ssize_t), 8432 Py_ssize_t Py_LOCAL_CALLBACK(ucs2)(const Py_UCS2*, Py_ssize_t, 8433 const Py_UCS2*, Py_ssize_t, 8434 Py_ssize_t, Py_ssize_t), 8435 Py_ssize_t Py_LOCAL_CALLBACK(ucs4)(const Py_UCS4*, Py_ssize_t, 8436 const Py_UCS4*, Py_ssize_t, 8437 Py_ssize_t, Py_ssize_t), 8438 PyObject* s1, PyObject* s2, 8439 Py_ssize_t start, 8440 Py_ssize_t end) 8441{ 8442 int kind1, kind2, kind; 8443 void *buf1, *buf2; 8444 Py_ssize_t len1, len2, result; 8445 8446 kind1 = PyUnicode_KIND(s1); 8447 kind2 = PyUnicode_KIND(s2); 8448 kind = kind1 > kind2 ? 
kind1 : kind2; 8449 buf1 = PyUnicode_DATA(s1); 8450 buf2 = PyUnicode_DATA(s2); 8451 if (kind1 != kind) 8452 buf1 = _PyUnicode_AsKind(s1, kind); 8453 if (!buf1) 8454 return -2; 8455 if (kind2 != kind) 8456 buf2 = _PyUnicode_AsKind(s2, kind); 8457 if (!buf2) { 8458 if (kind1 != kind) PyMem_Free(buf1); 8459 return -2; 8460 } 8461 len1 = PyUnicode_GET_LENGTH(s1); 8462 len2 = PyUnicode_GET_LENGTH(s2); 8463 8464 switch(kind) { 8465 case PyUnicode_1BYTE_KIND: 8466 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2)) 8467 result = ascii(buf1, len1, buf2, len2, start, end); 8468 else 8469 result = ucs1(buf1, len1, buf2, len2, start, end); 8470 break; 8471 case PyUnicode_2BYTE_KIND: 8472 result = ucs2(buf1, len1, buf2, len2, start, end); 8473 break; 8474 case PyUnicode_4BYTE_KIND: 8475 result = ucs4(buf1, len1, buf2, len2, start, end); 8476 break; 8477 default: 8478 assert(0); result = -2; 8479 } 8480 8481 if (kind1 != kind) 8482 PyMem_Free(buf1); 8483 if (kind2 != kind) 8484 PyMem_Free(buf2); 8485 8486 return result; 8487} 8488 8489Py_ssize_t 8490_PyUnicode_InsertThousandsGrouping(PyObject *unicode, int kind, void *data, 8491 Py_ssize_t n_buffer, 8492 void *digits, Py_ssize_t n_digits, 8493 Py_ssize_t min_width, 8494 const char *grouping, 8495 const char *thousands_sep) 8496{ 8497 switch(kind) { 8498 case PyUnicode_1BYTE_KIND: 8499 if (unicode != NULL && PyUnicode_IS_ASCII(unicode)) 8500 return _PyUnicode_ascii_InsertThousandsGrouping( 8501 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits, 8502 min_width, grouping, thousands_sep); 8503 else 8504 return _PyUnicode_ucs1_InsertThousandsGrouping( 8505 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits, 8506 min_width, grouping, thousands_sep); 8507 case PyUnicode_2BYTE_KIND: 8508 return _PyUnicode_ucs2_InsertThousandsGrouping( 8509 (Py_UCS2*)data, n_buffer, (Py_UCS2*)digits, n_digits, 8510 min_width, grouping, thousands_sep); 8511 case PyUnicode_4BYTE_KIND: 8512 return _PyUnicode_ucs4_InsertThousandsGrouping( 8513 
(Py_UCS4*)data, n_buffer, (Py_UCS4*)digits, n_digits, 8514 min_width, grouping, thousands_sep); 8515 } 8516 assert(0); 8517 return -1; 8518} 8519 8520 8521#include "stringlib/unicodedefs.h" 8522#include "stringlib/fastsearch.h" 8523 8524#include "stringlib/count.h" 8525#include "stringlib/find.h" 8526 8527/* helper macro to fixup start/end slice values */ 8528#define ADJUST_INDICES(start, end, len) \ 8529 if (end > len) \ 8530 end = len; \ 8531 else if (end < 0) { \ 8532 end += len; \ 8533 if (end < 0) \ 8534 end = 0; \ 8535 } \ 8536 if (start < 0) { \ 8537 start += len; \ 8538 if (start < 0) \ 8539 start = 0; \ 8540 } 8541 8542Py_ssize_t 8543PyUnicode_Count(PyObject *str, 8544 PyObject *substr, 8545 Py_ssize_t start, 8546 Py_ssize_t end) 8547{ 8548 Py_ssize_t result; 8549 PyUnicodeObject* str_obj; 8550 PyUnicodeObject* sub_obj; 8551 int kind1, kind2, kind; 8552 void *buf1 = NULL, *buf2 = NULL; 8553 Py_ssize_t len1, len2; 8554 8555 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str); 8556 if (!str_obj || PyUnicode_READY(str_obj) == -1) 8557 return -1; 8558 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr); 8559 if (!sub_obj || PyUnicode_READY(sub_obj) == -1) { 8560 Py_DECREF(str_obj); 8561 return -1; 8562 } 8563 8564 kind1 = PyUnicode_KIND(str_obj); 8565 kind2 = PyUnicode_KIND(sub_obj); 8566 kind = kind1 > kind2 ? 
kind1 : kind2; 8567 buf1 = PyUnicode_DATA(str_obj); 8568 if (kind1 != kind) 8569 buf1 = _PyUnicode_AsKind((PyObject*)str_obj, kind); 8570 if (!buf1) 8571 goto onError; 8572 buf2 = PyUnicode_DATA(sub_obj); 8573 if (kind2 != kind) 8574 buf2 = _PyUnicode_AsKind((PyObject*)sub_obj, kind); 8575 if (!buf2) 8576 goto onError; 8577 len1 = PyUnicode_GET_LENGTH(str_obj); 8578 len2 = PyUnicode_GET_LENGTH(sub_obj); 8579 8580 ADJUST_INDICES(start, end, len1); 8581 switch(kind) { 8582 case PyUnicode_1BYTE_KIND: 8583 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sub_obj)) 8584 result = asciilib_count( 8585 ((Py_UCS1*)buf1) + start, end - start, 8586 buf2, len2, PY_SSIZE_T_MAX 8587 ); 8588 else 8589 result = ucs1lib_count( 8590 ((Py_UCS1*)buf1) + start, end - start, 8591 buf2, len2, PY_SSIZE_T_MAX 8592 ); 8593 break; 8594 case PyUnicode_2BYTE_KIND: 8595 result = ucs2lib_count( 8596 ((Py_UCS2*)buf1) + start, end - start, 8597 buf2, len2, PY_SSIZE_T_MAX 8598 ); 8599 break; 8600 case PyUnicode_4BYTE_KIND: 8601 result = ucs4lib_count( 8602 ((Py_UCS4*)buf1) + start, end - start, 8603 buf2, len2, PY_SSIZE_T_MAX 8604 ); 8605 break; 8606 default: 8607 assert(0); result = 0; 8608 } 8609 8610 Py_DECREF(sub_obj); 8611 Py_DECREF(str_obj); 8612 8613 if (kind1 != kind) 8614 PyMem_Free(buf1); 8615 if (kind2 != kind) 8616 PyMem_Free(buf2); 8617 8618 return result; 8619 onError: 8620 Py_DECREF(sub_obj); 8621 Py_DECREF(str_obj); 8622 if (kind1 != kind && buf1) 8623 PyMem_Free(buf1); 8624 if (kind2 != kind && buf2) 8625 PyMem_Free(buf2); 8626 return -1; 8627} 8628 8629Py_ssize_t 8630PyUnicode_Find(PyObject *str, 8631 PyObject *sub, 8632 Py_ssize_t start, 8633 Py_ssize_t end, 8634 int direction) 8635{ 8636 Py_ssize_t result; 8637 8638 str = PyUnicode_FromObject(str); 8639 if (!str || PyUnicode_READY(str) == -1) 8640 return -2; 8641 sub = PyUnicode_FromObject(sub); 8642 if (!sub || PyUnicode_READY(sub) == -1) { 8643 Py_DECREF(str); 8644 return -2; 8645 } 8646 8647 if (direction > 0) 8648 
result = any_find_slice( 8649 asciilib_find_slice, ucs1lib_find_slice, 8650 ucs2lib_find_slice, ucs4lib_find_slice, 8651 str, sub, start, end 8652 ); 8653 else 8654 result = any_find_slice( 8655 asciilib_find_slice, ucs1lib_rfind_slice, 8656 ucs2lib_rfind_slice, ucs4lib_rfind_slice, 8657 str, sub, start, end 8658 ); 8659 8660 Py_DECREF(str); 8661 Py_DECREF(sub); 8662 8663 return result; 8664} 8665 8666Py_ssize_t 8667PyUnicode_FindChar(PyObject *str, Py_UCS4 ch, 8668 Py_ssize_t start, Py_ssize_t end, 8669 int direction) 8670{ 8671 char *result; 8672 int kind; 8673 if (PyUnicode_READY(str) == -1) 8674 return -2; 8675 if (start < 0 || end < 0) { 8676 PyErr_SetString(PyExc_IndexError, "string index out of range"); 8677 return -2; 8678 } 8679 if (end > PyUnicode_GET_LENGTH(str)) 8680 end = PyUnicode_GET_LENGTH(str); 8681 kind = PyUnicode_KIND(str); 8682 result = findchar(PyUnicode_1BYTE_DATA(str) 8683 + PyUnicode_KIND_SIZE(kind, start), 8684 kind, 8685 end-start, ch, direction); 8686 if (!result) 8687 return -1; 8688 return (result-(char*)PyUnicode_DATA(str)) >> (kind-1); 8689} 8690 8691static int 8692tailmatch(PyUnicodeObject *self, 8693 PyUnicodeObject *substring, 8694 Py_ssize_t start, 8695 Py_ssize_t end, 8696 int direction) 8697{ 8698 int kind_self; 8699 int kind_sub; 8700 void *data_self; 8701 void *data_sub; 8702 Py_ssize_t offset; 8703 Py_ssize_t i; 8704 Py_ssize_t end_sub; 8705 8706 if (PyUnicode_READY(self) == -1 || 8707 PyUnicode_READY(substring) == -1) 8708 return 0; 8709 8710 if (PyUnicode_GET_LENGTH(substring) == 0) 8711 return 1; 8712 8713 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self)); 8714 end -= PyUnicode_GET_LENGTH(substring); 8715 if (end < start) 8716 return 0; 8717 8718 kind_self = PyUnicode_KIND(self); 8719 data_self = PyUnicode_DATA(self); 8720 kind_sub = PyUnicode_KIND(substring); 8721 data_sub = PyUnicode_DATA(substring); 8722 end_sub = PyUnicode_GET_LENGTH(substring) - 1; 8723 8724 if (direction > 0) 8725 offset = end; 8726 else 8727 
offset = start; 8728 8729 if (PyUnicode_READ(kind_self, data_self, offset) == 8730 PyUnicode_READ(kind_sub, data_sub, 0) && 8731 PyUnicode_READ(kind_self, data_self, offset + end_sub) == 8732 PyUnicode_READ(kind_sub, data_sub, end_sub)) { 8733 /* If both are of the same kind, memcmp is sufficient */ 8734 if (kind_self == kind_sub) { 8735 return ! memcmp((char *)data_self + 8736 (offset * PyUnicode_CHARACTER_SIZE(substring)), 8737 data_sub, 8738 PyUnicode_GET_LENGTH(substring) * 8739 PyUnicode_CHARACTER_SIZE(substring)); 8740 } 8741 /* otherwise we have to compare each character by first accesing it */ 8742 else { 8743 /* We do not need to compare 0 and len(substring)-1 because 8744 the if statement above ensured already that they are equal 8745 when we end up here. */ 8746 // TODO: honor direction and do a forward or backwards search 8747 for (i = 1; i < end_sub; ++i) { 8748 if (PyUnicode_READ(kind_self, data_self, offset + i) != 8749 PyUnicode_READ(kind_sub, data_sub, i)) 8750 return 0; 8751 } 8752 return 1; 8753 } 8754 } 8755 8756 return 0; 8757} 8758 8759Py_ssize_t 8760PyUnicode_Tailmatch(PyObject *str, 8761 PyObject *substr, 8762 Py_ssize_t start, 8763 Py_ssize_t end, 8764 int direction) 8765{ 8766 Py_ssize_t result; 8767 8768 str = PyUnicode_FromObject(str); 8769 if (str == NULL) 8770 return -1; 8771 substr = PyUnicode_FromObject(substr); 8772 if (substr == NULL) { 8773 Py_DECREF(str); 8774 return -1; 8775 } 8776 8777 result = tailmatch((PyUnicodeObject *)str, 8778 (PyUnicodeObject *)substr, 8779 start, end, direction); 8780 Py_DECREF(str); 8781 Py_DECREF(substr); 8782 return result; 8783} 8784 8785/* Apply fixfct filter to the Unicode object self and return a 8786 reference to the modified object */ 8787 8788static PyObject * 8789fixup(PyObject *self, 8790 Py_UCS4 (*fixfct)(PyObject *s)) 8791{ 8792 PyObject *u; 8793 Py_UCS4 maxchar_old, maxchar_new = 0; 8794 8795 if (PyUnicode_READY(self) == -1) 8796 return NULL; 8797 maxchar_old = 
PyUnicode_MAX_CHAR_VALUE(self); 8798 u = PyUnicode_New(PyUnicode_GET_LENGTH(self), 8799 maxchar_old); 8800 if (u == NULL) 8801 return NULL; 8802 8803 Py_MEMCPY(PyUnicode_1BYTE_DATA(u), PyUnicode_1BYTE_DATA(self), 8804 PyUnicode_GET_LENGTH(u) * PyUnicode_CHARACTER_SIZE(u)); 8805 8806 /* fix functions return the new maximum character in a string, 8807 if the kind of the resulting unicode object does not change, 8808 everything is fine. Otherwise we need to change the string kind 8809 and re-run the fix function. */ 8810 maxchar_new = fixfct(u); 8811 if (maxchar_new == 0) 8812 /* do nothing, keep maxchar_new at 0 which means no changes. */; 8813 else if (maxchar_new <= 127) 8814 maxchar_new = 127; 8815 else if (maxchar_new <= 255) 8816 maxchar_new = 255; 8817 else if (maxchar_new <= 65535) 8818 maxchar_new = 65535; 8819 else 8820 maxchar_new = 1114111; /* 0x10ffff */ 8821 8822 if (!maxchar_new && PyUnicode_CheckExact(self)) { 8823 /* fixfct should return TRUE if it modified the buffer. If 8824 FALSE, return a reference to the original buffer instead 8825 (to save space, not time) */ 8826 Py_INCREF(self); 8827 Py_DECREF(u); 8828 return (PyObject*) self; 8829 } 8830 else if (maxchar_new == maxchar_old) { 8831 return u; 8832 } 8833 else { 8834 /* In case the maximum character changed, we need to 8835 convert the string to the new category. */ 8836 PyObject *v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new); 8837 if (v == NULL) { 8838 Py_DECREF(u); 8839 return NULL; 8840 } 8841 if (maxchar_new > maxchar_old) { 8842 /* If the maxchar increased so that the kind changed, not all 8843 characters are representable anymore and we need to fix the 8844 string again. This only happens in very few cases. 
*/ 8845 copy_characters(v, 0, self, 0, PyUnicode_GET_LENGTH(self)); 8846 maxchar_old = fixfct(v); 8847 assert(maxchar_old > 0 && maxchar_old <= maxchar_new); 8848 } 8849 else { 8850 copy_characters(v, 0, u, 0, PyUnicode_GET_LENGTH(self)); 8851 } 8852 8853 Py_DECREF(u); 8854 assert(_PyUnicode_CheckConsistency(v, 1)); 8855 return v; 8856 } 8857} 8858 8859static Py_UCS4 8860fixupper(PyObject *self) 8861{ 8862 /* No need to call PyUnicode_READY(self) because this function is only 8863 called as a callback from fixup() which does it already. */ 8864 const Py_ssize_t len = PyUnicode_GET_LENGTH(self); 8865 const int kind = PyUnicode_KIND(self); 8866 void *data = PyUnicode_DATA(self); 8867 int touched = 0; 8868 Py_UCS4 maxchar = 0; 8869 Py_ssize_t i; 8870 8871 for (i = 0; i < len; ++i) { 8872 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); 8873 const Py_UCS4 up = Py_UNICODE_TOUPPER(ch); 8874 if (up != ch) { 8875 if (up > maxchar) 8876 maxchar = up; 8877 PyUnicode_WRITE(kind, data, i, up); 8878 touched = 1; 8879 } 8880 else if (ch > maxchar) 8881 maxchar = ch; 8882 } 8883 8884 if (touched) 8885 return maxchar; 8886 else 8887 return 0; 8888} 8889 8890static Py_UCS4 8891fixlower(PyObject *self) 8892{ 8893 /* No need to call PyUnicode_READY(self) because fixup() which does it. 
*/ 8894 const Py_ssize_t len = PyUnicode_GET_LENGTH(self); 8895 const int kind = PyUnicode_KIND(self); 8896 void *data = PyUnicode_DATA(self); 8897 int touched = 0; 8898 Py_UCS4 maxchar = 0; 8899 Py_ssize_t i; 8900 8901 for(i = 0; i < len; ++i) { 8902 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); 8903 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch); 8904 if (lo != ch) { 8905 if (lo > maxchar) 8906 maxchar = lo; 8907 PyUnicode_WRITE(kind, data, i, lo); 8908 touched = 1; 8909 } 8910 else if (ch > maxchar) 8911 maxchar = ch; 8912 } 8913 8914 if (touched) 8915 return maxchar; 8916 else 8917 return 0; 8918} 8919 8920static Py_UCS4 8921fixswapcase(PyObject *self) 8922{ 8923 /* No need to call PyUnicode_READY(self) because fixup() which does it. */ 8924 const Py_ssize_t len = PyUnicode_GET_LENGTH(self); 8925 const int kind = PyUnicode_KIND(self); 8926 void *data = PyUnicode_DATA(self); 8927 int touched = 0; 8928 Py_UCS4 maxchar = 0; 8929 Py_ssize_t i; 8930 8931 for(i = 0; i < len; ++i) { 8932 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); 8933 Py_UCS4 nu = 0; 8934 8935 if (Py_UNICODE_ISUPPER(ch)) 8936 nu = Py_UNICODE_TOLOWER(ch); 8937 else if (Py_UNICODE_ISLOWER(ch)) 8938 nu = Py_UNICODE_TOUPPER(ch); 8939 8940 if (nu != 0) { 8941 if (nu > maxchar) 8942 maxchar = nu; 8943 PyUnicode_WRITE(kind, data, i, nu); 8944 touched = 1; 8945 } 8946 else if (ch > maxchar) 8947 maxchar = ch; 8948 } 8949 8950 if (touched) 8951 return maxchar; 8952 else 8953 return 0; 8954} 8955 8956static Py_UCS4 8957fixcapitalize(PyObject *self) 8958{ 8959 /* No need to call PyUnicode_READY(self) because fixup() which does it. 
*/ 8960 const Py_ssize_t len = PyUnicode_GET_LENGTH(self); 8961 const int kind = PyUnicode_KIND(self); 8962 void *data = PyUnicode_DATA(self); 8963 int touched = 0; 8964 Py_UCS4 maxchar = 0; 8965 Py_ssize_t i = 0; 8966 Py_UCS4 ch; 8967 8968 if (len == 0) 8969 return 0; 8970 8971 ch = PyUnicode_READ(kind, data, i); 8972 if (!Py_UNICODE_ISUPPER(ch)) { 8973 maxchar = Py_UNICODE_TOUPPER(ch); 8974 PyUnicode_WRITE(kind, data, i, maxchar); 8975 touched = 1; 8976 } 8977 ++i; 8978 for(; i < len; ++i) { 8979 ch = PyUnicode_READ(kind, data, i); 8980 if (!Py_UNICODE_ISLOWER(ch)) { 8981 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch); 8982 if (lo > maxchar) 8983 maxchar = lo; 8984 PyUnicode_WRITE(kind, data, i, lo); 8985 touched = 1; 8986 } 8987 else if (ch > maxchar) 8988 maxchar = ch; 8989 } 8990 8991 if (touched) 8992 return maxchar; 8993 else 8994 return 0; 8995} 8996 8997static Py_UCS4 8998fixtitle(PyObject *self) 8999{ 9000 /* No need to call PyUnicode_READY(self) because fixup() which does it. */ 9001 const Py_ssize_t len = PyUnicode_GET_LENGTH(self); 9002 const int kind = PyUnicode_KIND(self); 9003 void *data = PyUnicode_DATA(self); 9004 Py_UCS4 maxchar = 0; 9005 Py_ssize_t i = 0; 9006 int previous_is_cased; 9007 9008 /* Shortcut for single character strings */ 9009 if (len == 1) { 9010 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); 9011 const Py_UCS4 ti = Py_UNICODE_TOTITLE(ch); 9012 if (ti != ch) { 9013 PyUnicode_WRITE(kind, data, i, ti); 9014 return ti; 9015 } 9016 else 9017 return 0; 9018 } 9019 previous_is_cased = 0; 9020 for(; i < len; ++i) { 9021 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); 9022 Py_UCS4 nu; 9023 9024 if (previous_is_cased) 9025 nu = Py_UNICODE_TOLOWER(ch); 9026 else 9027 nu = Py_UNICODE_TOTITLE(ch); 9028 9029 if (nu > maxchar) 9030 maxchar = nu; 9031 PyUnicode_WRITE(kind, data, i, nu); 9032 9033 if (Py_UNICODE_ISLOWER(ch) || 9034 Py_UNICODE_ISUPPER(ch) || 9035 Py_UNICODE_ISTITLE(ch)) 9036 previous_is_cased = 1; 9037 else 9038 previous_is_cased 
= 0; 9039 } 9040 return maxchar; 9041} 9042 9043PyObject * 9044PyUnicode_Join(PyObject *separator, PyObject *seq) 9045{ 9046 PyObject *sep = NULL; 9047 Py_ssize_t seplen = 1; 9048 PyObject *res = NULL; /* the result */ 9049 PyObject *fseq; /* PySequence_Fast(seq) */ 9050 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */ 9051 PyObject **items; 9052 PyObject *item; 9053 Py_ssize_t sz, i, res_offset; 9054 Py_UCS4 maxchar; 9055 Py_UCS4 item_maxchar; 9056 9057 fseq = PySequence_Fast(seq, ""); 9058 if (fseq == NULL) { 9059 return NULL; 9060 } 9061 9062 /* NOTE: the following code can't call back into Python code, 9063 * so we are sure that fseq won't be mutated. 9064 */ 9065 9066 seqlen = PySequence_Fast_GET_SIZE(fseq); 9067 /* If empty sequence, return u"". */ 9068 if (seqlen == 0) { 9069 Py_DECREF(fseq); 9070 Py_INCREF(unicode_empty); 9071 res = unicode_empty; 9072 return res; 9073 } 9074 9075 /* If singleton sequence with an exact Unicode, return that. */ 9076 items = PySequence_Fast_ITEMS(fseq); 9077 if (seqlen == 1 && PyUnicode_CheckExact(items[0])) { 9078 res = items[0]; 9079 Py_INCREF(res); 9080 Py_DECREF(fseq); 9081 return res; 9082 } 9083 9084 /* Set up sep and seplen */ 9085 if (separator == NULL) { 9086 /* fall back to a blank space separator */ 9087 sep = PyUnicode_FromOrdinal(' '); 9088 if (!sep) 9089 goto onError; 9090 maxchar = 32; 9091 } 9092 else { 9093 if (!PyUnicode_Check(separator)) { 9094 PyErr_Format(PyExc_TypeError, 9095 "separator: expected str instance," 9096 " %.80s found", 9097 Py_TYPE(separator)->tp_name); 9098 goto onError; 9099 } 9100 if (PyUnicode_READY(separator)) 9101 goto onError; 9102 sep = separator; 9103 seplen = PyUnicode_GET_LENGTH(separator); 9104 maxchar = PyUnicode_MAX_CHAR_VALUE(separator); 9105 /* inc refcount to keep this code path symmetric with the 9106 above case of a blank separator */ 9107 Py_INCREF(sep); 9108 } 9109 9110 /* There are at least two things to join, or else we have a subclass 9111 * of str in 
the sequence. 9112 * Do a pre-pass to figure out the total amount of space we'll 9113 * need (sz), and see whether all argument are strings. 9114 */ 9115 sz = 0; 9116 for (i = 0; i < seqlen; i++) { 9117 const Py_ssize_t old_sz = sz; 9118 item = items[i]; 9119 if (!PyUnicode_Check(item)) { 9120 PyErr_Format(PyExc_TypeError, 9121 "sequence item %zd: expected str instance," 9122 " %.80s found", 9123 i, Py_TYPE(item)->tp_name); 9124 goto onError; 9125 } 9126 if (PyUnicode_READY(item) == -1) 9127 goto onError; 9128 sz += PyUnicode_GET_LENGTH(item); 9129 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item); 9130 if (item_maxchar > maxchar) 9131 maxchar = item_maxchar; 9132 if (i != 0) 9133 sz += seplen; 9134 if (sz < old_sz || sz > PY_SSIZE_T_MAX) { 9135 PyErr_SetString(PyExc_OverflowError, 9136 "join() result is too long for a Python string"); 9137 goto onError; 9138 } 9139 } 9140 9141 res = PyUnicode_New(sz, maxchar); 9142 if (res == NULL) 9143 goto onError; 9144 9145 /* Catenate everything. */ 9146 for (i = 0, res_offset = 0; i < seqlen; ++i) { 9147 Py_ssize_t itemlen; 9148 item = items[i]; 9149 /* Copy item, and maybe the separator. 
*/ 9150 if (i && seplen != 0) { 9151 copy_characters(res, res_offset, sep, 0, seplen); 9152 res_offset += seplen; 9153 } 9154 itemlen = PyUnicode_GET_LENGTH(item); 9155 if (itemlen != 0) { 9156 copy_characters(res, res_offset, item, 0, itemlen); 9157 res_offset += itemlen; 9158 } 9159 } 9160 assert(res_offset == PyUnicode_GET_LENGTH(res)); 9161 9162 Py_DECREF(fseq); 9163 Py_XDECREF(sep); 9164 assert(_PyUnicode_CheckConsistency(res, 1)); 9165 return res; 9166 9167 onError: 9168 Py_DECREF(fseq); 9169 Py_XDECREF(sep); 9170 Py_XDECREF(res); 9171 return NULL; 9172} 9173 9174#define FILL(kind, data, value, start, length) \ 9175 do { \ 9176 Py_ssize_t i_ = 0; \ 9177 assert(kind != PyUnicode_WCHAR_KIND); \ 9178 switch ((kind)) { \ 9179 case PyUnicode_1BYTE_KIND: { \ 9180 unsigned char * to_ = (unsigned char *)((data)) + (start); \ 9181 memset(to_, (unsigned char)value, length); \ 9182 break; \ 9183 } \ 9184 case PyUnicode_2BYTE_KIND: { \ 9185 Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \ 9186 for (; i_ < (length); ++i_, ++to_) *to_ = (value); \ 9187 break; \ 9188 } \ 9189 default: { \ 9190 Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \ 9191 for (; i_ < (length); ++i_, ++to_) *to_ = (value); \ 9192 break; \ 9193 } \ 9194 } \ 9195 } while (0) 9196 9197static PyObject * 9198pad(PyObject *self, 9199 Py_ssize_t left, 9200 Py_ssize_t right, 9201 Py_UCS4 fill) 9202{ 9203 PyObject *u; 9204 Py_UCS4 maxchar; 9205 int kind; 9206 void *data; 9207 9208 if (left < 0) 9209 left = 0; 9210 if (right < 0) 9211 right = 0; 9212 9213 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) { 9214 Py_INCREF(self); 9215 return self; 9216 } 9217 9218 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) || 9219 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) { 9220 PyErr_SetString(PyExc_OverflowError, "padded string is too long"); 9221 return NULL; 9222 } 9223 maxchar = PyUnicode_MAX_CHAR_VALUE(self); 9224 if (fill > maxchar) 9225 maxchar = fill; 9226 u = PyUnicode_New(left + 
_PyUnicode_LENGTH(self) + right, maxchar); 9227 if (!u) 9228 return NULL; 9229 9230 kind = PyUnicode_KIND(u); 9231 data = PyUnicode_DATA(u); 9232 if (left) 9233 FILL(kind, data, fill, 0, left); 9234 if (right) 9235 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right); 9236 copy_characters(u, left, self, 0, _PyUnicode_LENGTH(self)); 9237 assert(_PyUnicode_CheckConsistency(u, 1)); 9238 return u; 9239} 9240#undef FILL 9241 9242PyObject * 9243PyUnicode_Splitlines(PyObject *string, int keepends) 9244{ 9245 PyObject *list; 9246 9247 string = PyUnicode_FromObject(string); 9248 if (string == NULL || PyUnicode_READY(string) == -1) 9249 return NULL; 9250 9251 switch(PyUnicode_KIND(string)) { 9252 case PyUnicode_1BYTE_KIND: 9253 if (PyUnicode_IS_ASCII(string)) 9254 list = asciilib_splitlines( 9255 (PyObject*) string, PyUnicode_1BYTE_DATA(string), 9256 PyUnicode_GET_LENGTH(string), keepends); 9257 else 9258 list = ucs1lib_splitlines( 9259 (PyObject*) string, PyUnicode_1BYTE_DATA(string), 9260 PyUnicode_GET_LENGTH(string), keepends); 9261 break; 9262 case PyUnicode_2BYTE_KIND: 9263 list = ucs2lib_splitlines( 9264 (PyObject*) string, PyUnicode_2BYTE_DATA(string), 9265 PyUnicode_GET_LENGTH(string), keepends); 9266 break; 9267 case PyUnicode_4BYTE_KIND: 9268 list = ucs4lib_splitlines( 9269 (PyObject*) string, PyUnicode_4BYTE_DATA(string), 9270 PyUnicode_GET_LENGTH(string), keepends); 9271 break; 9272 default: 9273 assert(0); 9274 list = 0; 9275 } 9276 Py_DECREF(string); 9277 return list; 9278} 9279 9280static PyObject * 9281split(PyObject *self, 9282 PyObject *substring, 9283 Py_ssize_t maxcount) 9284{ 9285 int kind1, kind2, kind; 9286 void *buf1, *buf2; 9287 Py_ssize_t len1, len2; 9288 PyObject* out; 9289 9290 if (maxcount < 0) 9291 maxcount = PY_SSIZE_T_MAX; 9292 9293 if (PyUnicode_READY(self) == -1) 9294 return NULL; 9295 9296 if (substring == NULL) 9297 switch(PyUnicode_KIND(self)) { 9298 case PyUnicode_1BYTE_KIND: 9299 if (PyUnicode_IS_ASCII(self)) 9300 return 
asciilib_split_whitespace( 9301 (PyObject*) self, PyUnicode_1BYTE_DATA(self), 9302 PyUnicode_GET_LENGTH(self), maxcount 9303 ); 9304 else 9305 return ucs1lib_split_whitespace( 9306 (PyObject*) self, PyUnicode_1BYTE_DATA(self), 9307 PyUnicode_GET_LENGTH(self), maxcount 9308 ); 9309 case PyUnicode_2BYTE_KIND: 9310 return ucs2lib_split_whitespace( 9311 (PyObject*) self, PyUnicode_2BYTE_DATA(self), 9312 PyUnicode_GET_LENGTH(self), maxcount 9313 ); 9314 case PyUnicode_4BYTE_KIND: 9315 return ucs4lib_split_whitespace( 9316 (PyObject*) self, PyUnicode_4BYTE_DATA(self), 9317 PyUnicode_GET_LENGTH(self), maxcount 9318 ); 9319 default: 9320 assert(0); 9321 return NULL; 9322 } 9323 9324 if (PyUnicode_READY(substring) == -1) 9325 return NULL; 9326 9327 kind1 = PyUnicode_KIND(self); 9328 kind2 = PyUnicode_KIND(substring); 9329 kind = kind1 > kind2 ? kind1 : kind2; 9330 buf1 = PyUnicode_DATA(self); 9331 buf2 = PyUnicode_DATA(substring); 9332 if (kind1 != kind) 9333 buf1 = _PyUnicode_AsKind((PyObject*)self, kind); 9334 if (!buf1) 9335 return NULL; 9336 if (kind2 != kind) 9337 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind); 9338 if (!buf2) { 9339 if (kind1 != kind) PyMem_Free(buf1); 9340 return NULL; 9341 } 9342 len1 = PyUnicode_GET_LENGTH(self); 9343 len2 = PyUnicode_GET_LENGTH(substring); 9344 9345 switch(kind) { 9346 case PyUnicode_1BYTE_KIND: 9347 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring)) 9348 out = asciilib_split( 9349 (PyObject*) self, buf1, len1, buf2, len2, maxcount); 9350 else 9351 out = ucs1lib_split( 9352 (PyObject*) self, buf1, len1, buf2, len2, maxcount); 9353 break; 9354 case PyUnicode_2BYTE_KIND: 9355 out = ucs2lib_split( 9356 (PyObject*) self, buf1, len1, buf2, len2, maxcount); 9357 break; 9358 case PyUnicode_4BYTE_KIND: 9359 out = ucs4lib_split( 9360 (PyObject*) self, buf1, len1, buf2, len2, maxcount); 9361 break; 9362 default: 9363 out = NULL; 9364 } 9365 if (kind1 != kind) 9366 PyMem_Free(buf1); 9367 if (kind2 != kind) 9368 
PyMem_Free(buf2); 9369 return out; 9370} 9371 9372static PyObject * 9373rsplit(PyObject *self, 9374 PyObject *substring, 9375 Py_ssize_t maxcount) 9376{ 9377 int kind1, kind2, kind; 9378 void *buf1, *buf2; 9379 Py_ssize_t len1, len2; 9380 PyObject* out; 9381 9382 if (maxcount < 0) 9383 maxcount = PY_SSIZE_T_MAX; 9384 9385 if (PyUnicode_READY(self) == -1) 9386 return NULL; 9387 9388 if (substring == NULL) 9389 switch(PyUnicode_KIND(self)) { 9390 case PyUnicode_1BYTE_KIND: 9391 if (PyUnicode_IS_ASCII(self)) 9392 return asciilib_rsplit_whitespace( 9393 (PyObject*) self, PyUnicode_1BYTE_DATA(self), 9394 PyUnicode_GET_LENGTH(self), maxcount 9395 ); 9396 else 9397 return ucs1lib_rsplit_whitespace( 9398 (PyObject*) self, PyUnicode_1BYTE_DATA(self), 9399 PyUnicode_GET_LENGTH(self), maxcount 9400 ); 9401 case PyUnicode_2BYTE_KIND: 9402 return ucs2lib_rsplit_whitespace( 9403 (PyObject*) self, PyUnicode_2BYTE_DATA(self), 9404 PyUnicode_GET_LENGTH(self), maxcount 9405 ); 9406 case PyUnicode_4BYTE_KIND: 9407 return ucs4lib_rsplit_whitespace( 9408 (PyObject*) self, PyUnicode_4BYTE_DATA(self), 9409 PyUnicode_GET_LENGTH(self), maxcount 9410 ); 9411 default: 9412 assert(0); 9413 return NULL; 9414 } 9415 9416 if (PyUnicode_READY(substring) == -1) 9417 return NULL; 9418 9419 kind1 = PyUnicode_KIND(self); 9420 kind2 = PyUnicode_KIND(substring); 9421 kind = kind1 > kind2 ? 
kind1 : kind2; 9422 buf1 = PyUnicode_DATA(self); 9423 buf2 = PyUnicode_DATA(substring); 9424 if (kind1 != kind) 9425 buf1 = _PyUnicode_AsKind((PyObject*)self, kind); 9426 if (!buf1) 9427 return NULL; 9428 if (kind2 != kind) 9429 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind); 9430 if (!buf2) { 9431 if (kind1 != kind) PyMem_Free(buf1); 9432 return NULL; 9433 } 9434 len1 = PyUnicode_GET_LENGTH(self); 9435 len2 = PyUnicode_GET_LENGTH(substring); 9436 9437 switch(kind) { 9438 case PyUnicode_1BYTE_KIND: 9439 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring)) 9440 out = asciilib_rsplit( 9441 (PyObject*) self, buf1, len1, buf2, len2, maxcount); 9442 else 9443 out = ucs1lib_rsplit( 9444 (PyObject*) self, buf1, len1, buf2, len2, maxcount); 9445 break; 9446 case PyUnicode_2BYTE_KIND: 9447 out = ucs2lib_rsplit( 9448 (PyObject*) self, buf1, len1, buf2, len2, maxcount); 9449 break; 9450 case PyUnicode_4BYTE_KIND: 9451 out = ucs4lib_rsplit( 9452 (PyObject*) self, buf1, len1, buf2, len2, maxcount); 9453 break; 9454 default: 9455 out = NULL; 9456 } 9457 if (kind1 != kind) 9458 PyMem_Free(buf1); 9459 if (kind2 != kind) 9460 PyMem_Free(buf2); 9461 return out; 9462} 9463 9464static Py_ssize_t 9465anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1, 9466 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset) 9467{ 9468 switch(kind) { 9469 case PyUnicode_1BYTE_KIND: 9470 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2)) 9471 return asciilib_find(buf1, len1, buf2, len2, offset); 9472 else 9473 return ucs1lib_find(buf1, len1, buf2, len2, offset); 9474 case PyUnicode_2BYTE_KIND: 9475 return ucs2lib_find(buf1, len1, buf2, len2, offset); 9476 case PyUnicode_4BYTE_KIND: 9477 return ucs4lib_find(buf1, len1, buf2, len2, offset); 9478 } 9479 assert(0); 9480 return -1; 9481} 9482 9483static Py_ssize_t 9484anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen, 9485 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount) 9486{ 
9487 switch(kind) { 9488 case PyUnicode_1BYTE_KIND: 9489 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1)) 9490 return asciilib_count(sbuf, slen, buf1, len1, maxcount); 9491 else 9492 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount); 9493 case PyUnicode_2BYTE_KIND: 9494 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount); 9495 case PyUnicode_4BYTE_KIND: 9496 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount); 9497 } 9498 assert(0); 9499 return 0; 9500} 9501 9502static PyObject * 9503replace(PyObject *self, PyObject *str1, 9504 PyObject *str2, Py_ssize_t maxcount) 9505{ 9506 PyObject *u; 9507 char *sbuf = PyUnicode_DATA(self); 9508 char *buf1 = PyUnicode_DATA(str1); 9509 char *buf2 = PyUnicode_DATA(str2); 9510 int srelease = 0, release1 = 0, release2 = 0; 9511 int skind = PyUnicode_KIND(self); 9512 int kind1 = PyUnicode_KIND(str1); 9513 int kind2 = PyUnicode_KIND(str2); 9514 Py_ssize_t slen = PyUnicode_GET_LENGTH(self); 9515 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1); 9516 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2); 9517 9518 if (maxcount < 0) 9519 maxcount = PY_SSIZE_T_MAX; 9520 else if (maxcount == 0 || slen == 0) 9521 goto nothing; 9522 9523 if (skind < kind1) 9524 /* substring too wide to be present */ 9525 goto nothing; 9526 9527 if (len1 == len2) { 9528 Py_ssize_t i; 9529 /* same length */ 9530 if (len1 == 0) 9531 goto nothing; 9532 if (len1 == 1) { 9533 /* replace characters */ 9534 Py_UCS4 u1, u2, maxchar; 9535 int mayshrink, rkind; 9536 u1 = PyUnicode_READ_CHAR(str1, 0); 9537 if (!findchar(sbuf, PyUnicode_KIND(self), 9538 slen, u1, 1)) 9539 goto nothing; 9540 u2 = PyUnicode_READ_CHAR(str2, 0); 9541 maxchar = PyUnicode_MAX_CHAR_VALUE(self); 9542 /* Replacing u1 with u2 may cause a maxchar reduction in the 9543 result string. 
*/ 9544 if (u2 > maxchar) { 9545 maxchar = u2; 9546 mayshrink = 0; 9547 } 9548 else 9549 mayshrink = maxchar > 127; 9550 u = PyUnicode_New(slen, maxchar); 9551 if (!u) 9552 goto error; 9553 copy_characters(u, 0, self, 0, slen); 9554 rkind = PyUnicode_KIND(u); 9555 for (i = 0; i < PyUnicode_GET_LENGTH(u); i++) 9556 if (PyUnicode_READ(rkind, PyUnicode_DATA(u), i) == u1) { 9557 if (--maxcount < 0) 9558 break; 9559 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), i, u2); 9560 } 9561 if (mayshrink) { 9562 PyObject *tmp = u; 9563 u = PyUnicode_FromKindAndData(rkind, PyUnicode_DATA(tmp), 9564 PyUnicode_GET_LENGTH(tmp)); 9565 Py_DECREF(tmp); 9566 } 9567 } else { 9568 int rkind = skind; 9569 char *res; 9570 if (kind1 < rkind) { 9571 /* widen substring */ 9572 buf1 = _PyUnicode_AsKind(str1, rkind); 9573 if (!buf1) goto error; 9574 release1 = 1; 9575 } 9576 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0); 9577 if (i < 0) 9578 goto nothing; 9579 if (rkind > kind2) { 9580 /* widen replacement */ 9581 buf2 = _PyUnicode_AsKind(str2, rkind); 9582 if (!buf2) goto error; 9583 release2 = 1; 9584 } 9585 else if (rkind < kind2) { 9586 /* widen self and buf1 */ 9587 rkind = kind2; 9588 if (release1) PyMem_Free(buf1); 9589 sbuf = _PyUnicode_AsKind(self, rkind); 9590 if (!sbuf) goto error; 9591 srelease = 1; 9592 buf1 = _PyUnicode_AsKind(str1, rkind); 9593 if (!buf1) goto error; 9594 release1 = 1; 9595 } 9596 res = PyMem_Malloc(PyUnicode_KIND_SIZE(rkind, slen)); 9597 if (!res) { 9598 PyErr_NoMemory(); 9599 goto error; 9600 } 9601 memcpy(res, sbuf, PyUnicode_KIND_SIZE(rkind, slen)); 9602 /* change everything in-place, starting with this one */ 9603 memcpy(res + PyUnicode_KIND_SIZE(rkind, i), 9604 buf2, 9605 PyUnicode_KIND_SIZE(rkind, len2)); 9606 i += len1; 9607 9608 while ( --maxcount > 0) { 9609 i = anylib_find(rkind, self, 9610 sbuf+PyUnicode_KIND_SIZE(rkind, i), slen-i, 9611 str1, buf1, len1, i); 9612 if (i == -1) 9613 break; 9614 memcpy(res + PyUnicode_KIND_SIZE(rkind, i), 9615 
buf2, 9616 PyUnicode_KIND_SIZE(rkind, len2)); 9617 i += len1; 9618 } 9619 9620 u = PyUnicode_FromKindAndData(rkind, res, slen); 9621 PyMem_Free(res); 9622 if (!u) goto error; 9623 } 9624 } else { 9625 9626 Py_ssize_t n, i, j, ires; 9627 Py_ssize_t product, new_size; 9628 int rkind = skind; 9629 char *res; 9630 9631 if (kind1 < rkind) { 9632 buf1 = _PyUnicode_AsKind(str1, rkind); 9633 if (!buf1) goto error; 9634 release1 = 1; 9635 } 9636 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount); 9637 if (n == 0) 9638 goto nothing; 9639 if (kind2 < rkind) { 9640 buf2 = _PyUnicode_AsKind(str2, rkind); 9641 if (!buf2) goto error; 9642 release2 = 1; 9643 } 9644 else if (kind2 > rkind) { 9645 rkind = kind2; 9646 sbuf = _PyUnicode_AsKind(self, rkind); 9647 if (!sbuf) goto error; 9648 srelease = 1; 9649 if (release1) PyMem_Free(buf1); 9650 buf1 = _PyUnicode_AsKind(str1, rkind); 9651 if (!buf1) goto error; 9652 release1 = 1; 9653 } 9654 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) - 9655 PyUnicode_GET_LENGTH(str1))); */ 9656 product = n * (len2-len1); 9657 if ((product / (len2-len1)) != n) { 9658 PyErr_SetString(PyExc_OverflowError, 9659 "replace string is too long"); 9660 goto error; 9661 } 9662 new_size = slen + product; 9663 if (new_size < 0 || new_size > (PY_SSIZE_T_MAX >> (rkind-1))) { 9664 PyErr_SetString(PyExc_OverflowError, 9665 "replace string is too long"); 9666 goto error; 9667 } 9668 res = PyMem_Malloc(PyUnicode_KIND_SIZE(rkind, new_size)); 9669 if (!res) 9670 goto error; 9671 ires = i = 0; 9672 if (len1 > 0) { 9673 while (n-- > 0) { 9674 /* look for next match */ 9675 j = anylib_find(rkind, self, 9676 sbuf + PyUnicode_KIND_SIZE(rkind, i), slen-i, 9677 str1, buf1, len1, i); 9678 if (j == -1) 9679 break; 9680 else if (j > i) { 9681 /* copy unchanged part [i:j] */ 9682 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires), 9683 sbuf + PyUnicode_KIND_SIZE(rkind, i), 9684 PyUnicode_KIND_SIZE(rkind, j-i)); 9685 ires += j - i; 9686 } 
9687 /* copy substitution string */ 9688 if (len2 > 0) { 9689 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires), 9690 buf2, 9691 PyUnicode_KIND_SIZE(rkind, len2)); 9692 ires += len2; 9693 } 9694 i = j + len1; 9695 } 9696 if (i < slen) 9697 /* copy tail [i:] */ 9698 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires), 9699 sbuf + PyUnicode_KIND_SIZE(rkind, i), 9700 PyUnicode_KIND_SIZE(rkind, slen-i)); 9701 } else { 9702 /* interleave */ 9703 while (n > 0) { 9704 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires), 9705 buf2, 9706 PyUnicode_KIND_SIZE(rkind, len2)); 9707 ires += len2; 9708 if (--n <= 0) 9709 break; 9710 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires), 9711 sbuf + PyUnicode_KIND_SIZE(rkind, i), 9712 PyUnicode_KIND_SIZE(rkind, 1)); 9713 ires++; 9714 i++; 9715 } 9716 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires), 9717 sbuf + PyUnicode_KIND_SIZE(rkind, i), 9718 PyUnicode_KIND_SIZE(rkind, slen-i)); 9719 } 9720 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(str2)) 9721 u = unicode_fromascii((unsigned char*)res, new_size); 9722 else 9723 u = PyUnicode_FromKindAndData(rkind, res, new_size); 9724 PyMem_Free(res); 9725 } 9726 if (srelease) 9727 PyMem_FREE(sbuf); 9728 if (release1) 9729 PyMem_FREE(buf1); 9730 if (release2) 9731 PyMem_FREE(buf2); 9732 assert(_PyUnicode_CheckConsistency(u, 1)); 9733 return u; 9734 9735 nothing: 9736 /* nothing to replace; return original string (when possible) */ 9737 if (srelease) 9738 PyMem_FREE(sbuf); 9739 if (release1) 9740 PyMem_FREE(buf1); 9741 if (release2) 9742 PyMem_FREE(buf2); 9743 if (PyUnicode_CheckExact(self)) { 9744 Py_INCREF(self); 9745 return (PyObject *) self; 9746 } 9747 return PyUnicode_Copy(self); 9748 error: 9749 if (srelease && sbuf) 9750 PyMem_FREE(sbuf); 9751 if (release1 && buf1) 9752 PyMem_FREE(buf1); 9753 if (release2 && buf2) 9754 PyMem_FREE(buf2); 9755 return NULL; 9756} 9757 9758/* --- Unicode Object Methods --------------------------------------------- */ 9759 9760PyDoc_STRVAR(title__doc__, 9761 "S.title() -> 
str\n\ 9762\n\ 9763Return a titlecased version of S, i.e. words start with title case\n\ 9764characters, all remaining cased characters have lower case."); 9765 9766static PyObject* 9767unicode_title(PyObject *self) 9768{ 9769 return fixup(self, fixtitle); 9770} 9771 9772PyDoc_STRVAR(capitalize__doc__, 9773 "S.capitalize() -> str\n\ 9774\n\ 9775Return a capitalized version of S, i.e. make the first character\n\ 9776have upper case and the rest lower case."); 9777 9778static PyObject* 9779unicode_capitalize(PyObject *self) 9780{ 9781 return fixup(self, fixcapitalize); 9782} 9783 9784#if 0 9785PyDoc_STRVAR(capwords__doc__, 9786 "S.capwords() -> str\n\ 9787\n\ 9788Apply .capitalize() to all words in S and return the result with\n\ 9789normalized whitespace (all whitespace strings are replaced by ' ')."); 9790 9791static PyObject* 9792unicode_capwords(PyUnicodeObject *self) 9793{ 9794 PyObject *list; 9795 PyObject *item; 9796 Py_ssize_t i; 9797 9798 /* Split into words */ 9799 list = split(self, NULL, -1); 9800 if (!list) 9801 return NULL; 9802 9803 /* Capitalize each word */ 9804 for (i = 0; i < PyList_GET_SIZE(list); i++) { 9805 item = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, i), 9806 fixcapitalize); 9807 if (item == NULL) 9808 goto onError; 9809 Py_DECREF(PyList_GET_ITEM(list, i)); 9810 PyList_SET_ITEM(list, i, item); 9811 } 9812 9813 /* Join the words to form a new string */ 9814 item = PyUnicode_Join(NULL, list); 9815 9816 onError: 9817 Py_DECREF(list); 9818 return (PyObject *)item; 9819} 9820#endif 9821 9822/* Argument converter. 
Coerces to a single unicode character */ 9823 9824static int 9825convert_uc(PyObject *obj, void *addr) 9826{ 9827 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr; 9828 PyObject *uniobj; 9829 9830 uniobj = PyUnicode_FromObject(obj); 9831 if (uniobj == NULL) { 9832 PyErr_SetString(PyExc_TypeError, 9833 "The fill character cannot be converted to Unicode"); 9834 return 0; 9835 } 9836 if (PyUnicode_GET_LENGTH(uniobj) != 1) { 9837 PyErr_SetString(PyExc_TypeError, 9838 "The fill character must be exactly one character long"); 9839 Py_DECREF(uniobj); 9840 return 0; 9841 } 9842 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0); 9843 Py_DECREF(uniobj); 9844 return 1; 9845} 9846 9847PyDoc_STRVAR(center__doc__, 9848 "S.center(width[, fillchar]) -> str\n\ 9849\n\ 9850Return S centered in a string of length width. Padding is\n\ 9851done using the specified fill character (default is a space)"); 9852 9853static PyObject * 9854unicode_center(PyObject *self, PyObject *args) 9855{ 9856 Py_ssize_t marg, left; 9857 Py_ssize_t width; 9858 Py_UCS4 fillchar = ' '; 9859 9860 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar)) 9861 return NULL; 9862 9863 if (PyUnicode_READY(self) == -1) 9864 return NULL; 9865 9866 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) { 9867 Py_INCREF(self); 9868 return (PyObject*) self; 9869 } 9870 9871 marg = width - _PyUnicode_LENGTH(self); 9872 left = marg / 2 + (marg & width & 1); 9873 9874 return pad(self, left, marg - left, fillchar); 9875} 9876 9877#if 0 9878 9879/* This code should go into some future Unicode collation support 9880 module. The basic comparison should compare ordinals on a naive 9881 basis (this is what Java does and thus Jython too). 
*/ 9882 9883/* speedy UTF-16 code point order comparison */ 9884/* gleaned from: */ 9885/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */ 9886 9887static short utf16Fixup[32] = 9888{ 9889 0, 0, 0, 0, 0, 0, 0, 0, 9890 0, 0, 0, 0, 0, 0, 0, 0, 9891 0, 0, 0, 0, 0, 0, 0, 0, 9892 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800 9893}; 9894 9895static int 9896unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2) 9897{ 9898 Py_ssize_t len1, len2; 9899 9900 Py_UNICODE *s1 = str1->str; 9901 Py_UNICODE *s2 = str2->str; 9902 9903 len1 = str1->_base._base.length; 9904 len2 = str2->_base._base.length; 9905 9906 while (len1 > 0 && len2 > 0) { 9907 Py_UNICODE c1, c2; 9908 9909 c1 = *s1++; 9910 c2 = *s2++; 9911 9912 if (c1 > (1<<11) * 26) 9913 c1 += utf16Fixup[c1>>11]; 9914 if (c2 > (1<<11) * 26) 9915 c2 += utf16Fixup[c2>>11]; 9916 /* now c1 and c2 are in UTF-32-compatible order */ 9917 9918 if (c1 != c2) 9919 return (c1 < c2) ? -1 : 1; 9920 9921 len1--; len2--; 9922 } 9923 9924 return (len1 < len2) ? -1 : (len1 != len2); 9925} 9926 9927#else 9928 9929/* This function assumes that str1 and str2 are readied by the caller. */ 9930 9931static int 9932unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2) 9933{ 9934 int kind1, kind2; 9935 void *data1, *data2; 9936 Py_ssize_t len1, len2, i; 9937 9938 kind1 = PyUnicode_KIND(str1); 9939 kind2 = PyUnicode_KIND(str2); 9940 data1 = PyUnicode_DATA(str1); 9941 data2 = PyUnicode_DATA(str2); 9942 len1 = PyUnicode_GET_LENGTH(str1); 9943 len2 = PyUnicode_GET_LENGTH(str2); 9944 9945 for (i = 0; i < len1 && i < len2; ++i) { 9946 Py_UCS4 c1, c2; 9947 c1 = PyUnicode_READ(kind1, data1, i); 9948 c2 = PyUnicode_READ(kind2, data2, i); 9949 9950 if (c1 != c2) 9951 return (c1 < c2) ? -1 : 1; 9952 } 9953 9954 return (len1 < len2) ? 
-1 : (len1 != len2); 9955} 9956 9957#endif 9958 9959int 9960PyUnicode_Compare(PyObject *left, PyObject *right) 9961{ 9962 if (PyUnicode_Check(left) && PyUnicode_Check(right)) { 9963 if (PyUnicode_READY(left) == -1 || 9964 PyUnicode_READY(right) == -1) 9965 return -1; 9966 return unicode_compare((PyUnicodeObject *)left, 9967 (PyUnicodeObject *)right); 9968 } 9969 PyErr_Format(PyExc_TypeError, 9970 "Can't compare %.100s and %.100s", 9971 left->ob_type->tp_name, 9972 right->ob_type->tp_name); 9973 return -1; 9974} 9975 9976int 9977PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str) 9978{ 9979 Py_ssize_t i; 9980 int kind; 9981 void *data; 9982 Py_UCS4 chr; 9983 9984 assert(_PyUnicode_CHECK(uni)); 9985 if (PyUnicode_READY(uni) == -1) 9986 return -1; 9987 kind = PyUnicode_KIND(uni); 9988 data = PyUnicode_DATA(uni); 9989 /* Compare Unicode string and source character set string */ 9990 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++) 9991 if (chr != str[i]) 9992 return (chr < (unsigned char)(str[i])) ? -1 : 1; 9993 /* This check keeps Python strings that end in '\0' from comparing equal 9994 to C strings identical up to that point. */ 9995 if (PyUnicode_GET_LENGTH(uni) != i || chr) 9996 return 1; /* uni is longer */ 9997 if (str[i]) 9998 return -1; /* str is longer */ 9999 return 0; 10000} 10001 10002 10003#define TEST_COND(cond) \ 10004 ((cond) ? 
Py_True : Py_False) 10005 10006PyObject * 10007PyUnicode_RichCompare(PyObject *left, PyObject *right, int op) 10008{ 10009 int result; 10010 10011 if (PyUnicode_Check(left) && PyUnicode_Check(right)) { 10012 PyObject *v; 10013 if (PyUnicode_READY(left) == -1 || 10014 PyUnicode_READY(right) == -1) 10015 return NULL; 10016 if (PyUnicode_GET_LENGTH(left) != PyUnicode_GET_LENGTH(right) || 10017 PyUnicode_KIND(left) != PyUnicode_KIND(right)) { 10018 if (op == Py_EQ) { 10019 Py_INCREF(Py_False); 10020 return Py_False; 10021 } 10022 if (op == Py_NE) { 10023 Py_INCREF(Py_True); 10024 return Py_True; 10025 } 10026 } 10027 if (left == right) 10028 result = 0; 10029 else 10030 result = unicode_compare((PyUnicodeObject *)left, 10031 (PyUnicodeObject *)right); 10032 10033 /* Convert the return value to a Boolean */ 10034 switch (op) { 10035 case Py_EQ: 10036 v = TEST_COND(result == 0); 10037 break; 10038 case Py_NE: 10039 v = TEST_COND(result != 0); 10040 break; 10041 case Py_LE: 10042 v = TEST_COND(result <= 0); 10043 break; 10044 case Py_GE: 10045 v = TEST_COND(result >= 0); 10046 break; 10047 case Py_LT: 10048 v = TEST_COND(result == -1); 10049 break; 10050 case Py_GT: 10051 v = TEST_COND(result == 1); 10052 break; 10053 default: 10054 PyErr_BadArgument(); 10055 return NULL; 10056 } 10057 Py_INCREF(v); 10058 return v; 10059 } 10060 10061 Py_RETURN_NOTIMPLEMENTED; 10062} 10063 10064int 10065PyUnicode_Contains(PyObject *container, PyObject *element) 10066{ 10067 PyObject *str, *sub; 10068 int kind1, kind2, kind; 10069 void *buf1, *buf2; 10070 Py_ssize_t len1, len2; 10071 int result; 10072 10073 /* Coerce the two arguments */ 10074 sub = PyUnicode_FromObject(element); 10075 if (!sub) { 10076 PyErr_Format(PyExc_TypeError, 10077 "'in <string>' requires string as left operand, not %s", 10078 element->ob_type->tp_name); 10079 return -1; 10080 } 10081 if (PyUnicode_READY(sub) == -1) 10082 return -1; 10083 10084 str = PyUnicode_FromObject(container); 10085 if (!str || 
PyUnicode_READY(str) == -1) { 10086 Py_DECREF(sub); 10087 return -1; 10088 } 10089 10090 kind1 = PyUnicode_KIND(str); 10091 kind2 = PyUnicode_KIND(sub); 10092 kind = kind1 > kind2 ? kind1 : kind2; 10093 buf1 = PyUnicode_DATA(str); 10094 buf2 = PyUnicode_DATA(sub); 10095 if (kind1 != kind) 10096 buf1 = _PyUnicode_AsKind((PyObject*)str, kind); 10097 if (!buf1) { 10098 Py_DECREF(sub); 10099 return -1; 10100 } 10101 if (kind2 != kind) 10102 buf2 = _PyUnicode_AsKind((PyObject*)sub, kind); 10103 if (!buf2) { 10104 Py_DECREF(sub); 10105 if (kind1 != kind) PyMem_Free(buf1); 10106 return -1; 10107 } 10108 len1 = PyUnicode_GET_LENGTH(str); 10109 len2 = PyUnicode_GET_LENGTH(sub); 10110 10111 switch(kind) { 10112 case PyUnicode_1BYTE_KIND: 10113 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1; 10114 break; 10115 case PyUnicode_2BYTE_KIND: 10116 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1; 10117 break; 10118 case PyUnicode_4BYTE_KIND: 10119 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1; 10120 break; 10121 default: 10122 result = -1; 10123 assert(0); 10124 } 10125 10126 Py_DECREF(str); 10127 Py_DECREF(sub); 10128 10129 if (kind1 != kind) 10130 PyMem_Free(buf1); 10131 if (kind2 != kind) 10132 PyMem_Free(buf2); 10133 10134 return result; 10135} 10136 10137/* Concat to string or Unicode object giving a new Unicode object. 
*/ 10138 10139PyObject * 10140PyUnicode_Concat(PyObject *left, PyObject *right) 10141{ 10142 PyObject *u = NULL, *v = NULL, *w; 10143 Py_UCS4 maxchar; 10144 10145 /* Coerce the two arguments */ 10146 u = PyUnicode_FromObject(left); 10147 if (u == NULL) 10148 goto onError; 10149 v = PyUnicode_FromObject(right); 10150 if (v == NULL) 10151 goto onError; 10152 10153 /* Shortcuts */ 10154 if (v == unicode_empty) { 10155 Py_DECREF(v); 10156 return u; 10157 } 10158 if (u == unicode_empty) { 10159 Py_DECREF(u); 10160 return v; 10161 } 10162 10163 maxchar = PyUnicode_MAX_CHAR_VALUE(u); 10164 maxchar = Py_MAX(maxchar, PyUnicode_MAX_CHAR_VALUE(v)); 10165 10166 /* Concat the two Unicode strings */ 10167 w = PyUnicode_New( 10168 PyUnicode_GET_LENGTH(u) + PyUnicode_GET_LENGTH(v), 10169 maxchar); 10170 if (w == NULL) 10171 goto onError; 10172 copy_characters(w, 0, u, 0, PyUnicode_GET_LENGTH(u)); 10173 copy_characters(w, PyUnicode_GET_LENGTH(u), v, 0, PyUnicode_GET_LENGTH(v)); 10174 Py_DECREF(u); 10175 Py_DECREF(v); 10176 assert(_PyUnicode_CheckConsistency(w, 1)); 10177 return w; 10178 10179 onError: 10180 Py_XDECREF(u); 10181 Py_XDECREF(v); 10182 return NULL; 10183} 10184 10185static void 10186unicode_append_inplace(PyObject **p_left, PyObject *right) 10187{ 10188 Py_ssize_t left_len, right_len, new_len; 10189 10190 assert(PyUnicode_IS_READY(*p_left)); 10191 assert(PyUnicode_IS_READY(right)); 10192 10193 left_len = PyUnicode_GET_LENGTH(*p_left); 10194 right_len = PyUnicode_GET_LENGTH(right); 10195 if (left_len > PY_SSIZE_T_MAX - right_len) { 10196 PyErr_SetString(PyExc_OverflowError, 10197 "strings are too large to concat"); 10198 goto error; 10199 } 10200 new_len = left_len + right_len; 10201 10202 /* Now we own the last reference to 'left', so we can resize it 10203 * in-place. 10204 */ 10205 if (unicode_resize(p_left, new_len) != 0) { 10206 /* XXX if _PyUnicode_Resize() fails, 'left' has been 10207 * deallocated so it cannot be put back into 10208 * 'variable'. 
The MemoryError is raised when there 10209 * is no value in 'variable', which might (very 10210 * remotely) be a cause of incompatibilities. 10211 */ 10212 goto error; 10213 } 10214 /* copy 'right' into the newly allocated area of 'left' */ 10215 copy_characters(*p_left, left_len, right, 0, right_len); 10216 _PyUnicode_DIRTY(*p_left); 10217 return; 10218 10219error: 10220 Py_DECREF(*p_left); 10221 *p_left = NULL; 10222} 10223 10224void 10225PyUnicode_Append(PyObject **p_left, PyObject *right) 10226{ 10227 PyObject *left, *res; 10228 10229 if (p_left == NULL) { 10230 if (!PyErr_Occurred()) 10231 PyErr_BadInternalCall(); 10232 return; 10233 } 10234 left = *p_left; 10235 if (right == NULL || !PyUnicode_Check(left)) { 10236 if (!PyErr_Occurred()) 10237 PyErr_BadInternalCall(); 10238 goto error; 10239 } 10240 10241 if (PyUnicode_READY(left)) 10242 goto error; 10243 if (PyUnicode_READY(right)) 10244 goto error; 10245 10246 if (PyUnicode_CheckExact(left) && left != unicode_empty 10247 && PyUnicode_CheckExact(right) && right != unicode_empty 10248 && unicode_resizable(left) 10249 && (_PyUnicode_KIND(right) <= _PyUnicode_KIND(left) 10250 || _PyUnicode_WSTR(left) != NULL)) 10251 { 10252 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires 10253 to change the structure size, but characters are stored just after 10254 the structure, and so it requires to move all characters which is 10255 not so different than duplicating the string. 
*/ 10256 if (!(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right))) 10257 { 10258 unicode_append_inplace(p_left, right); 10259 if (p_left != NULL) 10260 assert(_PyUnicode_CheckConsistency(*p_left, 1)); 10261 return; 10262 } 10263 } 10264 10265 res = PyUnicode_Concat(left, right); 10266 if (res == NULL) 10267 goto error; 10268 Py_DECREF(left); 10269 *p_left = res; 10270 return; 10271 10272error: 10273 Py_DECREF(*p_left); 10274 *p_left = NULL; 10275} 10276 10277void 10278PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right) 10279{ 10280 PyUnicode_Append(pleft, right); 10281 Py_XDECREF(right); 10282} 10283 10284PyDoc_STRVAR(count__doc__, 10285 "S.count(sub[, start[, end]]) -> int\n\ 10286\n\ 10287Return the number of non-overlapping occurrences of substring sub in\n\ 10288string S[start:end]. Optional arguments start and end are\n\ 10289interpreted as in slice notation."); 10290 10291static PyObject * 10292unicode_count(PyUnicodeObject *self, PyObject *args) 10293{ 10294 PyUnicodeObject *substring; 10295 Py_ssize_t start = 0; 10296 Py_ssize_t end = PY_SSIZE_T_MAX; 10297 PyObject *result; 10298 int kind1, kind2, kind; 10299 void *buf1, *buf2; 10300 Py_ssize_t len1, len2, iresult; 10301 10302 if (!stringlib_parse_args_finds_unicode("count", args, &substring, 10303 &start, &end)) 10304 return NULL; 10305 10306 kind1 = PyUnicode_KIND(self); 10307 kind2 = PyUnicode_KIND(substring); 10308 kind = kind1 > kind2 ? 
kind1 : kind2; 10309 buf1 = PyUnicode_DATA(self); 10310 buf2 = PyUnicode_DATA(substring); 10311 if (kind1 != kind) 10312 buf1 = _PyUnicode_AsKind((PyObject*)self, kind); 10313 if (!buf1) { 10314 Py_DECREF(substring); 10315 return NULL; 10316 } 10317 if (kind2 != kind) 10318 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind); 10319 if (!buf2) { 10320 Py_DECREF(substring); 10321 if (kind1 != kind) PyMem_Free(buf1); 10322 return NULL; 10323 } 10324 len1 = PyUnicode_GET_LENGTH(self); 10325 len2 = PyUnicode_GET_LENGTH(substring); 10326 10327 ADJUST_INDICES(start, end, len1); 10328 switch(kind) { 10329 case PyUnicode_1BYTE_KIND: 10330 iresult = ucs1lib_count( 10331 ((Py_UCS1*)buf1) + start, end - start, 10332 buf2, len2, PY_SSIZE_T_MAX 10333 ); 10334 break; 10335 case PyUnicode_2BYTE_KIND: 10336 iresult = ucs2lib_count( 10337 ((Py_UCS2*)buf1) + start, end - start, 10338 buf2, len2, PY_SSIZE_T_MAX 10339 ); 10340 break; 10341 case PyUnicode_4BYTE_KIND: 10342 iresult = ucs4lib_count( 10343 ((Py_UCS4*)buf1) + start, end - start, 10344 buf2, len2, PY_SSIZE_T_MAX 10345 ); 10346 break; 10347 default: 10348 assert(0); iresult = 0; 10349 } 10350 10351 result = PyLong_FromSsize_t(iresult); 10352 10353 if (kind1 != kind) 10354 PyMem_Free(buf1); 10355 if (kind2 != kind) 10356 PyMem_Free(buf2); 10357 10358 Py_DECREF(substring); 10359 10360 return result; 10361} 10362 10363PyDoc_STRVAR(encode__doc__, 10364 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\ 10365\n\ 10366Encode S using the codec registered for encoding. Default encoding\n\ 10367is 'utf-8'. errors may be given to set a different error\n\ 10368handling scheme. Default is 'strict' meaning that encoding errors raise\n\ 10369a UnicodeEncodeError. 
Other possible values are 'ignore', 'replace' and\n\ 10370'xmlcharrefreplace' as well as any other name registered with\n\ 10371codecs.register_error that can handle UnicodeEncodeErrors."); 10372 10373static PyObject * 10374unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs) 10375{ 10376 static char *kwlist[] = {"encoding", "errors", 0}; 10377 char *encoding = NULL; 10378 char *errors = NULL; 10379 10380 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode", 10381 kwlist, &encoding, &errors)) 10382 return NULL; 10383 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors); 10384} 10385 10386PyDoc_STRVAR(expandtabs__doc__, 10387 "S.expandtabs([tabsize]) -> str\n\ 10388\n\ 10389Return a copy of S where all tab characters are expanded using spaces.\n\ 10390If tabsize is not given, a tab size of 8 characters is assumed."); 10391 10392static PyObject* 10393unicode_expandtabs(PyUnicodeObject *self, PyObject *args) 10394{ 10395 Py_ssize_t i, j, line_pos, src_len, incr; 10396 Py_UCS4 ch; 10397 PyObject *u; 10398 void *src_data, *dest_data; 10399 int tabsize = 8; 10400 int kind; 10401 int found; 10402 10403 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize)) 10404 return NULL; 10405 10406 if (PyUnicode_READY(self) == -1) 10407 return NULL; 10408 10409 /* First pass: determine size of output string */ 10410 src_len = PyUnicode_GET_LENGTH(self); 10411 i = j = line_pos = 0; 10412 kind = PyUnicode_KIND(self); 10413 src_data = PyUnicode_DATA(self); 10414 found = 0; 10415 for (; i < src_len; i++) { 10416 ch = PyUnicode_READ(kind, src_data, i); 10417 if (ch == '\t') { 10418 found = 1; 10419 if (tabsize > 0) { 10420 incr = tabsize - (line_pos % tabsize); /* cannot overflow */ 10421 if (j > PY_SSIZE_T_MAX - incr) 10422 goto overflow; 10423 line_pos += incr; 10424 j += incr; 10425 } 10426 } 10427 else { 10428 if (j > PY_SSIZE_T_MAX - 1) 10429 goto overflow; 10430 line_pos++; 10431 j++; 10432 if (ch == '\n' || ch == '\r') 10433 line_pos = 0; 
10434 } 10435 } 10436 if (!found && PyUnicode_CheckExact(self)) { 10437 Py_INCREF((PyObject *) self); 10438 return (PyObject *) self; 10439 } 10440 10441 /* Second pass: create output string and fill it */ 10442 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self)); 10443 if (!u) 10444 return NULL; 10445 dest_data = PyUnicode_DATA(u); 10446 10447 i = j = line_pos = 0; 10448 10449 for (; i < src_len; i++) { 10450 ch = PyUnicode_READ(kind, src_data, i); 10451 if (ch == '\t') { 10452 if (tabsize > 0) { 10453 incr = tabsize - (line_pos % tabsize); 10454 line_pos += incr; 10455 while (incr--) { 10456 PyUnicode_WRITE(kind, dest_data, j, ' '); 10457 j++; 10458 } 10459 } 10460 } 10461 else { 10462 line_pos++; 10463 PyUnicode_WRITE(kind, dest_data, j, ch); 10464 j++; 10465 if (ch == '\n' || ch == '\r') 10466 line_pos = 0; 10467 } 10468 } 10469 assert (j == PyUnicode_GET_LENGTH(u)); 10470#ifndef DONT_MAKE_RESULT_READY 10471 if (_PyUnicode_READY_REPLACE(&u)) { 10472 Py_DECREF(u); 10473 return NULL; 10474 } 10475#endif 10476 assert(_PyUnicode_CheckConsistency(u, 1)); 10477 return (PyObject*) u; 10478 10479 overflow: 10480 PyErr_SetString(PyExc_OverflowError, "new string is too long"); 10481 return NULL; 10482} 10483 10484PyDoc_STRVAR(find__doc__, 10485 "S.find(sub[, start[, end]]) -> int\n\ 10486\n\ 10487Return the lowest index in S where substring sub is found,\n\ 10488such that sub is contained within S[start:end]. 
Optional\n\ 10489arguments start and end are interpreted as in slice notation.\n\ 10490\n\ 10491Return -1 on failure."); 10492 10493static PyObject * 10494unicode_find(PyObject *self, PyObject *args) 10495{ 10496 PyUnicodeObject *substring; 10497 Py_ssize_t start; 10498 Py_ssize_t end; 10499 Py_ssize_t result; 10500 10501 if (!stringlib_parse_args_finds_unicode("find", args, &substring, 10502 &start, &end)) 10503 return NULL; 10504 10505 if (PyUnicode_READY(self) == -1) 10506 return NULL; 10507 if (PyUnicode_READY(substring) == -1) 10508 return NULL; 10509 10510 result = any_find_slice( 10511 asciilib_find_slice, ucs1lib_find_slice, 10512 ucs2lib_find_slice, ucs4lib_find_slice, 10513 self, (PyObject*)substring, start, end 10514 ); 10515 10516 Py_DECREF(substring); 10517 10518 if (result == -2) 10519 return NULL; 10520 10521 return PyLong_FromSsize_t(result); 10522} 10523 10524static PyObject * 10525unicode_getitem(PyObject *self, Py_ssize_t index) 10526{ 10527 Py_UCS4 ch = PyUnicode_ReadChar(self, index); 10528 if (ch == (Py_UCS4)-1) 10529 return NULL; 10530 return PyUnicode_FromOrdinal(ch); 10531} 10532 10533/* Believe it or not, this produces the same value for ASCII strings 10534 as bytes_hash(). */ 10535static Py_hash_t 10536unicode_hash(PyUnicodeObject *self) 10537{ 10538 Py_ssize_t len; 10539 Py_uhash_t x; 10540 10541 if (_PyUnicode_HASH(self) != -1) 10542 return _PyUnicode_HASH(self); 10543 if (PyUnicode_READY(self) == -1) 10544 return -1; 10545 len = PyUnicode_GET_LENGTH(self); 10546 10547 /* The hash function as a macro, gets expanded three times below. 
*/ 10548#define HASH(P) \ 10549 x = (Py_uhash_t)*P << 7; \ 10550 while (--len >= 0) \ 10551 x = (1000003*x) ^ (Py_uhash_t)*P++; 10552 10553 switch (PyUnicode_KIND(self)) { 10554 case PyUnicode_1BYTE_KIND: { 10555 const unsigned char *c = PyUnicode_1BYTE_DATA(self); 10556 HASH(c); 10557 break; 10558 } 10559 case PyUnicode_2BYTE_KIND: { 10560 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self); 10561 HASH(s); 10562 break; 10563 } 10564 default: { 10565 Py_UCS4 *l; 10566 assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND && 10567 "Impossible switch case in unicode_hash"); 10568 l = PyUnicode_4BYTE_DATA(self); 10569 HASH(l); 10570 break; 10571 } 10572 } 10573 x ^= (Py_uhash_t)PyUnicode_GET_LENGTH(self); 10574 10575 if (x == -1) 10576 x = -2; 10577 _PyUnicode_HASH(self) = x; 10578 return x; 10579} 10580#undef HASH 10581 10582PyDoc_STRVAR(index__doc__, 10583 "S.index(sub[, start[, end]]) -> int\n\ 10584\n\ 10585Like S.find() but raise ValueError when the substring is not found."); 10586 10587static PyObject * 10588unicode_index(PyObject *self, PyObject *args) 10589{ 10590 Py_ssize_t result; 10591 PyUnicodeObject *substring; 10592 Py_ssize_t start; 10593 Py_ssize_t end; 10594 10595 if (!stringlib_parse_args_finds_unicode("index", args, &substring, 10596 &start, &end)) 10597 return NULL; 10598 10599 if (PyUnicode_READY(self) == -1) 10600 return NULL; 10601 if (PyUnicode_READY(substring) == -1) 10602 return NULL; 10603 10604 result = any_find_slice( 10605 asciilib_find_slice, ucs1lib_find_slice, 10606 ucs2lib_find_slice, ucs4lib_find_slice, 10607 self, (PyObject*)substring, start, end 10608 ); 10609 10610 Py_DECREF(substring); 10611 10612 if (result == -2) 10613 return NULL; 10614 10615 if (result < 0) { 10616 PyErr_SetString(PyExc_ValueError, "substring not found"); 10617 return NULL; 10618 } 10619 10620 return PyLong_FromSsize_t(result); 10621} 10622 10623PyDoc_STRVAR(islower__doc__, 10624 "S.islower() -> bool\n\ 10625\n\ 10626Return True if all cased characters in S are 
lowercase and there is\n\ 10627at least one cased character in S, False otherwise."); 10628 10629static PyObject* 10630unicode_islower(PyUnicodeObject *self) 10631{ 10632 Py_ssize_t i, length; 10633 int kind; 10634 void *data; 10635 int cased; 10636 10637 if (PyUnicode_READY(self) == -1) 10638 return NULL; 10639 length = PyUnicode_GET_LENGTH(self); 10640 kind = PyUnicode_KIND(self); 10641 data = PyUnicode_DATA(self); 10642 10643 /* Shortcut for single character strings */ 10644 if (length == 1) 10645 return PyBool_FromLong( 10646 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0))); 10647 10648 /* Special case for empty strings */ 10649 if (length == 0) 10650 return PyBool_FromLong(0); 10651 10652 cased = 0; 10653 for (i = 0; i < length; i++) { 10654 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); 10655 10656 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) 10657 return PyBool_FromLong(0); 10658 else if (!cased && Py_UNICODE_ISLOWER(ch)) 10659 cased = 1; 10660 } 10661 return PyBool_FromLong(cased); 10662} 10663 10664PyDoc_STRVAR(isupper__doc__, 10665 "S.isupper() -> bool\n\ 10666\n\ 10667Return True if all cased characters in S are uppercase and there is\n\ 10668at least one cased character in S, False otherwise."); 10669 10670static PyObject* 10671unicode_isupper(PyUnicodeObject *self) 10672{ 10673 Py_ssize_t i, length; 10674 int kind; 10675 void *data; 10676 int cased; 10677 10678 if (PyUnicode_READY(self) == -1) 10679 return NULL; 10680 length = PyUnicode_GET_LENGTH(self); 10681 kind = PyUnicode_KIND(self); 10682 data = PyUnicode_DATA(self); 10683 10684 /* Shortcut for single character strings */ 10685 if (length == 1) 10686 return PyBool_FromLong( 10687 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0); 10688 10689 /* Special case for empty strings */ 10690 if (length == 0) 10691 return PyBool_FromLong(0); 10692 10693 cased = 0; 10694 for (i = 0; i < length; i++) { 10695 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); 10696 10697 if 
(Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch)) 10698 return PyBool_FromLong(0); 10699 else if (!cased && Py_UNICODE_ISUPPER(ch)) 10700 cased = 1; 10701 } 10702 return PyBool_FromLong(cased); 10703} 10704 10705PyDoc_STRVAR(istitle__doc__, 10706 "S.istitle() -> bool\n\ 10707\n\ 10708Return True if S is a titlecased string and there is at least one\n\ 10709character in S, i.e. upper- and titlecase characters may only\n\ 10710follow uncased characters and lowercase characters only cased ones.\n\ 10711Return False otherwise."); 10712 10713static PyObject* 10714unicode_istitle(PyUnicodeObject *self) 10715{ 10716 Py_ssize_t i, length; 10717 int kind; 10718 void *data; 10719 int cased, previous_is_cased; 10720 10721 if (PyUnicode_READY(self) == -1) 10722 return NULL; 10723 length = PyUnicode_GET_LENGTH(self); 10724 kind = PyUnicode_KIND(self); 10725 data = PyUnicode_DATA(self); 10726 10727 /* Shortcut for single character strings */ 10728 if (length == 1) { 10729 Py_UCS4 ch = PyUnicode_READ(kind, data, 0); 10730 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) || 10731 (Py_UNICODE_ISUPPER(ch) != 0)); 10732 } 10733 10734 /* Special case for empty strings */ 10735 if (length == 0) 10736 return PyBool_FromLong(0); 10737 10738 cased = 0; 10739 previous_is_cased = 0; 10740 for (i = 0; i < length; i++) { 10741 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); 10742 10743 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) { 10744 if (previous_is_cased) 10745 return PyBool_FromLong(0); 10746 previous_is_cased = 1; 10747 cased = 1; 10748 } 10749 else if (Py_UNICODE_ISLOWER(ch)) { 10750 if (!previous_is_cased) 10751 return PyBool_FromLong(0); 10752 previous_is_cased = 1; 10753 cased = 1; 10754 } 10755 else 10756 previous_is_cased = 0; 10757 } 10758 return PyBool_FromLong(cased); 10759} 10760 10761PyDoc_STRVAR(isspace__doc__, 10762 "S.isspace() -> bool\n\ 10763\n\ 10764Return True if all characters in S are whitespace\n\ 10765and there is at least one character in S, False 
otherwise."); 10766 10767static PyObject* 10768unicode_isspace(PyUnicodeObject *self) 10769{ 10770 Py_ssize_t i, length; 10771 int kind; 10772 void *data; 10773 10774 if (PyUnicode_READY(self) == -1) 10775 return NULL; 10776 length = PyUnicode_GET_LENGTH(self); 10777 kind = PyUnicode_KIND(self); 10778 data = PyUnicode_DATA(self); 10779 10780 /* Shortcut for single character strings */ 10781 if (length == 1) 10782 return PyBool_FromLong( 10783 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0))); 10784 10785 /* Special case for empty strings */ 10786 if (length == 0) 10787 return PyBool_FromLong(0); 10788 10789 for (i = 0; i < length; i++) { 10790 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); 10791 if (!Py_UNICODE_ISSPACE(ch)) 10792 return PyBool_FromLong(0); 10793 } 10794 return PyBool_FromLong(1); 10795} 10796 10797PyDoc_STRVAR(isalpha__doc__, 10798 "S.isalpha() -> bool\n\ 10799\n\ 10800Return True if all characters in S are alphabetic\n\ 10801and there is at least one character in S, False otherwise."); 10802 10803static PyObject* 10804unicode_isalpha(PyUnicodeObject *self) 10805{ 10806 Py_ssize_t i, length; 10807 int kind; 10808 void *data; 10809 10810 if (PyUnicode_READY(self) == -1) 10811 return NULL; 10812 length = PyUnicode_GET_LENGTH(self); 10813 kind = PyUnicode_KIND(self); 10814 data = PyUnicode_DATA(self); 10815 10816 /* Shortcut for single character strings */ 10817 if (length == 1) 10818 return PyBool_FromLong( 10819 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0))); 10820 10821 /* Special case for empty strings */ 10822 if (length == 0) 10823 return PyBool_FromLong(0); 10824 10825 for (i = 0; i < length; i++) { 10826 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i))) 10827 return PyBool_FromLong(0); 10828 } 10829 return PyBool_FromLong(1); 10830} 10831 10832PyDoc_STRVAR(isalnum__doc__, 10833 "S.isalnum() -> bool\n\ 10834\n\ 10835Return True if all characters in S are alphanumeric\n\ 10836and there is at least one character in S, False 
otherwise."); 10837 10838static PyObject* 10839unicode_isalnum(PyUnicodeObject *self) 10840{ 10841 int kind; 10842 void *data; 10843 Py_ssize_t len, i; 10844 10845 if (PyUnicode_READY(self) == -1) 10846 return NULL; 10847 10848 kind = PyUnicode_KIND(self); 10849 data = PyUnicode_DATA(self); 10850 len = PyUnicode_GET_LENGTH(self); 10851 10852 /* Shortcut for single character strings */ 10853 if (len == 1) { 10854 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0); 10855 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch)); 10856 } 10857 10858 /* Special case for empty strings */ 10859 if (len == 0) 10860 return PyBool_FromLong(0); 10861 10862 for (i = 0; i < len; i++) { 10863 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); 10864 if (!Py_UNICODE_ISALNUM(ch)) 10865 return PyBool_FromLong(0); 10866 } 10867 return PyBool_FromLong(1); 10868} 10869 10870PyDoc_STRVAR(isdecimal__doc__, 10871 "S.isdecimal() -> bool\n\ 10872\n\ 10873Return True if there are only decimal characters in S,\n\ 10874False otherwise."); 10875 10876static PyObject* 10877unicode_isdecimal(PyUnicodeObject *self) 10878{ 10879 Py_ssize_t i, length; 10880 int kind; 10881 void *data; 10882 10883 if (PyUnicode_READY(self) == -1) 10884 return NULL; 10885 length = PyUnicode_GET_LENGTH(self); 10886 kind = PyUnicode_KIND(self); 10887 data = PyUnicode_DATA(self); 10888 10889 /* Shortcut for single character strings */ 10890 if (length == 1) 10891 return PyBool_FromLong( 10892 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0))); 10893 10894 /* Special case for empty strings */ 10895 if (length == 0) 10896 return PyBool_FromLong(0); 10897 10898 for (i = 0; i < length; i++) { 10899 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i))) 10900 return PyBool_FromLong(0); 10901 } 10902 return PyBool_FromLong(1); 10903} 10904 10905PyDoc_STRVAR(isdigit__doc__, 10906 "S.isdigit() -> bool\n\ 10907\n\ 10908Return True if all characters in S are digits\n\ 10909and there is at least one character in S, False otherwise."); 
10910 10911static PyObject* 10912unicode_isdigit(PyUnicodeObject *self) 10913{ 10914 Py_ssize_t i, length; 10915 int kind; 10916 void *data; 10917 10918 if (PyUnicode_READY(self) == -1) 10919 return NULL; 10920 length = PyUnicode_GET_LENGTH(self); 10921 kind = PyUnicode_KIND(self); 10922 data = PyUnicode_DATA(self); 10923 10924 /* Shortcut for single character strings */ 10925 if (length == 1) { 10926 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0); 10927 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch)); 10928 } 10929 10930 /* Special case for empty strings */ 10931 if (length == 0) 10932 return PyBool_FromLong(0); 10933 10934 for (i = 0; i < length; i++) { 10935 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i))) 10936 return PyBool_FromLong(0); 10937 } 10938 return PyBool_FromLong(1); 10939} 10940 10941PyDoc_STRVAR(isnumeric__doc__, 10942 "S.isnumeric() -> bool\n\ 10943\n\ 10944Return True if there are only numeric characters in S,\n\ 10945False otherwise."); 10946 10947static PyObject* 10948unicode_isnumeric(PyUnicodeObject *self) 10949{ 10950 Py_ssize_t i, length; 10951 int kind; 10952 void *data; 10953 10954 if (PyUnicode_READY(self) == -1) 10955 return NULL; 10956 length = PyUnicode_GET_LENGTH(self); 10957 kind = PyUnicode_KIND(self); 10958 data = PyUnicode_DATA(self); 10959 10960 /* Shortcut for single character strings */ 10961 if (length == 1) 10962 return PyBool_FromLong( 10963 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0))); 10964 10965 /* Special case for empty strings */ 10966 if (length == 0) 10967 return PyBool_FromLong(0); 10968 10969 for (i = 0; i < length; i++) { 10970 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i))) 10971 return PyBool_FromLong(0); 10972 } 10973 return PyBool_FromLong(1); 10974} 10975 10976int 10977PyUnicode_IsIdentifier(PyObject *self) 10978{ 10979 int kind; 10980 void *data; 10981 Py_ssize_t i; 10982 Py_UCS4 first; 10983 10984 if (PyUnicode_READY(self) == -1) { 10985 Py_FatalError("identifier not ready"); 10986 
return 0; 10987 } 10988 10989 /* Special case for empty strings */ 10990 if (PyUnicode_GET_LENGTH(self) == 0) 10991 return 0; 10992 kind = PyUnicode_KIND(self); 10993 data = PyUnicode_DATA(self); 10994 10995 /* PEP 3131 says that the first character must be in 10996 XID_Start and subsequent characters in XID_Continue, 10997 and for the ASCII range, the 2.x rules apply (i.e 10998 start with letters and underscore, continue with 10999 letters, digits, underscore). However, given the current 11000 definition of XID_Start and XID_Continue, it is sufficient 11001 to check just for these, except that _ must be allowed 11002 as starting an identifier. */ 11003 first = PyUnicode_READ(kind, data, 0); 11004 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */) 11005 return 0; 11006 11007 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++) 11008 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i))) 11009 return 0; 11010 return 1; 11011} 11012 11013PyDoc_STRVAR(isidentifier__doc__, 11014 "S.isidentifier() -> bool\n\ 11015\n\ 11016Return True if S is a valid identifier according\n\ 11017to the language definition."); 11018 11019static PyObject* 11020unicode_isidentifier(PyObject *self) 11021{ 11022 return PyBool_FromLong(PyUnicode_IsIdentifier(self)); 11023} 11024 11025PyDoc_STRVAR(isprintable__doc__, 11026 "S.isprintable() -> bool\n\ 11027\n\ 11028Return True if all characters in S are considered\n\ 11029printable in repr() or S is empty, False otherwise."); 11030 11031static PyObject* 11032unicode_isprintable(PyObject *self) 11033{ 11034 Py_ssize_t i, length; 11035 int kind; 11036 void *data; 11037 11038 if (PyUnicode_READY(self) == -1) 11039 return NULL; 11040 length = PyUnicode_GET_LENGTH(self); 11041 kind = PyUnicode_KIND(self); 11042 data = PyUnicode_DATA(self); 11043 11044 /* Shortcut for single character strings */ 11045 if (length == 1) 11046 return PyBool_FromLong( 11047 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0))); 11048 11049 for (i = 0; 
i < length; i++) { 11050 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) { 11051 Py_RETURN_FALSE; 11052 } 11053 } 11054 Py_RETURN_TRUE; 11055} 11056 11057PyDoc_STRVAR(join__doc__, 11058 "S.join(iterable) -> str\n\ 11059\n\ 11060Return a string which is the concatenation of the strings in the\n\ 11061iterable. The separator between elements is S."); 11062 11063static PyObject* 11064unicode_join(PyObject *self, PyObject *data) 11065{ 11066 return PyUnicode_Join(self, data); 11067} 11068 11069static Py_ssize_t 11070unicode_length(PyUnicodeObject *self) 11071{ 11072 if (PyUnicode_READY(self) == -1) 11073 return -1; 11074 return PyUnicode_GET_LENGTH(self); 11075} 11076 11077PyDoc_STRVAR(ljust__doc__, 11078 "S.ljust(width[, fillchar]) -> str\n\ 11079\n\ 11080Return S left-justified in a Unicode string of length width. Padding is\n\ 11081done using the specified fill character (default is a space)."); 11082 11083static PyObject * 11084unicode_ljust(PyObject *self, PyObject *args) 11085{ 11086 Py_ssize_t width; 11087 Py_UCS4 fillchar = ' '; 11088 11089 if (PyUnicode_READY(self) == -1) 11090 return NULL; 11091 11092 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar)) 11093 return NULL; 11094 11095 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) { 11096 Py_INCREF(self); 11097 return (PyObject*) self; 11098 } 11099 11100 return (PyObject*) pad(self, 0, width - _PyUnicode_LENGTH(self), fillchar); 11101} 11102 11103PyDoc_STRVAR(lower__doc__, 11104 "S.lower() -> str\n\ 11105\n\ 11106Return a copy of the string S converted to lowercase."); 11107 11108static PyObject* 11109unicode_lower(PyObject *self) 11110{ 11111 return fixup(self, fixlower); 11112} 11113 11114#define LEFTSTRIP 0 11115#define RIGHTSTRIP 1 11116#define BOTHSTRIP 2 11117 11118/* Arrays indexed by above */ 11119static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"}; 11120 11121#define STRIPNAME(i) (stripformat[i]+3) 11122 11123/* externally 
visible for str.strip(unicode) */ 11124PyObject * 11125_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj) 11126{ 11127 void *data; 11128 int kind; 11129 Py_ssize_t i, j, len; 11130 BLOOM_MASK sepmask; 11131 11132 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1) 11133 return NULL; 11134 11135 kind = PyUnicode_KIND(self); 11136 data = PyUnicode_DATA(self); 11137 len = PyUnicode_GET_LENGTH(self); 11138 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj), 11139 PyUnicode_DATA(sepobj), 11140 PyUnicode_GET_LENGTH(sepobj)); 11141 11142 i = 0; 11143 if (striptype != RIGHTSTRIP) { 11144 while (i < len && 11145 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) { 11146 i++; 11147 } 11148 } 11149 11150 j = len; 11151 if (striptype != LEFTSTRIP) { 11152 do { 11153 j--; 11154 } while (j >= i && 11155 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj)); 11156 j++; 11157 } 11158 11159 return PyUnicode_Substring((PyObject*)self, i, j); 11160} 11161 11162PyObject* 11163PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end) 11164{ 11165 unsigned char *data; 11166 int kind; 11167 Py_ssize_t length; 11168 11169 if (PyUnicode_READY(self) == -1) 11170 return NULL; 11171 11172 end = Py_MIN(end, PyUnicode_GET_LENGTH(self)); 11173 11174 if (start == 0 && end == PyUnicode_GET_LENGTH(self)) 11175 { 11176 if (PyUnicode_CheckExact(self)) { 11177 Py_INCREF(self); 11178 return self; 11179 } 11180 else 11181 return PyUnicode_Copy(self); 11182 } 11183 11184 length = end - start; 11185 if (length == 1) 11186 return unicode_getitem(self, start); 11187 11188 if (start < 0 || end < 0) { 11189 PyErr_SetString(PyExc_IndexError, "string index out of range"); 11190 return NULL; 11191 } 11192 11193 if (PyUnicode_IS_ASCII(self)) { 11194 kind = PyUnicode_KIND(self); 11195 data = PyUnicode_1BYTE_DATA(self); 11196 return unicode_fromascii(data + start, length); 11197 } 11198 else { 11199 kind = PyUnicode_KIND(self); 11200 data = 
PyUnicode_1BYTE_DATA(self); 11201 return PyUnicode_FromKindAndData(kind, 11202 data + PyUnicode_KIND_SIZE(kind, start), 11203 length); 11204 } 11205} 11206 11207static PyObject * 11208do_strip(PyUnicodeObject *self, int striptype) 11209{ 11210 int kind; 11211 void *data; 11212 Py_ssize_t len, i, j; 11213 11214 if (PyUnicode_READY(self) == -1) 11215 return NULL; 11216 11217 kind = PyUnicode_KIND(self); 11218 data = PyUnicode_DATA(self); 11219 len = PyUnicode_GET_LENGTH(self); 11220 11221 i = 0; 11222 if (striptype != RIGHTSTRIP) { 11223 while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) { 11224 i++; 11225 } 11226 } 11227 11228 j = len; 11229 if (striptype != LEFTSTRIP) { 11230 do { 11231 j--; 11232 } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j))); 11233 j++; 11234 } 11235 11236 return PyUnicode_Substring((PyObject*)self, i, j); 11237} 11238 11239 11240static PyObject * 11241do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args) 11242{ 11243 PyObject *sep = NULL; 11244 11245 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep)) 11246 return NULL; 11247 11248 if (sep != NULL && sep != Py_None) { 11249 if (PyUnicode_Check(sep)) 11250 return _PyUnicode_XStrip(self, striptype, sep); 11251 else { 11252 PyErr_Format(PyExc_TypeError, 11253 "%s arg must be None or str", 11254 STRIPNAME(striptype)); 11255 return NULL; 11256 } 11257 } 11258 11259 return do_strip(self, striptype); 11260} 11261 11262 11263PyDoc_STRVAR(strip__doc__, 11264 "S.strip([chars]) -> str\n\ 11265\n\ 11266Return a copy of the string S with leading and trailing\n\ 11267whitespace removed.\n\ 11268If chars is given and not None, remove characters in chars instead."); 11269 11270static PyObject * 11271unicode_strip(PyUnicodeObject *self, PyObject *args) 11272{ 11273 if (PyTuple_GET_SIZE(args) == 0) 11274 return do_strip(self, BOTHSTRIP); /* Common case */ 11275 else 11276 return do_argstrip(self, BOTHSTRIP, args); 11277} 11278 11279 
11280PyDoc_STRVAR(lstrip__doc__, 11281 "S.lstrip([chars]) -> str\n\ 11282\n\ 11283Return a copy of the string S with leading whitespace removed.\n\ 11284If chars is given and not None, remove characters in chars instead."); 11285 11286static PyObject * 11287unicode_lstrip(PyUnicodeObject *self, PyObject *args) 11288{ 11289 if (PyTuple_GET_SIZE(args) == 0) 11290 return do_strip(self, LEFTSTRIP); /* Common case */ 11291 else 11292 return do_argstrip(self, LEFTSTRIP, args); 11293} 11294 11295 11296PyDoc_STRVAR(rstrip__doc__, 11297 "S.rstrip([chars]) -> str\n\ 11298\n\ 11299Return a copy of the string S with trailing whitespace removed.\n\ 11300If chars is given and not None, remove characters in chars instead."); 11301 11302static PyObject * 11303unicode_rstrip(PyUnicodeObject *self, PyObject *args) 11304{ 11305 if (PyTuple_GET_SIZE(args) == 0) 11306 return do_strip(self, RIGHTSTRIP); /* Common case */ 11307 else 11308 return do_argstrip(self, RIGHTSTRIP, args); 11309} 11310 11311 11312static PyObject* 11313unicode_repeat(PyUnicodeObject *str, Py_ssize_t len) 11314{ 11315 PyUnicodeObject *u; 11316 Py_ssize_t nchars, n; 11317 11318 if (len < 1) { 11319 Py_INCREF(unicode_empty); 11320 return unicode_empty; 11321 } 11322 11323 if (len == 1 && PyUnicode_CheckExact(str)) { 11324 /* no repeat, return original string */ 11325 Py_INCREF(str); 11326 return (PyObject*) str; 11327 } 11328 11329 if (PyUnicode_READY(str) == -1) 11330 return NULL; 11331 11332 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) { 11333 PyErr_SetString(PyExc_OverflowError, 11334 "repeated string is too long"); 11335 return NULL; 11336 } 11337 nchars = len * PyUnicode_GET_LENGTH(str); 11338 11339 u = (PyUnicodeObject *)PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str)); 11340 if (!u) 11341 return NULL; 11342 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str)); 11343 11344 if (PyUnicode_GET_LENGTH(str) == 1) { 11345 const int kind = PyUnicode_KIND(str); 11346 const Py_UCS4 fill_char = 
PyUnicode_READ(kind, PyUnicode_DATA(str), 0); 11347 void *to = PyUnicode_DATA(u); 11348 if (kind == PyUnicode_1BYTE_KIND) 11349 memset(to, (unsigned char)fill_char, len); 11350 else { 11351 for (n = 0; n < len; ++n) 11352 PyUnicode_WRITE(kind, to, n, fill_char); 11353 } 11354 } 11355 else { 11356 /* number of characters copied this far */ 11357 Py_ssize_t done = PyUnicode_GET_LENGTH(str); 11358 const Py_ssize_t char_size = PyUnicode_CHARACTER_SIZE(str); 11359 char *to = (char *) PyUnicode_DATA(u); 11360 Py_MEMCPY(to, PyUnicode_DATA(str), 11361 PyUnicode_GET_LENGTH(str) * char_size); 11362 while (done < nchars) { 11363 n = (done <= nchars-done) ? done : nchars-done; 11364 Py_MEMCPY(to + (done * char_size), to, n * char_size); 11365 done += n; 11366 } 11367 } 11368 11369 assert(_PyUnicode_CheckConsistency(u, 1)); 11370 return (PyObject*) u; 11371} 11372 11373PyObject * 11374PyUnicode_Replace(PyObject *obj, 11375 PyObject *subobj, 11376 PyObject *replobj, 11377 Py_ssize_t maxcount) 11378{ 11379 PyObject *self; 11380 PyObject *str1; 11381 PyObject *str2; 11382 PyObject *result; 11383 11384 self = PyUnicode_FromObject(obj); 11385 if (self == NULL || PyUnicode_READY(self) == -1) 11386 return NULL; 11387 str1 = PyUnicode_FromObject(subobj); 11388 if (str1 == NULL || PyUnicode_READY(str1) == -1) { 11389 Py_DECREF(self); 11390 return NULL; 11391 } 11392 str2 = PyUnicode_FromObject(replobj); 11393 if (str2 == NULL || PyUnicode_READY(str2)) { 11394 Py_DECREF(self); 11395 Py_DECREF(str1); 11396 return NULL; 11397 } 11398 result = replace(self, str1, str2, maxcount); 11399 Py_DECREF(self); 11400 Py_DECREF(str1); 11401 Py_DECREF(str2); 11402 return result; 11403} 11404 11405PyDoc_STRVAR(replace__doc__, 11406 "S.replace(old, new[, count]) -> str\n\ 11407\n\ 11408Return a copy of S with all occurrences of substring\n\ 11409old replaced by new. 
If the optional argument count is\n\ 11410given, only the first count occurrences are replaced."); 11411 11412static PyObject* 11413unicode_replace(PyObject *self, PyObject *args) 11414{ 11415 PyObject *str1; 11416 PyObject *str2; 11417 Py_ssize_t maxcount = -1; 11418 PyObject *result; 11419 11420 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount)) 11421 return NULL; 11422 if (!PyUnicode_READY(self) == -1) 11423 return NULL; 11424 str1 = PyUnicode_FromObject(str1); 11425 if (str1 == NULL || PyUnicode_READY(str1) == -1) 11426 return NULL; 11427 str2 = PyUnicode_FromObject(str2); 11428 if (str2 == NULL || PyUnicode_READY(str2) == -1) { 11429 Py_DECREF(str1); 11430 return NULL; 11431 } 11432 11433 result = replace(self, str1, str2, maxcount); 11434 11435 Py_DECREF(str1); 11436 Py_DECREF(str2); 11437 return result; 11438} 11439 11440static PyObject * 11441unicode_repr(PyObject *unicode) 11442{ 11443 PyObject *repr; 11444 Py_ssize_t isize; 11445 Py_ssize_t osize, squote, dquote, i, o; 11446 Py_UCS4 max, quote; 11447 int ikind, okind; 11448 void *idata, *odata; 11449 11450 if (PyUnicode_READY(unicode) == -1) 11451 return NULL; 11452 11453 isize = PyUnicode_GET_LENGTH(unicode); 11454 idata = PyUnicode_DATA(unicode); 11455 11456 /* Compute length of output, quote characters, and 11457 maximum character */ 11458 osize = 2; /* quotes */ 11459 max = 127; 11460 squote = dquote = 0; 11461 ikind = PyUnicode_KIND(unicode); 11462 for (i = 0; i < isize; i++) { 11463 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i); 11464 switch (ch) { 11465 case '\'': squote++; osize++; break; 11466 case '"': dquote++; osize++; break; 11467 case '\\': case '\t': case '\r': case '\n': 11468 osize += 2; break; 11469 default: 11470 /* Fast-path ASCII */ 11471 if (ch < ' ' || ch == 0x7f) 11472 osize += 4; /* \xHH */ 11473 else if (ch < 0x7f) 11474 osize++; 11475 else if (Py_UNICODE_ISPRINTABLE(ch)) { 11476 osize++; 11477 max = ch > max ? 
ch : max; 11478 } 11479 else if (ch < 0x100) 11480 osize += 4; /* \xHH */ 11481 else if (ch < 0x10000) 11482 osize += 6; /* \uHHHH */ 11483 else 11484 osize += 10; /* \uHHHHHHHH */ 11485 } 11486 } 11487 11488 quote = '\''; 11489 if (squote) { 11490 if (dquote) 11491 /* Both squote and dquote present. Use squote, 11492 and escape them */ 11493 osize += squote; 11494 else 11495 quote = '"'; 11496 } 11497 11498 repr = PyUnicode_New(osize, max); 11499 if (repr == NULL) 11500 return NULL; 11501 okind = PyUnicode_KIND(repr); 11502 odata = PyUnicode_DATA(repr); 11503 11504 PyUnicode_WRITE(okind, odata, 0, quote); 11505 PyUnicode_WRITE(okind, odata, osize-1, quote); 11506 11507 for (i = 0, o = 1; i < isize; i++) { 11508 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i); 11509 11510 /* Escape quotes and backslashes */ 11511 if ((ch == quote) || (ch == '\\')) { 11512 PyUnicode_WRITE(okind, odata, o++, '\\'); 11513 PyUnicode_WRITE(okind, odata, o++, ch); 11514 continue; 11515 } 11516 11517 /* Map special whitespace to '\t', \n', '\r' */ 11518 if (ch == '\t') { 11519 PyUnicode_WRITE(okind, odata, o++, '\\'); 11520 PyUnicode_WRITE(okind, odata, o++, 't'); 11521 } 11522 else if (ch == '\n') { 11523 PyUnicode_WRITE(okind, odata, o++, '\\'); 11524 PyUnicode_WRITE(okind, odata, o++, 'n'); 11525 } 11526 else if (ch == '\r') { 11527 PyUnicode_WRITE(okind, odata, o++, '\\'); 11528 PyUnicode_WRITE(okind, odata, o++, 'r'); 11529 } 11530 11531 /* Map non-printable US ASCII to '\xhh' */ 11532 else if (ch < ' ' || ch == 0x7F) { 11533 PyUnicode_WRITE(okind, odata, o++, '\\'); 11534 PyUnicode_WRITE(okind, odata, o++, 'x'); 11535 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0x000F]); 11536 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0x000F]); 11537 } 11538 11539 /* Copy ASCII characters as-is */ 11540 else if (ch < 0x7F) { 11541 PyUnicode_WRITE(okind, odata, o++, ch); 11542 } 11543 11544 /* Non-ASCII characters */ 11545 else { 11546 /* Map Unicode whitespace and control 
characters 11547 (categories Z* and C* except ASCII space) 11548 */ 11549 if (!Py_UNICODE_ISPRINTABLE(ch)) { 11550 /* Map 8-bit characters to '\xhh' */ 11551 if (ch <= 0xff) { 11552 PyUnicode_WRITE(okind, odata, o++, '\\'); 11553 PyUnicode_WRITE(okind, odata, o++, 'x'); 11554 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0x000F]); 11555 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0x000F]); 11556 } 11557 /* Map 21-bit characters to '\U00xxxxxx' */ 11558 else if (ch >= 0x10000) { 11559 PyUnicode_WRITE(okind, odata, o++, '\\'); 11560 PyUnicode_WRITE(okind, odata, o++, 'U'); 11561 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 28) & 0xF]); 11562 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 24) & 0xF]); 11563 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 20) & 0xF]); 11564 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 16) & 0xF]); 11565 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 12) & 0xF]); 11566 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 8) & 0xF]); 11567 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0xF]); 11568 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0xF]); 11569 } 11570 /* Map 16-bit characters to '\uxxxx' */ 11571 else { 11572 PyUnicode_WRITE(okind, odata, o++, '\\'); 11573 PyUnicode_WRITE(okind, odata, o++, 'u'); 11574 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 12) & 0xF]); 11575 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 8) & 0xF]); 11576 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0xF]); 11577 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0xF]); 11578 } 11579 } 11580 /* Copy characters as-is */ 11581 else { 11582 PyUnicode_WRITE(okind, odata, o++, ch); 11583 } 11584 } 11585 } 11586 /* Closing quote already added at the beginning */ 11587 assert(_PyUnicode_CheckConsistency(repr, 1)); 11588 return repr; 11589} 11590 11591PyDoc_STRVAR(rfind__doc__, 11592 "S.rfind(sub[, start[, end]]) -> int\n\ 11593\n\ 11594Return the highest index in S where 
substring sub is found,\n\ 11595such that sub is contained within S[start:end]. Optional\n\ 11596arguments start and end are interpreted as in slice notation.\n\ 11597\n\ 11598Return -1 on failure."); 11599 11600static PyObject * 11601unicode_rfind(PyObject *self, PyObject *args) 11602{ 11603 PyUnicodeObject *substring; 11604 Py_ssize_t start; 11605 Py_ssize_t end; 11606 Py_ssize_t result; 11607 11608 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring, 11609 &start, &end)) 11610 return NULL; 11611 11612 if (PyUnicode_READY(self) == -1) 11613 return NULL; 11614 if (PyUnicode_READY(substring) == -1) 11615 return NULL; 11616 11617 result = any_find_slice( 11618 asciilib_rfind_slice, ucs1lib_rfind_slice, 11619 ucs2lib_rfind_slice, ucs4lib_rfind_slice, 11620 self, (PyObject*)substring, start, end 11621 ); 11622 11623 Py_DECREF(substring); 11624 11625 if (result == -2) 11626 return NULL; 11627 11628 return PyLong_FromSsize_t(result); 11629} 11630 11631PyDoc_STRVAR(rindex__doc__, 11632 "S.rindex(sub[, start[, end]]) -> int\n\ 11633\n\ 11634Like S.rfind() but raise ValueError when the substring is not found."); 11635 11636static PyObject * 11637unicode_rindex(PyObject *self, PyObject *args) 11638{ 11639 PyUnicodeObject *substring; 11640 Py_ssize_t start; 11641 Py_ssize_t end; 11642 Py_ssize_t result; 11643 11644 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring, 11645 &start, &end)) 11646 return NULL; 11647 11648 if (PyUnicode_READY(self) == -1) 11649 return NULL; 11650 if (PyUnicode_READY(substring) == -1) 11651 return NULL; 11652 11653 result = any_find_slice( 11654 asciilib_rfind_slice, ucs1lib_rfind_slice, 11655 ucs2lib_rfind_slice, ucs4lib_rfind_slice, 11656 self, (PyObject*)substring, start, end 11657 ); 11658 11659 Py_DECREF(substring); 11660 11661 if (result == -2) 11662 return NULL; 11663 11664 if (result < 0) { 11665 PyErr_SetString(PyExc_ValueError, "substring not found"); 11666 return NULL; 11667 } 11668 11669 return 
PyLong_FromSsize_t(result); 11670} 11671 11672PyDoc_STRVAR(rjust__doc__, 11673 "S.rjust(width[, fillchar]) -> str\n\ 11674\n\ 11675Return S right-justified in a string of length width. Padding is\n\ 11676done using the specified fill character (default is a space)."); 11677 11678static PyObject * 11679unicode_rjust(PyObject *self, PyObject *args) 11680{ 11681 Py_ssize_t width; 11682 Py_UCS4 fillchar = ' '; 11683 11684 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar)) 11685 return NULL; 11686 11687 if (PyUnicode_READY(self) == -1) 11688 return NULL; 11689 11690 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) { 11691 Py_INCREF(self); 11692 return (PyObject*) self; 11693 } 11694 11695 return (PyObject*) pad(self, width - _PyUnicode_LENGTH(self), 0, fillchar); 11696} 11697 11698PyObject * 11699PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit) 11700{ 11701 PyObject *result; 11702 11703 s = PyUnicode_FromObject(s); 11704 if (s == NULL) 11705 return NULL; 11706 if (sep != NULL) { 11707 sep = PyUnicode_FromObject(sep); 11708 if (sep == NULL) { 11709 Py_DECREF(s); 11710 return NULL; 11711 } 11712 } 11713 11714 result = split(s, sep, maxsplit); 11715 11716 Py_DECREF(s); 11717 Py_XDECREF(sep); 11718 return result; 11719} 11720 11721PyDoc_STRVAR(split__doc__, 11722 "S.split([sep[, maxsplit]]) -> list of strings\n\ 11723\n\ 11724Return a list of the words in S, using sep as the\n\ 11725delimiter string. If maxsplit is given, at most maxsplit\n\ 11726splits are done. 
If sep is not specified or is None, any\n\ 11727whitespace string is a separator and empty strings are\n\ 11728removed from the result."); 11729 11730static PyObject* 11731unicode_split(PyObject *self, PyObject *args) 11732{ 11733 PyObject *substring = Py_None; 11734 Py_ssize_t maxcount = -1; 11735 11736 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount)) 11737 return NULL; 11738 11739 if (substring == Py_None) 11740 return split(self, NULL, maxcount); 11741 else if (PyUnicode_Check(substring)) 11742 return split(self, substring, maxcount); 11743 else 11744 return PyUnicode_Split((PyObject *)self, substring, maxcount); 11745} 11746 11747PyObject * 11748PyUnicode_Partition(PyObject *str_in, PyObject *sep_in) 11749{ 11750 PyObject* str_obj; 11751 PyObject* sep_obj; 11752 PyObject* out; 11753 int kind1, kind2, kind; 11754 void *buf1 = NULL, *buf2 = NULL; 11755 Py_ssize_t len1, len2; 11756 11757 str_obj = PyUnicode_FromObject(str_in); 11758 if (!str_obj || PyUnicode_READY(str_obj) == -1) 11759 return NULL; 11760 sep_obj = PyUnicode_FromObject(sep_in); 11761 if (!sep_obj || PyUnicode_READY(sep_obj) == -1) { 11762 Py_DECREF(str_obj); 11763 return NULL; 11764 } 11765 11766 kind1 = PyUnicode_KIND(str_obj); 11767 kind2 = PyUnicode_KIND(sep_obj); 11768 kind = Py_MAX(kind1, kind2); 11769 buf1 = PyUnicode_DATA(str_obj); 11770 if (kind1 != kind) 11771 buf1 = _PyUnicode_AsKind(str_obj, kind); 11772 if (!buf1) 11773 goto onError; 11774 buf2 = PyUnicode_DATA(sep_obj); 11775 if (kind2 != kind) 11776 buf2 = _PyUnicode_AsKind(sep_obj, kind); 11777 if (!buf2) 11778 goto onError; 11779 len1 = PyUnicode_GET_LENGTH(str_obj); 11780 len2 = PyUnicode_GET_LENGTH(sep_obj); 11781 11782 switch(PyUnicode_KIND(str_obj)) { 11783 case PyUnicode_1BYTE_KIND: 11784 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj)) 11785 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2); 11786 else 11787 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2); 11788 
break; 11789 case PyUnicode_2BYTE_KIND: 11790 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2); 11791 break; 11792 case PyUnicode_4BYTE_KIND: 11793 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2); 11794 break; 11795 default: 11796 assert(0); 11797 out = 0; 11798 } 11799 11800 Py_DECREF(sep_obj); 11801 Py_DECREF(str_obj); 11802 if (kind1 != kind) 11803 PyMem_Free(buf1); 11804 if (kind2 != kind) 11805 PyMem_Free(buf2); 11806 11807 return out; 11808 onError: 11809 Py_DECREF(sep_obj); 11810 Py_DECREF(str_obj); 11811 if (kind1 != kind && buf1) 11812 PyMem_Free(buf1); 11813 if (kind2 != kind && buf2) 11814 PyMem_Free(buf2); 11815 return NULL; 11816} 11817 11818 11819PyObject * 11820PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in) 11821{ 11822 PyObject* str_obj; 11823 PyObject* sep_obj; 11824 PyObject* out; 11825 int kind1, kind2, kind; 11826 void *buf1 = NULL, *buf2 = NULL; 11827 Py_ssize_t len1, len2; 11828 11829 str_obj = PyUnicode_FromObject(str_in); 11830 if (!str_obj) 11831 return NULL; 11832 sep_obj = PyUnicode_FromObject(sep_in); 11833 if (!sep_obj) { 11834 Py_DECREF(str_obj); 11835 return NULL; 11836 } 11837 11838 kind1 = PyUnicode_KIND(str_in); 11839 kind2 = PyUnicode_KIND(sep_obj); 11840 kind = Py_MAX(kind1, kind2); 11841 buf1 = PyUnicode_DATA(str_in); 11842 if (kind1 != kind) 11843 buf1 = _PyUnicode_AsKind(str_in, kind); 11844 if (!buf1) 11845 goto onError; 11846 buf2 = PyUnicode_DATA(sep_obj); 11847 if (kind2 != kind) 11848 buf2 = _PyUnicode_AsKind(sep_obj, kind); 11849 if (!buf2) 11850 goto onError; 11851 len1 = PyUnicode_GET_LENGTH(str_obj); 11852 len2 = PyUnicode_GET_LENGTH(sep_obj); 11853 11854 switch(PyUnicode_KIND(str_in)) { 11855 case PyUnicode_1BYTE_KIND: 11856 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj)) 11857 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2); 11858 else 11859 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2); 11860 break; 11861 case 
PyUnicode_2BYTE_KIND: 11862 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2); 11863 break; 11864 case PyUnicode_4BYTE_KIND: 11865 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2); 11866 break; 11867 default: 11868 assert(0); 11869 out = 0; 11870 } 11871 11872 Py_DECREF(sep_obj); 11873 Py_DECREF(str_obj); 11874 if (kind1 != kind) 11875 PyMem_Free(buf1); 11876 if (kind2 != kind) 11877 PyMem_Free(buf2); 11878 11879 return out; 11880 onError: 11881 Py_DECREF(sep_obj); 11882 Py_DECREF(str_obj); 11883 if (kind1 != kind && buf1) 11884 PyMem_Free(buf1); 11885 if (kind2 != kind && buf2) 11886 PyMem_Free(buf2); 11887 return NULL; 11888} 11889 11890PyDoc_STRVAR(partition__doc__, 11891 "S.partition(sep) -> (head, sep, tail)\n\ 11892\n\ 11893Search for the separator sep in S, and return the part before it,\n\ 11894the separator itself, and the part after it. If the separator is not\n\ 11895found, return S and two empty strings."); 11896 11897static PyObject* 11898unicode_partition(PyObject *self, PyObject *separator) 11899{ 11900 return PyUnicode_Partition(self, separator); 11901} 11902 11903PyDoc_STRVAR(rpartition__doc__, 11904 "S.rpartition(sep) -> (head, sep, tail)\n\ 11905\n\ 11906Search for the separator sep in S, starting at the end of S, and return\n\ 11907the part before it, the separator itself, and the part after it. 
If the\n\ 11908separator is not found, return two empty strings and S."); 11909 11910static PyObject* 11911unicode_rpartition(PyObject *self, PyObject *separator) 11912{ 11913 return PyUnicode_RPartition(self, separator); 11914} 11915 11916PyObject * 11917PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit) 11918{ 11919 PyObject *result; 11920 11921 s = PyUnicode_FromObject(s); 11922 if (s == NULL) 11923 return NULL; 11924 if (sep != NULL) { 11925 sep = PyUnicode_FromObject(sep); 11926 if (sep == NULL) { 11927 Py_DECREF(s); 11928 return NULL; 11929 } 11930 } 11931 11932 result = rsplit(s, sep, maxsplit); 11933 11934 Py_DECREF(s); 11935 Py_XDECREF(sep); 11936 return result; 11937} 11938 11939PyDoc_STRVAR(rsplit__doc__, 11940 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\ 11941\n\ 11942Return a list of the words in S, using sep as the\n\ 11943delimiter string, starting at the end of the string and\n\ 11944working to the front. If maxsplit is given, at most maxsplit\n\ 11945splits are done. 
If sep is not specified, any whitespace string\n\ 11946is a separator."); 11947 11948static PyObject* 11949unicode_rsplit(PyObject *self, PyObject *args) 11950{ 11951 PyObject *substring = Py_None; 11952 Py_ssize_t maxcount = -1; 11953 11954 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount)) 11955 return NULL; 11956 11957 if (substring == Py_None) 11958 return rsplit(self, NULL, maxcount); 11959 else if (PyUnicode_Check(substring)) 11960 return rsplit(self, substring, maxcount); 11961 else 11962 return PyUnicode_RSplit(self, substring, maxcount); 11963} 11964 11965PyDoc_STRVAR(splitlines__doc__, 11966 "S.splitlines([keepends]) -> list of strings\n\ 11967\n\ 11968Return a list of the lines in S, breaking at line boundaries.\n\ 11969Line breaks are not included in the resulting list unless keepends\n\ 11970is given and true."); 11971 11972static PyObject* 11973unicode_splitlines(PyUnicodeObject *self, PyObject *args, PyObject *kwds) 11974{ 11975 static char *kwlist[] = {"keepends", 0}; 11976 int keepends = 0; 11977 11978 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines", 11979 kwlist, &keepends)) 11980 return NULL; 11981 11982 return PyUnicode_Splitlines((PyObject *)self, keepends); 11983} 11984 11985static 11986PyObject *unicode_str(PyObject *self) 11987{ 11988 if (PyUnicode_CheckExact(self)) { 11989 Py_INCREF(self); 11990 return self; 11991 } else 11992 /* Subtype -- return genuine unicode string with the same value. 
*/ 11993 return PyUnicode_Copy(self); 11994} 11995 11996PyDoc_STRVAR(swapcase__doc__, 11997 "S.swapcase() -> str\n\ 11998\n\ 11999Return a copy of S with uppercase characters converted to lowercase\n\ 12000and vice versa."); 12001 12002static PyObject* 12003unicode_swapcase(PyObject *self) 12004{ 12005 return fixup(self, fixswapcase); 12006} 12007 12008PyDoc_STRVAR(maketrans__doc__, 12009 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\ 12010\n\ 12011Return a translation table usable for str.translate().\n\ 12012If there is only one argument, it must be a dictionary mapping Unicode\n\ 12013ordinals (integers) or characters to Unicode ordinals, strings or None.\n\ 12014Character keys will be then converted to ordinals.\n\ 12015If there are two arguments, they must be strings of equal length, and\n\ 12016in the resulting dictionary, each character in x will be mapped to the\n\ 12017character at the same position in y. If there is a third argument, it\n\ 12018must be a string, whose characters will be mapped to None in the result."); 12019 12020static PyObject* 12021unicode_maketrans(PyUnicodeObject *null, PyObject *args) 12022{ 12023 PyObject *x, *y = NULL, *z = NULL; 12024 PyObject *new = NULL, *key, *value; 12025 Py_ssize_t i = 0; 12026 int res; 12027 12028 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z)) 12029 return NULL; 12030 new = PyDict_New(); 12031 if (!new) 12032 return NULL; 12033 if (y != NULL) { 12034 int x_kind, y_kind, z_kind; 12035 void *x_data, *y_data, *z_data; 12036 12037 /* x must be a string too, of equal length */ 12038 if (!PyUnicode_Check(x)) { 12039 PyErr_SetString(PyExc_TypeError, "first maketrans argument must " 12040 "be a string if there is a second argument"); 12041 goto err; 12042 } 12043 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) { 12044 PyErr_SetString(PyExc_ValueError, "the first two maketrans " 12045 "arguments must have equal length"); 12046 goto err; 12047 } 12048 /* create entries for translating 
chars in x to those in y */ 12049 x_kind = PyUnicode_KIND(x); 12050 y_kind = PyUnicode_KIND(y); 12051 x_data = PyUnicode_DATA(x); 12052 y_data = PyUnicode_DATA(y); 12053 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) { 12054 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i)); 12055 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i)); 12056 if (!key || !value) 12057 goto err; 12058 res = PyDict_SetItem(new, key, value); 12059 Py_DECREF(key); 12060 Py_DECREF(value); 12061 if (res < 0) 12062 goto err; 12063 } 12064 /* create entries for deleting chars in z */ 12065 if (z != NULL) { 12066 z_kind = PyUnicode_KIND(z); 12067 z_data = PyUnicode_DATA(z); 12068 for (i = 0; i < PyUnicode_GET_SIZE(z); i++) { 12069 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i)); 12070 if (!key) 12071 goto err; 12072 res = PyDict_SetItem(new, key, Py_None); 12073 Py_DECREF(key); 12074 if (res < 0) 12075 goto err; 12076 } 12077 } 12078 } else { 12079 int kind; 12080 void *data; 12081 12082 /* x must be a dict */ 12083 if (!PyDict_CheckExact(x)) { 12084 PyErr_SetString(PyExc_TypeError, "if you give only one argument " 12085 "to maketrans it must be a dict"); 12086 goto err; 12087 } 12088 /* copy entries into the new dict, converting string keys to int keys */ 12089 while (PyDict_Next(x, &i, &key, &value)) { 12090 if (PyUnicode_Check(key)) { 12091 /* convert string keys to integer keys */ 12092 PyObject *newkey; 12093 if (PyUnicode_GET_SIZE(key) != 1) { 12094 PyErr_SetString(PyExc_ValueError, "string keys in translate " 12095 "table must be of length 1"); 12096 goto err; 12097 } 12098 kind = PyUnicode_KIND(key); 12099 data = PyUnicode_DATA(key); 12100 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0)); 12101 if (!newkey) 12102 goto err; 12103 res = PyDict_SetItem(new, newkey, value); 12104 Py_DECREF(newkey); 12105 if (res < 0) 12106 goto err; 12107 } else if (PyLong_Check(key)) { 12108 /* just keep integer keys */ 12109 if (PyDict_SetItem(new, key, value) < 0) 12110 goto 
err; 12111 } else { 12112 PyErr_SetString(PyExc_TypeError, "keys in translate table must " 12113 "be strings or integers"); 12114 goto err; 12115 } 12116 } 12117 } 12118 return new; 12119 err: 12120 Py_DECREF(new); 12121 return NULL; 12122} 12123 12124PyDoc_STRVAR(translate__doc__, 12125 "S.translate(table) -> str\n\ 12126\n\ 12127Return a copy of the string S, where all characters have been mapped\n\ 12128through the given translation table, which must be a mapping of\n\ 12129Unicode ordinals to Unicode ordinals, strings, or None.\n\ 12130Unmapped characters are left untouched. Characters mapped to None\n\ 12131are deleted."); 12132 12133static PyObject* 12134unicode_translate(PyObject *self, PyObject *table) 12135{ 12136 return _PyUnicode_TranslateCharmap(self, table, "ignore"); 12137} 12138 12139PyDoc_STRVAR(upper__doc__, 12140 "S.upper() -> str\n\ 12141\n\ 12142Return a copy of S converted to uppercase."); 12143 12144static PyObject* 12145unicode_upper(PyObject *self) 12146{ 12147 return fixup(self, fixupper); 12148} 12149 12150PyDoc_STRVAR(zfill__doc__, 12151 "S.zfill(width) -> str\n\ 12152\n\ 12153Pad a numeric string S with zeros on the left, to fill a field\n\ 12154of the specified width. 
The string S is never truncated."); 12155 12156static PyObject * 12157unicode_zfill(PyObject *self, PyObject *args) 12158{ 12159 Py_ssize_t fill; 12160 PyObject *u; 12161 Py_ssize_t width; 12162 int kind; 12163 void *data; 12164 Py_UCS4 chr; 12165 12166 if (PyUnicode_READY(self) == -1) 12167 return NULL; 12168 12169 if (!PyArg_ParseTuple(args, "n:zfill", &width)) 12170 return NULL; 12171 12172 if (PyUnicode_GET_LENGTH(self) >= width) { 12173 if (PyUnicode_CheckExact(self)) { 12174 Py_INCREF(self); 12175 return (PyObject*) self; 12176 } 12177 else 12178 return PyUnicode_Copy((PyObject*)self); 12179 } 12180 12181 fill = width - _PyUnicode_LENGTH(self); 12182 12183 u = pad(self, fill, 0, '0'); 12184 12185 if (u == NULL) 12186 return NULL; 12187 12188 kind = PyUnicode_KIND(u); 12189 data = PyUnicode_DATA(u); 12190 chr = PyUnicode_READ(kind, data, fill); 12191 12192 if (chr == '+' || chr == '-') { 12193 /* move sign to beginning of string */ 12194 PyUnicode_WRITE(kind, data, 0, chr); 12195 PyUnicode_WRITE(kind, data, fill, '0'); 12196 } 12197 12198 assert(_PyUnicode_CheckConsistency(u, 1)); 12199 return (PyObject*) u; 12200} 12201 12202#if 0 12203static PyObject * 12204unicode__decimal2ascii(PyObject *self) 12205{ 12206 return PyUnicode_TransformDecimalAndSpaceToASCII(self); 12207} 12208#endif 12209 12210PyDoc_STRVAR(startswith__doc__, 12211 "S.startswith(prefix[, start[, end]]) -> bool\n\ 12212\n\ 12213Return True if S starts with the specified prefix, False otherwise.\n\ 12214With optional start, test S beginning at that position.\n\ 12215With optional end, stop comparing S at that position.\n\ 12216prefix can also be a tuple of strings to try."); 12217 12218static PyObject * 12219unicode_startswith(PyUnicodeObject *self, 12220 PyObject *args) 12221{ 12222 PyObject *subobj; 12223 PyUnicodeObject *substring; 12224 Py_ssize_t start = 0; 12225 Py_ssize_t end = PY_SSIZE_T_MAX; 12226 int result; 12227 12228 if (!stringlib_parse_args_finds("startswith", args, &subobj, 
&start, &end)) 12229 return NULL; 12230 if (PyTuple_Check(subobj)) { 12231 Py_ssize_t i; 12232 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) { 12233 substring = (PyUnicodeObject *)PyUnicode_FromObject( 12234 PyTuple_GET_ITEM(subobj, i)); 12235 if (substring == NULL) 12236 return NULL; 12237 result = tailmatch(self, substring, start, end, -1); 12238 Py_DECREF(substring); 12239 if (result) { 12240 Py_RETURN_TRUE; 12241 } 12242 } 12243 /* nothing matched */ 12244 Py_RETURN_FALSE; 12245 } 12246 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj); 12247 if (substring == NULL) { 12248 if (PyErr_ExceptionMatches(PyExc_TypeError)) 12249 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or " 12250 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name); 12251 return NULL; 12252 } 12253 result = tailmatch(self, substring, start, end, -1); 12254 Py_DECREF(substring); 12255 return PyBool_FromLong(result); 12256} 12257 12258 12259PyDoc_STRVAR(endswith__doc__, 12260 "S.endswith(suffix[, start[, end]]) -> bool\n\ 12261\n\ 12262Return True if S ends with the specified suffix, False otherwise.\n\ 12263With optional start, test S beginning at that position.\n\ 12264With optional end, stop comparing S at that position.\n\ 12265suffix can also be a tuple of strings to try."); 12266 12267static PyObject * 12268unicode_endswith(PyUnicodeObject *self, 12269 PyObject *args) 12270{ 12271 PyObject *subobj; 12272 PyUnicodeObject *substring; 12273 Py_ssize_t start = 0; 12274 Py_ssize_t end = PY_SSIZE_T_MAX; 12275 int result; 12276 12277 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end)) 12278 return NULL; 12279 if (PyTuple_Check(subobj)) { 12280 Py_ssize_t i; 12281 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) { 12282 substring = (PyUnicodeObject *)PyUnicode_FromObject( 12283 PyTuple_GET_ITEM(subobj, i)); 12284 if (substring == NULL) 12285 return NULL; 12286 result = tailmatch(self, substring, start, end, +1); 12287 Py_DECREF(substring); 12288 if 
(result) { 12289 Py_RETURN_TRUE; 12290 } 12291 } 12292 Py_RETURN_FALSE; 12293 } 12294 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj); 12295 if (substring == NULL) { 12296 if (PyErr_ExceptionMatches(PyExc_TypeError)) 12297 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or " 12298 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name); 12299 return NULL; 12300 } 12301 result = tailmatch(self, substring, start, end, +1); 12302 Py_DECREF(substring); 12303 return PyBool_FromLong(result); 12304} 12305 12306#include "stringlib/unicode_format.h" 12307 12308PyDoc_STRVAR(format__doc__, 12309 "S.format(*args, **kwargs) -> str\n\ 12310\n\ 12311Return a formatted version of S, using substitutions from args and kwargs.\n\ 12312The substitutions are identified by braces ('{' and '}')."); 12313 12314PyDoc_STRVAR(format_map__doc__, 12315 "S.format_map(mapping) -> str\n\ 12316\n\ 12317Return a formatted version of S, using substitutions from mapping.\n\ 12318The substitutions are identified by braces ('{' and '}')."); 12319 12320static PyObject * 12321unicode__format__(PyObject* self, PyObject* args) 12322{ 12323 PyObject *format_spec, *out; 12324 12325 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec)) 12326 return NULL; 12327 12328 out = _PyUnicode_FormatAdvanced(self, format_spec, 0, 12329 PyUnicode_GET_LENGTH(format_spec)); 12330 return out; 12331} 12332 12333PyDoc_STRVAR(p_format__doc__, 12334 "S.__format__(format_spec) -> str\n\ 12335\n\ 12336Return a formatted version of S as described by format_spec."); 12337 12338static PyObject * 12339unicode__sizeof__(PyUnicodeObject *v) 12340{ 12341 Py_ssize_t size; 12342 12343 /* If it's a compact object, account for base structure + 12344 character data. 
*/ 12345 if (PyUnicode_IS_COMPACT_ASCII(v)) 12346 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1; 12347 else if (PyUnicode_IS_COMPACT(v)) 12348 size = sizeof(PyCompactUnicodeObject) + 12349 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_CHARACTER_SIZE(v); 12350 else { 12351 /* If it is a two-block object, account for base object, and 12352 for character block if present. */ 12353 size = sizeof(PyUnicodeObject); 12354 if (_PyUnicode_DATA_ANY(v)) 12355 size += (PyUnicode_GET_LENGTH(v) + 1) * 12356 PyUnicode_CHARACTER_SIZE(v); 12357 } 12358 /* If the wstr pointer is present, account for it unless it is shared 12359 with the data pointer. Check if the data is not shared. */ 12360 if (_PyUnicode_HAS_WSTR_MEMORY(v)) 12361 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t); 12362 if (_PyUnicode_HAS_UTF8_MEMORY(v)) 12363 size += PyUnicode_UTF8_LENGTH(v) + 1; 12364 12365 return PyLong_FromSsize_t(size); 12366} 12367 12368PyDoc_STRVAR(sizeof__doc__, 12369 "S.__sizeof__() -> size of S in memory, in bytes"); 12370 12371static PyObject * 12372unicode_getnewargs(PyObject *v) 12373{ 12374 PyObject *copy = PyUnicode_Copy(v); 12375 if (!copy) 12376 return NULL; 12377 return Py_BuildValue("(N)", copy); 12378} 12379 12380static PyMethodDef unicode_methods[] = { 12381 12382 /* Order is according to common usage: often used methods should 12383 appear first, since lookup is done sequentially. 
*/ 12384 12385 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__}, 12386 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__}, 12387 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__}, 12388 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__}, 12389 {"join", (PyCFunction) unicode_join, METH_O, join__doc__}, 12390 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__}, 12391 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__}, 12392 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__}, 12393 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__}, 12394 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__}, 12395 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__}, 12396 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__}, 12397 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__}, 12398 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__}, 12399 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__}, 12400 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__}, 12401 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__}, 12402 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__}, 12403 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__}, 12404 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__}, 12405 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__}, 12406 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__}, 12407 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__}, 12408 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__}, 12409 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__}, 12410 {"upper", 
(PyCFunction) unicode_upper, METH_NOARGS, upper__doc__}, 12411 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__}, 12412 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__}, 12413 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__}, 12414 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__}, 12415 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__}, 12416 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__}, 12417 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__}, 12418 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__}, 12419 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__}, 12420 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__}, 12421 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__}, 12422 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__}, 12423 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__}, 12424 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__}, 12425 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__}, 12426 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__}, 12427 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__}, 12428 {"maketrans", (PyCFunction) unicode_maketrans, 12429 METH_VARARGS | METH_STATIC, maketrans__doc__}, 12430 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__}, 12431#if 0 12432 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__}, 12433#endif 12434 12435#if 0 12436 /* These methods are just used for debugging the implementation. 
*/ 12437 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS}, 12438#endif 12439 12440 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS}, 12441 {NULL, NULL} 12442}; 12443 12444static PyObject * 12445unicode_mod(PyObject *v, PyObject *w) 12446{ 12447 if (!PyUnicode_Check(v)) 12448 Py_RETURN_NOTIMPLEMENTED; 12449 return PyUnicode_Format(v, w); 12450} 12451 12452static PyNumberMethods unicode_as_number = { 12453 0, /*nb_add*/ 12454 0, /*nb_subtract*/ 12455 0, /*nb_multiply*/ 12456 unicode_mod, /*nb_remainder*/ 12457}; 12458 12459static PySequenceMethods unicode_as_sequence = { 12460 (lenfunc) unicode_length, /* sq_length */ 12461 PyUnicode_Concat, /* sq_concat */ 12462 (ssizeargfunc) unicode_repeat, /* sq_repeat */ 12463 (ssizeargfunc) unicode_getitem, /* sq_item */ 12464 0, /* sq_slice */ 12465 0, /* sq_ass_item */ 12466 0, /* sq_ass_slice */ 12467 PyUnicode_Contains, /* sq_contains */ 12468}; 12469 12470static PyObject* 12471unicode_subscript(PyUnicodeObject* self, PyObject* item) 12472{ 12473 if (PyUnicode_READY(self) == -1) 12474 return NULL; 12475 12476 if (PyIndex_Check(item)) { 12477 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError); 12478 if (i == -1 && PyErr_Occurred()) 12479 return NULL; 12480 if (i < 0) 12481 i += PyUnicode_GET_LENGTH(self); 12482 return unicode_getitem((PyObject*)self, i); 12483 } else if (PySlice_Check(item)) { 12484 Py_ssize_t start, stop, step, slicelength, cur, i; 12485 PyObject *result; 12486 void *src_data, *dest_data; 12487 int src_kind, dest_kind; 12488 Py_UCS4 ch, max_char, kind_limit; 12489 12490 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self), 12491 &start, &stop, &step, &slicelength) < 0) { 12492 return NULL; 12493 } 12494 12495 if (slicelength <= 0) { 12496 return PyUnicode_New(0, 0); 12497 } else if (start == 0 && step == 1 && 12498 slicelength == PyUnicode_GET_LENGTH(self) && 12499 PyUnicode_CheckExact(self)) { 12500 Py_INCREF(self); 12501 return (PyObject *)self; 12502 } 
else if (step == 1) { 12503 return PyUnicode_Substring((PyObject*)self, 12504 start, start + slicelength); 12505 } 12506 /* General case */ 12507 max_char = 0; 12508 src_kind = PyUnicode_KIND(self); 12509 kind_limit = kind_maxchar_limit(src_kind); 12510 src_data = PyUnicode_DATA(self); 12511 for (cur = start, i = 0; i < slicelength; cur += step, i++) { 12512 ch = PyUnicode_READ(src_kind, src_data, cur); 12513 if (ch > max_char) { 12514 max_char = ch; 12515 if (max_char >= kind_limit) 12516 break; 12517 } 12518 } 12519 result = PyUnicode_New(slicelength, max_char); 12520 if (result == NULL) 12521 return NULL; 12522 dest_kind = PyUnicode_KIND(result); 12523 dest_data = PyUnicode_DATA(result); 12524 12525 for (cur = start, i = 0; i < slicelength; cur += step, i++) { 12526 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur); 12527 PyUnicode_WRITE(dest_kind, dest_data, i, ch); 12528 } 12529 assert(_PyUnicode_CheckConsistency(result, 1)); 12530 return result; 12531 } else { 12532 PyErr_SetString(PyExc_TypeError, "string indices must be integers"); 12533 return NULL; 12534 } 12535} 12536 12537static PyMappingMethods unicode_as_mapping = { 12538 (lenfunc)unicode_length, /* mp_length */ 12539 (binaryfunc)unicode_subscript, /* mp_subscript */ 12540 (objobjargproc)0, /* mp_ass_subscript */ 12541}; 12542 12543 12544/* Helpers for PyUnicode_Format() */ 12545 12546static PyObject * 12547getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx) 12548{ 12549 Py_ssize_t argidx = *p_argidx; 12550 if (argidx < arglen) { 12551 (*p_argidx)++; 12552 if (arglen < 0) 12553 return args; 12554 else 12555 return PyTuple_GetItem(args, argidx); 12556 } 12557 PyErr_SetString(PyExc_TypeError, 12558 "not enough arguments for format string"); 12559 return NULL; 12560} 12561 12562/* Returns a new reference to a PyUnicode object, or NULL on failure. 
*/ 12563 12564static PyObject * 12565formatfloat(PyObject *v, int flags, int prec, int type) 12566{ 12567 char *p; 12568 PyObject *result; 12569 double x; 12570 12571 x = PyFloat_AsDouble(v); 12572 if (x == -1.0 && PyErr_Occurred()) 12573 return NULL; 12574 12575 if (prec < 0) 12576 prec = 6; 12577 12578 p = PyOS_double_to_string(x, type, prec, 12579 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL); 12580 if (p == NULL) 12581 return NULL; 12582 result = PyUnicode_DecodeASCII(p, strlen(p), NULL); 12583 PyMem_Free(p); 12584 return result; 12585} 12586 12587static PyObject* 12588formatlong(PyObject *val, int flags, int prec, int type) 12589{ 12590 char *buf; 12591 int len; 12592 PyObject *str; /* temporary string object. */ 12593 PyObject *result; 12594 12595 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len); 12596 if (!str) 12597 return NULL; 12598 result = PyUnicode_DecodeASCII(buf, len, NULL); 12599 Py_DECREF(str); 12600 return result; 12601} 12602 12603static int 12604formatchar(Py_UCS4 *buf, 12605 size_t buflen, 12606 PyObject *v) 12607{ 12608 /* presume that the buffer is at least 3 characters long */ 12609 if (PyUnicode_Check(v)) { 12610 if (PyUnicode_GET_LENGTH(v) == 1) { 12611 buf[0] = PyUnicode_READ_CHAR(v, 0); 12612 buf[1] = '\0'; 12613 return 1; 12614 } 12615 goto onError; 12616 } 12617 else { 12618 /* Integer input truncated to a character */ 12619 long x; 12620 x = PyLong_AsLong(v); 12621 if (x == -1 && PyErr_Occurred()) 12622 goto onError; 12623 12624 if (x < 0 || x > 0x10ffff) { 12625 PyErr_SetString(PyExc_OverflowError, 12626 "%c arg not in range(0x110000)"); 12627 return -1; 12628 } 12629 12630 buf[0] = (Py_UCS4) x; 12631 buf[1] = '\0'; 12632 return 1; 12633 } 12634 12635 onError: 12636 PyErr_SetString(PyExc_TypeError, 12637 "%c requires int or char"); 12638 return -1; 12639} 12640 12641/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...) 12642 FORMATBUFLEN is the length of the buffer in which chars are formatted. 
12643*/ 12644#define FORMATBUFLEN (size_t)10 12645 12646PyObject * 12647PyUnicode_Format(PyObject *format, PyObject *args) 12648{ 12649 void *fmt; 12650 int fmtkind; 12651 PyObject *result; 12652 Py_UCS4 *res, *res0; 12653 Py_UCS4 max; 12654 int kind; 12655 Py_ssize_t fmtcnt, fmtpos, rescnt, reslen, arglen, argidx; 12656 int args_owned = 0; 12657 PyObject *dict = NULL; 12658 PyUnicodeObject *uformat; 12659 12660 if (format == NULL || args == NULL) { 12661 PyErr_BadInternalCall(); 12662 return NULL; 12663 } 12664 uformat = (PyUnicodeObject*)PyUnicode_FromObject(format); 12665 if (uformat == NULL || PyUnicode_READY(uformat) == -1) 12666 return NULL; 12667 fmt = PyUnicode_DATA(uformat); 12668 fmtkind = PyUnicode_KIND(uformat); 12669 fmtcnt = PyUnicode_GET_LENGTH(uformat); 12670 fmtpos = 0; 12671 12672 reslen = rescnt = fmtcnt + 100; 12673 res = res0 = PyMem_Malloc(reslen * sizeof(Py_UCS4)); 12674 if (res0 == NULL) { 12675 PyErr_NoMemory(); 12676 goto onError; 12677 } 12678 12679 if (PyTuple_Check(args)) { 12680 arglen = PyTuple_Size(args); 12681 argidx = 0; 12682 } 12683 else { 12684 arglen = -1; 12685 argidx = -2; 12686 } 12687 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) && 12688 !PyUnicode_Check(args)) 12689 dict = args; 12690 12691 while (--fmtcnt >= 0) { 12692 if (PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') { 12693 if (--rescnt < 0) { 12694 rescnt = fmtcnt + 100; 12695 reslen += rescnt; 12696 res0 = PyMem_Realloc(res0, reslen*sizeof(Py_UCS4)); 12697 if (res0 == NULL){ 12698 PyErr_NoMemory(); 12699 goto onError; 12700 } 12701 res = res0 + reslen - rescnt; 12702 --rescnt; 12703 } 12704 *res++ = PyUnicode_READ(fmtkind, fmt, fmtpos++); 12705 } 12706 else { 12707 /* Got a format specifier */ 12708 int flags = 0; 12709 Py_ssize_t width = -1; 12710 int prec = -1; 12711 Py_UCS4 c = '\0'; 12712 Py_UCS4 fill; 12713 int isnumok; 12714 PyObject *v = NULL; 12715 PyObject *temp = NULL; 12716 void *pbuf; 12717 Py_ssize_t pindex; 12718 Py_UNICODE sign; 12719 
Py_ssize_t len, len1; 12720 Py_UCS4 formatbuf[FORMATBUFLEN]; /* For formatchar() */ 12721 12722 fmtpos++; 12723 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(') { 12724 Py_ssize_t keystart; 12725 Py_ssize_t keylen; 12726 PyObject *key; 12727 int pcount = 1; 12728 12729 if (dict == NULL) { 12730 PyErr_SetString(PyExc_TypeError, 12731 "format requires a mapping"); 12732 goto onError; 12733 } 12734 ++fmtpos; 12735 --fmtcnt; 12736 keystart = fmtpos; 12737 /* Skip over balanced parentheses */ 12738 while (pcount > 0 && --fmtcnt >= 0) { 12739 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == ')') 12740 --pcount; 12741 else if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(') 12742 ++pcount; 12743 fmtpos++; 12744 } 12745 keylen = fmtpos - keystart - 1; 12746 if (fmtcnt < 0 || pcount > 0) { 12747 PyErr_SetString(PyExc_ValueError, 12748 "incomplete format key"); 12749 goto onError; 12750 } 12751 key = PyUnicode_Substring((PyObject*)uformat, 12752 keystart, keystart + keylen); 12753 if (key == NULL) 12754 goto onError; 12755 if (args_owned) { 12756 Py_DECREF(args); 12757 args_owned = 0; 12758 } 12759 args = PyObject_GetItem(dict, key); 12760 Py_DECREF(key); 12761 if (args == NULL) { 12762 goto onError; 12763 } 12764 args_owned = 1; 12765 arglen = -1; 12766 argidx = -2; 12767 } 12768 while (--fmtcnt >= 0) { 12769 switch (c = PyUnicode_READ(fmtkind, fmt, fmtpos++)) { 12770 case '-': flags |= F_LJUST; continue; 12771 case '+': flags |= F_SIGN; continue; 12772 case ' ': flags |= F_BLANK; continue; 12773 case '#': flags |= F_ALT; continue; 12774 case '0': flags |= F_ZERO; continue; 12775 } 12776 break; 12777 } 12778 if (c == '*') { 12779 v = getnextarg(args, arglen, &argidx); 12780 if (v == NULL) 12781 goto onError; 12782 if (!PyLong_Check(v)) { 12783 PyErr_SetString(PyExc_TypeError, 12784 "* wants int"); 12785 goto onError; 12786 } 12787 width = PyLong_AsLong(v); 12788 if (width == -1 && PyErr_Occurred()) 12789 goto onError; 12790 if (width < 0) { 12791 flags |= F_LJUST; 12792 width = 
-width; 12793 } 12794 if (--fmtcnt >= 0) 12795 c = PyUnicode_READ(fmtkind, fmt, fmtpos++); 12796 } 12797 else if (c >= '0' && c <= '9') { 12798 width = c - '0'; 12799 while (--fmtcnt >= 0) { 12800 c = PyUnicode_READ(fmtkind, fmt, fmtpos++); 12801 if (c < '0' || c > '9') 12802 break; 12803 if ((width*10) / 10 != width) { 12804 PyErr_SetString(PyExc_ValueError, 12805 "width too big"); 12806 goto onError; 12807 } 12808 width = width*10 + (c - '0'); 12809 } 12810 } 12811 if (c == '.') { 12812 prec = 0; 12813 if (--fmtcnt >= 0) 12814 c = PyUnicode_READ(fmtkind, fmt, fmtpos++); 12815 if (c == '*') { 12816 v = getnextarg(args, arglen, &argidx); 12817 if (v == NULL) 12818 goto onError; 12819 if (!PyLong_Check(v)) { 12820 PyErr_SetString(PyExc_TypeError, 12821 "* wants int"); 12822 goto onError; 12823 } 12824 prec = PyLong_AsLong(v); 12825 if (prec == -1 && PyErr_Occurred()) 12826 goto onError; 12827 if (prec < 0) 12828 prec = 0; 12829 if (--fmtcnt >= 0) 12830 c = PyUnicode_READ(fmtkind, fmt, fmtpos++); 12831 } 12832 else if (c >= '0' && c <= '9') { 12833 prec = c - '0'; 12834 while (--fmtcnt >= 0) { 12835 c = PyUnicode_READ(fmtkind, fmt, fmtpos++); 12836 if (c < '0' || c > '9') 12837 break; 12838 if ((prec*10) / 10 != prec) { 12839 PyErr_SetString(PyExc_ValueError, 12840 "prec too big"); 12841 goto onError; 12842 } 12843 prec = prec*10 + (c - '0'); 12844 } 12845 } 12846 } /* prec */ 12847 if (fmtcnt >= 0) { 12848 if (c == 'h' || c == 'l' || c == 'L') { 12849 if (--fmtcnt >= 0) 12850 c = PyUnicode_READ(fmtkind, fmt, fmtpos++); 12851 } 12852 } 12853 if (fmtcnt < 0) { 12854 PyErr_SetString(PyExc_ValueError, 12855 "incomplete format"); 12856 goto onError; 12857 } 12858 if (c != '%') { 12859 v = getnextarg(args, arglen, &argidx); 12860 if (v == NULL) 12861 goto onError; 12862 } 12863 sign = 0; 12864 fill = ' '; 12865 switch (c) { 12866 12867 case '%': 12868 pbuf = formatbuf; 12869 kind = PyUnicode_4BYTE_KIND; 12870 /* presume that buffer length is at least 1 */ 12871 
PyUnicode_WRITE(kind, pbuf, 0, '%'); 12872 len = 1; 12873 break; 12874 12875 case 's': 12876 case 'r': 12877 case 'a': 12878 if (PyUnicode_CheckExact(v) && c == 's') { 12879 temp = v; 12880 Py_INCREF(temp); 12881 } 12882 else { 12883 if (c == 's') 12884 temp = PyObject_Str(v); 12885 else if (c == 'r') 12886 temp = PyObject_Repr(v); 12887 else 12888 temp = PyObject_ASCII(v); 12889 if (temp == NULL) 12890 goto onError; 12891 if (PyUnicode_Check(temp)) 12892 /* nothing to do */; 12893 else { 12894 Py_DECREF(temp); 12895 PyErr_SetString(PyExc_TypeError, 12896 "%s argument has non-string str()"); 12897 goto onError; 12898 } 12899 } 12900 if (PyUnicode_READY(temp) == -1) { 12901 Py_CLEAR(temp); 12902 goto onError; 12903 } 12904 pbuf = PyUnicode_DATA(temp); 12905 kind = PyUnicode_KIND(temp); 12906 len = PyUnicode_GET_LENGTH(temp); 12907 if (prec >= 0 && len > prec) 12908 len = prec; 12909 break; 12910 12911 case 'i': 12912 case 'd': 12913 case 'u': 12914 case 'o': 12915 case 'x': 12916 case 'X': 12917 isnumok = 0; 12918 if (PyNumber_Check(v)) { 12919 PyObject *iobj=NULL; 12920 12921 if (PyLong_Check(v)) { 12922 iobj = v; 12923 Py_INCREF(iobj); 12924 } 12925 else { 12926 iobj = PyNumber_Long(v); 12927 } 12928 if (iobj!=NULL) { 12929 if (PyLong_Check(iobj)) { 12930 isnumok = 1; 12931 temp = formatlong(iobj, flags, prec, (c == 'i'? 
'd': c)); 12932 Py_DECREF(iobj); 12933 if (!temp) 12934 goto onError; 12935 if (PyUnicode_READY(temp) == -1) { 12936 Py_CLEAR(temp); 12937 goto onError; 12938 } 12939 pbuf = PyUnicode_DATA(temp); 12940 kind = PyUnicode_KIND(temp); 12941 len = PyUnicode_GET_LENGTH(temp); 12942 sign = 1; 12943 } 12944 else { 12945 Py_DECREF(iobj); 12946 } 12947 } 12948 } 12949 if (!isnumok) { 12950 PyErr_Format(PyExc_TypeError, 12951 "%%%c format: a number is required, " 12952 "not %.200s", (char)c, Py_TYPE(v)->tp_name); 12953 goto onError; 12954 } 12955 if (flags & F_ZERO) 12956 fill = '0'; 12957 break; 12958 12959 case 'e': 12960 case 'E': 12961 case 'f': 12962 case 'F': 12963 case 'g': 12964 case 'G': 12965 temp = formatfloat(v, flags, prec, c); 12966 if (!temp) 12967 goto onError; 12968 if (PyUnicode_READY(temp) == -1) { 12969 Py_CLEAR(temp); 12970 goto onError; 12971 } 12972 pbuf = PyUnicode_DATA(temp); 12973 kind = PyUnicode_KIND(temp); 12974 len = PyUnicode_GET_LENGTH(temp); 12975 sign = 1; 12976 if (flags & F_ZERO) 12977 fill = '0'; 12978 break; 12979 12980 case 'c': 12981 pbuf = formatbuf; 12982 kind = PyUnicode_4BYTE_KIND; 12983 len = formatchar(pbuf, Py_ARRAY_LENGTH(formatbuf), v); 12984 if (len < 0) 12985 goto onError; 12986 break; 12987 12988 default: 12989 PyErr_Format(PyExc_ValueError, 12990 "unsupported format character '%c' (0x%x) " 12991 "at index %zd", 12992 (31<=c && c<=126) ? (char)c : '?', 12993 (int)c, 12994 fmtpos - 1); 12995 goto onError; 12996 } 12997 /* pbuf is initialized here. 
*/ 12998 pindex = 0; 12999 if (sign) { 13000 if (PyUnicode_READ(kind, pbuf, pindex) == '-' || 13001 PyUnicode_READ(kind, pbuf, pindex) == '+') { 13002 sign = PyUnicode_READ(kind, pbuf, pindex++); 13003 len--; 13004 } 13005 else if (flags & F_SIGN) 13006 sign = '+'; 13007 else if (flags & F_BLANK) 13008 sign = ' '; 13009 else 13010 sign = 0; 13011 } 13012 if (width < len) 13013 width = len; 13014 if (rescnt - (sign != 0) < width) { 13015 reslen -= rescnt; 13016 rescnt = width + fmtcnt + 100; 13017 reslen += rescnt; 13018 if (reslen < 0) { 13019 Py_XDECREF(temp); 13020 PyErr_NoMemory(); 13021 goto onError; 13022 } 13023 res0 = PyMem_Realloc(res0, reslen*sizeof(Py_UCS4)); 13024 if (res0 == 0) { 13025 PyErr_NoMemory(); 13026 Py_XDECREF(temp); 13027 goto onError; 13028 } 13029 res = res0 + reslen - rescnt; 13030 } 13031 if (sign) { 13032 if (fill != ' ') 13033 *res++ = sign; 13034 rescnt--; 13035 if (width > len) 13036 width--; 13037 } 13038 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) { 13039 assert(PyUnicode_READ(kind, pbuf, pindex) == '0'); 13040 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c); 13041 if (fill != ' ') { 13042 *res++ = PyUnicode_READ(kind, pbuf, pindex++); 13043 *res++ = PyUnicode_READ(kind, pbuf, pindex++); 13044 } 13045 rescnt -= 2; 13046 width -= 2; 13047 if (width < 0) 13048 width = 0; 13049 len -= 2; 13050 } 13051 if (width > len && !(flags & F_LJUST)) { 13052 do { 13053 --rescnt; 13054 *res++ = fill; 13055 } while (--width > len); 13056 } 13057 if (fill == ' ') { 13058 if (sign) 13059 *res++ = sign; 13060 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) { 13061 assert(PyUnicode_READ(kind, pbuf, pindex) == '0'); 13062 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c); 13063 *res++ = PyUnicode_READ(kind, pbuf, pindex++); 13064 *res++ = PyUnicode_READ(kind, pbuf, pindex++); 13065 } 13066 } 13067 /* Copy all characters, preserving len */ 13068 len1 = len; 13069 while (len1--) { 13070 *res++ = PyUnicode_READ(kind, pbuf, 
pindex++); 13071 rescnt--; 13072 } 13073 while (--width >= len) { 13074 --rescnt; 13075 *res++ = ' '; 13076 } 13077 if (dict && (argidx < arglen) && c != '%') { 13078 PyErr_SetString(PyExc_TypeError, 13079 "not all arguments converted during string formatting"); 13080 Py_XDECREF(temp); 13081 goto onError; 13082 } 13083 Py_XDECREF(temp); 13084 } /* '%' */ 13085 } /* until end */ 13086 if (argidx < arglen && !dict) { 13087 PyErr_SetString(PyExc_TypeError, 13088 "not all arguments converted during string formatting"); 13089 goto onError; 13090 } 13091 13092 13093 for (max=0, res = res0; res < res0+reslen-rescnt; res++) 13094 if (*res > max) 13095 max = *res; 13096 result = PyUnicode_New(reslen - rescnt, max); 13097 if (!result) 13098 goto onError; 13099 kind = PyUnicode_KIND(result); 13100 for (res = res0; res < res0+reslen-rescnt; res++) 13101 PyUnicode_WRITE(kind, PyUnicode_DATA(result), res-res0, *res); 13102 PyMem_Free(res0); 13103 if (args_owned) { 13104 Py_DECREF(args); 13105 } 13106 Py_DECREF(uformat); 13107 assert(_PyUnicode_CheckConsistency(result, 1)); 13108 return (PyObject *)result; 13109 13110 onError: 13111 PyMem_Free(res0); 13112 Py_DECREF(uformat); 13113 if (args_owned) { 13114 Py_DECREF(args); 13115 } 13116 return NULL; 13117} 13118 13119static PyObject * 13120unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds); 13121 13122static PyObject * 13123unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds) 13124{ 13125 PyObject *x = NULL; 13126 static char *kwlist[] = {"object", "encoding", "errors", 0}; 13127 char *encoding = NULL; 13128 char *errors = NULL; 13129 13130 if (type != &PyUnicode_Type) 13131 return unicode_subtype_new(type, args, kwds); 13132 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str", 13133 kwlist, &x, &encoding, &errors)) 13134 return NULL; 13135 if (x == NULL) 13136 return (PyObject *)PyUnicode_New(0, 0); 13137 if (encoding == NULL && errors == NULL) 13138 return PyObject_Str(x); 13139 else 13140 
return PyUnicode_FromEncodedObject(x, encoding, errors); 13141} 13142 13143static PyObject * 13144unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds) 13145{ 13146 PyUnicodeObject *unicode, *self; 13147 Py_ssize_t length, char_size; 13148 int share_wstr, share_utf8; 13149 unsigned int kind; 13150 void *data; 13151 13152 assert(PyType_IsSubtype(type, &PyUnicode_Type)); 13153 13154 unicode = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds); 13155 if (unicode == NULL) 13156 return NULL; 13157 assert(_PyUnicode_CHECK(unicode)); 13158 if (PyUnicode_READY(unicode)) 13159 return NULL; 13160 13161 self = (PyUnicodeObject *) type->tp_alloc(type, 0); 13162 if (self == NULL) { 13163 Py_DECREF(unicode); 13164 return NULL; 13165 } 13166 kind = PyUnicode_KIND(unicode); 13167 length = PyUnicode_GET_LENGTH(unicode); 13168 13169 _PyUnicode_LENGTH(self) = length; 13170#ifdef Py_DEBUG 13171 _PyUnicode_HASH(self) = -1; 13172#else 13173 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode); 13174#endif 13175 _PyUnicode_STATE(self).interned = 0; 13176 _PyUnicode_STATE(self).kind = kind; 13177 _PyUnicode_STATE(self).compact = 0; 13178 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii; 13179 _PyUnicode_STATE(self).ready = 1; 13180 _PyUnicode_WSTR(self) = NULL; 13181 _PyUnicode_UTF8_LENGTH(self) = 0; 13182 _PyUnicode_UTF8(self) = NULL; 13183 _PyUnicode_WSTR_LENGTH(self) = 0; 13184 _PyUnicode_DATA_ANY(self) = NULL; 13185 13186 share_utf8 = 0; 13187 share_wstr = 0; 13188 if (kind == PyUnicode_1BYTE_KIND) { 13189 char_size = 1; 13190 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128) 13191 share_utf8 = 1; 13192 } 13193 else if (kind == PyUnicode_2BYTE_KIND) { 13194 char_size = 2; 13195 if (sizeof(wchar_t) == 2) 13196 share_wstr = 1; 13197 } 13198 else { 13199 assert(kind == PyUnicode_4BYTE_KIND); 13200 char_size = 4; 13201 if (sizeof(wchar_t) == 4) 13202 share_wstr = 1; 13203 } 13204 13205 /* Ensure we won't overflow the length. 
*/ 13206 if (length > (PY_SSIZE_T_MAX / char_size - 1)) { 13207 PyErr_NoMemory(); 13208 goto onError; 13209 } 13210 data = PyObject_MALLOC((length + 1) * char_size); 13211 if (data == NULL) { 13212 PyErr_NoMemory(); 13213 goto onError; 13214 } 13215 13216 _PyUnicode_DATA_ANY(self) = data; 13217 if (share_utf8) { 13218 _PyUnicode_UTF8_LENGTH(self) = length; 13219 _PyUnicode_UTF8(self) = data; 13220 } 13221 if (share_wstr) { 13222 _PyUnicode_WSTR_LENGTH(self) = length; 13223 _PyUnicode_WSTR(self) = (wchar_t *)data; 13224 } 13225 13226 Py_MEMCPY(data, PyUnicode_DATA(unicode), 13227 PyUnicode_KIND_SIZE(kind, length + 1)); 13228 Py_DECREF(unicode); 13229 assert(_PyUnicode_CheckConsistency(self, 1)); 13230#ifdef Py_DEBUG 13231 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode); 13232#endif 13233 return (PyObject *)self; 13234 13235onError: 13236 Py_DECREF(unicode); 13237 Py_DECREF(self); 13238 return NULL; 13239} 13240 13241PyDoc_STRVAR(unicode_doc, 13242 "str(string[, encoding[, errors]]) -> str\n\ 13243\n\ 13244Create a new string object from the given encoded string.\n\ 13245encoding defaults to the current default string encoding.\n\ 13246errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'."); 13247 13248static PyObject *unicode_iter(PyObject *seq); 13249 13250PyTypeObject PyUnicode_Type = { 13251 PyVarObject_HEAD_INIT(&PyType_Type, 0) 13252 "str", /* tp_name */ 13253 sizeof(PyUnicodeObject), /* tp_size */ 13254 0, /* tp_itemsize */ 13255 /* Slots */ 13256 (destructor)unicode_dealloc, /* tp_dealloc */ 13257 0, /* tp_print */ 13258 0, /* tp_getattr */ 13259 0, /* tp_setattr */ 13260 0, /* tp_reserved */ 13261 unicode_repr, /* tp_repr */ 13262 &unicode_as_number, /* tp_as_number */ 13263 &unicode_as_sequence, /* tp_as_sequence */ 13264 &unicode_as_mapping, /* tp_as_mapping */ 13265 (hashfunc) unicode_hash, /* tp_hash*/ 13266 0, /* tp_call*/ 13267 (reprfunc) unicode_str, /* tp_str */ 13268 PyObject_GenericGetAttr, /* tp_getattro */ 13269 0, /* 
tp_setattro */ 13270 0, /* tp_as_buffer */ 13271 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE | 13272 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */ 13273 unicode_doc, /* tp_doc */ 13274 0, /* tp_traverse */ 13275 0, /* tp_clear */ 13276 PyUnicode_RichCompare, /* tp_richcompare */ 13277 0, /* tp_weaklistoffset */ 13278 unicode_iter, /* tp_iter */ 13279 0, /* tp_iternext */ 13280 unicode_methods, /* tp_methods */ 13281 0, /* tp_members */ 13282 0, /* tp_getset */ 13283 &PyBaseObject_Type, /* tp_base */ 13284 0, /* tp_dict */ 13285 0, /* tp_descr_get */ 13286 0, /* tp_descr_set */ 13287 0, /* tp_dictoffset */ 13288 0, /* tp_init */ 13289 0, /* tp_alloc */ 13290 unicode_new, /* tp_new */ 13291 PyObject_Del, /* tp_free */ 13292}; 13293 13294/* Initialize the Unicode implementation */ 13295 13296void _PyUnicode_Init(void) 13297{ 13298 int i; 13299 13300 /* XXX - move this array to unicodectype.c ? */ 13301 Py_UCS2 linebreak[] = { 13302 0x000A, /* LINE FEED */ 13303 0x000D, /* CARRIAGE RETURN */ 13304 0x001C, /* FILE SEPARATOR */ 13305 0x001D, /* GROUP SEPARATOR */ 13306 0x001E, /* RECORD SEPARATOR */ 13307 0x0085, /* NEXT LINE */ 13308 0x2028, /* LINE SEPARATOR */ 13309 0x2029, /* PARAGRAPH SEPARATOR */ 13310 }; 13311 13312 /* Init the implementation */ 13313 unicode_empty = PyUnicode_New(0, 0); 13314 assert(_PyUnicode_CheckConsistency(unicode_empty, 1)); 13315 if (!unicode_empty) 13316 Py_FatalError("Can't create empty string"); 13317 13318 for (i = 0; i < 256; i++) 13319 unicode_latin1[i] = NULL; 13320 if (PyType_Ready(&PyUnicode_Type) < 0) 13321 Py_FatalError("Can't initialize 'unicode'"); 13322 13323 /* initialize the linebreak bloom filter */ 13324 bloom_linebreak = make_bloom_mask( 13325 PyUnicode_2BYTE_KIND, linebreak, 13326 Py_ARRAY_LENGTH(linebreak)); 13327 13328 PyType_Ready(&EncodingMapType); 13329} 13330 13331/* Finalize the Unicode implementation */ 13332 13333int 13334PyUnicode_ClearFreeList(void) 13335{ 13336 return 0; 13337} 13338 13339void 
13340_PyUnicode_Fini(void) 13341{ 13342 int i; 13343 13344 Py_XDECREF(unicode_empty); 13345 unicode_empty = NULL; 13346 13347 for (i = 0; i < 256; i++) { 13348 if (unicode_latin1[i]) { 13349 Py_DECREF(unicode_latin1[i]); 13350 unicode_latin1[i] = NULL; 13351 } 13352 } 13353 (void)PyUnicode_ClearFreeList(); 13354} 13355 13356void 13357PyUnicode_InternInPlace(PyObject **p) 13358{ 13359 register PyUnicodeObject *s = (PyUnicodeObject *)(*p); 13360 PyObject *t; 13361#ifdef Py_DEBUG 13362 assert(s != NULL); 13363 assert(_PyUnicode_CHECK(s)); 13364#else 13365 if (s == NULL || !PyUnicode_Check(s)) 13366 return; 13367#endif 13368 /* If it's a subclass, we don't really know what putting 13369 it in the interned dict might do. */ 13370 if (!PyUnicode_CheckExact(s)) 13371 return; 13372 if (PyUnicode_CHECK_INTERNED(s)) 13373 return; 13374 if (_PyUnicode_READY_REPLACE(p)) { 13375 assert(0 && "_PyUnicode_READY_REPLACE fail in PyUnicode_InternInPlace"); 13376 return; 13377 } 13378 s = (PyUnicodeObject *)(*p); 13379 if (interned == NULL) { 13380 interned = PyDict_New(); 13381 if (interned == NULL) { 13382 PyErr_Clear(); /* Don't leave an exception */ 13383 return; 13384 } 13385 } 13386 /* It might be that the GetItem call fails even 13387 though the key is present in the dictionary, 13388 namely when this happens during a stack overflow. */ 13389 Py_ALLOW_RECURSION 13390 t = PyDict_GetItem(interned, (PyObject *)s); 13391 Py_END_ALLOW_RECURSION 13392 13393 if (t) { 13394 Py_INCREF(t); 13395 Py_DECREF(*p); 13396 *p = t; 13397 return; 13398 } 13399 13400 PyThreadState_GET()->recursion_critical = 1; 13401 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) { 13402 PyErr_Clear(); 13403 PyThreadState_GET()->recursion_critical = 0; 13404 return; 13405 } 13406 PyThreadState_GET()->recursion_critical = 0; 13407 /* The two references in interned are not counted by refcnt. 
13408 The deallocator will take care of this */ 13409 Py_REFCNT(s) -= 2; 13410 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL; 13411} 13412 13413void 13414PyUnicode_InternImmortal(PyObject **p) 13415{ 13416 PyUnicodeObject *u = (PyUnicodeObject *)*p; 13417 13418 PyUnicode_InternInPlace(p); 13419 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) { 13420 _PyUnicode_STATE(u).interned = SSTATE_INTERNED_IMMORTAL; 13421 Py_INCREF(*p); 13422 } 13423} 13424 13425PyObject * 13426PyUnicode_InternFromString(const char *cp) 13427{ 13428 PyObject *s = PyUnicode_FromString(cp); 13429 if (s == NULL) 13430 return NULL; 13431 PyUnicode_InternInPlace(&s); 13432 return s; 13433} 13434 13435void 13436_Py_ReleaseInternedUnicodeStrings(void) 13437{ 13438 PyObject *keys; 13439 PyUnicodeObject *s; 13440 Py_ssize_t i, n; 13441 Py_ssize_t immortal_size = 0, mortal_size = 0; 13442 13443 if (interned == NULL || !PyDict_Check(interned)) 13444 return; 13445 keys = PyDict_Keys(interned); 13446 if (keys == NULL || !PyList_Check(keys)) { 13447 PyErr_Clear(); 13448 return; 13449 } 13450 13451 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak 13452 detector, interned unicode strings are not forcibly deallocated; 13453 rather, we give them their stolen references back, and then clear 13454 and DECREF the interned dict. 
*/ 13455 13456 n = PyList_GET_SIZE(keys); 13457 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n", 13458 n); 13459 for (i = 0; i < n; i++) { 13460 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i); 13461 if (PyUnicode_READY(s) == -1) { 13462 assert(0 && "could not ready string"); 13463 fprintf(stderr, "could not ready string\n"); 13464 } 13465 switch (PyUnicode_CHECK_INTERNED(s)) { 13466 case SSTATE_NOT_INTERNED: 13467 /* XXX Shouldn't happen */ 13468 break; 13469 case SSTATE_INTERNED_IMMORTAL: 13470 Py_REFCNT(s) += 1; 13471 immortal_size += PyUnicode_GET_LENGTH(s); 13472 break; 13473 case SSTATE_INTERNED_MORTAL: 13474 Py_REFCNT(s) += 2; 13475 mortal_size += PyUnicode_GET_LENGTH(s); 13476 break; 13477 default: 13478 Py_FatalError("Inconsistent interned string state."); 13479 } 13480 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED; 13481 } 13482 fprintf(stderr, "total size of all interned strings: " 13483 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d " 13484 "mortal/immortal\n", mortal_size, immortal_size); 13485 Py_DECREF(keys); 13486 PyDict_Clear(interned); 13487 Py_DECREF(interned); 13488 interned = NULL; 13489} 13490 13491 13492/********************* Unicode Iterator **************************/ 13493 13494typedef struct { 13495 PyObject_HEAD 13496 Py_ssize_t it_index; 13497 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */ 13498} unicodeiterobject; 13499 13500static void 13501unicodeiter_dealloc(unicodeiterobject *it) 13502{ 13503 _PyObject_GC_UNTRACK(it); 13504 Py_XDECREF(it->it_seq); 13505 PyObject_GC_Del(it); 13506} 13507 13508static int 13509unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg) 13510{ 13511 Py_VISIT(it->it_seq); 13512 return 0; 13513} 13514 13515static PyObject * 13516unicodeiter_next(unicodeiterobject *it) 13517{ 13518 PyUnicodeObject *seq; 13519 PyObject *item; 13520 13521 assert(it != NULL); 13522 seq = it->it_seq; 13523 if (seq == NULL) 13524 return NULL; 13525 
assert(_PyUnicode_CHECK(seq)); 13526 13527 if (it->it_index < PyUnicode_GET_LENGTH(seq)) { 13528 int kind = PyUnicode_KIND(seq); 13529 void *data = PyUnicode_DATA(seq); 13530 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index); 13531 item = PyUnicode_FromOrdinal(chr); 13532 if (item != NULL) 13533 ++it->it_index; 13534 return item; 13535 } 13536 13537 Py_DECREF(seq); 13538 it->it_seq = NULL; 13539 return NULL; 13540} 13541 13542static PyObject * 13543unicodeiter_len(unicodeiterobject *it) 13544{ 13545 Py_ssize_t len = 0; 13546 if (it->it_seq) 13547 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index; 13548 return PyLong_FromSsize_t(len); 13549} 13550 13551PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it))."); 13552 13553static PyMethodDef unicodeiter_methods[] = { 13554 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS, 13555 length_hint_doc}, 13556 {NULL, NULL} /* sentinel */ 13557}; 13558 13559PyTypeObject PyUnicodeIter_Type = { 13560 PyVarObject_HEAD_INIT(&PyType_Type, 0) 13561 "str_iterator", /* tp_name */ 13562 sizeof(unicodeiterobject), /* tp_basicsize */ 13563 0, /* tp_itemsize */ 13564 /* methods */ 13565 (destructor)unicodeiter_dealloc, /* tp_dealloc */ 13566 0, /* tp_print */ 13567 0, /* tp_getattr */ 13568 0, /* tp_setattr */ 13569 0, /* tp_reserved */ 13570 0, /* tp_repr */ 13571 0, /* tp_as_number */ 13572 0, /* tp_as_sequence */ 13573 0, /* tp_as_mapping */ 13574 0, /* tp_hash */ 13575 0, /* tp_call */ 13576 0, /* tp_str */ 13577 PyObject_GenericGetAttr, /* tp_getattro */ 13578 0, /* tp_setattro */ 13579 0, /* tp_as_buffer */ 13580 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */ 13581 0, /* tp_doc */ 13582 (traverseproc)unicodeiter_traverse, /* tp_traverse */ 13583 0, /* tp_clear */ 13584 0, /* tp_richcompare */ 13585 0, /* tp_weaklistoffset */ 13586 PyObject_SelfIter, /* tp_iter */ 13587 (iternextfunc)unicodeiter_next, /* tp_iternext */ 13588 unicodeiter_methods, /* tp_methods */ 13589 0, 
13590}; 13591 13592static PyObject * 13593unicode_iter(PyObject *seq) 13594{ 13595 unicodeiterobject *it; 13596 13597 if (!PyUnicode_Check(seq)) { 13598 PyErr_BadInternalCall(); 13599 return NULL; 13600 } 13601 if (PyUnicode_READY(seq) == -1) 13602 return NULL; 13603 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type); 13604 if (it == NULL) 13605 return NULL; 13606 it->it_index = 0; 13607 Py_INCREF(seq); 13608 it->it_seq = (PyUnicodeObject *)seq; 13609 _PyObject_GC_TRACK(it); 13610 return (PyObject *)it; 13611} 13612 13613#define UNIOP(x) Py_UNICODE_##x 13614#define UNIOP_t Py_UNICODE 13615#include "uniops.h" 13616#undef UNIOP 13617#undef UNIOP_t 13618#define UNIOP(x) Py_UCS4_##x 13619#define UNIOP_t Py_UCS4 13620#include "uniops.h" 13621#undef UNIOP 13622#undef UNIOP_t 13623 13624Py_UNICODE* 13625PyUnicode_AsUnicodeCopy(PyObject *object) 13626{ 13627 PyUnicodeObject *unicode = (PyUnicodeObject *)object; 13628 Py_UNICODE *copy; 13629 Py_ssize_t size; 13630 13631 if (!PyUnicode_Check(unicode)) { 13632 PyErr_BadArgument(); 13633 return NULL; 13634 } 13635 /* Ensure we won't overflow the size. */ 13636 if (PyUnicode_GET_SIZE(unicode) > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) { 13637 PyErr_NoMemory(); 13638 return NULL; 13639 } 13640 size = PyUnicode_GET_SIZE(unicode) + 1; /* copy the nul character */ 13641 size *= sizeof(Py_UNICODE); 13642 copy = PyMem_Malloc(size); 13643 if (copy == NULL) { 13644 PyErr_NoMemory(); 13645 return NULL; 13646 } 13647 memcpy(copy, PyUnicode_AS_UNICODE(unicode), size); 13648 return copy; 13649} 13650 13651/* A _string module, to export formatter_parser and formatter_field_name_split 13652 to the string.Formatter class implemented in Python. 
*/ 13653 13654static PyMethodDef _string_methods[] = { 13655 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split, 13656 METH_O, PyDoc_STR("split the argument as a field name")}, 13657 {"formatter_parser", (PyCFunction) formatter_parser, 13658 METH_O, PyDoc_STR("parse the argument as a format string")}, 13659 {NULL, NULL} 13660}; 13661 13662static struct PyModuleDef _string_module = { 13663 PyModuleDef_HEAD_INIT, 13664 "_string", 13665 PyDoc_STR("string helper module"), 13666 0, 13667 _string_methods, 13668 NULL, 13669 NULL, 13670 NULL, 13671 NULL 13672}; 13673 13674PyMODINIT_FUNC 13675PyInit__string(void) 13676{ 13677 return PyModule_Create(&_string_module); 13678} 13679 13680 13681#ifdef __cplusplus 13682} 13683#endif 13684