unicodeobject.c revision c379ead9afe114e1023ad64a9dea9a3a9a869ecf
1/* 2 3Unicode implementation based on original code by Fredrik Lundh, 4modified by Marc-Andre Lemburg <mal@lemburg.com>. 5 6Major speed upgrades to the method implementations at the Reykjavik 7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke. 8 9Copyright (c) Corporation for National Research Initiatives. 10 11-------------------------------------------------------------------- 12The original string type implementation is: 13 14 Copyright (c) 1999 by Secret Labs AB 15 Copyright (c) 1999 by Fredrik Lundh 16 17By obtaining, using, and/or copying this software and/or its 18associated documentation, you agree that you have read, understood, 19and will comply with the following terms and conditions: 20 21Permission to use, copy, modify, and distribute this software and its 22associated documentation for any purpose and without fee is hereby 23granted, provided that the above copyright notice appears in all 24copies, and that both that copyright notice and this permission notice 25appear in supporting documentation, and that the name of Secret Labs 26AB or the author not be used in advertising or publicity pertaining to 27distribution of the software without specific, written prior 28permission. 29 30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO 31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND 32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR 33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT 36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
37-------------------------------------------------------------------- 38 39*/ 40 41#define PY_SSIZE_T_CLEAN 42#include "Python.h" 43#include "ucnhash.h" 44 45#ifdef MS_WINDOWS 46#include <windows.h> 47#endif 48 49/* Limit for the Unicode object free list */ 50 51#define PyUnicode_MAXFREELIST 1024 52 53/* Limit for the Unicode object free list stay alive optimization. 54 55 The implementation will keep allocated Unicode memory intact for 56 all objects on the free list having a size less than this 57 limit. This reduces malloc() overhead for small Unicode objects. 58 59 At worst this will result in PyUnicode_MAXFREELIST * 60 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT + 61 malloc()-overhead) bytes of unused garbage. 62 63 Setting the limit to 0 effectively turns the feature off. 64 65 Note: This is an experimental feature ! If you get core dumps when 66 using Unicode objects, turn this feature off. 67 68*/ 69 70#define KEEPALIVE_SIZE_LIMIT 9 71 72/* Endianness switches; defaults to little endian */ 73 74#ifdef WORDS_BIGENDIAN 75# define BYTEORDER_IS_BIG_ENDIAN 76#else 77# define BYTEORDER_IS_LITTLE_ENDIAN 78#endif 79 80/* --- Globals ------------------------------------------------------------ 81 82 The globals are initialized by the _PyUnicode_Init() API and should 83 not be used before calling that API. 84 85*/ 86 87 88#ifdef __cplusplus 89extern "C" { 90#endif 91 92#ifdef Py_DEBUG 93# define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op) 94#else 95# define _PyUnicode_CHECK(op) PyUnicode_Check(op) 96#endif 97 98#define _PyUnicode_UTF8(op) \ 99 (((PyCompactUnicodeObject*)(op))->utf8) 100#define PyUnicode_UTF8(op) \ 101 (assert(_PyUnicode_CHECK(op)), \ 102 assert(PyUnicode_IS_READY(op)), \ 103 PyUnicode_IS_COMPACT_ASCII(op) ? 
\ 104 ((char*)((PyASCIIObject*)(op) + 1)) : \ 105 _PyUnicode_UTF8(op)) 106#define _PyUnicode_UTF8_LENGTH(op) \ 107 (((PyCompactUnicodeObject*)(op))->utf8_length) 108#define PyUnicode_UTF8_LENGTH(op) \ 109 (assert(_PyUnicode_CHECK(op)), \ 110 assert(PyUnicode_IS_READY(op)), \ 111 PyUnicode_IS_COMPACT_ASCII(op) ? \ 112 ((PyASCIIObject*)(op))->length : \ 113 _PyUnicode_UTF8_LENGTH(op)) 114#define _PyUnicode_WSTR(op) (((PyASCIIObject*)(op))->wstr) 115#define _PyUnicode_WSTR_LENGTH(op) (((PyCompactUnicodeObject*)(op))->wstr_length) 116#define _PyUnicode_LENGTH(op) (((PyASCIIObject *)(op))->length) 117#define _PyUnicode_STATE(op) (((PyASCIIObject *)(op))->state) 118#define _PyUnicode_HASH(op) (((PyASCIIObject *)(op))->hash) 119#define _PyUnicode_KIND(op) \ 120 (assert(_PyUnicode_CHECK(op)), \ 121 ((PyASCIIObject *)(op))->state.kind) 122#define _PyUnicode_GET_LENGTH(op) \ 123 (assert(_PyUnicode_CHECK(op)), \ 124 ((PyASCIIObject *)(op))->length) 125#define _PyUnicode_DATA_ANY(op) (((PyUnicodeObject*)(op))->data.any) 126 127#undef PyUnicode_READY 128#define PyUnicode_READY(op) \ 129 (assert(_PyUnicode_CHECK(op)), \ 130 (PyUnicode_IS_READY(op) ? \ 131 0 : _PyUnicode_Ready((PyObject *)(op)))) 132 133#define _PyUnicode_SHARE_UTF8(op) \ 134 (assert(_PyUnicode_CHECK(op)), \ 135 assert(!PyUnicode_IS_COMPACT_ASCII(op)), \ 136 (_PyUnicode_UTF8(op) == PyUnicode_DATA(op))) 137#define _PyUnicode_SHARE_WSTR(op) \ 138 (assert(_PyUnicode_CHECK(op)), \ 139 (_PyUnicode_WSTR(unicode) == PyUnicode_DATA(op))) 140 141/* true if the Unicode object has an allocated UTF-8 memory block 142 (not shared with other data) */ 143#define _PyUnicode_HAS_UTF8_MEMORY(op) \ 144 (assert(_PyUnicode_CHECK(op)), \ 145 (!PyUnicode_IS_COMPACT_ASCII(op) \ 146 && _PyUnicode_UTF8(op) \ 147 && _PyUnicode_UTF8(op) != PyUnicode_DATA(op))) 148 149/* Generic helper macro to convert characters of different types. 
150 from_type and to_type have to be valid type names, begin and end 151 are pointers to the source characters which should be of type 152 "from_type *". to is a pointer of type "to_type *" and points to the 153 buffer where the result characters are written to. */ 154#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \ 155 do { \ 156 const from_type *iter_; to_type *to_; \ 157 for (iter_ = (begin), to_ = (to_type *)(to); \ 158 iter_ < (end); \ 159 ++iter_, ++to_) { \ 160 *to_ = (to_type)*iter_; \ 161 } \ 162 } while (0) 163 164/* The Unicode string has been modified: reset the hash */ 165#define _PyUnicode_DIRTY(op) do { _PyUnicode_HASH(op) = -1; } while (0) 166 167/* This dictionary holds all interned unicode strings. Note that references 168 to strings in this dictionary are *not* counted in the string's ob_refcnt. 169 When the interned string reaches a refcnt of 0 the string deallocation 170 function will delete the reference from this dictionary. 171 172 Another way to look at this is that to say that the actual reference 173 count of a string is: s->ob_refcnt + (s->state ? 2 : 0) 174*/ 175static PyObject *interned; 176 177/* The empty Unicode object is shared to improve performance. */ 178static PyObject *unicode_empty; 179 180/* Single character Unicode strings in the Latin-1 range are being 181 shared as well. 
*/ 182static PyObject *unicode_latin1[256]; 183 184/* Fast detection of the most frequent whitespace characters */ 185const unsigned char _Py_ascii_whitespace[] = { 186 0, 0, 0, 0, 0, 0, 0, 0, 187/* case 0x0009: * CHARACTER TABULATION */ 188/* case 0x000A: * LINE FEED */ 189/* case 0x000B: * LINE TABULATION */ 190/* case 0x000C: * FORM FEED */ 191/* case 0x000D: * CARRIAGE RETURN */ 192 0, 1, 1, 1, 1, 1, 0, 0, 193 0, 0, 0, 0, 0, 0, 0, 0, 194/* case 0x001C: * FILE SEPARATOR */ 195/* case 0x001D: * GROUP SEPARATOR */ 196/* case 0x001E: * RECORD SEPARATOR */ 197/* case 0x001F: * UNIT SEPARATOR */ 198 0, 0, 0, 0, 1, 1, 1, 1, 199/* case 0x0020: * SPACE */ 200 1, 0, 0, 0, 0, 0, 0, 0, 201 0, 0, 0, 0, 0, 0, 0, 0, 202 0, 0, 0, 0, 0, 0, 0, 0, 203 0, 0, 0, 0, 0, 0, 0, 0, 204 205 0, 0, 0, 0, 0, 0, 0, 0, 206 0, 0, 0, 0, 0, 0, 0, 0, 207 0, 0, 0, 0, 0, 0, 0, 0, 208 0, 0, 0, 0, 0, 0, 0, 0, 209 0, 0, 0, 0, 0, 0, 0, 0, 210 0, 0, 0, 0, 0, 0, 0, 0, 211 0, 0, 0, 0, 0, 0, 0, 0, 212 0, 0, 0, 0, 0, 0, 0, 0 213}; 214 215static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length); 216 217static PyObject * 218unicode_encode_call_errorhandler(const char *errors, 219 PyObject **errorHandler,const char *encoding, const char *reason, 220 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject, 221 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos); 222 223static void 224raise_encode_exception(PyObject **exceptionObject, 225 const char *encoding, 226 const Py_UNICODE *unicode, Py_ssize_t size, 227 Py_ssize_t startpos, Py_ssize_t endpos, 228 const char *reason); 229 230/* Same for linebreaks */ 231static unsigned char ascii_linebreak[] = { 232 0, 0, 0, 0, 0, 0, 0, 0, 233/* 0x000A, * LINE FEED */ 234/* 0x000B, * LINE TABULATION */ 235/* 0x000C, * FORM FEED */ 236/* 0x000D, * CARRIAGE RETURN */ 237 0, 0, 1, 1, 1, 1, 0, 0, 238 0, 0, 0, 0, 0, 0, 0, 0, 239/* 0x001C, * FILE SEPARATOR */ 240/* 0x001D, * GROUP SEPARATOR */ 241/* 0x001E, * RECORD SEPARATOR */ 242 0, 0, 0, 0, 1, 1, 1, 
0, 243 0, 0, 0, 0, 0, 0, 0, 0, 244 0, 0, 0, 0, 0, 0, 0, 0, 245 0, 0, 0, 0, 0, 0, 0, 0, 246 0, 0, 0, 0, 0, 0, 0, 0, 247 248 0, 0, 0, 0, 0, 0, 0, 0, 249 0, 0, 0, 0, 0, 0, 0, 0, 250 0, 0, 0, 0, 0, 0, 0, 0, 251 0, 0, 0, 0, 0, 0, 0, 0, 252 0, 0, 0, 0, 0, 0, 0, 0, 253 0, 0, 0, 0, 0, 0, 0, 0, 254 0, 0, 0, 0, 0, 0, 0, 0, 255 0, 0, 0, 0, 0, 0, 0, 0 256}; 257 258/* The max unicode value is always 0x10FFFF while using the PEP-393 API. 259 This function is kept for backward compatibility with the old API. */ 260Py_UNICODE 261PyUnicode_GetMax(void) 262{ 263#ifdef Py_UNICODE_WIDE 264 return 0x10FFFF; 265#else 266 /* This is actually an illegal character, so it should 267 not be passed to unichr. */ 268 return 0xFFFF; 269#endif 270} 271 272#ifdef Py_DEBUG 273static int 274_PyUnicode_CheckConsistency(void *op) 275{ 276 PyASCIIObject *ascii; 277 unsigned int kind; 278 279 assert(PyUnicode_Check(op)); 280 281 ascii = (PyASCIIObject *)op; 282 kind = ascii->state.kind; 283 284 if (ascii->state.ascii == 1) { 285 assert(kind == PyUnicode_1BYTE_KIND); 286 assert(ascii->state.compact == 1); 287 assert(ascii->state.ready == 1); 288 } 289 else if (ascii->state.compact == 1) { 290 assert(kind == PyUnicode_1BYTE_KIND 291 || kind == PyUnicode_2BYTE_KIND 292 || kind == PyUnicode_4BYTE_KIND); 293 assert(ascii->state.compact == 1); 294 assert(ascii->state.ascii == 0); 295 assert(ascii->state.ready == 1); 296 } else { 297 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op; 298 PyUnicodeObject *unicode = (PyUnicodeObject *)op; 299 300 if (kind == PyUnicode_WCHAR_KIND) { 301 assert(!ascii->state.compact == 1); 302 assert(ascii->state.ascii == 0); 303 assert(!ascii->state.ready == 1); 304 assert(ascii->wstr != NULL); 305 assert(unicode->data.any == NULL); 306 assert(compact->utf8 == NULL); 307 assert(ascii->state.interned == SSTATE_NOT_INTERNED); 308 } 309 else { 310 assert(kind == PyUnicode_1BYTE_KIND 311 || kind == PyUnicode_2BYTE_KIND 312 || kind == PyUnicode_4BYTE_KIND); 313 
assert(!ascii->state.compact == 1); 314 assert(ascii->state.ready == 1); 315 assert(unicode->data.any != NULL); 316 assert(ascii->state.ascii == 0); 317 } 318 } 319 return 1; 320} 321#endif 322 323/* --- Bloom Filters ----------------------------------------------------- */ 324 325/* stuff to implement simple "bloom filters" for Unicode characters. 326 to keep things simple, we use a single bitmask, using the least 5 327 bits from each unicode characters as the bit index. */ 328 329/* the linebreak mask is set up by Unicode_Init below */ 330 331#if LONG_BIT >= 128 332#define BLOOM_WIDTH 128 333#elif LONG_BIT >= 64 334#define BLOOM_WIDTH 64 335#elif LONG_BIT >= 32 336#define BLOOM_WIDTH 32 337#else 338#error "LONG_BIT is smaller than 32" 339#endif 340 341#define BLOOM_MASK unsigned long 342 343static BLOOM_MASK bloom_linebreak; 344 345#define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1))))) 346#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1))))) 347 348#define BLOOM_LINEBREAK(ch) \ 349 ((ch) < 128U ? 
ascii_linebreak[(ch)] : \ 350 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch))) 351 352Py_LOCAL_INLINE(BLOOM_MASK) 353make_bloom_mask(int kind, void* ptr, Py_ssize_t len) 354{ 355 /* calculate simple bloom-style bitmask for a given unicode string */ 356 357 BLOOM_MASK mask; 358 Py_ssize_t i; 359 360 mask = 0; 361 for (i = 0; i < len; i++) 362 BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i)); 363 364 return mask; 365} 366 367#define BLOOM_MEMBER(mask, chr, str) \ 368 (BLOOM(mask, chr) \ 369 && (PyUnicode_FindChar(str, chr, 0, PyUnicode_GET_LENGTH(str), 1) >= 0)) 370 371/* --- Unicode Object ----------------------------------------------------- */ 372 373static PyObject * 374fixup(PyUnicodeObject *self, Py_UCS4 (*fixfct)(PyUnicodeObject *s)); 375 376Py_LOCAL_INLINE(char *) findchar(void *s, int kind, 377 Py_ssize_t size, Py_UCS4 ch, 378 int direction) 379{ 380 /* like wcschr, but doesn't stop at NULL characters */ 381 Py_ssize_t i; 382 if (direction == 1) { 383 for(i = 0; i < size; i++) 384 if (PyUnicode_READ(kind, s, i) == ch) 385 return (char*)s + PyUnicode_KIND_SIZE(kind, i); 386 } 387 else { 388 for(i = size-1; i >= 0; i--) 389 if (PyUnicode_READ(kind, s, i) == ch) 390 return (char*)s + PyUnicode_KIND_SIZE(kind, i); 391 } 392 return NULL; 393} 394 395static PyObject* 396resize_compact(PyObject *unicode, Py_ssize_t length) 397{ 398 Py_ssize_t char_size; 399 Py_ssize_t struct_size; 400 Py_ssize_t new_size; 401 int share_wstr; 402 403 assert(PyUnicode_IS_READY(unicode)); 404 char_size = PyUnicode_CHARACTER_SIZE(unicode); 405 if (PyUnicode_IS_COMPACT_ASCII(unicode)) 406 struct_size = sizeof(PyASCIIObject); 407 else 408 struct_size = sizeof(PyCompactUnicodeObject); 409 share_wstr = _PyUnicode_SHARE_WSTR(unicode); 410 411 _Py_DEC_REFTOTAL; 412 _Py_ForgetReference(unicode); 413 414 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) { 415 PyErr_NoMemory(); 416 return NULL; 417 } 418 new_size = (struct_size + (length + 1) * char_size); 419 420 unicode 
= (PyObject *)PyObject_REALLOC((char *)unicode, new_size); 421 if (unicode == NULL) { 422 PyObject_Del(unicode); 423 PyErr_NoMemory(); 424 return NULL; 425 } 426 _Py_NewReference(unicode); 427 _PyUnicode_LENGTH(unicode) = length; 428 if (share_wstr) { 429 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode); 430 if (!PyUnicode_IS_COMPACT_ASCII(unicode)) 431 _PyUnicode_WSTR_LENGTH(unicode) = length; 432 } 433 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode), 434 length, 0); 435 return unicode; 436} 437 438static int 439resize_inplace(register PyUnicodeObject *unicode, Py_ssize_t length) 440{ 441 void *oldstr; 442 443 assert(!PyUnicode_IS_COMPACT(unicode)); 444 445 assert(Py_REFCNT(unicode) == 1); 446 _PyUnicode_DIRTY(unicode); 447 448 if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) 449 { 450 PyObject_DEL(_PyUnicode_UTF8(unicode)); 451 _PyUnicode_UTF8(unicode) = NULL; 452 } 453 454 if (PyUnicode_IS_READY(unicode)) { 455 Py_ssize_t char_size; 456 Py_ssize_t new_size; 457 int share_wstr, share_utf8; 458 void *data; 459 460 data = _PyUnicode_DATA_ANY(unicode); 461 assert(data != NULL); 462 char_size = PyUnicode_CHARACTER_SIZE(unicode); 463 share_wstr = _PyUnicode_SHARE_WSTR(unicode); 464 share_utf8 = _PyUnicode_SHARE_UTF8(unicode); 465 466 if (length > (PY_SSIZE_T_MAX / char_size - 1)) { 467 PyErr_NoMemory(); 468 return -1; 469 } 470 new_size = (length + 1) * char_size; 471 472 data = (PyObject *)PyObject_REALLOC(data, new_size); 473 if (data == NULL) { 474 PyErr_NoMemory(); 475 return -1; 476 } 477 _PyUnicode_DATA_ANY(unicode) = data; 478 if (share_wstr) { 479 _PyUnicode_WSTR(unicode) = data; 480 _PyUnicode_WSTR_LENGTH(unicode) = length; 481 } 482 if (share_utf8) { 483 _PyUnicode_UTF8(unicode) = data; 484 _PyUnicode_UTF8_LENGTH(unicode) = length; 485 } 486 _PyUnicode_LENGTH(unicode) = length; 487 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0); 488 if (share_wstr) 489 return 0; 490 } 491 if (_PyUnicode_WSTR(unicode) != NULL) { 492 
assert(_PyUnicode_WSTR(unicode) != NULL); 493 494 oldstr = _PyUnicode_WSTR(unicode); 495 _PyUnicode_WSTR(unicode) = PyObject_REALLOC(_PyUnicode_WSTR(unicode), 496 sizeof(Py_UNICODE) * (length + 1)); 497 if (!_PyUnicode_WSTR(unicode)) { 498 _PyUnicode_WSTR(unicode) = (Py_UNICODE *)oldstr; 499 PyErr_NoMemory(); 500 return -1; 501 } 502 _PyUnicode_WSTR(unicode)[length] = 0; 503 _PyUnicode_WSTR_LENGTH(unicode) = length; 504 } 505 return 0; 506} 507 508static PyObject* 509resize_copy(PyObject *unicode, Py_ssize_t length) 510{ 511 Py_ssize_t copy_length; 512 if (PyUnicode_IS_COMPACT(unicode)) { 513 PyObject *copy; 514 assert(PyUnicode_IS_READY(unicode)); 515 516 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode)); 517 if (copy == NULL) 518 return NULL; 519 520 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode)); 521 if (PyUnicode_CopyCharacters(copy, 0, 522 unicode, 0, 523 copy_length) < 0) 524 { 525 Py_DECREF(copy); 526 return NULL; 527 } 528 return copy; 529 } else { 530 PyUnicodeObject *w; 531 assert(_PyUnicode_WSTR(unicode) != NULL); 532 assert(_PyUnicode_DATA_ANY(unicode) == NULL); 533 w = _PyUnicode_New(length); 534 if (w == NULL) 535 return NULL; 536 copy_length = _PyUnicode_WSTR_LENGTH(unicode); 537 copy_length = Py_MIN(copy_length, length); 538 Py_UNICODE_COPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode), 539 copy_length); 540 return (PyObject*)w; 541 } 542} 543 544/* We allocate one more byte to make sure the string is 545 Ux0000 terminated; some code (e.g. new_identifier) 546 relies on that. 547 548 XXX This allocator could further be enhanced by assuring that the 549 free list never reduces its size below 1. 
550 551*/ 552 553#ifdef Py_DEBUG 554int unicode_old_new_calls = 0; 555#endif 556 557static PyUnicodeObject * 558_PyUnicode_New(Py_ssize_t length) 559{ 560 register PyUnicodeObject *unicode; 561 size_t new_size; 562 563 /* Optimization for empty strings */ 564 if (length == 0 && unicode_empty != NULL) { 565 Py_INCREF(unicode_empty); 566 return (PyUnicodeObject*)unicode_empty; 567 } 568 569 /* Ensure we won't overflow the size. */ 570 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) { 571 return (PyUnicodeObject *)PyErr_NoMemory(); 572 } 573 if (length < 0) { 574 PyErr_SetString(PyExc_SystemError, 575 "Negative size passed to _PyUnicode_New"); 576 return NULL; 577 } 578 579#ifdef Py_DEBUG 580 ++unicode_old_new_calls; 581#endif 582 583 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type); 584 if (unicode == NULL) 585 return NULL; 586 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1); 587 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size); 588 if (!_PyUnicode_WSTR(unicode)) { 589 PyErr_NoMemory(); 590 goto onError; 591 } 592 593 /* Initialize the first element to guard against cases where 594 * the caller fails before initializing str -- unicode_resize() 595 * reads str[0], and the Keep-Alive optimization can keep memory 596 * allocated for str alive across a call to unicode_dealloc(unicode). 597 * We don't want unicode_resize to read uninitialized memory in 598 * that case. 
599 */ 600 _PyUnicode_WSTR(unicode)[0] = 0; 601 _PyUnicode_WSTR(unicode)[length] = 0; 602 _PyUnicode_WSTR_LENGTH(unicode) = length; 603 _PyUnicode_HASH(unicode) = -1; 604 _PyUnicode_STATE(unicode).interned = 0; 605 _PyUnicode_STATE(unicode).kind = 0; 606 _PyUnicode_STATE(unicode).compact = 0; 607 _PyUnicode_STATE(unicode).ready = 0; 608 _PyUnicode_STATE(unicode).ascii = 0; 609 _PyUnicode_DATA_ANY(unicode) = NULL; 610 _PyUnicode_LENGTH(unicode) = 0; 611 _PyUnicode_UTF8(unicode) = NULL; 612 _PyUnicode_UTF8_LENGTH(unicode) = 0; 613 return unicode; 614 615 onError: 616 /* XXX UNREF/NEWREF interface should be more symmetrical */ 617 _Py_DEC_REFTOTAL; 618 _Py_ForgetReference((PyObject *)unicode); 619 PyObject_Del(unicode); 620 return NULL; 621} 622 623static const char* 624unicode_kind_name(PyObject *unicode) 625{ 626 assert(_PyUnicode_CHECK(unicode)); 627 if (!PyUnicode_IS_COMPACT(unicode)) 628 { 629 if (!PyUnicode_IS_READY(unicode)) 630 return "wstr"; 631 switch(PyUnicode_KIND(unicode)) 632 { 633 case PyUnicode_1BYTE_KIND: 634 if (PyUnicode_IS_COMPACT_ASCII(unicode)) 635 return "legacy ascii"; 636 else 637 return "legacy latin1"; 638 case PyUnicode_2BYTE_KIND: 639 return "legacy UCS2"; 640 case PyUnicode_4BYTE_KIND: 641 return "legacy UCS4"; 642 default: 643 return "<legacy invalid kind>"; 644 } 645 } 646 assert(PyUnicode_IS_READY(unicode)); 647 switch(PyUnicode_KIND(unicode)) 648 { 649 case PyUnicode_1BYTE_KIND: 650 if (PyUnicode_IS_COMPACT_ASCII(unicode)) 651 return "ascii"; 652 else 653 return "compact latin1"; 654 case PyUnicode_2BYTE_KIND: 655 return "compact UCS2"; 656 case PyUnicode_4BYTE_KIND: 657 return "compact UCS4"; 658 default: 659 return "<invalid compact kind>"; 660 } 661} 662 663#ifdef Py_DEBUG 664int unicode_new_new_calls = 0; 665 666/* Functions wrapping macros for use in debugger */ 667char *_PyUnicode_utf8(void *unicode){ 668 return PyUnicode_UTF8(unicode); 669} 670 671void *_PyUnicode_compact_data(void *unicode) { 672 return 
_PyUnicode_COMPACT_DATA(unicode); 673} 674void *_PyUnicode_data(void *unicode){ 675 printf("obj %p\n", unicode); 676 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode)); 677 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode)); 678 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1))); 679 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1))); 680 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode)); 681 return PyUnicode_DATA(unicode); 682} 683 684void 685_PyUnicode_Dump(PyObject *op) 686{ 687 PyASCIIObject *ascii = (PyASCIIObject *)op; 688 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op; 689 PyUnicodeObject *unicode = (PyUnicodeObject *)op; 690 void *data; 691 printf("%s: len=%zu, ",unicode_kind_name(op), ascii->length); 692 if (ascii->state.compact) 693 data = (compact + 1); 694 else 695 data = unicode->data.any; 696 if (ascii->wstr == data) 697 printf("shared "); 698 printf("wstr=%p", ascii->wstr); 699 if (!ascii->state.ascii) { 700 printf(" (%zu), ", compact->wstr_length); 701 if (!ascii->state.compact && compact->utf8 == unicode->data.any) 702 printf("shared "); 703 printf("utf8=%p (%zu)", compact->utf8, compact->utf8_length); 704 } 705 printf(", data=%p\n", data); 706} 707#endif 708 709PyObject * 710PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar) 711{ 712 PyObject *obj; 713 PyCompactUnicodeObject *unicode; 714 void *data; 715 int kind_state; 716 int is_sharing = 0, is_ascii = 0; 717 Py_ssize_t char_size; 718 Py_ssize_t struct_size; 719 720 /* Optimization for empty strings */ 721 if (size == 0 && unicode_empty != NULL) { 722 Py_INCREF(unicode_empty); 723 return unicode_empty; 724 } 725 726#ifdef Py_DEBUG 727 ++unicode_new_new_calls; 728#endif 729 730 struct_size = sizeof(PyCompactUnicodeObject); 731 if (maxchar < 128) { 732 kind_state = PyUnicode_1BYTE_KIND; 733 char_size = 1; 734 is_ascii = 1; 735 struct_size = sizeof(PyASCIIObject); 736 } 737 else if (maxchar < 256) { 738 kind_state = 
PyUnicode_1BYTE_KIND; 739 char_size = 1; 740 } 741 else if (maxchar < 65536) { 742 kind_state = PyUnicode_2BYTE_KIND; 743 char_size = 2; 744 if (sizeof(wchar_t) == 2) 745 is_sharing = 1; 746 } 747 else { 748 kind_state = PyUnicode_4BYTE_KIND; 749 char_size = 4; 750 if (sizeof(wchar_t) == 4) 751 is_sharing = 1; 752 } 753 754 /* Ensure we won't overflow the size. */ 755 if (size < 0) { 756 PyErr_SetString(PyExc_SystemError, 757 "Negative size passed to PyUnicode_New"); 758 return NULL; 759 } 760 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) 761 return PyErr_NoMemory(); 762 763 /* Duplicated allocation code from _PyObject_New() instead of a call to 764 * PyObject_New() so we are able to allocate space for the object and 765 * it's data buffer. 766 */ 767 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size); 768 if (obj == NULL) 769 return PyErr_NoMemory(); 770 obj = PyObject_INIT(obj, &PyUnicode_Type); 771 if (obj == NULL) 772 return NULL; 773 774 unicode = (PyCompactUnicodeObject *)obj; 775 if (is_ascii) 776 data = ((PyASCIIObject*)obj) + 1; 777 else 778 data = unicode + 1; 779 _PyUnicode_LENGTH(unicode) = size; 780 _PyUnicode_HASH(unicode) = -1; 781 _PyUnicode_STATE(unicode).interned = 0; 782 _PyUnicode_STATE(unicode).kind = kind_state; 783 _PyUnicode_STATE(unicode).compact = 1; 784 _PyUnicode_STATE(unicode).ready = 1; 785 _PyUnicode_STATE(unicode).ascii = is_ascii; 786 if (is_ascii) { 787 ((char*)data)[size] = 0; 788 _PyUnicode_WSTR(unicode) = NULL; 789 } 790 else if (kind_state == PyUnicode_1BYTE_KIND) { 791 ((char*)data)[size] = 0; 792 _PyUnicode_WSTR(unicode) = NULL; 793 _PyUnicode_WSTR_LENGTH(unicode) = 0; 794 unicode->utf8_length = 0; 795 unicode->utf8 = NULL; 796 } 797 else { 798 unicode->utf8 = NULL; 799 if (kind_state == PyUnicode_2BYTE_KIND) 800 ((Py_UCS2*)data)[size] = 0; 801 else /* kind_state == PyUnicode_4BYTE_KIND */ 802 ((Py_UCS4*)data)[size] = 0; 803 if (is_sharing) { 804 _PyUnicode_WSTR_LENGTH(unicode) = size; 805 
_PyUnicode_WSTR(unicode) = (wchar_t *)data; 806 } 807 else { 808 _PyUnicode_WSTR_LENGTH(unicode) = 0; 809 _PyUnicode_WSTR(unicode) = NULL; 810 } 811 } 812 return obj; 813} 814 815#if SIZEOF_WCHAR_T == 2 816/* Helper function to convert a 16-bits wchar_t representation to UCS4, this 817 will decode surrogate pairs, the other conversions are implemented as macros 818 for efficency. 819 820 This function assumes that unicode can hold one more code point than wstr 821 characters for a terminating null character. */ 822static void 823unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end, 824 PyUnicodeObject *unicode) 825{ 826 const wchar_t *iter; 827 Py_UCS4 *ucs4_out; 828 829 assert(unicode != NULL); 830 assert(_PyUnicode_CHECK(unicode)); 831 assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND); 832 ucs4_out = PyUnicode_4BYTE_DATA(unicode); 833 834 for (iter = begin; iter < end; ) { 835 assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) + 836 _PyUnicode_GET_LENGTH(unicode))); 837 if (*iter >= 0xD800 && *iter <= 0xDBFF 838 && (iter+1) < end && iter[1] >= 0xDC00 && iter[1] <= 0xDFFF) 839 { 840 *ucs4_out++ = (((iter[0] & 0x3FF)<<10) | (iter[1] & 0x3FF)) + 0x10000; 841 iter += 2; 842 } 843 else { 844 *ucs4_out++ = *iter; 845 iter++; 846 } 847 } 848 assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) + 849 _PyUnicode_GET_LENGTH(unicode))); 850 851} 852#endif 853 854static int 855_PyUnicode_Dirty(PyObject *unicode) 856{ 857 assert(_PyUnicode_CHECK(unicode)); 858 if (Py_REFCNT(unicode) != 1) { 859 PyErr_SetString(PyExc_ValueError, 860 "Cannot modify a string having more than 1 reference"); 861 return -1; 862 } 863 _PyUnicode_DIRTY(unicode); 864 return 0; 865} 866 867Py_ssize_t 868PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start, 869 PyObject *from, Py_ssize_t from_start, 870 Py_ssize_t how_many) 871{ 872 unsigned int from_kind, to_kind; 873 void *from_data, *to_data; 874 875 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) { 876 
PyErr_BadInternalCall(); 877 return -1; 878 } 879 880 if (PyUnicode_READY(from)) 881 return -1; 882 if (PyUnicode_READY(to)) 883 return -1; 884 885 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many); 886 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) { 887 PyErr_Format(PyExc_ValueError, 888 "Cannot write %zi characters at %zi " 889 "in a string of %zi characters", 890 how_many, to_start, PyUnicode_GET_LENGTH(to)); 891 return -1; 892 } 893 if (how_many == 0) 894 return 0; 895 896 if (_PyUnicode_Dirty(to)) 897 return -1; 898 899 from_kind = PyUnicode_KIND(from); 900 from_data = PyUnicode_DATA(from); 901 to_kind = PyUnicode_KIND(to); 902 to_data = PyUnicode_DATA(to); 903 904 if (from_kind == to_kind 905 /* deny latin1 => ascii */ 906 && PyUnicode_MAX_CHAR_VALUE(to) >= PyUnicode_MAX_CHAR_VALUE(from)) 907 { 908 Py_MEMCPY((char*)to_data 909 + PyUnicode_KIND_SIZE(to_kind, to_start), 910 (char*)from_data 911 + PyUnicode_KIND_SIZE(from_kind, from_start), 912 PyUnicode_KIND_SIZE(to_kind, how_many)); 913 } 914 else if (from_kind == PyUnicode_1BYTE_KIND 915 && to_kind == PyUnicode_2BYTE_KIND) 916 { 917 _PyUnicode_CONVERT_BYTES( 918 Py_UCS1, Py_UCS2, 919 PyUnicode_1BYTE_DATA(from) + from_start, 920 PyUnicode_1BYTE_DATA(from) + from_start + how_many, 921 PyUnicode_2BYTE_DATA(to) + to_start 922 ); 923 } 924 else if (from_kind == PyUnicode_1BYTE_KIND 925 && to_kind == PyUnicode_4BYTE_KIND) 926 { 927 _PyUnicode_CONVERT_BYTES( 928 Py_UCS1, Py_UCS4, 929 PyUnicode_1BYTE_DATA(from) + from_start, 930 PyUnicode_1BYTE_DATA(from) + from_start + how_many, 931 PyUnicode_4BYTE_DATA(to) + to_start 932 ); 933 } 934 else if (from_kind == PyUnicode_2BYTE_KIND 935 && to_kind == PyUnicode_4BYTE_KIND) 936 { 937 _PyUnicode_CONVERT_BYTES( 938 Py_UCS2, Py_UCS4, 939 PyUnicode_2BYTE_DATA(from) + from_start, 940 PyUnicode_2BYTE_DATA(from) + from_start + how_many, 941 PyUnicode_4BYTE_DATA(to) + to_start 942 ); 943 } 944 else { 945 int invalid_kinds; 946 947 /* check if max_char(from substring) <= 
max_char(to) */ 948 if (from_kind > to_kind 949 /* latin1 => ascii */ 950 || (PyUnicode_IS_COMPACT_ASCII(to) 951 && to_kind == PyUnicode_1BYTE_KIND 952 && !PyUnicode_IS_COMPACT_ASCII(from))) 953 { 954 /* slow path to check for character overflow */ 955 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to); 956 Py_UCS4 ch, maxchar; 957 Py_ssize_t i; 958 959 maxchar = 0; 960 invalid_kinds = 0; 961 for (i=0; i < how_many; i++) { 962 ch = PyUnicode_READ(from_kind, from_data, from_start + i); 963 if (ch > maxchar) { 964 maxchar = ch; 965 if (maxchar > to_maxchar) { 966 invalid_kinds = 1; 967 break; 968 } 969 } 970 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch); 971 } 972 } 973 else 974 invalid_kinds = 1; 975 if (invalid_kinds) { 976 PyErr_Format(PyExc_ValueError, 977 "Cannot copy %s characters " 978 "into a string of %s characters", 979 unicode_kind_name(from), 980 unicode_kind_name(to)); 981 return -1; 982 } 983 } 984 return how_many; 985} 986 987/* Find the maximum code point and count the number of surrogate pairs so a 988 correct string length can be computed before converting a string to UCS4. 989 This function counts single surrogates as a character and not as a pair. 990 991 Return 0 on success, or -1 on error. 
*/ 992static int 993find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end, 994 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates) 995{ 996 const wchar_t *iter; 997 998 assert(num_surrogates != NULL && maxchar != NULL); 999 if (num_surrogates == NULL || maxchar == NULL) { 1000 PyErr_SetString(PyExc_SystemError, 1001 "unexpected NULL arguments to " 1002 "PyUnicode_FindMaxCharAndNumSurrogatePairs"); 1003 return -1; 1004 } 1005 1006 *num_surrogates = 0; 1007 *maxchar = 0; 1008 1009 for (iter = begin; iter < end; ) { 1010 if (*iter > *maxchar) 1011 *maxchar = *iter; 1012#if SIZEOF_WCHAR_T == 2 1013 if (*iter >= 0xD800 && *iter <= 0xDBFF 1014 && (iter+1) < end && iter[1] >= 0xDC00 && iter[1] <= 0xDFFF) 1015 { 1016 Py_UCS4 surrogate_val; 1017 surrogate_val = (((iter[0] & 0x3FF)<<10) 1018 | (iter[1] & 0x3FF)) + 0x10000; 1019 ++(*num_surrogates); 1020 if (surrogate_val > *maxchar) 1021 *maxchar = surrogate_val; 1022 iter += 2; 1023 } 1024 else 1025 iter++; 1026#else 1027 iter++; 1028#endif 1029 } 1030 return 0; 1031} 1032 1033#ifdef Py_DEBUG 1034int unicode_ready_calls = 0; 1035#endif 1036 1037int 1038_PyUnicode_Ready(PyObject *obj) 1039{ 1040 PyUnicodeObject *unicode = (PyUnicodeObject *)obj; 1041 wchar_t *end; 1042 Py_UCS4 maxchar = 0; 1043 Py_ssize_t num_surrogates; 1044#if SIZEOF_WCHAR_T == 2 1045 Py_ssize_t length_wo_surrogates; 1046#endif 1047 1048 /* _PyUnicode_Ready() is only intented for old-style API usage where 1049 strings were created using _PyObject_New() and where no canonical 1050 representation (the str field) has been set yet aka strings 1051 which are not yet ready. 
*/ 1052 assert(_PyUnicode_CHECK(unicode)); 1053 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND); 1054 assert(_PyUnicode_WSTR(unicode) != NULL); 1055 assert(_PyUnicode_DATA_ANY(unicode) == NULL); 1056 assert(_PyUnicode_UTF8(unicode) == NULL); 1057 /* Actually, it should neither be interned nor be anything else: */ 1058 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED); 1059 1060#ifdef Py_DEBUG 1061 ++unicode_ready_calls; 1062#endif 1063 1064 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode); 1065 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end, 1066 &maxchar, &num_surrogates) == -1) 1067 return -1; 1068 1069 if (maxchar < 256) { 1070 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1); 1071 if (!_PyUnicode_DATA_ANY(unicode)) { 1072 PyErr_NoMemory(); 1073 return -1; 1074 } 1075 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char, 1076 _PyUnicode_WSTR(unicode), end, 1077 PyUnicode_1BYTE_DATA(unicode)); 1078 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0'; 1079 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode); 1080 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND; 1081 if (maxchar < 128) { 1082 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode); 1083 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode); 1084 } 1085 else { 1086 _PyUnicode_UTF8(unicode) = NULL; 1087 _PyUnicode_UTF8_LENGTH(unicode) = 0; 1088 } 1089 PyObject_FREE(_PyUnicode_WSTR(unicode)); 1090 _PyUnicode_WSTR(unicode) = NULL; 1091 _PyUnicode_WSTR_LENGTH(unicode) = 0; 1092 } 1093 /* In this case we might have to convert down from 4-byte native 1094 wchar_t to 2-byte unicode. */ 1095 else if (maxchar < 65536) { 1096 assert(num_surrogates == 0 && 1097 "FindMaxCharAndNumSurrogatePairs() messed up"); 1098 1099#if SIZEOF_WCHAR_T == 2 1100 /* We can share representations and are done. 
*/ 1101 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode); 1102 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0'; 1103 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode); 1104 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND; 1105 _PyUnicode_UTF8(unicode) = NULL; 1106 _PyUnicode_UTF8_LENGTH(unicode) = 0; 1107#else 1108 /* sizeof(wchar_t) == 4 */ 1109 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC( 1110 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1)); 1111 if (!_PyUnicode_DATA_ANY(unicode)) { 1112 PyErr_NoMemory(); 1113 return -1; 1114 } 1115 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2, 1116 _PyUnicode_WSTR(unicode), end, 1117 PyUnicode_2BYTE_DATA(unicode)); 1118 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0'; 1119 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode); 1120 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND; 1121 _PyUnicode_UTF8(unicode) = NULL; 1122 _PyUnicode_UTF8_LENGTH(unicode) = 0; 1123 PyObject_FREE(_PyUnicode_WSTR(unicode)); 1124 _PyUnicode_WSTR(unicode) = NULL; 1125 _PyUnicode_WSTR_LENGTH(unicode) = 0; 1126#endif 1127 } 1128 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */ 1129 else { 1130#if SIZEOF_WCHAR_T == 2 1131 /* in case the native representation is 2-bytes, we need to allocate a 1132 new normalized 4-byte version. 
*/ 1133 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates; 1134 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1)); 1135 if (!_PyUnicode_DATA_ANY(unicode)) { 1136 PyErr_NoMemory(); 1137 return -1; 1138 } 1139 _PyUnicode_LENGTH(unicode) = length_wo_surrogates; 1140 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND; 1141 _PyUnicode_UTF8(unicode) = NULL; 1142 _PyUnicode_UTF8_LENGTH(unicode) = 0; 1143 /* unicode_convert_wchar_to_ucs4() requires a ready string */ 1144 _PyUnicode_STATE(unicode).ready = 1; 1145 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode); 1146 PyObject_FREE(_PyUnicode_WSTR(unicode)); 1147 _PyUnicode_WSTR(unicode) = NULL; 1148 _PyUnicode_WSTR_LENGTH(unicode) = 0; 1149#else 1150 assert(num_surrogates == 0); 1151 1152 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode); 1153 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode); 1154 _PyUnicode_UTF8(unicode) = NULL; 1155 _PyUnicode_UTF8_LENGTH(unicode) = 0; 1156 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND; 1157#endif 1158 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0'; 1159 } 1160 _PyUnicode_STATE(unicode).ready = 1; 1161 return 0; 1162} 1163 1164static void 1165unicode_dealloc(register PyUnicodeObject *unicode) 1166{ 1167 switch (PyUnicode_CHECK_INTERNED(unicode)) { 1168 case SSTATE_NOT_INTERNED: 1169 break; 1170 1171 case SSTATE_INTERNED_MORTAL: 1172 /* revive dead object temporarily for DelItem */ 1173 Py_REFCNT(unicode) = 3; 1174 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0) 1175 Py_FatalError( 1176 "deletion of interned string failed"); 1177 break; 1178 1179 case SSTATE_INTERNED_IMMORTAL: 1180 Py_FatalError("Immortal interned string died."); 1181 1182 default: 1183 Py_FatalError("Inconsistent interned string state."); 1184 } 1185 1186 if (_PyUnicode_WSTR(unicode) && 1187 (!PyUnicode_IS_READY(unicode) || 1188 _PyUnicode_WSTR(unicode) != PyUnicode_DATA(unicode))) 1189 
PyObject_DEL(_PyUnicode_WSTR(unicode)); 1190 if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) 1191 PyObject_DEL(_PyUnicode_UTF8(unicode)); 1192 1193 if (PyUnicode_IS_COMPACT(unicode)) { 1194 Py_TYPE(unicode)->tp_free((PyObject *)unicode); 1195 } 1196 else { 1197 if (_PyUnicode_DATA_ANY(unicode)) 1198 PyObject_DEL(_PyUnicode_DATA_ANY(unicode)); 1199 Py_TYPE(unicode)->tp_free((PyObject *)unicode); 1200 } 1201} 1202 1203static int 1204unicode_resizable(PyObject *unicode) 1205{ 1206 Py_ssize_t len; 1207 if (Py_REFCNT(unicode) != 1) 1208 return 0; 1209 if (PyUnicode_CHECK_INTERNED(unicode)) 1210 return 0; 1211 if (unicode == unicode_empty) 1212 return 0; 1213 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND) 1214 len = PyUnicode_WSTR_LENGTH(unicode); 1215 else 1216 len = PyUnicode_GET_LENGTH(unicode); 1217 if (len == 1) { 1218 Py_UCS4 ch; 1219 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND) 1220 ch = _PyUnicode_WSTR(unicode)[0]; 1221 else 1222 ch = PyUnicode_READ_CHAR(unicode, 0); 1223 if (ch < 256 && unicode_latin1[ch] == unicode) 1224 return 0; 1225 } 1226 return 1; 1227} 1228 1229static int 1230unicode_resize(PyObject **p_unicode, Py_ssize_t length) 1231{ 1232 PyObject *unicode; 1233 Py_ssize_t old_length; 1234 1235 assert(p_unicode != NULL); 1236 unicode = *p_unicode; 1237 1238 assert(unicode != NULL); 1239 assert(PyUnicode_Check(unicode)); 1240 assert(0 <= length); 1241 1242 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND) 1243 old_length = PyUnicode_WSTR_LENGTH(unicode); 1244 else 1245 old_length = PyUnicode_GET_LENGTH(unicode); 1246 if (old_length == length) 1247 return 0; 1248 1249 /* FIXME: really create a new object? 
*/ 1250 if (!unicode_resizable(unicode)) { 1251 PyObject *copy = resize_copy(unicode, length); 1252 if (copy == NULL) 1253 return -1; 1254 Py_DECREF(*p_unicode); 1255 *p_unicode = copy; 1256 return 0; 1257 } 1258 1259 if (PyUnicode_IS_COMPACT(unicode)) { 1260 *p_unicode = resize_compact(unicode, length); 1261 if (*p_unicode == NULL) 1262 return -1; 1263 return 0; 1264 } else 1265 return resize_inplace((PyUnicodeObject*)unicode, length); 1266} 1267 1268int 1269PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length) 1270{ 1271 PyObject *unicode; 1272 if (p_unicode == NULL) { 1273 PyErr_BadInternalCall(); 1274 return -1; 1275 } 1276 unicode = *p_unicode; 1277 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0 1278 || _PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) 1279 { 1280 PyErr_BadInternalCall(); 1281 return -1; 1282 } 1283 return unicode_resize(p_unicode, length); 1284} 1285 1286static PyObject* 1287get_latin1_char(unsigned char ch) 1288{ 1289 PyObject *unicode = unicode_latin1[ch]; 1290 if (!unicode) { 1291 unicode = PyUnicode_New(1, ch); 1292 if (!unicode) 1293 return NULL; 1294 PyUnicode_1BYTE_DATA(unicode)[0] = ch; 1295 unicode_latin1[ch] = unicode; 1296 } 1297 Py_INCREF(unicode); 1298 return unicode; 1299} 1300 1301PyObject * 1302PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size) 1303{ 1304 PyUnicodeObject *unicode; 1305 Py_UCS4 maxchar = 0; 1306 Py_ssize_t num_surrogates; 1307 1308 if (u == NULL) 1309 return (PyObject*)_PyUnicode_New(size); 1310 1311 /* If the Unicode data is known at construction time, we can apply 1312 some optimizations which share commonly used objects. 
*/ 1313 1314 /* Optimization for empty strings */ 1315 if (size == 0 && unicode_empty != NULL) { 1316 Py_INCREF(unicode_empty); 1317 return unicode_empty; 1318 } 1319 1320 /* Single character Unicode objects in the Latin-1 range are 1321 shared when using this constructor */ 1322 if (size == 1 && *u < 256) 1323 return get_latin1_char((unsigned char)*u); 1324 1325 /* If not empty and not single character, copy the Unicode data 1326 into the new object */ 1327 if (find_maxchar_surrogates(u, u + size, 1328 &maxchar, &num_surrogates) == -1) 1329 return NULL; 1330 1331 unicode = (PyUnicodeObject *) PyUnicode_New(size - num_surrogates, 1332 maxchar); 1333 if (!unicode) 1334 return NULL; 1335 1336 switch (PyUnicode_KIND(unicode)) { 1337 case PyUnicode_1BYTE_KIND: 1338 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char, 1339 u, u + size, PyUnicode_1BYTE_DATA(unicode)); 1340 break; 1341 case PyUnicode_2BYTE_KIND: 1342#if Py_UNICODE_SIZE == 2 1343 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2); 1344#else 1345 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2, 1346 u, u + size, PyUnicode_2BYTE_DATA(unicode)); 1347#endif 1348 break; 1349 case PyUnicode_4BYTE_KIND: 1350#if SIZEOF_WCHAR_T == 2 1351 /* This is the only case which has to process surrogates, thus 1352 a simple copy loop is not enough and we need a function. 
*/ 1353 unicode_convert_wchar_to_ucs4(u, u + size, unicode); 1354#else 1355 assert(num_surrogates == 0); 1356 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4); 1357#endif 1358 break; 1359 default: 1360 assert(0 && "Impossible state"); 1361 } 1362 1363 return (PyObject *)unicode; 1364} 1365 1366PyObject * 1367PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size) 1368{ 1369 PyUnicodeObject *unicode; 1370 1371 if (size < 0) { 1372 PyErr_SetString(PyExc_SystemError, 1373 "Negative size passed to PyUnicode_FromStringAndSize"); 1374 return NULL; 1375 } 1376 1377 /* If the Unicode data is known at construction time, we can apply 1378 some optimizations which share commonly used objects. 1379 Also, this means the input must be UTF-8, so fall back to the 1380 UTF-8 decoder at the end. */ 1381 if (u != NULL) { 1382 1383 /* Optimization for empty strings */ 1384 if (size == 0 && unicode_empty != NULL) { 1385 Py_INCREF(unicode_empty); 1386 return unicode_empty; 1387 } 1388 1389 /* Single characters are shared when using this constructor. 1390 Restrict to ASCII, since the input must be UTF-8. 
*/ 1391 if (size == 1 && Py_CHARMASK(*u) < 128) 1392 return get_latin1_char(Py_CHARMASK(*u)); 1393 1394 return PyUnicode_DecodeUTF8(u, size, NULL); 1395 } 1396 1397 unicode = _PyUnicode_New(size); 1398 if (!unicode) 1399 return NULL; 1400 1401 return (PyObject *)unicode; 1402} 1403 1404PyObject * 1405PyUnicode_FromString(const char *u) 1406{ 1407 size_t size = strlen(u); 1408 if (size > PY_SSIZE_T_MAX) { 1409 PyErr_SetString(PyExc_OverflowError, "input too long"); 1410 return NULL; 1411 } 1412 1413 return PyUnicode_FromStringAndSize(u, size); 1414} 1415 1416static PyObject* 1417_PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size) 1418{ 1419 PyObject *res; 1420 unsigned char max = 127; 1421 Py_ssize_t i; 1422 for (i = 0; i < size; i++) { 1423 if (u[i] & 0x80) { 1424 max = 255; 1425 break; 1426 } 1427 } 1428 res = PyUnicode_New(size, max); 1429 if (!res) 1430 return NULL; 1431 memcpy(PyUnicode_1BYTE_DATA(res), u, size); 1432 return res; 1433} 1434 1435static PyObject* 1436_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size) 1437{ 1438 PyObject *res; 1439 Py_UCS2 max = 0; 1440 Py_ssize_t i; 1441 for (i = 0; i < size; i++) 1442 if (u[i] > max) 1443 max = u[i]; 1444 res = PyUnicode_New(size, max); 1445 if (!res) 1446 return NULL; 1447 if (max >= 256) 1448 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size); 1449 else 1450 for (i = 0; i < size; i++) 1451 PyUnicode_1BYTE_DATA(res)[i] = (Py_UCS1)u[i]; 1452 return res; 1453} 1454 1455static PyObject* 1456_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size) 1457{ 1458 PyObject *res; 1459 Py_UCS4 max = 0; 1460 Py_ssize_t i; 1461 for (i = 0; i < size; i++) 1462 if (u[i] > max) 1463 max = u[i]; 1464 res = PyUnicode_New(size, max); 1465 if (!res) 1466 return NULL; 1467 if (max >= 0x10000) 1468 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size); 1469 else { 1470 int kind = PyUnicode_KIND(res); 1471 void *data = PyUnicode_DATA(res); 1472 for (i = 0; i < size; i++) 1473 PyUnicode_WRITE(kind, data, i, 
u[i]); 1474 } 1475 return res; 1476} 1477 1478PyObject* 1479PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size) 1480{ 1481 switch(kind) { 1482 case PyUnicode_1BYTE_KIND: 1483 return _PyUnicode_FromUCS1(buffer, size); 1484 case PyUnicode_2BYTE_KIND: 1485 return _PyUnicode_FromUCS2(buffer, size); 1486 case PyUnicode_4BYTE_KIND: 1487 return _PyUnicode_FromUCS4(buffer, size); 1488 } 1489 PyErr_SetString(PyExc_ValueError, "invalid kind"); 1490 return NULL; 1491} 1492 1493PyObject* 1494PyUnicode_Copy(PyObject *unicode) 1495{ 1496 Py_ssize_t size; 1497 PyObject *copy; 1498 void *data; 1499 1500 if (!PyUnicode_Check(unicode)) { 1501 PyErr_BadInternalCall(); 1502 return NULL; 1503 } 1504 if (PyUnicode_READY(unicode)) 1505 return NULL; 1506 1507 size = PyUnicode_GET_LENGTH(unicode); 1508 copy = PyUnicode_New(size, PyUnicode_MAX_CHAR_VALUE(unicode)); 1509 if (!copy) 1510 return NULL; 1511 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode)); 1512 1513 data = PyUnicode_DATA(unicode); 1514 switch (PyUnicode_KIND(unicode)) 1515 { 1516 case PyUnicode_1BYTE_KIND: 1517 memcpy(PyUnicode_1BYTE_DATA(copy), data, size); 1518 break; 1519 case PyUnicode_2BYTE_KIND: 1520 memcpy(PyUnicode_2BYTE_DATA(copy), data, sizeof(Py_UCS2) * size); 1521 break; 1522 case PyUnicode_4BYTE_KIND: 1523 memcpy(PyUnicode_4BYTE_DATA(copy), data, sizeof(Py_UCS4) * size); 1524 break; 1525 default: 1526 assert(0); 1527 break; 1528 } 1529 return copy; 1530} 1531 1532 1533/* Widen Unicode objects to larger buffers. Don't write terminating null 1534 character. Return NULL on error. 
*/ 1535 1536void* 1537_PyUnicode_AsKind(PyObject *s, unsigned int kind) 1538{ 1539 Py_ssize_t len; 1540 void *result; 1541 unsigned int skind; 1542 1543 if (PyUnicode_READY(s)) 1544 return NULL; 1545 1546 len = PyUnicode_GET_LENGTH(s); 1547 skind = PyUnicode_KIND(s); 1548 if (skind >= kind) { 1549 PyErr_SetString(PyExc_RuntimeError, "invalid widening attempt"); 1550 return NULL; 1551 } 1552 switch(kind) { 1553 case PyUnicode_2BYTE_KIND: 1554 result = PyMem_Malloc(len * sizeof(Py_UCS2)); 1555 if (!result) 1556 return PyErr_NoMemory(); 1557 assert(skind == PyUnicode_1BYTE_KIND); 1558 _PyUnicode_CONVERT_BYTES( 1559 Py_UCS1, Py_UCS2, 1560 PyUnicode_1BYTE_DATA(s), 1561 PyUnicode_1BYTE_DATA(s) + len, 1562 result); 1563 return result; 1564 case PyUnicode_4BYTE_KIND: 1565 result = PyMem_Malloc(len * sizeof(Py_UCS4)); 1566 if (!result) 1567 return PyErr_NoMemory(); 1568 if (skind == PyUnicode_2BYTE_KIND) { 1569 _PyUnicode_CONVERT_BYTES( 1570 Py_UCS2, Py_UCS4, 1571 PyUnicode_2BYTE_DATA(s), 1572 PyUnicode_2BYTE_DATA(s) + len, 1573 result); 1574 } 1575 else { 1576 assert(skind == PyUnicode_1BYTE_KIND); 1577 _PyUnicode_CONVERT_BYTES( 1578 Py_UCS1, Py_UCS4, 1579 PyUnicode_1BYTE_DATA(s), 1580 PyUnicode_1BYTE_DATA(s) + len, 1581 result); 1582 } 1583 return result; 1584 default: 1585 break; 1586 } 1587 PyErr_SetString(PyExc_ValueError, "invalid kind"); 1588 return NULL; 1589} 1590 1591static Py_UCS4* 1592as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize, 1593 int copy_null) 1594{ 1595 int kind; 1596 void *data; 1597 Py_ssize_t len, targetlen; 1598 if (PyUnicode_READY(string) == -1) 1599 return NULL; 1600 kind = PyUnicode_KIND(string); 1601 data = PyUnicode_DATA(string); 1602 len = PyUnicode_GET_LENGTH(string); 1603 targetlen = len; 1604 if (copy_null) 1605 targetlen++; 1606 if (!target) { 1607 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) { 1608 PyErr_NoMemory(); 1609 return NULL; 1610 } 1611 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4)); 1612 if (!target) { 
1613 PyErr_NoMemory(); 1614 return NULL; 1615 } 1616 } 1617 else { 1618 if (targetsize < targetlen) { 1619 PyErr_Format(PyExc_SystemError, 1620 "string is longer than the buffer"); 1621 if (copy_null && 0 < targetsize) 1622 target[0] = 0; 1623 return NULL; 1624 } 1625 } 1626 if (kind != PyUnicode_4BYTE_KIND) { 1627 Py_ssize_t i; 1628 for (i = 0; i < len; i++) 1629 target[i] = PyUnicode_READ(kind, data, i); 1630 } 1631 else 1632 Py_MEMCPY(target, data, len * sizeof(Py_UCS4)); 1633 if (copy_null) 1634 target[len] = 0; 1635 return target; 1636} 1637 1638Py_UCS4* 1639PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize, 1640 int copy_null) 1641{ 1642 if (target == NULL || targetsize < 1) { 1643 PyErr_BadInternalCall(); 1644 return NULL; 1645 } 1646 return as_ucs4(string, target, targetsize, copy_null); 1647} 1648 1649Py_UCS4* 1650PyUnicode_AsUCS4Copy(PyObject *string) 1651{ 1652 return as_ucs4(string, NULL, 0, 1); 1653} 1654 1655#ifdef HAVE_WCHAR_H 1656 1657PyObject * 1658PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size) 1659{ 1660 if (w == NULL) { 1661 if (size == 0) 1662 return PyUnicode_New(0, 0); 1663 PyErr_BadInternalCall(); 1664 return NULL; 1665 } 1666 1667 if (size == -1) { 1668 size = wcslen(w); 1669 } 1670 1671 return PyUnicode_FromUnicode(w, size); 1672} 1673 1674#endif /* HAVE_WCHAR_H */ 1675 1676static void 1677makefmt(char *fmt, int longflag, int longlongflag, int size_tflag, 1678 int zeropad, int width, int precision, char c) 1679{ 1680 *fmt++ = '%'; 1681 if (width) { 1682 if (zeropad) 1683 *fmt++ = '0'; 1684 fmt += sprintf(fmt, "%d", width); 1685 } 1686 if (precision) 1687 fmt += sprintf(fmt, ".%d", precision); 1688 if (longflag) 1689 *fmt++ = 'l'; 1690 else if (longlongflag) { 1691 /* longlongflag should only ever be nonzero on machines with 1692 HAVE_LONG_LONG defined */ 1693#ifdef HAVE_LONG_LONG 1694 char *f = PY_FORMAT_LONG_LONG; 1695 while (*f) 1696 *fmt++ = *f++; 1697#else 1698 /* we shouldn't ever get here */ 
1699 assert(0); 1700 *fmt++ = 'l'; 1701#endif 1702 } 1703 else if (size_tflag) { 1704 char *f = PY_FORMAT_SIZE_T; 1705 while (*f) 1706 *fmt++ = *f++; 1707 } 1708 *fmt++ = c; 1709 *fmt = '\0'; 1710} 1711 1712/* helper for PyUnicode_FromFormatV() */ 1713 1714static const char* 1715parse_format_flags(const char *f, 1716 int *p_width, int *p_precision, 1717 int *p_longflag, int *p_longlongflag, int *p_size_tflag) 1718{ 1719 int width, precision, longflag, longlongflag, size_tflag; 1720 1721 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */ 1722 f++; 1723 width = 0; 1724 while (Py_ISDIGIT((unsigned)*f)) 1725 width = (width*10) + *f++ - '0'; 1726 precision = 0; 1727 if (*f == '.') { 1728 f++; 1729 while (Py_ISDIGIT((unsigned)*f)) 1730 precision = (precision*10) + *f++ - '0'; 1731 if (*f == '%') { 1732 /* "%.3%s" => f points to "3" */ 1733 f--; 1734 } 1735 } 1736 if (*f == '\0') { 1737 /* bogus format "%.1" => go backward, f points to "1" */ 1738 f--; 1739 } 1740 if (p_width != NULL) 1741 *p_width = width; 1742 if (p_precision != NULL) 1743 *p_precision = precision; 1744 1745 /* Handle %ld, %lu, %lld and %llu. */ 1746 longflag = 0; 1747 longlongflag = 0; 1748 size_tflag = 0; 1749 1750 if (*f == 'l') { 1751 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') { 1752 longflag = 1; 1753 ++f; 1754 } 1755#ifdef HAVE_LONG_LONG 1756 else if (f[1] == 'l' && 1757 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) { 1758 longlongflag = 1; 1759 f += 2; 1760 } 1761#endif 1762 } 1763 /* handle the size_t flag. */ 1764 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) { 1765 size_tflag = 1; 1766 ++f; 1767 } 1768 if (p_longflag != NULL) 1769 *p_longflag = longflag; 1770 if (p_longlongflag != NULL) 1771 *p_longlongflag = longlongflag; 1772 if (p_size_tflag != NULL) 1773 *p_size_tflag = size_tflag; 1774 return f; 1775} 1776 1777/* maximum number of characters required for output of %ld. 
21 characters 1778 allows for 64-bit integers (in decimal) and an optional sign. */ 1779#define MAX_LONG_CHARS 21 1780/* maximum number of characters required for output of %lld. 1781 We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits, 1782 plus 1 for the sign. 53/22 is an upper bound for log10(256). */ 1783#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22) 1784 1785PyObject * 1786PyUnicode_FromFormatV(const char *format, va_list vargs) 1787{ 1788 va_list count; 1789 Py_ssize_t callcount = 0; 1790 PyObject **callresults = NULL; 1791 PyObject **callresult = NULL; 1792 Py_ssize_t n = 0; 1793 int width = 0; 1794 int precision = 0; 1795 int zeropad; 1796 const char* f; 1797 PyUnicodeObject *string; 1798 /* used by sprintf */ 1799 char fmt[61]; /* should be enough for %0width.precisionlld */ 1800 Py_UCS4 maxchar = 127; /* result is ASCII by default */ 1801 Py_UCS4 argmaxchar; 1802 Py_ssize_t numbersize = 0; 1803 char *numberresults = NULL; 1804 char *numberresult = NULL; 1805 Py_ssize_t i; 1806 int kind; 1807 void *data; 1808 1809 Py_VA_COPY(count, vargs); 1810 /* step 1: count the number of %S/%R/%A/%s format specifications 1811 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/ 1812 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the 1813 * result in an array) 1814 * also esimate a upper bound for all the number formats in the string, 1815 * numbers will be formated in step 3 and be keept in a '\0'-separated 1816 * buffer before putting everything together. */ 1817 for (f = format; *f; f++) { 1818 if (*f == '%') { 1819 int longlongflag; 1820 /* skip width or width.precision (eg. 
"1.2" of "%1.2f") */ 1821 f = parse_format_flags(f, &width, NULL, NULL, &longlongflag, NULL); 1822 if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V') 1823 ++callcount; 1824 1825 else if (*f == 'd' || *f=='u' || *f=='i' || *f=='x' || *f=='p') { 1826#ifdef HAVE_LONG_LONG 1827 if (longlongflag) { 1828 if (width < MAX_LONG_LONG_CHARS) 1829 width = MAX_LONG_LONG_CHARS; 1830 } 1831 else 1832#endif 1833 /* MAX_LONG_CHARS is enough to hold a 64-bit integer, 1834 including sign. Decimal takes the most space. This 1835 isn't enough for octal. If a width is specified we 1836 need more (which we allocate later). */ 1837 if (width < MAX_LONG_CHARS) 1838 width = MAX_LONG_CHARS; 1839 1840 /* account for the size + '\0' to separate numbers 1841 inside of the numberresults buffer */ 1842 numbersize += (width + 1); 1843 } 1844 } 1845 else if ((unsigned char)*f > 127) { 1846 PyErr_Format(PyExc_ValueError, 1847 "PyUnicode_FromFormatV() expects an ASCII-encoded format " 1848 "string, got a non-ASCII byte: 0x%02x", 1849 (unsigned char)*f); 1850 return NULL; 1851 } 1852 } 1853 /* step 2: allocate memory for the results of 1854 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */ 1855 if (callcount) { 1856 callresults = PyObject_Malloc(sizeof(PyObject *) * callcount); 1857 if (!callresults) { 1858 PyErr_NoMemory(); 1859 return NULL; 1860 } 1861 callresult = callresults; 1862 } 1863 /* step 2.5: allocate memory for the results of formating numbers */ 1864 if (numbersize) { 1865 numberresults = PyObject_Malloc(numbersize); 1866 if (!numberresults) { 1867 PyErr_NoMemory(); 1868 goto fail; 1869 } 1870 numberresult = numberresults; 1871 } 1872 1873 /* step 3: format numbers and figure out how large a buffer we need */ 1874 for (f = format; *f; f++) { 1875 if (*f == '%') { 1876 const char* p; 1877 int longflag; 1878 int longlongflag; 1879 int size_tflag; 1880 int numprinted; 1881 1882 p = f; 1883 zeropad = (f[1] == '0'); 1884 f = parse_format_flags(f, &width, &precision, 
1885 &longflag, &longlongflag, &size_tflag); 1886 switch (*f) { 1887 case 'c': 1888 { 1889 Py_UCS4 ordinal = va_arg(count, int); 1890 maxchar = Py_MAX(maxchar, ordinal); 1891 n++; 1892 break; 1893 } 1894 case '%': 1895 n++; 1896 break; 1897 case 'i': 1898 case 'd': 1899 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad, 1900 width, precision, *f); 1901 if (longflag) 1902 numprinted = sprintf(numberresult, fmt, 1903 va_arg(count, long)); 1904#ifdef HAVE_LONG_LONG 1905 else if (longlongflag) 1906 numprinted = sprintf(numberresult, fmt, 1907 va_arg(count, PY_LONG_LONG)); 1908#endif 1909 else if (size_tflag) 1910 numprinted = sprintf(numberresult, fmt, 1911 va_arg(count, Py_ssize_t)); 1912 else 1913 numprinted = sprintf(numberresult, fmt, 1914 va_arg(count, int)); 1915 n += numprinted; 1916 /* advance by +1 to skip over the '\0' */ 1917 numberresult += (numprinted + 1); 1918 assert(*(numberresult - 1) == '\0'); 1919 assert(*(numberresult - 2) != '\0'); 1920 assert(numprinted >= 0); 1921 assert(numberresult <= numberresults + numbersize); 1922 break; 1923 case 'u': 1924 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad, 1925 width, precision, 'u'); 1926 if (longflag) 1927 numprinted = sprintf(numberresult, fmt, 1928 va_arg(count, unsigned long)); 1929#ifdef HAVE_LONG_LONG 1930 else if (longlongflag) 1931 numprinted = sprintf(numberresult, fmt, 1932 va_arg(count, unsigned PY_LONG_LONG)); 1933#endif 1934 else if (size_tflag) 1935 numprinted = sprintf(numberresult, fmt, 1936 va_arg(count, size_t)); 1937 else 1938 numprinted = sprintf(numberresult, fmt, 1939 va_arg(count, unsigned int)); 1940 n += numprinted; 1941 numberresult += (numprinted + 1); 1942 assert(*(numberresult - 1) == '\0'); 1943 assert(*(numberresult - 2) != '\0'); 1944 assert(numprinted >= 0); 1945 assert(numberresult <= numberresults + numbersize); 1946 break; 1947 case 'x': 1948 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x'); 1949 numprinted = sprintf(numberresult, fmt, va_arg(count, 
int)); 1950 n += numprinted; 1951 numberresult += (numprinted + 1); 1952 assert(*(numberresult - 1) == '\0'); 1953 assert(*(numberresult - 2) != '\0'); 1954 assert(numprinted >= 0); 1955 assert(numberresult <= numberresults + numbersize); 1956 break; 1957 case 'p': 1958 numprinted = sprintf(numberresult, "%p", va_arg(count, void*)); 1959 /* %p is ill-defined: ensure leading 0x. */ 1960 if (numberresult[1] == 'X') 1961 numberresult[1] = 'x'; 1962 else if (numberresult[1] != 'x') { 1963 memmove(numberresult + 2, numberresult, 1964 strlen(numberresult) + 1); 1965 numberresult[0] = '0'; 1966 numberresult[1] = 'x'; 1967 numprinted += 2; 1968 } 1969 n += numprinted; 1970 numberresult += (numprinted + 1); 1971 assert(*(numberresult - 1) == '\0'); 1972 assert(*(numberresult - 2) != '\0'); 1973 assert(numprinted >= 0); 1974 assert(numberresult <= numberresults + numbersize); 1975 break; 1976 case 's': 1977 { 1978 /* UTF-8 */ 1979 const char *s = va_arg(count, const char*); 1980 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace"); 1981 if (!str) 1982 goto fail; 1983 /* since PyUnicode_DecodeUTF8 returns already flexible 1984 unicode objects, there is no need to call ready on them */ 1985 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str); 1986 maxchar = Py_MAX(maxchar, argmaxchar); 1987 n += PyUnicode_GET_LENGTH(str); 1988 /* Remember the str and switch to the next slot */ 1989 *callresult++ = str; 1990 break; 1991 } 1992 case 'U': 1993 { 1994 PyObject *obj = va_arg(count, PyObject *); 1995 assert(obj && _PyUnicode_CHECK(obj)); 1996 if (PyUnicode_READY(obj) == -1) 1997 goto fail; 1998 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj); 1999 maxchar = Py_MAX(maxchar, argmaxchar); 2000 n += PyUnicode_GET_LENGTH(obj); 2001 break; 2002 } 2003 case 'V': 2004 { 2005 PyObject *obj = va_arg(count, PyObject *); 2006 const char *str = va_arg(count, const char *); 2007 PyObject *str_obj; 2008 assert(obj || str); 2009 assert(!obj || _PyUnicode_CHECK(obj)); 2010 if (obj) { 2011 if 
(PyUnicode_READY(obj) == -1) 2012 goto fail; 2013 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj); 2014 maxchar = Py_MAX(maxchar, argmaxchar); 2015 n += PyUnicode_GET_LENGTH(obj); 2016 *callresult++ = NULL; 2017 } 2018 else { 2019 str_obj = PyUnicode_DecodeUTF8(str, strlen(str), "replace"); 2020 if (!str_obj) 2021 goto fail; 2022 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str_obj); 2023 maxchar = Py_MAX(maxchar, argmaxchar); 2024 n += PyUnicode_GET_LENGTH(str_obj); 2025 *callresult++ = str_obj; 2026 } 2027 break; 2028 } 2029 case 'S': 2030 { 2031 PyObject *obj = va_arg(count, PyObject *); 2032 PyObject *str; 2033 assert(obj); 2034 str = PyObject_Str(obj); 2035 if (!str || PyUnicode_READY(str) == -1) 2036 goto fail; 2037 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str); 2038 maxchar = Py_MAX(maxchar, argmaxchar); 2039 n += PyUnicode_GET_LENGTH(str); 2040 /* Remember the str and switch to the next slot */ 2041 *callresult++ = str; 2042 break; 2043 } 2044 case 'R': 2045 { 2046 PyObject *obj = va_arg(count, PyObject *); 2047 PyObject *repr; 2048 assert(obj); 2049 repr = PyObject_Repr(obj); 2050 if (!repr || PyUnicode_READY(repr) == -1) 2051 goto fail; 2052 argmaxchar = PyUnicode_MAX_CHAR_VALUE(repr); 2053 maxchar = Py_MAX(maxchar, argmaxchar); 2054 n += PyUnicode_GET_LENGTH(repr); 2055 /* Remember the repr and switch to the next slot */ 2056 *callresult++ = repr; 2057 break; 2058 } 2059 case 'A': 2060 { 2061 PyObject *obj = va_arg(count, PyObject *); 2062 PyObject *ascii; 2063 assert(obj); 2064 ascii = PyObject_ASCII(obj); 2065 if (!ascii || PyUnicode_READY(ascii) == -1) 2066 goto fail; 2067 argmaxchar = PyUnicode_MAX_CHAR_VALUE(ascii); 2068 maxchar = Py_MAX(maxchar, argmaxchar); 2069 n += PyUnicode_GET_LENGTH(ascii); 2070 /* Remember the repr and switch to the next slot */ 2071 *callresult++ = ascii; 2072 break; 2073 } 2074 default: 2075 /* if we stumble upon an unknown 2076 formatting code, copy the rest of 2077 the format string to the output 2078 string. 
(we cannot just skip the 2079 code, since there's no way to know 2080 what's in the argument list) */ 2081 n += strlen(p); 2082 goto expand; 2083 } 2084 } else 2085 n++; 2086 } 2087 expand: 2088 /* step 4: fill the buffer */ 2089 /* Since we've analyzed how much space we need, 2090 we don't have to resize the string. 2091 There can be no errors beyond this point. */ 2092 string = (PyUnicodeObject *)PyUnicode_New(n, maxchar); 2093 if (!string) 2094 goto fail; 2095 kind = PyUnicode_KIND(string); 2096 data = PyUnicode_DATA(string); 2097 callresult = callresults; 2098 numberresult = numberresults; 2099 2100 for (i = 0, f = format; *f; f++) { 2101 if (*f == '%') { 2102 const char* p; 2103 2104 p = f; 2105 f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL); 2106 /* checking for == because the last argument could be a empty 2107 string, which causes i to point to end, the assert at the end of 2108 the loop */ 2109 assert(i <= PyUnicode_GET_LENGTH(string)); 2110 2111 switch (*f) { 2112 case 'c': 2113 { 2114 const int ordinal = va_arg(vargs, int); 2115 PyUnicode_WRITE(kind, data, i++, ordinal); 2116 break; 2117 } 2118 case 'i': 2119 case 'd': 2120 case 'u': 2121 case 'x': 2122 case 'p': 2123 /* unused, since we already have the result */ 2124 if (*f == 'p') 2125 (void) va_arg(vargs, void *); 2126 else 2127 (void) va_arg(vargs, int); 2128 /* extract the result from numberresults and append. 
*/ 2129 for (; *numberresult; ++i, ++numberresult) 2130 PyUnicode_WRITE(kind, data, i, *numberresult); 2131 /* skip over the separating '\0' */ 2132 assert(*numberresult == '\0'); 2133 numberresult++; 2134 assert(numberresult <= numberresults + numbersize); 2135 break; 2136 case 's': 2137 { 2138 /* unused, since we already have the result */ 2139 Py_ssize_t size; 2140 (void) va_arg(vargs, char *); 2141 size = PyUnicode_GET_LENGTH(*callresult); 2142 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string)); 2143 if (PyUnicode_CopyCharacters((PyObject*)string, i, 2144 *callresult, 0, 2145 size) < 0) 2146 goto fail; 2147 i += size; 2148 /* We're done with the unicode()/repr() => forget it */ 2149 Py_DECREF(*callresult); 2150 /* switch to next unicode()/repr() result */ 2151 ++callresult; 2152 break; 2153 } 2154 case 'U': 2155 { 2156 PyObject *obj = va_arg(vargs, PyObject *); 2157 Py_ssize_t size; 2158 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string)); 2159 size = PyUnicode_GET_LENGTH(obj); 2160 if (PyUnicode_CopyCharacters((PyObject*)string, i, 2161 obj, 0, 2162 size) < 0) 2163 goto fail; 2164 i += size; 2165 break; 2166 } 2167 case 'V': 2168 { 2169 Py_ssize_t size; 2170 PyObject *obj = va_arg(vargs, PyObject *); 2171 va_arg(vargs, const char *); 2172 if (obj) { 2173 size = PyUnicode_GET_LENGTH(obj); 2174 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string)); 2175 if (PyUnicode_CopyCharacters((PyObject*)string, i, 2176 obj, 0, 2177 size) < 0) 2178 goto fail; 2179 i += size; 2180 } else { 2181 size = PyUnicode_GET_LENGTH(*callresult); 2182 assert(PyUnicode_KIND(*callresult) <= 2183 PyUnicode_KIND(string)); 2184 if (PyUnicode_CopyCharacters((PyObject*)string, i, 2185 *callresult, 2186 0, size) < 0) 2187 goto fail; 2188 i += size; 2189 Py_DECREF(*callresult); 2190 } 2191 ++callresult; 2192 break; 2193 } 2194 case 'S': 2195 case 'R': 2196 case 'A': 2197 { 2198 /* unused, since we already have the result */ 2199 (void) va_arg(vargs, PyObject *); 2200 
assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string)); 2201 if (PyUnicode_CopyCharacters((PyObject*)string, i, 2202 *callresult, 0, 2203 PyUnicode_GET_LENGTH(*callresult)) < 0) 2204 goto fail; 2205 i += PyUnicode_GET_LENGTH(*callresult); 2206 /* We're done with the unicode()/repr() => forget it */ 2207 Py_DECREF(*callresult); 2208 /* switch to next unicode()/repr() result */ 2209 ++callresult; 2210 break; 2211 } 2212 case '%': 2213 PyUnicode_WRITE(kind, data, i++, '%'); 2214 break; 2215 default: 2216 for (; *p; ++p, ++i) 2217 PyUnicode_WRITE(kind, data, i, *p); 2218 assert(i == PyUnicode_GET_LENGTH(string)); 2219 goto end; 2220 } 2221 } 2222 else { 2223 assert(i < PyUnicode_GET_LENGTH(string)); 2224 PyUnicode_WRITE(kind, data, i++, *f); 2225 } 2226 } 2227 assert(i == PyUnicode_GET_LENGTH(string)); 2228 2229 end: 2230 if (callresults) 2231 PyObject_Free(callresults); 2232 if (numberresults) 2233 PyObject_Free(numberresults); 2234 return (PyObject *)string; 2235 fail: 2236 if (callresults) { 2237 PyObject **callresult2 = callresults; 2238 while (callresult2 < callresult) { 2239 Py_XDECREF(*callresult2); 2240 ++callresult2; 2241 } 2242 PyObject_Free(callresults); 2243 } 2244 if (numberresults) 2245 PyObject_Free(numberresults); 2246 return NULL; 2247} 2248 2249PyObject * 2250PyUnicode_FromFormat(const char *format, ...) 2251{ 2252 PyObject* ret; 2253 va_list vargs; 2254 2255#ifdef HAVE_STDARG_PROTOTYPES 2256 va_start(vargs, format); 2257#else 2258 va_start(vargs); 2259#endif 2260 ret = PyUnicode_FromFormatV(format, vargs); 2261 va_end(vargs); 2262 return ret; 2263} 2264 2265#ifdef HAVE_WCHAR_H 2266 2267/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString(): 2268 convert a Unicode object to a wide character string. 2269 2270 - If w is NULL: return the number of wide characters (including the null 2271 character) required to convert the unicode object. Ignore size argument. 
2272 2273 - Otherwise: return the number of wide characters (excluding the null 2274 character) written into w. Write at most size wide characters (including 2275 the null character). */ 2276static Py_ssize_t 2277unicode_aswidechar(PyUnicodeObject *unicode, 2278 wchar_t *w, 2279 Py_ssize_t size) 2280{ 2281 Py_ssize_t res; 2282 const wchar_t *wstr; 2283 2284 wstr = PyUnicode_AsUnicodeAndSize((PyObject *)unicode, &res); 2285 if (wstr == NULL) 2286 return -1; 2287 2288 if (w != NULL) { 2289 if (size > res) 2290 size = res + 1; 2291 else 2292 res = size; 2293 Py_MEMCPY(w, wstr, size * sizeof(wchar_t)); 2294 return res; 2295 } 2296 else 2297 return res + 1; 2298} 2299 2300Py_ssize_t 2301PyUnicode_AsWideChar(PyObject *unicode, 2302 wchar_t *w, 2303 Py_ssize_t size) 2304{ 2305 if (unicode == NULL) { 2306 PyErr_BadInternalCall(); 2307 return -1; 2308 } 2309 return unicode_aswidechar((PyUnicodeObject*)unicode, w, size); 2310} 2311 2312wchar_t* 2313PyUnicode_AsWideCharString(PyObject *unicode, 2314 Py_ssize_t *size) 2315{ 2316 wchar_t* buffer; 2317 Py_ssize_t buflen; 2318 2319 if (unicode == NULL) { 2320 PyErr_BadInternalCall(); 2321 return NULL; 2322 } 2323 2324 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, NULL, 0); 2325 if (buflen == -1) 2326 return NULL; 2327 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) { 2328 PyErr_NoMemory(); 2329 return NULL; 2330 } 2331 2332 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t)); 2333 if (buffer == NULL) { 2334 PyErr_NoMemory(); 2335 return NULL; 2336 } 2337 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, buffer, buflen); 2338 if (buflen == -1) 2339 return NULL; 2340 if (size != NULL) 2341 *size = buflen; 2342 return buffer; 2343} 2344 2345#endif /* HAVE_WCHAR_H */ 2346 2347PyObject * 2348PyUnicode_FromOrdinal(int ordinal) 2349{ 2350 PyObject *v; 2351 if (ordinal < 0 || ordinal > 0x10ffff) { 2352 PyErr_SetString(PyExc_ValueError, 2353 "chr() arg not in range(0x110000)"); 2354 return NULL; 2355 } 2356 2357 if (ordinal < 
256) 2358 return get_latin1_char(ordinal); 2359 2360 v = PyUnicode_New(1, ordinal); 2361 if (v == NULL) 2362 return NULL; 2363 PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal); 2364 return v; 2365} 2366 2367PyObject * 2368PyUnicode_FromObject(register PyObject *obj) 2369{ 2370 /* XXX Perhaps we should make this API an alias of 2371 PyObject_Str() instead ?! */ 2372 if (PyUnicode_CheckExact(obj)) { 2373 if (PyUnicode_READY(obj)) 2374 return NULL; 2375 Py_INCREF(obj); 2376 return obj; 2377 } 2378 if (PyUnicode_Check(obj)) { 2379 /* For a Unicode subtype that's not a Unicode object, 2380 return a true Unicode object with the same data. */ 2381 return PyUnicode_Copy(obj); 2382 } 2383 PyErr_Format(PyExc_TypeError, 2384 "Can't convert '%.100s' object to str implicitly", 2385 Py_TYPE(obj)->tp_name); 2386 return NULL; 2387} 2388 2389PyObject * 2390PyUnicode_FromEncodedObject(register PyObject *obj, 2391 const char *encoding, 2392 const char *errors) 2393{ 2394 Py_buffer buffer; 2395 PyObject *v; 2396 2397 if (obj == NULL) { 2398 PyErr_BadInternalCall(); 2399 return NULL; 2400 } 2401 2402 /* Decoding bytes objects is the most common case and should be fast */ 2403 if (PyBytes_Check(obj)) { 2404 if (PyBytes_GET_SIZE(obj) == 0) { 2405 Py_INCREF(unicode_empty); 2406 v = unicode_empty; 2407 } 2408 else { 2409 v = PyUnicode_Decode( 2410 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj), 2411 encoding, errors); 2412 } 2413 return v; 2414 } 2415 2416 if (PyUnicode_Check(obj)) { 2417 PyErr_SetString(PyExc_TypeError, 2418 "decoding str is not supported"); 2419 return NULL; 2420 } 2421 2422 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */ 2423 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) { 2424 PyErr_Format(PyExc_TypeError, 2425 "coercing to str: need bytes, bytearray " 2426 "or buffer-like object, %.80s found", 2427 Py_TYPE(obj)->tp_name); 2428 return NULL; 2429 } 2430 2431 if (buffer.len == 0) { 2432 Py_INCREF(unicode_empty); 2433 v = 
unicode_empty; 2434 } 2435 else 2436 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors); 2437 2438 PyBuffer_Release(&buffer); 2439 return v; 2440} 2441 2442/* Convert encoding to lower case and replace '_' with '-' in order to 2443 catch e.g. UTF_8. Return 0 on error (encoding is longer than lower_len-1), 2444 1 on success. */ 2445static int 2446normalize_encoding(const char *encoding, 2447 char *lower, 2448 size_t lower_len) 2449{ 2450 const char *e; 2451 char *l; 2452 char *l_end; 2453 2454 e = encoding; 2455 l = lower; 2456 l_end = &lower[lower_len - 1]; 2457 while (*e) { 2458 if (l == l_end) 2459 return 0; 2460 if (Py_ISUPPER(*e)) { 2461 *l++ = Py_TOLOWER(*e++); 2462 } 2463 else if (*e == '_') { 2464 *l++ = '-'; 2465 e++; 2466 } 2467 else { 2468 *l++ = *e++; 2469 } 2470 } 2471 *l = '\0'; 2472 return 1; 2473} 2474 2475PyObject * 2476PyUnicode_Decode(const char *s, 2477 Py_ssize_t size, 2478 const char *encoding, 2479 const char *errors) 2480{ 2481 PyObject *buffer = NULL, *unicode; 2482 Py_buffer info; 2483 char lower[11]; /* Enough for any encoding shortcut */ 2484 2485 if (encoding == NULL) 2486 return PyUnicode_DecodeUTF8(s, size, errors); 2487 2488 /* Shortcuts for common default encodings */ 2489 if (normalize_encoding(encoding, lower, sizeof(lower))) { 2490 if ((strcmp(lower, "utf-8") == 0) || 2491 (strcmp(lower, "utf8") == 0)) 2492 return PyUnicode_DecodeUTF8(s, size, errors); 2493 else if ((strcmp(lower, "latin-1") == 0) || 2494 (strcmp(lower, "latin1") == 0) || 2495 (strcmp(lower, "iso-8859-1") == 0)) 2496 return PyUnicode_DecodeLatin1(s, size, errors); 2497#ifdef HAVE_MBCS 2498 else if (strcmp(lower, "mbcs") == 0) 2499 return PyUnicode_DecodeMBCS(s, size, errors); 2500#endif 2501 else if (strcmp(lower, "ascii") == 0) 2502 return PyUnicode_DecodeASCII(s, size, errors); 2503 else if (strcmp(lower, "utf-16") == 0) 2504 return PyUnicode_DecodeUTF16(s, size, errors, 0); 2505 else if (strcmp(lower, "utf-32") == 0) 2506 return 
PyUnicode_DecodeUTF32(s, size, errors, 0); 2507 } 2508 2509 /* Decode via the codec registry */ 2510 buffer = NULL; 2511 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0) 2512 goto onError; 2513 buffer = PyMemoryView_FromBuffer(&info); 2514 if (buffer == NULL) 2515 goto onError; 2516 unicode = PyCodec_Decode(buffer, encoding, errors); 2517 if (unicode == NULL) 2518 goto onError; 2519 if (!PyUnicode_Check(unicode)) { 2520 PyErr_Format(PyExc_TypeError, 2521 "decoder did not return a str object (type=%.400s)", 2522 Py_TYPE(unicode)->tp_name); 2523 Py_DECREF(unicode); 2524 goto onError; 2525 } 2526 Py_DECREF(buffer); 2527 if (PyUnicode_READY(unicode)) { 2528 Py_DECREF(unicode); 2529 return NULL; 2530 } 2531 return unicode; 2532 2533 onError: 2534 Py_XDECREF(buffer); 2535 return NULL; 2536} 2537 2538PyObject * 2539PyUnicode_AsDecodedObject(PyObject *unicode, 2540 const char *encoding, 2541 const char *errors) 2542{ 2543 PyObject *v; 2544 2545 if (!PyUnicode_Check(unicode)) { 2546 PyErr_BadArgument(); 2547 goto onError; 2548 } 2549 2550 if (encoding == NULL) 2551 encoding = PyUnicode_GetDefaultEncoding(); 2552 2553 /* Decode via the codec registry */ 2554 v = PyCodec_Decode(unicode, encoding, errors); 2555 if (v == NULL) 2556 goto onError; 2557 return v; 2558 2559 onError: 2560 return NULL; 2561} 2562 2563PyObject * 2564PyUnicode_AsDecodedUnicode(PyObject *unicode, 2565 const char *encoding, 2566 const char *errors) 2567{ 2568 PyObject *v; 2569 2570 if (!PyUnicode_Check(unicode)) { 2571 PyErr_BadArgument(); 2572 goto onError; 2573 } 2574 2575 if (encoding == NULL) 2576 encoding = PyUnicode_GetDefaultEncoding(); 2577 2578 /* Decode via the codec registry */ 2579 v = PyCodec_Decode(unicode, encoding, errors); 2580 if (v == NULL) 2581 goto onError; 2582 if (!PyUnicode_Check(v)) { 2583 PyErr_Format(PyExc_TypeError, 2584 "decoder did not return a str object (type=%.400s)", 2585 Py_TYPE(v)->tp_name); 2586 Py_DECREF(v); 2587 goto onError; 2588 } 2589 
return v; 2590 2591 onError: 2592 return NULL; 2593} 2594 2595PyObject * 2596PyUnicode_Encode(const Py_UNICODE *s, 2597 Py_ssize_t size, 2598 const char *encoding, 2599 const char *errors) 2600{ 2601 PyObject *v, *unicode; 2602 2603 unicode = PyUnicode_FromUnicode(s, size); 2604 if (unicode == NULL) 2605 return NULL; 2606 v = PyUnicode_AsEncodedString(unicode, encoding, errors); 2607 Py_DECREF(unicode); 2608 return v; 2609} 2610 2611PyObject * 2612PyUnicode_AsEncodedObject(PyObject *unicode, 2613 const char *encoding, 2614 const char *errors) 2615{ 2616 PyObject *v; 2617 2618 if (!PyUnicode_Check(unicode)) { 2619 PyErr_BadArgument(); 2620 goto onError; 2621 } 2622 2623 if (encoding == NULL) 2624 encoding = PyUnicode_GetDefaultEncoding(); 2625 2626 /* Encode via the codec registry */ 2627 v = PyCodec_Encode(unicode, encoding, errors); 2628 if (v == NULL) 2629 goto onError; 2630 return v; 2631 2632 onError: 2633 return NULL; 2634} 2635 2636PyObject * 2637PyUnicode_EncodeFSDefault(PyObject *unicode) 2638{ 2639#ifdef HAVE_MBCS 2640 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode), 2641 PyUnicode_GET_SIZE(unicode), 2642 NULL); 2643#elif defined(__APPLE__) 2644 return _PyUnicode_AsUTF8String(unicode, "surrogateescape"); 2645#else 2646 PyInterpreterState *interp = PyThreadState_GET()->interp; 2647 /* Bootstrap check: if the filesystem codec is implemented in Python, we 2648 cannot use it to encode and decode filenames before it is loaded. Load 2649 the Python codec requires to encode at least its own filename. Use the C 2650 version of the locale codec until the codec registry is initialized and 2651 the Python codec is loaded. 2652 2653 Py_FileSystemDefaultEncoding is shared between all interpreters, we 2654 cannot only rely on it: check also interp->fscodec_initialized for 2655 subinterpreters. 
*/ 2656 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) { 2657 return PyUnicode_AsEncodedString(unicode, 2658 Py_FileSystemDefaultEncoding, 2659 "surrogateescape"); 2660 } 2661 else { 2662 /* locale encoding with surrogateescape */ 2663 wchar_t *wchar; 2664 char *bytes; 2665 PyObject *bytes_obj; 2666 size_t error_pos; 2667 2668 wchar = PyUnicode_AsWideCharString(unicode, NULL); 2669 if (wchar == NULL) 2670 return NULL; 2671 bytes = _Py_wchar2char(wchar, &error_pos); 2672 if (bytes == NULL) { 2673 if (error_pos != (size_t)-1) { 2674 char *errmsg = strerror(errno); 2675 PyObject *exc = NULL; 2676 if (errmsg == NULL) 2677 errmsg = "Py_wchar2char() failed"; 2678 raise_encode_exception(&exc, 2679 "filesystemencoding", 2680 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode), 2681 error_pos, error_pos+1, 2682 errmsg); 2683 Py_XDECREF(exc); 2684 } 2685 else 2686 PyErr_NoMemory(); 2687 PyMem_Free(wchar); 2688 return NULL; 2689 } 2690 PyMem_Free(wchar); 2691 2692 bytes_obj = PyBytes_FromString(bytes); 2693 PyMem_Free(bytes); 2694 return bytes_obj; 2695 } 2696#endif 2697} 2698 2699PyObject * 2700PyUnicode_AsEncodedString(PyObject *unicode, 2701 const char *encoding, 2702 const char *errors) 2703{ 2704 PyObject *v; 2705 char lower[11]; /* Enough for any encoding shortcut */ 2706 2707 if (!PyUnicode_Check(unicode)) { 2708 PyErr_BadArgument(); 2709 return NULL; 2710 } 2711 2712 if (encoding == NULL) { 2713 if (errors == NULL || strcmp(errors, "strict") == 0) 2714 return _PyUnicode_AsUTF8String(unicode, NULL); 2715 else 2716 return _PyUnicode_AsUTF8String(unicode, errors); 2717 } 2718 2719 /* Shortcuts for common default encodings */ 2720 if (normalize_encoding(encoding, lower, sizeof(lower))) { 2721 if ((strcmp(lower, "utf-8") == 0) || 2722 (strcmp(lower, "utf8") == 0)) 2723 { 2724 if (errors == NULL || strcmp(errors, "strict") == 0) 2725 return _PyUnicode_AsUTF8String(unicode, NULL); 2726 else 2727 return _PyUnicode_AsUTF8String(unicode, errors); 2728 } 
2729 else if ((strcmp(lower, "latin-1") == 0) || 2730 (strcmp(lower, "latin1") == 0) || 2731 (strcmp(lower, "iso-8859-1") == 0)) 2732 return _PyUnicode_AsLatin1String(unicode, errors); 2733#ifdef HAVE_MBCS 2734 else if (strcmp(lower, "mbcs") == 0) 2735 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode), 2736 PyUnicode_GET_SIZE(unicode), 2737 errors); 2738#endif 2739 else if (strcmp(lower, "ascii") == 0) 2740 return _PyUnicode_AsASCIIString(unicode, errors); 2741 } 2742 2743 /* Encode via the codec registry */ 2744 v = PyCodec_Encode(unicode, encoding, errors); 2745 if (v == NULL) 2746 return NULL; 2747 2748 /* The normal path */ 2749 if (PyBytes_Check(v)) 2750 return v; 2751 2752 /* If the codec returns a buffer, raise a warning and convert to bytes */ 2753 if (PyByteArray_Check(v)) { 2754 int error; 2755 PyObject *b; 2756 2757 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1, 2758 "encoder %s returned bytearray instead of bytes", 2759 encoding); 2760 if (error) { 2761 Py_DECREF(v); 2762 return NULL; 2763 } 2764 2765 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v)); 2766 Py_DECREF(v); 2767 return b; 2768 } 2769 2770 PyErr_Format(PyExc_TypeError, 2771 "encoder did not return a bytes object (type=%.400s)", 2772 Py_TYPE(v)->tp_name); 2773 Py_DECREF(v); 2774 return NULL; 2775} 2776 2777PyObject * 2778PyUnicode_AsEncodedUnicode(PyObject *unicode, 2779 const char *encoding, 2780 const char *errors) 2781{ 2782 PyObject *v; 2783 2784 if (!PyUnicode_Check(unicode)) { 2785 PyErr_BadArgument(); 2786 goto onError; 2787 } 2788 2789 if (encoding == NULL) 2790 encoding = PyUnicode_GetDefaultEncoding(); 2791 2792 /* Encode via the codec registry */ 2793 v = PyCodec_Encode(unicode, encoding, errors); 2794 if (v == NULL) 2795 goto onError; 2796 if (!PyUnicode_Check(v)) { 2797 PyErr_Format(PyExc_TypeError, 2798 "encoder did not return an str object (type=%.400s)", 2799 Py_TYPE(v)->tp_name); 2800 Py_DECREF(v); 2801 goto onError; 2802 } 2803 return v; 2804 
2805 onError: 2806 return NULL; 2807} 2808 2809PyObject* 2810PyUnicode_DecodeFSDefault(const char *s) { 2811 Py_ssize_t size = (Py_ssize_t)strlen(s); 2812 return PyUnicode_DecodeFSDefaultAndSize(s, size); 2813} 2814 2815PyObject* 2816PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size) 2817{ 2818#ifdef HAVE_MBCS 2819 return PyUnicode_DecodeMBCS(s, size, NULL); 2820#elif defined(__APPLE__) 2821 return PyUnicode_DecodeUTF8(s, size, "surrogateescape"); 2822#else 2823 PyInterpreterState *interp = PyThreadState_GET()->interp; 2824 /* Bootstrap check: if the filesystem codec is implemented in Python, we 2825 cannot use it to encode and decode filenames before it is loaded. Load 2826 the Python codec requires to encode at least its own filename. Use the C 2827 version of the locale codec until the codec registry is initialized and 2828 the Python codec is loaded. 2829 2830 Py_FileSystemDefaultEncoding is shared between all interpreters, we 2831 cannot only rely on it: check also interp->fscodec_initialized for 2832 subinterpreters. 
*/ 2833 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) { 2834 return PyUnicode_Decode(s, size, 2835 Py_FileSystemDefaultEncoding, 2836 "surrogateescape"); 2837 } 2838 else { 2839 /* locale encoding with surrogateescape */ 2840 wchar_t *wchar; 2841 PyObject *unicode; 2842 size_t len; 2843 2844 if (s[size] != '\0' || size != strlen(s)) { 2845 PyErr_SetString(PyExc_TypeError, "embedded NUL character"); 2846 return NULL; 2847 } 2848 2849 wchar = _Py_char2wchar(s, &len); 2850 if (wchar == NULL) 2851 return PyErr_NoMemory(); 2852 2853 unicode = PyUnicode_FromWideChar(wchar, len); 2854 PyMem_Free(wchar); 2855 return unicode; 2856 } 2857#endif 2858} 2859 2860 2861int 2862PyUnicode_FSConverter(PyObject* arg, void* addr) 2863{ 2864 PyObject *output = NULL; 2865 Py_ssize_t size; 2866 void *data; 2867 if (arg == NULL) { 2868 Py_DECREF(*(PyObject**)addr); 2869 return 1; 2870 } 2871 if (PyBytes_Check(arg)) { 2872 output = arg; 2873 Py_INCREF(output); 2874 } 2875 else { 2876 arg = PyUnicode_FromObject(arg); 2877 if (!arg) 2878 return 0; 2879 output = PyUnicode_EncodeFSDefault(arg); 2880 Py_DECREF(arg); 2881 if (!output) 2882 return 0; 2883 if (!PyBytes_Check(output)) { 2884 Py_DECREF(output); 2885 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes"); 2886 return 0; 2887 } 2888 } 2889 size = PyBytes_GET_SIZE(output); 2890 data = PyBytes_AS_STRING(output); 2891 if (size != strlen(data)) { 2892 PyErr_SetString(PyExc_TypeError, "embedded NUL character"); 2893 Py_DECREF(output); 2894 return 0; 2895 } 2896 *(PyObject**)addr = output; 2897 return Py_CLEANUP_SUPPORTED; 2898} 2899 2900 2901int 2902PyUnicode_FSDecoder(PyObject* arg, void* addr) 2903{ 2904 PyObject *output = NULL; 2905 if (arg == NULL) { 2906 Py_DECREF(*(PyObject**)addr); 2907 return 1; 2908 } 2909 if (PyUnicode_Check(arg)) { 2910 if (PyUnicode_READY(arg)) 2911 return 0; 2912 output = arg; 2913 Py_INCREF(output); 2914 } 2915 else { 2916 arg = PyBytes_FromObject(arg); 2917 if (!arg) 2918 
return 0; 2919 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg), 2920 PyBytes_GET_SIZE(arg)); 2921 Py_DECREF(arg); 2922 if (!output) 2923 return 0; 2924 if (!PyUnicode_Check(output)) { 2925 Py_DECREF(output); 2926 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode"); 2927 return 0; 2928 } 2929 } 2930 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output), 2931 PyUnicode_GET_LENGTH(output), 0, 1)) { 2932 PyErr_SetString(PyExc_TypeError, "embedded NUL character"); 2933 Py_DECREF(output); 2934 return 0; 2935 } 2936 *(PyObject**)addr = output; 2937 return Py_CLEANUP_SUPPORTED; 2938} 2939 2940 2941char* 2942PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize) 2943{ 2944 PyObject *bytes; 2945 PyUnicodeObject *u = (PyUnicodeObject *)unicode; 2946 2947 if (!PyUnicode_Check(unicode)) { 2948 PyErr_BadArgument(); 2949 return NULL; 2950 } 2951 if (PyUnicode_READY(u) == -1) 2952 return NULL; 2953 2954 if (PyUnicode_UTF8(unicode) == NULL) { 2955 assert(!PyUnicode_IS_COMPACT_ASCII(unicode)); 2956 bytes = _PyUnicode_AsUTF8String(unicode, "strict"); 2957 if (bytes == NULL) 2958 return NULL; 2959 _PyUnicode_UTF8(u) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1); 2960 if (_PyUnicode_UTF8(u) == NULL) { 2961 Py_DECREF(bytes); 2962 return NULL; 2963 } 2964 _PyUnicode_UTF8_LENGTH(u) = PyBytes_GET_SIZE(bytes); 2965 Py_MEMCPY(_PyUnicode_UTF8(u), PyBytes_AS_STRING(bytes), _PyUnicode_UTF8_LENGTH(u) + 1); 2966 Py_DECREF(bytes); 2967 } 2968 2969 if (psize) 2970 *psize = PyUnicode_UTF8_LENGTH(unicode); 2971 return PyUnicode_UTF8(unicode); 2972} 2973 2974char* 2975PyUnicode_AsUTF8(PyObject *unicode) 2976{ 2977 return PyUnicode_AsUTF8AndSize(unicode, NULL); 2978} 2979 2980#ifdef Py_DEBUG 2981int unicode_as_unicode_calls = 0; 2982#endif 2983 2984 2985Py_UNICODE * 2986PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size) 2987{ 2988 PyUnicodeObject *u; 2989 const unsigned char *one_byte; 2990#if SIZEOF_WCHAR_T == 4 2991 const Py_UCS2 
*two_bytes; 2992#else 2993 const Py_UCS4 *four_bytes; 2994 const Py_UCS4 *ucs4_end; 2995 Py_ssize_t num_surrogates; 2996#endif 2997 wchar_t *w; 2998 wchar_t *wchar_end; 2999 3000 if (!PyUnicode_Check(unicode)) { 3001 PyErr_BadArgument(); 3002 return NULL; 3003 } 3004 u = (PyUnicodeObject*)unicode; 3005 if (_PyUnicode_WSTR(u) == NULL) { 3006 /* Non-ASCII compact unicode object */ 3007 assert(_PyUnicode_KIND(u) != 0); 3008 assert(PyUnicode_IS_READY(u)); 3009 3010#ifdef Py_DEBUG 3011 ++unicode_as_unicode_calls; 3012#endif 3013 3014 if (PyUnicode_KIND(u) == PyUnicode_4BYTE_KIND) { 3015#if SIZEOF_WCHAR_T == 2 3016 four_bytes = PyUnicode_4BYTE_DATA(u); 3017 ucs4_end = four_bytes + _PyUnicode_LENGTH(u); 3018 num_surrogates = 0; 3019 3020 for (; four_bytes < ucs4_end; ++four_bytes) { 3021 if (*four_bytes > 0xFFFF) 3022 ++num_surrogates; 3023 } 3024 3025 _PyUnicode_WSTR(u) = (wchar_t *) PyObject_MALLOC( 3026 sizeof(wchar_t) * (_PyUnicode_LENGTH(u) + 1 + num_surrogates)); 3027 if (!_PyUnicode_WSTR(u)) { 3028 PyErr_NoMemory(); 3029 return NULL; 3030 } 3031 _PyUnicode_WSTR_LENGTH(u) = _PyUnicode_LENGTH(u) + num_surrogates; 3032 3033 w = _PyUnicode_WSTR(u); 3034 wchar_end = w + _PyUnicode_WSTR_LENGTH(u); 3035 four_bytes = PyUnicode_4BYTE_DATA(u); 3036 for (; four_bytes < ucs4_end; ++four_bytes, ++w) { 3037 if (*four_bytes > 0xFFFF) { 3038 /* encode surrogate pair in this case */ 3039 *w++ = 0xD800 | ((*four_bytes - 0x10000) >> 10); 3040 *w = 0xDC00 | ((*four_bytes - 0x10000) & 0x3FF); 3041 } 3042 else 3043 *w = *four_bytes; 3044 3045 if (w > wchar_end) { 3046 assert(0 && "Miscalculated string end"); 3047 } 3048 } 3049 *w = 0; 3050#else 3051 /* sizeof(wchar_t) == 4 */ 3052 Py_FatalError("Impossible unicode object state, wstr and str " 3053 "should share memory already."); 3054 return NULL; 3055#endif 3056 } 3057 else { 3058 _PyUnicode_WSTR(u) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) * 3059 (_PyUnicode_LENGTH(u) + 1)); 3060 if (!_PyUnicode_WSTR(u)) { 3061 PyErr_NoMemory(); 
3062 return NULL; 3063 } 3064 if (!PyUnicode_IS_COMPACT_ASCII(u)) 3065 _PyUnicode_WSTR_LENGTH(u) = _PyUnicode_LENGTH(u); 3066 w = _PyUnicode_WSTR(u); 3067 wchar_end = w + _PyUnicode_LENGTH(u); 3068 3069 if (PyUnicode_KIND(u) == PyUnicode_1BYTE_KIND) { 3070 one_byte = PyUnicode_1BYTE_DATA(u); 3071 for (; w < wchar_end; ++one_byte, ++w) 3072 *w = *one_byte; 3073 /* null-terminate the wstr */ 3074 *w = 0; 3075 } 3076 else if (PyUnicode_KIND(u) == PyUnicode_2BYTE_KIND) { 3077#if SIZEOF_WCHAR_T == 4 3078 two_bytes = PyUnicode_2BYTE_DATA(u); 3079 for (; w < wchar_end; ++two_bytes, ++w) 3080 *w = *two_bytes; 3081 /* null-terminate the wstr */ 3082 *w = 0; 3083#else 3084 /* sizeof(wchar_t) == 2 */ 3085 PyObject_FREE(_PyUnicode_WSTR(u)); 3086 _PyUnicode_WSTR(u) = NULL; 3087 Py_FatalError("Impossible unicode object state, wstr " 3088 "and str should share memory already."); 3089 return NULL; 3090#endif 3091 } 3092 else { 3093 assert(0 && "This should never happen."); 3094 } 3095 } 3096 } 3097 if (size != NULL) 3098 *size = PyUnicode_WSTR_LENGTH(u); 3099 return _PyUnicode_WSTR(u); 3100} 3101 3102Py_UNICODE * 3103PyUnicode_AsUnicode(PyObject *unicode) 3104{ 3105 return PyUnicode_AsUnicodeAndSize(unicode, NULL); 3106} 3107 3108 3109Py_ssize_t 3110PyUnicode_GetSize(PyObject *unicode) 3111{ 3112 if (!PyUnicode_Check(unicode)) { 3113 PyErr_BadArgument(); 3114 goto onError; 3115 } 3116 return PyUnicode_GET_SIZE(unicode); 3117 3118 onError: 3119 return -1; 3120} 3121 3122Py_ssize_t 3123PyUnicode_GetLength(PyObject *unicode) 3124{ 3125 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) { 3126 PyErr_BadArgument(); 3127 return -1; 3128 } 3129 3130 return PyUnicode_GET_LENGTH(unicode); 3131} 3132 3133Py_UCS4 3134PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index) 3135{ 3136 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) { 3137 PyErr_BadArgument(); 3138 return (Py_UCS4)-1; 3139 } 3140 if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) { 3141 
PyErr_SetString(PyExc_IndexError, "string index out of range"); 3142 return (Py_UCS4)-1; 3143 } 3144 return PyUnicode_READ_CHAR(unicode, index); 3145} 3146 3147int 3148PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch) 3149{ 3150 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) { 3151 PyErr_BadArgument(); 3152 return -1; 3153 } 3154 if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) { 3155 PyErr_SetString(PyExc_IndexError, "string index out of range"); 3156 return -1; 3157 } 3158 if (_PyUnicode_Dirty(unicode)) 3159 return -1; 3160 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode), 3161 index, ch); 3162 return 0; 3163} 3164 3165const char * 3166PyUnicode_GetDefaultEncoding(void) 3167{ 3168 return "utf-8"; 3169} 3170 3171/* create or adjust a UnicodeDecodeError */ 3172static void 3173make_decode_exception(PyObject **exceptionObject, 3174 const char *encoding, 3175 const char *input, Py_ssize_t length, 3176 Py_ssize_t startpos, Py_ssize_t endpos, 3177 const char *reason) 3178{ 3179 if (*exceptionObject == NULL) { 3180 *exceptionObject = PyUnicodeDecodeError_Create( 3181 encoding, input, length, startpos, endpos, reason); 3182 } 3183 else { 3184 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos)) 3185 goto onError; 3186 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos)) 3187 goto onError; 3188 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason)) 3189 goto onError; 3190 } 3191 return; 3192 3193onError: 3194 Py_DECREF(*exceptionObject); 3195 *exceptionObject = NULL; 3196} 3197 3198/* error handling callback helper: 3199 build arguments, call the callback and check the arguments, 3200 if no exception occurred, copy the replacement to the output 3201 and adjust various state variables. 
3202 return 0 on success, -1 on error 3203*/ 3204 3205static int 3206unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler, 3207 const char *encoding, const char *reason, 3208 const char **input, const char **inend, Py_ssize_t *startinpos, 3209 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr, 3210 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr) 3211{ 3212 static char *argparse = "O!n;decoding error handler must return (str, int) tuple"; 3213 3214 PyObject *restuple = NULL; 3215 PyObject *repunicode = NULL; 3216 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output); 3217 Py_ssize_t insize; 3218 Py_ssize_t requiredsize; 3219 Py_ssize_t newpos; 3220 const Py_UNICODE *repptr; 3221 PyObject *inputobj = NULL; 3222 Py_ssize_t repsize; 3223 int res = -1; 3224 3225 if (*errorHandler == NULL) { 3226 *errorHandler = PyCodec_LookupError(errors); 3227 if (*errorHandler == NULL) 3228 goto onError; 3229 } 3230 3231 make_decode_exception(exceptionObject, 3232 encoding, 3233 *input, *inend - *input, 3234 *startinpos, *endinpos, 3235 reason); 3236 if (*exceptionObject == NULL) 3237 goto onError; 3238 3239 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL); 3240 if (restuple == NULL) 3241 goto onError; 3242 if (!PyTuple_Check(restuple)) { 3243 PyErr_SetString(PyExc_TypeError, &argparse[4]); 3244 goto onError; 3245 } 3246 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos)) 3247 goto onError; 3248 3249 /* Copy back the bytes variables, which might have been modified by the 3250 callback */ 3251 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject); 3252 if (!inputobj) 3253 goto onError; 3254 if (!PyBytes_Check(inputobj)) { 3255 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes"); 3256 } 3257 *input = PyBytes_AS_STRING(inputobj); 3258 insize = PyBytes_GET_SIZE(inputobj); 3259 *inend = *input + insize; 3260 /* we can DECREF safely, as the 
exception has another reference, 3261 so the object won't go away. */ 3262 Py_DECREF(inputobj); 3263 3264 if (newpos<0) 3265 newpos = insize+newpos; 3266 if (newpos<0 || newpos>insize) { 3267 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos); 3268 goto onError; 3269 } 3270 3271 /* need more space? (at least enough for what we 3272 have+the replacement+the rest of the string (starting 3273 at the new input position), so we won't have to check space 3274 when there are no errors in the rest of the string) */ 3275 repptr = PyUnicode_AS_UNICODE(repunicode); 3276 repsize = PyUnicode_GET_SIZE(repunicode); 3277 requiredsize = *outpos + repsize + insize-newpos; 3278 if (requiredsize > outsize) { 3279 if (requiredsize<2*outsize) 3280 requiredsize = 2*outsize; 3281 if (PyUnicode_Resize((PyObject**)output, requiredsize) < 0) 3282 goto onError; 3283 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos; 3284 } 3285 *endinpos = newpos; 3286 *inptr = *input + newpos; 3287 Py_UNICODE_COPY(*outptr, repptr, repsize); 3288 *outptr += repsize; 3289 *outpos += repsize; 3290 3291 /* we made it! */ 3292 res = 0; 3293 3294 onError: 3295 Py_XDECREF(restuple); 3296 return res; 3297} 3298 3299/* --- UTF-7 Codec -------------------------------------------------------- */ 3300 3301/* See RFC2152 for details. We encode conservatively and decode liberally. */ 3302 3303/* Three simple macros defining base-64. */ 3304 3305/* Is c a base-64 character? */ 3306 3307#define IS_BASE64(c) \ 3308 (((c) >= 'A' && (c) <= 'Z') || \ 3309 ((c) >= 'a' && (c) <= 'z') || \ 3310 ((c) >= '0' && (c) <= '9') || \ 3311 (c) == '+' || (c) == '/') 3312 3313/* given that c is a base-64 character, what is its base-64 value? */ 3314 3315#define FROM_BASE64(c) \ 3316 (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' : \ 3317 ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 : \ 3318 ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 : \ 3319 (c) == '+' ? 
62 : 63) 3320 3321/* What is the base-64 character of the bottom 6 bits of n? */ 3322 3323#define TO_BASE64(n) \ 3324 ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f]) 3325 3326/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be 3327 * decoded as itself. We are permissive on decoding; the only ASCII 3328 * byte not decoding to itself is the + which begins a base64 3329 * string. */ 3330 3331#define DECODE_DIRECT(c) \ 3332 ((c) <= 127 && (c) != '+') 3333 3334/* The UTF-7 encoder treats ASCII characters differently according to 3335 * whether they are Set D, Set O, Whitespace, or special (i.e. none of 3336 * the above). See RFC2152. This array identifies these different 3337 * sets: 3338 * 0 : "Set D" 3339 * alphanumeric and '(),-./:? 3340 * 1 : "Set O" 3341 * !"#$%&*;<=>@[]^_`{|} 3342 * 2 : "whitespace" 3343 * ht nl cr sp 3344 * 3 : special (must be base64 encoded) 3345 * everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127) 3346 */ 3347 3348static 3349char utf7_category[128] = { 3350/* nul soh stx etx eot enq ack bel bs ht nl vt np cr so si */ 3351 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3, 3352/* dle dc1 dc2 dc3 dc4 nak syn etb can em sub esc fs gs rs us */ 3353 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3354/* sp ! " # $ % & ' ( ) * + , - . / */ 3355 2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 0, 3356/* 0 1 2 3 4 5 6 7 8 9 : ; < = > ? */ 3357 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 3358/* @ A B C D E F G H I J K L M N O */ 3359 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3360/* P Q R S T U V W X Y Z [ \ ] ^ _ */ 3361 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1, 3362/* ` a b c d e f g h i j k l m n o */ 3363 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3364/* p q r s t u v w x y z { | } ~ del */ 3365 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3, 3366}; 3367 3368/* ENCODE_DIRECT: this character should be encoded as itself. 
The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.  */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      (directWS && (utf7_category[(c)] == 2)) ||        \
      (directO && (utf7_category[(c)] == 1))))

/* Decode size bytes of UTF-7 data starting at s.

   Thin wrapper: calls PyUnicode_DecodeUTF7Stateful() with consumed == NULL
   and returns its result unchanged. */
PyObject *
PyUnicode_DecodeUTF7(const char *s,
                     Py_ssize_t size,
                     const char *errors)
{
    return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
}

/* The decoder.  The only state we preserve is our read position,
 * i.e. how many characters we have consumed.  So if we end in the
 * middle of a shift sequence we have to back off the read position
 * and the output to the beginning of the sequence, otherwise we lose
 * all the shift state (seen bits, number of bits seen, high
 * surrogate).
*/ 3394 3395PyObject * 3396PyUnicode_DecodeUTF7Stateful(const char *s, 3397 Py_ssize_t size, 3398 const char *errors, 3399 Py_ssize_t *consumed) 3400{ 3401 const char *starts = s; 3402 Py_ssize_t startinpos; 3403 Py_ssize_t endinpos; 3404 Py_ssize_t outpos; 3405 const char *e; 3406 PyUnicodeObject *unicode; 3407 Py_UNICODE *p; 3408 const char *errmsg = ""; 3409 int inShift = 0; 3410 Py_UNICODE *shiftOutStart; 3411 unsigned int base64bits = 0; 3412 unsigned long base64buffer = 0; 3413 Py_UNICODE surrogate = 0; 3414 PyObject *errorHandler = NULL; 3415 PyObject *exc = NULL; 3416 3417 unicode = _PyUnicode_New(size); 3418 if (!unicode) 3419 return NULL; 3420 if (size == 0) { 3421 if (consumed) 3422 *consumed = 0; 3423 return (PyObject *)unicode; 3424 } 3425 3426 p = PyUnicode_AS_UNICODE(unicode); 3427 shiftOutStart = p; 3428 e = s + size; 3429 3430 while (s < e) { 3431 Py_UNICODE ch; 3432 restart: 3433 ch = (unsigned char) *s; 3434 3435 if (inShift) { /* in a base-64 section */ 3436 if (IS_BASE64(ch)) { /* consume a base-64 character */ 3437 base64buffer = (base64buffer << 6) | FROM_BASE64(ch); 3438 base64bits += 6; 3439 s++; 3440 if (base64bits >= 16) { 3441 /* we have enough bits for a UTF-16 value */ 3442 Py_UNICODE outCh = (Py_UNICODE) 3443 (base64buffer >> (base64bits-16)); 3444 base64bits -= 16; 3445 base64buffer &= (1 << base64bits) - 1; /* clear high bits */ 3446 if (surrogate) { 3447 /* expecting a second surrogate */ 3448 if (outCh >= 0xDC00 && outCh <= 0xDFFF) { 3449#ifdef Py_UNICODE_WIDE 3450 *p++ = (((surrogate & 0x3FF)<<10) 3451 | (outCh & 0x3FF)) + 0x10000; 3452#else 3453 *p++ = surrogate; 3454 *p++ = outCh; 3455#endif 3456 surrogate = 0; 3457 } 3458 else { 3459 surrogate = 0; 3460 errmsg = "second surrogate missing"; 3461 goto utf7Error; 3462 } 3463 } 3464 else if (outCh >= 0xD800 && outCh <= 0xDBFF) { 3465 /* first surrogate */ 3466 surrogate = outCh; 3467 } 3468 else if (outCh >= 0xDC00 && outCh <= 0xDFFF) { 3469 errmsg = "unexpected second surrogate"; 
3470 goto utf7Error; 3471 } 3472 else { 3473 *p++ = outCh; 3474 } 3475 } 3476 } 3477 else { /* now leaving a base-64 section */ 3478 inShift = 0; 3479 s++; 3480 if (surrogate) { 3481 errmsg = "second surrogate missing at end of shift sequence"; 3482 goto utf7Error; 3483 } 3484 if (base64bits > 0) { /* left-over bits */ 3485 if (base64bits >= 6) { 3486 /* We've seen at least one base-64 character */ 3487 errmsg = "partial character in shift sequence"; 3488 goto utf7Error; 3489 } 3490 else { 3491 /* Some bits remain; they should be zero */ 3492 if (base64buffer != 0) { 3493 errmsg = "non-zero padding bits in shift sequence"; 3494 goto utf7Error; 3495 } 3496 } 3497 } 3498 if (ch != '-') { 3499 /* '-' is absorbed; other terminating 3500 characters are preserved */ 3501 *p++ = ch; 3502 } 3503 } 3504 } 3505 else if ( ch == '+' ) { 3506 startinpos = s-starts; 3507 s++; /* consume '+' */ 3508 if (s < e && *s == '-') { /* '+-' encodes '+' */ 3509 s++; 3510 *p++ = '+'; 3511 } 3512 else { /* begin base64-encoded section */ 3513 inShift = 1; 3514 shiftOutStart = p; 3515 base64bits = 0; 3516 } 3517 } 3518 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */ 3519 *p++ = ch; 3520 s++; 3521 } 3522 else { 3523 startinpos = s-starts; 3524 s++; 3525 errmsg = "unexpected special character"; 3526 goto utf7Error; 3527 } 3528 continue; 3529utf7Error: 3530 outpos = p-PyUnicode_AS_UNICODE(unicode); 3531 endinpos = s-starts; 3532 if (unicode_decode_call_errorhandler( 3533 errors, &errorHandler, 3534 "utf7", errmsg, 3535 &starts, &e, &startinpos, &endinpos, &exc, &s, 3536 &unicode, &outpos, &p)) 3537 goto onError; 3538 } 3539 3540 /* end of string */ 3541 3542 if (inShift && !consumed) { /* in shift sequence, no more to follow */ 3543 /* if we're in an inconsistent state, that's an error */ 3544 if (surrogate || 3545 (base64bits >= 6) || 3546 (base64bits > 0 && base64buffer != 0)) { 3547 outpos = p-PyUnicode_AS_UNICODE(unicode); 3548 endinpos = size; 3549 if 
(unicode_decode_call_errorhandler( 3550 errors, &errorHandler, 3551 "utf7", "unterminated shift sequence", 3552 &starts, &e, &startinpos, &endinpos, &exc, &s, 3553 &unicode, &outpos, &p)) 3554 goto onError; 3555 if (s < e) 3556 goto restart; 3557 } 3558 } 3559 3560 /* return state */ 3561 if (consumed) { 3562 if (inShift) { 3563 p = shiftOutStart; /* back off output */ 3564 *consumed = startinpos; 3565 } 3566 else { 3567 *consumed = s-starts; 3568 } 3569 } 3570 3571 if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0) 3572 goto onError; 3573 3574 Py_XDECREF(errorHandler); 3575 Py_XDECREF(exc); 3576 if (PyUnicode_READY(unicode) == -1) { 3577 Py_DECREF(unicode); 3578 return NULL; 3579 } 3580 return (PyObject *)unicode; 3581 3582 onError: 3583 Py_XDECREF(errorHandler); 3584 Py_XDECREF(exc); 3585 Py_DECREF(unicode); 3586 return NULL; 3587} 3588 3589 3590PyObject * 3591PyUnicode_EncodeUTF7(const Py_UNICODE *s, 3592 Py_ssize_t size, 3593 int base64SetO, 3594 int base64WhiteSpace, 3595 const char *errors) 3596{ 3597 PyObject *v; 3598 /* It might be possible to tighten this worst case */ 3599 Py_ssize_t allocated = 8 * size; 3600 int inShift = 0; 3601 Py_ssize_t i = 0; 3602 unsigned int base64bits = 0; 3603 unsigned long base64buffer = 0; 3604 char * out; 3605 char * start; 3606 3607 if (size == 0) 3608 return PyBytes_FromStringAndSize(NULL, 0); 3609 3610 if (allocated / 8 != size) 3611 return PyErr_NoMemory(); 3612 3613 v = PyBytes_FromStringAndSize(NULL, allocated); 3614 if (v == NULL) 3615 return NULL; 3616 3617 start = out = PyBytes_AS_STRING(v); 3618 for (;i < size; ++i) { 3619 Py_UNICODE ch = s[i]; 3620 3621 if (inShift) { 3622 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) { 3623 /* shifting out */ 3624 if (base64bits) { /* output remaining bits */ 3625 *out++ = TO_BASE64(base64buffer << (6-base64bits)); 3626 base64buffer = 0; 3627 base64bits = 0; 3628 } 3629 inShift = 0; 3630 /* Characters not in the BASE64 set implicitly 
unshift the sequence 3631 so no '-' is required, except if the character is itself a '-' */ 3632 if (IS_BASE64(ch) || ch == '-') { 3633 *out++ = '-'; 3634 } 3635 *out++ = (char) ch; 3636 } 3637 else { 3638 goto encode_char; 3639 } 3640 } 3641 else { /* not in a shift sequence */ 3642 if (ch == '+') { 3643 *out++ = '+'; 3644 *out++ = '-'; 3645 } 3646 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) { 3647 *out++ = (char) ch; 3648 } 3649 else { 3650 *out++ = '+'; 3651 inShift = 1; 3652 goto encode_char; 3653 } 3654 } 3655 continue; 3656encode_char: 3657#ifdef Py_UNICODE_WIDE 3658 if (ch >= 0x10000) { 3659 /* code first surrogate */ 3660 base64bits += 16; 3661 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10); 3662 while (base64bits >= 6) { 3663 *out++ = TO_BASE64(base64buffer >> (base64bits-6)); 3664 base64bits -= 6; 3665 } 3666 /* prepare second surrogate */ 3667 ch = 0xDC00 | ((ch-0x10000) & 0x3FF); 3668 } 3669#endif 3670 base64bits += 16; 3671 base64buffer = (base64buffer << 16) | ch; 3672 while (base64bits >= 6) { 3673 *out++ = TO_BASE64(base64buffer >> (base64bits-6)); 3674 base64bits -= 6; 3675 } 3676 } 3677 if (base64bits) 3678 *out++= TO_BASE64(base64buffer << (6-base64bits) ); 3679 if (inShift) 3680 *out++ = '-'; 3681 if (_PyBytes_Resize(&v, out - start) < 0) 3682 return NULL; 3683 return v; 3684} 3685 3686#undef IS_BASE64 3687#undef FROM_BASE64 3688#undef TO_BASE64 3689#undef DECODE_DIRECT 3690#undef ENCODE_DIRECT 3691 3692/* --- UTF-8 Codec -------------------------------------------------------- */ 3693 3694static 3695char utf8_code_length[256] = { 3696 /* Map UTF-8 encoded prefix byte to sequence length. Zero means 3697 illegal prefix. 
See RFC 3629 for details */ 3698 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */ 3699 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3700 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3701 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3702 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3703 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3704 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3705 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */ 3706 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */ 3707 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3708 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3709 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */ 3710 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */ 3711 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */ 3712 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */ 3713 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 /* F0-F4 + F5-FF */ 3714}; 3715 3716PyObject * 3717PyUnicode_DecodeUTF8(const char *s, 3718 Py_ssize_t size, 3719 const char *errors) 3720{ 3721 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL); 3722} 3723 3724/* Mask to check or force alignment of a pointer to C 'long' boundaries */ 3725#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1) 3726 3727/* Mask to quickly check whether a C 'long' contains a 3728 non-ASCII, UTF8-encoded char. */ 3729#if (SIZEOF_LONG == 8) 3730# define ASCII_CHAR_MASK 0x8080808080808080L 3731#elif (SIZEOF_LONG == 4) 3732# define ASCII_CHAR_MASK 0x80808080L 3733#else 3734# error C 'long' size should be either 4 or 8! 3735#endif 3736 3737/* Scans a UTF-8 string and returns the maximum character to be expected, 3738 the size of the decoded unicode string and if any major errors were 3739 encountered. 
3740 3741 This function does check basic UTF-8 sanity, it does however NOT CHECK 3742 if the string contains surrogates, and if all continuation bytes are 3743 within the correct ranges, these checks are performed in 3744 PyUnicode_DecodeUTF8Stateful. 3745 3746 If it sets has_errors to 1, it means the value of unicode_size and max_char 3747 will be bogus and you should not rely on useful information in them. 3748 */ 3749static Py_UCS4 3750utf8_max_char_size_and_has_errors(const char *s, Py_ssize_t string_size, 3751 Py_ssize_t *unicode_size, Py_ssize_t* consumed, 3752 int *has_errors) 3753{ 3754 Py_ssize_t n; 3755 Py_ssize_t char_count = 0; 3756 Py_UCS4 max_char = 127, new_max; 3757 Py_UCS4 upper_bound; 3758 const unsigned char *p = (const unsigned char *)s; 3759 const unsigned char *end = p + string_size; 3760 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK); 3761 int err = 0; 3762 3763 for (; p < end && !err; ++p, ++char_count) { 3764 /* Only check value if it's not a ASCII char... */ 3765 if (*p < 0x80) { 3766 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for 3767 an explanation. */ 3768 if (!((size_t) p & LONG_PTR_MASK)) { 3769 /* Help register allocation */ 3770 register const unsigned char *_p = p; 3771 while (_p < aligned_end) { 3772 unsigned long value = *(unsigned long *) _p; 3773 if (value & ASCII_CHAR_MASK) 3774 break; 3775 _p += SIZEOF_LONG; 3776 char_count += SIZEOF_LONG; 3777 } 3778 p = _p; 3779 if (p == end) 3780 break; 3781 } 3782 } 3783 if (*p >= 0x80) { 3784 n = utf8_code_length[*p]; 3785 new_max = max_char; 3786 switch (n) { 3787 /* invalid start byte */ 3788 case 0: 3789 err = 1; 3790 break; 3791 case 2: 3792 /* Code points between 0x00FF and 0x07FF inclusive. 
3793 Approximate the upper bound of the code point, 3794 if this flips over 255 we can be sure it will be more 3795 than 255 and the string will need 2 bytes per code coint, 3796 if it stays under or equal to 255, we can be sure 1 byte 3797 is enough. 3798 ((*p & 0b00011111) << 6) | 0b00111111 */ 3799 upper_bound = ((*p & 0x1F) << 6) | 0x3F; 3800 if (max_char < upper_bound) 3801 new_max = upper_bound; 3802 /* Ensure we track at least that we left ASCII space. */ 3803 if (new_max < 128) 3804 new_max = 128; 3805 break; 3806 case 3: 3807 /* Between 0x0FFF and 0xFFFF inclusive, so values are 3808 always > 255 and <= 65535 and will always need 2 bytes. */ 3809 if (max_char < 65535) 3810 new_max = 65535; 3811 break; 3812 case 4: 3813 /* Code point will be above 0xFFFF for sure in this case. */ 3814 new_max = 65537; 3815 break; 3816 /* Internal error, this should be caught by the first if */ 3817 case 1: 3818 default: 3819 assert(0 && "Impossible case in utf8_max_char_and_size"); 3820 err = 1; 3821 } 3822 /* Instead of number of overall bytes for this code point, 3823 n containts the number of following bytes: */ 3824 --n; 3825 /* Check if the follow up chars are all valid continuation bytes */ 3826 if (n >= 1) { 3827 const unsigned char *cont; 3828 if ((p + n) >= end) { 3829 if (consumed == 0) 3830 /* incomplete data, non-incremental decoding */ 3831 err = 1; 3832 break; 3833 } 3834 for (cont = p + 1; cont < (p + n); ++cont) { 3835 if ((*cont & 0xc0) != 0x80) { 3836 err = 1; 3837 break; 3838 } 3839 } 3840 p += n; 3841 } 3842 else 3843 err = 1; 3844 max_char = new_max; 3845 } 3846 } 3847 3848 if (unicode_size) 3849 *unicode_size = char_count; 3850 if (has_errors) 3851 *has_errors = err; 3852 return max_char; 3853} 3854 3855/* Similar to PyUnicode_WRITE but can also write into wstr field 3856 of the legacy unicode representation */ 3857#define WRITE_FLEXIBLE_OR_WSTR(kind, buf, index, value) \ 3858 do { \ 3859 const int k_ = (kind); \ 3860 if (k_ == PyUnicode_WCHAR_KIND) \ 
3861 ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value); \ 3862 else if (k_ == PyUnicode_1BYTE_KIND) \ 3863 ((unsigned char *)(buf))[(index)] = (unsigned char)(value); \ 3864 else if (k_ == PyUnicode_2BYTE_KIND) \ 3865 ((Py_UCS2 *)(buf))[(index)] = (Py_UCS2)(value); \ 3866 else \ 3867 ((Py_UCS4 *)(buf))[(index)] = (Py_UCS4)(value); \ 3868 } while (0) 3869 3870PyObject * 3871PyUnicode_DecodeUTF8Stateful(const char *s, 3872 Py_ssize_t size, 3873 const char *errors, 3874 Py_ssize_t *consumed) 3875{ 3876 const char *starts = s; 3877 int n; 3878 int k; 3879 Py_ssize_t startinpos; 3880 Py_ssize_t endinpos; 3881 const char *e, *aligned_end; 3882 PyUnicodeObject *unicode; 3883 const char *errmsg = ""; 3884 PyObject *errorHandler = NULL; 3885 PyObject *exc = NULL; 3886 Py_UCS4 maxchar = 0; 3887 Py_ssize_t unicode_size; 3888 Py_ssize_t i; 3889 int kind; 3890 void *data; 3891 int has_errors; 3892 Py_UNICODE *error_outptr; 3893#if SIZEOF_WCHAR_T == 2 3894 Py_ssize_t wchar_offset = 0; 3895#endif 3896 3897 if (size == 0) { 3898 if (consumed) 3899 *consumed = 0; 3900 return (PyObject *)PyUnicode_New(0, 0); 3901 } 3902 maxchar = utf8_max_char_size_and_has_errors(s, size, &unicode_size, 3903 consumed, &has_errors); 3904 if (has_errors) { 3905 unicode = _PyUnicode_New(size); 3906 if (!unicode) 3907 return NULL; 3908 kind = PyUnicode_WCHAR_KIND; 3909 data = PyUnicode_AS_UNICODE(unicode); 3910 assert(data != NULL); 3911 } 3912 else { 3913 unicode = (PyUnicodeObject *)PyUnicode_New(unicode_size, maxchar); 3914 if (!unicode) 3915 return NULL; 3916 /* When the string is ASCII only, just use memcpy and return. 3917 unicode_size may be != size if there is an incomplete UTF-8 3918 sequence at the end of the ASCII block. 
*/ 3919 if (maxchar < 128 && size == unicode_size) { 3920 Py_MEMCPY(PyUnicode_1BYTE_DATA(unicode), s, unicode_size); 3921 return (PyObject *)unicode; 3922 } 3923 kind = PyUnicode_KIND(unicode); 3924 data = PyUnicode_DATA(unicode); 3925 } 3926 /* Unpack UTF-8 encoded data */ 3927 i = 0; 3928 e = s + size; 3929 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK); 3930 3931 while (s < e) { 3932 Py_UCS4 ch = (unsigned char)*s; 3933 3934 if (ch < 0x80) { 3935 /* Fast path for runs of ASCII characters. Given that common UTF-8 3936 input will consist of an overwhelming majority of ASCII 3937 characters, we try to optimize for this case by checking 3938 as many characters as a C 'long' can contain. 3939 First, check if we can do an aligned read, as most CPUs have 3940 a penalty for unaligned reads. 3941 */ 3942 if (!((size_t) s & LONG_PTR_MASK)) { 3943 /* Help register allocation */ 3944 register const char *_s = s; 3945 register Py_ssize_t _i = i; 3946 while (_s < aligned_end) { 3947 /* Read a whole long at a time (either 4 or 8 bytes), 3948 and do a fast unrolled copy if it only contains ASCII 3949 characters. 
*/ 3950 unsigned long value = *(unsigned long *) _s; 3951 if (value & ASCII_CHAR_MASK) 3952 break; 3953 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+0, _s[0]); 3954 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+1, _s[1]); 3955 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+2, _s[2]); 3956 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+3, _s[3]); 3957#if (SIZEOF_LONG == 8) 3958 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+4, _s[4]); 3959 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+5, _s[5]); 3960 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+6, _s[6]); 3961 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+7, _s[7]); 3962#endif 3963 _s += SIZEOF_LONG; 3964 _i += SIZEOF_LONG; 3965 } 3966 s = _s; 3967 i = _i; 3968 if (s == e) 3969 break; 3970 ch = (unsigned char)*s; 3971 } 3972 } 3973 3974 if (ch < 0x80) { 3975 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch); 3976 s++; 3977 continue; 3978 } 3979 3980 n = utf8_code_length[ch]; 3981 3982 if (s + n > e) { 3983 if (consumed) 3984 break; 3985 else { 3986 errmsg = "unexpected end of data"; 3987 startinpos = s-starts; 3988 endinpos = startinpos+1; 3989 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++) 3990 endinpos++; 3991 goto utf8Error; 3992 } 3993 } 3994 3995 switch (n) { 3996 3997 case 0: 3998 errmsg = "invalid start byte"; 3999 startinpos = s-starts; 4000 endinpos = startinpos+1; 4001 goto utf8Error; 4002 4003 case 1: 4004 errmsg = "internal error"; 4005 startinpos = s-starts; 4006 endinpos = startinpos+1; 4007 goto utf8Error; 4008 4009 case 2: 4010 if ((s[1] & 0xc0) != 0x80) { 4011 errmsg = "invalid continuation byte"; 4012 startinpos = s-starts; 4013 endinpos = startinpos + 1; 4014 goto utf8Error; 4015 } 4016 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f); 4017 assert ((ch > 0x007F) && (ch <= 0x07FF)); 4018 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch); 4019 break; 4020 4021 case 3: 4022 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf 4023 will result in surrogates in range d800-dfff. Surrogates are 4024 not valid UTF-8 so they are rejected. 
4025 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf 4026 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */ 4027 if ((s[1] & 0xc0) != 0x80 || 4028 (s[2] & 0xc0) != 0x80 || 4029 ((unsigned char)s[0] == 0xE0 && 4030 (unsigned char)s[1] < 0xA0) || 4031 ((unsigned char)s[0] == 0xED && 4032 (unsigned char)s[1] > 0x9F)) { 4033 errmsg = "invalid continuation byte"; 4034 startinpos = s-starts; 4035 endinpos = startinpos + 1; 4036 4037 /* if s[1] first two bits are 1 and 0, then the invalid 4038 continuation byte is s[2], so increment endinpos by 1, 4039 if not, s[1] is invalid and endinpos doesn't need to 4040 be incremented. */ 4041 if ((s[1] & 0xC0) == 0x80) 4042 endinpos++; 4043 goto utf8Error; 4044 } 4045 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f); 4046 assert ((ch > 0x07FF) && (ch <= 0xFFFF)); 4047 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch); 4048 break; 4049 4050 case 4: 4051 if ((s[1] & 0xc0) != 0x80 || 4052 (s[2] & 0xc0) != 0x80 || 4053 (s[3] & 0xc0) != 0x80 || 4054 ((unsigned char)s[0] == 0xF0 && 4055 (unsigned char)s[1] < 0x90) || 4056 ((unsigned char)s[0] == 0xF4 && 4057 (unsigned char)s[1] > 0x8F)) { 4058 errmsg = "invalid continuation byte"; 4059 startinpos = s-starts; 4060 endinpos = startinpos + 1; 4061 if ((s[1] & 0xC0) == 0x80) { 4062 endinpos++; 4063 if ((s[2] & 0xC0) == 0x80) 4064 endinpos++; 4065 } 4066 goto utf8Error; 4067 } 4068 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) + 4069 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f); 4070 assert ((ch > 0xFFFF) && (ch <= 0x10ffff)); 4071 4072 /* If the string is flexible or we have native UCS-4, write 4073 directly.. 
*/ 4074 if (sizeof(Py_UNICODE) > 2 || kind != PyUnicode_WCHAR_KIND) 4075 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch); 4076 4077 else { 4078 /* compute and append the two surrogates: */ 4079 4080 /* translate from 10000..10FFFF to 0..FFFF */ 4081 ch -= 0x10000; 4082 4083 /* high surrogate = top 10 bits added to D800 */ 4084 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, 4085 (Py_UNICODE)(0xD800 + (ch >> 10))); 4086 4087 /* low surrogate = bottom 10 bits added to DC00 */ 4088 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, 4089 (Py_UNICODE)(0xDC00 + (ch & 0x03FF))); 4090 } 4091#if SIZEOF_WCHAR_T == 2 4092 wchar_offset++; 4093#endif 4094 break; 4095 } 4096 s += n; 4097 continue; 4098 4099 utf8Error: 4100 /* If this is not yet a resizable string, make it one.. */ 4101 if (kind != PyUnicode_WCHAR_KIND) { 4102 const Py_UNICODE *u; 4103 PyUnicodeObject *new_unicode = _PyUnicode_New(size); 4104 if (!new_unicode) 4105 goto onError; 4106 u = PyUnicode_AsUnicode((PyObject *)unicode); 4107 if (!u) 4108 goto onError; 4109#if SIZEOF_WCHAR_T == 2 4110 i += wchar_offset; 4111#endif 4112 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(new_unicode), u, i); 4113 Py_DECREF(unicode); 4114 unicode = new_unicode; 4115 kind = 0; 4116 data = PyUnicode_AS_UNICODE(new_unicode); 4117 assert(data != NULL); 4118 } 4119 error_outptr = PyUnicode_AS_UNICODE(unicode) + i; 4120 if (unicode_decode_call_errorhandler( 4121 errors, &errorHandler, 4122 "utf8", errmsg, 4123 &starts, &e, &startinpos, &endinpos, &exc, &s, 4124 &unicode, &i, &error_outptr)) 4125 goto onError; 4126 /* Update data because unicode_decode_call_errorhandler might have 4127 re-created or resized the unicode object. 
*/ 4128 data = PyUnicode_AS_UNICODE(unicode); 4129 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK); 4130 } 4131 /* Ensure the unicode_size calculation above was correct: */ 4132 assert(kind == PyUnicode_WCHAR_KIND || i == unicode_size); 4133 4134 if (consumed) 4135 *consumed = s-starts; 4136 4137 /* Adjust length and ready string when it contained errors and 4138 is of the old resizable kind. */ 4139 if (kind == PyUnicode_WCHAR_KIND) { 4140 if (PyUnicode_Resize((PyObject**)&unicode, i) < 0 || 4141 PyUnicode_READY(unicode) == -1) 4142 goto onError; 4143 } 4144 4145 Py_XDECREF(errorHandler); 4146 Py_XDECREF(exc); 4147 if (PyUnicode_READY(unicode) == -1) { 4148 Py_DECREF(unicode); 4149 return NULL; 4150 } 4151 return (PyObject *)unicode; 4152 4153 onError: 4154 Py_XDECREF(errorHandler); 4155 Py_XDECREF(exc); 4156 Py_DECREF(unicode); 4157 return NULL; 4158} 4159 4160#undef WRITE_FLEXIBLE_OR_WSTR 4161 4162#ifdef __APPLE__ 4163 4164/* Simplified UTF-8 decoder using surrogateescape error handler, 4165 used to decode the command line arguments on Mac OS X. 
*/ 4166 4167wchar_t* 4168_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size) 4169{ 4170 int n; 4171 const char *e; 4172 wchar_t *unicode, *p; 4173 4174 /* Note: size will always be longer than the resulting Unicode 4175 character count */ 4176 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) { 4177 PyErr_NoMemory(); 4178 return NULL; 4179 } 4180 unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t)); 4181 if (!unicode) 4182 return NULL; 4183 4184 /* Unpack UTF-8 encoded data */ 4185 p = unicode; 4186 e = s + size; 4187 while (s < e) { 4188 Py_UCS4 ch = (unsigned char)*s; 4189 4190 if (ch < 0x80) { 4191 *p++ = (wchar_t)ch; 4192 s++; 4193 continue; 4194 } 4195 4196 n = utf8_code_length[ch]; 4197 if (s + n > e) { 4198 goto surrogateescape; 4199 } 4200 4201 switch (n) { 4202 case 0: 4203 case 1: 4204 goto surrogateescape; 4205 4206 case 2: 4207 if ((s[1] & 0xc0) != 0x80) 4208 goto surrogateescape; 4209 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f); 4210 assert ((ch > 0x007F) && (ch <= 0x07FF)); 4211 *p++ = (wchar_t)ch; 4212 break; 4213 4214 case 3: 4215 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf 4216 will result in surrogates in range d800-dfff. Surrogates are 4217 not valid UTF-8 so they are rejected. 
4218 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf 4219 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */ 4220 if ((s[1] & 0xc0) != 0x80 || 4221 (s[2] & 0xc0) != 0x80 || 4222 ((unsigned char)s[0] == 0xE0 && 4223 (unsigned char)s[1] < 0xA0) || 4224 ((unsigned char)s[0] == 0xED && 4225 (unsigned char)s[1] > 0x9F)) { 4226 4227 goto surrogateescape; 4228 } 4229 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f); 4230 assert ((ch > 0x07FF) && (ch <= 0xFFFF)); 4231 *p++ = (wchar_t)ch; 4232 break; 4233 4234 case 4: 4235 if ((s[1] & 0xc0) != 0x80 || 4236 (s[2] & 0xc0) != 0x80 || 4237 (s[3] & 0xc0) != 0x80 || 4238 ((unsigned char)s[0] == 0xF0 && 4239 (unsigned char)s[1] < 0x90) || 4240 ((unsigned char)s[0] == 0xF4 && 4241 (unsigned char)s[1] > 0x8F)) { 4242 goto surrogateescape; 4243 } 4244 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) + 4245 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f); 4246 assert ((ch > 0xFFFF) && (ch <= 0x10ffff)); 4247 4248#if SIZEOF_WCHAR_T == 4 4249 *p++ = (wchar_t)ch; 4250#else 4251 /* compute and append the two surrogates: */ 4252 4253 /* translate from 10000..10FFFF to 0..FFFF */ 4254 ch -= 0x10000; 4255 4256 /* high surrogate = top 10 bits added to D800 */ 4257 *p++ = (wchar_t)(0xD800 + (ch >> 10)); 4258 4259 /* low surrogate = bottom 10 bits added to DC00 */ 4260 *p++ = (wchar_t)(0xDC00 + (ch & 0x03FF)); 4261#endif 4262 break; 4263 } 4264 s += n; 4265 continue; 4266 4267 surrogateescape: 4268 *p++ = 0xDC00 + ch; 4269 s++; 4270 } 4271 *p = L'\0'; 4272 return unicode; 4273} 4274 4275#endif /* __APPLE__ */ 4276 4277/* Primary internal function which creates utf8 encoded bytes objects. 4278 4279 Allocation strategy: if the string is short, convert into a stack buffer 4280 and allocate exactly as much space needed at the end. Else allocate the 4281 maximum possible needed (4 result bytes per Unicode character), and return 4282 the excess memory at the end. 
4283*/ 4284PyObject * 4285_PyUnicode_AsUTF8String(PyObject *obj, const char *errors) 4286{ 4287#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */ 4288 4289 Py_ssize_t i; /* index into s of next input byte */ 4290 PyObject *result; /* result string object */ 4291 char *p; /* next free byte in output buffer */ 4292 Py_ssize_t nallocated; /* number of result bytes allocated */ 4293 Py_ssize_t nneeded; /* number of result bytes needed */ 4294 char stackbuf[MAX_SHORT_UNICHARS * 4]; 4295 PyObject *errorHandler = NULL; 4296 PyObject *exc = NULL; 4297 int kind; 4298 void *data; 4299 Py_ssize_t size; 4300 PyUnicodeObject *unicode = (PyUnicodeObject *)obj; 4301#if SIZEOF_WCHAR_T == 2 4302 Py_ssize_t wchar_offset = 0; 4303#endif 4304 4305 if (!PyUnicode_Check(unicode)) { 4306 PyErr_BadArgument(); 4307 return NULL; 4308 } 4309 4310 if (PyUnicode_READY(unicode) == -1) 4311 return NULL; 4312 4313 if (PyUnicode_UTF8(unicode)) 4314 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode), 4315 PyUnicode_UTF8_LENGTH(unicode)); 4316 4317 kind = PyUnicode_KIND(unicode); 4318 data = PyUnicode_DATA(unicode); 4319 size = PyUnicode_GET_LENGTH(unicode); 4320 4321 assert(size >= 0); 4322 4323 if (size <= MAX_SHORT_UNICHARS) { 4324 /* Write into the stack buffer; nallocated can't overflow. 4325 * At the end, we'll allocate exactly as much heap space as it 4326 * turns out we need. 4327 */ 4328 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int); 4329 result = NULL; /* will allocate after we're done */ 4330 p = stackbuf; 4331 } 4332 else { 4333 /* Overallocate on the heap, and give the excess back at the end. */ 4334 nallocated = size * 4; 4335 if (nallocated / 4 != size) /* overflow! 
*/ 4336 return PyErr_NoMemory(); 4337 result = PyBytes_FromStringAndSize(NULL, nallocated); 4338 if (result == NULL) 4339 return NULL; 4340 p = PyBytes_AS_STRING(result); 4341 } 4342 4343 for (i = 0; i < size;) { 4344 Py_UCS4 ch = PyUnicode_READ(kind, data, i++); 4345 4346 if (ch < 0x80) 4347 /* Encode ASCII */ 4348 *p++ = (char) ch; 4349 4350 else if (ch < 0x0800) { 4351 /* Encode Latin-1 */ 4352 *p++ = (char)(0xc0 | (ch >> 6)); 4353 *p++ = (char)(0x80 | (ch & 0x3f)); 4354 } else if (0xD800 <= ch && ch <= 0xDFFF) { 4355 Py_ssize_t newpos; 4356 PyObject *rep; 4357 Py_ssize_t repsize, k, startpos; 4358 startpos = i-1; 4359#if SIZEOF_WCHAR_T == 2 4360 startpos += wchar_offset; 4361#endif 4362 rep = unicode_encode_call_errorhandler( 4363 errors, &errorHandler, "utf-8", "surrogates not allowed", 4364 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode), 4365 &exc, startpos, startpos+1, &newpos); 4366 if (!rep) 4367 goto error; 4368 4369 if (PyBytes_Check(rep)) 4370 repsize = PyBytes_GET_SIZE(rep); 4371 else 4372 repsize = PyUnicode_GET_SIZE(rep); 4373 4374 if (repsize > 4) { 4375 Py_ssize_t offset; 4376 4377 if (result == NULL) 4378 offset = p - stackbuf; 4379 else 4380 offset = p - PyBytes_AS_STRING(result); 4381 4382 if (nallocated > PY_SSIZE_T_MAX - repsize + 4) { 4383 /* integer overflow */ 4384 PyErr_NoMemory(); 4385 goto error; 4386 } 4387 nallocated += repsize - 4; 4388 if (result != NULL) { 4389 if (_PyBytes_Resize(&result, nallocated) < 0) 4390 goto error; 4391 } else { 4392 result = PyBytes_FromStringAndSize(NULL, nallocated); 4393 if (result == NULL) 4394 goto error; 4395 Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset); 4396 } 4397 p = PyBytes_AS_STRING(result) + offset; 4398 } 4399 4400 if (PyBytes_Check(rep)) { 4401 char *prep = PyBytes_AS_STRING(rep); 4402 for(k = repsize; k > 0; k--) 4403 *p++ = *prep++; 4404 } else /* rep is unicode */ { 4405 const Py_UNICODE *prep = PyUnicode_AS_UNICODE(rep); 4406 Py_UNICODE c; 4407 4408 for(k=0; 
k<repsize; k++) { 4409 c = prep[k]; 4410 if (0x80 <= c) { 4411 raise_encode_exception(&exc, "utf-8", 4412 PyUnicode_AS_UNICODE(unicode), 4413 size, i-1, i, 4414 "surrogates not allowed"); 4415 goto error; 4416 } 4417 *p++ = (char)prep[k]; 4418 } 4419 } 4420 Py_DECREF(rep); 4421 } else if (ch < 0x10000) { 4422 *p++ = (char)(0xe0 | (ch >> 12)); 4423 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); 4424 *p++ = (char)(0x80 | (ch & 0x3f)); 4425 } else /* ch >= 0x10000 */ { 4426 /* Encode UCS4 Unicode ordinals */ 4427 *p++ = (char)(0xf0 | (ch >> 18)); 4428 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f)); 4429 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); 4430 *p++ = (char)(0x80 | (ch & 0x3f)); 4431#if SIZEOF_WCHAR_T == 2 4432 wchar_offset++; 4433#endif 4434 } 4435 } 4436 4437 if (result == NULL) { 4438 /* This was stack allocated. */ 4439 nneeded = p - stackbuf; 4440 assert(nneeded <= nallocated); 4441 result = PyBytes_FromStringAndSize(stackbuf, nneeded); 4442 } 4443 else { 4444 /* Cut back to size actually needed. 
*/ 4445 nneeded = p - PyBytes_AS_STRING(result); 4446 assert(nneeded <= nallocated); 4447 _PyBytes_Resize(&result, nneeded); 4448 } 4449 4450 Py_XDECREF(errorHandler); 4451 Py_XDECREF(exc); 4452 return result; 4453 error: 4454 Py_XDECREF(errorHandler); 4455 Py_XDECREF(exc); 4456 Py_XDECREF(result); 4457 return NULL; 4458 4459#undef MAX_SHORT_UNICHARS 4460} 4461 4462PyObject * 4463PyUnicode_EncodeUTF8(const Py_UNICODE *s, 4464 Py_ssize_t size, 4465 const char *errors) 4466{ 4467 PyObject *v, *unicode; 4468 4469 unicode = PyUnicode_FromUnicode(s, size); 4470 if (unicode == NULL) 4471 return NULL; 4472 v = _PyUnicode_AsUTF8String(unicode, errors); 4473 Py_DECREF(unicode); 4474 return v; 4475} 4476 4477PyObject * 4478PyUnicode_AsUTF8String(PyObject *unicode) 4479{ 4480 return _PyUnicode_AsUTF8String(unicode, NULL); 4481} 4482 4483/* --- UTF-32 Codec ------------------------------------------------------- */ 4484 4485PyObject * 4486PyUnicode_DecodeUTF32(const char *s, 4487 Py_ssize_t size, 4488 const char *errors, 4489 int *byteorder) 4490{ 4491 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL); 4492} 4493 4494PyObject * 4495PyUnicode_DecodeUTF32Stateful(const char *s, 4496 Py_ssize_t size, 4497 const char *errors, 4498 int *byteorder, 4499 Py_ssize_t *consumed) 4500{ 4501 const char *starts = s; 4502 Py_ssize_t startinpos; 4503 Py_ssize_t endinpos; 4504 Py_ssize_t outpos; 4505 PyUnicodeObject *unicode; 4506 Py_UNICODE *p; 4507#ifndef Py_UNICODE_WIDE 4508 int pairs = 0; 4509 const unsigned char *qq; 4510#else 4511 const int pairs = 0; 4512#endif 4513 const unsigned char *q, *e; 4514 int bo = 0; /* assume native ordering by default */ 4515 const char *errmsg = ""; 4516 /* Offsets from q for retrieving bytes in the right order. 
*/ 4517#ifdef BYTEORDER_IS_LITTLE_ENDIAN 4518 int iorder[] = {0, 1, 2, 3}; 4519#else 4520 int iorder[] = {3, 2, 1, 0}; 4521#endif 4522 PyObject *errorHandler = NULL; 4523 PyObject *exc = NULL; 4524 4525 q = (unsigned char *)s; 4526 e = q + size; 4527 4528 if (byteorder) 4529 bo = *byteorder; 4530 4531 /* Check for BOM marks (U+FEFF) in the input and adjust current 4532 byte order setting accordingly. In native mode, the leading BOM 4533 mark is skipped, in all other modes, it is copied to the output 4534 stream as-is (giving a ZWNBSP character). */ 4535 if (bo == 0) { 4536 if (size >= 4) { 4537 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) | 4538 (q[iorder[1]] << 8) | q[iorder[0]]; 4539#ifdef BYTEORDER_IS_LITTLE_ENDIAN 4540 if (bom == 0x0000FEFF) { 4541 q += 4; 4542 bo = -1; 4543 } 4544 else if (bom == 0xFFFE0000) { 4545 q += 4; 4546 bo = 1; 4547 } 4548#else 4549 if (bom == 0x0000FEFF) { 4550 q += 4; 4551 bo = 1; 4552 } 4553 else if (bom == 0xFFFE0000) { 4554 q += 4; 4555 bo = -1; 4556 } 4557#endif 4558 } 4559 } 4560 4561 if (bo == -1) { 4562 /* force LE */ 4563 iorder[0] = 0; 4564 iorder[1] = 1; 4565 iorder[2] = 2; 4566 iorder[3] = 3; 4567 } 4568 else if (bo == 1) { 4569 /* force BE */ 4570 iorder[0] = 3; 4571 iorder[1] = 2; 4572 iorder[2] = 1; 4573 iorder[3] = 0; 4574 } 4575 4576 /* On narrow builds we split characters outside the BMP into two 4577 codepoints => count how much extra space we need. */ 4578#ifndef Py_UNICODE_WIDE 4579 for (qq = q; qq < e; qq += 4) 4580 if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0) 4581 pairs++; 4582#endif 4583 4584 /* This might be one to much, because of a BOM */ 4585 unicode = _PyUnicode_New((size+3)/4+pairs); 4586 if (!unicode) 4587 return NULL; 4588 if (size == 0) 4589 return (PyObject *)unicode; 4590 4591 /* Unpack UTF-32 encoded data */ 4592 p = PyUnicode_AS_UNICODE(unicode); 4593 4594 while (q < e) { 4595 Py_UCS4 ch; 4596 /* remaining bytes at the end? 
(size should be divisible by 4) */ 4597 if (e-q<4) { 4598 if (consumed) 4599 break; 4600 errmsg = "truncated data"; 4601 startinpos = ((const char *)q)-starts; 4602 endinpos = ((const char *)e)-starts; 4603 goto utf32Error; 4604 /* The remaining input chars are ignored if the callback 4605 chooses to skip the input */ 4606 } 4607 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) | 4608 (q[iorder[1]] << 8) | q[iorder[0]]; 4609 4610 if (ch >= 0x110000) 4611 { 4612 errmsg = "codepoint not in range(0x110000)"; 4613 startinpos = ((const char *)q)-starts; 4614 endinpos = startinpos+4; 4615 goto utf32Error; 4616 } 4617#ifndef Py_UNICODE_WIDE 4618 if (ch >= 0x10000) 4619 { 4620 *p++ = 0xD800 | ((ch-0x10000) >> 10); 4621 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF); 4622 } 4623 else 4624#endif 4625 *p++ = ch; 4626 q += 4; 4627 continue; 4628 utf32Error: 4629 outpos = p-PyUnicode_AS_UNICODE(unicode); 4630 if (unicode_decode_call_errorhandler( 4631 errors, &errorHandler, 4632 "utf32", errmsg, 4633 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q, 4634 &unicode, &outpos, &p)) 4635 goto onError; 4636 } 4637 4638 if (byteorder) 4639 *byteorder = bo; 4640 4641 if (consumed) 4642 *consumed = (const char *)q-starts; 4643 4644 /* Adjust length */ 4645 if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0) 4646 goto onError; 4647 4648 Py_XDECREF(errorHandler); 4649 Py_XDECREF(exc); 4650 if (PyUnicode_READY(unicode) == -1) { 4651 Py_DECREF(unicode); 4652 return NULL; 4653 } 4654 return (PyObject *)unicode; 4655 4656 onError: 4657 Py_DECREF(unicode); 4658 Py_XDECREF(errorHandler); 4659 Py_XDECREF(exc); 4660 return NULL; 4661} 4662 4663PyObject * 4664PyUnicode_EncodeUTF32(const Py_UNICODE *s, 4665 Py_ssize_t size, 4666 const char *errors, 4667 int byteorder) 4668{ 4669 PyObject *v; 4670 unsigned char *p; 4671 Py_ssize_t nsize, bytesize; 4672#ifndef Py_UNICODE_WIDE 4673 Py_ssize_t i, pairs; 4674#else 4675 const int pairs = 0; 4676#endif 
4677 /* Offsets from p for storing byte pairs in the right order. */ 4678#ifdef BYTEORDER_IS_LITTLE_ENDIAN 4679 int iorder[] = {0, 1, 2, 3}; 4680#else 4681 int iorder[] = {3, 2, 1, 0}; 4682#endif 4683 4684#define STORECHAR(CH) \ 4685 do { \ 4686 p[iorder[3]] = ((CH) >> 24) & 0xff; \ 4687 p[iorder[2]] = ((CH) >> 16) & 0xff; \ 4688 p[iorder[1]] = ((CH) >> 8) & 0xff; \ 4689 p[iorder[0]] = (CH) & 0xff; \ 4690 p += 4; \ 4691 } while(0) 4692 4693 /* In narrow builds we can output surrogate pairs as one codepoint, 4694 so we need less space. */ 4695#ifndef Py_UNICODE_WIDE 4696 for (i = pairs = 0; i < size-1; i++) 4697 if (0xD800 <= s[i] && s[i] <= 0xDBFF && 4698 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF) 4699 pairs++; 4700#endif 4701 nsize = (size - pairs + (byteorder == 0)); 4702 bytesize = nsize * 4; 4703 if (bytesize / 4 != nsize) 4704 return PyErr_NoMemory(); 4705 v = PyBytes_FromStringAndSize(NULL, bytesize); 4706 if (v == NULL) 4707 return NULL; 4708 4709 p = (unsigned char *)PyBytes_AS_STRING(v); 4710 if (byteorder == 0) 4711 STORECHAR(0xFEFF); 4712 if (size == 0) 4713 goto done; 4714 4715 if (byteorder == -1) { 4716 /* force LE */ 4717 iorder[0] = 0; 4718 iorder[1] = 1; 4719 iorder[2] = 2; 4720 iorder[3] = 3; 4721 } 4722 else if (byteorder == 1) { 4723 /* force BE */ 4724 iorder[0] = 3; 4725 iorder[1] = 2; 4726 iorder[2] = 1; 4727 iorder[3] = 0; 4728 } 4729 4730 while (size-- > 0) { 4731 Py_UCS4 ch = *s++; 4732#ifndef Py_UNICODE_WIDE 4733 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) { 4734 Py_UCS4 ch2 = *s; 4735 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) { 4736 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000; 4737 s++; 4738 size--; 4739 } 4740 } 4741#endif 4742 STORECHAR(ch); 4743 } 4744 4745 done: 4746 return v; 4747#undef STORECHAR 4748} 4749 4750PyObject * 4751PyUnicode_AsUTF32String(PyObject *unicode) 4752{ 4753 if (!PyUnicode_Check(unicode)) { 4754 PyErr_BadArgument(); 4755 return NULL; 4756 } 4757 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode), 4758 
PyUnicode_GET_SIZE(unicode), 4759 NULL, 4760 0); 4761} 4762 4763/* --- UTF-16 Codec ------------------------------------------------------- */ 4764 4765PyObject * 4766PyUnicode_DecodeUTF16(const char *s, 4767 Py_ssize_t size, 4768 const char *errors, 4769 int *byteorder) 4770{ 4771 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL); 4772} 4773 4774/* Two masks for fast checking of whether a C 'long' may contain 4775 UTF16-encoded surrogate characters. This is an efficient heuristic, 4776 assuming that non-surrogate characters with a code point >= 0x8000 are 4777 rare in most input. 4778 FAST_CHAR_MASK is used when the input is in native byte ordering, 4779 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering. 4780*/ 4781#if (SIZEOF_LONG == 8) 4782# define FAST_CHAR_MASK 0x8000800080008000L 4783# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L 4784#elif (SIZEOF_LONG == 4) 4785# define FAST_CHAR_MASK 0x80008000L 4786# define SWAPPED_FAST_CHAR_MASK 0x00800080L 4787#else 4788# error C 'long' size should be either 4 or 8! 4789#endif 4790 4791PyObject * 4792PyUnicode_DecodeUTF16Stateful(const char *s, 4793 Py_ssize_t size, 4794 const char *errors, 4795 int *byteorder, 4796 Py_ssize_t *consumed) 4797{ 4798 const char *starts = s; 4799 Py_ssize_t startinpos; 4800 Py_ssize_t endinpos; 4801 Py_ssize_t outpos; 4802 PyUnicodeObject *unicode; 4803 Py_UNICODE *p; 4804 const unsigned char *q, *e, *aligned_end; 4805 int bo = 0; /* assume native ordering by default */ 4806 int native_ordering = 0; 4807 const char *errmsg = ""; 4808 /* Offsets from q for retrieving byte pairs in the right order. 
*/ 4809#ifdef BYTEORDER_IS_LITTLE_ENDIAN 4810 int ihi = 1, ilo = 0; 4811#else 4812 int ihi = 0, ilo = 1; 4813#endif 4814 PyObject *errorHandler = NULL; 4815 PyObject *exc = NULL; 4816 4817 /* Note: size will always be longer than the resulting Unicode 4818 character count */ 4819 unicode = _PyUnicode_New(size); 4820 if (!unicode) 4821 return NULL; 4822 if (size == 0) 4823 return (PyObject *)unicode; 4824 4825 /* Unpack UTF-16 encoded data */ 4826 p = PyUnicode_AS_UNICODE(unicode); 4827 q = (unsigned char *)s; 4828 e = q + size - 1; 4829 4830 if (byteorder) 4831 bo = *byteorder; 4832 4833 /* Check for BOM marks (U+FEFF) in the input and adjust current 4834 byte order setting accordingly. In native mode, the leading BOM 4835 mark is skipped, in all other modes, it is copied to the output 4836 stream as-is (giving a ZWNBSP character). */ 4837 if (bo == 0) { 4838 if (size >= 2) { 4839 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo]; 4840#ifdef BYTEORDER_IS_LITTLE_ENDIAN 4841 if (bom == 0xFEFF) { 4842 q += 2; 4843 bo = -1; 4844 } 4845 else if (bom == 0xFFFE) { 4846 q += 2; 4847 bo = 1; 4848 } 4849#else 4850 if (bom == 0xFEFF) { 4851 q += 2; 4852 bo = 1; 4853 } 4854 else if (bom == 0xFFFE) { 4855 q += 2; 4856 bo = -1; 4857 } 4858#endif 4859 } 4860 } 4861 4862 if (bo == -1) { 4863 /* force LE */ 4864 ihi = 1; 4865 ilo = 0; 4866 } 4867 else if (bo == 1) { 4868 /* force BE */ 4869 ihi = 0; 4870 ilo = 1; 4871 } 4872#ifdef BYTEORDER_IS_LITTLE_ENDIAN 4873 native_ordering = ilo < ihi; 4874#else 4875 native_ordering = ilo > ihi; 4876#endif 4877 4878 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK); 4879 while (q < e) { 4880 Py_UNICODE ch; 4881 /* First check for possible aligned read of a C 'long'. Unaligned 4882 reads are more expensive, better to defer to another iteration. */ 4883 if (!((size_t) q & LONG_PTR_MASK)) { 4884 /* Fast path for runs of non-surrogate chars. 
*/ 4885 register const unsigned char *_q = q; 4886 Py_UNICODE *_p = p; 4887 if (native_ordering) { 4888 /* Native ordering is simple: as long as the input cannot 4889 possibly contain a surrogate char, do an unrolled copy 4890 of several 16-bit code points to the target object. 4891 The non-surrogate check is done on several input bytes 4892 at a time (as many as a C 'long' can contain). */ 4893 while (_q < aligned_end) { 4894 unsigned long data = * (unsigned long *) _q; 4895 if (data & FAST_CHAR_MASK) 4896 break; 4897 _p[0] = ((unsigned short *) _q)[0]; 4898 _p[1] = ((unsigned short *) _q)[1]; 4899#if (SIZEOF_LONG == 8) 4900 _p[2] = ((unsigned short *) _q)[2]; 4901 _p[3] = ((unsigned short *) _q)[3]; 4902#endif 4903 _q += SIZEOF_LONG; 4904 _p += SIZEOF_LONG / 2; 4905 } 4906 } 4907 else { 4908 /* Byteswapped ordering is similar, but we must decompose 4909 the copy bytewise, and take care of zero'ing out the 4910 upper bytes if the target object is in 32-bit units 4911 (that is, in UCS-4 builds). */ 4912 while (_q < aligned_end) { 4913 unsigned long data = * (unsigned long *) _q; 4914 if (data & SWAPPED_FAST_CHAR_MASK) 4915 break; 4916 /* Zero upper bytes in UCS-4 builds */ 4917#if (Py_UNICODE_SIZE > 2) 4918 _p[0] = 0; 4919 _p[1] = 0; 4920#if (SIZEOF_LONG == 8) 4921 _p[2] = 0; 4922 _p[3] = 0; 4923#endif 4924#endif 4925 /* Issue #4916; UCS-4 builds on big endian machines must 4926 fill the two last bytes of each 4-byte unit. 
*/ 4927#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2) 4928# define OFF 2 4929#else 4930# define OFF 0 4931#endif 4932 ((unsigned char *) _p)[OFF + 1] = _q[0]; 4933 ((unsigned char *) _p)[OFF + 0] = _q[1]; 4934 ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2]; 4935 ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3]; 4936#if (SIZEOF_LONG == 8) 4937 ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4]; 4938 ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5]; 4939 ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6]; 4940 ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7]; 4941#endif 4942#undef OFF 4943 _q += SIZEOF_LONG; 4944 _p += SIZEOF_LONG / 2; 4945 } 4946 } 4947 p = _p; 4948 q = _q; 4949 if (q >= e) 4950 break; 4951 } 4952 ch = (q[ihi] << 8) | q[ilo]; 4953 4954 q += 2; 4955 4956 if (ch < 0xD800 || ch > 0xDFFF) { 4957 *p++ = ch; 4958 continue; 4959 } 4960 4961 /* UTF-16 code pair: */ 4962 if (q > e) { 4963 errmsg = "unexpected end of data"; 4964 startinpos = (((const char *)q) - 2) - starts; 4965 endinpos = ((const char *)e) + 1 - starts; 4966 goto utf16Error; 4967 } 4968 if (0xD800 <= ch && ch <= 0xDBFF) { 4969 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo]; 4970 q += 2; 4971 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) { 4972#ifndef Py_UNICODE_WIDE 4973 *p++ = ch; 4974 *p++ = ch2; 4975#else 4976 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000; 4977#endif 4978 continue; 4979 } 4980 else { 4981 errmsg = "illegal UTF-16 surrogate"; 4982 startinpos = (((const char *)q)-4)-starts; 4983 endinpos = startinpos+2; 4984 goto utf16Error; 4985 } 4986 4987 } 4988 errmsg = "illegal encoding"; 4989 startinpos = (((const char *)q)-2)-starts; 4990 endinpos = startinpos+2; 4991 /* Fall through to report the error */ 4992 4993 utf16Error: 4994 outpos = p - PyUnicode_AS_UNICODE(unicode); 4995 if (unicode_decode_call_errorhandler( 4996 errors, 4997 &errorHandler, 4998 "utf16", errmsg, 4999 &starts, 5000 (const char 
**)&e, 5001 &startinpos, 5002 &endinpos, 5003 &exc, 5004 (const char **)&q, 5005 &unicode, 5006 &outpos, 5007 &p)) 5008 goto onError; 5009 } 5010 /* remaining byte at the end? (size should be even) */ 5011 if (e == q) { 5012 if (!consumed) { 5013 errmsg = "truncated data"; 5014 startinpos = ((const char *)q) - starts; 5015 endinpos = ((const char *)e) + 1 - starts; 5016 outpos = p - PyUnicode_AS_UNICODE(unicode); 5017 if (unicode_decode_call_errorhandler( 5018 errors, 5019 &errorHandler, 5020 "utf16", errmsg, 5021 &starts, 5022 (const char **)&e, 5023 &startinpos, 5024 &endinpos, 5025 &exc, 5026 (const char **)&q, 5027 &unicode, 5028 &outpos, 5029 &p)) 5030 goto onError; 5031 /* The remaining input chars are ignored if the callback 5032 chooses to skip the input */ 5033 } 5034 } 5035 5036 if (byteorder) 5037 *byteorder = bo; 5038 5039 if (consumed) 5040 *consumed = (const char *)q-starts; 5041 5042 /* Adjust length */ 5043 if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0) 5044 goto onError; 5045 5046 Py_XDECREF(errorHandler); 5047 Py_XDECREF(exc); 5048 if (PyUnicode_READY(unicode) == -1) { 5049 Py_DECREF(unicode); 5050 return NULL; 5051 } 5052 return (PyObject *)unicode; 5053 5054 onError: 5055 Py_DECREF(unicode); 5056 Py_XDECREF(errorHandler); 5057 Py_XDECREF(exc); 5058 return NULL; 5059} 5060 5061#undef FAST_CHAR_MASK 5062#undef SWAPPED_FAST_CHAR_MASK 5063 5064PyObject * 5065PyUnicode_EncodeUTF16(const Py_UNICODE *s, 5066 Py_ssize_t size, 5067 const char *errors, 5068 int byteorder) 5069{ 5070 PyObject *v; 5071 unsigned char *p; 5072 Py_ssize_t nsize, bytesize; 5073#ifdef Py_UNICODE_WIDE 5074 Py_ssize_t i, pairs; 5075#else 5076 const int pairs = 0; 5077#endif 5078 /* Offsets from p for storing byte pairs in the right order. 
*/ 5079#ifdef BYTEORDER_IS_LITTLE_ENDIAN 5080 int ihi = 1, ilo = 0; 5081#else 5082 int ihi = 0, ilo = 1; 5083#endif 5084 5085#define STORECHAR(CH) \ 5086 do { \ 5087 p[ihi] = ((CH) >> 8) & 0xff; \ 5088 p[ilo] = (CH) & 0xff; \ 5089 p += 2; \ 5090 } while(0) 5091 5092#ifdef Py_UNICODE_WIDE 5093 for (i = pairs = 0; i < size; i++) 5094 if (s[i] >= 0x10000) 5095 pairs++; 5096#endif 5097 /* 2 * (size + pairs + (byteorder == 0)) */ 5098 if (size > PY_SSIZE_T_MAX || 5099 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0)) 5100 return PyErr_NoMemory(); 5101 nsize = size + pairs + (byteorder == 0); 5102 bytesize = nsize * 2; 5103 if (bytesize / 2 != nsize) 5104 return PyErr_NoMemory(); 5105 v = PyBytes_FromStringAndSize(NULL, bytesize); 5106 if (v == NULL) 5107 return NULL; 5108 5109 p = (unsigned char *)PyBytes_AS_STRING(v); 5110 if (byteorder == 0) 5111 STORECHAR(0xFEFF); 5112 if (size == 0) 5113 goto done; 5114 5115 if (byteorder == -1) { 5116 /* force LE */ 5117 ihi = 1; 5118 ilo = 0; 5119 } 5120 else if (byteorder == 1) { 5121 /* force BE */ 5122 ihi = 0; 5123 ilo = 1; 5124 } 5125 5126 while (size-- > 0) { 5127 Py_UNICODE ch = *s++; 5128 Py_UNICODE ch2 = 0; 5129#ifdef Py_UNICODE_WIDE 5130 if (ch >= 0x10000) { 5131 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF); 5132 ch = 0xD800 | ((ch-0x10000) >> 10); 5133 } 5134#endif 5135 STORECHAR(ch); 5136 if (ch2) 5137 STORECHAR(ch2); 5138 } 5139 5140 done: 5141 return v; 5142#undef STORECHAR 5143} 5144 5145PyObject * 5146PyUnicode_AsUTF16String(PyObject *unicode) 5147{ 5148 if (!PyUnicode_Check(unicode)) { 5149 PyErr_BadArgument(); 5150 return NULL; 5151 } 5152 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode), 5153 PyUnicode_GET_SIZE(unicode), 5154 NULL, 5155 0); 5156} 5157 5158/* --- Unicode Escape Codec ----------------------------------------------- */ 5159 5160/* Helper function for PyUnicode_DecodeUnicodeEscape, determines 5161 if all the escapes in the string make it still a valid ASCII string. 
/* Helper function for PyUnicode_DecodeUnicodeEscape: decides whether the
   escaped byte string s[0..size) decodes to a pure ASCII string and, if
   so, how long that string is.

   Returns the number of characters the decoded string will have, or -1
   when the result cannot be predicted to be ASCII:
     - size is negative,
     - a byte > 127 occurs (literally or right after a backslash),
     - the input ends directly after a backslash,
     - an octal, \x, \u, \U or \N escape occurs (their value is not
       inspected here, so they may leave the ASCII range).

   A backslash followed by a newline contributes no character; an
   unrecognised escape such as "\q" is kept as two characters. */
Py_ssize_t
length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
{
    const unsigned char *p;
    const unsigned char *end;
    Py_ssize_t length = 0;

    /* Validate size before forming s + size: pointer arithmetic with a
       negative offset would leave the bounds of the buffer. */
    if (size < 0)
        return -1;

    p = (const unsigned char *)s;
    end = p + size;

    for (; p < end; ++p) {
        if (*p > 127) {
            /* Non-ASCII */
            return -1;
        }
        if (*p != '\\') {
            /* Normal character */
            ++length;
            continue;
        }

        /* Backslash-escape, check next char */
        ++p;
        /* Escape sequence reaches till end of string or
           non-ASCII follow-up. */
        if (p >= end || *p > 127)
            return -1;

        switch (*p) {
        case '\n':
            /* backslash + \n result in zero characters */
            break;
        case '\\': case '\'': case '\"':
        case 'b': case 'f': case 't':
        case 'n': case 'r': case 'v': case 'a':
            /* single-character escapes */
            ++length;
            break;
        case '0': case '1': case '2': case '3':
        case '4': case '5': case '6': case '7':
        case 'x': case 'u': case 'U': case 'N':
            /* these do not guarantee ASCII characters */
            return -1;
        default:
            /* count the backslash + the other character */
            length += 2;
            break;
        }
    }
    return length;
}
*/ 5217#define WRITE_ASCII_OR_WSTR(kind, buf, index, value) \ 5218 do { \ 5219 if ((kind) != PyUnicode_WCHAR_KIND) \ 5220 ((unsigned char *)(buf))[(index)] = (unsigned char)(value); \ 5221 else \ 5222 ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value); \ 5223 } while (0) 5224 5225#define WRITE_WSTR(buf, index, value) \ 5226 assert(kind == PyUnicode_WCHAR_KIND), \ 5227 ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value) 5228 5229 5230static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL; 5231 5232PyObject * 5233PyUnicode_DecodeUnicodeEscape(const char *s, 5234 Py_ssize_t size, 5235 const char *errors) 5236{ 5237 const char *starts = s; 5238 Py_ssize_t startinpos; 5239 Py_ssize_t endinpos; 5240 int j; 5241 PyUnicodeObject *v; 5242 Py_UNICODE *p; 5243 const char *end; 5244 char* message; 5245 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */ 5246 PyObject *errorHandler = NULL; 5247 PyObject *exc = NULL; 5248 Py_ssize_t ascii_length; 5249 Py_ssize_t i; 5250 int kind; 5251 void *data; 5252 5253 ascii_length = length_of_escaped_ascii_string(s, size); 5254 5255 /* After length_of_escaped_ascii_string() there are two alternatives, 5256 either the string is pure ASCII with named escapes like \n, etc. 5257 and we determined it's exact size (common case) 5258 or it contains \x, \u, ... escape sequences. then we create a 5259 legacy wchar string and resize it at the end of this function. */ 5260 if (ascii_length >= 0) { 5261 v = (PyUnicodeObject *)PyUnicode_New(ascii_length, 127); 5262 if (!v) 5263 goto onError; 5264 assert(PyUnicode_KIND(v) == PyUnicode_1BYTE_KIND); 5265 kind = PyUnicode_1BYTE_KIND; 5266 data = PyUnicode_DATA(v); 5267 } 5268 else { 5269 /* Escaped strings will always be longer than the resulting 5270 Unicode string, so we start with size here and then reduce the 5271 length after conversion to the true value. 
5272 (but if the error callback returns a long replacement string 5273 we'll have to allocate more space) */ 5274 v = _PyUnicode_New(size); 5275 if (!v) 5276 goto onError; 5277 kind = PyUnicode_WCHAR_KIND; 5278 data = PyUnicode_AS_UNICODE(v); 5279 } 5280 5281 if (size == 0) 5282 return (PyObject *)v; 5283 i = 0; 5284 end = s + size; 5285 5286 while (s < end) { 5287 unsigned char c; 5288 Py_UNICODE x; 5289 int digits; 5290 5291 if (kind == PyUnicode_WCHAR_KIND) { 5292 assert(i < _PyUnicode_WSTR_LENGTH(v)); 5293 } 5294 else { 5295 /* The only case in which i == ascii_length is a backslash 5296 followed by a newline. */ 5297 assert(i <= ascii_length); 5298 } 5299 5300 /* Non-escape characters are interpreted as Unicode ordinals */ 5301 if (*s != '\\') { 5302 WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char) *s++); 5303 continue; 5304 } 5305 5306 startinpos = s-starts; 5307 /* \ - Escapes */ 5308 s++; 5309 c = *s++; 5310 if (s > end) 5311 c = '\0'; /* Invalid after \ */ 5312 5313 if (kind == PyUnicode_WCHAR_KIND) { 5314 assert(i < _PyUnicode_WSTR_LENGTH(v)); 5315 } 5316 else { 5317 /* The only case in which i == ascii_length is a backslash 5318 followed by a newline. 
*/ 5319 assert(i < ascii_length || (i == ascii_length && c == '\n')); 5320 } 5321 5322 switch (c) { 5323 5324 /* \x escapes */ 5325 case '\n': break; 5326 case '\\': WRITE_ASCII_OR_WSTR(kind, data, i++, '\\'); break; 5327 case '\'': WRITE_ASCII_OR_WSTR(kind, data, i++, '\''); break; 5328 case '\"': WRITE_ASCII_OR_WSTR(kind, data, i++, '\"'); break; 5329 case 'b': WRITE_ASCII_OR_WSTR(kind, data, i++, '\b'); break; 5330 /* FF */ 5331 case 'f': WRITE_ASCII_OR_WSTR(kind, data, i++, '\014'); break; 5332 case 't': WRITE_ASCII_OR_WSTR(kind, data, i++, '\t'); break; 5333 case 'n': WRITE_ASCII_OR_WSTR(kind, data, i++, '\n'); break; 5334 case 'r': WRITE_ASCII_OR_WSTR(kind, data, i++, '\r'); break; 5335 /* VT */ 5336 case 'v': WRITE_ASCII_OR_WSTR(kind, data, i++, '\013'); break; 5337 /* BEL, not classic C */ 5338 case 'a': WRITE_ASCII_OR_WSTR(kind, data, i++, '\007'); break; 5339 5340 /* \OOO (octal) escapes */ 5341 case '0': case '1': case '2': case '3': 5342 case '4': case '5': case '6': case '7': 5343 x = s[-1] - '0'; 5344 if (s < end && '0' <= *s && *s <= '7') { 5345 x = (x<<3) + *s++ - '0'; 5346 if (s < end && '0' <= *s && *s <= '7') 5347 x = (x<<3) + *s++ - '0'; 5348 } 5349 WRITE_WSTR(data, i++, x); 5350 break; 5351 5352 /* hex escapes */ 5353 /* \xXX */ 5354 case 'x': 5355 digits = 2; 5356 message = "truncated \\xXX escape"; 5357 goto hexescape; 5358 5359 /* \uXXXX */ 5360 case 'u': 5361 digits = 4; 5362 message = "truncated \\uXXXX escape"; 5363 goto hexescape; 5364 5365 /* \UXXXXXXXX */ 5366 case 'U': 5367 digits = 8; 5368 message = "truncated \\UXXXXXXXX escape"; 5369 hexescape: 5370 chr = 0; 5371 p = PyUnicode_AS_UNICODE(v) + i; 5372 if (s+digits>end) { 5373 endinpos = size; 5374 if (unicode_decode_call_errorhandler( 5375 errors, &errorHandler, 5376 "unicodeescape", "end of string in escape sequence", 5377 &starts, &end, &startinpos, &endinpos, &exc, &s, 5378 &v, &i, &p)) 5379 goto onError; 5380 data = PyUnicode_AS_UNICODE(v); 5381 goto nextByte; 5382 } 5383 for (j 
= 0; j < digits; ++j) { 5384 c = (unsigned char) s[j]; 5385 if (!Py_ISXDIGIT(c)) { 5386 endinpos = (s+j+1)-starts; 5387 p = PyUnicode_AS_UNICODE(v) + i; 5388 if (unicode_decode_call_errorhandler( 5389 errors, &errorHandler, 5390 "unicodeescape", message, 5391 &starts, &end, &startinpos, &endinpos, &exc, &s, 5392 &v, &i, &p)) 5393 goto onError; 5394 data = PyUnicode_AS_UNICODE(v); 5395 goto nextByte; 5396 } 5397 chr = (chr<<4) & ~0xF; 5398 if (c >= '0' && c <= '9') 5399 chr += c - '0'; 5400 else if (c >= 'a' && c <= 'f') 5401 chr += 10 + c - 'a'; 5402 else 5403 chr += 10 + c - 'A'; 5404 } 5405 s += j; 5406 if (chr == 0xffffffff && PyErr_Occurred()) 5407 /* _decoding_error will have already written into the 5408 target buffer. */ 5409 break; 5410 store: 5411 /* when we get here, chr is a 32-bit unicode character */ 5412 if (chr <= 0xffff) 5413 /* UCS-2 character */ 5414 WRITE_WSTR(data, i++, chr); 5415 else if (chr <= 0x10ffff) { 5416 /* UCS-4 character. Either store directly, or as 5417 surrogate pair. 
*/ 5418#ifdef Py_UNICODE_WIDE 5419 WRITE_WSTR(data, i++, chr); 5420#else 5421 chr -= 0x10000L; 5422 WRITE_WSTR(data, i++, 0xD800 + (Py_UNICODE) (chr >> 10)); 5423 WRITE_WSTR(data, i++, 0xDC00 + (Py_UNICODE) (chr & 0x03FF)); 5424#endif 5425 } else { 5426 endinpos = s-starts; 5427 p = PyUnicode_AS_UNICODE(v) + i; 5428 if (unicode_decode_call_errorhandler( 5429 errors, &errorHandler, 5430 "unicodeescape", "illegal Unicode character", 5431 &starts, &end, &startinpos, &endinpos, &exc, &s, 5432 &v, &i, &p)) 5433 goto onError; 5434 data = PyUnicode_AS_UNICODE(v); 5435 } 5436 break; 5437 5438 /* \N{name} */ 5439 case 'N': 5440 message = "malformed \\N character escape"; 5441 if (ucnhash_CAPI == NULL) { 5442 /* load the unicode data module */ 5443 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import( 5444 PyUnicodeData_CAPSULE_NAME, 1); 5445 if (ucnhash_CAPI == NULL) 5446 goto ucnhashError; 5447 } 5448 if (*s == '{') { 5449 const char *start = s+1; 5450 /* look for the closing brace */ 5451 while (*s != '}' && s < end) 5452 s++; 5453 if (s > start && s < end && *s == '}') { 5454 /* found a name. 
look it up in the unicode database */ 5455 message = "unknown Unicode character name"; 5456 s++; 5457 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), 5458 &chr)) 5459 goto store; 5460 } 5461 } 5462 endinpos = s-starts; 5463 p = PyUnicode_AS_UNICODE(v) + i; 5464 if (unicode_decode_call_errorhandler( 5465 errors, &errorHandler, 5466 "unicodeescape", message, 5467 &starts, &end, &startinpos, &endinpos, &exc, &s, 5468 &v, &i, &p)) 5469 goto onError; 5470 data = PyUnicode_AS_UNICODE(v); 5471 break; 5472 5473 default: 5474 if (s > end) { 5475 assert(kind == PyUnicode_WCHAR_KIND); 5476 message = "\\ at end of string"; 5477 s--; 5478 endinpos = s-starts; 5479 p = PyUnicode_AS_UNICODE(v) + i; 5480 if (unicode_decode_call_errorhandler( 5481 errors, &errorHandler, 5482 "unicodeescape", message, 5483 &starts, &end, &startinpos, &endinpos, &exc, &s, 5484 &v, &i, &p)) 5485 goto onError; 5486 data = PyUnicode_AS_UNICODE(v); 5487 } 5488 else { 5489 WRITE_ASCII_OR_WSTR(kind, data, i++, '\\'); 5490 WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char)s[-1]); 5491 } 5492 break; 5493 } 5494 nextByte: 5495 ; 5496 } 5497 /* Ensure the length prediction worked in case of ASCII strings */ 5498 assert(kind == PyUnicode_WCHAR_KIND || i == ascii_length); 5499 5500 if (kind == PyUnicode_WCHAR_KIND) 5501 { 5502 if (PyUnicode_Resize((PyObject**)&v, i) < 0) 5503 goto onError; 5504 if (PyUnicode_READY(v) == -1) 5505 goto onError; 5506 } 5507 Py_XDECREF(errorHandler); 5508 Py_XDECREF(exc); 5509 return (PyObject *)v; 5510 5511 ucnhashError: 5512 PyErr_SetString( 5513 PyExc_UnicodeError, 5514 "\\N escapes not supported (can't load unicodedata module)" 5515 ); 5516 Py_XDECREF(v); 5517 Py_XDECREF(errorHandler); 5518 Py_XDECREF(exc); 5519 return NULL; 5520 5521 onError: 5522 Py_XDECREF(v); 5523 Py_XDECREF(errorHandler); 5524 Py_XDECREF(exc); 5525 return NULL; 5526} 5527 5528#undef WRITE_ASCII_OR_WSTR 5529#undef WRITE_WSTR 5530 5531/* Return a Unicode-Escape string version of the Unicode object. 
5532 5533 If quotes is true, the string is enclosed in u"" or u'' quotes as 5534 appropriate. 5535 5536*/ 5537 5538static const char *hexdigits = "0123456789abcdef"; 5539 5540PyObject * 5541PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s, 5542 Py_ssize_t size) 5543{ 5544 PyObject *repr; 5545 char *p; 5546 5547#ifdef Py_UNICODE_WIDE 5548 const Py_ssize_t expandsize = 10; 5549#else 5550 const Py_ssize_t expandsize = 6; 5551#endif 5552 5553 /* XXX(nnorwitz): rather than over-allocating, it would be 5554 better to choose a different scheme. Perhaps scan the 5555 first N-chars of the string and allocate based on that size. 5556 */ 5557 /* Initial allocation is based on the longest-possible unichr 5558 escape. 5559 5560 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source 5561 unichr, so in this case it's the longest unichr escape. In 5562 narrow (UTF-16) builds this is five chars per source unichr 5563 since there are two unichrs in the surrogate pair, so in narrow 5564 (UTF-16) builds it's not the longest unichr escape. 5565 5566 In wide or narrow builds '\uxxxx' is 6 chars per source unichr, 5567 so in the narrow (UTF-16) build case it's the longest unichr 5568 escape. 
5569 */ 5570 5571 if (size == 0) 5572 return PyBytes_FromStringAndSize(NULL, 0); 5573 5574 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize) 5575 return PyErr_NoMemory(); 5576 5577 repr = PyBytes_FromStringAndSize(NULL, 5578 2 5579 + expandsize*size 5580 + 1); 5581 if (repr == NULL) 5582 return NULL; 5583 5584 p = PyBytes_AS_STRING(repr); 5585 5586 while (size-- > 0) { 5587 Py_UNICODE ch = *s++; 5588 5589 /* Escape backslashes */ 5590 if (ch == '\\') { 5591 *p++ = '\\'; 5592 *p++ = (char) ch; 5593 continue; 5594 } 5595 5596#ifdef Py_UNICODE_WIDE 5597 /* Map 21-bit characters to '\U00xxxxxx' */ 5598 else if (ch >= 0x10000) { 5599 *p++ = '\\'; 5600 *p++ = 'U'; 5601 *p++ = hexdigits[(ch >> 28) & 0x0000000F]; 5602 *p++ = hexdigits[(ch >> 24) & 0x0000000F]; 5603 *p++ = hexdigits[(ch >> 20) & 0x0000000F]; 5604 *p++ = hexdigits[(ch >> 16) & 0x0000000F]; 5605 *p++ = hexdigits[(ch >> 12) & 0x0000000F]; 5606 *p++ = hexdigits[(ch >> 8) & 0x0000000F]; 5607 *p++ = hexdigits[(ch >> 4) & 0x0000000F]; 5608 *p++ = hexdigits[ch & 0x0000000F]; 5609 continue; 5610 } 5611#else 5612 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */ 5613 else if (ch >= 0xD800 && ch < 0xDC00) { 5614 Py_UNICODE ch2; 5615 Py_UCS4 ucs; 5616 5617 ch2 = *s++; 5618 size--; 5619 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) { 5620 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000; 5621 *p++ = '\\'; 5622 *p++ = 'U'; 5623 *p++ = hexdigits[(ucs >> 28) & 0x0000000F]; 5624 *p++ = hexdigits[(ucs >> 24) & 0x0000000F]; 5625 *p++ = hexdigits[(ucs >> 20) & 0x0000000F]; 5626 *p++ = hexdigits[(ucs >> 16) & 0x0000000F]; 5627 *p++ = hexdigits[(ucs >> 12) & 0x0000000F]; 5628 *p++ = hexdigits[(ucs >> 8) & 0x0000000F]; 5629 *p++ = hexdigits[(ucs >> 4) & 0x0000000F]; 5630 *p++ = hexdigits[ucs & 0x0000000F]; 5631 continue; 5632 } 5633 /* Fall through: isolated surrogates are copied as-is */ 5634 s--; 5635 size++; 5636 } 5637#endif 5638 5639 /* Map 16-bit characters to '\uxxxx' */ 5640 if (ch >= 256) { 5641 *p++ = '\\'; 5642 *p++ = 
'u'; 5643 *p++ = hexdigits[(ch >> 12) & 0x000F]; 5644 *p++ = hexdigits[(ch >> 8) & 0x000F]; 5645 *p++ = hexdigits[(ch >> 4) & 0x000F]; 5646 *p++ = hexdigits[ch & 0x000F]; 5647 } 5648 5649 /* Map special whitespace to '\t', \n', '\r' */ 5650 else if (ch == '\t') { 5651 *p++ = '\\'; 5652 *p++ = 't'; 5653 } 5654 else if (ch == '\n') { 5655 *p++ = '\\'; 5656 *p++ = 'n'; 5657 } 5658 else if (ch == '\r') { 5659 *p++ = '\\'; 5660 *p++ = 'r'; 5661 } 5662 5663 /* Map non-printable US ASCII to '\xhh' */ 5664 else if (ch < ' ' || ch >= 0x7F) { 5665 *p++ = '\\'; 5666 *p++ = 'x'; 5667 *p++ = hexdigits[(ch >> 4) & 0x000F]; 5668 *p++ = hexdigits[ch & 0x000F]; 5669 } 5670 5671 /* Copy everything else as-is */ 5672 else 5673 *p++ = (char) ch; 5674 } 5675 5676 assert(p - PyBytes_AS_STRING(repr) > 0); 5677 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) 5678 return NULL; 5679 return repr; 5680} 5681 5682PyObject * 5683PyUnicode_AsUnicodeEscapeString(PyObject *unicode) 5684{ 5685 PyObject *s; 5686 if (!PyUnicode_Check(unicode)) { 5687 PyErr_BadArgument(); 5688 return NULL; 5689 } 5690 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode), 5691 PyUnicode_GET_SIZE(unicode)); 5692 return s; 5693} 5694 5695/* --- Raw Unicode Escape Codec ------------------------------------------- */ 5696 5697PyObject * 5698PyUnicode_DecodeRawUnicodeEscape(const char *s, 5699 Py_ssize_t size, 5700 const char *errors) 5701{ 5702 const char *starts = s; 5703 Py_ssize_t startinpos; 5704 Py_ssize_t endinpos; 5705 Py_ssize_t outpos; 5706 PyUnicodeObject *v; 5707 Py_UNICODE *p; 5708 const char *end; 5709 const char *bs; 5710 PyObject *errorHandler = NULL; 5711 PyObject *exc = NULL; 5712 5713 /* Escaped strings will always be longer than the resulting 5714 Unicode string, so we start with size here and then reduce the 5715 length after conversion to the true value. 
(But decoding error 5716 handler might have to resize the string) */ 5717 v = _PyUnicode_New(size); 5718 if (v == NULL) 5719 goto onError; 5720 if (size == 0) 5721 return (PyObject *)v; 5722 p = PyUnicode_AS_UNICODE(v); 5723 end = s + size; 5724 while (s < end) { 5725 unsigned char c; 5726 Py_UCS4 x; 5727 int i; 5728 int count; 5729 5730 /* Non-escape characters are interpreted as Unicode ordinals */ 5731 if (*s != '\\') { 5732 *p++ = (unsigned char)*s++; 5733 continue; 5734 } 5735 startinpos = s-starts; 5736 5737 /* \u-escapes are only interpreted iff the number of leading 5738 backslashes if odd */ 5739 bs = s; 5740 for (;s < end;) { 5741 if (*s != '\\') 5742 break; 5743 *p++ = (unsigned char)*s++; 5744 } 5745 if (((s - bs) & 1) == 0 || 5746 s >= end || 5747 (*s != 'u' && *s != 'U')) { 5748 continue; 5749 } 5750 p--; 5751 count = *s=='u' ? 4 : 8; 5752 s++; 5753 5754 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */ 5755 outpos = p-PyUnicode_AS_UNICODE(v); 5756 for (x = 0, i = 0; i < count; ++i, ++s) { 5757 c = (unsigned char)*s; 5758 if (!Py_ISXDIGIT(c)) { 5759 endinpos = s-starts; 5760 if (unicode_decode_call_errorhandler( 5761 errors, &errorHandler, 5762 "rawunicodeescape", "truncated \\uXXXX", 5763 &starts, &end, &startinpos, &endinpos, &exc, &s, 5764 &v, &outpos, &p)) 5765 goto onError; 5766 goto nextByte; 5767 } 5768 x = (x<<4) & ~0xF; 5769 if (c >= '0' && c <= '9') 5770 x += c - '0'; 5771 else if (c >= 'a' && c <= 'f') 5772 x += 10 + c - 'a'; 5773 else 5774 x += 10 + c - 'A'; 5775 } 5776 if (x <= 0xffff) 5777 /* UCS-2 character */ 5778 *p++ = (Py_UNICODE) x; 5779 else if (x <= 0x10ffff) { 5780 /* UCS-4 character. Either store directly, or as 5781 surrogate pair. 
*/ 5782#ifdef Py_UNICODE_WIDE 5783 *p++ = (Py_UNICODE) x; 5784#else 5785 x -= 0x10000L; 5786 *p++ = 0xD800 + (Py_UNICODE) (x >> 10); 5787 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF); 5788#endif 5789 } else { 5790 endinpos = s-starts; 5791 outpos = p-PyUnicode_AS_UNICODE(v); 5792 if (unicode_decode_call_errorhandler( 5793 errors, &errorHandler, 5794 "rawunicodeescape", "\\Uxxxxxxxx out of range", 5795 &starts, &end, &startinpos, &endinpos, &exc, &s, 5796 &v, &outpos, &p)) 5797 goto onError; 5798 } 5799 nextByte: 5800 ; 5801 } 5802 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0) 5803 goto onError; 5804 Py_XDECREF(errorHandler); 5805 Py_XDECREF(exc); 5806 if (PyUnicode_READY(v) == -1) { 5807 Py_DECREF(v); 5808 return NULL; 5809 } 5810 return (PyObject *)v; 5811 5812 onError: 5813 Py_XDECREF(v); 5814 Py_XDECREF(errorHandler); 5815 Py_XDECREF(exc); 5816 return NULL; 5817} 5818 5819PyObject * 5820PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s, 5821 Py_ssize_t size) 5822{ 5823 PyObject *repr; 5824 char *p; 5825 char *q; 5826 5827#ifdef Py_UNICODE_WIDE 5828 const Py_ssize_t expandsize = 10; 5829#else 5830 const Py_ssize_t expandsize = 6; 5831#endif 5832 5833 if (size > PY_SSIZE_T_MAX / expandsize) 5834 return PyErr_NoMemory(); 5835 5836 repr = PyBytes_FromStringAndSize(NULL, expandsize * size); 5837 if (repr == NULL) 5838 return NULL; 5839 if (size == 0) 5840 return repr; 5841 5842 p = q = PyBytes_AS_STRING(repr); 5843 while (size-- > 0) { 5844 Py_UNICODE ch = *s++; 5845#ifdef Py_UNICODE_WIDE 5846 /* Map 32-bit characters to '\Uxxxxxxxx' */ 5847 if (ch >= 0x10000) { 5848 *p++ = '\\'; 5849 *p++ = 'U'; 5850 *p++ = hexdigits[(ch >> 28) & 0xf]; 5851 *p++ = hexdigits[(ch >> 24) & 0xf]; 5852 *p++ = hexdigits[(ch >> 20) & 0xf]; 5853 *p++ = hexdigits[(ch >> 16) & 0xf]; 5854 *p++ = hexdigits[(ch >> 12) & 0xf]; 5855 *p++ = hexdigits[(ch >> 8) & 0xf]; 5856 *p++ = hexdigits[(ch >> 4) & 0xf]; 5857 *p++ = hexdigits[ch & 15]; 5858 } 5859 else 5860#else 5861 /* 
Map UTF-16 surrogate pairs to '\U00xxxxxx' */ 5862 if (ch >= 0xD800 && ch < 0xDC00) { 5863 Py_UNICODE ch2; 5864 Py_UCS4 ucs; 5865 5866 ch2 = *s++; 5867 size--; 5868 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) { 5869 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000; 5870 *p++ = '\\'; 5871 *p++ = 'U'; 5872 *p++ = hexdigits[(ucs >> 28) & 0xf]; 5873 *p++ = hexdigits[(ucs >> 24) & 0xf]; 5874 *p++ = hexdigits[(ucs >> 20) & 0xf]; 5875 *p++ = hexdigits[(ucs >> 16) & 0xf]; 5876 *p++ = hexdigits[(ucs >> 12) & 0xf]; 5877 *p++ = hexdigits[(ucs >> 8) & 0xf]; 5878 *p++ = hexdigits[(ucs >> 4) & 0xf]; 5879 *p++ = hexdigits[ucs & 0xf]; 5880 continue; 5881 } 5882 /* Fall through: isolated surrogates are copied as-is */ 5883 s--; 5884 size++; 5885 } 5886#endif 5887 /* Map 16-bit characters to '\uxxxx' */ 5888 if (ch >= 256) { 5889 *p++ = '\\'; 5890 *p++ = 'u'; 5891 *p++ = hexdigits[(ch >> 12) & 0xf]; 5892 *p++ = hexdigits[(ch >> 8) & 0xf]; 5893 *p++ = hexdigits[(ch >> 4) & 0xf]; 5894 *p++ = hexdigits[ch & 15]; 5895 } 5896 /* Copy everything else as-is */ 5897 else 5898 *p++ = (char) ch; 5899 } 5900 size = p - q; 5901 5902 assert(size > 0); 5903 if (_PyBytes_Resize(&repr, size) < 0) 5904 return NULL; 5905 return repr; 5906} 5907 5908PyObject * 5909PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode) 5910{ 5911 PyObject *s; 5912 if (!PyUnicode_Check(unicode)) { 5913 PyErr_BadArgument(); 5914 return NULL; 5915 } 5916 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode), 5917 PyUnicode_GET_SIZE(unicode)); 5918 5919 return s; 5920} 5921 5922/* --- Unicode Internal Codec ------------------------------------------- */ 5923 5924PyObject * 5925_PyUnicode_DecodeUnicodeInternal(const char *s, 5926 Py_ssize_t size, 5927 const char *errors) 5928{ 5929 const char *starts = s; 5930 Py_ssize_t startinpos; 5931 Py_ssize_t endinpos; 5932 Py_ssize_t outpos; 5933 PyUnicodeObject *v; 5934 Py_UNICODE *p; 5935 const char *end; 5936 const char *reason; 5937 PyObject *errorHandler = NULL; 
5938 PyObject *exc = NULL; 5939 5940#ifdef Py_UNICODE_WIDE 5941 Py_UNICODE unimax = PyUnicode_GetMax(); 5942#endif 5943 5944 /* XXX overflow detection missing */ 5945 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE); 5946 if (v == NULL) 5947 goto onError; 5948 /* Intentionally PyUnicode_GET_SIZE instead of PyUnicode_GET_LENGTH 5949 as string was created with the old API. */ 5950 if (PyUnicode_GET_SIZE(v) == 0) 5951 return (PyObject *)v; 5952 p = PyUnicode_AS_UNICODE(v); 5953 end = s + size; 5954 5955 while (s < end) { 5956 memcpy(p, s, sizeof(Py_UNICODE)); 5957 /* We have to sanity check the raw data, otherwise doom looms for 5958 some malformed UCS-4 data. */ 5959 if ( 5960#ifdef Py_UNICODE_WIDE 5961 *p > unimax || *p < 0 || 5962#endif 5963 end-s < Py_UNICODE_SIZE 5964 ) 5965 { 5966 startinpos = s - starts; 5967 if (end-s < Py_UNICODE_SIZE) { 5968 endinpos = end-starts; 5969 reason = "truncated input"; 5970 } 5971 else { 5972 endinpos = s - starts + Py_UNICODE_SIZE; 5973 reason = "illegal code point (> 0x10FFFF)"; 5974 } 5975 outpos = p - PyUnicode_AS_UNICODE(v); 5976 if (unicode_decode_call_errorhandler( 5977 errors, &errorHandler, 5978 "unicode_internal", reason, 5979 &starts, &end, &startinpos, &endinpos, &exc, &s, 5980 &v, &outpos, &p)) { 5981 goto onError; 5982 } 5983 } 5984 else { 5985 p++; 5986 s += Py_UNICODE_SIZE; 5987 } 5988 } 5989 5990 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0) 5991 goto onError; 5992 Py_XDECREF(errorHandler); 5993 Py_XDECREF(exc); 5994 if (PyUnicode_READY(v) == -1) { 5995 Py_DECREF(v); 5996 return NULL; 5997 } 5998 return (PyObject *)v; 5999 6000 onError: 6001 Py_XDECREF(v); 6002 Py_XDECREF(errorHandler); 6003 Py_XDECREF(exc); 6004 return NULL; 6005} 6006 6007/* --- Latin-1 Codec ------------------------------------------------------ */ 6008 6009PyObject * 6010PyUnicode_DecodeLatin1(const char *s, 6011 Py_ssize_t size, 6012 const char *errors) 6013{ 6014 /* Latin-1 is equivalent to the first 256 
ordinals in Unicode. */ 6015 return _PyUnicode_FromUCS1((unsigned char*)s, size); 6016} 6017 6018/* create or adjust a UnicodeEncodeError */ 6019static void 6020make_encode_exception(PyObject **exceptionObject, 6021 const char *encoding, 6022 const Py_UNICODE *unicode, Py_ssize_t size, 6023 Py_ssize_t startpos, Py_ssize_t endpos, 6024 const char *reason) 6025{ 6026 if (*exceptionObject == NULL) { 6027 *exceptionObject = PyUnicodeEncodeError_Create( 6028 encoding, unicode, size, startpos, endpos, reason); 6029 } 6030 else { 6031 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos)) 6032 goto onError; 6033 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos)) 6034 goto onError; 6035 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason)) 6036 goto onError; 6037 return; 6038 onError: 6039 Py_DECREF(*exceptionObject); 6040 *exceptionObject = NULL; 6041 } 6042} 6043 6044/* raises a UnicodeEncodeError */ 6045static void 6046raise_encode_exception(PyObject **exceptionObject, 6047 const char *encoding, 6048 const Py_UNICODE *unicode, Py_ssize_t size, 6049 Py_ssize_t startpos, Py_ssize_t endpos, 6050 const char *reason) 6051{ 6052 make_encode_exception(exceptionObject, 6053 encoding, unicode, size, startpos, endpos, reason); 6054 if (*exceptionObject != NULL) 6055 PyCodec_StrictErrors(*exceptionObject); 6056} 6057 6058/* error handling callback helper: 6059 build arguments, call the callback and check the arguments, 6060 put the result into newpos and return the replacement string, which 6061 has to be freed by the caller */ 6062static PyObject * 6063unicode_encode_call_errorhandler(const char *errors, 6064 PyObject **errorHandler, 6065 const char *encoding, const char *reason, 6066 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject, 6067 Py_ssize_t startpos, Py_ssize_t endpos, 6068 Py_ssize_t *newpos) 6069{ 6070 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple"; 6071 6072 PyObject *restuple; 6073 
PyObject *resunicode; 6074 6075 if (*errorHandler == NULL) { 6076 *errorHandler = PyCodec_LookupError(errors); 6077 if (*errorHandler == NULL) 6078 return NULL; 6079 } 6080 6081 make_encode_exception(exceptionObject, 6082 encoding, unicode, size, startpos, endpos, reason); 6083 if (*exceptionObject == NULL) 6084 return NULL; 6085 6086 restuple = PyObject_CallFunctionObjArgs( 6087 *errorHandler, *exceptionObject, NULL); 6088 if (restuple == NULL) 6089 return NULL; 6090 if (!PyTuple_Check(restuple)) { 6091 PyErr_SetString(PyExc_TypeError, &argparse[3]); 6092 Py_DECREF(restuple); 6093 return NULL; 6094 } 6095 if (!PyArg_ParseTuple(restuple, argparse, 6096 &resunicode, newpos)) { 6097 Py_DECREF(restuple); 6098 return NULL; 6099 } 6100 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) { 6101 PyErr_SetString(PyExc_TypeError, &argparse[3]); 6102 Py_DECREF(restuple); 6103 return NULL; 6104 } 6105 if (*newpos<0) 6106 *newpos = size+*newpos; 6107 if (*newpos<0 || *newpos>size) { 6108 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos); 6109 Py_DECREF(restuple); 6110 return NULL; 6111 } 6112 Py_INCREF(resunicode); 6113 Py_DECREF(restuple); 6114 return resunicode; 6115} 6116 6117static PyObject * 6118unicode_encode_ucs1(const Py_UNICODE *p, 6119 Py_ssize_t size, 6120 const char *errors, 6121 int limit) 6122{ 6123 /* output object */ 6124 PyObject *res; 6125 /* pointers to the beginning and end+1 of input */ 6126 const Py_UNICODE *startp = p; 6127 const Py_UNICODE *endp = p + size; 6128 /* pointer to the beginning of the unencodable characters */ 6129 /* const Py_UNICODE *badp = NULL; */ 6130 /* pointer into the output */ 6131 char *str; 6132 /* current output position */ 6133 Py_ssize_t ressize; 6134 const char *encoding = (limit == 256) ? "latin-1" : "ascii"; 6135 const char *reason = (limit == 256) ? 
"ordinal not in range(256)" : "ordinal not in range(128)"; 6136 PyObject *errorHandler = NULL; 6137 PyObject *exc = NULL; 6138 /* the following variable is used for caching string comparisons 6139 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */ 6140 int known_errorHandler = -1; 6141 6142 /* allocate enough for a simple encoding without 6143 replacements, if we need more, we'll resize */ 6144 if (size == 0) 6145 return PyBytes_FromStringAndSize(NULL, 0); 6146 res = PyBytes_FromStringAndSize(NULL, size); 6147 if (res == NULL) 6148 return NULL; 6149 str = PyBytes_AS_STRING(res); 6150 ressize = size; 6151 6152 while (p<endp) { 6153 Py_UNICODE c = *p; 6154 6155 /* can we encode this? */ 6156 if (c<limit) { 6157 /* no overflow check, because we know that the space is enough */ 6158 *str++ = (char)c; 6159 ++p; 6160 } 6161 else { 6162 Py_ssize_t unicodepos = p-startp; 6163 Py_ssize_t requiredsize; 6164 PyObject *repunicode; 6165 Py_ssize_t repsize; 6166 Py_ssize_t newpos; 6167 Py_ssize_t respos; 6168 Py_UNICODE *uni2; 6169 /* startpos for collecting unencodable chars */ 6170 const Py_UNICODE *collstart = p; 6171 const Py_UNICODE *collend = p; 6172 /* find all unecodable characters */ 6173 while ((collend < endp) && ((*collend)>=limit)) 6174 ++collend; 6175 /* cache callback name lookup (if not done yet, i.e. 
it's the first error) */ 6176 if (known_errorHandler==-1) { 6177 if ((errors==NULL) || (!strcmp(errors, "strict"))) 6178 known_errorHandler = 1; 6179 else if (!strcmp(errors, "replace")) 6180 known_errorHandler = 2; 6181 else if (!strcmp(errors, "ignore")) 6182 known_errorHandler = 3; 6183 else if (!strcmp(errors, "xmlcharrefreplace")) 6184 known_errorHandler = 4; 6185 else 6186 known_errorHandler = 0; 6187 } 6188 switch (known_errorHandler) { 6189 case 1: /* strict */ 6190 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason); 6191 goto onError; 6192 case 2: /* replace */ 6193 while (collstart++<collend) 6194 *str++ = '?'; /* fall through */ 6195 case 3: /* ignore */ 6196 p = collend; 6197 break; 6198 case 4: /* xmlcharrefreplace */ 6199 respos = str - PyBytes_AS_STRING(res); 6200 /* determine replacement size (temporarily (mis)uses p) */ 6201 for (p = collstart, repsize = 0; p < collend; ++p) { 6202 if (*p<10) 6203 repsize += 2+1+1; 6204 else if (*p<100) 6205 repsize += 2+2+1; 6206 else if (*p<1000) 6207 repsize += 2+3+1; 6208 else if (*p<10000) 6209 repsize += 2+4+1; 6210#ifndef Py_UNICODE_WIDE 6211 else 6212 repsize += 2+5+1; 6213#else 6214 else if (*p<100000) 6215 repsize += 2+5+1; 6216 else if (*p<1000000) 6217 repsize += 2+6+1; 6218 else 6219 repsize += 2+7+1; 6220#endif 6221 } 6222 requiredsize = respos+repsize+(endp-collend); 6223 if (requiredsize > ressize) { 6224 if (requiredsize<2*ressize) 6225 requiredsize = 2*ressize; 6226 if (_PyBytes_Resize(&res, requiredsize)) 6227 goto onError; 6228 str = PyBytes_AS_STRING(res) + respos; 6229 ressize = requiredsize; 6230 } 6231 /* generate replacement (temporarily (mis)uses p) */ 6232 for (p = collstart; p < collend; ++p) { 6233 str += sprintf(str, "&#%d;", (int)*p); 6234 } 6235 p = collend; 6236 break; 6237 default: 6238 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler, 6239 encoding, reason, startp, size, &exc, 6240 collstart-startp, collend-startp, 
&newpos); 6241 if (repunicode == NULL) 6242 goto onError; 6243 if (PyBytes_Check(repunicode)) { 6244 /* Directly copy bytes result to output. */ 6245 repsize = PyBytes_Size(repunicode); 6246 if (repsize > 1) { 6247 /* Make room for all additional bytes. */ 6248 respos = str - PyBytes_AS_STRING(res); 6249 if (_PyBytes_Resize(&res, ressize+repsize-1)) { 6250 Py_DECREF(repunicode); 6251 goto onError; 6252 } 6253 str = PyBytes_AS_STRING(res) + respos; 6254 ressize += repsize-1; 6255 } 6256 memcpy(str, PyBytes_AsString(repunicode), repsize); 6257 str += repsize; 6258 p = startp + newpos; 6259 Py_DECREF(repunicode); 6260 break; 6261 } 6262 /* need more space? (at least enough for what we 6263 have+the replacement+the rest of the string, so 6264 we won't have to check space for encodable characters) */ 6265 respos = str - PyBytes_AS_STRING(res); 6266 repsize = PyUnicode_GET_SIZE(repunicode); 6267 requiredsize = respos+repsize+(endp-collend); 6268 if (requiredsize > ressize) { 6269 if (requiredsize<2*ressize) 6270 requiredsize = 2*ressize; 6271 if (_PyBytes_Resize(&res, requiredsize)) { 6272 Py_DECREF(repunicode); 6273 goto onError; 6274 } 6275 str = PyBytes_AS_STRING(res) + respos; 6276 ressize = requiredsize; 6277 } 6278 /* check if there is anything unencodable in the replacement 6279 and copy it to the output */ 6280 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) { 6281 c = *uni2; 6282 if (c >= limit) { 6283 raise_encode_exception(&exc, encoding, startp, size, 6284 unicodepos, unicodepos+1, reason); 6285 Py_DECREF(repunicode); 6286 goto onError; 6287 } 6288 *str = (char)c; 6289 } 6290 p = startp + newpos; 6291 Py_DECREF(repunicode); 6292 } 6293 } 6294 } 6295 /* Resize if we allocated to much */ 6296 size = str - PyBytes_AS_STRING(res); 6297 if (size < ressize) { /* If this falls res will be NULL */ 6298 assert(size >= 0); 6299 if (_PyBytes_Resize(&res, size) < 0) 6300 goto onError; 6301 } 6302 6303 Py_XDECREF(errorHandler); 6304 
Py_XDECREF(exc); 6305 return res; 6306 6307 onError: 6308 Py_XDECREF(res); 6309 Py_XDECREF(errorHandler); 6310 Py_XDECREF(exc); 6311 return NULL; 6312} 6313 6314PyObject * 6315PyUnicode_EncodeLatin1(const Py_UNICODE *p, 6316 Py_ssize_t size, 6317 const char *errors) 6318{ 6319 return unicode_encode_ucs1(p, size, errors, 256); 6320} 6321 6322PyObject * 6323_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors) 6324{ 6325 if (!PyUnicode_Check(unicode)) { 6326 PyErr_BadArgument(); 6327 return NULL; 6328 } 6329 if (PyUnicode_READY(unicode) == -1) 6330 return NULL; 6331 /* Fast path: if it is a one-byte string, construct 6332 bytes object directly. */ 6333 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) 6334 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode), 6335 PyUnicode_GET_LENGTH(unicode)); 6336 /* Non-Latin-1 characters present. Defer to above function to 6337 raise the exception. */ 6338 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode), 6339 PyUnicode_GET_SIZE(unicode), 6340 errors); 6341} 6342 6343PyObject* 6344PyUnicode_AsLatin1String(PyObject *unicode) 6345{ 6346 return _PyUnicode_AsLatin1String(unicode, NULL); 6347} 6348 6349/* --- 7-bit ASCII Codec -------------------------------------------------- */ 6350 6351PyObject * 6352PyUnicode_DecodeASCII(const char *s, 6353 Py_ssize_t size, 6354 const char *errors) 6355{ 6356 const char *starts = s; 6357 PyUnicodeObject *v; 6358 Py_UNICODE *p; 6359 Py_ssize_t startinpos; 6360 Py_ssize_t endinpos; 6361 Py_ssize_t outpos; 6362 const char *e; 6363 unsigned char* d; 6364 PyObject *errorHandler = NULL; 6365 PyObject *exc = NULL; 6366 Py_ssize_t i; 6367 6368 /* ASCII is equivalent to the first 128 ordinals in Unicode. */ 6369 if (size == 1 && *(unsigned char*)s < 128) 6370 return PyUnicode_FromOrdinal(*(unsigned char*)s); 6371 6372 /* Fast path. Assume the input actually *is* ASCII, and allocate 6373 a single-block Unicode object with that assumption. 
If there is 6374 an error, drop the object and start over. */ 6375 v = (PyUnicodeObject*)PyUnicode_New(size, 127); 6376 if (v == NULL) 6377 goto onError; 6378 d = PyUnicode_1BYTE_DATA(v); 6379 for (i = 0; i < size; i++) { 6380 unsigned char ch = ((unsigned char*)s)[i]; 6381 if (ch < 128) 6382 d[i] = ch; 6383 else 6384 break; 6385 } 6386 if (i == size) 6387 return (PyObject*)v; 6388 Py_DECREF(v); /* start over */ 6389 6390 v = _PyUnicode_New(size); 6391 if (v == NULL) 6392 goto onError; 6393 if (size == 0) 6394 return (PyObject *)v; 6395 p = PyUnicode_AS_UNICODE(v); 6396 e = s + size; 6397 while (s < e) { 6398 register unsigned char c = (unsigned char)*s; 6399 if (c < 128) { 6400 *p++ = c; 6401 ++s; 6402 } 6403 else { 6404 startinpos = s-starts; 6405 endinpos = startinpos + 1; 6406 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v); 6407 if (unicode_decode_call_errorhandler( 6408 errors, &errorHandler, 6409 "ascii", "ordinal not in range(128)", 6410 &starts, &e, &startinpos, &endinpos, &exc, &s, 6411 &v, &outpos, &p)) 6412 goto onError; 6413 } 6414 } 6415 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v)) 6416 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0) 6417 goto onError; 6418 Py_XDECREF(errorHandler); 6419 Py_XDECREF(exc); 6420 if (PyUnicode_READY(v) == -1) { 6421 Py_DECREF(v); 6422 return NULL; 6423 } 6424 return (PyObject *)v; 6425 6426 onError: 6427 Py_XDECREF(v); 6428 Py_XDECREF(errorHandler); 6429 Py_XDECREF(exc); 6430 return NULL; 6431} 6432 6433PyObject * 6434PyUnicode_EncodeASCII(const Py_UNICODE *p, 6435 Py_ssize_t size, 6436 const char *errors) 6437{ 6438 return unicode_encode_ucs1(p, size, errors, 128); 6439} 6440 6441PyObject * 6442_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors) 6443{ 6444 if (!PyUnicode_Check(unicode)) { 6445 PyErr_BadArgument(); 6446 return NULL; 6447 } 6448 if (PyUnicode_READY(unicode) == -1) 6449 return NULL; 6450 /* Fast path: if it is an ASCII-only string, construct bytes object 
6451 directly. Else defer to above function to raise the exception. */ 6452 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128) 6453 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode), 6454 PyUnicode_GET_LENGTH(unicode)); 6455 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode), 6456 PyUnicode_GET_SIZE(unicode), 6457 errors); 6458} 6459 6460PyObject * 6461PyUnicode_AsASCIIString(PyObject *unicode) 6462{ 6463 return _PyUnicode_AsASCIIString(unicode, NULL); 6464} 6465 6466#ifdef HAVE_MBCS 6467 6468/* --- MBCS codecs for Windows -------------------------------------------- */ 6469 6470#if SIZEOF_INT < SIZEOF_SIZE_T 6471#define NEED_RETRY 6472#endif 6473 6474/* XXX This code is limited to "true" double-byte encodings, as 6475 a) it assumes an incomplete character consists of a single byte, and 6476 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte 6477 encodings, see IsDBCSLeadByteEx documentation. */ 6478 6479static int 6480is_dbcs_lead_byte(const char *s, int offset) 6481{ 6482 const char *curr = s + offset; 6483 6484 if (IsDBCSLeadByte(*curr)) { 6485 const char *prev = CharPrev(s, curr); 6486 return (prev == curr) || !IsDBCSLeadByte(*prev) || (curr - prev == 2); 6487 } 6488 return 0; 6489} 6490 6491/* 6492 * Decode MBCS string into unicode object. If 'final' is set, converts 6493 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise. 
6494 */ 6495static int 6496decode_mbcs(PyUnicodeObject **v, 6497 const char *s, /* MBCS string */ 6498 int size, /* sizeof MBCS string */ 6499 int final, 6500 const char *errors) 6501{ 6502 Py_UNICODE *p; 6503 Py_ssize_t n; 6504 DWORD usize; 6505 DWORD flags; 6506 6507 assert(size >= 0); 6508 6509 /* check and handle 'errors' arg */ 6510 if (errors==NULL || strcmp(errors, "strict")==0) 6511 flags = MB_ERR_INVALID_CHARS; 6512 else if (strcmp(errors, "ignore")==0) 6513 flags = 0; 6514 else { 6515 PyErr_Format(PyExc_ValueError, 6516 "mbcs encoding does not support errors='%s'", 6517 errors); 6518 return -1; 6519 } 6520 6521 /* Skip trailing lead-byte unless 'final' is set */ 6522 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1)) 6523 --size; 6524 6525 /* First get the size of the result */ 6526 if (size > 0) { 6527 usize = MultiByteToWideChar(CP_ACP, flags, s, size, NULL, 0); 6528 if (usize==0) 6529 goto mbcs_decode_error; 6530 } else 6531 usize = 0; 6532 6533 if (*v == NULL) { 6534 /* Create unicode object */ 6535 *v = _PyUnicode_New(usize); 6536 if (*v == NULL) 6537 return -1; 6538 n = 0; 6539 } 6540 else { 6541 /* Extend unicode object */ 6542 n = PyUnicode_GET_SIZE(*v); 6543 if (PyUnicode_Resize((PyObject**)v, n + usize) < 0) 6544 return -1; 6545 } 6546 6547 /* Do the conversion */ 6548 if (usize > 0) { 6549 p = PyUnicode_AS_UNICODE(*v) + n; 6550 if (0 == MultiByteToWideChar(CP_ACP, flags, s, size, p, usize)) { 6551 goto mbcs_decode_error; 6552 } 6553 } 6554 return size; 6555 6556mbcs_decode_error: 6557 /* If the last error was ERROR_NO_UNICODE_TRANSLATION, then 6558 we raise a UnicodeDecodeError - else it is a 'generic' 6559 windows error 6560 */ 6561 if (GetLastError()==ERROR_NO_UNICODE_TRANSLATION) { 6562 /* Ideally, we should get reason from FormatMessage - this 6563 is the Windows 2000 English version of the message 6564 */ 6565 PyObject *exc = NULL; 6566 const char *reason = "No mapping for the Unicode character exists " 6567 "in the target 
multi-byte code page."; 6568 make_decode_exception(&exc, "mbcs", s, size, 0, 0, reason); 6569 if (exc != NULL) { 6570 PyCodec_StrictErrors(exc); 6571 Py_DECREF(exc); 6572 } 6573 } else { 6574 PyErr_SetFromWindowsErrWithFilename(0, NULL); 6575 } 6576 return -1; 6577} 6578 6579PyObject * 6580PyUnicode_DecodeMBCSStateful(const char *s, 6581 Py_ssize_t size, 6582 const char *errors, 6583 Py_ssize_t *consumed) 6584{ 6585 PyUnicodeObject *v = NULL; 6586 int done; 6587 6588 if (consumed) 6589 *consumed = 0; 6590 6591#ifdef NEED_RETRY 6592 retry: 6593 if (size > INT_MAX) 6594 done = decode_mbcs(&v, s, INT_MAX, 0, errors); 6595 else 6596#endif 6597 done = decode_mbcs(&v, s, (int)size, !consumed, errors); 6598 6599 if (done < 0) { 6600 Py_XDECREF(v); 6601 return NULL; 6602 } 6603 6604 if (consumed) 6605 *consumed += done; 6606 6607#ifdef NEED_RETRY 6608 if (size > INT_MAX) { 6609 s += done; 6610 size -= done; 6611 goto retry; 6612 } 6613#endif 6614 if (PyUnicode_READY(v) == -1) { 6615 Py_DECREF(v); 6616 return NULL; 6617 } 6618 return (PyObject *)v; 6619} 6620 6621PyObject * 6622PyUnicode_DecodeMBCS(const char *s, 6623 Py_ssize_t size, 6624 const char *errors) 6625{ 6626 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL); 6627} 6628 6629/* 6630 * Convert unicode into string object (MBCS). 6631 * Returns 0 if succeed, -1 otherwise. 
6632 */ 6633static int 6634encode_mbcs(PyObject **repr, 6635 const Py_UNICODE *p, /* unicode */ 6636 int size, /* size of unicode */ 6637 const char* errors) 6638{ 6639 BOOL usedDefaultChar = FALSE; 6640 BOOL *pusedDefaultChar; 6641 int mbcssize; 6642 Py_ssize_t n; 6643 PyObject *exc = NULL; 6644 DWORD flags; 6645 6646 assert(size >= 0); 6647 6648 /* check and handle 'errors' arg */ 6649 if (errors==NULL || strcmp(errors, "strict")==0) { 6650 flags = WC_NO_BEST_FIT_CHARS; 6651 pusedDefaultChar = &usedDefaultChar; 6652 } else if (strcmp(errors, "replace")==0) { 6653 flags = 0; 6654 pusedDefaultChar = NULL; 6655 } else { 6656 PyErr_Format(PyExc_ValueError, 6657 "mbcs encoding does not support errors='%s'", 6658 errors); 6659 return -1; 6660 } 6661 6662 /* First get the size of the result */ 6663 if (size > 0) { 6664 mbcssize = WideCharToMultiByte(CP_ACP, flags, p, size, NULL, 0, 6665 NULL, pusedDefaultChar); 6666 if (mbcssize == 0) { 6667 PyErr_SetFromWindowsErrWithFilename(0, NULL); 6668 return -1; 6669 } 6670 /* If we used a default char, then we failed! 
*/ 6671 if (pusedDefaultChar && *pusedDefaultChar) 6672 goto mbcs_encode_error; 6673 } else { 6674 mbcssize = 0; 6675 } 6676 6677 if (*repr == NULL) { 6678 /* Create string object */ 6679 *repr = PyBytes_FromStringAndSize(NULL, mbcssize); 6680 if (*repr == NULL) 6681 return -1; 6682 n = 0; 6683 } 6684 else { 6685 /* Extend string object */ 6686 n = PyBytes_Size(*repr); 6687 if (_PyBytes_Resize(repr, n + mbcssize) < 0) 6688 return -1; 6689 } 6690 6691 /* Do the conversion */ 6692 if (size > 0) { 6693 char *s = PyBytes_AS_STRING(*repr) + n; 6694 if (0 == WideCharToMultiByte(CP_ACP, flags, p, size, s, mbcssize, 6695 NULL, pusedDefaultChar)) { 6696 PyErr_SetFromWindowsErrWithFilename(0, NULL); 6697 return -1; 6698 } 6699 if (pusedDefaultChar && *pusedDefaultChar) 6700 goto mbcs_encode_error; 6701 } 6702 return 0; 6703 6704mbcs_encode_error: 6705 raise_encode_exception(&exc, "mbcs", p, size, 0, 0, "invalid character"); 6706 Py_XDECREF(exc); 6707 return -1; 6708} 6709 6710PyObject * 6711PyUnicode_EncodeMBCS(const Py_UNICODE *p, 6712 Py_ssize_t size, 6713 const char *errors) 6714{ 6715 PyObject *repr = NULL; 6716 int ret; 6717 6718#ifdef NEED_RETRY 6719 retry: 6720 if (size > INT_MAX) 6721 ret = encode_mbcs(&repr, p, INT_MAX, errors); 6722 else 6723#endif 6724 ret = encode_mbcs(&repr, p, (int)size, errors); 6725 6726 if (ret < 0) { 6727 Py_XDECREF(repr); 6728 return NULL; 6729 } 6730 6731#ifdef NEED_RETRY 6732 if (size > INT_MAX) { 6733 p += INT_MAX; 6734 size -= INT_MAX; 6735 goto retry; 6736 } 6737#endif 6738 6739 return repr; 6740} 6741 6742PyObject * 6743PyUnicode_AsMBCSString(PyObject *unicode) 6744{ 6745 if (!PyUnicode_Check(unicode)) { 6746 PyErr_BadArgument(); 6747 return NULL; 6748 } 6749 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode), 6750 PyUnicode_GET_SIZE(unicode), 6751 NULL); 6752} 6753 6754#undef NEED_RETRY 6755 6756#endif /* HAVE_MBCS */ 6757 6758/* --- Character Mapping Codec -------------------------------------------- */ 6759 6760PyObject * 
6761PyUnicode_DecodeCharmap(const char *s, 6762 Py_ssize_t size, 6763 PyObject *mapping, 6764 const char *errors) 6765{ 6766 const char *starts = s; 6767 Py_ssize_t startinpos; 6768 Py_ssize_t endinpos; 6769 Py_ssize_t outpos; 6770 const char *e; 6771 PyUnicodeObject *v; 6772 Py_UNICODE *p; 6773 Py_ssize_t extrachars = 0; 6774 PyObject *errorHandler = NULL; 6775 PyObject *exc = NULL; 6776 Py_UNICODE *mapstring = NULL; 6777 Py_ssize_t maplen = 0; 6778 6779 /* Default to Latin-1 */ 6780 if (mapping == NULL) 6781 return PyUnicode_DecodeLatin1(s, size, errors); 6782 6783 v = _PyUnicode_New(size); 6784 if (v == NULL) 6785 goto onError; 6786 if (size == 0) 6787 return (PyObject *)v; 6788 p = PyUnicode_AS_UNICODE(v); 6789 e = s + size; 6790 if (PyUnicode_CheckExact(mapping)) { 6791 mapstring = PyUnicode_AS_UNICODE(mapping); 6792 maplen = PyUnicode_GET_SIZE(mapping); 6793 while (s < e) { 6794 unsigned char ch = *s; 6795 Py_UNICODE x = 0xfffe; /* illegal value */ 6796 6797 if (ch < maplen) 6798 x = mapstring[ch]; 6799 6800 if (x == 0xfffe) { 6801 /* undefined mapping */ 6802 outpos = p-PyUnicode_AS_UNICODE(v); 6803 startinpos = s-starts; 6804 endinpos = startinpos+1; 6805 if (unicode_decode_call_errorhandler( 6806 errors, &errorHandler, 6807 "charmap", "character maps to <undefined>", 6808 &starts, &e, &startinpos, &endinpos, &exc, &s, 6809 &v, &outpos, &p)) { 6810 goto onError; 6811 } 6812 continue; 6813 } 6814 *p++ = x; 6815 ++s; 6816 } 6817 } 6818 else { 6819 while (s < e) { 6820 unsigned char ch = *s; 6821 PyObject *w, *x; 6822 6823 /* Get mapping (char ordinal -> integer, Unicode char or None) */ 6824 w = PyLong_FromLong((long)ch); 6825 if (w == NULL) 6826 goto onError; 6827 x = PyObject_GetItem(mapping, w); 6828 Py_DECREF(w); 6829 if (x == NULL) { 6830 if (PyErr_ExceptionMatches(PyExc_LookupError)) { 6831 /* No mapping found means: mapping is undefined. 
*/ 6832 PyErr_Clear(); 6833 x = Py_None; 6834 Py_INCREF(x); 6835 } else 6836 goto onError; 6837 } 6838 6839 /* Apply mapping */ 6840 if (PyLong_Check(x)) { 6841 long value = PyLong_AS_LONG(x); 6842 if (value < 0 || value > 65535) { 6843 PyErr_SetString(PyExc_TypeError, 6844 "character mapping must be in range(65536)"); 6845 Py_DECREF(x); 6846 goto onError; 6847 } 6848 *p++ = (Py_UNICODE)value; 6849 } 6850 else if (x == Py_None) { 6851 /* undefined mapping */ 6852 outpos = p-PyUnicode_AS_UNICODE(v); 6853 startinpos = s-starts; 6854 endinpos = startinpos+1; 6855 if (unicode_decode_call_errorhandler( 6856 errors, &errorHandler, 6857 "charmap", "character maps to <undefined>", 6858 &starts, &e, &startinpos, &endinpos, &exc, &s, 6859 &v, &outpos, &p)) { 6860 Py_DECREF(x); 6861 goto onError; 6862 } 6863 Py_DECREF(x); 6864 continue; 6865 } 6866 else if (PyUnicode_Check(x)) { 6867 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x); 6868 6869 if (targetsize == 1) 6870 /* 1-1 mapping */ 6871 *p++ = *PyUnicode_AS_UNICODE(x); 6872 6873 else if (targetsize > 1) { 6874 /* 1-n mapping */ 6875 if (targetsize > extrachars) { 6876 /* resize first */ 6877 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v); 6878 Py_ssize_t needed = (targetsize - extrachars) + \ 6879 (targetsize << 2); 6880 extrachars += needed; 6881 /* XXX overflow detection missing */ 6882 if (PyUnicode_Resize((PyObject**)&v, 6883 PyUnicode_GET_SIZE(v) + needed) < 0) { 6884 Py_DECREF(x); 6885 goto onError; 6886 } 6887 p = PyUnicode_AS_UNICODE(v) + oldpos; 6888 } 6889 Py_UNICODE_COPY(p, 6890 PyUnicode_AS_UNICODE(x), 6891 targetsize); 6892 p += targetsize; 6893 extrachars -= targetsize; 6894 } 6895 /* 1-0 mapping: skip the character */ 6896 } 6897 else { 6898 /* wrong return value */ 6899 PyErr_SetString(PyExc_TypeError, 6900 "character mapping must return integer, None or str"); 6901 Py_DECREF(x); 6902 goto onError; 6903 } 6904 Py_DECREF(x); 6905 ++s; 6906 } 6907 } 6908 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v)) 
6909 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0) 6910 goto onError; 6911 Py_XDECREF(errorHandler); 6912 Py_XDECREF(exc); 6913 if (PyUnicode_READY(v) == -1) { 6914 Py_DECREF(v); 6915 return NULL; 6916 } 6917 return (PyObject *)v; 6918 6919 onError: 6920 Py_XDECREF(errorHandler); 6921 Py_XDECREF(exc); 6922 Py_XDECREF(v); 6923 return NULL; 6924} 6925 6926/* Charmap encoding: the lookup table */ 6927 6928struct encoding_map { 6929 PyObject_HEAD 6930 unsigned char level1[32]; 6931 int count2, count3; 6932 unsigned char level23[1]; 6933}; 6934 6935static PyObject* 6936encoding_map_size(PyObject *obj, PyObject* args) 6937{ 6938 struct encoding_map *map = (struct encoding_map*)obj; 6939 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 + 6940 128*map->count3); 6941} 6942 6943static PyMethodDef encoding_map_methods[] = { 6944 {"size", encoding_map_size, METH_NOARGS, 6945 PyDoc_STR("Return the size (in bytes) of this object") }, 6946 { 0 } 6947}; 6948 6949static void 6950encoding_map_dealloc(PyObject* o) 6951{ 6952 PyObject_FREE(o); 6953} 6954 6955static PyTypeObject EncodingMapType = { 6956 PyVarObject_HEAD_INIT(NULL, 0) 6957 "EncodingMap", /*tp_name*/ 6958 sizeof(struct encoding_map), /*tp_basicsize*/ 6959 0, /*tp_itemsize*/ 6960 /* methods */ 6961 encoding_map_dealloc, /*tp_dealloc*/ 6962 0, /*tp_print*/ 6963 0, /*tp_getattr*/ 6964 0, /*tp_setattr*/ 6965 0, /*tp_reserved*/ 6966 0, /*tp_repr*/ 6967 0, /*tp_as_number*/ 6968 0, /*tp_as_sequence*/ 6969 0, /*tp_as_mapping*/ 6970 0, /*tp_hash*/ 6971 0, /*tp_call*/ 6972 0, /*tp_str*/ 6973 0, /*tp_getattro*/ 6974 0, /*tp_setattro*/ 6975 0, /*tp_as_buffer*/ 6976 Py_TPFLAGS_DEFAULT, /*tp_flags*/ 6977 0, /*tp_doc*/ 6978 0, /*tp_traverse*/ 6979 0, /*tp_clear*/ 6980 0, /*tp_richcompare*/ 6981 0, /*tp_weaklistoffset*/ 6982 0, /*tp_iter*/ 6983 0, /*tp_iternext*/ 6984 encoding_map_methods, /*tp_methods*/ 6985 0, /*tp_members*/ 6986 0, /*tp_getset*/ 6987 0, /*tp_base*/ 6988 0, /*tp_dict*/ 6989 0, 
/*tp_descr_get*/ 6990 0, /*tp_descr_set*/ 6991 0, /*tp_dictoffset*/ 6992 0, /*tp_init*/ 6993 0, /*tp_alloc*/ 6994 0, /*tp_new*/ 6995 0, /*tp_free*/ 6996 0, /*tp_is_gc*/ 6997}; 6998 6999PyObject* 7000PyUnicode_BuildEncodingMap(PyObject* string) 7001{ 7002 PyObject *result; 7003 struct encoding_map *mresult; 7004 int i; 7005 int need_dict = 0; 7006 unsigned char level1[32]; 7007 unsigned char level2[512]; 7008 unsigned char *mlevel1, *mlevel2, *mlevel3; 7009 int count2 = 0, count3 = 0; 7010 int kind; 7011 void *data; 7012 Py_UCS4 ch; 7013 7014 if (!PyUnicode_Check(string) || PyUnicode_GET_LENGTH(string) != 256) { 7015 PyErr_BadArgument(); 7016 return NULL; 7017 } 7018 kind = PyUnicode_KIND(string); 7019 data = PyUnicode_DATA(string); 7020 memset(level1, 0xFF, sizeof level1); 7021 memset(level2, 0xFF, sizeof level2); 7022 7023 /* If there isn't a one-to-one mapping of NULL to \0, 7024 or if there are non-BMP characters, we need to use 7025 a mapping dictionary. */ 7026 if (PyUnicode_READ(kind, data, 0) != 0) 7027 need_dict = 1; 7028 for (i = 1; i < 256; i++) { 7029 int l1, l2; 7030 ch = PyUnicode_READ(kind, data, i); 7031 if (ch == 0 || ch > 0xFFFF) { 7032 need_dict = 1; 7033 break; 7034 } 7035 if (ch == 0xFFFE) 7036 /* unmapped character */ 7037 continue; 7038 l1 = ch >> 11; 7039 l2 = ch >> 7; 7040 if (level1[l1] == 0xFF) 7041 level1[l1] = count2++; 7042 if (level2[l2] == 0xFF) 7043 level2[l2] = count3++; 7044 } 7045 7046 if (count2 >= 0xFF || count3 >= 0xFF) 7047 need_dict = 1; 7048 7049 if (need_dict) { 7050 PyObject *result = PyDict_New(); 7051 PyObject *key, *value; 7052 if (!result) 7053 return NULL; 7054 for (i = 0; i < 256; i++) { 7055 key = PyLong_FromLong(PyUnicode_READ(kind, data, i)); 7056 value = PyLong_FromLong(i); 7057 if (!key || !value) 7058 goto failed1; 7059 if (PyDict_SetItem(result, key, value) == -1) 7060 goto failed1; 7061 Py_DECREF(key); 7062 Py_DECREF(value); 7063 } 7064 return result; 7065 failed1: 7066 Py_XDECREF(key); 7067 
Py_XDECREF(value); 7068 Py_DECREF(result); 7069 return NULL; 7070 } 7071 7072 /* Create a three-level trie */ 7073 result = PyObject_MALLOC(sizeof(struct encoding_map) + 7074 16*count2 + 128*count3 - 1); 7075 if (!result) 7076 return PyErr_NoMemory(); 7077 PyObject_Init(result, &EncodingMapType); 7078 mresult = (struct encoding_map*)result; 7079 mresult->count2 = count2; 7080 mresult->count3 = count3; 7081 mlevel1 = mresult->level1; 7082 mlevel2 = mresult->level23; 7083 mlevel3 = mresult->level23 + 16*count2; 7084 memcpy(mlevel1, level1, 32); 7085 memset(mlevel2, 0xFF, 16*count2); 7086 memset(mlevel3, 0, 128*count3); 7087 count3 = 0; 7088 for (i = 1; i < 256; i++) { 7089 int o1, o2, o3, i2, i3; 7090 if (PyUnicode_READ(kind, data, i) == 0xFFFE) 7091 /* unmapped character */ 7092 continue; 7093 o1 = PyUnicode_READ(kind, data, i)>>11; 7094 o2 = (PyUnicode_READ(kind, data, i)>>7) & 0xF; 7095 i2 = 16*mlevel1[o1] + o2; 7096 if (mlevel2[i2] == 0xFF) 7097 mlevel2[i2] = count3++; 7098 o3 = PyUnicode_READ(kind, data, i) & 0x7F; 7099 i3 = 128*mlevel2[i2] + o3; 7100 mlevel3[i3] = i; 7101 } 7102 return result; 7103} 7104 7105static int 7106encoding_map_lookup(Py_UNICODE c, PyObject *mapping) 7107{ 7108 struct encoding_map *map = (struct encoding_map*)mapping; 7109 int l1 = c>>11; 7110 int l2 = (c>>7) & 0xF; 7111 int l3 = c & 0x7F; 7112 int i; 7113 7114#ifdef Py_UNICODE_WIDE 7115 if (c > 0xFFFF) { 7116 return -1; 7117 } 7118#endif 7119 if (c == 0) 7120 return 0; 7121 /* level 1*/ 7122 i = map->level1[l1]; 7123 if (i == 0xFF) { 7124 return -1; 7125 } 7126 /* level 2*/ 7127 i = map->level23[16*i+l2]; 7128 if (i == 0xFF) { 7129 return -1; 7130 } 7131 /* level 3 */ 7132 i = map->level23[16*map->count2 + 128*i + l3]; 7133 if (i == 0) { 7134 return -1; 7135 } 7136 return i; 7137} 7138 7139/* Lookup the character ch in the mapping. If the character 7140 can't be found, Py_None is returned (or NULL, if another 7141 error occurred). 
*/ 7142static PyObject * 7143charmapencode_lookup(Py_UNICODE c, PyObject *mapping) 7144{ 7145 PyObject *w = PyLong_FromLong((long)c); 7146 PyObject *x; 7147 7148 if (w == NULL) 7149 return NULL; 7150 x = PyObject_GetItem(mapping, w); 7151 Py_DECREF(w); 7152 if (x == NULL) { 7153 if (PyErr_ExceptionMatches(PyExc_LookupError)) { 7154 /* No mapping found means: mapping is undefined. */ 7155 PyErr_Clear(); 7156 x = Py_None; 7157 Py_INCREF(x); 7158 return x; 7159 } else 7160 return NULL; 7161 } 7162 else if (x == Py_None) 7163 return x; 7164 else if (PyLong_Check(x)) { 7165 long value = PyLong_AS_LONG(x); 7166 if (value < 0 || value > 255) { 7167 PyErr_SetString(PyExc_TypeError, 7168 "character mapping must be in range(256)"); 7169 Py_DECREF(x); 7170 return NULL; 7171 } 7172 return x; 7173 } 7174 else if (PyBytes_Check(x)) 7175 return x; 7176 else { 7177 /* wrong return value */ 7178 PyErr_Format(PyExc_TypeError, 7179 "character mapping must return integer, bytes or None, not %.400s", 7180 x->ob_type->tp_name); 7181 Py_DECREF(x); 7182 return NULL; 7183 } 7184} 7185 7186static int 7187charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize) 7188{ 7189 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj); 7190 /* exponentially overallocate to minimize reallocations */ 7191 if (requiredsize < 2*outsize) 7192 requiredsize = 2*outsize; 7193 if (_PyBytes_Resize(outobj, requiredsize)) 7194 return -1; 7195 return 0; 7196} 7197 7198typedef enum charmapencode_result { 7199 enc_SUCCESS, enc_FAILED, enc_EXCEPTION 7200} charmapencode_result; 7201/* lookup the character, put the result in the output string and adjust 7202 various state variables. Resize the output bytes object if not enough 7203 space is available. Return a new reference to the object that 7204 was put in the output buffer, or Py_None, if the mapping was undefined 7205 (in which case no character was written) or NULL, if a 7206 reallocation error occurred. 
The caller must decref the result */ 7207static charmapencode_result 7208charmapencode_output(Py_UNICODE c, PyObject *mapping, 7209 PyObject **outobj, Py_ssize_t *outpos) 7210{ 7211 PyObject *rep; 7212 char *outstart; 7213 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj); 7214 7215 if (Py_TYPE(mapping) == &EncodingMapType) { 7216 int res = encoding_map_lookup(c, mapping); 7217 Py_ssize_t requiredsize = *outpos+1; 7218 if (res == -1) 7219 return enc_FAILED; 7220 if (outsize<requiredsize) 7221 if (charmapencode_resize(outobj, outpos, requiredsize)) 7222 return enc_EXCEPTION; 7223 outstart = PyBytes_AS_STRING(*outobj); 7224 outstart[(*outpos)++] = (char)res; 7225 return enc_SUCCESS; 7226 } 7227 7228 rep = charmapencode_lookup(c, mapping); 7229 if (rep==NULL) 7230 return enc_EXCEPTION; 7231 else if (rep==Py_None) { 7232 Py_DECREF(rep); 7233 return enc_FAILED; 7234 } else { 7235 if (PyLong_Check(rep)) { 7236 Py_ssize_t requiredsize = *outpos+1; 7237 if (outsize<requiredsize) 7238 if (charmapencode_resize(outobj, outpos, requiredsize)) { 7239 Py_DECREF(rep); 7240 return enc_EXCEPTION; 7241 } 7242 outstart = PyBytes_AS_STRING(*outobj); 7243 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep); 7244 } 7245 else { 7246 const char *repchars = PyBytes_AS_STRING(rep); 7247 Py_ssize_t repsize = PyBytes_GET_SIZE(rep); 7248 Py_ssize_t requiredsize = *outpos+repsize; 7249 if (outsize<requiredsize) 7250 if (charmapencode_resize(outobj, outpos, requiredsize)) { 7251 Py_DECREF(rep); 7252 return enc_EXCEPTION; 7253 } 7254 outstart = PyBytes_AS_STRING(*outobj); 7255 memcpy(outstart + *outpos, repchars, repsize); 7256 *outpos += repsize; 7257 } 7258 } 7259 Py_DECREF(rep); 7260 return enc_SUCCESS; 7261} 7262 7263/* handle an error in PyUnicode_EncodeCharmap 7264 Return 0 on success, -1 on error */ 7265static int 7266charmap_encoding_error( 7267 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping, 7268 PyObject **exceptionObject, 7269 int *known_errorHandler, PyObject 
**errorHandler, const char *errors, 7270 PyObject **res, Py_ssize_t *respos) 7271{ 7272 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */ 7273 Py_ssize_t repsize; 7274 Py_ssize_t newpos; 7275 Py_UNICODE *uni2; 7276 /* startpos for collecting unencodable chars */ 7277 Py_ssize_t collstartpos = *inpos; 7278 Py_ssize_t collendpos = *inpos+1; 7279 Py_ssize_t collpos; 7280 char *encoding = "charmap"; 7281 char *reason = "character maps to <undefined>"; 7282 charmapencode_result x; 7283 7284 /* find all unencodable characters */ 7285 while (collendpos < size) { 7286 PyObject *rep; 7287 if (Py_TYPE(mapping) == &EncodingMapType) { 7288 int res = encoding_map_lookup(p[collendpos], mapping); 7289 if (res != -1) 7290 break; 7291 ++collendpos; 7292 continue; 7293 } 7294 7295 rep = charmapencode_lookup(p[collendpos], mapping); 7296 if (rep==NULL) 7297 return -1; 7298 else if (rep!=Py_None) { 7299 Py_DECREF(rep); 7300 break; 7301 } 7302 Py_DECREF(rep); 7303 ++collendpos; 7304 } 7305 /* cache callback name lookup 7306 * (if not done yet, i.e. 
it's the first error) */ 7307 if (*known_errorHandler==-1) { 7308 if ((errors==NULL) || (!strcmp(errors, "strict"))) 7309 *known_errorHandler = 1; 7310 else if (!strcmp(errors, "replace")) 7311 *known_errorHandler = 2; 7312 else if (!strcmp(errors, "ignore")) 7313 *known_errorHandler = 3; 7314 else if (!strcmp(errors, "xmlcharrefreplace")) 7315 *known_errorHandler = 4; 7316 else 7317 *known_errorHandler = 0; 7318 } 7319 switch (*known_errorHandler) { 7320 case 1: /* strict */ 7321 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason); 7322 return -1; 7323 case 2: /* replace */ 7324 for (collpos = collstartpos; collpos<collendpos; ++collpos) { 7325 x = charmapencode_output('?', mapping, res, respos); 7326 if (x==enc_EXCEPTION) { 7327 return -1; 7328 } 7329 else if (x==enc_FAILED) { 7330 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason); 7331 return -1; 7332 } 7333 } 7334 /* fall through */ 7335 case 3: /* ignore */ 7336 *inpos = collendpos; 7337 break; 7338 case 4: /* xmlcharrefreplace */ 7339 /* generate replacement (temporarily (mis)uses p) */ 7340 for (collpos = collstartpos; collpos < collendpos; ++collpos) { 7341 char buffer[2+29+1+1]; 7342 char *cp; 7343 sprintf(buffer, "&#%d;", (int)p[collpos]); 7344 for (cp = buffer; *cp; ++cp) { 7345 x = charmapencode_output(*cp, mapping, res, respos); 7346 if (x==enc_EXCEPTION) 7347 return -1; 7348 else if (x==enc_FAILED) { 7349 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason); 7350 return -1; 7351 } 7352 } 7353 } 7354 *inpos = collendpos; 7355 break; 7356 default: 7357 repunicode = unicode_encode_call_errorhandler(errors, errorHandler, 7358 encoding, reason, p, size, exceptionObject, 7359 collstartpos, collendpos, &newpos); 7360 if (repunicode == NULL) 7361 return -1; 7362 if (PyBytes_Check(repunicode)) { 7363 /* Directly copy bytes result to output. 
*/ 7364 Py_ssize_t outsize = PyBytes_Size(*res); 7365 Py_ssize_t requiredsize; 7366 repsize = PyBytes_Size(repunicode); 7367 requiredsize = *respos + repsize; 7368 if (requiredsize > outsize) 7369 /* Make room for all additional bytes. */ 7370 if (charmapencode_resize(res, respos, requiredsize)) { 7371 Py_DECREF(repunicode); 7372 return -1; 7373 } 7374 memcpy(PyBytes_AsString(*res) + *respos, 7375 PyBytes_AsString(repunicode), repsize); 7376 *respos += repsize; 7377 *inpos = newpos; 7378 Py_DECREF(repunicode); 7379 break; 7380 } 7381 /* generate replacement */ 7382 repsize = PyUnicode_GET_SIZE(repunicode); 7383 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) { 7384 x = charmapencode_output(*uni2, mapping, res, respos); 7385 if (x==enc_EXCEPTION) { 7386 return -1; 7387 } 7388 else if (x==enc_FAILED) { 7389 Py_DECREF(repunicode); 7390 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason); 7391 return -1; 7392 } 7393 } 7394 *inpos = newpos; 7395 Py_DECREF(repunicode); 7396 } 7397 return 0; 7398} 7399 7400PyObject * 7401PyUnicode_EncodeCharmap(const Py_UNICODE *p, 7402 Py_ssize_t size, 7403 PyObject *mapping, 7404 const char *errors) 7405{ 7406 /* output object */ 7407 PyObject *res = NULL; 7408 /* current input position */ 7409 Py_ssize_t inpos = 0; 7410 /* current output position */ 7411 Py_ssize_t respos = 0; 7412 PyObject *errorHandler = NULL; 7413 PyObject *exc = NULL; 7414 /* the following variable is used for caching string comparisons 7415 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 7416 * 3=ignore, 4=xmlcharrefreplace */ 7417 int known_errorHandler = -1; 7418 7419 /* Default to Latin-1 */ 7420 if (mapping == NULL) 7421 return PyUnicode_EncodeLatin1(p, size, errors); 7422 7423 /* allocate enough for a simple encoding without 7424 replacements, if we need more, we'll resize */ 7425 res = PyBytes_FromStringAndSize(NULL, size); 7426 if (res == NULL) 7427 goto onError; 7428 if (size == 0) 7429 
return res; 7430 7431 while (inpos<size) { 7432 /* try to encode it */ 7433 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos); 7434 if (x==enc_EXCEPTION) /* error */ 7435 goto onError; 7436 if (x==enc_FAILED) { /* unencodable character */ 7437 if (charmap_encoding_error(p, size, &inpos, mapping, 7438 &exc, 7439 &known_errorHandler, &errorHandler, errors, 7440 &res, &respos)) { 7441 goto onError; 7442 } 7443 } 7444 else 7445 /* done with this character => adjust input position */ 7446 ++inpos; 7447 } 7448 7449 /* Resize if we allocated to much */ 7450 if (respos<PyBytes_GET_SIZE(res)) 7451 if (_PyBytes_Resize(&res, respos) < 0) 7452 goto onError; 7453 7454 Py_XDECREF(exc); 7455 Py_XDECREF(errorHandler); 7456 return res; 7457 7458 onError: 7459 Py_XDECREF(res); 7460 Py_XDECREF(exc); 7461 Py_XDECREF(errorHandler); 7462 return NULL; 7463} 7464 7465PyObject * 7466PyUnicode_AsCharmapString(PyObject *unicode, 7467 PyObject *mapping) 7468{ 7469 if (!PyUnicode_Check(unicode) || mapping == NULL) { 7470 PyErr_BadArgument(); 7471 return NULL; 7472 } 7473 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode), 7474 PyUnicode_GET_SIZE(unicode), 7475 mapping, 7476 NULL); 7477} 7478 7479/* create or adjust a UnicodeTranslateError */ 7480static void 7481make_translate_exception(PyObject **exceptionObject, 7482 PyObject *unicode, 7483 Py_ssize_t startpos, Py_ssize_t endpos, 7484 const char *reason) 7485{ 7486 if (*exceptionObject == NULL) { 7487 *exceptionObject = _PyUnicodeTranslateError_Create( 7488 unicode, startpos, endpos, reason); 7489 } 7490 else { 7491 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos)) 7492 goto onError; 7493 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos)) 7494 goto onError; 7495 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason)) 7496 goto onError; 7497 return; 7498 onError: 7499 Py_DECREF(*exceptionObject); 7500 *exceptionObject = NULL; 7501 } 7502} 7503 7504/* raises a 
UnicodeTranslateError */ 7505static void 7506raise_translate_exception(PyObject **exceptionObject, 7507 PyObject *unicode, 7508 Py_ssize_t startpos, Py_ssize_t endpos, 7509 const char *reason) 7510{ 7511 make_translate_exception(exceptionObject, 7512 unicode, startpos, endpos, reason); 7513 if (*exceptionObject != NULL) 7514 PyCodec_StrictErrors(*exceptionObject); 7515} 7516 7517/* error handling callback helper: 7518 build arguments, call the callback and check the arguments, 7519 put the result into newpos and return the replacement string, which 7520 has to be freed by the caller */ 7521static PyObject * 7522unicode_translate_call_errorhandler(const char *errors, 7523 PyObject **errorHandler, 7524 const char *reason, 7525 PyObject *unicode, PyObject **exceptionObject, 7526 Py_ssize_t startpos, Py_ssize_t endpos, 7527 Py_ssize_t *newpos) 7528{ 7529 static char *argparse = "O!n;translating error handler must return (str, int) tuple"; 7530 7531 Py_ssize_t i_newpos; 7532 PyObject *restuple; 7533 PyObject *resunicode; 7534 7535 if (*errorHandler == NULL) { 7536 *errorHandler = PyCodec_LookupError(errors); 7537 if (*errorHandler == NULL) 7538 return NULL; 7539 } 7540 7541 make_translate_exception(exceptionObject, 7542 unicode, startpos, endpos, reason); 7543 if (*exceptionObject == NULL) 7544 return NULL; 7545 7546 restuple = PyObject_CallFunctionObjArgs( 7547 *errorHandler, *exceptionObject, NULL); 7548 if (restuple == NULL) 7549 return NULL; 7550 if (!PyTuple_Check(restuple)) { 7551 PyErr_SetString(PyExc_TypeError, &argparse[4]); 7552 Py_DECREF(restuple); 7553 return NULL; 7554 } 7555 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, 7556 &resunicode, &i_newpos)) { 7557 Py_DECREF(restuple); 7558 return NULL; 7559 } 7560 if (i_newpos<0) 7561 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos; 7562 else 7563 *newpos = i_newpos; 7564 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) { 7565 PyErr_Format(PyExc_IndexError, "position %zd from error handler out 
of bounds", *newpos); 7566 Py_DECREF(restuple); 7567 return NULL; 7568 } 7569 Py_INCREF(resunicode); 7570 Py_DECREF(restuple); 7571 return resunicode; 7572} 7573 7574/* Lookup the character ch in the mapping and put the result in result, 7575 which must be decrefed by the caller. 7576 Return 0 on success, -1 on error */ 7577static int 7578charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result) 7579{ 7580 PyObject *w = PyLong_FromLong((long)c); 7581 PyObject *x; 7582 7583 if (w == NULL) 7584 return -1; 7585 x = PyObject_GetItem(mapping, w); 7586 Py_DECREF(w); 7587 if (x == NULL) { 7588 if (PyErr_ExceptionMatches(PyExc_LookupError)) { 7589 /* No mapping found means: use 1:1 mapping. */ 7590 PyErr_Clear(); 7591 *result = NULL; 7592 return 0; 7593 } else 7594 return -1; 7595 } 7596 else if (x == Py_None) { 7597 *result = x; 7598 return 0; 7599 } 7600 else if (PyLong_Check(x)) { 7601 long value = PyLong_AS_LONG(x); 7602 long max = PyUnicode_GetMax(); 7603 if (value < 0 || value > max) { 7604 PyErr_Format(PyExc_TypeError, 7605 "character mapping must be in range(0x%x)", max+1); 7606 Py_DECREF(x); 7607 return -1; 7608 } 7609 *result = x; 7610 return 0; 7611 } 7612 else if (PyUnicode_Check(x)) { 7613 *result = x; 7614 return 0; 7615 } 7616 else { 7617 /* wrong return value */ 7618 PyErr_SetString(PyExc_TypeError, 7619 "character mapping must return integer, None or str"); 7620 Py_DECREF(x); 7621 return -1; 7622 } 7623} 7624/* ensure that *outobj is at least requiredsize characters long, 7625 if not reallocate and adjust various state variables. 
7626 Return 0 on success, -1 on error */ 7627static int 7628charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize, 7629 Py_ssize_t requiredsize) 7630{ 7631 Py_ssize_t oldsize = *psize; 7632 if (requiredsize > oldsize) { 7633 /* exponentially overallocate to minimize reallocations */ 7634 if (requiredsize < 2 * oldsize) 7635 requiredsize = 2 * oldsize; 7636 *outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4)); 7637 if (*outobj == 0) 7638 return -1; 7639 *psize = requiredsize; 7640 } 7641 return 0; 7642} 7643/* lookup the character, put the result in the output string and adjust 7644 various state variables. Return a new reference to the object that 7645 was put in the output buffer in *result, or Py_None, if the mapping was 7646 undefined (in which case no character was written). 7647 The called must decref result. 7648 Return 0 on success, -1 on error. */ 7649static int 7650charmaptranslate_output(PyObject *input, Py_ssize_t ipos, 7651 PyObject *mapping, Py_UCS4 **output, 7652 Py_ssize_t *osize, Py_ssize_t *opos, 7653 PyObject **res) 7654{ 7655 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos); 7656 if (charmaptranslate_lookup(curinp, mapping, res)) 7657 return -1; 7658 if (*res==NULL) { 7659 /* not found => default to 1:1 mapping */ 7660 (*output)[(*opos)++] = curinp; 7661 } 7662 else if (*res==Py_None) 7663 ; 7664 else if (PyLong_Check(*res)) { 7665 /* no overflow check, because we know that the space is enough */ 7666 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res); 7667 } 7668 else if (PyUnicode_Check(*res)) { 7669 Py_ssize_t repsize; 7670 if (PyUnicode_READY(*res) == -1) 7671 return -1; 7672 repsize = PyUnicode_GET_LENGTH(*res); 7673 if (repsize==1) { 7674 /* no overflow check, because we know that the space is enough */ 7675 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0); 7676 } 7677 else if (repsize!=0) { 7678 /* more than one character */ 7679 Py_ssize_t requiredsize = *opos + 7680 (PyUnicode_GET_LENGTH(input) - ipos) + 7681 
repsize - 1; 7682 Py_ssize_t i; 7683 if (charmaptranslate_makespace(output, osize, requiredsize)) 7684 return -1; 7685 for(i = 0; i < repsize; i++) 7686 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i); 7687 } 7688 } 7689 else 7690 return -1; 7691 return 0; 7692} 7693 7694PyObject * 7695_PyUnicode_TranslateCharmap(PyObject *input, 7696 PyObject *mapping, 7697 const char *errors) 7698{ 7699 /* input object */ 7700 char *idata; 7701 Py_ssize_t size, i; 7702 int kind; 7703 /* output buffer */ 7704 Py_UCS4 *output = NULL; 7705 Py_ssize_t osize; 7706 PyObject *res; 7707 /* current output position */ 7708 Py_ssize_t opos; 7709 char *reason = "character maps to <undefined>"; 7710 PyObject *errorHandler = NULL; 7711 PyObject *exc = NULL; 7712 /* the following variable is used for caching string comparisons 7713 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 7714 * 3=ignore, 4=xmlcharrefreplace */ 7715 int known_errorHandler = -1; 7716 7717 if (mapping == NULL) { 7718 PyErr_BadArgument(); 7719 return NULL; 7720 } 7721 7722 if (PyUnicode_READY(input) == -1) 7723 return NULL; 7724 idata = (char*)PyUnicode_DATA(input); 7725 kind = PyUnicode_KIND(input); 7726 size = PyUnicode_GET_LENGTH(input); 7727 i = 0; 7728 7729 if (size == 0) { 7730 Py_INCREF(input); 7731 return input; 7732 } 7733 7734 /* allocate enough for a simple 1:1 translation without 7735 replacements, if we need more, we'll resize */ 7736 osize = size; 7737 output = PyMem_Malloc(osize * sizeof(Py_UCS4)); 7738 opos = 0; 7739 if (output == NULL) { 7740 PyErr_NoMemory(); 7741 goto onError; 7742 } 7743 7744 while (i<size) { 7745 /* try to encode it */ 7746 PyObject *x = NULL; 7747 if (charmaptranslate_output(input, i, mapping, 7748 &output, &osize, &opos, &x)) { 7749 Py_XDECREF(x); 7750 goto onError; 7751 } 7752 Py_XDECREF(x); 7753 if (x!=Py_None) /* it worked => adjust input pointer */ 7754 ++i; 7755 else { /* untranslatable character */ 7756 PyObject *repunicode = NULL; /* initialize to prevent gcc warning 
*/ 7757 Py_ssize_t repsize; 7758 Py_ssize_t newpos; 7759 Py_ssize_t uni2; 7760 /* startpos for collecting untranslatable chars */ 7761 Py_ssize_t collstart = i; 7762 Py_ssize_t collend = i+1; 7763 Py_ssize_t coll; 7764 7765 /* find all untranslatable characters */ 7766 while (collend < size) { 7767 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x)) 7768 goto onError; 7769 Py_XDECREF(x); 7770 if (x!=Py_None) 7771 break; 7772 ++collend; 7773 } 7774 /* cache callback name lookup 7775 * (if not done yet, i.e. it's the first error) */ 7776 if (known_errorHandler==-1) { 7777 if ((errors==NULL) || (!strcmp(errors, "strict"))) 7778 known_errorHandler = 1; 7779 else if (!strcmp(errors, "replace")) 7780 known_errorHandler = 2; 7781 else if (!strcmp(errors, "ignore")) 7782 known_errorHandler = 3; 7783 else if (!strcmp(errors, "xmlcharrefreplace")) 7784 known_errorHandler = 4; 7785 else 7786 known_errorHandler = 0; 7787 } 7788 switch (known_errorHandler) { 7789 case 1: /* strict */ 7790 raise_translate_exception(&exc, input, collstart, 7791 collend, reason); 7792 goto onError; 7793 case 2: /* replace */ 7794 /* No need to check for space, this is a 1:1 replacement */ 7795 for (coll = collstart; coll<collend; coll++) 7796 output[opos++] = '?'; 7797 /* fall through */ 7798 case 3: /* ignore */ 7799 i = collend; 7800 break; 7801 case 4: /* xmlcharrefreplace */ 7802 /* generate replacement (temporarily (mis)uses i) */ 7803 for (i = collstart; i < collend; ++i) { 7804 char buffer[2+29+1+1]; 7805 char *cp; 7806 sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i)); 7807 if (charmaptranslate_makespace(&output, &osize, 7808 opos+strlen(buffer)+(size-collend))) 7809 goto onError; 7810 for (cp = buffer; *cp; ++cp) 7811 output[opos++] = *cp; 7812 } 7813 i = collend; 7814 break; 7815 default: 7816 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler, 7817 reason, input, &exc, 7818 collstart, collend, &newpos); 7819 if (repunicode == NULL || 
PyUnicode_READY(repunicode) == -1) 7820 goto onError; 7821 /* generate replacement */ 7822 repsize = PyUnicode_GET_LENGTH(repunicode); 7823 if (charmaptranslate_makespace(&output, &osize, 7824 opos+repsize+(size-collend))) { 7825 Py_DECREF(repunicode); 7826 goto onError; 7827 } 7828 for (uni2 = 0; repsize-->0; ++uni2) 7829 output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2); 7830 i = newpos; 7831 Py_DECREF(repunicode); 7832 } 7833 } 7834 } 7835 res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos); 7836 if (!res) 7837 goto onError; 7838 PyMem_Free(output); 7839 Py_XDECREF(exc); 7840 Py_XDECREF(errorHandler); 7841 return res; 7842 7843 onError: 7844 PyMem_Free(output); 7845 Py_XDECREF(exc); 7846 Py_XDECREF(errorHandler); 7847 return NULL; 7848} 7849 7850/* Deprecated. Use PyUnicode_Translate instead. */ 7851PyObject * 7852PyUnicode_TranslateCharmap(const Py_UNICODE *p, 7853 Py_ssize_t size, 7854 PyObject *mapping, 7855 const char *errors) 7856{ 7857 PyObject *unicode = PyUnicode_FromUnicode(p, size); 7858 if (!unicode) 7859 return NULL; 7860 return _PyUnicode_TranslateCharmap(unicode, mapping, errors); 7861} 7862 7863PyObject * 7864PyUnicode_Translate(PyObject *str, 7865 PyObject *mapping, 7866 const char *errors) 7867{ 7868 PyObject *result; 7869 7870 str = PyUnicode_FromObject(str); 7871 if (str == NULL) 7872 goto onError; 7873 result = _PyUnicode_TranslateCharmap(str, mapping, errors); 7874 Py_DECREF(str); 7875 return result; 7876 7877 onError: 7878 Py_XDECREF(str); 7879 return NULL; 7880} 7881 7882static Py_UCS4 7883fix_decimal_and_space_to_ascii(PyUnicodeObject *self) 7884{ 7885 /* No need to call PyUnicode_READY(self) because this function is only 7886 called as a callback from fixup() which does it already. 
*/ 7887 const Py_ssize_t len = PyUnicode_GET_LENGTH(self); 7888 const int kind = PyUnicode_KIND(self); 7889 void *data = PyUnicode_DATA(self); 7890 Py_UCS4 maxchar = 0, ch, fixed; 7891 Py_ssize_t i; 7892 7893 for (i = 0; i < len; ++i) { 7894 ch = PyUnicode_READ(kind, data, i); 7895 fixed = 0; 7896 if (ch > 127) { 7897 if (Py_UNICODE_ISSPACE(ch)) 7898 fixed = ' '; 7899 else { 7900 const int decimal = Py_UNICODE_TODECIMAL(ch); 7901 if (decimal >= 0) 7902 fixed = '0' + decimal; 7903 } 7904 if (fixed != 0) { 7905 if (fixed > maxchar) 7906 maxchar = fixed; 7907 PyUnicode_WRITE(kind, data, i, fixed); 7908 } 7909 else if (ch > maxchar) 7910 maxchar = ch; 7911 } 7912 else if (ch > maxchar) 7913 maxchar = ch; 7914 } 7915 7916 return maxchar; 7917} 7918 7919PyObject * 7920_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode) 7921{ 7922 if (!PyUnicode_Check(unicode)) { 7923 PyErr_BadInternalCall(); 7924 return NULL; 7925 } 7926 if (PyUnicode_READY(unicode) == -1) 7927 return NULL; 7928 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) { 7929 /* If the string is already ASCII, just return the same string */ 7930 Py_INCREF(unicode); 7931 return unicode; 7932 } 7933 return fixup((PyUnicodeObject *)unicode, fix_decimal_and_space_to_ascii); 7934} 7935 7936PyObject * 7937PyUnicode_TransformDecimalToASCII(Py_UNICODE *s, 7938 Py_ssize_t length) 7939{ 7940 PyObject *result; 7941 Py_UNICODE *p; /* write pointer into result */ 7942 Py_ssize_t i; 7943 /* Copy to a new string */ 7944 result = (PyObject *)_PyUnicode_New(length); 7945 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(result), s, length); 7946 if (result == NULL) 7947 return result; 7948 p = PyUnicode_AS_UNICODE(result); 7949 /* Iterate over code points */ 7950 for (i = 0; i < length; i++) { 7951 Py_UNICODE ch =s[i]; 7952 if (ch > 127) { 7953 int decimal = Py_UNICODE_TODECIMAL(ch); 7954 if (decimal >= 0) 7955 p[i] = '0' + decimal; 7956 } 7957 } 7958 if (PyUnicode_READY((PyUnicodeObject*)result) == -1) { 7959 Py_DECREF(result); 7960 
return NULL; 7961 } 7962 return result; 7963} 7964/* --- Decimal Encoder ---------------------------------------------------- */ 7965 7966int 7967PyUnicode_EncodeDecimal(Py_UNICODE *s, 7968 Py_ssize_t length, 7969 char *output, 7970 const char *errors) 7971{ 7972 Py_UNICODE *p, *end; 7973 PyObject *errorHandler = NULL; 7974 PyObject *exc = NULL; 7975 const char *encoding = "decimal"; 7976 const char *reason = "invalid decimal Unicode string"; 7977 /* the following variable is used for caching string comparisons 7978 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */ 7979 int known_errorHandler = -1; 7980 7981 if (output == NULL) { 7982 PyErr_BadArgument(); 7983 return -1; 7984 } 7985 7986 p = s; 7987 end = s + length; 7988 while (p < end) { 7989 register Py_UNICODE ch = *p; 7990 int decimal; 7991 PyObject *repunicode; 7992 Py_ssize_t repsize; 7993 Py_ssize_t newpos; 7994 Py_UNICODE *uni2; 7995 Py_UNICODE *collstart; 7996 Py_UNICODE *collend; 7997 7998 if (Py_UNICODE_ISSPACE(ch)) { 7999 *output++ = ' '; 8000 ++p; 8001 continue; 8002 } 8003 decimal = Py_UNICODE_TODECIMAL(ch); 8004 if (decimal >= 0) { 8005 *output++ = '0' + decimal; 8006 ++p; 8007 continue; 8008 } 8009 if (0 < ch && ch < 256) { 8010 *output++ = (char)ch; 8011 ++p; 8012 continue; 8013 } 8014 /* All other characters are considered unencodable */ 8015 collstart = p; 8016 collend = p+1; 8017 while (collend < end) { 8018 if ((0 < *collend && *collend < 256) || 8019 !Py_UNICODE_ISSPACE(*collend) || 8020 Py_UNICODE_TODECIMAL(*collend)) 8021 break; 8022 } 8023 /* cache callback name lookup 8024 * (if not done yet, i.e. 
it's the first error) */ 8025 if (known_errorHandler==-1) { 8026 if ((errors==NULL) || (!strcmp(errors, "strict"))) 8027 known_errorHandler = 1; 8028 else if (!strcmp(errors, "replace")) 8029 known_errorHandler = 2; 8030 else if (!strcmp(errors, "ignore")) 8031 known_errorHandler = 3; 8032 else if (!strcmp(errors, "xmlcharrefreplace")) 8033 known_errorHandler = 4; 8034 else 8035 known_errorHandler = 0; 8036 } 8037 switch (known_errorHandler) { 8038 case 1: /* strict */ 8039 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason); 8040 goto onError; 8041 case 2: /* replace */ 8042 for (p = collstart; p < collend; ++p) 8043 *output++ = '?'; 8044 /* fall through */ 8045 case 3: /* ignore */ 8046 p = collend; 8047 break; 8048 case 4: /* xmlcharrefreplace */ 8049 /* generate replacement (temporarily (mis)uses p) */ 8050 for (p = collstart; p < collend; ++p) 8051 output += sprintf(output, "&#%d;", (int)*p); 8052 p = collend; 8053 break; 8054 default: 8055 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler, 8056 encoding, reason, s, length, &exc, 8057 collstart-s, collend-s, &newpos); 8058 if (repunicode == NULL) 8059 goto onError; 8060 if (!PyUnicode_Check(repunicode)) { 8061 /* Byte results not supported, since they have no decimal property. 
*/ 8062 PyErr_SetString(PyExc_TypeError, "error handler should return unicode"); 8063 Py_DECREF(repunicode); 8064 goto onError; 8065 } 8066 /* generate replacement */ 8067 repsize = PyUnicode_GET_SIZE(repunicode); 8068 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) { 8069 Py_UNICODE ch = *uni2; 8070 if (Py_UNICODE_ISSPACE(ch)) 8071 *output++ = ' '; 8072 else { 8073 decimal = Py_UNICODE_TODECIMAL(ch); 8074 if (decimal >= 0) 8075 *output++ = '0' + decimal; 8076 else if (0 < ch && ch < 256) 8077 *output++ = (char)ch; 8078 else { 8079 Py_DECREF(repunicode); 8080 raise_encode_exception(&exc, encoding, 8081 s, length, collstart-s, collend-s, reason); 8082 goto onError; 8083 } 8084 } 8085 } 8086 p = s + newpos; 8087 Py_DECREF(repunicode); 8088 } 8089 } 8090 /* 0-terminate the output string */ 8091 *output++ = '\0'; 8092 Py_XDECREF(exc); 8093 Py_XDECREF(errorHandler); 8094 return 0; 8095 8096 onError: 8097 Py_XDECREF(exc); 8098 Py_XDECREF(errorHandler); 8099 return -1; 8100} 8101 8102/* --- Helpers ------------------------------------------------------------ */ 8103 8104#include "stringlib/ucs1lib.h" 8105#include "stringlib/fastsearch.h" 8106#include "stringlib/partition.h" 8107#include "stringlib/split.h" 8108#include "stringlib/count.h" 8109#include "stringlib/find.h" 8110#include "stringlib/localeutil.h" 8111#include "stringlib/undef.h" 8112 8113#include "stringlib/ucs2lib.h" 8114#include "stringlib/fastsearch.h" 8115#include "stringlib/partition.h" 8116#include "stringlib/split.h" 8117#include "stringlib/count.h" 8118#include "stringlib/find.h" 8119#include "stringlib/localeutil.h" 8120#include "stringlib/undef.h" 8121 8122#include "stringlib/ucs4lib.h" 8123#include "stringlib/fastsearch.h" 8124#include "stringlib/partition.h" 8125#include "stringlib/split.h" 8126#include "stringlib/count.h" 8127#include "stringlib/find.h" 8128#include "stringlib/localeutil.h" 8129#include "stringlib/undef.h" 8130 8131static Py_ssize_t 8132any_find_slice(Py_ssize_t 
Py_LOCAL_CALLBACK(ucs1)(const Py_UCS1*, Py_ssize_t, 8133 const Py_UCS1*, Py_ssize_t, 8134 Py_ssize_t, Py_ssize_t), 8135 Py_ssize_t Py_LOCAL_CALLBACK(ucs2)(const Py_UCS2*, Py_ssize_t, 8136 const Py_UCS2*, Py_ssize_t, 8137 Py_ssize_t, Py_ssize_t), 8138 Py_ssize_t Py_LOCAL_CALLBACK(ucs4)(const Py_UCS4*, Py_ssize_t, 8139 const Py_UCS4*, Py_ssize_t, 8140 Py_ssize_t, Py_ssize_t), 8141 PyObject* s1, PyObject* s2, 8142 Py_ssize_t start, 8143 Py_ssize_t end) 8144{ 8145 int kind1, kind2, kind; 8146 void *buf1, *buf2; 8147 Py_ssize_t len1, len2, result; 8148 8149 kind1 = PyUnicode_KIND(s1); 8150 kind2 = PyUnicode_KIND(s2); 8151 kind = kind1 > kind2 ? kind1 : kind2; 8152 buf1 = PyUnicode_DATA(s1); 8153 buf2 = PyUnicode_DATA(s2); 8154 if (kind1 != kind) 8155 buf1 = _PyUnicode_AsKind(s1, kind); 8156 if (!buf1) 8157 return -2; 8158 if (kind2 != kind) 8159 buf2 = _PyUnicode_AsKind(s2, kind); 8160 if (!buf2) { 8161 if (kind1 != kind) PyMem_Free(buf1); 8162 return -2; 8163 } 8164 len1 = PyUnicode_GET_LENGTH(s1); 8165 len2 = PyUnicode_GET_LENGTH(s2); 8166 8167 switch(kind) { 8168 case PyUnicode_1BYTE_KIND: 8169 result = ucs1(buf1, len1, buf2, len2, start, end); 8170 break; 8171 case PyUnicode_2BYTE_KIND: 8172 result = ucs2(buf1, len1, buf2, len2, start, end); 8173 break; 8174 case PyUnicode_4BYTE_KIND: 8175 result = ucs4(buf1, len1, buf2, len2, start, end); 8176 break; 8177 default: 8178 assert(0); result = -2; 8179 } 8180 8181 if (kind1 != kind) 8182 PyMem_Free(buf1); 8183 if (kind2 != kind) 8184 PyMem_Free(buf2); 8185 8186 return result; 8187} 8188 8189Py_ssize_t 8190_PyUnicode_InsertThousandsGrouping(int kind, void *data, 8191 Py_ssize_t n_buffer, 8192 void *digits, Py_ssize_t n_digits, 8193 Py_ssize_t min_width, 8194 const char *grouping, 8195 const char *thousands_sep) 8196{ 8197 switch(kind) { 8198 case PyUnicode_1BYTE_KIND: 8199 return _PyUnicode_ucs1_InsertThousandsGrouping( 8200 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits, 8201 min_width, grouping, thousands_sep); 
8202 case PyUnicode_2BYTE_KIND: 8203 return _PyUnicode_ucs2_InsertThousandsGrouping( 8204 (Py_UCS2*)data, n_buffer, (Py_UCS2*)digits, n_digits, 8205 min_width, grouping, thousands_sep); 8206 case PyUnicode_4BYTE_KIND: 8207 return _PyUnicode_ucs4_InsertThousandsGrouping( 8208 (Py_UCS4*)data, n_buffer, (Py_UCS4*)digits, n_digits, 8209 min_width, grouping, thousands_sep); 8210 } 8211 assert(0); 8212 return -1; 8213} 8214 8215 8216#include "stringlib/unicodedefs.h" 8217#include "stringlib/fastsearch.h" 8218 8219#include "stringlib/count.h" 8220#include "stringlib/find.h" 8221 8222/* helper macro to fixup start/end slice values */ 8223#define ADJUST_INDICES(start, end, len) \ 8224 if (end > len) \ 8225 end = len; \ 8226 else if (end < 0) { \ 8227 end += len; \ 8228 if (end < 0) \ 8229 end = 0; \ 8230 } \ 8231 if (start < 0) { \ 8232 start += len; \ 8233 if (start < 0) \ 8234 start = 0; \ 8235 } 8236 8237Py_ssize_t 8238PyUnicode_Count(PyObject *str, 8239 PyObject *substr, 8240 Py_ssize_t start, 8241 Py_ssize_t end) 8242{ 8243 Py_ssize_t result; 8244 PyUnicodeObject* str_obj; 8245 PyUnicodeObject* sub_obj; 8246 int kind1, kind2, kind; 8247 void *buf1 = NULL, *buf2 = NULL; 8248 Py_ssize_t len1, len2; 8249 8250 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str); 8251 if (!str_obj || PyUnicode_READY(str_obj) == -1) 8252 return -1; 8253 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr); 8254 if (!sub_obj || PyUnicode_READY(sub_obj) == -1) { 8255 Py_DECREF(str_obj); 8256 return -1; 8257 } 8258 8259 kind1 = PyUnicode_KIND(str_obj); 8260 kind2 = PyUnicode_KIND(sub_obj); 8261 kind = kind1 > kind2 ? 
kind1 : kind2; 8262 buf1 = PyUnicode_DATA(str_obj); 8263 if (kind1 != kind) 8264 buf1 = _PyUnicode_AsKind((PyObject*)str_obj, kind); 8265 if (!buf1) 8266 goto onError; 8267 buf2 = PyUnicode_DATA(sub_obj); 8268 if (kind2 != kind) 8269 buf2 = _PyUnicode_AsKind((PyObject*)sub_obj, kind); 8270 if (!buf2) 8271 goto onError; 8272 len1 = PyUnicode_GET_LENGTH(str_obj); 8273 len2 = PyUnicode_GET_LENGTH(sub_obj); 8274 8275 ADJUST_INDICES(start, end, len1); 8276 switch(kind) { 8277 case PyUnicode_1BYTE_KIND: 8278 result = ucs1lib_count( 8279 ((Py_UCS1*)buf1) + start, end - start, 8280 buf2, len2, PY_SSIZE_T_MAX 8281 ); 8282 break; 8283 case PyUnicode_2BYTE_KIND: 8284 result = ucs2lib_count( 8285 ((Py_UCS2*)buf1) + start, end - start, 8286 buf2, len2, PY_SSIZE_T_MAX 8287 ); 8288 break; 8289 case PyUnicode_4BYTE_KIND: 8290 result = ucs4lib_count( 8291 ((Py_UCS4*)buf1) + start, end - start, 8292 buf2, len2, PY_SSIZE_T_MAX 8293 ); 8294 break; 8295 default: 8296 assert(0); result = 0; 8297 } 8298 8299 Py_DECREF(sub_obj); 8300 Py_DECREF(str_obj); 8301 8302 if (kind1 != kind) 8303 PyMem_Free(buf1); 8304 if (kind2 != kind) 8305 PyMem_Free(buf2); 8306 8307 return result; 8308 onError: 8309 Py_DECREF(sub_obj); 8310 Py_DECREF(str_obj); 8311 if (kind1 != kind && buf1) 8312 PyMem_Free(buf1); 8313 if (kind2 != kind && buf2) 8314 PyMem_Free(buf2); 8315 return -1; 8316} 8317 8318Py_ssize_t 8319PyUnicode_Find(PyObject *str, 8320 PyObject *sub, 8321 Py_ssize_t start, 8322 Py_ssize_t end, 8323 int direction) 8324{ 8325 Py_ssize_t result; 8326 8327 str = PyUnicode_FromObject(str); 8328 if (!str || PyUnicode_READY(str) == -1) 8329 return -2; 8330 sub = PyUnicode_FromObject(sub); 8331 if (!sub || PyUnicode_READY(sub) == -1) { 8332 Py_DECREF(str); 8333 return -2; 8334 } 8335 8336 if (direction > 0) 8337 result = any_find_slice( 8338 ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice, 8339 str, sub, start, end 8340 ); 8341 else 8342 result = any_find_slice( 8343 ucs1lib_rfind_slice, 
ucs2lib_rfind_slice, ucs4lib_rfind_slice, 8344 str, sub, start, end 8345 ); 8346 8347 Py_DECREF(str); 8348 Py_DECREF(sub); 8349 8350 return result; 8351} 8352 8353Py_ssize_t 8354PyUnicode_FindChar(PyObject *str, Py_UCS4 ch, 8355 Py_ssize_t start, Py_ssize_t end, 8356 int direction) 8357{ 8358 char *result; 8359 int kind; 8360 if (PyUnicode_READY(str) == -1) 8361 return -2; 8362 if (start < 0 || end < 0) { 8363 PyErr_SetString(PyExc_IndexError, "string index out of range"); 8364 return -2; 8365 } 8366 if (end > PyUnicode_GET_LENGTH(str)) 8367 end = PyUnicode_GET_LENGTH(str); 8368 kind = PyUnicode_KIND(str); 8369 result = findchar(PyUnicode_1BYTE_DATA(str) 8370 + PyUnicode_KIND_SIZE(kind, start), 8371 kind, 8372 end-start, ch, direction); 8373 if (!result) 8374 return -1; 8375 return (result-(char*)PyUnicode_DATA(str)) >> (kind-1); 8376} 8377 8378static int 8379tailmatch(PyUnicodeObject *self, 8380 PyUnicodeObject *substring, 8381 Py_ssize_t start, 8382 Py_ssize_t end, 8383 int direction) 8384{ 8385 int kind_self; 8386 int kind_sub; 8387 void *data_self; 8388 void *data_sub; 8389 Py_ssize_t offset; 8390 Py_ssize_t i; 8391 Py_ssize_t end_sub; 8392 8393 if (PyUnicode_READY(self) == -1 || 8394 PyUnicode_READY(substring) == -1) 8395 return 0; 8396 8397 if (PyUnicode_GET_LENGTH(substring) == 0) 8398 return 1; 8399 8400 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self)); 8401 end -= PyUnicode_GET_LENGTH(substring); 8402 if (end < start) 8403 return 0; 8404 8405 kind_self = PyUnicode_KIND(self); 8406 data_self = PyUnicode_DATA(self); 8407 kind_sub = PyUnicode_KIND(substring); 8408 data_sub = PyUnicode_DATA(substring); 8409 end_sub = PyUnicode_GET_LENGTH(substring) - 1; 8410 8411 if (direction > 0) 8412 offset = end; 8413 else 8414 offset = start; 8415 8416 if (PyUnicode_READ(kind_self, data_self, offset) == 8417 PyUnicode_READ(kind_sub, data_sub, 0) && 8418 PyUnicode_READ(kind_self, data_self, offset + end_sub) == 8419 PyUnicode_READ(kind_sub, data_sub, end_sub)) { 8420 
/* If both are of the same kind, memcmp is sufficient */ 8421 if (kind_self == kind_sub) { 8422 return ! memcmp((char *)data_self + 8423 (offset * PyUnicode_CHARACTER_SIZE(substring)), 8424 data_sub, 8425 PyUnicode_GET_LENGTH(substring) * 8426 PyUnicode_CHARACTER_SIZE(substring)); 8427 } 8428 /* otherwise we have to compare each character by first accesing it */ 8429 else { 8430 /* We do not need to compare 0 and len(substring)-1 because 8431 the if statement above ensured already that they are equal 8432 when we end up here. */ 8433 // TODO: honor direction and do a forward or backwards search 8434 for (i = 1; i < end_sub; ++i) { 8435 if (PyUnicode_READ(kind_self, data_self, offset + i) != 8436 PyUnicode_READ(kind_sub, data_sub, i)) 8437 return 0; 8438 } 8439 return 1; 8440 } 8441 } 8442 8443 return 0; 8444} 8445 8446Py_ssize_t 8447PyUnicode_Tailmatch(PyObject *str, 8448 PyObject *substr, 8449 Py_ssize_t start, 8450 Py_ssize_t end, 8451 int direction) 8452{ 8453 Py_ssize_t result; 8454 8455 str = PyUnicode_FromObject(str); 8456 if (str == NULL) 8457 return -1; 8458 substr = PyUnicode_FromObject(substr); 8459 if (substr == NULL) { 8460 Py_DECREF(str); 8461 return -1; 8462 } 8463 8464 result = tailmatch((PyUnicodeObject *)str, 8465 (PyUnicodeObject *)substr, 8466 start, end, direction); 8467 Py_DECREF(str); 8468 Py_DECREF(substr); 8469 return result; 8470} 8471 8472/* Apply fixfct filter to the Unicode object self and return a 8473 reference to the modified object */ 8474 8475static PyObject * 8476fixup(PyUnicodeObject *self, 8477 Py_UCS4 (*fixfct)(PyUnicodeObject *s)) 8478{ 8479 PyObject *u; 8480 Py_UCS4 maxchar_old, maxchar_new = 0; 8481 8482 if (PyUnicode_READY(self) == -1) 8483 return NULL; 8484 maxchar_old = PyUnicode_MAX_CHAR_VALUE(self); 8485 u = PyUnicode_New(PyUnicode_GET_LENGTH(self), 8486 maxchar_old); 8487 if (u == NULL) 8488 return NULL; 8489 8490 Py_MEMCPY(PyUnicode_1BYTE_DATA(u), PyUnicode_1BYTE_DATA(self), 8491 PyUnicode_GET_LENGTH(u) * 
PyUnicode_CHARACTER_SIZE(u)); 8492 8493 /* fix functions return the new maximum character in a string, 8494 if the kind of the resulting unicode object does not change, 8495 everything is fine. Otherwise we need to change the string kind 8496 and re-run the fix function. */ 8497 maxchar_new = fixfct((PyUnicodeObject*)u); 8498 if (maxchar_new == 0) 8499 /* do nothing, keep maxchar_new at 0 which means no changes. */; 8500 else if (maxchar_new <= 127) 8501 maxchar_new = 127; 8502 else if (maxchar_new <= 255) 8503 maxchar_new = 255; 8504 else if (maxchar_new <= 65535) 8505 maxchar_new = 65535; 8506 else 8507 maxchar_new = 1114111; /* 0x10ffff */ 8508 8509 if (!maxchar_new && PyUnicode_CheckExact(self)) { 8510 /* fixfct should return TRUE if it modified the buffer. If 8511 FALSE, return a reference to the original buffer instead 8512 (to save space, not time) */ 8513 Py_INCREF(self); 8514 Py_DECREF(u); 8515 return (PyObject*) self; 8516 } 8517 else if (maxchar_new == maxchar_old) { 8518 return u; 8519 } 8520 else { 8521 /* In case the maximum character changed, we need to 8522 convert the string to the new category. */ 8523 PyObject *v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new); 8524 if (v == NULL) { 8525 Py_DECREF(u); 8526 return NULL; 8527 } 8528 if (maxchar_new > maxchar_old) { 8529 /* If the maxchar increased so that the kind changed, not all 8530 characters are representable anymore and we need to fix the 8531 string again. This only happens in very few cases. 
*/ 8532 if (PyUnicode_CopyCharacters(v, 0, 8533 (PyObject*)self, 0, 8534 PyUnicode_GET_LENGTH(self)) < 0) 8535 { 8536 Py_DECREF(u); 8537 return NULL; 8538 } 8539 maxchar_old = fixfct((PyUnicodeObject*)v); 8540 assert(maxchar_old > 0 && maxchar_old <= maxchar_new); 8541 } 8542 else { 8543 if (PyUnicode_CopyCharacters(v, 0, 8544 u, 0, 8545 PyUnicode_GET_LENGTH(self)) < 0) 8546 { 8547 Py_DECREF(u); 8548 return NULL; 8549 } 8550 } 8551 8552 Py_DECREF(u); 8553 return v; 8554 } 8555} 8556 8557static Py_UCS4 8558fixupper(PyUnicodeObject *self) 8559{ 8560 /* No need to call PyUnicode_READY(self) because this function is only 8561 called as a callback from fixup() which does it already. */ 8562 const Py_ssize_t len = PyUnicode_GET_LENGTH(self); 8563 const int kind = PyUnicode_KIND(self); 8564 void *data = PyUnicode_DATA(self); 8565 int touched = 0; 8566 Py_UCS4 maxchar = 0; 8567 Py_ssize_t i; 8568 8569 for (i = 0; i < len; ++i) { 8570 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); 8571 const Py_UCS4 up = Py_UNICODE_TOUPPER(ch); 8572 if (up != ch) { 8573 if (up > maxchar) 8574 maxchar = up; 8575 PyUnicode_WRITE(kind, data, i, up); 8576 touched = 1; 8577 } 8578 else if (ch > maxchar) 8579 maxchar = ch; 8580 } 8581 8582 if (touched) 8583 return maxchar; 8584 else 8585 return 0; 8586} 8587 8588static Py_UCS4 8589fixlower(PyUnicodeObject *self) 8590{ 8591 /* No need to call PyUnicode_READY(self) because fixup() which does it. 
*/ 8592 const Py_ssize_t len = PyUnicode_GET_LENGTH(self); 8593 const int kind = PyUnicode_KIND(self); 8594 void *data = PyUnicode_DATA(self); 8595 int touched = 0; 8596 Py_UCS4 maxchar = 0; 8597 Py_ssize_t i; 8598 8599 for(i = 0; i < len; ++i) { 8600 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); 8601 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch); 8602 if (lo != ch) { 8603 if (lo > maxchar) 8604 maxchar = lo; 8605 PyUnicode_WRITE(kind, data, i, lo); 8606 touched = 1; 8607 } 8608 else if (ch > maxchar) 8609 maxchar = ch; 8610 } 8611 8612 if (touched) 8613 return maxchar; 8614 else 8615 return 0; 8616} 8617 8618static Py_UCS4 8619fixswapcase(PyUnicodeObject *self) 8620{ 8621 /* No need to call PyUnicode_READY(self) because fixup() which does it. */ 8622 const Py_ssize_t len = PyUnicode_GET_LENGTH(self); 8623 const int kind = PyUnicode_KIND(self); 8624 void *data = PyUnicode_DATA(self); 8625 int touched = 0; 8626 Py_UCS4 maxchar = 0; 8627 Py_ssize_t i; 8628 8629 for(i = 0; i < len; ++i) { 8630 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); 8631 Py_UCS4 nu = 0; 8632 8633 if (Py_UNICODE_ISUPPER(ch)) 8634 nu = Py_UNICODE_TOLOWER(ch); 8635 else if (Py_UNICODE_ISLOWER(ch)) 8636 nu = Py_UNICODE_TOUPPER(ch); 8637 8638 if (nu != 0) { 8639 if (nu > maxchar) 8640 maxchar = nu; 8641 PyUnicode_WRITE(kind, data, i, nu); 8642 touched = 1; 8643 } 8644 else if (ch > maxchar) 8645 maxchar = ch; 8646 } 8647 8648 if (touched) 8649 return maxchar; 8650 else 8651 return 0; 8652} 8653 8654static Py_UCS4 8655fixcapitalize(PyUnicodeObject *self) 8656{ 8657 /* No need to call PyUnicode_READY(self) because fixup() which does it. 
*/ 8658 const Py_ssize_t len = PyUnicode_GET_LENGTH(self); 8659 const int kind = PyUnicode_KIND(self); 8660 void *data = PyUnicode_DATA(self); 8661 int touched = 0; 8662 Py_UCS4 maxchar = 0; 8663 Py_ssize_t i = 0; 8664 Py_UCS4 ch; 8665 8666 if (len == 0) 8667 return 0; 8668 8669 ch = PyUnicode_READ(kind, data, i); 8670 if (!Py_UNICODE_ISUPPER(ch)) { 8671 maxchar = Py_UNICODE_TOUPPER(ch); 8672 PyUnicode_WRITE(kind, data, i, maxchar); 8673 touched = 1; 8674 } 8675 ++i; 8676 for(; i < len; ++i) { 8677 ch = PyUnicode_READ(kind, data, i); 8678 if (!Py_UNICODE_ISLOWER(ch)) { 8679 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch); 8680 if (lo > maxchar) 8681 maxchar = lo; 8682 PyUnicode_WRITE(kind, data, i, lo); 8683 touched = 1; 8684 } 8685 else if (ch > maxchar) 8686 maxchar = ch; 8687 } 8688 8689 if (touched) 8690 return maxchar; 8691 else 8692 return 0; 8693} 8694 8695static Py_UCS4 8696fixtitle(PyUnicodeObject *self) 8697{ 8698 /* No need to call PyUnicode_READY(self) because fixup() which does it. 
*/ 8699 const Py_ssize_t len = PyUnicode_GET_LENGTH(self); 8700 const int kind = PyUnicode_KIND(self); 8701 void *data = PyUnicode_DATA(self); 8702 Py_UCS4 maxchar = 0; 8703 Py_ssize_t i = 0; 8704 int previous_is_cased; 8705 8706 /* Shortcut for single character strings */ 8707 if (len == 1) { 8708 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); 8709 const Py_UCS4 ti = Py_UNICODE_TOTITLE(ch); 8710 if (ti != ch) { 8711 PyUnicode_WRITE(kind, data, i, ti); 8712 return ti; 8713 } 8714 else 8715 return 0; 8716 } 8717 previous_is_cased = 0; 8718 for(; i < len; ++i) { 8719 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); 8720 Py_UCS4 nu; 8721 8722 if (previous_is_cased) 8723 nu = Py_UNICODE_TOLOWER(ch); 8724 else 8725 nu = Py_UNICODE_TOTITLE(ch); 8726 8727 if (nu > maxchar) 8728 maxchar = nu; 8729 PyUnicode_WRITE(kind, data, i, nu); 8730 8731 if (Py_UNICODE_ISLOWER(ch) || 8732 Py_UNICODE_ISUPPER(ch) || 8733 Py_UNICODE_ISTITLE(ch)) 8734 previous_is_cased = 1; 8735 else 8736 previous_is_cased = 0; 8737 } 8738 return maxchar; 8739} 8740 8741PyObject * 8742PyUnicode_Join(PyObject *separator, PyObject *seq) 8743{ 8744 PyObject *sep = NULL; 8745 Py_ssize_t seplen = 1; 8746 PyObject *res = NULL; /* the result */ 8747 PyObject *fseq; /* PySequence_Fast(seq) */ 8748 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */ 8749 PyObject **items; 8750 PyObject *item; 8751 Py_ssize_t sz, i, res_offset; 8752 Py_UCS4 maxchar = 0; 8753 Py_UCS4 item_maxchar; 8754 8755 fseq = PySequence_Fast(seq, ""); 8756 if (fseq == NULL) { 8757 return NULL; 8758 } 8759 8760 /* NOTE: the following code can't call back into Python code, 8761 * so we are sure that fseq won't be mutated. 8762 */ 8763 8764 seqlen = PySequence_Fast_GET_SIZE(fseq); 8765 /* If empty sequence, return u"". */ 8766 if (seqlen == 0) { 8767 res = PyUnicode_New(0, 0); 8768 goto Done; 8769 } 8770 items = PySequence_Fast_ITEMS(fseq); 8771 /* If singleton sequence with an exact Unicode, return that. 
*/ 8772 if (seqlen == 1) { 8773 item = items[0]; 8774 if (PyUnicode_CheckExact(item)) { 8775 Py_INCREF(item); 8776 res = item; 8777 goto Done; 8778 } 8779 } 8780 else { 8781 /* Set up sep and seplen */ 8782 if (separator == NULL) { 8783 /* fall back to a blank space separator */ 8784 sep = PyUnicode_FromOrdinal(' '); 8785 if (!sep) 8786 goto onError; 8787 } 8788 else { 8789 if (!PyUnicode_Check(separator)) { 8790 PyErr_Format(PyExc_TypeError, 8791 "separator: expected str instance," 8792 " %.80s found", 8793 Py_TYPE(separator)->tp_name); 8794 goto onError; 8795 } 8796 if (PyUnicode_READY(separator) == -1) 8797 goto onError; 8798 sep = separator; 8799 seplen = PyUnicode_GET_LENGTH(separator); 8800 maxchar = PyUnicode_MAX_CHAR_VALUE(separator); 8801 /* inc refcount to keep this code path symetric with the 8802 above case of a blank separator */ 8803 Py_INCREF(sep); 8804 } 8805 } 8806 8807 /* There are at least two things to join, or else we have a subclass 8808 * of str in the sequence. 8809 * Do a pre-pass to figure out the total amount of space we'll 8810 * need (sz), and see whether all argument are strings. 8811 */ 8812 sz = 0; 8813 for (i = 0; i < seqlen; i++) { 8814 const Py_ssize_t old_sz = sz; 8815 item = items[i]; 8816 if (!PyUnicode_Check(item)) { 8817 PyErr_Format(PyExc_TypeError, 8818 "sequence item %zd: expected str instance," 8819 " %.80s found", 8820 i, Py_TYPE(item)->tp_name); 8821 goto onError; 8822 } 8823 if (PyUnicode_READY(item) == -1) 8824 goto onError; 8825 sz += PyUnicode_GET_LENGTH(item); 8826 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item); 8827 if (item_maxchar > maxchar) 8828 maxchar = item_maxchar; 8829 if (i != 0) 8830 sz += seplen; 8831 if (sz < old_sz || sz > PY_SSIZE_T_MAX) { 8832 PyErr_SetString(PyExc_OverflowError, 8833 "join() result is too long for a Python string"); 8834 goto onError; 8835 } 8836 } 8837 8838 res = PyUnicode_New(sz, maxchar); 8839 if (res == NULL) 8840 goto onError; 8841 8842 /* Catenate everything. 
*/ 8843 for (i = 0, res_offset = 0; i < seqlen; ++i) { 8844 Py_ssize_t itemlen; 8845 item = items[i]; 8846 itemlen = PyUnicode_GET_LENGTH(item); 8847 /* Copy item, and maybe the separator. */ 8848 if (i) { 8849 if (PyUnicode_CopyCharacters(res, res_offset, 8850 sep, 0, seplen) < 0) 8851 goto onError; 8852 res_offset += seplen; 8853 } 8854 if (PyUnicode_CopyCharacters(res, res_offset, 8855 item, 0, itemlen) < 0) 8856 goto onError; 8857 res_offset += itemlen; 8858 } 8859 assert(res_offset == PyUnicode_GET_LENGTH(res)); 8860 8861 Done: 8862 Py_DECREF(fseq); 8863 Py_XDECREF(sep); 8864 return res; 8865 8866 onError: 8867 Py_DECREF(fseq); 8868 Py_XDECREF(sep); 8869 Py_XDECREF(res); 8870 return NULL; 8871} 8872 8873#define FILL(kind, data, value, start, length) \ 8874 do { \ 8875 Py_ssize_t i_ = 0; \ 8876 assert(kind != PyUnicode_WCHAR_KIND); \ 8877 switch ((kind)) { \ 8878 case PyUnicode_1BYTE_KIND: { \ 8879 unsigned char * to_ = (unsigned char *)((data)) + (start); \ 8880 memset(to_, (unsigned char)value, length); \ 8881 break; \ 8882 } \ 8883 case PyUnicode_2BYTE_KIND: { \ 8884 Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \ 8885 for (; i_ < (length); ++i_, ++to_) *to_ = (value); \ 8886 break; \ 8887 } \ 8888 default: { \ 8889 Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \ 8890 for (; i_ < (length); ++i_, ++to_) *to_ = (value); \ 8891 break; \ 8892 } \ 8893 } \ 8894 } while (0) 8895 8896static PyUnicodeObject * 8897pad(PyUnicodeObject *self, 8898 Py_ssize_t left, 8899 Py_ssize_t right, 8900 Py_UCS4 fill) 8901{ 8902 PyObject *u; 8903 Py_UCS4 maxchar; 8904 int kind; 8905 void *data; 8906 8907 if (left < 0) 8908 left = 0; 8909 if (right < 0) 8910 right = 0; 8911 8912 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) { 8913 Py_INCREF(self); 8914 return self; 8915 } 8916 8917 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) || 8918 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) { 8919 PyErr_SetString(PyExc_OverflowError, "padded string is too 
long"); 8920 return NULL; 8921 } 8922 maxchar = PyUnicode_MAX_CHAR_VALUE(self); 8923 if (fill > maxchar) 8924 maxchar = fill; 8925 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar); 8926 if (!u) 8927 return NULL; 8928 8929 kind = PyUnicode_KIND(u); 8930 data = PyUnicode_DATA(u); 8931 if (left) 8932 FILL(kind, data, fill, 0, left); 8933 if (right) 8934 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right); 8935 if (PyUnicode_CopyCharacters(u, left, 8936 (PyObject*)self, 0, 8937 _PyUnicode_LENGTH(self)) < 0) 8938 { 8939 Py_DECREF(u); 8940 return NULL; 8941 } 8942 8943 return (PyUnicodeObject*)u; 8944} 8945#undef FILL 8946 8947PyObject * 8948PyUnicode_Splitlines(PyObject *string, int keepends) 8949{ 8950 PyObject *list; 8951 8952 string = PyUnicode_FromObject(string); 8953 if (string == NULL || PyUnicode_READY(string) == -1) 8954 return NULL; 8955 8956 switch(PyUnicode_KIND(string)) { 8957 case PyUnicode_1BYTE_KIND: 8958 list = ucs1lib_splitlines( 8959 (PyObject*) string, PyUnicode_1BYTE_DATA(string), 8960 PyUnicode_GET_LENGTH(string), keepends); 8961 break; 8962 case PyUnicode_2BYTE_KIND: 8963 list = ucs2lib_splitlines( 8964 (PyObject*) string, PyUnicode_2BYTE_DATA(string), 8965 PyUnicode_GET_LENGTH(string), keepends); 8966 break; 8967 case PyUnicode_4BYTE_KIND: 8968 list = ucs4lib_splitlines( 8969 (PyObject*) string, PyUnicode_4BYTE_DATA(string), 8970 PyUnicode_GET_LENGTH(string), keepends); 8971 break; 8972 default: 8973 assert(0); 8974 list = 0; 8975 } 8976 Py_DECREF(string); 8977 return list; 8978} 8979 8980static PyObject * 8981split(PyUnicodeObject *self, 8982 PyUnicodeObject *substring, 8983 Py_ssize_t maxcount) 8984{ 8985 int kind1, kind2, kind; 8986 void *buf1, *buf2; 8987 Py_ssize_t len1, len2; 8988 PyObject* out; 8989 8990 if (maxcount < 0) 8991 maxcount = PY_SSIZE_T_MAX; 8992 8993 if (PyUnicode_READY(self) == -1) 8994 return NULL; 8995 8996 if (substring == NULL) 8997 switch(PyUnicode_KIND(self)) { 8998 case PyUnicode_1BYTE_KIND: 
8999 return ucs1lib_split_whitespace( 9000 (PyObject*) self, PyUnicode_1BYTE_DATA(self), 9001 PyUnicode_GET_LENGTH(self), maxcount 9002 ); 9003 case PyUnicode_2BYTE_KIND: 9004 return ucs2lib_split_whitespace( 9005 (PyObject*) self, PyUnicode_2BYTE_DATA(self), 9006 PyUnicode_GET_LENGTH(self), maxcount 9007 ); 9008 case PyUnicode_4BYTE_KIND: 9009 return ucs4lib_split_whitespace( 9010 (PyObject*) self, PyUnicode_4BYTE_DATA(self), 9011 PyUnicode_GET_LENGTH(self), maxcount 9012 ); 9013 default: 9014 assert(0); 9015 return NULL; 9016 } 9017 9018 if (PyUnicode_READY(substring) == -1) 9019 return NULL; 9020 9021 kind1 = PyUnicode_KIND(self); 9022 kind2 = PyUnicode_KIND(substring); 9023 kind = kind1 > kind2 ? kind1 : kind2; 9024 buf1 = PyUnicode_DATA(self); 9025 buf2 = PyUnicode_DATA(substring); 9026 if (kind1 != kind) 9027 buf1 = _PyUnicode_AsKind((PyObject*)self, kind); 9028 if (!buf1) 9029 return NULL; 9030 if (kind2 != kind) 9031 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind); 9032 if (!buf2) { 9033 if (kind1 != kind) PyMem_Free(buf1); 9034 return NULL; 9035 } 9036 len1 = PyUnicode_GET_LENGTH(self); 9037 len2 = PyUnicode_GET_LENGTH(substring); 9038 9039 switch(kind) { 9040 case PyUnicode_1BYTE_KIND: 9041 out = ucs1lib_split( 9042 (PyObject*) self, buf1, len1, buf2, len2, maxcount); 9043 break; 9044 case PyUnicode_2BYTE_KIND: 9045 out = ucs2lib_split( 9046 (PyObject*) self, buf1, len1, buf2, len2, maxcount); 9047 break; 9048 case PyUnicode_4BYTE_KIND: 9049 out = ucs4lib_split( 9050 (PyObject*) self, buf1, len1, buf2, len2, maxcount); 9051 break; 9052 default: 9053 out = NULL; 9054 } 9055 if (kind1 != kind) 9056 PyMem_Free(buf1); 9057 if (kind2 != kind) 9058 PyMem_Free(buf2); 9059 return out; 9060} 9061 9062static PyObject * 9063rsplit(PyUnicodeObject *self, 9064 PyUnicodeObject *substring, 9065 Py_ssize_t maxcount) 9066{ 9067 int kind1, kind2, kind; 9068 void *buf1, *buf2; 9069 Py_ssize_t len1, len2; 9070 PyObject* out; 9071 9072 if (maxcount < 0) 9073 maxcount = 
PY_SSIZE_T_MAX; 9074 9075 if (PyUnicode_READY(self) == -1) 9076 return NULL; 9077 9078 if (substring == NULL) 9079 switch(PyUnicode_KIND(self)) { 9080 case PyUnicode_1BYTE_KIND: 9081 return ucs1lib_rsplit_whitespace( 9082 (PyObject*) self, PyUnicode_1BYTE_DATA(self), 9083 PyUnicode_GET_LENGTH(self), maxcount 9084 ); 9085 case PyUnicode_2BYTE_KIND: 9086 return ucs2lib_rsplit_whitespace( 9087 (PyObject*) self, PyUnicode_2BYTE_DATA(self), 9088 PyUnicode_GET_LENGTH(self), maxcount 9089 ); 9090 case PyUnicode_4BYTE_KIND: 9091 return ucs4lib_rsplit_whitespace( 9092 (PyObject*) self, PyUnicode_4BYTE_DATA(self), 9093 PyUnicode_GET_LENGTH(self), maxcount 9094 ); 9095 default: 9096 assert(0); 9097 return NULL; 9098 } 9099 9100 if (PyUnicode_READY(substring) == -1) 9101 return NULL; 9102 9103 kind1 = PyUnicode_KIND(self); 9104 kind2 = PyUnicode_KIND(substring); 9105 kind = kind1 > kind2 ? kind1 : kind2; 9106 buf1 = PyUnicode_DATA(self); 9107 buf2 = PyUnicode_DATA(substring); 9108 if (kind1 != kind) 9109 buf1 = _PyUnicode_AsKind((PyObject*)self, kind); 9110 if (!buf1) 9111 return NULL; 9112 if (kind2 != kind) 9113 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind); 9114 if (!buf2) { 9115 if (kind1 != kind) PyMem_Free(buf1); 9116 return NULL; 9117 } 9118 len1 = PyUnicode_GET_LENGTH(self); 9119 len2 = PyUnicode_GET_LENGTH(substring); 9120 9121 switch(kind) { 9122 case PyUnicode_1BYTE_KIND: 9123 out = ucs1lib_rsplit( 9124 (PyObject*) self, buf1, len1, buf2, len2, maxcount); 9125 break; 9126 case PyUnicode_2BYTE_KIND: 9127 out = ucs2lib_rsplit( 9128 (PyObject*) self, buf1, len1, buf2, len2, maxcount); 9129 break; 9130 case PyUnicode_4BYTE_KIND: 9131 out = ucs4lib_rsplit( 9132 (PyObject*) self, buf1, len1, buf2, len2, maxcount); 9133 break; 9134 default: 9135 out = NULL; 9136 } 9137 if (kind1 != kind) 9138 PyMem_Free(buf1); 9139 if (kind2 != kind) 9140 PyMem_Free(buf2); 9141 return out; 9142} 9143 9144static Py_ssize_t 9145anylib_find(int kind, void *buf1, Py_ssize_t len1, 9146 
void *buf2, Py_ssize_t len2, Py_ssize_t offset) 9147{ 9148 switch(kind) { 9149 case PyUnicode_1BYTE_KIND: 9150 return ucs1lib_find(buf1, len1, buf2, len2, offset); 9151 case PyUnicode_2BYTE_KIND: 9152 return ucs2lib_find(buf1, len1, buf2, len2, offset); 9153 case PyUnicode_4BYTE_KIND: 9154 return ucs4lib_find(buf1, len1, buf2, len2, offset); 9155 } 9156 assert(0); 9157 return -1; 9158} 9159 9160static Py_ssize_t 9161anylib_count(int kind, void* sbuf, Py_ssize_t slen, 9162 void *buf1, Py_ssize_t len1, Py_ssize_t maxcount) 9163{ 9164 switch(kind) { 9165 case PyUnicode_1BYTE_KIND: 9166 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount); 9167 case PyUnicode_2BYTE_KIND: 9168 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount); 9169 case PyUnicode_4BYTE_KIND: 9170 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount); 9171 } 9172 assert(0); 9173 return 0; 9174} 9175 9176static PyObject * 9177replace(PyObject *self, PyObject *str1, 9178 PyObject *str2, Py_ssize_t maxcount) 9179{ 9180 PyObject *u; 9181 char *sbuf = PyUnicode_DATA(self); 9182 char *buf1 = PyUnicode_DATA(str1); 9183 char *buf2 = PyUnicode_DATA(str2); 9184 int srelease = 0, release1 = 0, release2 = 0; 9185 int skind = PyUnicode_KIND(self); 9186 int kind1 = PyUnicode_KIND(str1); 9187 int kind2 = PyUnicode_KIND(str2); 9188 Py_ssize_t slen = PyUnicode_GET_LENGTH(self); 9189 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1); 9190 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2); 9191 9192 if (maxcount < 0) 9193 maxcount = PY_SSIZE_T_MAX; 9194 else if (maxcount == 0 || slen == 0) 9195 goto nothing; 9196 9197 if (skind < kind1) 9198 /* substring too wide to be present */ 9199 goto nothing; 9200 9201 if (len1 == len2) { 9202 Py_ssize_t i; 9203 /* same length */ 9204 if (len1 == 0) 9205 goto nothing; 9206 if (len1 == 1) { 9207 /* replace characters */ 9208 Py_UCS4 u1, u2, maxchar; 9209 int mayshrink, rkind; 9210 u1 = PyUnicode_READ_CHAR(str1, 0); 9211 if (!findchar(sbuf, PyUnicode_KIND(self), 9212 slen, u1, 1)) 9213 
goto nothing; 9214 u2 = PyUnicode_READ_CHAR(str2, 0); 9215 maxchar = PyUnicode_MAX_CHAR_VALUE(self); 9216 /* Replacing u1 with u2 may cause a maxchar reduction in the 9217 result string. */ 9218 mayshrink = maxchar > 127; 9219 if (u2 > maxchar) { 9220 maxchar = u2; 9221 mayshrink = 0; 9222 } 9223 u = PyUnicode_New(slen, maxchar); 9224 if (!u) 9225 goto error; 9226 if (PyUnicode_CopyCharacters(u, 0, 9227 (PyObject*)self, 0, slen) < 0) 9228 { 9229 Py_DECREF(u); 9230 return NULL; 9231 } 9232 rkind = PyUnicode_KIND(u); 9233 for (i = 0; i < PyUnicode_GET_LENGTH(u); i++) 9234 if (PyUnicode_READ(rkind, PyUnicode_DATA(u), i) == u1) { 9235 if (--maxcount < 0) 9236 break; 9237 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), i, u2); 9238 } 9239 if (mayshrink) { 9240 PyObject *tmp = u; 9241 u = PyUnicode_FromKindAndData(rkind, PyUnicode_DATA(tmp), 9242 PyUnicode_GET_LENGTH(tmp)); 9243 Py_DECREF(tmp); 9244 } 9245 } else { 9246 int rkind = skind; 9247 char *res; 9248 if (kind1 < rkind) { 9249 /* widen substring */ 9250 buf1 = _PyUnicode_AsKind(str1, rkind); 9251 if (!buf1) goto error; 9252 release1 = 1; 9253 } 9254 i = anylib_find(rkind, sbuf, slen, buf1, len1, 0); 9255 if (i < 0) 9256 goto nothing; 9257 if (rkind > kind2) { 9258 /* widen replacement */ 9259 buf2 = _PyUnicode_AsKind(str2, rkind); 9260 if (!buf2) goto error; 9261 release2 = 1; 9262 } 9263 else if (rkind < kind2) { 9264 /* widen self and buf1 */ 9265 rkind = kind2; 9266 if (release1) PyMem_Free(buf1); 9267 sbuf = _PyUnicode_AsKind(self, rkind); 9268 if (!sbuf) goto error; 9269 srelease = 1; 9270 buf1 = _PyUnicode_AsKind(str1, rkind); 9271 if (!buf1) goto error; 9272 release1 = 1; 9273 } 9274 res = PyMem_Malloc(PyUnicode_KIND_SIZE(rkind, slen)); 9275 if (!res) { 9276 PyErr_NoMemory(); 9277 goto error; 9278 } 9279 memcpy(res, sbuf, PyUnicode_KIND_SIZE(rkind, slen)); 9280 /* change everything in-place, starting with this one */ 9281 memcpy(res + PyUnicode_KIND_SIZE(rkind, i), 9282 buf2, 9283 PyUnicode_KIND_SIZE(rkind, 
len2)); 9284 i += len1; 9285 9286 while ( --maxcount > 0) { 9287 i = anylib_find(rkind, sbuf+PyUnicode_KIND_SIZE(rkind, i), 9288 slen-i, 9289 buf1, len1, i); 9290 if (i == -1) 9291 break; 9292 memcpy(res + PyUnicode_KIND_SIZE(rkind, i), 9293 buf2, 9294 PyUnicode_KIND_SIZE(rkind, len2)); 9295 i += len1; 9296 } 9297 9298 u = PyUnicode_FromKindAndData(rkind, res, slen); 9299 PyMem_Free(res); 9300 if (!u) goto error; 9301 } 9302 } else { 9303 9304 Py_ssize_t n, i, j, ires; 9305 Py_ssize_t product, new_size; 9306 int rkind = skind; 9307 char *res; 9308 9309 if (kind1 < rkind) { 9310 buf1 = _PyUnicode_AsKind(str1, rkind); 9311 if (!buf1) goto error; 9312 release1 = 1; 9313 } 9314 n = anylib_count(rkind, sbuf, slen, buf1, len1, maxcount); 9315 if (n == 0) 9316 goto nothing; 9317 if (kind2 < rkind) { 9318 buf2 = _PyUnicode_AsKind(str2, rkind); 9319 if (!buf2) goto error; 9320 release2 = 1; 9321 } 9322 else if (kind2 > rkind) { 9323 rkind = kind2; 9324 sbuf = _PyUnicode_AsKind(self, rkind); 9325 if (!sbuf) goto error; 9326 srelease = 1; 9327 if (release1) PyMem_Free(buf1); 9328 buf1 = _PyUnicode_AsKind(str1, rkind); 9329 if (!buf1) goto error; 9330 release1 = 1; 9331 } 9332 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) - 9333 PyUnicode_GET_LENGTH(str1))); */ 9334 product = n * (len2-len1); 9335 if ((product / (len2-len1)) != n) { 9336 PyErr_SetString(PyExc_OverflowError, 9337 "replace string is too long"); 9338 goto error; 9339 } 9340 new_size = slen + product; 9341 if (new_size < 0 || new_size > (PY_SSIZE_T_MAX >> (rkind-1))) { 9342 PyErr_SetString(PyExc_OverflowError, 9343 "replace string is too long"); 9344 goto error; 9345 } 9346 res = PyMem_Malloc(PyUnicode_KIND_SIZE(rkind, new_size)); 9347 if (!res) 9348 goto error; 9349 ires = i = 0; 9350 if (len1 > 0) { 9351 while (n-- > 0) { 9352 /* look for next match */ 9353 j = anylib_find(rkind, 9354 sbuf + PyUnicode_KIND_SIZE(rkind, i), 9355 slen-i, buf1, len1, i); 9356 if (j == -1) 9357 break; 
9358 else if (j > i) { 9359 /* copy unchanged part [i:j] */ 9360 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires), 9361 sbuf + PyUnicode_KIND_SIZE(rkind, i), 9362 PyUnicode_KIND_SIZE(rkind, j-i)); 9363 ires += j - i; 9364 } 9365 /* copy substitution string */ 9366 if (len2 > 0) { 9367 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires), 9368 buf2, 9369 PyUnicode_KIND_SIZE(rkind, len2)); 9370 ires += len2; 9371 } 9372 i = j + len1; 9373 } 9374 if (i < slen) 9375 /* copy tail [i:] */ 9376 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires), 9377 sbuf + PyUnicode_KIND_SIZE(rkind, i), 9378 PyUnicode_KIND_SIZE(rkind, slen-i)); 9379 } else { 9380 /* interleave */ 9381 while (n > 0) { 9382 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires), 9383 buf2, 9384 PyUnicode_KIND_SIZE(rkind, len2)); 9385 ires += len2; 9386 if (--n <= 0) 9387 break; 9388 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires), 9389 sbuf + PyUnicode_KIND_SIZE(rkind, i), 9390 PyUnicode_KIND_SIZE(rkind, 1)); 9391 ires++; 9392 i++; 9393 } 9394 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires), 9395 sbuf + PyUnicode_KIND_SIZE(rkind, i), 9396 PyUnicode_KIND_SIZE(rkind, slen-i)); 9397 } 9398 u = PyUnicode_FromKindAndData(rkind, res, new_size); 9399 PyMem_Free(res); 9400 } 9401 if (srelease) 9402 PyMem_FREE(sbuf); 9403 if (release1) 9404 PyMem_FREE(buf1); 9405 if (release2) 9406 PyMem_FREE(buf2); 9407 return u; 9408 9409 nothing: 9410 /* nothing to replace; return original string (when possible) */ 9411 if (srelease) 9412 PyMem_FREE(sbuf); 9413 if (release1) 9414 PyMem_FREE(buf1); 9415 if (release2) 9416 PyMem_FREE(buf2); 9417 if (PyUnicode_CheckExact(self)) { 9418 Py_INCREF(self); 9419 return (PyObject *) self; 9420 } 9421 return PyUnicode_Copy(self); 9422 error: 9423 if (srelease && sbuf) 9424 PyMem_FREE(sbuf); 9425 if (release1 && buf1) 9426 PyMem_FREE(buf1); 9427 if (release2 && buf2) 9428 PyMem_FREE(buf2); 9429 return NULL; 9430} 9431 9432/* --- Unicode Object Methods --------------------------------------------- */ 9433 
9434PyDoc_STRVAR(title__doc__, 9435 "S.title() -> str\n\ 9436\n\ 9437Return a titlecased version of S, i.e. words start with title case\n\ 9438characters, all remaining cased characters have lower case."); 9439 9440static PyObject* 9441unicode_title(PyUnicodeObject *self) 9442{ 9443 return fixup(self, fixtitle); 9444} 9445 9446PyDoc_STRVAR(capitalize__doc__, 9447 "S.capitalize() -> str\n\ 9448\n\ 9449Return a capitalized version of S, i.e. make the first character\n\ 9450have upper case and the rest lower case."); 9451 9452static PyObject* 9453unicode_capitalize(PyUnicodeObject *self) 9454{ 9455 return fixup(self, fixcapitalize); 9456} 9457 9458#if 0 9459PyDoc_STRVAR(capwords__doc__, 9460 "S.capwords() -> str\n\ 9461\n\ 9462Apply .capitalize() to all words in S and return the result with\n\ 9463normalized whitespace (all whitespace strings are replaced by ' ')."); 9464 9465static PyObject* 9466unicode_capwords(PyUnicodeObject *self) 9467{ 9468 PyObject *list; 9469 PyObject *item; 9470 Py_ssize_t i; 9471 9472 /* Split into words */ 9473 list = split(self, NULL, -1); 9474 if (!list) 9475 return NULL; 9476 9477 /* Capitalize each word */ 9478 for (i = 0; i < PyList_GET_SIZE(list); i++) { 9479 item = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, i), 9480 fixcapitalize); 9481 if (item == NULL) 9482 goto onError; 9483 Py_DECREF(PyList_GET_ITEM(list, i)); 9484 PyList_SET_ITEM(list, i, item); 9485 } 9486 9487 /* Join the words to form a new string */ 9488 item = PyUnicode_Join(NULL, list); 9489 9490 onError: 9491 Py_DECREF(list); 9492 return (PyObject *)item; 9493} 9494#endif 9495 9496/* Argument converter. 
Coerces to a single unicode character */ 9497 9498static int 9499convert_uc(PyObject *obj, void *addr) 9500{ 9501 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr; 9502 PyObject *uniobj; 9503 9504 uniobj = PyUnicode_FromObject(obj); 9505 if (uniobj == NULL) { 9506 PyErr_SetString(PyExc_TypeError, 9507 "The fill character cannot be converted to Unicode"); 9508 return 0; 9509 } 9510 if (PyUnicode_GET_LENGTH(uniobj) != 1) { 9511 PyErr_SetString(PyExc_TypeError, 9512 "The fill character must be exactly one character long"); 9513 Py_DECREF(uniobj); 9514 return 0; 9515 } 9516 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0); 9517 Py_DECREF(uniobj); 9518 return 1; 9519} 9520 9521PyDoc_STRVAR(center__doc__, 9522 "S.center(width[, fillchar]) -> str\n\ 9523\n\ 9524Return S centered in a string of length width. Padding is\n\ 9525done using the specified fill character (default is a space)"); 9526 9527static PyObject * 9528unicode_center(PyUnicodeObject *self, PyObject *args) 9529{ 9530 Py_ssize_t marg, left; 9531 Py_ssize_t width; 9532 Py_UCS4 fillchar = ' '; 9533 9534 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar)) 9535 return NULL; 9536 9537 if (PyUnicode_READY(self) == -1) 9538 return NULL; 9539 9540 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) { 9541 Py_INCREF(self); 9542 return (PyObject*) self; 9543 } 9544 9545 marg = width - _PyUnicode_LENGTH(self); 9546 left = marg / 2 + (marg & width & 1); 9547 9548 return (PyObject*) pad(self, left, marg - left, fillchar); 9549} 9550 9551#if 0 9552 9553/* This code should go into some future Unicode collation support 9554 module. The basic comparison should compare ordinals on a naive 9555 basis (this is what Java does and thus Jython too). 
*/ 9556 9557/* speedy UTF-16 code point order comparison */ 9558/* gleaned from: */ 9559/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */ 9560 9561static short utf16Fixup[32] = 9562{ 9563 0, 0, 0, 0, 0, 0, 0, 0, 9564 0, 0, 0, 0, 0, 0, 0, 0, 9565 0, 0, 0, 0, 0, 0, 0, 0, 9566 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800 9567}; 9568 9569static int 9570unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2) 9571{ 9572 Py_ssize_t len1, len2; 9573 9574 Py_UNICODE *s1 = str1->str; 9575 Py_UNICODE *s2 = str2->str; 9576 9577 len1 = str1->_base._base.length; 9578 len2 = str2->_base._base.length; 9579 9580 while (len1 > 0 && len2 > 0) { 9581 Py_UNICODE c1, c2; 9582 9583 c1 = *s1++; 9584 c2 = *s2++; 9585 9586 if (c1 > (1<<11) * 26) 9587 c1 += utf16Fixup[c1>>11]; 9588 if (c2 > (1<<11) * 26) 9589 c2 += utf16Fixup[c2>>11]; 9590 /* now c1 and c2 are in UTF-32-compatible order */ 9591 9592 if (c1 != c2) 9593 return (c1 < c2) ? -1 : 1; 9594 9595 len1--; len2--; 9596 } 9597 9598 return (len1 < len2) ? -1 : (len1 != len2); 9599} 9600 9601#else 9602 9603/* This function assumes that str1 and str2 are readied by the caller. */ 9604 9605static int 9606unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2) 9607{ 9608 int kind1, kind2; 9609 void *data1, *data2; 9610 Py_ssize_t len1, len2, i; 9611 9612 kind1 = PyUnicode_KIND(str1); 9613 kind2 = PyUnicode_KIND(str2); 9614 data1 = PyUnicode_DATA(str1); 9615 data2 = PyUnicode_DATA(str2); 9616 len1 = PyUnicode_GET_LENGTH(str1); 9617 len2 = PyUnicode_GET_LENGTH(str2); 9618 9619 for (i = 0; i < len1 && i < len2; ++i) { 9620 Py_UCS4 c1, c2; 9621 c1 = PyUnicode_READ(kind1, data1, i); 9622 c2 = PyUnicode_READ(kind2, data2, i); 9623 9624 if (c1 != c2) 9625 return (c1 < c2) ? -1 : 1; 9626 } 9627 9628 return (len1 < len2) ? 
-1 : (len1 != len2); 9629} 9630 9631#endif 9632 9633int 9634PyUnicode_Compare(PyObject *left, PyObject *right) 9635{ 9636 if (PyUnicode_Check(left) && PyUnicode_Check(right)) { 9637 if (PyUnicode_READY(left) == -1 || 9638 PyUnicode_READY(right) == -1) 9639 return -1; 9640 return unicode_compare((PyUnicodeObject *)left, 9641 (PyUnicodeObject *)right); 9642 } 9643 PyErr_Format(PyExc_TypeError, 9644 "Can't compare %.100s and %.100s", 9645 left->ob_type->tp_name, 9646 right->ob_type->tp_name); 9647 return -1; 9648} 9649 9650int 9651PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str) 9652{ 9653 Py_ssize_t i; 9654 int kind; 9655 void *data; 9656 Py_UCS4 chr; 9657 9658 assert(_PyUnicode_CHECK(uni)); 9659 if (PyUnicode_READY(uni) == -1) 9660 return -1; 9661 kind = PyUnicode_KIND(uni); 9662 data = PyUnicode_DATA(uni); 9663 /* Compare Unicode string and source character set string */ 9664 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++) 9665 if (chr != str[i]) 9666 return (chr < (unsigned char)(str[i])) ? -1 : 1; 9667 /* This check keeps Python strings that end in '\0' from comparing equal 9668 to C strings identical up to that point. */ 9669 if (PyUnicode_GET_LENGTH(uni) != i || chr) 9670 return 1; /* uni is longer */ 9671 if (str[i]) 9672 return -1; /* str is longer */ 9673 return 0; 9674} 9675 9676 9677#define TEST_COND(cond) \ 9678 ((cond) ? 
Py_True : Py_False) 9679 9680PyObject * 9681PyUnicode_RichCompare(PyObject *left, PyObject *right, int op) 9682{ 9683 int result; 9684 9685 if (PyUnicode_Check(left) && PyUnicode_Check(right)) { 9686 PyObject *v; 9687 if (PyUnicode_READY(left) == -1 || 9688 PyUnicode_READY(right) == -1) 9689 return NULL; 9690 if (PyUnicode_GET_LENGTH(left) != PyUnicode_GET_LENGTH(right) || 9691 PyUnicode_KIND(left) != PyUnicode_KIND(right)) { 9692 if (op == Py_EQ) { 9693 Py_INCREF(Py_False); 9694 return Py_False; 9695 } 9696 if (op == Py_NE) { 9697 Py_INCREF(Py_True); 9698 return Py_True; 9699 } 9700 } 9701 if (left == right) 9702 result = 0; 9703 else 9704 result = unicode_compare((PyUnicodeObject *)left, 9705 (PyUnicodeObject *)right); 9706 9707 /* Convert the return value to a Boolean */ 9708 switch (op) { 9709 case Py_EQ: 9710 v = TEST_COND(result == 0); 9711 break; 9712 case Py_NE: 9713 v = TEST_COND(result != 0); 9714 break; 9715 case Py_LE: 9716 v = TEST_COND(result <= 0); 9717 break; 9718 case Py_GE: 9719 v = TEST_COND(result >= 0); 9720 break; 9721 case Py_LT: 9722 v = TEST_COND(result == -1); 9723 break; 9724 case Py_GT: 9725 v = TEST_COND(result == 1); 9726 break; 9727 default: 9728 PyErr_BadArgument(); 9729 return NULL; 9730 } 9731 Py_INCREF(v); 9732 return v; 9733 } 9734 9735 Py_RETURN_NOTIMPLEMENTED; 9736} 9737 9738int 9739PyUnicode_Contains(PyObject *container, PyObject *element) 9740{ 9741 PyObject *str, *sub; 9742 int kind1, kind2, kind; 9743 void *buf1, *buf2; 9744 Py_ssize_t len1, len2; 9745 int result; 9746 9747 /* Coerce the two arguments */ 9748 sub = PyUnicode_FromObject(element); 9749 if (!sub) { 9750 PyErr_Format(PyExc_TypeError, 9751 "'in <string>' requires string as left operand, not %s", 9752 element->ob_type->tp_name); 9753 return -1; 9754 } 9755 if (PyUnicode_READY(sub) == -1) 9756 return -1; 9757 9758 str = PyUnicode_FromObject(container); 9759 if (!str || PyUnicode_READY(str) == -1) { 9760 Py_DECREF(sub); 9761 return -1; 9762 } 9763 9764 kind1 = 
PyUnicode_KIND(str); 9765 kind2 = PyUnicode_KIND(sub); 9766 kind = kind1 > kind2 ? kind1 : kind2; 9767 buf1 = PyUnicode_DATA(str); 9768 buf2 = PyUnicode_DATA(sub); 9769 if (kind1 != kind) 9770 buf1 = _PyUnicode_AsKind((PyObject*)str, kind); 9771 if (!buf1) { 9772 Py_DECREF(sub); 9773 return -1; 9774 } 9775 if (kind2 != kind) 9776 buf2 = _PyUnicode_AsKind((PyObject*)sub, kind); 9777 if (!buf2) { 9778 Py_DECREF(sub); 9779 if (kind1 != kind) PyMem_Free(buf1); 9780 return -1; 9781 } 9782 len1 = PyUnicode_GET_LENGTH(str); 9783 len2 = PyUnicode_GET_LENGTH(sub); 9784 9785 switch(kind) { 9786 case PyUnicode_1BYTE_KIND: 9787 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1; 9788 break; 9789 case PyUnicode_2BYTE_KIND: 9790 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1; 9791 break; 9792 case PyUnicode_4BYTE_KIND: 9793 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1; 9794 break; 9795 default: 9796 result = -1; 9797 assert(0); 9798 } 9799 9800 Py_DECREF(str); 9801 Py_DECREF(sub); 9802 9803 if (kind1 != kind) 9804 PyMem_Free(buf1); 9805 if (kind2 != kind) 9806 PyMem_Free(buf2); 9807 9808 return result; 9809} 9810 9811/* Concat to string or Unicode object giving a new Unicode object. 
*/ 9812 9813PyObject * 9814PyUnicode_Concat(PyObject *left, PyObject *right) 9815{ 9816 PyObject *u = NULL, *v = NULL, *w; 9817 Py_UCS4 maxchar; 9818 9819 /* Coerce the two arguments */ 9820 u = PyUnicode_FromObject(left); 9821 if (u == NULL) 9822 goto onError; 9823 v = PyUnicode_FromObject(right); 9824 if (v == NULL) 9825 goto onError; 9826 9827 /* Shortcuts */ 9828 if (v == unicode_empty) { 9829 Py_DECREF(v); 9830 return u; 9831 } 9832 if (u == unicode_empty) { 9833 Py_DECREF(u); 9834 return v; 9835 } 9836 9837 maxchar = PyUnicode_MAX_CHAR_VALUE(u); 9838 maxchar = Py_MAX(maxchar, PyUnicode_MAX_CHAR_VALUE(v)); 9839 9840 /* Concat the two Unicode strings */ 9841 w = PyUnicode_New( 9842 PyUnicode_GET_LENGTH(u) + PyUnicode_GET_LENGTH(v), 9843 maxchar); 9844 if (w == NULL) 9845 goto onError; 9846 if (PyUnicode_CopyCharacters(w, 0, u, 0, PyUnicode_GET_LENGTH(u)) < 0) 9847 goto onError; 9848 if (PyUnicode_CopyCharacters(w, PyUnicode_GET_LENGTH(u), 9849 v, 0, 9850 PyUnicode_GET_LENGTH(v)) < 0) 9851 goto onError; 9852 Py_DECREF(u); 9853 Py_DECREF(v); 9854 return w; 9855 9856 onError: 9857 Py_XDECREF(u); 9858 Py_XDECREF(v); 9859 return NULL; 9860} 9861 9862void 9863PyUnicode_Append(PyObject **p_left, PyObject *right) 9864{ 9865 PyObject *left, *res; 9866 9867 if (p_left == NULL) { 9868 if (!PyErr_Occurred()) 9869 PyErr_BadInternalCall(); 9870 return; 9871 } 9872 left = *p_left; 9873 if (right == NULL || !PyUnicode_Check(left)) { 9874 if (!PyErr_Occurred()) 9875 PyErr_BadInternalCall(); 9876 goto error; 9877 } 9878 9879 if (PyUnicode_CheckExact(left) && left != unicode_empty 9880 && PyUnicode_CheckExact(right) && right != unicode_empty 9881 && unicode_resizable(left) 9882 && (_PyUnicode_KIND(right) <= _PyUnicode_KIND(left) 9883 || _PyUnicode_WSTR(left) != NULL)) 9884 { 9885 Py_ssize_t u_len, v_len, new_len, copied; 9886 9887 /* FIXME: don't make wstr string ready */ 9888 if (PyUnicode_READY(left)) 9889 goto error; 9890 if (PyUnicode_READY(right)) 9891 goto error; 9892 9893 
/* FIXME: support ascii+latin1, PyASCIIObject => PyCompactUnicodeObject */ 9894 if (PyUnicode_MAX_CHAR_VALUE(right) <= PyUnicode_MAX_CHAR_VALUE(left)) 9895 { 9896 u_len = PyUnicode_GET_LENGTH(left); 9897 v_len = PyUnicode_GET_LENGTH(right); 9898 if (u_len > PY_SSIZE_T_MAX - v_len) { 9899 PyErr_SetString(PyExc_OverflowError, 9900 "strings are too large to concat"); 9901 goto error; 9902 } 9903 new_len = u_len + v_len; 9904 9905 /* Now we own the last reference to 'left', so we can resize it 9906 * in-place. 9907 */ 9908 if (unicode_resize(&left, new_len) != 0) { 9909 /* XXX if _PyUnicode_Resize() fails, 'left' has been 9910 * deallocated so it cannot be put back into 9911 * 'variable'. The MemoryError is raised when there 9912 * is no value in 'variable', which might (very 9913 * remotely) be a cause of incompatibilities. 9914 */ 9915 goto error; 9916 } 9917 /* copy 'right' into the newly allocated area of 'left' */ 9918 copied = PyUnicode_CopyCharacters(left, u_len, 9919 right, 0, 9920 v_len); 9921 assert(0 <= copied); 9922 *p_left = left; 9923 return; 9924 } 9925 } 9926 9927 res = PyUnicode_Concat(left, right); 9928 if (res == NULL) 9929 goto error; 9930 Py_DECREF(left); 9931 *p_left = res; 9932 return; 9933 9934error: 9935 Py_DECREF(*p_left); 9936 *p_left = NULL; 9937} 9938 9939void 9940PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right) 9941{ 9942 PyUnicode_Append(pleft, right); 9943 Py_XDECREF(right); 9944} 9945 9946PyDoc_STRVAR(count__doc__, 9947 "S.count(sub[, start[, end]]) -> int\n\ 9948\n\ 9949Return the number of non-overlapping occurrences of substring sub in\n\ 9950string S[start:end]. 
Optional arguments start and end are\n\ 9951interpreted as in slice notation."); 9952 9953static PyObject * 9954unicode_count(PyUnicodeObject *self, PyObject *args) 9955{ 9956 PyUnicodeObject *substring; 9957 Py_ssize_t start = 0; 9958 Py_ssize_t end = PY_SSIZE_T_MAX; 9959 PyObject *result; 9960 int kind1, kind2, kind; 9961 void *buf1, *buf2; 9962 Py_ssize_t len1, len2, iresult; 9963 9964 if (!stringlib_parse_args_finds_unicode("count", args, &substring, 9965 &start, &end)) 9966 return NULL; 9967 9968 kind1 = PyUnicode_KIND(self); 9969 kind2 = PyUnicode_KIND(substring); 9970 kind = kind1 > kind2 ? kind1 : kind2; 9971 buf1 = PyUnicode_DATA(self); 9972 buf2 = PyUnicode_DATA(substring); 9973 if (kind1 != kind) 9974 buf1 = _PyUnicode_AsKind((PyObject*)self, kind); 9975 if (!buf1) { 9976 Py_DECREF(substring); 9977 return NULL; 9978 } 9979 if (kind2 != kind) 9980 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind); 9981 if (!buf2) { 9982 Py_DECREF(substring); 9983 if (kind1 != kind) PyMem_Free(buf1); 9984 return NULL; 9985 } 9986 len1 = PyUnicode_GET_LENGTH(self); 9987 len2 = PyUnicode_GET_LENGTH(substring); 9988 9989 ADJUST_INDICES(start, end, len1); 9990 switch(kind) { 9991 case PyUnicode_1BYTE_KIND: 9992 iresult = ucs1lib_count( 9993 ((Py_UCS1*)buf1) + start, end - start, 9994 buf2, len2, PY_SSIZE_T_MAX 9995 ); 9996 break; 9997 case PyUnicode_2BYTE_KIND: 9998 iresult = ucs2lib_count( 9999 ((Py_UCS2*)buf1) + start, end - start, 10000 buf2, len2, PY_SSIZE_T_MAX 10001 ); 10002 break; 10003 case PyUnicode_4BYTE_KIND: 10004 iresult = ucs4lib_count( 10005 ((Py_UCS4*)buf1) + start, end - start, 10006 buf2, len2, PY_SSIZE_T_MAX 10007 ); 10008 break; 10009 default: 10010 assert(0); iresult = 0; 10011 } 10012 10013 result = PyLong_FromSsize_t(iresult); 10014 10015 if (kind1 != kind) 10016 PyMem_Free(buf1); 10017 if (kind2 != kind) 10018 PyMem_Free(buf2); 10019 10020 Py_DECREF(substring); 10021 10022 return result; 10023} 10024 10025PyDoc_STRVAR(encode__doc__, 10026 
"S.encode(encoding='utf-8', errors='strict') -> bytes\n\ 10027\n\ 10028Encode S using the codec registered for encoding. Default encoding\n\ 10029is 'utf-8'. errors may be given to set a different error\n\ 10030handling scheme. Default is 'strict' meaning that encoding errors raise\n\ 10031a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\ 10032'xmlcharrefreplace' as well as any other name registered with\n\ 10033codecs.register_error that can handle UnicodeEncodeErrors."); 10034 10035static PyObject * 10036unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs) 10037{ 10038 static char *kwlist[] = {"encoding", "errors", 0}; 10039 char *encoding = NULL; 10040 char *errors = NULL; 10041 10042 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode", 10043 kwlist, &encoding, &errors)) 10044 return NULL; 10045 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors); 10046} 10047 10048PyDoc_STRVAR(expandtabs__doc__, 10049 "S.expandtabs([tabsize]) -> str\n\ 10050\n\ 10051Return a copy of S where all tab characters are expanded using spaces.\n\ 10052If tabsize is not given, a tab size of 8 characters is assumed."); 10053 10054static PyObject* 10055unicode_expandtabs(PyUnicodeObject *self, PyObject *args) 10056{ 10057 Py_UNICODE *e; 10058 Py_UNICODE *p; 10059 Py_UNICODE *q; 10060 Py_UNICODE *qe; 10061 Py_ssize_t i, j, incr, wstr_length; 10062 PyUnicodeObject *u; 10063 int tabsize = 8; 10064 10065 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize)) 10066 return NULL; 10067 10068 if (PyUnicode_AsUnicodeAndSize((PyObject *)self, &wstr_length) == NULL) 10069 return NULL; 10070 10071 /* First pass: determine size of output string */ 10072 i = 0; /* chars up to and including most recent \n or \r */ 10073 j = 0; /* chars since most recent \n or \r (use in tab calculations) */ 10074 e = _PyUnicode_WSTR(self) + wstr_length; /* end of input */ 10075 for (p = _PyUnicode_WSTR(self); p < e; p++) 10076 if (*p == '\t') { 10077 
if (tabsize > 0) { 10078 incr = tabsize - (j % tabsize); /* cannot overflow */ 10079 if (j > PY_SSIZE_T_MAX - incr) 10080 goto overflow1; 10081 j += incr; 10082 } 10083 } 10084 else { 10085 if (j > PY_SSIZE_T_MAX - 1) 10086 goto overflow1; 10087 j++; 10088 if (*p == '\n' || *p == '\r') { 10089 if (i > PY_SSIZE_T_MAX - j) 10090 goto overflow1; 10091 i += j; 10092 j = 0; 10093 } 10094 } 10095 10096 if (i > PY_SSIZE_T_MAX - j) 10097 goto overflow1; 10098 10099 /* Second pass: create output string and fill it */ 10100 u = _PyUnicode_New(i + j); 10101 if (!u) 10102 return NULL; 10103 10104 j = 0; /* same as in first pass */ 10105 q = _PyUnicode_WSTR(u); /* next output char */ 10106 qe = _PyUnicode_WSTR(u) + PyUnicode_GET_SIZE(u); /* end of output */ 10107 10108 for (p = _PyUnicode_WSTR(self); p < e; p++) 10109 if (*p == '\t') { 10110 if (tabsize > 0) { 10111 i = tabsize - (j % tabsize); 10112 j += i; 10113 while (i--) { 10114 if (q >= qe) 10115 goto overflow2; 10116 *q++ = ' '; 10117 } 10118 } 10119 } 10120 else { 10121 if (q >= qe) 10122 goto overflow2; 10123 *q++ = *p; 10124 j++; 10125 if (*p == '\n' || *p == '\r') 10126 j = 0; 10127 } 10128 10129 if (PyUnicode_READY(u) == -1) { 10130 Py_DECREF(u); 10131 return NULL; 10132 } 10133 return (PyObject*) u; 10134 10135 overflow2: 10136 Py_DECREF(u); 10137 overflow1: 10138 PyErr_SetString(PyExc_OverflowError, "new string is too long"); 10139 return NULL; 10140} 10141 10142PyDoc_STRVAR(find__doc__, 10143 "S.find(sub[, start[, end]]) -> int\n\ 10144\n\ 10145Return the lowest index in S where substring sub is found,\n\ 10146such that sub is contained within S[start:end]. 
Optional\n\ 10147arguments start and end are interpreted as in slice notation.\n\ 10148\n\ 10149Return -1 on failure."); 10150 10151static PyObject * 10152unicode_find(PyObject *self, PyObject *args) 10153{ 10154 PyUnicodeObject *substring; 10155 Py_ssize_t start; 10156 Py_ssize_t end; 10157 Py_ssize_t result; 10158 10159 if (!stringlib_parse_args_finds_unicode("find", args, &substring, 10160 &start, &end)) 10161 return NULL; 10162 10163 if (PyUnicode_READY(self) == -1) 10164 return NULL; 10165 if (PyUnicode_READY(substring) == -1) 10166 return NULL; 10167 10168 result = any_find_slice( 10169 ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice, 10170 self, (PyObject*)substring, start, end 10171 ); 10172 10173 Py_DECREF(substring); 10174 10175 if (result == -2) 10176 return NULL; 10177 10178 return PyLong_FromSsize_t(result); 10179} 10180 10181static PyObject * 10182unicode_getitem(PyObject *self, Py_ssize_t index) 10183{ 10184 Py_UCS4 ch = PyUnicode_ReadChar(self, index); 10185 if (ch == (Py_UCS4)-1) 10186 return NULL; 10187 return PyUnicode_FromOrdinal(ch); 10188} 10189 10190/* Believe it or not, this produces the same value for ASCII strings 10191 as bytes_hash(). */ 10192static Py_hash_t 10193unicode_hash(PyUnicodeObject *self) 10194{ 10195 Py_ssize_t len; 10196 Py_uhash_t x; 10197 10198 if (_PyUnicode_HASH(self) != -1) 10199 return _PyUnicode_HASH(self); 10200 if (PyUnicode_READY(self) == -1) 10201 return -1; 10202 len = PyUnicode_GET_LENGTH(self); 10203 10204 /* The hash function as a macro, gets expanded three times below. 
*/ 10205#define HASH(P) \ 10206 x = (Py_uhash_t)*P << 7; \ 10207 while (--len >= 0) \ 10208 x = (1000003*x) ^ (Py_uhash_t)*P++; 10209 10210 switch (PyUnicode_KIND(self)) { 10211 case PyUnicode_1BYTE_KIND: { 10212 const unsigned char *c = PyUnicode_1BYTE_DATA(self); 10213 HASH(c); 10214 break; 10215 } 10216 case PyUnicode_2BYTE_KIND: { 10217 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self); 10218 HASH(s); 10219 break; 10220 } 10221 default: { 10222 Py_UCS4 *l; 10223 assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND && 10224 "Impossible switch case in unicode_hash"); 10225 l = PyUnicode_4BYTE_DATA(self); 10226 HASH(l); 10227 break; 10228 } 10229 } 10230 x ^= (Py_uhash_t)PyUnicode_GET_LENGTH(self); 10231 10232 if (x == -1) 10233 x = -2; 10234 _PyUnicode_HASH(self) = x; 10235 return x; 10236} 10237#undef HASH 10238 10239PyDoc_STRVAR(index__doc__, 10240 "S.index(sub[, start[, end]]) -> int\n\ 10241\n\ 10242Like S.find() but raise ValueError when the substring is not found."); 10243 10244static PyObject * 10245unicode_index(PyObject *self, PyObject *args) 10246{ 10247 Py_ssize_t result; 10248 PyUnicodeObject *substring; 10249 Py_ssize_t start; 10250 Py_ssize_t end; 10251 10252 if (!stringlib_parse_args_finds_unicode("index", args, &substring, 10253 &start, &end)) 10254 return NULL; 10255 10256 if (PyUnicode_READY(self) == -1) 10257 return NULL; 10258 if (PyUnicode_READY(substring) == -1) 10259 return NULL; 10260 10261 result = any_find_slice( 10262 ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice, 10263 self, (PyObject*)substring, start, end 10264 ); 10265 10266 Py_DECREF(substring); 10267 10268 if (result == -2) 10269 return NULL; 10270 10271 if (result < 0) { 10272 PyErr_SetString(PyExc_ValueError, "substring not found"); 10273 return NULL; 10274 } 10275 10276 return PyLong_FromSsize_t(result); 10277} 10278 10279PyDoc_STRVAR(islower__doc__, 10280 "S.islower() -> bool\n\ 10281\n\ 10282Return True if all cased characters in S are lowercase and there is\n\ 
10283at least one cased character in S, False otherwise."); 10284 10285static PyObject* 10286unicode_islower(PyUnicodeObject *self) 10287{ 10288 Py_ssize_t i, length; 10289 int kind; 10290 void *data; 10291 int cased; 10292 10293 if (PyUnicode_READY(self) == -1) 10294 return NULL; 10295 length = PyUnicode_GET_LENGTH(self); 10296 kind = PyUnicode_KIND(self); 10297 data = PyUnicode_DATA(self); 10298 10299 /* Shortcut for single character strings */ 10300 if (length == 1) 10301 return PyBool_FromLong( 10302 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0))); 10303 10304 /* Special case for empty strings */ 10305 if (length == 0) 10306 return PyBool_FromLong(0); 10307 10308 cased = 0; 10309 for (i = 0; i < length; i++) { 10310 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); 10311 10312 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) 10313 return PyBool_FromLong(0); 10314 else if (!cased && Py_UNICODE_ISLOWER(ch)) 10315 cased = 1; 10316 } 10317 return PyBool_FromLong(cased); 10318} 10319 10320PyDoc_STRVAR(isupper__doc__, 10321 "S.isupper() -> bool\n\ 10322\n\ 10323Return True if all cased characters in S are uppercase and there is\n\ 10324at least one cased character in S, False otherwise."); 10325 10326static PyObject* 10327unicode_isupper(PyUnicodeObject *self) 10328{ 10329 Py_ssize_t i, length; 10330 int kind; 10331 void *data; 10332 int cased; 10333 10334 if (PyUnicode_READY(self) == -1) 10335 return NULL; 10336 length = PyUnicode_GET_LENGTH(self); 10337 kind = PyUnicode_KIND(self); 10338 data = PyUnicode_DATA(self); 10339 10340 /* Shortcut for single character strings */ 10341 if (length == 1) 10342 return PyBool_FromLong( 10343 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0); 10344 10345 /* Special case for empty strings */ 10346 if (length == 0) 10347 return PyBool_FromLong(0); 10348 10349 cased = 0; 10350 for (i = 0; i < length; i++) { 10351 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); 10352 10353 if (Py_UNICODE_ISLOWER(ch) || 
Py_UNICODE_ISTITLE(ch)) 10354 return PyBool_FromLong(0); 10355 else if (!cased && Py_UNICODE_ISUPPER(ch)) 10356 cased = 1; 10357 } 10358 return PyBool_FromLong(cased); 10359} 10360 10361PyDoc_STRVAR(istitle__doc__, 10362 "S.istitle() -> bool\n\ 10363\n\ 10364Return True if S is a titlecased string and there is at least one\n\ 10365character in S, i.e. upper- and titlecase characters may only\n\ 10366follow uncased characters and lowercase characters only cased ones.\n\ 10367Return False otherwise."); 10368 10369static PyObject* 10370unicode_istitle(PyUnicodeObject *self) 10371{ 10372 Py_ssize_t i, length; 10373 int kind; 10374 void *data; 10375 int cased, previous_is_cased; 10376 10377 if (PyUnicode_READY(self) == -1) 10378 return NULL; 10379 length = PyUnicode_GET_LENGTH(self); 10380 kind = PyUnicode_KIND(self); 10381 data = PyUnicode_DATA(self); 10382 10383 /* Shortcut for single character strings */ 10384 if (length == 1) { 10385 Py_UCS4 ch = PyUnicode_READ(kind, data, 0); 10386 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) || 10387 (Py_UNICODE_ISUPPER(ch) != 0)); 10388 } 10389 10390 /* Special case for empty strings */ 10391 if (length == 0) 10392 return PyBool_FromLong(0); 10393 10394 cased = 0; 10395 previous_is_cased = 0; 10396 for (i = 0; i < length; i++) { 10397 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); 10398 10399 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) { 10400 if (previous_is_cased) 10401 return PyBool_FromLong(0); 10402 previous_is_cased = 1; 10403 cased = 1; 10404 } 10405 else if (Py_UNICODE_ISLOWER(ch)) { 10406 if (!previous_is_cased) 10407 return PyBool_FromLong(0); 10408 previous_is_cased = 1; 10409 cased = 1; 10410 } 10411 else 10412 previous_is_cased = 0; 10413 } 10414 return PyBool_FromLong(cased); 10415} 10416 10417PyDoc_STRVAR(isspace__doc__, 10418 "S.isspace() -> bool\n\ 10419\n\ 10420Return True if all characters in S are whitespace\n\ 10421and there is at least one character in S, False otherwise."); 10422 
10423static PyObject* 10424unicode_isspace(PyUnicodeObject *self) 10425{ 10426 Py_ssize_t i, length; 10427 int kind; 10428 void *data; 10429 10430 if (PyUnicode_READY(self) == -1) 10431 return NULL; 10432 length = PyUnicode_GET_LENGTH(self); 10433 kind = PyUnicode_KIND(self); 10434 data = PyUnicode_DATA(self); 10435 10436 /* Shortcut for single character strings */ 10437 if (length == 1) 10438 return PyBool_FromLong( 10439 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0))); 10440 10441 /* Special case for empty strings */ 10442 if (length == 0) 10443 return PyBool_FromLong(0); 10444 10445 for (i = 0; i < length; i++) { 10446 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); 10447 if (!Py_UNICODE_ISSPACE(ch)) 10448 return PyBool_FromLong(0); 10449 } 10450 return PyBool_FromLong(1); 10451} 10452 10453PyDoc_STRVAR(isalpha__doc__, 10454 "S.isalpha() -> bool\n\ 10455\n\ 10456Return True if all characters in S are alphabetic\n\ 10457and there is at least one character in S, False otherwise."); 10458 10459static PyObject* 10460unicode_isalpha(PyUnicodeObject *self) 10461{ 10462 Py_ssize_t i, length; 10463 int kind; 10464 void *data; 10465 10466 if (PyUnicode_READY(self) == -1) 10467 return NULL; 10468 length = PyUnicode_GET_LENGTH(self); 10469 kind = PyUnicode_KIND(self); 10470 data = PyUnicode_DATA(self); 10471 10472 /* Shortcut for single character strings */ 10473 if (length == 1) 10474 return PyBool_FromLong( 10475 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0))); 10476 10477 /* Special case for empty strings */ 10478 if (length == 0) 10479 return PyBool_FromLong(0); 10480 10481 for (i = 0; i < length; i++) { 10482 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i))) 10483 return PyBool_FromLong(0); 10484 } 10485 return PyBool_FromLong(1); 10486} 10487 10488PyDoc_STRVAR(isalnum__doc__, 10489 "S.isalnum() -> bool\n\ 10490\n\ 10491Return True if all characters in S are alphanumeric\n\ 10492and there is at least one character in S, False otherwise."); 10493 
10494static PyObject* 10495unicode_isalnum(PyUnicodeObject *self) 10496{ 10497 int kind; 10498 void *data; 10499 Py_ssize_t len, i; 10500 10501 if (PyUnicode_READY(self) == -1) 10502 return NULL; 10503 10504 kind = PyUnicode_KIND(self); 10505 data = PyUnicode_DATA(self); 10506 len = PyUnicode_GET_LENGTH(self); 10507 10508 /* Shortcut for single character strings */ 10509 if (len == 1) { 10510 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0); 10511 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch)); 10512 } 10513 10514 /* Special case for empty strings */ 10515 if (len == 0) 10516 return PyBool_FromLong(0); 10517 10518 for (i = 0; i < len; i++) { 10519 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); 10520 if (!Py_UNICODE_ISALNUM(ch)) 10521 return PyBool_FromLong(0); 10522 } 10523 return PyBool_FromLong(1); 10524} 10525 10526PyDoc_STRVAR(isdecimal__doc__, 10527 "S.isdecimal() -> bool\n\ 10528\n\ 10529Return True if there are only decimal characters in S,\n\ 10530False otherwise."); 10531 10532static PyObject* 10533unicode_isdecimal(PyUnicodeObject *self) 10534{ 10535 Py_ssize_t i, length; 10536 int kind; 10537 void *data; 10538 10539 if (PyUnicode_READY(self) == -1) 10540 return NULL; 10541 length = PyUnicode_GET_LENGTH(self); 10542 kind = PyUnicode_KIND(self); 10543 data = PyUnicode_DATA(self); 10544 10545 /* Shortcut for single character strings */ 10546 if (length == 1) 10547 return PyBool_FromLong( 10548 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0))); 10549 10550 /* Special case for empty strings */ 10551 if (length == 0) 10552 return PyBool_FromLong(0); 10553 10554 for (i = 0; i < length; i++) { 10555 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i))) 10556 return PyBool_FromLong(0); 10557 } 10558 return PyBool_FromLong(1); 10559} 10560 10561PyDoc_STRVAR(isdigit__doc__, 10562 "S.isdigit() -> bool\n\ 10563\n\ 10564Return True if all characters in S are digits\n\ 10565and there is at least one character in S, False otherwise."); 10566 10567static 
PyObject* 10568unicode_isdigit(PyUnicodeObject *self) 10569{ 10570 Py_ssize_t i, length; 10571 int kind; 10572 void *data; 10573 10574 if (PyUnicode_READY(self) == -1) 10575 return NULL; 10576 length = PyUnicode_GET_LENGTH(self); 10577 kind = PyUnicode_KIND(self); 10578 data = PyUnicode_DATA(self); 10579 10580 /* Shortcut for single character strings */ 10581 if (length == 1) { 10582 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0); 10583 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch)); 10584 } 10585 10586 /* Special case for empty strings */ 10587 if (length == 0) 10588 return PyBool_FromLong(0); 10589 10590 for (i = 0; i < length; i++) { 10591 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i))) 10592 return PyBool_FromLong(0); 10593 } 10594 return PyBool_FromLong(1); 10595} 10596 10597PyDoc_STRVAR(isnumeric__doc__, 10598 "S.isnumeric() -> bool\n\ 10599\n\ 10600Return True if there are only numeric characters in S,\n\ 10601False otherwise."); 10602 10603static PyObject* 10604unicode_isnumeric(PyUnicodeObject *self) 10605{ 10606 Py_ssize_t i, length; 10607 int kind; 10608 void *data; 10609 10610 if (PyUnicode_READY(self) == -1) 10611 return NULL; 10612 length = PyUnicode_GET_LENGTH(self); 10613 kind = PyUnicode_KIND(self); 10614 data = PyUnicode_DATA(self); 10615 10616 /* Shortcut for single character strings */ 10617 if (length == 1) 10618 return PyBool_FromLong( 10619 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0))); 10620 10621 /* Special case for empty strings */ 10622 if (length == 0) 10623 return PyBool_FromLong(0); 10624 10625 for (i = 0; i < length; i++) { 10626 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i))) 10627 return PyBool_FromLong(0); 10628 } 10629 return PyBool_FromLong(1); 10630} 10631 10632int 10633PyUnicode_IsIdentifier(PyObject *self) 10634{ 10635 int kind; 10636 void *data; 10637 Py_ssize_t i; 10638 Py_UCS4 first; 10639 10640 if (PyUnicode_READY(self) == -1) { 10641 Py_FatalError("identifier not ready"); 10642 return 0; 10643 } 
10644 10645 /* Special case for empty strings */ 10646 if (PyUnicode_GET_LENGTH(self) == 0) 10647 return 0; 10648 kind = PyUnicode_KIND(self); 10649 data = PyUnicode_DATA(self); 10650 10651 /* PEP 3131 says that the first character must be in 10652 XID_Start and subsequent characters in XID_Continue, 10653 and for the ASCII range, the 2.x rules apply (i.e 10654 start with letters and underscore, continue with 10655 letters, digits, underscore). However, given the current 10656 definition of XID_Start and XID_Continue, it is sufficient 10657 to check just for these, except that _ must be allowed 10658 as starting an identifier. */ 10659 first = PyUnicode_READ(kind, data, 0); 10660 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */) 10661 return 0; 10662 10663 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++) 10664 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i))) 10665 return 0; 10666 return 1; 10667} 10668 10669PyDoc_STRVAR(isidentifier__doc__, 10670 "S.isidentifier() -> bool\n\ 10671\n\ 10672Return True if S is a valid identifier according\n\ 10673to the language definition."); 10674 10675static PyObject* 10676unicode_isidentifier(PyObject *self) 10677{ 10678 return PyBool_FromLong(PyUnicode_IsIdentifier(self)); 10679} 10680 10681PyDoc_STRVAR(isprintable__doc__, 10682 "S.isprintable() -> bool\n\ 10683\n\ 10684Return True if all characters in S are considered\n\ 10685printable in repr() or S is empty, False otherwise."); 10686 10687static PyObject* 10688unicode_isprintable(PyObject *self) 10689{ 10690 Py_ssize_t i, length; 10691 int kind; 10692 void *data; 10693 10694 if (PyUnicode_READY(self) == -1) 10695 return NULL; 10696 length = PyUnicode_GET_LENGTH(self); 10697 kind = PyUnicode_KIND(self); 10698 data = PyUnicode_DATA(self); 10699 10700 /* Shortcut for single character strings */ 10701 if (length == 1) 10702 return PyBool_FromLong( 10703 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0))); 10704 10705 for (i = 0; i < length; i++) { 
10706 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) { 10707 Py_RETURN_FALSE; 10708 } 10709 } 10710 Py_RETURN_TRUE; 10711} 10712 10713PyDoc_STRVAR(join__doc__, 10714 "S.join(iterable) -> str\n\ 10715\n\ 10716Return a string which is the concatenation of the strings in the\n\ 10717iterable. The separator between elements is S."); 10718 10719static PyObject* 10720unicode_join(PyObject *self, PyObject *data) 10721{ 10722 return PyUnicode_Join(self, data); 10723} 10724 10725static Py_ssize_t 10726unicode_length(PyUnicodeObject *self) 10727{ 10728 if (PyUnicode_READY(self) == -1) 10729 return -1; 10730 return PyUnicode_GET_LENGTH(self); 10731} 10732 10733PyDoc_STRVAR(ljust__doc__, 10734 "S.ljust(width[, fillchar]) -> str\n\ 10735\n\ 10736Return S left-justified in a Unicode string of length width. Padding is\n\ 10737done using the specified fill character (default is a space)."); 10738 10739static PyObject * 10740unicode_ljust(PyUnicodeObject *self, PyObject *args) 10741{ 10742 Py_ssize_t width; 10743 Py_UCS4 fillchar = ' '; 10744 10745 if (PyUnicode_READY(self) == -1) 10746 return NULL; 10747 10748 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar)) 10749 return NULL; 10750 10751 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) { 10752 Py_INCREF(self); 10753 return (PyObject*) self; 10754 } 10755 10756 return (PyObject*) pad(self, 0, width - _PyUnicode_LENGTH(self), fillchar); 10757} 10758 10759PyDoc_STRVAR(lower__doc__, 10760 "S.lower() -> str\n\ 10761\n\ 10762Return a copy of the string S converted to lowercase."); 10763 10764static PyObject* 10765unicode_lower(PyUnicodeObject *self) 10766{ 10767 return fixup(self, fixlower); 10768} 10769 10770#define LEFTSTRIP 0 10771#define RIGHTSTRIP 1 10772#define BOTHSTRIP 2 10773 10774/* Arrays indexed by above */ 10775static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"}; 10776 10777#define STRIPNAME(i) (stripformat[i]+3) 10778 10779/* externally visible 
for str.strip(unicode) */ 10780PyObject * 10781_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj) 10782{ 10783 void *data; 10784 int kind; 10785 Py_ssize_t i, j, len; 10786 BLOOM_MASK sepmask; 10787 10788 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1) 10789 return NULL; 10790 10791 kind = PyUnicode_KIND(self); 10792 data = PyUnicode_DATA(self); 10793 len = PyUnicode_GET_LENGTH(self); 10794 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj), 10795 PyUnicode_DATA(sepobj), 10796 PyUnicode_GET_LENGTH(sepobj)); 10797 10798 i = 0; 10799 if (striptype != RIGHTSTRIP) { 10800 while (i < len && 10801 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) { 10802 i++; 10803 } 10804 } 10805 10806 j = len; 10807 if (striptype != LEFTSTRIP) { 10808 do { 10809 j--; 10810 } while (j >= i && 10811 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj)); 10812 j++; 10813 } 10814 10815 return PyUnicode_Substring((PyObject*)self, i, j); 10816} 10817 10818PyObject* 10819PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end) 10820{ 10821 unsigned char *data; 10822 int kind; 10823 Py_ssize_t length; 10824 10825 if (PyUnicode_READY(self) == -1) 10826 return NULL; 10827 10828 end = Py_MIN(end, PyUnicode_GET_LENGTH(self)); 10829 10830 if (start == 0 && end == PyUnicode_GET_LENGTH(self)) 10831 { 10832 if (PyUnicode_CheckExact(self)) { 10833 Py_INCREF(self); 10834 return self; 10835 } 10836 else 10837 return PyUnicode_Copy(self); 10838 } 10839 10840 length = end - start; 10841 if (length == 1) 10842 return unicode_getitem(self, start); 10843 10844 if (start < 0 || end < 0) { 10845 PyErr_SetString(PyExc_IndexError, "string index out of range"); 10846 return NULL; 10847 } 10848 10849 kind = PyUnicode_KIND(self); 10850 data = PyUnicode_1BYTE_DATA(self); 10851 return PyUnicode_FromKindAndData(kind, 10852 data + PyUnicode_KIND_SIZE(kind, start), 10853 length); 10854} 10855 10856static PyObject * 10857do_strip(PyUnicodeObject 
*self, int striptype) 10858{ 10859 int kind; 10860 void *data; 10861 Py_ssize_t len, i, j; 10862 10863 if (PyUnicode_READY(self) == -1) 10864 return NULL; 10865 10866 kind = PyUnicode_KIND(self); 10867 data = PyUnicode_DATA(self); 10868 len = PyUnicode_GET_LENGTH(self); 10869 10870 i = 0; 10871 if (striptype != RIGHTSTRIP) { 10872 while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) { 10873 i++; 10874 } 10875 } 10876 10877 j = len; 10878 if (striptype != LEFTSTRIP) { 10879 do { 10880 j--; 10881 } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j))); 10882 j++; 10883 } 10884 10885 return PyUnicode_Substring((PyObject*)self, i, j); 10886} 10887 10888 10889static PyObject * 10890do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args) 10891{ 10892 PyObject *sep = NULL; 10893 10894 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep)) 10895 return NULL; 10896 10897 if (sep != NULL && sep != Py_None) { 10898 if (PyUnicode_Check(sep)) 10899 return _PyUnicode_XStrip(self, striptype, sep); 10900 else { 10901 PyErr_Format(PyExc_TypeError, 10902 "%s arg must be None or str", 10903 STRIPNAME(striptype)); 10904 return NULL; 10905 } 10906 } 10907 10908 return do_strip(self, striptype); 10909} 10910 10911 10912PyDoc_STRVAR(strip__doc__, 10913 "S.strip([chars]) -> str\n\ 10914\n\ 10915Return a copy of the string S with leading and trailing\n\ 10916whitespace removed.\n\ 10917If chars is given and not None, remove characters in chars instead."); 10918 10919static PyObject * 10920unicode_strip(PyUnicodeObject *self, PyObject *args) 10921{ 10922 if (PyTuple_GET_SIZE(args) == 0) 10923 return do_strip(self, BOTHSTRIP); /* Common case */ 10924 else 10925 return do_argstrip(self, BOTHSTRIP, args); 10926} 10927 10928 10929PyDoc_STRVAR(lstrip__doc__, 10930 "S.lstrip([chars]) -> str\n\ 10931\n\ 10932Return a copy of the string S with leading whitespace removed.\n\ 10933If chars is given and not None, remove characters in chars 
instead."); 10934 10935static PyObject * 10936unicode_lstrip(PyUnicodeObject *self, PyObject *args) 10937{ 10938 if (PyTuple_GET_SIZE(args) == 0) 10939 return do_strip(self, LEFTSTRIP); /* Common case */ 10940 else 10941 return do_argstrip(self, LEFTSTRIP, args); 10942} 10943 10944 10945PyDoc_STRVAR(rstrip__doc__, 10946 "S.rstrip([chars]) -> str\n\ 10947\n\ 10948Return a copy of the string S with trailing whitespace removed.\n\ 10949If chars is given and not None, remove characters in chars instead."); 10950 10951static PyObject * 10952unicode_rstrip(PyUnicodeObject *self, PyObject *args) 10953{ 10954 if (PyTuple_GET_SIZE(args) == 0) 10955 return do_strip(self, RIGHTSTRIP); /* Common case */ 10956 else 10957 return do_argstrip(self, RIGHTSTRIP, args); 10958} 10959 10960 10961static PyObject* 10962unicode_repeat(PyUnicodeObject *str, Py_ssize_t len) 10963{ 10964 PyUnicodeObject *u; 10965 Py_ssize_t nchars, n; 10966 10967 if (len < 1) { 10968 Py_INCREF(unicode_empty); 10969 return unicode_empty; 10970 } 10971 10972 if (len == 1 && PyUnicode_CheckExact(str)) { 10973 /* no repeat, return original string */ 10974 Py_INCREF(str); 10975 return (PyObject*) str; 10976 } 10977 10978 if (PyUnicode_READY(str) == -1) 10979 return NULL; 10980 10981 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) { 10982 PyErr_SetString(PyExc_OverflowError, 10983 "repeated string is too long"); 10984 return NULL; 10985 } 10986 nchars = len * PyUnicode_GET_LENGTH(str); 10987 10988 u = (PyUnicodeObject *)PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str)); 10989 if (!u) 10990 return NULL; 10991 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str)); 10992 10993 if (PyUnicode_GET_LENGTH(str) == 1) { 10994 const int kind = PyUnicode_KIND(str); 10995 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0); 10996 void *to = PyUnicode_DATA(u); 10997 if (kind == PyUnicode_1BYTE_KIND) 10998 memset(to, (unsigned char)fill_char, len); 10999 else { 11000 for (n = 0; n < len; ++n) 11001 
PyUnicode_WRITE(kind, to, n, fill_char); 11002 } 11003 } 11004 else { 11005 /* number of characters copied this far */ 11006 Py_ssize_t done = PyUnicode_GET_LENGTH(str); 11007 const Py_ssize_t char_size = PyUnicode_CHARACTER_SIZE(str); 11008 char *to = (char *) PyUnicode_DATA(u); 11009 Py_MEMCPY(to, PyUnicode_DATA(str), 11010 PyUnicode_GET_LENGTH(str) * char_size); 11011 while (done < nchars) { 11012 n = (done <= nchars-done) ? done : nchars-done; 11013 Py_MEMCPY(to + (done * char_size), to, n * char_size); 11014 done += n; 11015 } 11016 } 11017 11018 return (PyObject*) u; 11019} 11020 11021PyObject * 11022PyUnicode_Replace(PyObject *obj, 11023 PyObject *subobj, 11024 PyObject *replobj, 11025 Py_ssize_t maxcount) 11026{ 11027 PyObject *self; 11028 PyObject *str1; 11029 PyObject *str2; 11030 PyObject *result; 11031 11032 self = PyUnicode_FromObject(obj); 11033 if (self == NULL || PyUnicode_READY(self) == -1) 11034 return NULL; 11035 str1 = PyUnicode_FromObject(subobj); 11036 if (str1 == NULL || PyUnicode_READY(str1) == -1) { 11037 Py_DECREF(self); 11038 return NULL; 11039 } 11040 str2 = PyUnicode_FromObject(replobj); 11041 if (str2 == NULL || PyUnicode_READY(str2)) { 11042 Py_DECREF(self); 11043 Py_DECREF(str1); 11044 return NULL; 11045 } 11046 result = replace(self, str1, str2, maxcount); 11047 Py_DECREF(self); 11048 Py_DECREF(str1); 11049 Py_DECREF(str2); 11050 return result; 11051} 11052 11053PyDoc_STRVAR(replace__doc__, 11054 "S.replace(old, new[, count]) -> str\n\ 11055\n\ 11056Return a copy of S with all occurrences of substring\n\ 11057old replaced by new. 
If the optional argument count is\n\ 11058given, only the first count occurrences are replaced."); 11059 11060static PyObject* 11061unicode_replace(PyObject *self, PyObject *args) 11062{ 11063 PyObject *str1; 11064 PyObject *str2; 11065 Py_ssize_t maxcount = -1; 11066 PyObject *result; 11067 11068 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount)) 11069 return NULL; 11070 if (!PyUnicode_READY(self) == -1) 11071 return NULL; 11072 str1 = PyUnicode_FromObject(str1); 11073 if (str1 == NULL || PyUnicode_READY(str1) == -1) 11074 return NULL; 11075 str2 = PyUnicode_FromObject(str2); 11076 if (str2 == NULL || PyUnicode_READY(str2) == -1) { 11077 Py_DECREF(str1); 11078 return NULL; 11079 } 11080 11081 result = replace(self, str1, str2, maxcount); 11082 11083 Py_DECREF(str1); 11084 Py_DECREF(str2); 11085 return result; 11086} 11087 11088static PyObject * 11089unicode_repr(PyObject *unicode) 11090{ 11091 PyObject *repr; 11092 Py_ssize_t isize; 11093 Py_ssize_t osize, squote, dquote, i, o; 11094 Py_UCS4 max, quote; 11095 int ikind, okind; 11096 void *idata, *odata; 11097 11098 if (PyUnicode_READY(unicode) == -1) 11099 return NULL; 11100 11101 isize = PyUnicode_GET_LENGTH(unicode); 11102 idata = PyUnicode_DATA(unicode); 11103 11104 /* Compute length of output, quote characters, and 11105 maximum character */ 11106 osize = 2; /* quotes */ 11107 max = 127; 11108 squote = dquote = 0; 11109 ikind = PyUnicode_KIND(unicode); 11110 for (i = 0; i < isize; i++) { 11111 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i); 11112 switch (ch) { 11113 case '\'': squote++; osize++; break; 11114 case '"': dquote++; osize++; break; 11115 case '\\': case '\t': case '\r': case '\n': 11116 osize += 2; break; 11117 default: 11118 /* Fast-path ASCII */ 11119 if (ch < ' ' || ch == 0x7f) 11120 osize += 4; /* \xHH */ 11121 else if (ch < 0x7f) 11122 osize++; 11123 else if (Py_UNICODE_ISPRINTABLE(ch)) { 11124 osize++; 11125 max = ch > max ? 
ch : max; 11126 } 11127 else if (ch < 0x100) 11128 osize += 4; /* \xHH */ 11129 else if (ch < 0x10000) 11130 osize += 6; /* \uHHHH */ 11131 else 11132 osize += 10; /* \uHHHHHHHH */ 11133 } 11134 } 11135 11136 quote = '\''; 11137 if (squote) { 11138 if (dquote) 11139 /* Both squote and dquote present. Use squote, 11140 and escape them */ 11141 osize += squote; 11142 else 11143 quote = '"'; 11144 } 11145 11146 repr = PyUnicode_New(osize, max); 11147 if (repr == NULL) 11148 return NULL; 11149 okind = PyUnicode_KIND(repr); 11150 odata = PyUnicode_DATA(repr); 11151 11152 PyUnicode_WRITE(okind, odata, 0, quote); 11153 PyUnicode_WRITE(okind, odata, osize-1, quote); 11154 11155 for (i = 0, o = 1; i < isize; i++) { 11156 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i); 11157 11158 /* Escape quotes and backslashes */ 11159 if ((ch == quote) || (ch == '\\')) { 11160 PyUnicode_WRITE(okind, odata, o++, '\\'); 11161 PyUnicode_WRITE(okind, odata, o++, ch); 11162 continue; 11163 } 11164 11165 /* Map special whitespace to '\t', \n', '\r' */ 11166 if (ch == '\t') { 11167 PyUnicode_WRITE(okind, odata, o++, '\\'); 11168 PyUnicode_WRITE(okind, odata, o++, 't'); 11169 } 11170 else if (ch == '\n') { 11171 PyUnicode_WRITE(okind, odata, o++, '\\'); 11172 PyUnicode_WRITE(okind, odata, o++, 'n'); 11173 } 11174 else if (ch == '\r') { 11175 PyUnicode_WRITE(okind, odata, o++, '\\'); 11176 PyUnicode_WRITE(okind, odata, o++, 'r'); 11177 } 11178 11179 /* Map non-printable US ASCII to '\xhh' */ 11180 else if (ch < ' ' || ch == 0x7F) { 11181 PyUnicode_WRITE(okind, odata, o++, '\\'); 11182 PyUnicode_WRITE(okind, odata, o++, 'x'); 11183 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0x000F]); 11184 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0x000F]); 11185 } 11186 11187 /* Copy ASCII characters as-is */ 11188 else if (ch < 0x7F) { 11189 PyUnicode_WRITE(okind, odata, o++, ch); 11190 } 11191 11192 /* Non-ASCII characters */ 11193 else { 11194 /* Map Unicode whitespace and control 
characters 11195 (categories Z* and C* except ASCII space) 11196 */ 11197 if (!Py_UNICODE_ISPRINTABLE(ch)) { 11198 /* Map 8-bit characters to '\xhh' */ 11199 if (ch <= 0xff) { 11200 PyUnicode_WRITE(okind, odata, o++, '\\'); 11201 PyUnicode_WRITE(okind, odata, o++, 'x'); 11202 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0x000F]); 11203 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0x000F]); 11204 } 11205 /* Map 21-bit characters to '\U00xxxxxx' */ 11206 else if (ch >= 0x10000) { 11207 PyUnicode_WRITE(okind, odata, o++, '\\'); 11208 PyUnicode_WRITE(okind, odata, o++, 'U'); 11209 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 28) & 0xF]); 11210 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 24) & 0xF]); 11211 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 20) & 0xF]); 11212 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 16) & 0xF]); 11213 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 12) & 0xF]); 11214 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 8) & 0xF]); 11215 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0xF]); 11216 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0xF]); 11217 } 11218 /* Map 16-bit characters to '\uxxxx' */ 11219 else { 11220 PyUnicode_WRITE(okind, odata, o++, '\\'); 11221 PyUnicode_WRITE(okind, odata, o++, 'u'); 11222 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 12) & 0xF]); 11223 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 8) & 0xF]); 11224 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0xF]); 11225 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0xF]); 11226 } 11227 } 11228 /* Copy characters as-is */ 11229 else { 11230 PyUnicode_WRITE(okind, odata, o++, ch); 11231 } 11232 } 11233 } 11234 /* Closing quote already added at the beginning */ 11235 return repr; 11236} 11237 11238PyDoc_STRVAR(rfind__doc__, 11239 "S.rfind(sub[, start[, end]]) -> int\n\ 11240\n\ 11241Return the highest index in S where substring sub is found,\n\ 11242such that sub is contained 
within S[start:end]. Optional\n\ 11243arguments start and end are interpreted as in slice notation.\n\ 11244\n\ 11245Return -1 on failure."); 11246 11247static PyObject * 11248unicode_rfind(PyObject *self, PyObject *args) 11249{ 11250 PyUnicodeObject *substring; 11251 Py_ssize_t start; 11252 Py_ssize_t end; 11253 Py_ssize_t result; 11254 11255 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring, 11256 &start, &end)) 11257 return NULL; 11258 11259 if (PyUnicode_READY(self) == -1) 11260 return NULL; 11261 if (PyUnicode_READY(substring) == -1) 11262 return NULL; 11263 11264 result = any_find_slice( 11265 ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice, 11266 self, (PyObject*)substring, start, end 11267 ); 11268 11269 Py_DECREF(substring); 11270 11271 if (result == -2) 11272 return NULL; 11273 11274 return PyLong_FromSsize_t(result); 11275} 11276 11277PyDoc_STRVAR(rindex__doc__, 11278 "S.rindex(sub[, start[, end]]) -> int\n\ 11279\n\ 11280Like S.rfind() but raise ValueError when the substring is not found."); 11281 11282static PyObject * 11283unicode_rindex(PyObject *self, PyObject *args) 11284{ 11285 PyUnicodeObject *substring; 11286 Py_ssize_t start; 11287 Py_ssize_t end; 11288 Py_ssize_t result; 11289 11290 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring, 11291 &start, &end)) 11292 return NULL; 11293 11294 if (PyUnicode_READY(self) == -1) 11295 return NULL; 11296 if (PyUnicode_READY(substring) == -1) 11297 return NULL; 11298 11299 result = any_find_slice( 11300 ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice, 11301 self, (PyObject*)substring, start, end 11302 ); 11303 11304 Py_DECREF(substring); 11305 11306 if (result == -2) 11307 return NULL; 11308 11309 if (result < 0) { 11310 PyErr_SetString(PyExc_ValueError, "substring not found"); 11311 return NULL; 11312 } 11313 11314 return PyLong_FromSsize_t(result); 11315} 11316 11317PyDoc_STRVAR(rjust__doc__, 11318 "S.rjust(width[, fillchar]) -> str\n\ 11319\n\ 
11320Return S right-justified in a string of length width. Padding is\n\ 11321done using the specified fill character (default is a space)."); 11322 11323static PyObject * 11324unicode_rjust(PyUnicodeObject *self, PyObject *args) 11325{ 11326 Py_ssize_t width; 11327 Py_UCS4 fillchar = ' '; 11328 11329 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar)) 11330 return NULL; 11331 11332 if (PyUnicode_READY(self) == -1) 11333 return NULL; 11334 11335 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) { 11336 Py_INCREF(self); 11337 return (PyObject*) self; 11338 } 11339 11340 return (PyObject*) pad(self, width - _PyUnicode_LENGTH(self), 0, fillchar); 11341} 11342 11343PyObject * 11344PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit) 11345{ 11346 PyObject *result; 11347 11348 s = PyUnicode_FromObject(s); 11349 if (s == NULL) 11350 return NULL; 11351 if (sep != NULL) { 11352 sep = PyUnicode_FromObject(sep); 11353 if (sep == NULL) { 11354 Py_DECREF(s); 11355 return NULL; 11356 } 11357 } 11358 11359 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit); 11360 11361 Py_DECREF(s); 11362 Py_XDECREF(sep); 11363 return result; 11364} 11365 11366PyDoc_STRVAR(split__doc__, 11367 "S.split([sep[, maxsplit]]) -> list of strings\n\ 11368\n\ 11369Return a list of the words in S, using sep as the\n\ 11370delimiter string. If maxsplit is given, at most maxsplit\n\ 11371splits are done. 
If sep is not specified or is None, any\n\ 11372whitespace string is a separator and empty strings are\n\ 11373removed from the result."); 11374 11375static PyObject* 11376unicode_split(PyUnicodeObject *self, PyObject *args) 11377{ 11378 PyObject *substring = Py_None; 11379 Py_ssize_t maxcount = -1; 11380 11381 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount)) 11382 return NULL; 11383 11384 if (substring == Py_None) 11385 return split(self, NULL, maxcount); 11386 else if (PyUnicode_Check(substring)) 11387 return split(self, (PyUnicodeObject *)substring, maxcount); 11388 else 11389 return PyUnicode_Split((PyObject *)self, substring, maxcount); 11390} 11391 11392PyObject * 11393PyUnicode_Partition(PyObject *str_in, PyObject *sep_in) 11394{ 11395 PyObject* str_obj; 11396 PyObject* sep_obj; 11397 PyObject* out; 11398 int kind1, kind2, kind; 11399 void *buf1 = NULL, *buf2 = NULL; 11400 Py_ssize_t len1, len2; 11401 11402 str_obj = PyUnicode_FromObject(str_in); 11403 if (!str_obj || PyUnicode_READY(str_obj) == -1) 11404 return NULL; 11405 sep_obj = PyUnicode_FromObject(sep_in); 11406 if (!sep_obj || PyUnicode_READY(sep_obj) == -1) { 11407 Py_DECREF(str_obj); 11408 return NULL; 11409 } 11410 11411 kind1 = PyUnicode_KIND(str_in); 11412 kind2 = PyUnicode_KIND(sep_obj); 11413 kind = kind1 > kind2 ? 
kind1 : kind2; 11414 buf1 = PyUnicode_DATA(str_in); 11415 if (kind1 != kind) 11416 buf1 = _PyUnicode_AsKind(str_in, kind); 11417 if (!buf1) 11418 goto onError; 11419 buf2 = PyUnicode_DATA(sep_obj); 11420 if (kind2 != kind) 11421 buf2 = _PyUnicode_AsKind(sep_obj, kind); 11422 if (!buf2) 11423 goto onError; 11424 len1 = PyUnicode_GET_LENGTH(str_obj); 11425 len2 = PyUnicode_GET_LENGTH(sep_obj); 11426 11427 switch(PyUnicode_KIND(str_in)) { 11428 case PyUnicode_1BYTE_KIND: 11429 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2); 11430 break; 11431 case PyUnicode_2BYTE_KIND: 11432 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2); 11433 break; 11434 case PyUnicode_4BYTE_KIND: 11435 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2); 11436 break; 11437 default: 11438 assert(0); 11439 out = 0; 11440 } 11441 11442 Py_DECREF(sep_obj); 11443 Py_DECREF(str_obj); 11444 if (kind1 != kind) 11445 PyMem_Free(buf1); 11446 if (kind2 != kind) 11447 PyMem_Free(buf2); 11448 11449 return out; 11450 onError: 11451 Py_DECREF(sep_obj); 11452 Py_DECREF(str_obj); 11453 if (kind1 != kind && buf1) 11454 PyMem_Free(buf1); 11455 if (kind2 != kind && buf2) 11456 PyMem_Free(buf2); 11457 return NULL; 11458} 11459 11460 11461PyObject * 11462PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in) 11463{ 11464 PyObject* str_obj; 11465 PyObject* sep_obj; 11466 PyObject* out; 11467 int kind1, kind2, kind; 11468 void *buf1 = NULL, *buf2 = NULL; 11469 Py_ssize_t len1, len2; 11470 11471 str_obj = PyUnicode_FromObject(str_in); 11472 if (!str_obj) 11473 return NULL; 11474 sep_obj = PyUnicode_FromObject(sep_in); 11475 if (!sep_obj) { 11476 Py_DECREF(str_obj); 11477 return NULL; 11478 } 11479 11480 kind1 = PyUnicode_KIND(str_in); 11481 kind2 = PyUnicode_KIND(sep_obj); 11482 kind = Py_MAX(kind1, kind2); 11483 buf1 = PyUnicode_DATA(str_in); 11484 if (kind1 != kind) 11485 buf1 = _PyUnicode_AsKind(str_in, kind); 11486 if (!buf1) 11487 goto onError; 11488 buf2 = 
PyUnicode_DATA(sep_obj); 11489 if (kind2 != kind) 11490 buf2 = _PyUnicode_AsKind(sep_obj, kind); 11491 if (!buf2) 11492 goto onError; 11493 len1 = PyUnicode_GET_LENGTH(str_obj); 11494 len2 = PyUnicode_GET_LENGTH(sep_obj); 11495 11496 switch(PyUnicode_KIND(str_in)) { 11497 case PyUnicode_1BYTE_KIND: 11498 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2); 11499 break; 11500 case PyUnicode_2BYTE_KIND: 11501 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2); 11502 break; 11503 case PyUnicode_4BYTE_KIND: 11504 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2); 11505 break; 11506 default: 11507 assert(0); 11508 out = 0; 11509 } 11510 11511 Py_DECREF(sep_obj); 11512 Py_DECREF(str_obj); 11513 if (kind1 != kind) 11514 PyMem_Free(buf1); 11515 if (kind2 != kind) 11516 PyMem_Free(buf2); 11517 11518 return out; 11519 onError: 11520 Py_DECREF(sep_obj); 11521 Py_DECREF(str_obj); 11522 if (kind1 != kind && buf1) 11523 PyMem_Free(buf1); 11524 if (kind2 != kind && buf2) 11525 PyMem_Free(buf2); 11526 return NULL; 11527} 11528 11529PyDoc_STRVAR(partition__doc__, 11530 "S.partition(sep) -> (head, sep, tail)\n\ 11531\n\ 11532Search for the separator sep in S, and return the part before it,\n\ 11533the separator itself, and the part after it. If the separator is not\n\ 11534found, return S and two empty strings."); 11535 11536static PyObject* 11537unicode_partition(PyUnicodeObject *self, PyObject *separator) 11538{ 11539 return PyUnicode_Partition((PyObject *)self, separator); 11540} 11541 11542PyDoc_STRVAR(rpartition__doc__, 11543 "S.rpartition(sep) -> (head, sep, tail)\n\ 11544\n\ 11545Search for the separator sep in S, starting at the end of S, and return\n\ 11546the part before it, the separator itself, and the part after it. 
If the\n\ 11547separator is not found, return two empty strings and S."); 11548 11549static PyObject* 11550unicode_rpartition(PyUnicodeObject *self, PyObject *separator) 11551{ 11552 return PyUnicode_RPartition((PyObject *)self, separator); 11553} 11554 11555PyObject * 11556PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit) 11557{ 11558 PyObject *result; 11559 11560 s = PyUnicode_FromObject(s); 11561 if (s == NULL) 11562 return NULL; 11563 if (sep != NULL) { 11564 sep = PyUnicode_FromObject(sep); 11565 if (sep == NULL) { 11566 Py_DECREF(s); 11567 return NULL; 11568 } 11569 } 11570 11571 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit); 11572 11573 Py_DECREF(s); 11574 Py_XDECREF(sep); 11575 return result; 11576} 11577 11578PyDoc_STRVAR(rsplit__doc__, 11579 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\ 11580\n\ 11581Return a list of the words in S, using sep as the\n\ 11582delimiter string, starting at the end of the string and\n\ 11583working to the front. If maxsplit is given, at most maxsplit\n\ 11584splits are done. 
If sep is not specified, any whitespace string\n\ 11585is a separator."); 11586 11587static PyObject* 11588unicode_rsplit(PyUnicodeObject *self, PyObject *args) 11589{ 11590 PyObject *substring = Py_None; 11591 Py_ssize_t maxcount = -1; 11592 11593 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount)) 11594 return NULL; 11595 11596 if (substring == Py_None) 11597 return rsplit(self, NULL, maxcount); 11598 else if (PyUnicode_Check(substring)) 11599 return rsplit(self, (PyUnicodeObject *)substring, maxcount); 11600 else 11601 return PyUnicode_RSplit((PyObject *)self, substring, maxcount); 11602} 11603 11604PyDoc_STRVAR(splitlines__doc__, 11605 "S.splitlines([keepends]) -> list of strings\n\ 11606\n\ 11607Return a list of the lines in S, breaking at line boundaries.\n\ 11608Line breaks are not included in the resulting list unless keepends\n\ 11609is given and true."); 11610 11611static PyObject* 11612unicode_splitlines(PyUnicodeObject *self, PyObject *args, PyObject *kwds) 11613{ 11614 static char *kwlist[] = {"keepends", 0}; 11615 int keepends = 0; 11616 11617 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines", 11618 kwlist, &keepends)) 11619 return NULL; 11620 11621 return PyUnicode_Splitlines((PyObject *)self, keepends); 11622} 11623 11624static 11625PyObject *unicode_str(PyObject *self) 11626{ 11627 if (PyUnicode_CheckExact(self)) { 11628 Py_INCREF(self); 11629 return self; 11630 } else 11631 /* Subtype -- return genuine unicode string with the same value. 
*/ 11632 return PyUnicode_Copy(self); 11633} 11634 11635PyDoc_STRVAR(swapcase__doc__, 11636 "S.swapcase() -> str\n\ 11637\n\ 11638Return a copy of S with uppercase characters converted to lowercase\n\ 11639and vice versa."); 11640 11641static PyObject* 11642unicode_swapcase(PyUnicodeObject *self) 11643{ 11644 return fixup(self, fixswapcase); 11645} 11646 11647PyDoc_STRVAR(maketrans__doc__, 11648 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\ 11649\n\ 11650Return a translation table usable for str.translate().\n\ 11651If there is only one argument, it must be a dictionary mapping Unicode\n\ 11652ordinals (integers) or characters to Unicode ordinals, strings or None.\n\ 11653Character keys will be then converted to ordinals.\n\ 11654If there are two arguments, they must be strings of equal length, and\n\ 11655in the resulting dictionary, each character in x will be mapped to the\n\ 11656character at the same position in y. If there is a third argument, it\n\ 11657must be a string, whose characters will be mapped to None in the result."); 11658 11659static PyObject* 11660unicode_maketrans(PyUnicodeObject *null, PyObject *args) 11661{ 11662 PyObject *x, *y = NULL, *z = NULL; 11663 PyObject *new = NULL, *key, *value; 11664 Py_ssize_t i = 0; 11665 int res; 11666 11667 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z)) 11668 return NULL; 11669 new = PyDict_New(); 11670 if (!new) 11671 return NULL; 11672 if (y != NULL) { 11673 int x_kind, y_kind, z_kind; 11674 void *x_data, *y_data, *z_data; 11675 11676 /* x must be a string too, of equal length */ 11677 if (!PyUnicode_Check(x)) { 11678 PyErr_SetString(PyExc_TypeError, "first maketrans argument must " 11679 "be a string if there is a second argument"); 11680 goto err; 11681 } 11682 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) { 11683 PyErr_SetString(PyExc_ValueError, "the first two maketrans " 11684 "arguments must have equal length"); 11685 goto err; 11686 } 11687 /* create entries for 
translating chars in x to those in y */ 11688 x_kind = PyUnicode_KIND(x); 11689 y_kind = PyUnicode_KIND(y); 11690 x_data = PyUnicode_DATA(x); 11691 y_data = PyUnicode_DATA(y); 11692 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) { 11693 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i)); 11694 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i)); 11695 if (!key || !value) 11696 goto err; 11697 res = PyDict_SetItem(new, key, value); 11698 Py_DECREF(key); 11699 Py_DECREF(value); 11700 if (res < 0) 11701 goto err; 11702 } 11703 /* create entries for deleting chars in z */ 11704 if (z != NULL) { 11705 z_kind = PyUnicode_KIND(z); 11706 z_data = PyUnicode_DATA(z); 11707 for (i = 0; i < PyUnicode_GET_SIZE(z); i++) { 11708 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i)); 11709 if (!key) 11710 goto err; 11711 res = PyDict_SetItem(new, key, Py_None); 11712 Py_DECREF(key); 11713 if (res < 0) 11714 goto err; 11715 } 11716 } 11717 } else { 11718 int kind; 11719 void *data; 11720 11721 /* x must be a dict */ 11722 if (!PyDict_CheckExact(x)) { 11723 PyErr_SetString(PyExc_TypeError, "if you give only one argument " 11724 "to maketrans it must be a dict"); 11725 goto err; 11726 } 11727 /* copy entries into the new dict, converting string keys to int keys */ 11728 while (PyDict_Next(x, &i, &key, &value)) { 11729 if (PyUnicode_Check(key)) { 11730 /* convert string keys to integer keys */ 11731 PyObject *newkey; 11732 if (PyUnicode_GET_SIZE(key) != 1) { 11733 PyErr_SetString(PyExc_ValueError, "string keys in translate " 11734 "table must be of length 1"); 11735 goto err; 11736 } 11737 kind = PyUnicode_KIND(key); 11738 data = PyUnicode_DATA(key); 11739 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0)); 11740 if (!newkey) 11741 goto err; 11742 res = PyDict_SetItem(new, newkey, value); 11743 Py_DECREF(newkey); 11744 if (res < 0) 11745 goto err; 11746 } else if (PyLong_Check(key)) { 11747 /* just keep integer keys */ 11748 if (PyDict_SetItem(new, key, value) < 0) 
11749 goto err; 11750 } else { 11751 PyErr_SetString(PyExc_TypeError, "keys in translate table must " 11752 "be strings or integers"); 11753 goto err; 11754 } 11755 } 11756 } 11757 return new; 11758 err: 11759 Py_DECREF(new); 11760 return NULL; 11761} 11762 11763PyDoc_STRVAR(translate__doc__, 11764 "S.translate(table) -> str\n\ 11765\n\ 11766Return a copy of the string S, where all characters have been mapped\n\ 11767through the given translation table, which must be a mapping of\n\ 11768Unicode ordinals to Unicode ordinals, strings, or None.\n\ 11769Unmapped characters are left untouched. Characters mapped to None\n\ 11770are deleted."); 11771 11772static PyObject* 11773unicode_translate(PyObject *self, PyObject *table) 11774{ 11775 return _PyUnicode_TranslateCharmap(self, table, "ignore"); 11776} 11777 11778PyDoc_STRVAR(upper__doc__, 11779 "S.upper() -> str\n\ 11780\n\ 11781Return a copy of S converted to uppercase."); 11782 11783static PyObject* 11784unicode_upper(PyUnicodeObject *self) 11785{ 11786 return fixup(self, fixupper); 11787} 11788 11789PyDoc_STRVAR(zfill__doc__, 11790 "S.zfill(width) -> str\n\ 11791\n\ 11792Pad a numeric string S with zeros on the left, to fill a field\n\ 11793of the specified width. 
The string S is never truncated."); 11794 11795static PyObject * 11796unicode_zfill(PyUnicodeObject *self, PyObject *args) 11797{ 11798 Py_ssize_t fill; 11799 PyUnicodeObject *u; 11800 Py_ssize_t width; 11801 int kind; 11802 void *data; 11803 Py_UCS4 chr; 11804 11805 if (PyUnicode_READY(self) == -1) 11806 return NULL; 11807 11808 if (!PyArg_ParseTuple(args, "n:zfill", &width)) 11809 return NULL; 11810 11811 if (PyUnicode_GET_LENGTH(self) >= width) { 11812 if (PyUnicode_CheckExact(self)) { 11813 Py_INCREF(self); 11814 return (PyObject*) self; 11815 } 11816 else 11817 return PyUnicode_Copy((PyObject*)self); 11818 } 11819 11820 fill = width - _PyUnicode_LENGTH(self); 11821 11822 u = pad(self, fill, 0, '0'); 11823 11824 if (u == NULL) 11825 return NULL; 11826 11827 kind = PyUnicode_KIND(u); 11828 data = PyUnicode_DATA(u); 11829 chr = PyUnicode_READ(kind, data, fill); 11830 11831 if (chr == '+' || chr == '-') { 11832 /* move sign to beginning of string */ 11833 PyUnicode_WRITE(kind, data, 0, chr); 11834 PyUnicode_WRITE(kind, data, fill, '0'); 11835 } 11836 11837 return (PyObject*) u; 11838} 11839 11840#if 0 11841static PyObject * 11842unicode__decimal2ascii(PyObject *self) 11843{ 11844 return PyUnicode_TransformDecimalAndSpaceToASCII(self); 11845} 11846#endif 11847 11848PyDoc_STRVAR(startswith__doc__, 11849 "S.startswith(prefix[, start[, end]]) -> bool\n\ 11850\n\ 11851Return True if S starts with the specified prefix, False otherwise.\n\ 11852With optional start, test S beginning at that position.\n\ 11853With optional end, stop comparing S at that position.\n\ 11854prefix can also be a tuple of strings to try."); 11855 11856static PyObject * 11857unicode_startswith(PyUnicodeObject *self, 11858 PyObject *args) 11859{ 11860 PyObject *subobj; 11861 PyUnicodeObject *substring; 11862 Py_ssize_t start = 0; 11863 Py_ssize_t end = PY_SSIZE_T_MAX; 11864 int result; 11865 11866 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end)) 11867 return NULL; 11868 
if (PyTuple_Check(subobj)) { 11869 Py_ssize_t i; 11870 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) { 11871 substring = (PyUnicodeObject *)PyUnicode_FromObject( 11872 PyTuple_GET_ITEM(subobj, i)); 11873 if (substring == NULL) 11874 return NULL; 11875 result = tailmatch(self, substring, start, end, -1); 11876 Py_DECREF(substring); 11877 if (result) { 11878 Py_RETURN_TRUE; 11879 } 11880 } 11881 /* nothing matched */ 11882 Py_RETURN_FALSE; 11883 } 11884 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj); 11885 if (substring == NULL) { 11886 if (PyErr_ExceptionMatches(PyExc_TypeError)) 11887 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or " 11888 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name); 11889 return NULL; 11890 } 11891 result = tailmatch(self, substring, start, end, -1); 11892 Py_DECREF(substring); 11893 return PyBool_FromLong(result); 11894} 11895 11896 11897PyDoc_STRVAR(endswith__doc__, 11898 "S.endswith(suffix[, start[, end]]) -> bool\n\ 11899\n\ 11900Return True if S ends with the specified suffix, False otherwise.\n\ 11901With optional start, test S beginning at that position.\n\ 11902With optional end, stop comparing S at that position.\n\ 11903suffix can also be a tuple of strings to try."); 11904 11905static PyObject * 11906unicode_endswith(PyUnicodeObject *self, 11907 PyObject *args) 11908{ 11909 PyObject *subobj; 11910 PyUnicodeObject *substring; 11911 Py_ssize_t start = 0; 11912 Py_ssize_t end = PY_SSIZE_T_MAX; 11913 int result; 11914 11915 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end)) 11916 return NULL; 11917 if (PyTuple_Check(subobj)) { 11918 Py_ssize_t i; 11919 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) { 11920 substring = (PyUnicodeObject *)PyUnicode_FromObject( 11921 PyTuple_GET_ITEM(subobj, i)); 11922 if (substring == NULL) 11923 return NULL; 11924 result = tailmatch(self, substring, start, end, +1); 11925 Py_DECREF(substring); 11926 if (result) { 11927 Py_RETURN_TRUE; 11928 } 
11929 } 11930 Py_RETURN_FALSE; 11931 } 11932 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj); 11933 if (substring == NULL) { 11934 if (PyErr_ExceptionMatches(PyExc_TypeError)) 11935 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or " 11936 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name); 11937 return NULL; 11938 } 11939 result = tailmatch(self, substring, start, end, +1); 11940 Py_DECREF(substring); 11941 return PyBool_FromLong(result); 11942} 11943 11944#include "stringlib/unicode_format.h" 11945 11946PyDoc_STRVAR(format__doc__, 11947 "S.format(*args, **kwargs) -> str\n\ 11948\n\ 11949Return a formatted version of S, using substitutions from args and kwargs.\n\ 11950The substitutions are identified by braces ('{' and '}')."); 11951 11952PyDoc_STRVAR(format_map__doc__, 11953 "S.format_map(mapping) -> str\n\ 11954\n\ 11955Return a formatted version of S, using substitutions from mapping.\n\ 11956The substitutions are identified by braces ('{' and '}')."); 11957 11958static PyObject * 11959unicode__format__(PyObject* self, PyObject* args) 11960{ 11961 PyObject *format_spec; 11962 11963 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec)) 11964 return NULL; 11965 11966 return _PyUnicode_FormatAdvanced(self, format_spec, 0, 11967 PyUnicode_GET_LENGTH(format_spec)); 11968} 11969 11970PyDoc_STRVAR(p_format__doc__, 11971 "S.__format__(format_spec) -> str\n\ 11972\n\ 11973Return a formatted version of S as described by format_spec."); 11974 11975static PyObject * 11976unicode__sizeof__(PyUnicodeObject *v) 11977{ 11978 Py_ssize_t size; 11979 11980 /* If it's a compact object, account for base structure + 11981 character data. 
*/ 11982 if (PyUnicode_IS_COMPACT_ASCII(v)) 11983 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1; 11984 else if (PyUnicode_IS_COMPACT(v)) 11985 size = sizeof(PyCompactUnicodeObject) + 11986 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_CHARACTER_SIZE(v); 11987 else { 11988 /* If it is a two-block object, account for base object, and 11989 for character block if present. */ 11990 size = sizeof(PyUnicodeObject); 11991 if (_PyUnicode_DATA_ANY(v)) 11992 size += (PyUnicode_GET_LENGTH(v) + 1) * 11993 PyUnicode_CHARACTER_SIZE(v); 11994 } 11995 /* If the wstr pointer is present, account for it unless it is shared 11996 with the data pointer. Check if the data is not shared. */ 11997 if (_PyUnicode_WSTR(v) && 11998 (PyUnicode_DATA(v) != _PyUnicode_WSTR(v))) 11999 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t); 12000 if (_PyUnicode_HAS_UTF8_MEMORY(v)) 12001 size += PyUnicode_UTF8_LENGTH(v) + 1; 12002 12003 return PyLong_FromSsize_t(size); 12004} 12005 12006PyDoc_STRVAR(sizeof__doc__, 12007 "S.__sizeof__() -> size of S in memory, in bytes"); 12008 12009static PyObject * 12010unicode_getnewargs(PyObject *v) 12011{ 12012 PyObject *copy = PyUnicode_Copy(v); 12013 if (!copy) 12014 return NULL; 12015 return Py_BuildValue("(N)", copy); 12016} 12017 12018static PyMethodDef unicode_methods[] = { 12019 12020 /* Order is according to common usage: often used methods should 12021 appear first, since lookup is done sequentially. 
*/ 12022 12023 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__}, 12024 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__}, 12025 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__}, 12026 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__}, 12027 {"join", (PyCFunction) unicode_join, METH_O, join__doc__}, 12028 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__}, 12029 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__}, 12030 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__}, 12031 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__}, 12032 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__}, 12033 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__}, 12034 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__}, 12035 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__}, 12036 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__}, 12037 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__}, 12038 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__}, 12039 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__}, 12040 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__}, 12041 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__}, 12042 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__}, 12043 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__}, 12044 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__}, 12045 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__}, 12046 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__}, 12047 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__}, 12048 {"upper", 
(PyCFunction) unicode_upper, METH_NOARGS, upper__doc__}, 12049 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__}, 12050 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__}, 12051 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__}, 12052 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__}, 12053 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__}, 12054 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__}, 12055 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__}, 12056 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__}, 12057 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__}, 12058 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__}, 12059 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__}, 12060 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__}, 12061 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__}, 12062 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__}, 12063 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__}, 12064 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__}, 12065 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__}, 12066 {"maketrans", (PyCFunction) unicode_maketrans, 12067 METH_VARARGS | METH_STATIC, maketrans__doc__}, 12068 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__}, 12069#if 0 12070 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__}, 12071#endif 12072 12073#if 0 12074 /* These methods are just used for debugging the implementation. 
*/ 12075 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS}, 12076#endif 12077 12078 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS}, 12079 {NULL, NULL} 12080}; 12081 12082static PyObject * 12083unicode_mod(PyObject *v, PyObject *w) 12084{ 12085 if (!PyUnicode_Check(v)) 12086 Py_RETURN_NOTIMPLEMENTED; 12087 return PyUnicode_Format(v, w); 12088} 12089 12090static PyNumberMethods unicode_as_number = { 12091 0, /*nb_add*/ 12092 0, /*nb_subtract*/ 12093 0, /*nb_multiply*/ 12094 unicode_mod, /*nb_remainder*/ 12095}; 12096 12097static PySequenceMethods unicode_as_sequence = { 12098 (lenfunc) unicode_length, /* sq_length */ 12099 PyUnicode_Concat, /* sq_concat */ 12100 (ssizeargfunc) unicode_repeat, /* sq_repeat */ 12101 (ssizeargfunc) unicode_getitem, /* sq_item */ 12102 0, /* sq_slice */ 12103 0, /* sq_ass_item */ 12104 0, /* sq_ass_slice */ 12105 PyUnicode_Contains, /* sq_contains */ 12106}; 12107 12108static PyObject* 12109unicode_subscript(PyUnicodeObject* self, PyObject* item) 12110{ 12111 if (PyUnicode_READY(self) == -1) 12112 return NULL; 12113 12114 if (PyIndex_Check(item)) { 12115 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError); 12116 if (i == -1 && PyErr_Occurred()) 12117 return NULL; 12118 if (i < 0) 12119 i += PyUnicode_GET_LENGTH(self); 12120 return unicode_getitem((PyObject*)self, i); 12121 } else if (PySlice_Check(item)) { 12122 Py_ssize_t start, stop, step, slicelength, cur, i; 12123 const Py_UNICODE* source_buf; 12124 Py_UNICODE* result_buf; 12125 PyObject* result; 12126 12127 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self), 12128 &start, &stop, &step, &slicelength) < 0) { 12129 return NULL; 12130 } 12131 12132 if (slicelength <= 0) { 12133 return PyUnicode_New(0, 0); 12134 } else if (start == 0 && step == 1 && 12135 slicelength == PyUnicode_GET_LENGTH(self) && 12136 PyUnicode_CheckExact(self)) { 12137 Py_INCREF(self); 12138 return (PyObject *)self; 12139 } else if (step == 1) { 12140 return 
PyUnicode_Substring((PyObject*)self, 12141 start, start + slicelength); 12142 } else { 12143 source_buf = PyUnicode_AS_UNICODE((PyObject*)self); 12144 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength* 12145 sizeof(Py_UNICODE)); 12146 12147 if (result_buf == NULL) 12148 return PyErr_NoMemory(); 12149 12150 for (cur = start, i = 0; i < slicelength; cur += step, i++) { 12151 result_buf[i] = source_buf[cur]; 12152 } 12153 12154 result = PyUnicode_FromUnicode(result_buf, slicelength); 12155 PyObject_FREE(result_buf); 12156 return result; 12157 } 12158 } else { 12159 PyErr_SetString(PyExc_TypeError, "string indices must be integers"); 12160 return NULL; 12161 } 12162} 12163 12164static PyMappingMethods unicode_as_mapping = { 12165 (lenfunc)unicode_length, /* mp_length */ 12166 (binaryfunc)unicode_subscript, /* mp_subscript */ 12167 (objobjargproc)0, /* mp_ass_subscript */ 12168}; 12169 12170 12171/* Helpers for PyUnicode_Format() */ 12172 12173static PyObject * 12174getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx) 12175{ 12176 Py_ssize_t argidx = *p_argidx; 12177 if (argidx < arglen) { 12178 (*p_argidx)++; 12179 if (arglen < 0) 12180 return args; 12181 else 12182 return PyTuple_GetItem(args, argidx); 12183 } 12184 PyErr_SetString(PyExc_TypeError, 12185 "not enough arguments for format string"); 12186 return NULL; 12187} 12188 12189/* Returns a new reference to a PyUnicode object, or NULL on failure. */ 12190 12191static PyObject * 12192formatfloat(PyObject *v, int flags, int prec, int type) 12193{ 12194 char *p; 12195 PyObject *result; 12196 double x; 12197 12198 x = PyFloat_AsDouble(v); 12199 if (x == -1.0 && PyErr_Occurred()) 12200 return NULL; 12201 12202 if (prec < 0) 12203 prec = 6; 12204 12205 p = PyOS_double_to_string(x, type, prec, 12206 (flags & F_ALT) ? 
Py_DTSF_ALT : 0, NULL); 12207 if (p == NULL) 12208 return NULL; 12209 result = PyUnicode_DecodeASCII(p, strlen(p), NULL); 12210 PyMem_Free(p); 12211 return result; 12212} 12213 12214static PyObject* 12215formatlong(PyObject *val, int flags, int prec, int type) 12216{ 12217 char *buf; 12218 int len; 12219 PyObject *str; /* temporary string object. */ 12220 PyObject *result; 12221 12222 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len); 12223 if (!str) 12224 return NULL; 12225 result = PyUnicode_DecodeASCII(buf, len, NULL); 12226 Py_DECREF(str); 12227 return result; 12228} 12229 12230static int 12231formatchar(Py_UCS4 *buf, 12232 size_t buflen, 12233 PyObject *v) 12234{ 12235 /* presume that the buffer is at least 3 characters long */ 12236 if (PyUnicode_Check(v)) { 12237 if (PyUnicode_GET_LENGTH(v) == 1) { 12238 buf[0] = PyUnicode_READ_CHAR(v, 0); 12239 buf[1] = '\0'; 12240 return 1; 12241 } 12242 goto onError; 12243 } 12244 else { 12245 /* Integer input truncated to a character */ 12246 long x; 12247 x = PyLong_AsLong(v); 12248 if (x == -1 && PyErr_Occurred()) 12249 goto onError; 12250 12251 if (x < 0 || x > 0x10ffff) { 12252 PyErr_SetString(PyExc_OverflowError, 12253 "%c arg not in range(0x110000)"); 12254 return -1; 12255 } 12256 12257 buf[0] = (Py_UCS4) x; 12258 buf[1] = '\0'; 12259 return 1; 12260 } 12261 12262 onError: 12263 PyErr_SetString(PyExc_TypeError, 12264 "%c requires int or char"); 12265 return -1; 12266} 12267 12268/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...) 12269 FORMATBUFLEN is the length of the buffer in which chars are formatted. 
12270*/ 12271#define FORMATBUFLEN (size_t)10 12272 12273PyObject * 12274PyUnicode_Format(PyObject *format, PyObject *args) 12275{ 12276 void *fmt; 12277 int fmtkind; 12278 PyObject *result; 12279 Py_UCS4 *res, *res0; 12280 Py_UCS4 max; 12281 int kind; 12282 Py_ssize_t fmtcnt, fmtpos, rescnt, reslen, arglen, argidx; 12283 int args_owned = 0; 12284 PyObject *dict = NULL; 12285 PyUnicodeObject *uformat; 12286 12287 if (format == NULL || args == NULL) { 12288 PyErr_BadInternalCall(); 12289 return NULL; 12290 } 12291 uformat = (PyUnicodeObject*)PyUnicode_FromObject(format); 12292 if (uformat == NULL || PyUnicode_READY(uformat) == -1) 12293 return NULL; 12294 fmt = PyUnicode_DATA(uformat); 12295 fmtkind = PyUnicode_KIND(uformat); 12296 fmtcnt = PyUnicode_GET_LENGTH(uformat); 12297 fmtpos = 0; 12298 12299 reslen = rescnt = fmtcnt + 100; 12300 res = res0 = PyMem_Malloc(reslen * sizeof(Py_UCS4)); 12301 if (res0 == NULL) { 12302 PyErr_NoMemory(); 12303 goto onError; 12304 } 12305 12306 if (PyTuple_Check(args)) { 12307 arglen = PyTuple_Size(args); 12308 argidx = 0; 12309 } 12310 else { 12311 arglen = -1; 12312 argidx = -2; 12313 } 12314 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) && 12315 !PyUnicode_Check(args)) 12316 dict = args; 12317 12318 while (--fmtcnt >= 0) { 12319 if (PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') { 12320 if (--rescnt < 0) { 12321 rescnt = fmtcnt + 100; 12322 reslen += rescnt; 12323 res0 = PyMem_Realloc(res0, reslen*sizeof(Py_UCS4)); 12324 if (res0 == NULL){ 12325 PyErr_NoMemory(); 12326 goto onError; 12327 } 12328 res = res0 + reslen - rescnt; 12329 --rescnt; 12330 } 12331 *res++ = PyUnicode_READ(fmtkind, fmt, fmtpos++); 12332 } 12333 else { 12334 /* Got a format specifier */ 12335 int flags = 0; 12336 Py_ssize_t width = -1; 12337 int prec = -1; 12338 Py_UCS4 c = '\0'; 12339 Py_UCS4 fill; 12340 int isnumok; 12341 PyObject *v = NULL; 12342 PyObject *temp = NULL; 12343 void *pbuf; 12344 Py_ssize_t pindex; 12345 Py_UNICODE sign; 12346 
Py_ssize_t len, len1; 12347 Py_UCS4 formatbuf[FORMATBUFLEN]; /* For formatchar() */ 12348 12349 fmtpos++; 12350 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(') { 12351 Py_ssize_t keystart; 12352 Py_ssize_t keylen; 12353 PyObject *key; 12354 int pcount = 1; 12355 12356 if (dict == NULL) { 12357 PyErr_SetString(PyExc_TypeError, 12358 "format requires a mapping"); 12359 goto onError; 12360 } 12361 ++fmtpos; 12362 --fmtcnt; 12363 keystart = fmtpos; 12364 /* Skip over balanced parentheses */ 12365 while (pcount > 0 && --fmtcnt >= 0) { 12366 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == ')') 12367 --pcount; 12368 else if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(') 12369 ++pcount; 12370 fmtpos++; 12371 } 12372 keylen = fmtpos - keystart - 1; 12373 if (fmtcnt < 0 || pcount > 0) { 12374 PyErr_SetString(PyExc_ValueError, 12375 "incomplete format key"); 12376 goto onError; 12377 } 12378 key = PyUnicode_Substring((PyObject*)uformat, 12379 keystart, keystart + keylen); 12380 if (key == NULL) 12381 goto onError; 12382 if (args_owned) { 12383 Py_DECREF(args); 12384 args_owned = 0; 12385 } 12386 args = PyObject_GetItem(dict, key); 12387 Py_DECREF(key); 12388 if (args == NULL) { 12389 goto onError; 12390 } 12391 args_owned = 1; 12392 arglen = -1; 12393 argidx = -2; 12394 } 12395 while (--fmtcnt >= 0) { 12396 switch (c = PyUnicode_READ(fmtkind, fmt, fmtpos++)) { 12397 case '-': flags |= F_LJUST; continue; 12398 case '+': flags |= F_SIGN; continue; 12399 case ' ': flags |= F_BLANK; continue; 12400 case '#': flags |= F_ALT; continue; 12401 case '0': flags |= F_ZERO; continue; 12402 } 12403 break; 12404 } 12405 if (c == '*') { 12406 v = getnextarg(args, arglen, &argidx); 12407 if (v == NULL) 12408 goto onError; 12409 if (!PyLong_Check(v)) { 12410 PyErr_SetString(PyExc_TypeError, 12411 "* wants int"); 12412 goto onError; 12413 } 12414 width = PyLong_AsLong(v); 12415 if (width == -1 && PyErr_Occurred()) 12416 goto onError; 12417 if (width < 0) { 12418 flags |= F_LJUST; 12419 width = 
-width; 12420 } 12421 if (--fmtcnt >= 0) 12422 c = PyUnicode_READ(fmtkind, fmt, fmtpos++); 12423 } 12424 else if (c >= '0' && c <= '9') { 12425 width = c - '0'; 12426 while (--fmtcnt >= 0) { 12427 c = PyUnicode_READ(fmtkind, fmt, fmtpos++); 12428 if (c < '0' || c > '9') 12429 break; 12430 if ((width*10) / 10 != width) { 12431 PyErr_SetString(PyExc_ValueError, 12432 "width too big"); 12433 goto onError; 12434 } 12435 width = width*10 + (c - '0'); 12436 } 12437 } 12438 if (c == '.') { 12439 prec = 0; 12440 if (--fmtcnt >= 0) 12441 c = PyUnicode_READ(fmtkind, fmt, fmtpos++); 12442 if (c == '*') { 12443 v = getnextarg(args, arglen, &argidx); 12444 if (v == NULL) 12445 goto onError; 12446 if (!PyLong_Check(v)) { 12447 PyErr_SetString(PyExc_TypeError, 12448 "* wants int"); 12449 goto onError; 12450 } 12451 prec = PyLong_AsLong(v); 12452 if (prec == -1 && PyErr_Occurred()) 12453 goto onError; 12454 if (prec < 0) 12455 prec = 0; 12456 if (--fmtcnt >= 0) 12457 c = PyUnicode_READ(fmtkind, fmt, fmtpos++); 12458 } 12459 else if (c >= '0' && c <= '9') { 12460 prec = c - '0'; 12461 while (--fmtcnt >= 0) { 12462 c = PyUnicode_READ(fmtkind, fmt, fmtpos++); 12463 if (c < '0' || c > '9') 12464 break; 12465 if ((prec*10) / 10 != prec) { 12466 PyErr_SetString(PyExc_ValueError, 12467 "prec too big"); 12468 goto onError; 12469 } 12470 prec = prec*10 + (c - '0'); 12471 } 12472 } 12473 } /* prec */ 12474 if (fmtcnt >= 0) { 12475 if (c == 'h' || c == 'l' || c == 'L') { 12476 if (--fmtcnt >= 0) 12477 c = PyUnicode_READ(fmtkind, fmt, fmtpos++); 12478 } 12479 } 12480 if (fmtcnt < 0) { 12481 PyErr_SetString(PyExc_ValueError, 12482 "incomplete format"); 12483 goto onError; 12484 } 12485 if (c != '%') { 12486 v = getnextarg(args, arglen, &argidx); 12487 if (v == NULL) 12488 goto onError; 12489 } 12490 sign = 0; 12491 fill = ' '; 12492 switch (c) { 12493 12494 case '%': 12495 pbuf = formatbuf; 12496 kind = PyUnicode_4BYTE_KIND; 12497 /* presume that buffer length is at least 1 */ 12498 
PyUnicode_WRITE(kind, pbuf, 0, '%'); 12499 len = 1; 12500 break; 12501 12502 case 's': 12503 case 'r': 12504 case 'a': 12505 if (PyUnicode_CheckExact(v) && c == 's') { 12506 temp = v; 12507 Py_INCREF(temp); 12508 } 12509 else { 12510 if (c == 's') 12511 temp = PyObject_Str(v); 12512 else if (c == 'r') 12513 temp = PyObject_Repr(v); 12514 else 12515 temp = PyObject_ASCII(v); 12516 if (temp == NULL) 12517 goto onError; 12518 if (PyUnicode_Check(temp)) 12519 /* nothing to do */; 12520 else { 12521 Py_DECREF(temp); 12522 PyErr_SetString(PyExc_TypeError, 12523 "%s argument has non-string str()"); 12524 goto onError; 12525 } 12526 } 12527 if (PyUnicode_READY(temp) == -1) { 12528 Py_CLEAR(temp); 12529 goto onError; 12530 } 12531 pbuf = PyUnicode_DATA(temp); 12532 kind = PyUnicode_KIND(temp); 12533 len = PyUnicode_GET_LENGTH(temp); 12534 if (prec >= 0 && len > prec) 12535 len = prec; 12536 break; 12537 12538 case 'i': 12539 case 'd': 12540 case 'u': 12541 case 'o': 12542 case 'x': 12543 case 'X': 12544 isnumok = 0; 12545 if (PyNumber_Check(v)) { 12546 PyObject *iobj=NULL; 12547 12548 if (PyLong_Check(v)) { 12549 iobj = v; 12550 Py_INCREF(iobj); 12551 } 12552 else { 12553 iobj = PyNumber_Long(v); 12554 } 12555 if (iobj!=NULL) { 12556 if (PyLong_Check(iobj)) { 12557 isnumok = 1; 12558 temp = formatlong(iobj, flags, prec, (c == 'i'? 
'd': c)); 12559 Py_DECREF(iobj); 12560 if (!temp) 12561 goto onError; 12562 if (PyUnicode_READY(temp) == -1) { 12563 Py_CLEAR(temp); 12564 goto onError; 12565 } 12566 pbuf = PyUnicode_DATA(temp); 12567 kind = PyUnicode_KIND(temp); 12568 len = PyUnicode_GET_LENGTH(temp); 12569 sign = 1; 12570 } 12571 else { 12572 Py_DECREF(iobj); 12573 } 12574 } 12575 } 12576 if (!isnumok) { 12577 PyErr_Format(PyExc_TypeError, 12578 "%%%c format: a number is required, " 12579 "not %.200s", (char)c, Py_TYPE(v)->tp_name); 12580 goto onError; 12581 } 12582 if (flags & F_ZERO) 12583 fill = '0'; 12584 break; 12585 12586 case 'e': 12587 case 'E': 12588 case 'f': 12589 case 'F': 12590 case 'g': 12591 case 'G': 12592 temp = formatfloat(v, flags, prec, c); 12593 if (!temp) 12594 goto onError; 12595 if (PyUnicode_READY(temp) == -1) { 12596 Py_CLEAR(temp); 12597 goto onError; 12598 } 12599 pbuf = PyUnicode_DATA(temp); 12600 kind = PyUnicode_KIND(temp); 12601 len = PyUnicode_GET_LENGTH(temp); 12602 sign = 1; 12603 if (flags & F_ZERO) 12604 fill = '0'; 12605 break; 12606 12607 case 'c': 12608 pbuf = formatbuf; 12609 kind = PyUnicode_4BYTE_KIND; 12610 len = formatchar(pbuf, Py_ARRAY_LENGTH(formatbuf), v); 12611 if (len < 0) 12612 goto onError; 12613 break; 12614 12615 default: 12616 PyErr_Format(PyExc_ValueError, 12617 "unsupported format character '%c' (0x%x) " 12618 "at index %zd", 12619 (31<=c && c<=126) ? (char)c : '?', 12620 (int)c, 12621 fmtpos - 1); 12622 goto onError; 12623 } 12624 /* pbuf is initialized here. 
*/ 12625 pindex = 0; 12626 if (sign) { 12627 if (PyUnicode_READ(kind, pbuf, pindex) == '-' || 12628 PyUnicode_READ(kind, pbuf, pindex) == '+') { 12629 sign = PyUnicode_READ(kind, pbuf, pindex++); 12630 len--; 12631 } 12632 else if (flags & F_SIGN) 12633 sign = '+'; 12634 else if (flags & F_BLANK) 12635 sign = ' '; 12636 else 12637 sign = 0; 12638 } 12639 if (width < len) 12640 width = len; 12641 if (rescnt - (sign != 0) < width) { 12642 reslen -= rescnt; 12643 rescnt = width + fmtcnt + 100; 12644 reslen += rescnt; 12645 if (reslen < 0) { 12646 Py_XDECREF(temp); 12647 PyErr_NoMemory(); 12648 goto onError; 12649 } 12650 res0 = PyMem_Realloc(res0, reslen*sizeof(Py_UCS4)); 12651 if (res0 == 0) { 12652 PyErr_NoMemory(); 12653 Py_XDECREF(temp); 12654 goto onError; 12655 } 12656 res = res0 + reslen - rescnt; 12657 } 12658 if (sign) { 12659 if (fill != ' ') 12660 *res++ = sign; 12661 rescnt--; 12662 if (width > len) 12663 width--; 12664 } 12665 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) { 12666 assert(PyUnicode_READ(kind, pbuf, pindex) == '0'); 12667 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c); 12668 if (fill != ' ') { 12669 *res++ = PyUnicode_READ(kind, pbuf, pindex++); 12670 *res++ = PyUnicode_READ(kind, pbuf, pindex++); 12671 } 12672 rescnt -= 2; 12673 width -= 2; 12674 if (width < 0) 12675 width = 0; 12676 len -= 2; 12677 } 12678 if (width > len && !(flags & F_LJUST)) { 12679 do { 12680 --rescnt; 12681 *res++ = fill; 12682 } while (--width > len); 12683 } 12684 if (fill == ' ') { 12685 if (sign) 12686 *res++ = sign; 12687 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) { 12688 assert(PyUnicode_READ(kind, pbuf, pindex) == '0'); 12689 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c); 12690 *res++ = PyUnicode_READ(kind, pbuf, pindex++); 12691 *res++ = PyUnicode_READ(kind, pbuf, pindex++); 12692 } 12693 } 12694 /* Copy all characters, preserving len */ 12695 len1 = len; 12696 while (len1--) { 12697 *res++ = PyUnicode_READ(kind, pbuf, 
pindex++); 12698 rescnt--; 12699 } 12700 while (--width >= len) { 12701 --rescnt; 12702 *res++ = ' '; 12703 } 12704 if (dict && (argidx < arglen) && c != '%') { 12705 PyErr_SetString(PyExc_TypeError, 12706 "not all arguments converted during string formatting"); 12707 Py_XDECREF(temp); 12708 goto onError; 12709 } 12710 Py_XDECREF(temp); 12711 } /* '%' */ 12712 } /* until end */ 12713 if (argidx < arglen && !dict) { 12714 PyErr_SetString(PyExc_TypeError, 12715 "not all arguments converted during string formatting"); 12716 goto onError; 12717 } 12718 12719 12720 for (max=0, res = res0; res < res0+reslen-rescnt; res++) 12721 if (*res > max) 12722 max = *res; 12723 result = PyUnicode_New(reslen - rescnt, max); 12724 if (!result) 12725 goto onError; 12726 kind = PyUnicode_KIND(result); 12727 for (res = res0; res < res0+reslen-rescnt; res++) 12728 PyUnicode_WRITE(kind, PyUnicode_DATA(result), res-res0, *res); 12729 PyMem_Free(res0); 12730 if (args_owned) { 12731 Py_DECREF(args); 12732 } 12733 Py_DECREF(uformat); 12734 return (PyObject *)result; 12735 12736 onError: 12737 PyMem_Free(res0); 12738 Py_DECREF(uformat); 12739 if (args_owned) { 12740 Py_DECREF(args); 12741 } 12742 return NULL; 12743} 12744 12745static PyObject * 12746unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds); 12747 12748static PyObject * 12749unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds) 12750{ 12751 PyObject *x = NULL; 12752 static char *kwlist[] = {"object", "encoding", "errors", 0}; 12753 char *encoding = NULL; 12754 char *errors = NULL; 12755 12756 if (type != &PyUnicode_Type) 12757 return unicode_subtype_new(type, args, kwds); 12758 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str", 12759 kwlist, &x, &encoding, &errors)) 12760 return NULL; 12761 if (x == NULL) 12762 return (PyObject *)PyUnicode_New(0, 0); 12763 if (encoding == NULL && errors == NULL) 12764 return PyObject_Str(x); 12765 else 12766 return PyUnicode_FromEncodedObject(x, encoding, errors); 
12767} 12768 12769static PyObject * 12770unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds) 12771{ 12772 PyUnicodeObject *unicode, *self; 12773 Py_ssize_t length, char_size; 12774 int share_wstr, share_utf8; 12775 unsigned int kind; 12776 void *data; 12777 12778 assert(PyType_IsSubtype(type, &PyUnicode_Type)); 12779 12780 unicode = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds); 12781 if (unicode == NULL) 12782 return NULL; 12783 assert(_PyUnicode_CHECK(unicode)); 12784 if (PyUnicode_READY(unicode)) 12785 return NULL; 12786 12787 self = (PyUnicodeObject *) type->tp_alloc(type, 0); 12788 if (self == NULL) { 12789 Py_DECREF(unicode); 12790 return NULL; 12791 } 12792 kind = PyUnicode_KIND(unicode); 12793 length = PyUnicode_GET_LENGTH(unicode); 12794 12795 _PyUnicode_LENGTH(self) = length; 12796 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode); 12797 _PyUnicode_STATE(self).interned = 0; 12798 _PyUnicode_STATE(self).kind = kind; 12799 _PyUnicode_STATE(self).compact = 0; 12800 _PyUnicode_STATE(self).ascii = 0; 12801 _PyUnicode_STATE(self).ready = 1; 12802 _PyUnicode_WSTR(self) = NULL; 12803 _PyUnicode_UTF8_LENGTH(self) = 0; 12804 _PyUnicode_UTF8(self) = NULL; 12805 _PyUnicode_WSTR_LENGTH(self) = 0; 12806 _PyUnicode_DATA_ANY(self) = NULL; 12807 12808 share_utf8 = 0; 12809 share_wstr = 0; 12810 if (kind == PyUnicode_1BYTE_KIND) { 12811 char_size = 1; 12812 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128) 12813 share_utf8 = 1; 12814 } 12815 else if (kind == PyUnicode_2BYTE_KIND) { 12816 char_size = 2; 12817 if (sizeof(wchar_t) == 2) 12818 share_wstr = 1; 12819 } 12820 else { 12821 assert(kind == PyUnicode_4BYTE_KIND); 12822 char_size = 4; 12823 if (sizeof(wchar_t) == 4) 12824 share_wstr = 1; 12825 } 12826 12827 /* Ensure we won't overflow the length. 
*/ 12828 if (length > (PY_SSIZE_T_MAX / char_size - 1)) { 12829 PyErr_NoMemory(); 12830 goto onError; 12831 } 12832 data = PyObject_MALLOC((length + 1) * char_size); 12833 if (data == NULL) { 12834 PyErr_NoMemory(); 12835 goto onError; 12836 } 12837 12838 _PyUnicode_DATA_ANY(self) = data; 12839 if (share_utf8) { 12840 _PyUnicode_UTF8_LENGTH(self) = length; 12841 _PyUnicode_UTF8(self) = data; 12842 } 12843 if (share_wstr) { 12844 _PyUnicode_WSTR_LENGTH(self) = length; 12845 _PyUnicode_WSTR(self) = (wchar_t *)data; 12846 } 12847 12848 Py_MEMCPY(data, PyUnicode_DATA(unicode), 12849 PyUnicode_KIND_SIZE(kind, length + 1)); 12850 Py_DECREF(unicode); 12851 return (PyObject *)self; 12852 12853onError: 12854 Py_DECREF(unicode); 12855 Py_DECREF(self); 12856 return NULL; 12857} 12858 12859PyDoc_STRVAR(unicode_doc, 12860 "str(string[, encoding[, errors]]) -> str\n\ 12861\n\ 12862Create a new string object from the given encoded string.\n\ 12863encoding defaults to the current default string encoding.\n\ 12864errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'."); 12865 12866static PyObject *unicode_iter(PyObject *seq); 12867 12868PyTypeObject PyUnicode_Type = { 12869 PyVarObject_HEAD_INIT(&PyType_Type, 0) 12870 "str", /* tp_name */ 12871 sizeof(PyUnicodeObject), /* tp_size */ 12872 0, /* tp_itemsize */ 12873 /* Slots */ 12874 (destructor)unicode_dealloc, /* tp_dealloc */ 12875 0, /* tp_print */ 12876 0, /* tp_getattr */ 12877 0, /* tp_setattr */ 12878 0, /* tp_reserved */ 12879 unicode_repr, /* tp_repr */ 12880 &unicode_as_number, /* tp_as_number */ 12881 &unicode_as_sequence, /* tp_as_sequence */ 12882 &unicode_as_mapping, /* tp_as_mapping */ 12883 (hashfunc) unicode_hash, /* tp_hash*/ 12884 0, /* tp_call*/ 12885 (reprfunc) unicode_str, /* tp_str */ 12886 PyObject_GenericGetAttr, /* tp_getattro */ 12887 0, /* tp_setattro */ 12888 0, /* tp_as_buffer */ 12889 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE | 12890 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */ 12891 
unicode_doc, /* tp_doc */ 12892 0, /* tp_traverse */ 12893 0, /* tp_clear */ 12894 PyUnicode_RichCompare, /* tp_richcompare */ 12895 0, /* tp_weaklistoffset */ 12896 unicode_iter, /* tp_iter */ 12897 0, /* tp_iternext */ 12898 unicode_methods, /* tp_methods */ 12899 0, /* tp_members */ 12900 0, /* tp_getset */ 12901 &PyBaseObject_Type, /* tp_base */ 12902 0, /* tp_dict */ 12903 0, /* tp_descr_get */ 12904 0, /* tp_descr_set */ 12905 0, /* tp_dictoffset */ 12906 0, /* tp_init */ 12907 0, /* tp_alloc */ 12908 unicode_new, /* tp_new */ 12909 PyObject_Del, /* tp_free */ 12910}; 12911 12912/* Initialize the Unicode implementation */ 12913 12914void _PyUnicode_Init(void) 12915{ 12916 int i; 12917 12918 /* XXX - move this array to unicodectype.c ? */ 12919 Py_UCS2 linebreak[] = { 12920 0x000A, /* LINE FEED */ 12921 0x000D, /* CARRIAGE RETURN */ 12922 0x001C, /* FILE SEPARATOR */ 12923 0x001D, /* GROUP SEPARATOR */ 12924 0x001E, /* RECORD SEPARATOR */ 12925 0x0085, /* NEXT LINE */ 12926 0x2028, /* LINE SEPARATOR */ 12927 0x2029, /* PARAGRAPH SEPARATOR */ 12928 }; 12929 12930 /* Init the implementation */ 12931 unicode_empty = PyUnicode_New(0, 0); 12932 if (!unicode_empty) 12933 Py_FatalError("Can't create empty string"); 12934 12935 for (i = 0; i < 256; i++) 12936 unicode_latin1[i] = NULL; 12937 if (PyType_Ready(&PyUnicode_Type) < 0) 12938 Py_FatalError("Can't initialize 'unicode'"); 12939 12940 /* initialize the linebreak bloom filter */ 12941 bloom_linebreak = make_bloom_mask( 12942 PyUnicode_2BYTE_KIND, linebreak, 12943 Py_ARRAY_LENGTH(linebreak)); 12944 12945 PyType_Ready(&EncodingMapType); 12946} 12947 12948/* Finalize the Unicode implementation */ 12949 12950int 12951PyUnicode_ClearFreeList(void) 12952{ 12953 return 0; 12954} 12955 12956void 12957_PyUnicode_Fini(void) 12958{ 12959 int i; 12960 12961 Py_XDECREF(unicode_empty); 12962 unicode_empty = NULL; 12963 12964 for (i = 0; i < 256; i++) { 12965 if (unicode_latin1[i]) { 12966 Py_DECREF(unicode_latin1[i]); 12967 
unicode_latin1[i] = NULL; 12968 } 12969 } 12970 (void)PyUnicode_ClearFreeList(); 12971} 12972 12973void 12974PyUnicode_InternInPlace(PyObject **p) 12975{ 12976 register PyUnicodeObject *s = (PyUnicodeObject *)(*p); 12977 PyObject *t; 12978#ifdef Py_DEBUG 12979 assert(s != NULL); 12980 assert(_PyUnicode_CHECK(s)); 12981#else 12982 if (s == NULL || !PyUnicode_Check(s)) 12983 return; 12984#endif 12985 /* If it's a subclass, we don't really know what putting 12986 it in the interned dict might do. */ 12987 if (!PyUnicode_CheckExact(s)) 12988 return; 12989 if (PyUnicode_CHECK_INTERNED(s)) 12990 return; 12991 if (PyUnicode_READY(s) == -1) { 12992 assert(0 && "PyUnicode_READY fail in PyUnicode_InternInPlace"); 12993 return; 12994 } 12995 if (interned == NULL) { 12996 interned = PyDict_New(); 12997 if (interned == NULL) { 12998 PyErr_Clear(); /* Don't leave an exception */ 12999 return; 13000 } 13001 } 13002 /* It might be that the GetItem call fails even 13003 though the key is present in the dictionary, 13004 namely when this happens during a stack overflow. */ 13005 Py_ALLOW_RECURSION 13006 t = PyDict_GetItem(interned, (PyObject *)s); 13007 Py_END_ALLOW_RECURSION 13008 13009 if (t) { 13010 Py_INCREF(t); 13011 Py_DECREF(*p); 13012 *p = t; 13013 return; 13014 } 13015 13016 PyThreadState_GET()->recursion_critical = 1; 13017 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) { 13018 PyErr_Clear(); 13019 PyThreadState_GET()->recursion_critical = 0; 13020 return; 13021 } 13022 PyThreadState_GET()->recursion_critical = 0; 13023 /* The two references in interned are not counted by refcnt. 
13024 The deallocator will take care of this */ 13025 Py_REFCNT(s) -= 2; 13026 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL; 13027} 13028 13029void 13030PyUnicode_InternImmortal(PyObject **p) 13031{ 13032 PyUnicodeObject *u = (PyUnicodeObject *)*p; 13033 13034 PyUnicode_InternInPlace(p); 13035 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) { 13036 _PyUnicode_STATE(u).interned = SSTATE_INTERNED_IMMORTAL; 13037 Py_INCREF(*p); 13038 } 13039} 13040 13041PyObject * 13042PyUnicode_InternFromString(const char *cp) 13043{ 13044 PyObject *s = PyUnicode_FromString(cp); 13045 if (s == NULL) 13046 return NULL; 13047 PyUnicode_InternInPlace(&s); 13048 return s; 13049} 13050 13051void 13052_Py_ReleaseInternedUnicodeStrings(void) 13053{ 13054 PyObject *keys; 13055 PyUnicodeObject *s; 13056 Py_ssize_t i, n; 13057 Py_ssize_t immortal_size = 0, mortal_size = 0; 13058 13059 if (interned == NULL || !PyDict_Check(interned)) 13060 return; 13061 keys = PyDict_Keys(interned); 13062 if (keys == NULL || !PyList_Check(keys)) { 13063 PyErr_Clear(); 13064 return; 13065 } 13066 13067 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak 13068 detector, interned unicode strings are not forcibly deallocated; 13069 rather, we give them their stolen references back, and then clear 13070 and DECREF the interned dict. 
*/ 13071 13072 n = PyList_GET_SIZE(keys); 13073 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n", 13074 n); 13075 for (i = 0; i < n; i++) { 13076 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i); 13077 if (PyUnicode_READY(s) == -1) 13078 fprintf(stderr, "could not ready string\n"); 13079 switch (PyUnicode_CHECK_INTERNED(s)) { 13080 case SSTATE_NOT_INTERNED: 13081 /* XXX Shouldn't happen */ 13082 break; 13083 case SSTATE_INTERNED_IMMORTAL: 13084 Py_REFCNT(s) += 1; 13085 immortal_size += PyUnicode_GET_LENGTH(s); 13086 break; 13087 case SSTATE_INTERNED_MORTAL: 13088 Py_REFCNT(s) += 2; 13089 mortal_size += PyUnicode_GET_LENGTH(s); 13090 break; 13091 default: 13092 Py_FatalError("Inconsistent interned string state."); 13093 } 13094 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED; 13095 } 13096 fprintf(stderr, "total size of all interned strings: " 13097 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d " 13098 "mortal/immortal\n", mortal_size, immortal_size); 13099 Py_DECREF(keys); 13100 PyDict_Clear(interned); 13101 Py_DECREF(interned); 13102 interned = NULL; 13103} 13104 13105 13106/********************* Unicode Iterator **************************/ 13107 13108typedef struct { 13109 PyObject_HEAD 13110 Py_ssize_t it_index; 13111 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */ 13112} unicodeiterobject; 13113 13114static void 13115unicodeiter_dealloc(unicodeiterobject *it) 13116{ 13117 _PyObject_GC_UNTRACK(it); 13118 Py_XDECREF(it->it_seq); 13119 PyObject_GC_Del(it); 13120} 13121 13122static int 13123unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg) 13124{ 13125 Py_VISIT(it->it_seq); 13126 return 0; 13127} 13128 13129static PyObject * 13130unicodeiter_next(unicodeiterobject *it) 13131{ 13132 PyUnicodeObject *seq; 13133 PyObject *item; 13134 13135 assert(it != NULL); 13136 seq = it->it_seq; 13137 if (seq == NULL) 13138 return NULL; 13139 assert(_PyUnicode_CHECK(seq)); 13140 13141 if (it->it_index < 
PyUnicode_GET_LENGTH(seq)) { 13142 int kind = PyUnicode_KIND(seq); 13143 void *data = PyUnicode_DATA(seq); 13144 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index); 13145 item = PyUnicode_FromOrdinal(chr); 13146 if (item != NULL) 13147 ++it->it_index; 13148 return item; 13149 } 13150 13151 Py_DECREF(seq); 13152 it->it_seq = NULL; 13153 return NULL; 13154} 13155 13156static PyObject * 13157unicodeiter_len(unicodeiterobject *it) 13158{ 13159 Py_ssize_t len = 0; 13160 if (it->it_seq) 13161 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index; 13162 return PyLong_FromSsize_t(len); 13163} 13164 13165PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it))."); 13166 13167static PyMethodDef unicodeiter_methods[] = { 13168 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS, 13169 length_hint_doc}, 13170 {NULL, NULL} /* sentinel */ 13171}; 13172 13173PyTypeObject PyUnicodeIter_Type = { 13174 PyVarObject_HEAD_INIT(&PyType_Type, 0) 13175 "str_iterator", /* tp_name */ 13176 sizeof(unicodeiterobject), /* tp_basicsize */ 13177 0, /* tp_itemsize */ 13178 /* methods */ 13179 (destructor)unicodeiter_dealloc, /* tp_dealloc */ 13180 0, /* tp_print */ 13181 0, /* tp_getattr */ 13182 0, /* tp_setattr */ 13183 0, /* tp_reserved */ 13184 0, /* tp_repr */ 13185 0, /* tp_as_number */ 13186 0, /* tp_as_sequence */ 13187 0, /* tp_as_mapping */ 13188 0, /* tp_hash */ 13189 0, /* tp_call */ 13190 0, /* tp_str */ 13191 PyObject_GenericGetAttr, /* tp_getattro */ 13192 0, /* tp_setattro */ 13193 0, /* tp_as_buffer */ 13194 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */ 13195 0, /* tp_doc */ 13196 (traverseproc)unicodeiter_traverse, /* tp_traverse */ 13197 0, /* tp_clear */ 13198 0, /* tp_richcompare */ 13199 0, /* tp_weaklistoffset */ 13200 PyObject_SelfIter, /* tp_iter */ 13201 (iternextfunc)unicodeiter_next, /* tp_iternext */ 13202 unicodeiter_methods, /* tp_methods */ 13203 0, 13204}; 13205 13206static PyObject * 13207unicode_iter(PyObject 
*seq) 13208{ 13209 unicodeiterobject *it; 13210 13211 if (!PyUnicode_Check(seq)) { 13212 PyErr_BadInternalCall(); 13213 return NULL; 13214 } 13215 if (PyUnicode_READY(seq) == -1) 13216 return NULL; 13217 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type); 13218 if (it == NULL) 13219 return NULL; 13220 it->it_index = 0; 13221 Py_INCREF(seq); 13222 it->it_seq = (PyUnicodeObject *)seq; 13223 _PyObject_GC_TRACK(it); 13224 return (PyObject *)it; 13225} 13226 13227#define UNIOP(x) Py_UNICODE_##x 13228#define UNIOP_t Py_UNICODE 13229#include "uniops.h" 13230#undef UNIOP 13231#undef UNIOP_t 13232#define UNIOP(x) Py_UCS4_##x 13233#define UNIOP_t Py_UCS4 13234#include "uniops.h" 13235#undef UNIOP 13236#undef UNIOP_t 13237 13238Py_UNICODE* 13239PyUnicode_AsUnicodeCopy(PyObject *object) 13240{ 13241 PyUnicodeObject *unicode = (PyUnicodeObject *)object; 13242 Py_UNICODE *copy; 13243 Py_ssize_t size; 13244 13245 if (!PyUnicode_Check(unicode)) { 13246 PyErr_BadArgument(); 13247 return NULL; 13248 } 13249 /* Ensure we won't overflow the size. */ 13250 if (PyUnicode_GET_SIZE(unicode) > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) { 13251 PyErr_NoMemory(); 13252 return NULL; 13253 } 13254 size = PyUnicode_GET_SIZE(unicode) + 1; /* copy the nul character */ 13255 size *= sizeof(Py_UNICODE); 13256 copy = PyMem_Malloc(size); 13257 if (copy == NULL) { 13258 PyErr_NoMemory(); 13259 return NULL; 13260 } 13261 memcpy(copy, PyUnicode_AS_UNICODE(unicode), size); 13262 return copy; 13263} 13264 13265/* A _string module, to export formatter_parser and formatter_field_name_split 13266 to the string.Formatter class implemented in Python. 
*/ 13267 13268static PyMethodDef _string_methods[] = { 13269 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split, 13270 METH_O, PyDoc_STR("split the argument as a field name")}, 13271 {"formatter_parser", (PyCFunction) formatter_parser, 13272 METH_O, PyDoc_STR("parse the argument as a format string")}, 13273 {NULL, NULL} 13274}; 13275 13276static struct PyModuleDef _string_module = { 13277 PyModuleDef_HEAD_INIT, 13278 "_string", 13279 PyDoc_STR("string helper module"), 13280 0, 13281 _string_methods, 13282 NULL, 13283 NULL, 13284 NULL, 13285 NULL 13286}; 13287 13288PyMODINIT_FUNC 13289PyInit__string(void) 13290{ 13291 return PyModule_Create(&_string_module); 13292} 13293 13294 13295#ifdef __cplusplus 13296} 13297#endif 13298