unicodeobject.c revision 1f7951711c16cca5f041288b81d01cb3021d0b7e
1/* 2 3Unicode implementation based on original code by Fredrik Lundh, 4modified by Marc-Andre Lemburg <mal@lemburg.com>. 5 6Major speed upgrades to the method implementations at the Reykjavik 7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke. 8 9Copyright (c) Corporation for National Research Initiatives. 10 11-------------------------------------------------------------------- 12The original string type implementation is: 13 14 Copyright (c) 1999 by Secret Labs AB 15 Copyright (c) 1999 by Fredrik Lundh 16 17By obtaining, using, and/or copying this software and/or its 18associated documentation, you agree that you have read, understood, 19and will comply with the following terms and conditions: 20 21Permission to use, copy, modify, and distribute this software and its 22associated documentation for any purpose and without fee is hereby 23granted, provided that the above copyright notice appears in all 24copies, and that both that copyright notice and this permission notice 25appear in supporting documentation, and that the name of Secret Labs 26AB or the author not be used in advertising or publicity pertaining to 27distribution of the software without specific, written prior 28permission. 29 30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO 31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND 32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR 33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT 36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
37-------------------------------------------------------------------- 38 39*/ 40 41#define PY_SSIZE_T_CLEAN 42#include "Python.h" 43#include "ucnhash.h" 44 45#ifdef MS_WINDOWS 46#include <windows.h> 47#endif 48 49#ifdef Py_DEBUG 50# define DONT_MAKE_RESULT_READY 51#endif 52 53/* Endianness switches; defaults to little endian */ 54 55#ifdef WORDS_BIGENDIAN 56# define BYTEORDER_IS_BIG_ENDIAN 57#else 58# define BYTEORDER_IS_LITTLE_ENDIAN 59#endif 60 61/* --- Globals ------------------------------------------------------------ 62 63 The globals are initialized by the _PyUnicode_Init() API and should 64 not be used before calling that API. 65 66*/ 67 68 69#ifdef __cplusplus 70extern "C" { 71#endif 72 73#ifdef Py_DEBUG 74# define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0) 75#else 76# define _PyUnicode_CHECK(op) PyUnicode_Check(op) 77#endif 78 79#define _PyUnicode_UTF8(op) \ 80 (((PyCompactUnicodeObject*)(op))->utf8) 81#define PyUnicode_UTF8(op) \ 82 (assert(_PyUnicode_CHECK(op)), \ 83 assert(PyUnicode_IS_READY(op)), \ 84 PyUnicode_IS_COMPACT_ASCII(op) ? \ 85 ((char*)((PyASCIIObject*)(op) + 1)) : \ 86 _PyUnicode_UTF8(op)) 87#define _PyUnicode_UTF8_LENGTH(op) \ 88 (((PyCompactUnicodeObject*)(op))->utf8_length) 89#define PyUnicode_UTF8_LENGTH(op) \ 90 (assert(_PyUnicode_CHECK(op)), \ 91 assert(PyUnicode_IS_READY(op)), \ 92 PyUnicode_IS_COMPACT_ASCII(op) ? 
\ 93 ((PyASCIIObject*)(op))->length : \ 94 _PyUnicode_UTF8_LENGTH(op)) 95#define _PyUnicode_WSTR(op) \ 96 (((PyASCIIObject*)(op))->wstr) 97#define _PyUnicode_WSTR_LENGTH(op) \ 98 (((PyCompactUnicodeObject*)(op))->wstr_length) 99#define _PyUnicode_LENGTH(op) \ 100 (((PyASCIIObject *)(op))->length) 101#define _PyUnicode_STATE(op) \ 102 (((PyASCIIObject *)(op))->state) 103#define _PyUnicode_HASH(op) \ 104 (((PyASCIIObject *)(op))->hash) 105#define _PyUnicode_KIND(op) \ 106 (assert(_PyUnicode_CHECK(op)), \ 107 ((PyASCIIObject *)(op))->state.kind) 108#define _PyUnicode_GET_LENGTH(op) \ 109 (assert(_PyUnicode_CHECK(op)), \ 110 ((PyASCIIObject *)(op))->length) 111#define _PyUnicode_DATA_ANY(op) \ 112 (((PyUnicodeObject*)(op))->data.any) 113 114#undef PyUnicode_READY 115#define PyUnicode_READY(op) \ 116 (assert(_PyUnicode_CHECK(op)), \ 117 (PyUnicode_IS_READY(op) ? \ 118 0 : \ 119 _PyUnicode_Ready(op))) 120 121#define _PyUnicode_READY_REPLACE(p_obj) \ 122 (assert(_PyUnicode_CHECK(*p_obj)), \ 123 (PyUnicode_IS_READY(*p_obj) ? 
\ 124 0 : _PyUnicode_ReadyReplace((PyObject **)(p_obj)))) 125 126#define _PyUnicode_SHARE_UTF8(op) \ 127 (assert(_PyUnicode_CHECK(op)), \ 128 assert(!PyUnicode_IS_COMPACT_ASCII(op)), \ 129 (_PyUnicode_UTF8(op) == PyUnicode_DATA(op))) 130#define _PyUnicode_SHARE_WSTR(op) \ 131 (assert(_PyUnicode_CHECK(op)), \ 132 (_PyUnicode_WSTR(unicode) == PyUnicode_DATA(op))) 133 134/* true if the Unicode object has an allocated UTF-8 memory block 135 (not shared with other data) */ 136#define _PyUnicode_HAS_UTF8_MEMORY(op) \ 137 (assert(_PyUnicode_CHECK(op)), \ 138 (!PyUnicode_IS_COMPACT_ASCII(op) \ 139 && _PyUnicode_UTF8(op) \ 140 && _PyUnicode_UTF8(op) != PyUnicode_DATA(op))) 141 142/* true if the Unicode object has an allocated wstr memory block 143 (not shared with other data) */ 144#define _PyUnicode_HAS_WSTR_MEMORY(op) \ 145 (assert(_PyUnicode_CHECK(op)), \ 146 (_PyUnicode_WSTR(op) && \ 147 (!PyUnicode_IS_READY(op) || \ 148 _PyUnicode_WSTR(op) != PyUnicode_DATA(op)))) 149 150/* Generic helper macro to convert characters of different types. 151 from_type and to_type have to be valid type names, begin and end 152 are pointers to the source characters which should be of type 153 "from_type *". to is a pointer of type "to_type *" and points to the 154 buffer where the result characters are written to. 
*/ 155#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \ 156 do { \ 157 to_type *_to = (to_type *) to; \ 158 const from_type *_iter = (begin); \ 159 const from_type *_end = (end); \ 160 Py_ssize_t n = (_end) - (_iter); \ 161 const from_type *_unrolled_end = \ 162 _iter + (n & ~ (Py_ssize_t) 3); \ 163 while (_iter < (_unrolled_end)) { \ 164 _to[0] = (to_type) _iter[0]; \ 165 _to[1] = (to_type) _iter[1]; \ 166 _to[2] = (to_type) _iter[2]; \ 167 _to[3] = (to_type) _iter[3]; \ 168 _iter += 4; _to += 4; \ 169 } \ 170 while (_iter < (_end)) \ 171 *_to++ = (to_type) *_iter++; \ 172 } while (0) 173 174/* The Unicode string has been modified: reset the hash */ 175#define _PyUnicode_DIRTY(op) do { _PyUnicode_HASH(op) = -1; } while (0) 176 177/* This dictionary holds all interned unicode strings. Note that references 178 to strings in this dictionary are *not* counted in the string's ob_refcnt. 179 When the interned string reaches a refcnt of 0 the string deallocation 180 function will delete the reference from this dictionary. 181 182 Another way to look at this is that to say that the actual reference 183 count of a string is: s->ob_refcnt + (s->state ? 2 : 0) 184*/ 185static PyObject *interned; 186 187/* The empty Unicode object is shared to improve performance. */ 188static PyObject *unicode_empty; 189 190/* List of static strings. */ 191static _Py_Identifier *static_strings; 192 193/* Single character Unicode strings in the Latin-1 range are being 194 shared as well. 
*/ 195static PyObject *unicode_latin1[256]; 196 197/* Fast detection of the most frequent whitespace characters */ 198const unsigned char _Py_ascii_whitespace[] = { 199 0, 0, 0, 0, 0, 0, 0, 0, 200/* case 0x0009: * CHARACTER TABULATION */ 201/* case 0x000A: * LINE FEED */ 202/* case 0x000B: * LINE TABULATION */ 203/* case 0x000C: * FORM FEED */ 204/* case 0x000D: * CARRIAGE RETURN */ 205 0, 1, 1, 1, 1, 1, 0, 0, 206 0, 0, 0, 0, 0, 0, 0, 0, 207/* case 0x001C: * FILE SEPARATOR */ 208/* case 0x001D: * GROUP SEPARATOR */ 209/* case 0x001E: * RECORD SEPARATOR */ 210/* case 0x001F: * UNIT SEPARATOR */ 211 0, 0, 0, 0, 1, 1, 1, 1, 212/* case 0x0020: * SPACE */ 213 1, 0, 0, 0, 0, 0, 0, 0, 214 0, 0, 0, 0, 0, 0, 0, 0, 215 0, 0, 0, 0, 0, 0, 0, 0, 216 0, 0, 0, 0, 0, 0, 0, 0, 217 218 0, 0, 0, 0, 0, 0, 0, 0, 219 0, 0, 0, 0, 0, 0, 0, 0, 220 0, 0, 0, 0, 0, 0, 0, 0, 221 0, 0, 0, 0, 0, 0, 0, 0, 222 0, 0, 0, 0, 0, 0, 0, 0, 223 0, 0, 0, 0, 0, 0, 0, 0, 224 0, 0, 0, 0, 0, 0, 0, 0, 225 0, 0, 0, 0, 0, 0, 0, 0 226}; 227 228/* forward */ 229static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length); 230static PyObject* get_latin1_char(unsigned char ch); 231static void copy_characters( 232 PyObject *to, Py_ssize_t to_start, 233 PyObject *from, Py_ssize_t from_start, 234 Py_ssize_t how_many); 235#ifdef Py_DEBUG 236static int unicode_is_singleton(PyObject *unicode); 237#endif 238 239static PyObject * 240unicode_fromascii(const unsigned char *s, Py_ssize_t size); 241static PyObject * 242_PyUnicode_FromUCS1(const unsigned char *s, Py_ssize_t size); 243static PyObject * 244_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size); 245static PyObject * 246_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size); 247 248static PyObject * 249unicode_encode_call_errorhandler(const char *errors, 250 PyObject **errorHandler,const char *encoding, const char *reason, 251 PyObject *unicode, PyObject **exceptionObject, 252 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos); 253 254static void 
255raise_encode_exception(PyObject **exceptionObject, 256 const char *encoding, 257 PyObject *unicode, 258 Py_ssize_t startpos, Py_ssize_t endpos, 259 const char *reason); 260 261/* Same for linebreaks */ 262static unsigned char ascii_linebreak[] = { 263 0, 0, 0, 0, 0, 0, 0, 0, 264/* 0x000A, * LINE FEED */ 265/* 0x000B, * LINE TABULATION */ 266/* 0x000C, * FORM FEED */ 267/* 0x000D, * CARRIAGE RETURN */ 268 0, 0, 1, 1, 1, 1, 0, 0, 269 0, 0, 0, 0, 0, 0, 0, 0, 270/* 0x001C, * FILE SEPARATOR */ 271/* 0x001D, * GROUP SEPARATOR */ 272/* 0x001E, * RECORD SEPARATOR */ 273 0, 0, 0, 0, 1, 1, 1, 0, 274 0, 0, 0, 0, 0, 0, 0, 0, 275 0, 0, 0, 0, 0, 0, 0, 0, 276 0, 0, 0, 0, 0, 0, 0, 0, 277 0, 0, 0, 0, 0, 0, 0, 0, 278 279 0, 0, 0, 0, 0, 0, 0, 0, 280 0, 0, 0, 0, 0, 0, 0, 0, 281 0, 0, 0, 0, 0, 0, 0, 0, 282 0, 0, 0, 0, 0, 0, 0, 0, 283 0, 0, 0, 0, 0, 0, 0, 0, 284 0, 0, 0, 0, 0, 0, 0, 0, 285 0, 0, 0, 0, 0, 0, 0, 0, 286 0, 0, 0, 0, 0, 0, 0, 0 287}; 288 289/* The max unicode value is always 0x10FFFF while using the PEP-393 API. 290 This function is kept for backward compatibility with the old API. */ 291Py_UNICODE 292PyUnicode_GetMax(void) 293{ 294#ifdef Py_UNICODE_WIDE 295 return 0x10FFFF; 296#else 297 /* This is actually an illegal character, so it should 298 not be passed to unichr. 
*/ 299 return 0xFFFF; 300#endif 301} 302 303#ifdef Py_DEBUG 304int 305_PyUnicode_CheckConsistency(PyObject *op, int check_content) 306{ 307 PyASCIIObject *ascii; 308 unsigned int kind; 309 310 assert(PyUnicode_Check(op)); 311 312 ascii = (PyASCIIObject *)op; 313 kind = ascii->state.kind; 314 315 if (ascii->state.ascii == 1 && ascii->state.compact == 1) { 316 assert(kind == PyUnicode_1BYTE_KIND); 317 assert(ascii->state.ready == 1); 318 } 319 else { 320 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op; 321 void *data; 322 323 if (ascii->state.compact == 1) { 324 data = compact + 1; 325 assert(kind == PyUnicode_1BYTE_KIND 326 || kind == PyUnicode_2BYTE_KIND 327 || kind == PyUnicode_4BYTE_KIND); 328 assert(ascii->state.ascii == 0); 329 assert(ascii->state.ready == 1); 330 assert (compact->utf8 != data); 331 } 332 else { 333 PyUnicodeObject *unicode = (PyUnicodeObject *)op; 334 335 data = unicode->data.any; 336 if (kind == PyUnicode_WCHAR_KIND) { 337 assert(ascii->length == 0); 338 assert(ascii->hash == -1); 339 assert(ascii->state.compact == 0); 340 assert(ascii->state.ascii == 0); 341 assert(ascii->state.ready == 0); 342 assert(ascii->state.interned == SSTATE_NOT_INTERNED); 343 assert(ascii->wstr != NULL); 344 assert(data == NULL); 345 assert(compact->utf8 == NULL); 346 } 347 else { 348 assert(kind == PyUnicode_1BYTE_KIND 349 || kind == PyUnicode_2BYTE_KIND 350 || kind == PyUnicode_4BYTE_KIND); 351 assert(ascii->state.compact == 0); 352 assert(ascii->state.ready == 1); 353 assert(data != NULL); 354 if (ascii->state.ascii) { 355 assert (compact->utf8 == data); 356 assert (compact->utf8_length == ascii->length); 357 } 358 else 359 assert (compact->utf8 != data); 360 } 361 } 362 if (kind != PyUnicode_WCHAR_KIND) { 363 if ( 364#if SIZEOF_WCHAR_T == 2 365 kind == PyUnicode_2BYTE_KIND 366#else 367 kind == PyUnicode_4BYTE_KIND 368#endif 369 ) 370 { 371 assert(ascii->wstr == data); 372 assert(compact->wstr_length == ascii->length); 373 } else 374 
assert(ascii->wstr != data); 375 } 376 377 if (compact->utf8 == NULL) 378 assert(compact->utf8_length == 0); 379 if (ascii->wstr == NULL) 380 assert(compact->wstr_length == 0); 381 } 382 /* check that the best kind is used */ 383 if (check_content && kind != PyUnicode_WCHAR_KIND) 384 { 385 Py_ssize_t i; 386 Py_UCS4 maxchar = 0; 387 void *data = PyUnicode_DATA(ascii); 388 for (i=0; i < ascii->length; i++) 389 { 390 Py_UCS4 ch = PyUnicode_READ(kind, data, i); 391 if (ch > maxchar) 392 maxchar = ch; 393 } 394 if (kind == PyUnicode_1BYTE_KIND) { 395 if (ascii->state.ascii == 0) 396 assert(maxchar >= 128); 397 else 398 assert(maxchar < 128); 399 } 400 else if (kind == PyUnicode_2BYTE_KIND) 401 assert(maxchar >= 0x100); 402 else 403 assert(maxchar >= 0x10000); 404 } 405 if (check_content && !unicode_is_singleton(op)) 406 assert(ascii->hash == -1); 407 return 1; 408} 409#endif 410 411#ifdef HAVE_MBCS 412static OSVERSIONINFOEX winver; 413#endif 414 415/* --- Bloom Filters ----------------------------------------------------- */ 416 417/* stuff to implement simple "bloom filters" for Unicode characters. 418 to keep things simple, we use a single bitmask, using the least 5 419 bits from each unicode characters as the bit index. */ 420 421/* the linebreak mask is set up by Unicode_Init below */ 422 423#if LONG_BIT >= 128 424#define BLOOM_WIDTH 128 425#elif LONG_BIT >= 64 426#define BLOOM_WIDTH 64 427#elif LONG_BIT >= 32 428#define BLOOM_WIDTH 32 429#else 430#error "LONG_BIT is smaller than 32" 431#endif 432 433#define BLOOM_MASK unsigned long 434 435static BLOOM_MASK bloom_linebreak; 436 437#define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1))))) 438#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1))))) 439 440#define BLOOM_LINEBREAK(ch) \ 441 ((ch) < 128U ? 
ascii_linebreak[(ch)] : \ 442 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch))) 443 444Py_LOCAL_INLINE(BLOOM_MASK) 445make_bloom_mask(int kind, void* ptr, Py_ssize_t len) 446{ 447 /* calculate simple bloom-style bitmask for a given unicode string */ 448 449 BLOOM_MASK mask; 450 Py_ssize_t i; 451 452 mask = 0; 453 for (i = 0; i < len; i++) 454 BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i)); 455 456 return mask; 457} 458 459#define BLOOM_MEMBER(mask, chr, str) \ 460 (BLOOM(mask, chr) \ 461 && (PyUnicode_FindChar(str, chr, 0, PyUnicode_GET_LENGTH(str), 1) >= 0)) 462 463/* Compilation of templated routines */ 464 465#include "stringlib/asciilib.h" 466#include "stringlib/fastsearch.h" 467#include "stringlib/partition.h" 468#include "stringlib/split.h" 469#include "stringlib/count.h" 470#include "stringlib/find.h" 471#include "stringlib/find_max_char.h" 472#include "stringlib/localeutil.h" 473#include "stringlib/undef.h" 474 475#include "stringlib/ucs1lib.h" 476#include "stringlib/fastsearch.h" 477#include "stringlib/partition.h" 478#include "stringlib/split.h" 479#include "stringlib/count.h" 480#include "stringlib/find.h" 481#include "stringlib/find_max_char.h" 482#include "stringlib/localeutil.h" 483#include "stringlib/undef.h" 484 485#include "stringlib/ucs2lib.h" 486#include "stringlib/fastsearch.h" 487#include "stringlib/partition.h" 488#include "stringlib/split.h" 489#include "stringlib/count.h" 490#include "stringlib/find.h" 491#include "stringlib/find_max_char.h" 492#include "stringlib/localeutil.h" 493#include "stringlib/undef.h" 494 495#include "stringlib/ucs4lib.h" 496#include "stringlib/fastsearch.h" 497#include "stringlib/partition.h" 498#include "stringlib/split.h" 499#include "stringlib/count.h" 500#include "stringlib/find.h" 501#include "stringlib/find_max_char.h" 502#include "stringlib/localeutil.h" 503#include "stringlib/undef.h" 504 505#include "stringlib/unicodedefs.h" 506#include "stringlib/fastsearch.h" 507#include "stringlib/count.h" 
508#include "stringlib/find.h" 509 510/* --- Unicode Object ----------------------------------------------------- */ 511 512static PyObject * 513fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s)); 514 515Py_LOCAL_INLINE(Py_ssize_t) findchar(void *s, int kind, 516 Py_ssize_t size, Py_UCS4 ch, 517 int direction) 518{ 519 int mode = (direction == 1) ? FAST_SEARCH : FAST_RSEARCH; 520 521 switch (kind) { 522 case PyUnicode_1BYTE_KIND: 523 { 524 Py_UCS1 ch1 = (Py_UCS1) ch; 525 if (ch1 == ch) 526 return ucs1lib_fastsearch((Py_UCS1 *) s, size, &ch1, 1, 0, mode); 527 else 528 return -1; 529 } 530 case PyUnicode_2BYTE_KIND: 531 { 532 Py_UCS2 ch2 = (Py_UCS2) ch; 533 if (ch2 == ch) 534 return ucs2lib_fastsearch((Py_UCS2 *) s, size, &ch2, 1, 0, mode); 535 else 536 return -1; 537 } 538 case PyUnicode_4BYTE_KIND: 539 return ucs4lib_fastsearch((Py_UCS4 *) s, size, &ch, 1, 0, mode); 540 default: 541 assert(0); 542 return -1; 543 } 544} 545 546static PyObject* 547resize_compact(PyObject *unicode, Py_ssize_t length) 548{ 549 Py_ssize_t char_size; 550 Py_ssize_t struct_size; 551 Py_ssize_t new_size; 552 int share_wstr; 553 554 assert(PyUnicode_IS_READY(unicode)); 555 char_size = PyUnicode_KIND(unicode); 556 if (PyUnicode_IS_COMPACT_ASCII(unicode)) 557 struct_size = sizeof(PyASCIIObject); 558 else 559 struct_size = sizeof(PyCompactUnicodeObject); 560 share_wstr = _PyUnicode_SHARE_WSTR(unicode); 561 562 _Py_DEC_REFTOTAL; 563 _Py_ForgetReference(unicode); 564 565 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) { 566 PyErr_NoMemory(); 567 return NULL; 568 } 569 new_size = (struct_size + (length + 1) * char_size); 570 571 unicode = (PyObject *)PyObject_REALLOC((char *)unicode, new_size); 572 if (unicode == NULL) { 573 PyObject_Del(unicode); 574 PyErr_NoMemory(); 575 return NULL; 576 } 577 _Py_NewReference(unicode); 578 _PyUnicode_LENGTH(unicode) = length; 579 if (share_wstr) { 580 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode); 581 if 
(!PyUnicode_IS_COMPACT_ASCII(unicode)) 582 _PyUnicode_WSTR_LENGTH(unicode) = length; 583 } 584 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode), 585 length, 0); 586 return unicode; 587} 588 589static int 590resize_inplace(PyObject *unicode, Py_ssize_t length) 591{ 592 wchar_t *wstr; 593 assert(!PyUnicode_IS_COMPACT(unicode)); 594 assert(Py_REFCNT(unicode) == 1); 595 596 _PyUnicode_DIRTY(unicode); 597 598 if (PyUnicode_IS_READY(unicode)) { 599 Py_ssize_t char_size; 600 Py_ssize_t new_size; 601 int share_wstr, share_utf8; 602 void *data; 603 604 data = _PyUnicode_DATA_ANY(unicode); 605 assert(data != NULL); 606 char_size = PyUnicode_KIND(unicode); 607 share_wstr = _PyUnicode_SHARE_WSTR(unicode); 608 share_utf8 = _PyUnicode_SHARE_UTF8(unicode); 609 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode)) 610 { 611 PyObject_DEL(_PyUnicode_UTF8(unicode)); 612 _PyUnicode_UTF8(unicode) = NULL; 613 _PyUnicode_UTF8_LENGTH(unicode) = 0; 614 } 615 616 if (length > (PY_SSIZE_T_MAX / char_size - 1)) { 617 PyErr_NoMemory(); 618 return -1; 619 } 620 new_size = (length + 1) * char_size; 621 622 data = (PyObject *)PyObject_REALLOC(data, new_size); 623 if (data == NULL) { 624 PyErr_NoMemory(); 625 return -1; 626 } 627 _PyUnicode_DATA_ANY(unicode) = data; 628 if (share_wstr) { 629 _PyUnicode_WSTR(unicode) = data; 630 _PyUnicode_WSTR_LENGTH(unicode) = length; 631 } 632 if (share_utf8) { 633 _PyUnicode_UTF8(unicode) = data; 634 _PyUnicode_UTF8_LENGTH(unicode) = length; 635 } 636 _PyUnicode_LENGTH(unicode) = length; 637 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0); 638 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) { 639 assert(_PyUnicode_CheckConsistency(unicode, 0)); 640 return 0; 641 } 642 } 643 assert(_PyUnicode_WSTR(unicode) != NULL); 644 645 /* check for integer overflow */ 646 if (length > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) { 647 PyErr_NoMemory(); 648 return -1; 649 } 650 wstr = _PyUnicode_WSTR(unicode); 651 wstr = PyObject_REALLOC(wstr, 
sizeof(wchar_t) * (length + 1)); 652 if (!wstr) { 653 PyErr_NoMemory(); 654 return -1; 655 } 656 _PyUnicode_WSTR(unicode) = wstr; 657 _PyUnicode_WSTR(unicode)[length] = 0; 658 _PyUnicode_WSTR_LENGTH(unicode) = length; 659 assert(_PyUnicode_CheckConsistency(unicode, 0)); 660 return 0; 661} 662 663static PyObject* 664resize_copy(PyObject *unicode, Py_ssize_t length) 665{ 666 Py_ssize_t copy_length; 667 if (PyUnicode_IS_COMPACT(unicode)) { 668 PyObject *copy; 669 assert(PyUnicode_IS_READY(unicode)); 670 671 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode)); 672 if (copy == NULL) 673 return NULL; 674 675 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode)); 676 copy_characters(copy, 0, unicode, 0, copy_length); 677 return copy; 678 } 679 else { 680 PyObject *w; 681 assert(_PyUnicode_WSTR(unicode) != NULL); 682 assert(_PyUnicode_DATA_ANY(unicode) == NULL); 683 w = (PyObject*)_PyUnicode_New(length); 684 if (w == NULL) 685 return NULL; 686 copy_length = _PyUnicode_WSTR_LENGTH(unicode); 687 copy_length = Py_MIN(copy_length, length); 688 Py_UNICODE_COPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode), 689 copy_length); 690 return w; 691 } 692} 693 694/* We allocate one more byte to make sure the string is 695 Ux0000 terminated; some code (e.g. new_identifier) 696 relies on that. 697 698 XXX This allocator could further be enhanced by assuring that the 699 free list never reduces its size below 1. 700 701*/ 702 703#ifdef Py_DEBUG 704static int unicode_old_new_calls = 0; 705#endif 706 707static PyUnicodeObject * 708_PyUnicode_New(Py_ssize_t length) 709{ 710 register PyUnicodeObject *unicode; 711 size_t new_size; 712 713 /* Optimization for empty strings */ 714 if (length == 0 && unicode_empty != NULL) { 715 Py_INCREF(unicode_empty); 716 return (PyUnicodeObject*)unicode_empty; 717 } 718 719 /* Ensure we won't overflow the size. 
*/ 720 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) { 721 return (PyUnicodeObject *)PyErr_NoMemory(); 722 } 723 if (length < 0) { 724 PyErr_SetString(PyExc_SystemError, 725 "Negative size passed to _PyUnicode_New"); 726 return NULL; 727 } 728 729#ifdef Py_DEBUG 730 ++unicode_old_new_calls; 731#endif 732 733 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type); 734 if (unicode == NULL) 735 return NULL; 736 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1); 737 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size); 738 if (!_PyUnicode_WSTR(unicode)) { 739 PyErr_NoMemory(); 740 goto onError; 741 } 742 743 /* Initialize the first element to guard against cases where 744 * the caller fails before initializing str -- unicode_resize() 745 * reads str[0], and the Keep-Alive optimization can keep memory 746 * allocated for str alive across a call to unicode_dealloc(unicode). 747 * We don't want unicode_resize to read uninitialized memory in 748 * that case. 749 */ 750 _PyUnicode_WSTR(unicode)[0] = 0; 751 _PyUnicode_WSTR(unicode)[length] = 0; 752 _PyUnicode_WSTR_LENGTH(unicode) = length; 753 _PyUnicode_HASH(unicode) = -1; 754 _PyUnicode_STATE(unicode).interned = 0; 755 _PyUnicode_STATE(unicode).kind = 0; 756 _PyUnicode_STATE(unicode).compact = 0; 757 _PyUnicode_STATE(unicode).ready = 0; 758 _PyUnicode_STATE(unicode).ascii = 0; 759 _PyUnicode_DATA_ANY(unicode) = NULL; 760 _PyUnicode_LENGTH(unicode) = 0; 761 _PyUnicode_UTF8(unicode) = NULL; 762 _PyUnicode_UTF8_LENGTH(unicode) = 0; 763 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0)); 764 return unicode; 765 766 onError: 767 /* XXX UNREF/NEWREF interface should be more symmetrical */ 768 _Py_DEC_REFTOTAL; 769 _Py_ForgetReference((PyObject *)unicode); 770 PyObject_Del(unicode); 771 return NULL; 772} 773 774static const char* 775unicode_kind_name(PyObject *unicode) 776{ 777 /* don't check consistency: unicode_kind_name() is called from 778 _PyUnicode_Dump() */ 779 if 
(!PyUnicode_IS_COMPACT(unicode)) 780 { 781 if (!PyUnicode_IS_READY(unicode)) 782 return "wstr"; 783 switch(PyUnicode_KIND(unicode)) 784 { 785 case PyUnicode_1BYTE_KIND: 786 if (PyUnicode_IS_ASCII(unicode)) 787 return "legacy ascii"; 788 else 789 return "legacy latin1"; 790 case PyUnicode_2BYTE_KIND: 791 return "legacy UCS2"; 792 case PyUnicode_4BYTE_KIND: 793 return "legacy UCS4"; 794 default: 795 return "<legacy invalid kind>"; 796 } 797 } 798 assert(PyUnicode_IS_READY(unicode)); 799 switch(PyUnicode_KIND(unicode)) 800 { 801 case PyUnicode_1BYTE_KIND: 802 if (PyUnicode_IS_ASCII(unicode)) 803 return "ascii"; 804 else 805 return "latin1"; 806 case PyUnicode_2BYTE_KIND: 807 return "UCS2"; 808 case PyUnicode_4BYTE_KIND: 809 return "UCS4"; 810 default: 811 return "<invalid compact kind>"; 812 } 813} 814 815#ifdef Py_DEBUG 816static int unicode_new_new_calls = 0; 817 818/* Functions wrapping macros for use in debugger */ 819char *_PyUnicode_utf8(void *unicode){ 820 return PyUnicode_UTF8(unicode); 821} 822 823void *_PyUnicode_compact_data(void *unicode) { 824 return _PyUnicode_COMPACT_DATA(unicode); 825} 826void *_PyUnicode_data(void *unicode){ 827 printf("obj %p\n", unicode); 828 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode)); 829 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode)); 830 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1))); 831 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1))); 832 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode)); 833 return PyUnicode_DATA(unicode); 834} 835 836void 837_PyUnicode_Dump(PyObject *op) 838{ 839 PyASCIIObject *ascii = (PyASCIIObject *)op; 840 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op; 841 PyUnicodeObject *unicode = (PyUnicodeObject *)op; 842 void *data; 843 844 if (ascii->state.compact) 845 { 846 if (ascii->state.ascii) 847 data = (ascii + 1); 848 else 849 data = (compact + 1); 850 } 851 else 852 data = unicode->data.any; 853 
printf("%s: len=%zu, ",unicode_kind_name(op), ascii->length); 854 855 if (ascii->wstr == data) 856 printf("shared "); 857 printf("wstr=%p", ascii->wstr); 858 859 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) { 860 printf(" (%zu), ", compact->wstr_length); 861 if (!ascii->state.compact && compact->utf8 == unicode->data.any) 862 printf("shared "); 863 printf("utf8=%p (%zu)", compact->utf8, compact->utf8_length); 864 } 865 printf(", data=%p\n", data); 866} 867#endif 868 869PyObject * 870PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar) 871{ 872 PyObject *obj; 873 PyCompactUnicodeObject *unicode; 874 void *data; 875 int kind_state; 876 int is_sharing, is_ascii; 877 Py_ssize_t char_size; 878 Py_ssize_t struct_size; 879 880 /* Optimization for empty strings */ 881 if (size == 0 && unicode_empty != NULL) { 882 Py_INCREF(unicode_empty); 883 return unicode_empty; 884 } 885 886#ifdef Py_DEBUG 887 ++unicode_new_new_calls; 888#endif 889 890 is_ascii = 0; 891 is_sharing = 0; 892 struct_size = sizeof(PyCompactUnicodeObject); 893 if (maxchar < 128) { 894 kind_state = PyUnicode_1BYTE_KIND; 895 char_size = 1; 896 is_ascii = 1; 897 struct_size = sizeof(PyASCIIObject); 898 } 899 else if (maxchar < 256) { 900 kind_state = PyUnicode_1BYTE_KIND; 901 char_size = 1; 902 } 903 else if (maxchar < 65536) { 904 kind_state = PyUnicode_2BYTE_KIND; 905 char_size = 2; 906 if (sizeof(wchar_t) == 2) 907 is_sharing = 1; 908 } 909 else { 910 kind_state = PyUnicode_4BYTE_KIND; 911 char_size = 4; 912 if (sizeof(wchar_t) == 4) 913 is_sharing = 1; 914 } 915 916 /* Ensure we won't overflow the size. 
*/ 917 if (size < 0) { 918 PyErr_SetString(PyExc_SystemError, 919 "Negative size passed to PyUnicode_New"); 920 return NULL; 921 } 922 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) 923 return PyErr_NoMemory(); 924 925 /* Duplicated allocation code from _PyObject_New() instead of a call to 926 * PyObject_New() so we are able to allocate space for the object and 927 * it's data buffer. 928 */ 929 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size); 930 if (obj == NULL) 931 return PyErr_NoMemory(); 932 obj = PyObject_INIT(obj, &PyUnicode_Type); 933 if (obj == NULL) 934 return NULL; 935 936 unicode = (PyCompactUnicodeObject *)obj; 937 if (is_ascii) 938 data = ((PyASCIIObject*)obj) + 1; 939 else 940 data = unicode + 1; 941 _PyUnicode_LENGTH(unicode) = size; 942 _PyUnicode_HASH(unicode) = -1; 943 _PyUnicode_STATE(unicode).interned = 0; 944 _PyUnicode_STATE(unicode).kind = kind_state; 945 _PyUnicode_STATE(unicode).compact = 1; 946 _PyUnicode_STATE(unicode).ready = 1; 947 _PyUnicode_STATE(unicode).ascii = is_ascii; 948 if (is_ascii) { 949 ((char*)data)[size] = 0; 950 _PyUnicode_WSTR(unicode) = NULL; 951 } 952 else if (kind_state == PyUnicode_1BYTE_KIND) { 953 ((char*)data)[size] = 0; 954 _PyUnicode_WSTR(unicode) = NULL; 955 _PyUnicode_WSTR_LENGTH(unicode) = 0; 956 unicode->utf8 = NULL; 957 unicode->utf8_length = 0; 958 } 959 else { 960 unicode->utf8 = NULL; 961 unicode->utf8_length = 0; 962 if (kind_state == PyUnicode_2BYTE_KIND) 963 ((Py_UCS2*)data)[size] = 0; 964 else /* kind_state == PyUnicode_4BYTE_KIND */ 965 ((Py_UCS4*)data)[size] = 0; 966 if (is_sharing) { 967 _PyUnicode_WSTR_LENGTH(unicode) = size; 968 _PyUnicode_WSTR(unicode) = (wchar_t *)data; 969 } 970 else { 971 _PyUnicode_WSTR_LENGTH(unicode) = 0; 972 _PyUnicode_WSTR(unicode) = NULL; 973 } 974 } 975 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0)); 976 return obj; 977} 978 979#if SIZEOF_WCHAR_T == 2 980/* Helper function to convert a 16-bits wchar_t representation 
to UCS4, this 981 will decode surrogate pairs, the other conversions are implemented as macros 982 for efficiency. 983 984 This function assumes that unicode can hold one more code point than wstr 985 characters for a terminating null character. */ 986static void 987unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end, 988 PyObject *unicode) 989{ 990 const wchar_t *iter; 991 Py_UCS4 *ucs4_out; 992 993 assert(unicode != NULL); 994 assert(_PyUnicode_CHECK(unicode)); 995 assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND); 996 ucs4_out = PyUnicode_4BYTE_DATA(unicode); 997 998 for (iter = begin; iter < end; ) { 999 assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) + 1000 _PyUnicode_GET_LENGTH(unicode))); 1001 if (*iter >= 0xD800 && *iter <= 0xDBFF 1002 && (iter+1) < end && iter[1] >= 0xDC00 && iter[1] <= 0xDFFF) 1003 { 1004 *ucs4_out++ = (((iter[0] & 0x3FF)<<10) | (iter[1] & 0x3FF)) + 0x10000; 1005 iter += 2; 1006 } 1007 else { 1008 *ucs4_out++ = *iter; 1009 iter++; 1010 } 1011 } 1012 assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) + 1013 _PyUnicode_GET_LENGTH(unicode))); 1014 1015} 1016#endif 1017 1018static int 1019_PyUnicode_Dirty(PyObject *unicode) 1020{ 1021 assert(_PyUnicode_CHECK(unicode)); 1022 if (Py_REFCNT(unicode) != 1) { 1023 PyErr_SetString(PyExc_SystemError, 1024 "Cannot modify a string having more than 1 reference"); 1025 return -1; 1026 } 1027 _PyUnicode_DIRTY(unicode); 1028 return 0; 1029} 1030 1031static int 1032_copy_characters(PyObject *to, Py_ssize_t to_start, 1033 PyObject *from, Py_ssize_t from_start, 1034 Py_ssize_t how_many, int check_maxchar) 1035{ 1036 unsigned int from_kind, to_kind; 1037 void *from_data, *to_data; 1038 int fast; 1039 1040 assert(PyUnicode_Check(from)); 1041 assert(PyUnicode_Check(to)); 1042 assert(PyUnicode_IS_READY(from)); 1043 assert(PyUnicode_IS_READY(to)); 1044 1045 assert(PyUnicode_GET_LENGTH(from) >= how_many); 1046 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to)); 1047 assert(0 <= 
how_many); 1048 1049 if (how_many == 0) 1050 return 0; 1051 1052 from_kind = PyUnicode_KIND(from); 1053 from_data = PyUnicode_DATA(from); 1054 to_kind = PyUnicode_KIND(to); 1055 to_data = PyUnicode_DATA(to); 1056 1057#ifdef Py_DEBUG 1058 if (!check_maxchar 1059 && (from_kind > to_kind 1060 || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))) 1061 { 1062 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to); 1063 Py_UCS4 ch; 1064 Py_ssize_t i; 1065 for (i=0; i < how_many; i++) { 1066 ch = PyUnicode_READ(from_kind, from_data, from_start + i); 1067 assert(ch <= to_maxchar); 1068 } 1069 } 1070#endif 1071 fast = (from_kind == to_kind); 1072 if (check_maxchar 1073 && (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))) 1074 { 1075 /* deny latin1 => ascii */ 1076 fast = 0; 1077 } 1078 1079 if (fast) { 1080 Py_MEMCPY((char*)to_data + to_kind * to_start, 1081 (char*)from_data + from_kind * from_start, 1082 to_kind * how_many); 1083 } 1084 else if (from_kind == PyUnicode_1BYTE_KIND 1085 && to_kind == PyUnicode_2BYTE_KIND) 1086 { 1087 _PyUnicode_CONVERT_BYTES( 1088 Py_UCS1, Py_UCS2, 1089 PyUnicode_1BYTE_DATA(from) + from_start, 1090 PyUnicode_1BYTE_DATA(from) + from_start + how_many, 1091 PyUnicode_2BYTE_DATA(to) + to_start 1092 ); 1093 } 1094 else if (from_kind == PyUnicode_1BYTE_KIND 1095 && to_kind == PyUnicode_4BYTE_KIND) 1096 { 1097 _PyUnicode_CONVERT_BYTES( 1098 Py_UCS1, Py_UCS4, 1099 PyUnicode_1BYTE_DATA(from) + from_start, 1100 PyUnicode_1BYTE_DATA(from) + from_start + how_many, 1101 PyUnicode_4BYTE_DATA(to) + to_start 1102 ); 1103 } 1104 else if (from_kind == PyUnicode_2BYTE_KIND 1105 && to_kind == PyUnicode_4BYTE_KIND) 1106 { 1107 _PyUnicode_CONVERT_BYTES( 1108 Py_UCS2, Py_UCS4, 1109 PyUnicode_2BYTE_DATA(from) + from_start, 1110 PyUnicode_2BYTE_DATA(from) + from_start + how_many, 1111 PyUnicode_4BYTE_DATA(to) + to_start 1112 ); 1113 } 1114 else { 1115 /* check if max_char(from substring) <= max_char(to) */ 1116 if (from_kind > to_kind 1117 /* latin1 => 
ascii */ 1118 || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))) 1119 { 1120 /* slow path to check for character overflow */ 1121 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to); 1122 Py_UCS4 ch; 1123 Py_ssize_t i; 1124 1125#ifdef Py_DEBUG 1126 for (i=0; i < how_many; i++) { 1127 ch = PyUnicode_READ(from_kind, from_data, from_start + i); 1128 assert(ch <= to_maxchar); 1129 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch); 1130 } 1131#else 1132 if (!check_maxchar) { 1133 for (i=0; i < how_many; i++) { 1134 ch = PyUnicode_READ(from_kind, from_data, from_start + i); 1135 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch); 1136 } 1137 } 1138 else { 1139 for (i=0; i < how_many; i++) { 1140 ch = PyUnicode_READ(from_kind, from_data, from_start + i); 1141 if (ch > to_maxchar) 1142 return 1; 1143 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch); 1144 } 1145 } 1146#endif 1147 } 1148 else { 1149 assert(0 && "inconsistent state"); 1150 return 1; 1151 } 1152 } 1153 return 0; 1154} 1155 1156static void 1157copy_characters(PyObject *to, Py_ssize_t to_start, 1158 PyObject *from, Py_ssize_t from_start, 1159 Py_ssize_t how_many) 1160{ 1161 (void)_copy_characters(to, to_start, from, from_start, how_many, 0); 1162} 1163 1164Py_ssize_t 1165PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start, 1166 PyObject *from, Py_ssize_t from_start, 1167 Py_ssize_t how_many) 1168{ 1169 int err; 1170 1171 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) { 1172 PyErr_BadInternalCall(); 1173 return -1; 1174 } 1175 1176 if (PyUnicode_READY(from)) 1177 return -1; 1178 if (PyUnicode_READY(to)) 1179 return -1; 1180 1181 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many); 1182 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) { 1183 PyErr_Format(PyExc_SystemError, 1184 "Cannot write %zi characters at %zi " 1185 "in a string of %zi characters", 1186 how_many, to_start, PyUnicode_GET_LENGTH(to)); 1187 return -1; 1188 } 1189 1190 if (how_many == 0) 1191 return 0; 1192 1193 
if (_PyUnicode_Dirty(to)) 1194 return -1; 1195 1196 err = _copy_characters(to, to_start, from, from_start, how_many, 1); 1197 if (err) { 1198 PyErr_Format(PyExc_SystemError, 1199 "Cannot copy %s characters " 1200 "into a string of %s characters", 1201 unicode_kind_name(from), 1202 unicode_kind_name(to)); 1203 return -1; 1204 } 1205 return how_many; 1206} 1207 1208/* Find the maximum code point and count the number of surrogate pairs so a 1209 correct string length can be computed before converting a string to UCS4. 1210 This function counts single surrogates as a character and not as a pair. 1211 1212 Return 0 on success, or -1 on error. */ 1213static int 1214find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end, 1215 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates) 1216{ 1217 const wchar_t *iter; 1218 1219 assert(num_surrogates != NULL && maxchar != NULL); 1220 *num_surrogates = 0; 1221 *maxchar = 0; 1222 1223 for (iter = begin; iter < end; ) { 1224 if (*iter > *maxchar) { 1225 *maxchar = *iter; 1226#if SIZEOF_WCHAR_T != 2 1227 if (*maxchar >= 0x10000) 1228 return 0; 1229#endif 1230 } 1231#if SIZEOF_WCHAR_T == 2 1232 if (*iter >= 0xD800 && *iter <= 0xDBFF 1233 && (iter+1) < end && iter[1] >= 0xDC00 && iter[1] <= 0xDFFF) 1234 { 1235 Py_UCS4 surrogate_val; 1236 surrogate_val = (((iter[0] & 0x3FF)<<10) 1237 | (iter[1] & 0x3FF)) + 0x10000; 1238 ++(*num_surrogates); 1239 if (surrogate_val > *maxchar) 1240 *maxchar = surrogate_val; 1241 iter += 2; 1242 } 1243 else 1244 iter++; 1245#else 1246 iter++; 1247#endif 1248 } 1249 return 0; 1250} 1251 1252#ifdef Py_DEBUG 1253static int unicode_ready_calls = 0; 1254#endif 1255 1256static int 1257unicode_ready(PyObject **p_obj, int replace) 1258{ 1259 PyObject *unicode; 1260 wchar_t *end; 1261 Py_UCS4 maxchar = 0; 1262 Py_ssize_t num_surrogates; 1263#if SIZEOF_WCHAR_T == 2 1264 Py_ssize_t length_wo_surrogates; 1265#endif 1266 1267 assert(p_obj != NULL); 1268 unicode = *p_obj; 1269 1270 /* _PyUnicode_Ready() is only 
intended for old-style API usage where 1271 strings were created using _PyObject_New() and where no canonical 1272 representation (the str field) has been set yet aka strings 1273 which are not yet ready. */ 1274 assert(_PyUnicode_CHECK(unicode)); 1275 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND); 1276 assert(_PyUnicode_WSTR(unicode) != NULL); 1277 assert(_PyUnicode_DATA_ANY(unicode) == NULL); 1278 assert(_PyUnicode_UTF8(unicode) == NULL); 1279 /* Actually, it should neither be interned nor be anything else: */ 1280 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED); 1281 1282#ifdef Py_DEBUG 1283 ++unicode_ready_calls; 1284#endif 1285 1286#ifdef Py_DEBUG 1287 assert(!replace || Py_REFCNT(unicode) == 1); 1288#else 1289 if (replace && Py_REFCNT(unicode) != 1) 1290 replace = 0; 1291#endif 1292 if (replace) { 1293 Py_ssize_t len = _PyUnicode_WSTR_LENGTH(unicode); 1294 wchar_t *wstr = _PyUnicode_WSTR(unicode); 1295 /* Optimization for empty strings */ 1296 if (len == 0) { 1297 Py_INCREF(unicode_empty); 1298 Py_DECREF(*p_obj); 1299 *p_obj = unicode_empty; 1300 return 0; 1301 } 1302 if (len == 1 && wstr[0] < 256) { 1303 PyObject *latin1_char = get_latin1_char((unsigned char)wstr[0]); 1304 if (latin1_char == NULL) 1305 return -1; 1306 Py_DECREF(*p_obj); 1307 *p_obj = latin1_char; 1308 return 0; 1309 } 1310 } 1311 1312 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode); 1313 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end, 1314 &maxchar, &num_surrogates) == -1) 1315 return -1; 1316 1317 if (maxchar < 256) { 1318 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1); 1319 if (!_PyUnicode_DATA_ANY(unicode)) { 1320 PyErr_NoMemory(); 1321 return -1; 1322 } 1323 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char, 1324 _PyUnicode_WSTR(unicode), end, 1325 PyUnicode_1BYTE_DATA(unicode)); 1326 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0'; 1327 _PyUnicode_LENGTH(unicode) = 
_PyUnicode_WSTR_LENGTH(unicode); 1328 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND; 1329 if (maxchar < 128) { 1330 _PyUnicode_STATE(unicode).ascii = 1; 1331 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode); 1332 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode); 1333 } 1334 else { 1335 _PyUnicode_STATE(unicode).ascii = 0; 1336 _PyUnicode_UTF8(unicode) = NULL; 1337 _PyUnicode_UTF8_LENGTH(unicode) = 0; 1338 } 1339 PyObject_FREE(_PyUnicode_WSTR(unicode)); 1340 _PyUnicode_WSTR(unicode) = NULL; 1341 _PyUnicode_WSTR_LENGTH(unicode) = 0; 1342 } 1343 /* In this case we might have to convert down from 4-byte native 1344 wchar_t to 2-byte unicode. */ 1345 else if (maxchar < 65536) { 1346 assert(num_surrogates == 0 && 1347 "FindMaxCharAndNumSurrogatePairs() messed up"); 1348 1349#if SIZEOF_WCHAR_T == 2 1350 /* We can share representations and are done. */ 1351 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode); 1352 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0'; 1353 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode); 1354 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND; 1355 _PyUnicode_UTF8(unicode) = NULL; 1356 _PyUnicode_UTF8_LENGTH(unicode) = 0; 1357#else 1358 /* sizeof(wchar_t) == 4 */ 1359 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC( 1360 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1)); 1361 if (!_PyUnicode_DATA_ANY(unicode)) { 1362 PyErr_NoMemory(); 1363 return -1; 1364 } 1365 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2, 1366 _PyUnicode_WSTR(unicode), end, 1367 PyUnicode_2BYTE_DATA(unicode)); 1368 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0'; 1369 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode); 1370 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND; 1371 _PyUnicode_UTF8(unicode) = NULL; 1372 _PyUnicode_UTF8_LENGTH(unicode) = 0; 1373 PyObject_FREE(_PyUnicode_WSTR(unicode)); 1374 _PyUnicode_WSTR(unicode) = NULL; 1375 _PyUnicode_WSTR_LENGTH(unicode) = 0; 
1376#endif 1377 } 1378 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */ 1379 else { 1380#if SIZEOF_WCHAR_T == 2 1381 /* in case the native representation is 2-bytes, we need to allocate a 1382 new normalized 4-byte version. */ 1383 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates; 1384 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1)); 1385 if (!_PyUnicode_DATA_ANY(unicode)) { 1386 PyErr_NoMemory(); 1387 return -1; 1388 } 1389 _PyUnicode_LENGTH(unicode) = length_wo_surrogates; 1390 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND; 1391 _PyUnicode_UTF8(unicode) = NULL; 1392 _PyUnicode_UTF8_LENGTH(unicode) = 0; 1393 /* unicode_convert_wchar_to_ucs4() requires a ready string */ 1394 _PyUnicode_STATE(unicode).ready = 1; 1395 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode); 1396 PyObject_FREE(_PyUnicode_WSTR(unicode)); 1397 _PyUnicode_WSTR(unicode) = NULL; 1398 _PyUnicode_WSTR_LENGTH(unicode) = 0; 1399#else 1400 assert(num_surrogates == 0); 1401 1402 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode); 1403 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode); 1404 _PyUnicode_UTF8(unicode) = NULL; 1405 _PyUnicode_UTF8_LENGTH(unicode) = 0; 1406 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND; 1407#endif 1408 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0'; 1409 } 1410 _PyUnicode_STATE(unicode).ready = 1; 1411 assert(_PyUnicode_CheckConsistency(unicode, 1)); 1412 return 0; 1413} 1414 1415int 1416_PyUnicode_ReadyReplace(PyObject **op) 1417{ 1418 return unicode_ready(op, 1); 1419} 1420 1421int 1422_PyUnicode_Ready(PyObject *op) 1423{ 1424 return unicode_ready(&op, 0); 1425} 1426 1427static void 1428unicode_dealloc(register PyObject *unicode) 1429{ 1430 switch (PyUnicode_CHECK_INTERNED(unicode)) { 1431 case SSTATE_NOT_INTERNED: 1432 break; 1433 1434 case SSTATE_INTERNED_MORTAL: 1435 /* revive dead object temporarily for DelItem */ 1436 
Py_REFCNT(unicode) = 3; 1437 if (PyDict_DelItem(interned, unicode) != 0) 1438 Py_FatalError( 1439 "deletion of interned string failed"); 1440 break; 1441 1442 case SSTATE_INTERNED_IMMORTAL: 1443 Py_FatalError("Immortal interned string died."); 1444 1445 default: 1446 Py_FatalError("Inconsistent interned string state."); 1447 } 1448 1449 if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) 1450 PyObject_DEL(_PyUnicode_WSTR(unicode)); 1451 if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) 1452 PyObject_DEL(_PyUnicode_UTF8(unicode)); 1453 1454 if (PyUnicode_IS_COMPACT(unicode)) { 1455 Py_TYPE(unicode)->tp_free(unicode); 1456 } 1457 else { 1458 if (_PyUnicode_DATA_ANY(unicode)) 1459 PyObject_DEL(_PyUnicode_DATA_ANY(unicode)); 1460 Py_TYPE(unicode)->tp_free(unicode); 1461 } 1462} 1463 1464#ifdef Py_DEBUG 1465static int 1466unicode_is_singleton(PyObject *unicode) 1467{ 1468 PyASCIIObject *ascii = (PyASCIIObject *)unicode; 1469 if (unicode == unicode_empty) 1470 return 1; 1471 if (ascii->state.kind != PyUnicode_WCHAR_KIND && ascii->length == 1) 1472 { 1473 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0); 1474 if (ch < 256 && unicode_latin1[ch] == unicode) 1475 return 1; 1476 } 1477 return 0; 1478} 1479#endif 1480 1481static int 1482unicode_resizable(PyObject *unicode) 1483{ 1484 if (Py_REFCNT(unicode) != 1) 1485 return 0; 1486 if (PyUnicode_CHECK_INTERNED(unicode)) 1487 return 0; 1488#ifdef Py_DEBUG 1489 /* singleton refcount is greater than 1 */ 1490 assert(!unicode_is_singleton(unicode)); 1491#endif 1492 return 1; 1493} 1494 1495static int 1496unicode_resize(PyObject **p_unicode, Py_ssize_t length) 1497{ 1498 PyObject *unicode; 1499 Py_ssize_t old_length; 1500 1501 assert(p_unicode != NULL); 1502 unicode = *p_unicode; 1503 1504 assert(unicode != NULL); 1505 assert(PyUnicode_Check(unicode)); 1506 assert(0 <= length); 1507 1508 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND) 1509 old_length = PyUnicode_WSTR_LENGTH(unicode); 1510 else 1511 old_length = PyUnicode_GET_LENGTH(unicode); 1512 if 
(old_length == length) 1513 return 0; 1514 1515 if (length == 0) { 1516 Py_DECREF(*p_unicode); 1517 *p_unicode = unicode_empty; 1518 Py_INCREF(*p_unicode); 1519 return 0; 1520 } 1521 1522 if (!unicode_resizable(unicode)) { 1523 PyObject *copy = resize_copy(unicode, length); 1524 if (copy == NULL) 1525 return -1; 1526 Py_DECREF(*p_unicode); 1527 *p_unicode = copy; 1528 return 0; 1529 } 1530 1531 if (PyUnicode_IS_COMPACT(unicode)) { 1532 *p_unicode = resize_compact(unicode, length); 1533 if (*p_unicode == NULL) 1534 return -1; 1535 assert(_PyUnicode_CheckConsistency(*p_unicode, 0)); 1536 return 0; 1537 } 1538 return resize_inplace(unicode, length); 1539} 1540 1541int 1542PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length) 1543{ 1544 PyObject *unicode; 1545 if (p_unicode == NULL) { 1546 PyErr_BadInternalCall(); 1547 return -1; 1548 } 1549 unicode = *p_unicode; 1550 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0) 1551 { 1552 PyErr_BadInternalCall(); 1553 return -1; 1554 } 1555 return unicode_resize(p_unicode, length); 1556} 1557 1558static int 1559unicode_widen(PyObject **p_unicode, unsigned int maxchar) 1560{ 1561 PyObject *result; 1562 assert(PyUnicode_IS_READY(*p_unicode)); 1563 if (maxchar <= PyUnicode_MAX_CHAR_VALUE(*p_unicode)) 1564 return 0; 1565 result = PyUnicode_New(PyUnicode_GET_LENGTH(*p_unicode), 1566 maxchar); 1567 if (result == NULL) 1568 return -1; 1569 PyUnicode_CopyCharacters(result, 0, *p_unicode, 0, 1570 PyUnicode_GET_LENGTH(*p_unicode)); 1571 Py_DECREF(*p_unicode); 1572 *p_unicode = result; 1573 return 0; 1574} 1575 1576static int 1577unicode_putchar(PyObject **p_unicode, Py_ssize_t *pos, 1578 Py_UCS4 ch) 1579{ 1580 if (unicode_widen(p_unicode, ch) < 0) 1581 return -1; 1582 PyUnicode_WRITE(PyUnicode_KIND(*p_unicode), 1583 PyUnicode_DATA(*p_unicode), 1584 (*pos)++, ch); 1585 return 0; 1586} 1587 1588static PyObject* 1589get_latin1_char(unsigned char ch) 1590{ 1591 PyObject *unicode = unicode_latin1[ch]; 1592 if (!unicode) { 
/* Create a unicode object from the Py_UNICODE buffer `u` of `size` code
   units.

   When `u` is NULL, the result of _PyUnicode_New(size) is returned instead
   (no data to copy).  The shared empty string and the shared Latin-1
   single-character strings are returned where applicable.  Otherwise the
   buffer is scanned with find_maxchar_surrogates() to pick the storage
   kind, and the data is copied or converted into a new string.

   Return a new reference, or NULL on error. */
PyObject *
PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
{
    PyObject *unicode;
    Py_UCS4 maxchar = 0;
    Py_ssize_t num_surrogates;

    if (u == NULL)
        return (PyObject*)_PyUnicode_New(size);

    /* If the Unicode data is known at construction time, we can apply
       some optimizations which share commonly used objects. */

    /* Optimization for empty strings */
    if (size == 0 && unicode_empty != NULL) {
        Py_INCREF(unicode_empty);
        return unicode_empty;
    }

    /* Single character Unicode objects in the Latin-1 range are
       shared when using this constructor */
    if (size == 1 && *u < 256)
        return get_latin1_char((unsigned char)*u);

    /* If not empty and not single character, copy the Unicode data
       into the new object */
    if (find_maxchar_surrogates(u, u + size,
                                &maxchar, &num_surrogates) == -1)
        return NULL;

    /* Each surrogate pair counted above becomes a single character in the
       result, hence the shorter length. */
    unicode = PyUnicode_New(size - num_surrogates,
                            maxchar);
    if (!unicode)
        return NULL;

    /* The copy strategy depends on the kind PyUnicode_New() selected and
       on the width of the input code units. */
    switch (PyUnicode_KIND(unicode)) {
    case PyUnicode_1BYTE_KIND:
        _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
                                 u, u + size, PyUnicode_1BYTE_DATA(unicode));
        break;
    case PyUnicode_2BYTE_KIND:
#if Py_UNICODE_SIZE == 2
        /* Input and storage have the same width: plain copy. */
        Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
#else
        _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
                                 u, u + size, PyUnicode_2BYTE_DATA(unicode));
#endif
        break;
    case PyUnicode_4BYTE_KIND:
#if SIZEOF_WCHAR_T == 2
        /* This is the only case which has to process surrogates, thus
           a simple copy loop is not enough and we need a function. */
        unicode_convert_wchar_to_ucs4(u, u + size, unicode);
#else
        assert(num_surrogates == 0);
        Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
#endif
        break;
    default:
        assert(0 && "Impossible state");
    }

    assert(_PyUnicode_CheckConsistency(unicode, 1));
    return unicode;
}
*/ 1693 if (size == 1 && (unsigned char)*u < 128) 1694 return get_latin1_char((unsigned char)*u); 1695 1696 return PyUnicode_DecodeUTF8(u, size, NULL); 1697 } 1698 1699 return (PyObject *)_PyUnicode_New(size); 1700} 1701 1702PyObject * 1703PyUnicode_FromString(const char *u) 1704{ 1705 size_t size = strlen(u); 1706 if (size > PY_SSIZE_T_MAX) { 1707 PyErr_SetString(PyExc_OverflowError, "input too long"); 1708 return NULL; 1709 } 1710 1711 return PyUnicode_FromStringAndSize(u, size); 1712} 1713 1714PyObject * 1715_PyUnicode_FromId(_Py_Identifier *id) 1716{ 1717 if (!id->object) { 1718 id->object = PyUnicode_FromString(id->string); 1719 if (!id->object) 1720 return NULL; 1721 PyUnicode_InternInPlace(&id->object); 1722 assert(!id->next); 1723 id->next = static_strings; 1724 static_strings = id; 1725 } 1726 return id->object; 1727} 1728 1729void 1730_PyUnicode_ClearStaticStrings() 1731{ 1732 _Py_Identifier *i; 1733 for (i = static_strings; i; i = i->next) { 1734 Py_DECREF(i->object); 1735 i->object = NULL; 1736 i->next = NULL; 1737 } 1738} 1739 1740static PyObject* 1741unicode_fromascii(const unsigned char* s, Py_ssize_t size) 1742{ 1743 PyObject *res; 1744#ifdef Py_DEBUG 1745 const unsigned char *p; 1746 const unsigned char *end = s + size; 1747 for (p=s; p < end; p++) { 1748 assert(*p < 128); 1749 } 1750#endif 1751 if (size == 1) 1752 return get_latin1_char(s[0]); 1753 res = PyUnicode_New(size, 127); 1754 if (!res) 1755 return NULL; 1756 memcpy(PyUnicode_1BYTE_DATA(res), s, size); 1757 return res; 1758} 1759 1760static Py_UCS4 1761kind_maxchar_limit(unsigned int kind) 1762{ 1763 switch(kind) { 1764 case PyUnicode_1BYTE_KIND: 1765 return 0x80; 1766 case PyUnicode_2BYTE_KIND: 1767 return 0x100; 1768 case PyUnicode_4BYTE_KIND: 1769 return 0x10000; 1770 default: 1771 assert(0 && "invalid kind"); 1772 return 0x10ffff; 1773 } 1774} 1775 1776static PyObject* 1777_PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size) 1778{ 1779 PyObject *res; 1780 unsigned char max_char 
= 127; 1781 1782 assert(size >= 0); 1783 if (size == 1) 1784 return get_latin1_char(u[0]); 1785 max_char = ucs1lib_find_max_char(u, u + size); 1786 res = PyUnicode_New(size, max_char); 1787 if (!res) 1788 return NULL; 1789 memcpy(PyUnicode_1BYTE_DATA(res), u, size); 1790 assert(_PyUnicode_CheckConsistency(res, 1)); 1791 return res; 1792} 1793 1794static PyObject* 1795_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size) 1796{ 1797 PyObject *res; 1798 Py_UCS2 max_char = 0; 1799 1800 assert(size >= 0); 1801 if (size == 1 && u[0] < 256) 1802 return get_latin1_char((unsigned char)u[0]); 1803 max_char = ucs2lib_find_max_char(u, u + size); 1804 res = PyUnicode_New(size, max_char); 1805 if (!res) 1806 return NULL; 1807 if (max_char >= 256) 1808 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size); 1809 else { 1810 _PyUnicode_CONVERT_BYTES( 1811 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res)); 1812 } 1813 assert(_PyUnicode_CheckConsistency(res, 1)); 1814 return res; 1815} 1816 1817static PyObject* 1818_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size) 1819{ 1820 PyObject *res; 1821 Py_UCS4 max_char = 0; 1822 1823 assert(size >= 0); 1824 if (size == 1 && u[0] < 256) 1825 return get_latin1_char(u[0]); 1826 max_char = ucs4lib_find_max_char(u, u + size); 1827 res = PyUnicode_New(size, max_char); 1828 if (!res) 1829 return NULL; 1830 if (max_char < 256) 1831 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size, 1832 PyUnicode_1BYTE_DATA(res)); 1833 else if (max_char < 0x10000) 1834 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size, 1835 PyUnicode_2BYTE_DATA(res)); 1836 else 1837 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size); 1838 assert(_PyUnicode_CheckConsistency(res, 1)); 1839 return res; 1840} 1841 1842PyObject* 1843PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size) 1844{ 1845 switch(kind) { 1846 case PyUnicode_1BYTE_KIND: 1847 return _PyUnicode_FromUCS1(buffer, size); 1848 case PyUnicode_2BYTE_KIND: 1849 return 
_PyUnicode_FromUCS2(buffer, size); 1850 case PyUnicode_4BYTE_KIND: 1851 return _PyUnicode_FromUCS4(buffer, size); 1852 default: 1853 assert(0 && "invalid kind"); 1854 PyErr_SetString(PyExc_SystemError, "invalid kind"); 1855 return NULL; 1856 } 1857} 1858 1859/* Ensure that a string uses the most efficient storage, if it is not the 1860 case: create a new string with of the right kind. Write NULL into *p_unicode 1861 on error. */ 1862static void 1863unicode_adjust_maxchar(PyObject **p_unicode) 1864{ 1865 PyObject *unicode, *copy; 1866 Py_UCS4 max_char; 1867 Py_ssize_t len; 1868 unsigned int kind; 1869 1870 assert(p_unicode != NULL); 1871 unicode = *p_unicode; 1872 assert(PyUnicode_IS_READY(unicode)); 1873 if (PyUnicode_IS_ASCII(unicode)) 1874 return; 1875 1876 len = PyUnicode_GET_LENGTH(unicode); 1877 kind = PyUnicode_KIND(unicode); 1878 if (kind == PyUnicode_1BYTE_KIND) { 1879 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode); 1880 max_char = ucs1lib_find_max_char(u, u + len); 1881 if (max_char >= 128) 1882 return; 1883 } 1884 else if (kind == PyUnicode_2BYTE_KIND) { 1885 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode); 1886 max_char = ucs2lib_find_max_char(u, u + len); 1887 if (max_char >= 256) 1888 return; 1889 } 1890 else { 1891 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode); 1892 assert(kind == PyUnicode_4BYTE_KIND); 1893 max_char = ucs4lib_find_max_char(u, u + len); 1894 if (max_char >= 0x10000) 1895 return; 1896 } 1897 copy = PyUnicode_New(len, max_char); 1898 copy_characters(copy, 0, unicode, 0, len); 1899 Py_DECREF(unicode); 1900 *p_unicode = copy; 1901} 1902 1903PyObject* 1904PyUnicode_Copy(PyObject *unicode) 1905{ 1906 Py_ssize_t size; 1907 PyObject *copy; 1908 void *data; 1909 1910 if (!PyUnicode_Check(unicode)) { 1911 PyErr_BadInternalCall(); 1912 return NULL; 1913 } 1914 if (PyUnicode_READY(unicode)) 1915 return NULL; 1916 1917 size = PyUnicode_GET_LENGTH(unicode); 1918 copy = PyUnicode_New(size, PyUnicode_MAX_CHAR_VALUE(unicode)); 1919 if (!copy) 
1920 return NULL; 1921 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode)); 1922 1923 data = PyUnicode_DATA(unicode); 1924 switch (PyUnicode_KIND(unicode)) 1925 { 1926 case PyUnicode_1BYTE_KIND: 1927 memcpy(PyUnicode_1BYTE_DATA(copy), data, size); 1928 break; 1929 case PyUnicode_2BYTE_KIND: 1930 memcpy(PyUnicode_2BYTE_DATA(copy), data, sizeof(Py_UCS2) * size); 1931 break; 1932 case PyUnicode_4BYTE_KIND: 1933 memcpy(PyUnicode_4BYTE_DATA(copy), data, sizeof(Py_UCS4) * size); 1934 break; 1935 default: 1936 assert(0); 1937 break; 1938 } 1939 assert(_PyUnicode_CheckConsistency(copy, 1)); 1940 return copy; 1941} 1942 1943 1944/* Widen Unicode objects to larger buffers. Don't write terminating null 1945 character. Return NULL on error. */ 1946 1947void* 1948_PyUnicode_AsKind(PyObject *s, unsigned int kind) 1949{ 1950 Py_ssize_t len; 1951 void *result; 1952 unsigned int skind; 1953 1954 if (PyUnicode_READY(s)) 1955 return NULL; 1956 1957 len = PyUnicode_GET_LENGTH(s); 1958 skind = PyUnicode_KIND(s); 1959 if (skind >= kind) { 1960 PyErr_SetString(PyExc_SystemError, "invalid widening attempt"); 1961 return NULL; 1962 } 1963 switch(kind) { 1964 case PyUnicode_2BYTE_KIND: 1965 result = PyMem_Malloc(len * sizeof(Py_UCS2)); 1966 if (!result) 1967 return PyErr_NoMemory(); 1968 assert(skind == PyUnicode_1BYTE_KIND); 1969 _PyUnicode_CONVERT_BYTES( 1970 Py_UCS1, Py_UCS2, 1971 PyUnicode_1BYTE_DATA(s), 1972 PyUnicode_1BYTE_DATA(s) + len, 1973 result); 1974 return result; 1975 case PyUnicode_4BYTE_KIND: 1976 result = PyMem_Malloc(len * sizeof(Py_UCS4)); 1977 if (!result) 1978 return PyErr_NoMemory(); 1979 if (skind == PyUnicode_2BYTE_KIND) { 1980 _PyUnicode_CONVERT_BYTES( 1981 Py_UCS2, Py_UCS4, 1982 PyUnicode_2BYTE_DATA(s), 1983 PyUnicode_2BYTE_DATA(s) + len, 1984 result); 1985 } 1986 else { 1987 assert(skind == PyUnicode_1BYTE_KIND); 1988 _PyUnicode_CONVERT_BYTES( 1989 Py_UCS1, Py_UCS4, 1990 PyUnicode_1BYTE_DATA(s), 1991 PyUnicode_1BYTE_DATA(s) + len, 1992 result); 1993 } 1994 
return result; 1995 default: 1996 break; 1997 } 1998 PyErr_SetString(PyExc_SystemError, "invalid kind"); 1999 return NULL; 2000} 2001 2002static Py_UCS4* 2003as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize, 2004 int copy_null) 2005{ 2006 int kind; 2007 void *data; 2008 Py_ssize_t len, targetlen; 2009 if (PyUnicode_READY(string) == -1) 2010 return NULL; 2011 kind = PyUnicode_KIND(string); 2012 data = PyUnicode_DATA(string); 2013 len = PyUnicode_GET_LENGTH(string); 2014 targetlen = len; 2015 if (copy_null) 2016 targetlen++; 2017 if (!target) { 2018 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) { 2019 PyErr_NoMemory(); 2020 return NULL; 2021 } 2022 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4)); 2023 if (!target) { 2024 PyErr_NoMemory(); 2025 return NULL; 2026 } 2027 } 2028 else { 2029 if (targetsize < targetlen) { 2030 PyErr_Format(PyExc_SystemError, 2031 "string is longer than the buffer"); 2032 if (copy_null && 0 < targetsize) 2033 target[0] = 0; 2034 return NULL; 2035 } 2036 } 2037 if (kind == PyUnicode_1BYTE_KIND) { 2038 Py_UCS1 *start = (Py_UCS1 *) data; 2039 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target); 2040 } 2041 else if (kind == PyUnicode_2BYTE_KIND) { 2042 Py_UCS2 *start = (Py_UCS2 *) data; 2043 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target); 2044 } 2045 else { 2046 assert(kind == PyUnicode_4BYTE_KIND); 2047 Py_MEMCPY(target, data, len * sizeof(Py_UCS4)); 2048 } 2049 if (copy_null) 2050 target[len] = 0; 2051 return target; 2052} 2053 2054Py_UCS4* 2055PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize, 2056 int copy_null) 2057{ 2058 if (target == NULL || targetsize < 0) { 2059 PyErr_BadInternalCall(); 2060 return NULL; 2061 } 2062 return as_ucs4(string, target, targetsize, copy_null); 2063} 2064 2065Py_UCS4* 2066PyUnicode_AsUCS4Copy(PyObject *string) 2067{ 2068 return as_ucs4(string, NULL, 0, 1); 2069} 2070 2071#ifdef HAVE_WCHAR_H 2072 2073PyObject * 
2074PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size) 2075{ 2076 if (w == NULL) { 2077 if (size == 0) 2078 return PyUnicode_New(0, 0); 2079 PyErr_BadInternalCall(); 2080 return NULL; 2081 } 2082 2083 if (size == -1) { 2084 size = wcslen(w); 2085 } 2086 2087 return PyUnicode_FromUnicode(w, size); 2088} 2089 2090#endif /* HAVE_WCHAR_H */ 2091 2092static void 2093makefmt(char *fmt, int longflag, int longlongflag, int size_tflag, 2094 int zeropad, int width, int precision, char c) 2095{ 2096 *fmt++ = '%'; 2097 if (width) { 2098 if (zeropad) 2099 *fmt++ = '0'; 2100 fmt += sprintf(fmt, "%d", width); 2101 } 2102 if (precision) 2103 fmt += sprintf(fmt, ".%d", precision); 2104 if (longflag) 2105 *fmt++ = 'l'; 2106 else if (longlongflag) { 2107 /* longlongflag should only ever be nonzero on machines with 2108 HAVE_LONG_LONG defined */ 2109#ifdef HAVE_LONG_LONG 2110 char *f = PY_FORMAT_LONG_LONG; 2111 while (*f) 2112 *fmt++ = *f++; 2113#else 2114 /* we shouldn't ever get here */ 2115 assert(0); 2116 *fmt++ = 'l'; 2117#endif 2118 } 2119 else if (size_tflag) { 2120 char *f = PY_FORMAT_SIZE_T; 2121 while (*f) 2122 *fmt++ = *f++; 2123 } 2124 *fmt++ = c; 2125 *fmt = '\0'; 2126} 2127 2128/* helper for PyUnicode_FromFormatV() */ 2129 2130static const char* 2131parse_format_flags(const char *f, 2132 int *p_width, int *p_precision, 2133 int *p_longflag, int *p_longlongflag, int *p_size_tflag) 2134{ 2135 int width, precision, longflag, longlongflag, size_tflag; 2136 2137 /* parse the width.precision part, e.g. 
"%2.5s" => width=2, precision=5 */ 2138 f++; 2139 width = 0; 2140 while (Py_ISDIGIT((unsigned)*f)) 2141 width = (width*10) + *f++ - '0'; 2142 precision = 0; 2143 if (*f == '.') { 2144 f++; 2145 while (Py_ISDIGIT((unsigned)*f)) 2146 precision = (precision*10) + *f++ - '0'; 2147 if (*f == '%') { 2148 /* "%.3%s" => f points to "3" */ 2149 f--; 2150 } 2151 } 2152 if (*f == '\0') { 2153 /* bogus format "%.1" => go backward, f points to "1" */ 2154 f--; 2155 } 2156 if (p_width != NULL) 2157 *p_width = width; 2158 if (p_precision != NULL) 2159 *p_precision = precision; 2160 2161 /* Handle %ld, %lu, %lld and %llu. */ 2162 longflag = 0; 2163 longlongflag = 0; 2164 size_tflag = 0; 2165 2166 if (*f == 'l') { 2167 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') { 2168 longflag = 1; 2169 ++f; 2170 } 2171#ifdef HAVE_LONG_LONG 2172 else if (f[1] == 'l' && 2173 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) { 2174 longlongflag = 1; 2175 f += 2; 2176 } 2177#endif 2178 } 2179 /* handle the size_t flag. */ 2180 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) { 2181 size_tflag = 1; 2182 ++f; 2183 } 2184 if (p_longflag != NULL) 2185 *p_longflag = longflag; 2186 if (p_longlongflag != NULL) 2187 *p_longlongflag = longlongflag; 2188 if (p_size_tflag != NULL) 2189 *p_size_tflag = size_tflag; 2190 return f; 2191} 2192 2193/* maximum number of characters required for output of %ld. 21 characters 2194 allows for 64-bit integers (in decimal) and an optional sign. */ 2195#define MAX_LONG_CHARS 21 2196/* maximum number of characters required for output of %lld. 2197 We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits, 2198 plus 1 for the sign. 53/22 is an upper bound for log10(256). 
*/ 2199#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22) 2200 2201PyObject * 2202PyUnicode_FromFormatV(const char *format, va_list vargs) 2203{ 2204 va_list count; 2205 Py_ssize_t callcount = 0; 2206 PyObject **callresults = NULL; 2207 PyObject **callresult = NULL; 2208 Py_ssize_t n = 0; 2209 int width = 0; 2210 int precision = 0; 2211 int zeropad; 2212 const char* f; 2213 PyObject *string; 2214 /* used by sprintf */ 2215 char fmt[61]; /* should be enough for %0width.precisionlld */ 2216 Py_UCS4 maxchar = 127; /* result is ASCII by default */ 2217 Py_UCS4 argmaxchar; 2218 Py_ssize_t numbersize = 0; 2219 char *numberresults = NULL; 2220 char *numberresult = NULL; 2221 Py_ssize_t i; 2222 int kind; 2223 void *data; 2224 2225 Py_VA_COPY(count, vargs); 2226 /* step 1: count the number of %S/%R/%A/%s format specifications 2227 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/ 2228 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the 2229 * result in an array) 2230 * also estimate a upper bound for all the number formats in the string, 2231 * numbers will be formatted in step 3 and be kept in a '\0'-separated 2232 * buffer before putting everything together. */ 2233 for (f = format; *f; f++) { 2234 if (*f == '%') { 2235 int longlongflag; 2236 /* skip width or width.precision (eg. "1.2" of "%1.2f") */ 2237 f = parse_format_flags(f, &width, NULL, NULL, &longlongflag, NULL); 2238 if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V') 2239 ++callcount; 2240 2241 else if (*f == 'd' || *f=='u' || *f=='i' || *f=='x' || *f=='p') { 2242#ifdef HAVE_LONG_LONG 2243 if (longlongflag) { 2244 if (width < MAX_LONG_LONG_CHARS) 2245 width = MAX_LONG_LONG_CHARS; 2246 } 2247 else 2248#endif 2249 /* MAX_LONG_CHARS is enough to hold a 64-bit integer, 2250 including sign. Decimal takes the most space. This 2251 isn't enough for octal. If a width is specified we 2252 need more (which we allocate later). 
*/ 2253 if (width < MAX_LONG_CHARS) 2254 width = MAX_LONG_CHARS; 2255 2256 /* account for the size + '\0' to separate numbers 2257 inside of the numberresults buffer */ 2258 numbersize += (width + 1); 2259 } 2260 } 2261 else if ((unsigned char)*f > 127) { 2262 PyErr_Format(PyExc_ValueError, 2263 "PyUnicode_FromFormatV() expects an ASCII-encoded format " 2264 "string, got a non-ASCII byte: 0x%02x", 2265 (unsigned char)*f); 2266 return NULL; 2267 } 2268 } 2269 /* step 2: allocate memory for the results of 2270 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */ 2271 if (callcount) { 2272 callresults = PyObject_Malloc(sizeof(PyObject *) * callcount); 2273 if (!callresults) { 2274 PyErr_NoMemory(); 2275 return NULL; 2276 } 2277 callresult = callresults; 2278 } 2279 /* step 2.5: allocate memory for the results of formating numbers */ 2280 if (numbersize) { 2281 numberresults = PyObject_Malloc(numbersize); 2282 if (!numberresults) { 2283 PyErr_NoMemory(); 2284 goto fail; 2285 } 2286 numberresult = numberresults; 2287 } 2288 2289 /* step 3: format numbers and figure out how large a buffer we need */ 2290 for (f = format; *f; f++) { 2291 if (*f == '%') { 2292 const char* p; 2293 int longflag; 2294 int longlongflag; 2295 int size_tflag; 2296 int numprinted; 2297 2298 p = f; 2299 zeropad = (f[1] == '0'); 2300 f = parse_format_flags(f, &width, &precision, 2301 &longflag, &longlongflag, &size_tflag); 2302 switch (*f) { 2303 case 'c': 2304 { 2305 Py_UCS4 ordinal = va_arg(count, int); 2306 maxchar = Py_MAX(maxchar, ordinal); 2307 n++; 2308 break; 2309 } 2310 case '%': 2311 n++; 2312 break; 2313 case 'i': 2314 case 'd': 2315 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad, 2316 width, precision, *f); 2317 if (longflag) 2318 numprinted = sprintf(numberresult, fmt, 2319 va_arg(count, long)); 2320#ifdef HAVE_LONG_LONG 2321 else if (longlongflag) 2322 numprinted = sprintf(numberresult, fmt, 2323 va_arg(count, PY_LONG_LONG)); 2324#endif 2325 else if (size_tflag) 
2326 numprinted = sprintf(numberresult, fmt, 2327 va_arg(count, Py_ssize_t)); 2328 else 2329 numprinted = sprintf(numberresult, fmt, 2330 va_arg(count, int)); 2331 n += numprinted; 2332 /* advance by +1 to skip over the '\0' */ 2333 numberresult += (numprinted + 1); 2334 assert(*(numberresult - 1) == '\0'); 2335 assert(*(numberresult - 2) != '\0'); 2336 assert(numprinted >= 0); 2337 assert(numberresult <= numberresults + numbersize); 2338 break; 2339 case 'u': 2340 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad, 2341 width, precision, 'u'); 2342 if (longflag) 2343 numprinted = sprintf(numberresult, fmt, 2344 va_arg(count, unsigned long)); 2345#ifdef HAVE_LONG_LONG 2346 else if (longlongflag) 2347 numprinted = sprintf(numberresult, fmt, 2348 va_arg(count, unsigned PY_LONG_LONG)); 2349#endif 2350 else if (size_tflag) 2351 numprinted = sprintf(numberresult, fmt, 2352 va_arg(count, size_t)); 2353 else 2354 numprinted = sprintf(numberresult, fmt, 2355 va_arg(count, unsigned int)); 2356 n += numprinted; 2357 numberresult += (numprinted + 1); 2358 assert(*(numberresult - 1) == '\0'); 2359 assert(*(numberresult - 2) != '\0'); 2360 assert(numprinted >= 0); 2361 assert(numberresult <= numberresults + numbersize); 2362 break; 2363 case 'x': 2364 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x'); 2365 numprinted = sprintf(numberresult, fmt, va_arg(count, int)); 2366 n += numprinted; 2367 numberresult += (numprinted + 1); 2368 assert(*(numberresult - 1) == '\0'); 2369 assert(*(numberresult - 2) != '\0'); 2370 assert(numprinted >= 0); 2371 assert(numberresult <= numberresults + numbersize); 2372 break; 2373 case 'p': 2374 numprinted = sprintf(numberresult, "%p", va_arg(count, void*)); 2375 /* %p is ill-defined: ensure leading 0x. 
*/ 2376 if (numberresult[1] == 'X') 2377 numberresult[1] = 'x'; 2378 else if (numberresult[1] != 'x') { 2379 memmove(numberresult + 2, numberresult, 2380 strlen(numberresult) + 1); 2381 numberresult[0] = '0'; 2382 numberresult[1] = 'x'; 2383 numprinted += 2; 2384 } 2385 n += numprinted; 2386 numberresult += (numprinted + 1); 2387 assert(*(numberresult - 1) == '\0'); 2388 assert(*(numberresult - 2) != '\0'); 2389 assert(numprinted >= 0); 2390 assert(numberresult <= numberresults + numbersize); 2391 break; 2392 case 's': 2393 { 2394 /* UTF-8 */ 2395 const char *s = va_arg(count, const char*); 2396 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace"); 2397 if (!str) 2398 goto fail; 2399 /* since PyUnicode_DecodeUTF8 returns already flexible 2400 unicode objects, there is no need to call ready on them */ 2401 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str); 2402 maxchar = Py_MAX(maxchar, argmaxchar); 2403 n += PyUnicode_GET_LENGTH(str); 2404 /* Remember the str and switch to the next slot */ 2405 *callresult++ = str; 2406 break; 2407 } 2408 case 'U': 2409 { 2410 PyObject *obj = va_arg(count, PyObject *); 2411 assert(obj && _PyUnicode_CHECK(obj)); 2412 if (PyUnicode_READY(obj) == -1) 2413 goto fail; 2414 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj); 2415 maxchar = Py_MAX(maxchar, argmaxchar); 2416 n += PyUnicode_GET_LENGTH(obj); 2417 break; 2418 } 2419 case 'V': 2420 { 2421 PyObject *obj = va_arg(count, PyObject *); 2422 const char *str = va_arg(count, const char *); 2423 PyObject *str_obj; 2424 assert(obj || str); 2425 assert(!obj || _PyUnicode_CHECK(obj)); 2426 if (obj) { 2427 if (PyUnicode_READY(obj) == -1) 2428 goto fail; 2429 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj); 2430 maxchar = Py_MAX(maxchar, argmaxchar); 2431 n += PyUnicode_GET_LENGTH(obj); 2432 *callresult++ = NULL; 2433 } 2434 else { 2435 str_obj = PyUnicode_DecodeUTF8(str, strlen(str), "replace"); 2436 if (!str_obj) 2437 goto fail; 2438 if (PyUnicode_READY(str_obj)) { 2439 Py_DECREF(str_obj); 2440 goto 
fail; 2441 } 2442 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str_obj); 2443 maxchar = Py_MAX(maxchar, argmaxchar); 2444 n += PyUnicode_GET_LENGTH(str_obj); 2445 *callresult++ = str_obj; 2446 } 2447 break; 2448 } 2449 case 'S': 2450 { 2451 PyObject *obj = va_arg(count, PyObject *); 2452 PyObject *str; 2453 assert(obj); 2454 str = PyObject_Str(obj); 2455 if (!str || PyUnicode_READY(str) == -1) 2456 goto fail; 2457 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str); 2458 maxchar = Py_MAX(maxchar, argmaxchar); 2459 n += PyUnicode_GET_LENGTH(str); 2460 /* Remember the str and switch to the next slot */ 2461 *callresult++ = str; 2462 break; 2463 } 2464 case 'R': 2465 { 2466 PyObject *obj = va_arg(count, PyObject *); 2467 PyObject *repr; 2468 assert(obj); 2469 repr = PyObject_Repr(obj); 2470 if (!repr || PyUnicode_READY(repr) == -1) 2471 goto fail; 2472 argmaxchar = PyUnicode_MAX_CHAR_VALUE(repr); 2473 maxchar = Py_MAX(maxchar, argmaxchar); 2474 n += PyUnicode_GET_LENGTH(repr); 2475 /* Remember the repr and switch to the next slot */ 2476 *callresult++ = repr; 2477 break; 2478 } 2479 case 'A': 2480 { 2481 PyObject *obj = va_arg(count, PyObject *); 2482 PyObject *ascii; 2483 assert(obj); 2484 ascii = PyObject_ASCII(obj); 2485 if (!ascii || PyUnicode_READY(ascii) == -1) 2486 goto fail; 2487 argmaxchar = PyUnicode_MAX_CHAR_VALUE(ascii); 2488 maxchar = Py_MAX(maxchar, argmaxchar); 2489 n += PyUnicode_GET_LENGTH(ascii); 2490 /* Remember the repr and switch to the next slot */ 2491 *callresult++ = ascii; 2492 break; 2493 } 2494 default: 2495 /* if we stumble upon an unknown 2496 formatting code, copy the rest of 2497 the format string to the output 2498 string. (we cannot just skip the 2499 code, since there's no way to know 2500 what's in the argument list) */ 2501 n += strlen(p); 2502 goto expand; 2503 } 2504 } else 2505 n++; 2506 } 2507 expand: 2508 /* step 4: fill the buffer */ 2509 /* Since we've analyzed how much space we need, 2510 we don't have to resize the string. 
2511 There can be no errors beyond this point. */ 2512 string = PyUnicode_New(n, maxchar); 2513 if (!string) 2514 goto fail; 2515 kind = PyUnicode_KIND(string); 2516 data = PyUnicode_DATA(string); 2517 callresult = callresults; 2518 numberresult = numberresults; 2519 2520 for (i = 0, f = format; *f; f++) { 2521 if (*f == '%') { 2522 const char* p; 2523 2524 p = f; 2525 f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL); 2526 /* checking for == because the last argument could be a empty 2527 string, which causes i to point to end, the assert at the end of 2528 the loop */ 2529 assert(i <= PyUnicode_GET_LENGTH(string)); 2530 2531 switch (*f) { 2532 case 'c': 2533 { 2534 const int ordinal = va_arg(vargs, int); 2535 PyUnicode_WRITE(kind, data, i++, ordinal); 2536 break; 2537 } 2538 case 'i': 2539 case 'd': 2540 case 'u': 2541 case 'x': 2542 case 'p': 2543 /* unused, since we already have the result */ 2544 if (*f == 'p') 2545 (void) va_arg(vargs, void *); 2546 else 2547 (void) va_arg(vargs, int); 2548 /* extract the result from numberresults and append. 
*/ 2549 for (; *numberresult; ++i, ++numberresult) 2550 PyUnicode_WRITE(kind, data, i, *numberresult); 2551 /* skip over the separating '\0' */ 2552 assert(*numberresult == '\0'); 2553 numberresult++; 2554 assert(numberresult <= numberresults + numbersize); 2555 break; 2556 case 's': 2557 { 2558 /* unused, since we already have the result */ 2559 Py_ssize_t size; 2560 (void) va_arg(vargs, char *); 2561 size = PyUnicode_GET_LENGTH(*callresult); 2562 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string)); 2563 copy_characters(string, i, *callresult, 0, size); 2564 i += size; 2565 /* We're done with the unicode()/repr() => forget it */ 2566 Py_DECREF(*callresult); 2567 /* switch to next unicode()/repr() result */ 2568 ++callresult; 2569 break; 2570 } 2571 case 'U': 2572 { 2573 PyObject *obj = va_arg(vargs, PyObject *); 2574 Py_ssize_t size; 2575 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string)); 2576 size = PyUnicode_GET_LENGTH(obj); 2577 copy_characters(string, i, obj, 0, size); 2578 i += size; 2579 break; 2580 } 2581 case 'V': 2582 { 2583 Py_ssize_t size; 2584 PyObject *obj = va_arg(vargs, PyObject *); 2585 va_arg(vargs, const char *); 2586 if (obj) { 2587 size = PyUnicode_GET_LENGTH(obj); 2588 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string)); 2589 copy_characters(string, i, obj, 0, size); 2590 i += size; 2591 } else { 2592 size = PyUnicode_GET_LENGTH(*callresult); 2593 assert(PyUnicode_KIND(*callresult) <= 2594 PyUnicode_KIND(string)); 2595 copy_characters(string, i, *callresult, 0, size); 2596 i += size; 2597 Py_DECREF(*callresult); 2598 } 2599 ++callresult; 2600 break; 2601 } 2602 case 'S': 2603 case 'R': 2604 case 'A': 2605 { 2606 Py_ssize_t size = PyUnicode_GET_LENGTH(*callresult); 2607 /* unused, since we already have the result */ 2608 (void) va_arg(vargs, PyObject *); 2609 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string)); 2610 copy_characters(string, i, *callresult, 0, size); 2611 i += size; 2612 /* We're done with the 
unicode()/repr() => forget it */ 2613 Py_DECREF(*callresult); 2614 /* switch to next unicode()/repr() result */ 2615 ++callresult; 2616 break; 2617 } 2618 case '%': 2619 PyUnicode_WRITE(kind, data, i++, '%'); 2620 break; 2621 default: 2622 for (; *p; ++p, ++i) 2623 PyUnicode_WRITE(kind, data, i, *p); 2624 assert(i == PyUnicode_GET_LENGTH(string)); 2625 goto end; 2626 } 2627 } 2628 else { 2629 assert(i < PyUnicode_GET_LENGTH(string)); 2630 PyUnicode_WRITE(kind, data, i++, *f); 2631 } 2632 } 2633 assert(i == PyUnicode_GET_LENGTH(string)); 2634 2635 end: 2636 if (callresults) 2637 PyObject_Free(callresults); 2638 if (numberresults) 2639 PyObject_Free(numberresults); 2640 assert(_PyUnicode_CheckConsistency(string, 1)); 2641 return string; 2642 fail: 2643 if (callresults) { 2644 PyObject **callresult2 = callresults; 2645 while (callresult2 < callresult) { 2646 Py_XDECREF(*callresult2); 2647 ++callresult2; 2648 } 2649 PyObject_Free(callresults); 2650 } 2651 if (numberresults) 2652 PyObject_Free(numberresults); 2653 return NULL; 2654} 2655 2656PyObject * 2657PyUnicode_FromFormat(const char *format, ...) 2658{ 2659 PyObject* ret; 2660 va_list vargs; 2661 2662#ifdef HAVE_STDARG_PROTOTYPES 2663 va_start(vargs, format); 2664#else 2665 va_start(vargs); 2666#endif 2667 ret = PyUnicode_FromFormatV(format, vargs); 2668 va_end(vargs); 2669 return ret; 2670} 2671 2672#ifdef HAVE_WCHAR_H 2673 2674/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString(): 2675 convert a Unicode object to a wide character string. 2676 2677 - If w is NULL: return the number of wide characters (including the null 2678 character) required to convert the unicode object. Ignore size argument. 2679 2680 - Otherwise: return the number of wide characters (excluding the null 2681 character) written into w. Write at most size wide characters (including 2682 the null character). 
*/ 2683static Py_ssize_t 2684unicode_aswidechar(PyObject *unicode, 2685 wchar_t *w, 2686 Py_ssize_t size) 2687{ 2688 Py_ssize_t res; 2689 const wchar_t *wstr; 2690 2691 wstr = PyUnicode_AsUnicodeAndSize(unicode, &res); 2692 if (wstr == NULL) 2693 return -1; 2694 2695 if (w != NULL) { 2696 if (size > res) 2697 size = res + 1; 2698 else 2699 res = size; 2700 Py_MEMCPY(w, wstr, size * sizeof(wchar_t)); 2701 return res; 2702 } 2703 else 2704 return res + 1; 2705} 2706 2707Py_ssize_t 2708PyUnicode_AsWideChar(PyObject *unicode, 2709 wchar_t *w, 2710 Py_ssize_t size) 2711{ 2712 if (unicode == NULL) { 2713 PyErr_BadInternalCall(); 2714 return -1; 2715 } 2716 return unicode_aswidechar(unicode, w, size); 2717} 2718 2719wchar_t* 2720PyUnicode_AsWideCharString(PyObject *unicode, 2721 Py_ssize_t *size) 2722{ 2723 wchar_t* buffer; 2724 Py_ssize_t buflen; 2725 2726 if (unicode == NULL) { 2727 PyErr_BadInternalCall(); 2728 return NULL; 2729 } 2730 2731 buflen = unicode_aswidechar(unicode, NULL, 0); 2732 if (buflen == -1) 2733 return NULL; 2734 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) { 2735 PyErr_NoMemory(); 2736 return NULL; 2737 } 2738 2739 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t)); 2740 if (buffer == NULL) { 2741 PyErr_NoMemory(); 2742 return NULL; 2743 } 2744 buflen = unicode_aswidechar(unicode, buffer, buflen); 2745 if (buflen == -1) 2746 return NULL; 2747 if (size != NULL) 2748 *size = buflen; 2749 return buffer; 2750} 2751 2752#endif /* HAVE_WCHAR_H */ 2753 2754PyObject * 2755PyUnicode_FromOrdinal(int ordinal) 2756{ 2757 PyObject *v; 2758 if (ordinal < 0 || ordinal > 0x10ffff) { 2759 PyErr_SetString(PyExc_ValueError, 2760 "chr() arg not in range(0x110000)"); 2761 return NULL; 2762 } 2763 2764 if (ordinal < 256) 2765 return get_latin1_char(ordinal); 2766 2767 v = PyUnicode_New(1, ordinal); 2768 if (v == NULL) 2769 return NULL; 2770 PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal); 2771 assert(_PyUnicode_CheckConsistency(v, 1)); 2772 return v; 
2773} 2774 2775PyObject * 2776PyUnicode_FromObject(register PyObject *obj) 2777{ 2778 /* XXX Perhaps we should make this API an alias of 2779 PyObject_Str() instead ?! */ 2780 if (PyUnicode_CheckExact(obj)) { 2781 if (PyUnicode_READY(obj)) 2782 return NULL; 2783 Py_INCREF(obj); 2784 return obj; 2785 } 2786 if (PyUnicode_Check(obj)) { 2787 /* For a Unicode subtype that's not a Unicode object, 2788 return a true Unicode object with the same data. */ 2789 return PyUnicode_Copy(obj); 2790 } 2791 PyErr_Format(PyExc_TypeError, 2792 "Can't convert '%.100s' object to str implicitly", 2793 Py_TYPE(obj)->tp_name); 2794 return NULL; 2795} 2796 2797PyObject * 2798PyUnicode_FromEncodedObject(register PyObject *obj, 2799 const char *encoding, 2800 const char *errors) 2801{ 2802 Py_buffer buffer; 2803 PyObject *v; 2804 2805 if (obj == NULL) { 2806 PyErr_BadInternalCall(); 2807 return NULL; 2808 } 2809 2810 /* Decoding bytes objects is the most common case and should be fast */ 2811 if (PyBytes_Check(obj)) { 2812 if (PyBytes_GET_SIZE(obj) == 0) { 2813 Py_INCREF(unicode_empty); 2814 v = unicode_empty; 2815 } 2816 else { 2817 v = PyUnicode_Decode( 2818 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj), 2819 encoding, errors); 2820 } 2821 return v; 2822 } 2823 2824 if (PyUnicode_Check(obj)) { 2825 PyErr_SetString(PyExc_TypeError, 2826 "decoding str is not supported"); 2827 return NULL; 2828 } 2829 2830 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */ 2831 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) { 2832 PyErr_Format(PyExc_TypeError, 2833 "coercing to str: need bytes, bytearray " 2834 "or buffer-like object, %.80s found", 2835 Py_TYPE(obj)->tp_name); 2836 return NULL; 2837 } 2838 2839 if (buffer.len == 0) { 2840 Py_INCREF(unicode_empty); 2841 v = unicode_empty; 2842 } 2843 else 2844 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors); 2845 2846 PyBuffer_Release(&buffer); 2847 return v; 2848} 2849 2850/* Convert encoding to lower 
case and replace '_' with '-' in order to 2851 catch e.g. UTF_8. Return 0 on error (encoding is longer than lower_len-1), 2852 1 on success. */ 2853static int 2854normalize_encoding(const char *encoding, 2855 char *lower, 2856 size_t lower_len) 2857{ 2858 const char *e; 2859 char *l; 2860 char *l_end; 2861 2862 if (encoding == NULL) { 2863 strcpy(lower, "utf-8"); 2864 return 1; 2865 } 2866 e = encoding; 2867 l = lower; 2868 l_end = &lower[lower_len - 1]; 2869 while (*e) { 2870 if (l == l_end) 2871 return 0; 2872 if (Py_ISUPPER(*e)) { 2873 *l++ = Py_TOLOWER(*e++); 2874 } 2875 else if (*e == '_') { 2876 *l++ = '-'; 2877 e++; 2878 } 2879 else { 2880 *l++ = *e++; 2881 } 2882 } 2883 *l = '\0'; 2884 return 1; 2885} 2886 2887PyObject * 2888PyUnicode_Decode(const char *s, 2889 Py_ssize_t size, 2890 const char *encoding, 2891 const char *errors) 2892{ 2893 PyObject *buffer = NULL, *unicode; 2894 Py_buffer info; 2895 char lower[11]; /* Enough for any encoding shortcut */ 2896 2897 /* Shortcuts for common default encodings */ 2898 if (normalize_encoding(encoding, lower, sizeof(lower))) { 2899 if ((strcmp(lower, "utf-8") == 0) || 2900 (strcmp(lower, "utf8") == 0)) 2901 return PyUnicode_DecodeUTF8(s, size, errors); 2902 else if ((strcmp(lower, "latin-1") == 0) || 2903 (strcmp(lower, "latin1") == 0) || 2904 (strcmp(lower, "iso-8859-1") == 0)) 2905 return PyUnicode_DecodeLatin1(s, size, errors); 2906#ifdef HAVE_MBCS 2907 else if (strcmp(lower, "mbcs") == 0) 2908 return PyUnicode_DecodeMBCS(s, size, errors); 2909#endif 2910 else if (strcmp(lower, "ascii") == 0) 2911 return PyUnicode_DecodeASCII(s, size, errors); 2912 else if (strcmp(lower, "utf-16") == 0) 2913 return PyUnicode_DecodeUTF16(s, size, errors, 0); 2914 else if (strcmp(lower, "utf-32") == 0) 2915 return PyUnicode_DecodeUTF32(s, size, errors, 0); 2916 } 2917 2918 /* Decode via the codec registry */ 2919 buffer = NULL; 2920 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0) 2921 goto onError; 2922 
buffer = PyMemoryView_FromBuffer(&info); 2923 if (buffer == NULL) 2924 goto onError; 2925 unicode = PyCodec_Decode(buffer, encoding, errors); 2926 if (unicode == NULL) 2927 goto onError; 2928 if (!PyUnicode_Check(unicode)) { 2929 PyErr_Format(PyExc_TypeError, 2930 "decoder did not return a str object (type=%.400s)", 2931 Py_TYPE(unicode)->tp_name); 2932 Py_DECREF(unicode); 2933 goto onError; 2934 } 2935 Py_DECREF(buffer); 2936#ifndef DONT_MAKE_RESULT_READY 2937 if (_PyUnicode_READY_REPLACE(&unicode)) { 2938 Py_DECREF(unicode); 2939 return NULL; 2940 } 2941#endif 2942 assert(_PyUnicode_CheckConsistency(unicode, 1)); 2943 return unicode; 2944 2945 onError: 2946 Py_XDECREF(buffer); 2947 return NULL; 2948} 2949 2950PyObject * 2951PyUnicode_AsDecodedObject(PyObject *unicode, 2952 const char *encoding, 2953 const char *errors) 2954{ 2955 PyObject *v; 2956 2957 if (!PyUnicode_Check(unicode)) { 2958 PyErr_BadArgument(); 2959 goto onError; 2960 } 2961 2962 if (encoding == NULL) 2963 encoding = PyUnicode_GetDefaultEncoding(); 2964 2965 /* Decode via the codec registry */ 2966 v = PyCodec_Decode(unicode, encoding, errors); 2967 if (v == NULL) 2968 goto onError; 2969 assert(_PyUnicode_CheckConsistency(v, 1)); 2970 return v; 2971 2972 onError: 2973 return NULL; 2974} 2975 2976PyObject * 2977PyUnicode_AsDecodedUnicode(PyObject *unicode, 2978 const char *encoding, 2979 const char *errors) 2980{ 2981 PyObject *v; 2982 2983 if (!PyUnicode_Check(unicode)) { 2984 PyErr_BadArgument(); 2985 goto onError; 2986 } 2987 2988 if (encoding == NULL) 2989 encoding = PyUnicode_GetDefaultEncoding(); 2990 2991 /* Decode via the codec registry */ 2992 v = PyCodec_Decode(unicode, encoding, errors); 2993 if (v == NULL) 2994 goto onError; 2995 if (!PyUnicode_Check(v)) { 2996 PyErr_Format(PyExc_TypeError, 2997 "decoder did not return a str object (type=%.400s)", 2998 Py_TYPE(v)->tp_name); 2999 Py_DECREF(v); 3000 goto onError; 3001 } 3002 assert(_PyUnicode_CheckConsistency(v, 1)); 3003 return v; 3004 
3005 onError: 3006 return NULL; 3007} 3008 3009PyObject * 3010PyUnicode_Encode(const Py_UNICODE *s, 3011 Py_ssize_t size, 3012 const char *encoding, 3013 const char *errors) 3014{ 3015 PyObject *v, *unicode; 3016 3017 unicode = PyUnicode_FromUnicode(s, size); 3018 if (unicode == NULL) 3019 return NULL; 3020 v = PyUnicode_AsEncodedString(unicode, encoding, errors); 3021 Py_DECREF(unicode); 3022 return v; 3023} 3024 3025PyObject * 3026PyUnicode_AsEncodedObject(PyObject *unicode, 3027 const char *encoding, 3028 const char *errors) 3029{ 3030 PyObject *v; 3031 3032 if (!PyUnicode_Check(unicode)) { 3033 PyErr_BadArgument(); 3034 goto onError; 3035 } 3036 3037 if (encoding == NULL) 3038 encoding = PyUnicode_GetDefaultEncoding(); 3039 3040 /* Encode via the codec registry */ 3041 v = PyCodec_Encode(unicode, encoding, errors); 3042 if (v == NULL) 3043 goto onError; 3044 return v; 3045 3046 onError: 3047 return NULL; 3048} 3049 3050PyObject * 3051PyUnicode_EncodeFSDefault(PyObject *unicode) 3052{ 3053#ifdef HAVE_MBCS 3054 const Py_UNICODE *wstr; 3055 Py_ssize_t wlen; 3056 3057 wstr = PyUnicode_AsUnicodeAndSize(unicode, &wlen); 3058 if (wstr == NULL) 3059 return NULL; 3060 return PyUnicode_EncodeMBCS(wstr, wlen, NULL); 3061#elif defined(__APPLE__) 3062 return _PyUnicode_AsUTF8String(unicode, "surrogateescape"); 3063#else 3064 PyInterpreterState *interp = PyThreadState_GET()->interp; 3065 /* Bootstrap check: if the filesystem codec is implemented in Python, we 3066 cannot use it to encode and decode filenames before it is loaded. Load 3067 the Python codec requires to encode at least its own filename. Use the C 3068 version of the locale codec until the codec registry is initialized and 3069 the Python codec is loaded. 3070 3071 Py_FileSystemDefaultEncoding is shared between all interpreters, we 3072 cannot only rely on it: check also interp->fscodec_initialized for 3073 subinterpreters. 
*/ 3074 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) { 3075 return PyUnicode_AsEncodedString(unicode, 3076 Py_FileSystemDefaultEncoding, 3077 "surrogateescape"); 3078 } 3079 else { 3080 /* locale encoding with surrogateescape */ 3081 wchar_t *wchar; 3082 char *bytes; 3083 PyObject *bytes_obj; 3084 size_t error_pos; 3085 3086 wchar = PyUnicode_AsWideCharString(unicode, NULL); 3087 if (wchar == NULL) 3088 return NULL; 3089 bytes = _Py_wchar2char(wchar, &error_pos); 3090 if (bytes == NULL) { 3091 if (error_pos != (size_t)-1) { 3092 char *errmsg = strerror(errno); 3093 PyObject *exc = NULL; 3094 if (errmsg == NULL) 3095 errmsg = "Py_wchar2char() failed"; 3096 raise_encode_exception(&exc, 3097 "filesystemencoding", unicode, 3098 error_pos, error_pos+1, 3099 errmsg); 3100 Py_XDECREF(exc); 3101 } 3102 else 3103 PyErr_NoMemory(); 3104 PyMem_Free(wchar); 3105 return NULL; 3106 } 3107 PyMem_Free(wchar); 3108 3109 bytes_obj = PyBytes_FromString(bytes); 3110 PyMem_Free(bytes); 3111 return bytes_obj; 3112 } 3113#endif 3114} 3115 3116PyObject * 3117PyUnicode_AsEncodedString(PyObject *unicode, 3118 const char *encoding, 3119 const char *errors) 3120{ 3121 PyObject *v; 3122 char lower[11]; /* Enough for any encoding shortcut */ 3123 3124 if (!PyUnicode_Check(unicode)) { 3125 PyErr_BadArgument(); 3126 return NULL; 3127 } 3128 3129 /* Shortcuts for common default encodings */ 3130 if (normalize_encoding(encoding, lower, sizeof(lower))) { 3131 if ((strcmp(lower, "utf-8") == 0) || 3132 (strcmp(lower, "utf8") == 0)) 3133 { 3134 if (errors == NULL || strcmp(errors, "strict") == 0) 3135 return _PyUnicode_AsUTF8String(unicode, NULL); 3136 else 3137 return _PyUnicode_AsUTF8String(unicode, errors); 3138 } 3139 else if ((strcmp(lower, "latin-1") == 0) || 3140 (strcmp(lower, "latin1") == 0) || 3141 (strcmp(lower, "iso-8859-1") == 0)) 3142 return _PyUnicode_AsLatin1String(unicode, errors); 3143#ifdef HAVE_MBCS 3144 else if (strcmp(lower, "mbcs") == 0) { 3145 const 
Py_UNICODE *wstr; 3146 Py_ssize_t wlen; 3147 3148 wstr = PyUnicode_AsUnicodeAndSize(unicode, &wlen); 3149 if (wstr == NULL) 3150 return NULL; 3151 return PyUnicode_EncodeMBCS(wstr, wlen, errors); 3152 } 3153#endif 3154 else if (strcmp(lower, "ascii") == 0) 3155 return _PyUnicode_AsASCIIString(unicode, errors); 3156 } 3157 3158 /* Encode via the codec registry */ 3159 v = PyCodec_Encode(unicode, encoding, errors); 3160 if (v == NULL) 3161 return NULL; 3162 3163 /* The normal path */ 3164 if (PyBytes_Check(v)) 3165 return v; 3166 3167 /* If the codec returns a buffer, raise a warning and convert to bytes */ 3168 if (PyByteArray_Check(v)) { 3169 int error; 3170 PyObject *b; 3171 3172 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1, 3173 "encoder %s returned bytearray instead of bytes", 3174 encoding); 3175 if (error) { 3176 Py_DECREF(v); 3177 return NULL; 3178 } 3179 3180 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v)); 3181 Py_DECREF(v); 3182 return b; 3183 } 3184 3185 PyErr_Format(PyExc_TypeError, 3186 "encoder did not return a bytes object (type=%.400s)", 3187 Py_TYPE(v)->tp_name); 3188 Py_DECREF(v); 3189 return NULL; 3190} 3191 3192PyObject * 3193PyUnicode_AsEncodedUnicode(PyObject *unicode, 3194 const char *encoding, 3195 const char *errors) 3196{ 3197 PyObject *v; 3198 3199 if (!PyUnicode_Check(unicode)) { 3200 PyErr_BadArgument(); 3201 goto onError; 3202 } 3203 3204 if (encoding == NULL) 3205 encoding = PyUnicode_GetDefaultEncoding(); 3206 3207 /* Encode via the codec registry */ 3208 v = PyCodec_Encode(unicode, encoding, errors); 3209 if (v == NULL) 3210 goto onError; 3211 if (!PyUnicode_Check(v)) { 3212 PyErr_Format(PyExc_TypeError, 3213 "encoder did not return an str object (type=%.400s)", 3214 Py_TYPE(v)->tp_name); 3215 Py_DECREF(v); 3216 goto onError; 3217 } 3218 return v; 3219 3220 onError: 3221 return NULL; 3222} 3223 3224PyObject* 3225PyUnicode_DecodeFSDefault(const char *s) { 3226 Py_ssize_t size = (Py_ssize_t)strlen(s); 3227 
return PyUnicode_DecodeFSDefaultAndSize(s, size); 3228} 3229 3230PyObject* 3231PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size) 3232{ 3233#ifdef HAVE_MBCS 3234 return PyUnicode_DecodeMBCS(s, size, NULL); 3235#elif defined(__APPLE__) 3236 return PyUnicode_DecodeUTF8(s, size, "surrogateescape"); 3237#else 3238 PyInterpreterState *interp = PyThreadState_GET()->interp; 3239 /* Bootstrap check: if the filesystem codec is implemented in Python, we 3240 cannot use it to encode and decode filenames before it is loaded. Load 3241 the Python codec requires to encode at least its own filename. Use the C 3242 version of the locale codec until the codec registry is initialized and 3243 the Python codec is loaded. 3244 3245 Py_FileSystemDefaultEncoding is shared between all interpreters, we 3246 cannot only rely on it: check also interp->fscodec_initialized for 3247 subinterpreters. */ 3248 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) { 3249 return PyUnicode_Decode(s, size, 3250 Py_FileSystemDefaultEncoding, 3251 "surrogateescape"); 3252 } 3253 else { 3254 /* locale encoding with surrogateescape */ 3255 wchar_t *wchar; 3256 PyObject *unicode; 3257 size_t len; 3258 3259 if (s[size] != '\0' || size != strlen(s)) { 3260 PyErr_SetString(PyExc_TypeError, "embedded NUL character"); 3261 return NULL; 3262 } 3263 3264 wchar = _Py_char2wchar(s, &len); 3265 if (wchar == NULL) 3266 return PyErr_NoMemory(); 3267 3268 unicode = PyUnicode_FromWideChar(wchar, len); 3269 PyMem_Free(wchar); 3270 return unicode; 3271 } 3272#endif 3273} 3274 3275 3276int 3277PyUnicode_FSConverter(PyObject* arg, void* addr) 3278{ 3279 PyObject *output = NULL; 3280 Py_ssize_t size; 3281 void *data; 3282 if (arg == NULL) { 3283 Py_DECREF(*(PyObject**)addr); 3284 return 1; 3285 } 3286 if (PyBytes_Check(arg)) { 3287 output = arg; 3288 Py_INCREF(output); 3289 } 3290 else { 3291 arg = PyUnicode_FromObject(arg); 3292 if (!arg) 3293 return 0; 3294 output = PyUnicode_EncodeFSDefault(arg); 
3295 Py_DECREF(arg); 3296 if (!output) 3297 return 0; 3298 if (!PyBytes_Check(output)) { 3299 Py_DECREF(output); 3300 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes"); 3301 return 0; 3302 } 3303 } 3304 size = PyBytes_GET_SIZE(output); 3305 data = PyBytes_AS_STRING(output); 3306 if (size != strlen(data)) { 3307 PyErr_SetString(PyExc_TypeError, "embedded NUL character"); 3308 Py_DECREF(output); 3309 return 0; 3310 } 3311 *(PyObject**)addr = output; 3312 return Py_CLEANUP_SUPPORTED; 3313} 3314 3315 3316int 3317PyUnicode_FSDecoder(PyObject* arg, void* addr) 3318{ 3319 PyObject *output = NULL; 3320 if (arg == NULL) { 3321 Py_DECREF(*(PyObject**)addr); 3322 return 1; 3323 } 3324 if (PyUnicode_Check(arg)) { 3325 if (PyUnicode_READY(arg)) 3326 return 0; 3327 output = arg; 3328 Py_INCREF(output); 3329 } 3330 else { 3331 arg = PyBytes_FromObject(arg); 3332 if (!arg) 3333 return 0; 3334 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg), 3335 PyBytes_GET_SIZE(arg)); 3336 Py_DECREF(arg); 3337 if (!output) 3338 return 0; 3339 if (!PyUnicode_Check(output)) { 3340 Py_DECREF(output); 3341 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode"); 3342 return 0; 3343 } 3344 } 3345 if (PyUnicode_READY(output) < 0) { 3346 Py_DECREF(output); 3347 return 0; 3348 } 3349 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output), 3350 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) { 3351 PyErr_SetString(PyExc_TypeError, "embedded NUL character"); 3352 Py_DECREF(output); 3353 return 0; 3354 } 3355 *(PyObject**)addr = output; 3356 return Py_CLEANUP_SUPPORTED; 3357} 3358 3359 3360char* 3361PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize) 3362{ 3363 PyObject *bytes; 3364 3365 if (!PyUnicode_Check(unicode)) { 3366 PyErr_BadArgument(); 3367 return NULL; 3368 } 3369 if (PyUnicode_READY(unicode) == -1) 3370 return NULL; 3371 3372 if (PyUnicode_UTF8(unicode) == NULL) { 3373 assert(!PyUnicode_IS_COMPACT_ASCII(unicode)); 3374 bytes = 
_PyUnicode_AsUTF8String(unicode, "strict"); 3375 if (bytes == NULL) 3376 return NULL; 3377 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1); 3378 if (_PyUnicode_UTF8(unicode) == NULL) { 3379 Py_DECREF(bytes); 3380 return NULL; 3381 } 3382 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes); 3383 Py_MEMCPY(_PyUnicode_UTF8(unicode), 3384 PyBytes_AS_STRING(bytes), 3385 _PyUnicode_UTF8_LENGTH(unicode) + 1); 3386 Py_DECREF(bytes); 3387 } 3388 3389 if (psize) 3390 *psize = PyUnicode_UTF8_LENGTH(unicode); 3391 return PyUnicode_UTF8(unicode); 3392} 3393 3394char* 3395PyUnicode_AsUTF8(PyObject *unicode) 3396{ 3397 return PyUnicode_AsUTF8AndSize(unicode, NULL); 3398} 3399 3400#ifdef Py_DEBUG 3401static int unicode_as_unicode_calls = 0; 3402#endif 3403 3404 3405Py_UNICODE * 3406PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size) 3407{ 3408 const unsigned char *one_byte; 3409#if SIZEOF_WCHAR_T == 4 3410 const Py_UCS2 *two_bytes; 3411#else 3412 const Py_UCS4 *four_bytes; 3413 const Py_UCS4 *ucs4_end; 3414 Py_ssize_t num_surrogates; 3415#endif 3416 wchar_t *w; 3417 wchar_t *wchar_end; 3418 3419 if (!PyUnicode_Check(unicode)) { 3420 PyErr_BadArgument(); 3421 return NULL; 3422 } 3423 if (_PyUnicode_WSTR(unicode) == NULL) { 3424 /* Non-ASCII compact unicode object */ 3425 assert(_PyUnicode_KIND(unicode) != 0); 3426 assert(PyUnicode_IS_READY(unicode)); 3427 3428#ifdef Py_DEBUG 3429 ++unicode_as_unicode_calls; 3430#endif 3431 3432 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) { 3433#if SIZEOF_WCHAR_T == 2 3434 four_bytes = PyUnicode_4BYTE_DATA(unicode); 3435 ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode); 3436 num_surrogates = 0; 3437 3438 for (; four_bytes < ucs4_end; ++four_bytes) { 3439 if (*four_bytes > 0xFFFF) 3440 ++num_surrogates; 3441 } 3442 3443 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC( 3444 sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates)); 3445 if (!_PyUnicode_WSTR(unicode)) { 3446 
PyErr_NoMemory(); 3447 return NULL; 3448 } 3449 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates; 3450 3451 w = _PyUnicode_WSTR(unicode); 3452 wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode); 3453 four_bytes = PyUnicode_4BYTE_DATA(unicode); 3454 for (; four_bytes < ucs4_end; ++four_bytes, ++w) { 3455 if (*four_bytes > 0xFFFF) { 3456 /* encode surrogate pair in this case */ 3457 *w++ = 0xD800 | ((*four_bytes - 0x10000) >> 10); 3458 *w = 0xDC00 | ((*four_bytes - 0x10000) & 0x3FF); 3459 } 3460 else 3461 *w = *four_bytes; 3462 3463 if (w > wchar_end) { 3464 assert(0 && "Miscalculated string end"); 3465 } 3466 } 3467 *w = 0; 3468#else 3469 /* sizeof(wchar_t) == 4 */ 3470 Py_FatalError("Impossible unicode object state, wstr and str " 3471 "should share memory already."); 3472 return NULL; 3473#endif 3474 } 3475 else { 3476 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) * 3477 (_PyUnicode_LENGTH(unicode) + 1)); 3478 if (!_PyUnicode_WSTR(unicode)) { 3479 PyErr_NoMemory(); 3480 return NULL; 3481 } 3482 if (!PyUnicode_IS_COMPACT_ASCII(unicode)) 3483 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode); 3484 w = _PyUnicode_WSTR(unicode); 3485 wchar_end = w + _PyUnicode_LENGTH(unicode); 3486 3487 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) { 3488 one_byte = PyUnicode_1BYTE_DATA(unicode); 3489 for (; w < wchar_end; ++one_byte, ++w) 3490 *w = *one_byte; 3491 /* null-terminate the wstr */ 3492 *w = 0; 3493 } 3494 else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) { 3495#if SIZEOF_WCHAR_T == 4 3496 two_bytes = PyUnicode_2BYTE_DATA(unicode); 3497 for (; w < wchar_end; ++two_bytes, ++w) 3498 *w = *two_bytes; 3499 /* null-terminate the wstr */ 3500 *w = 0; 3501#else 3502 /* sizeof(wchar_t) == 2 */ 3503 PyObject_FREE(_PyUnicode_WSTR(unicode)); 3504 _PyUnicode_WSTR(unicode) = NULL; 3505 Py_FatalError("Impossible unicode object state, wstr " 3506 "and str should share memory already."); 3507 return NULL; 
3508#endif 3509 } 3510 else { 3511 assert(0 && "This should never happen."); 3512 } 3513 } 3514 } 3515 if (size != NULL) 3516 *size = PyUnicode_WSTR_LENGTH(unicode); 3517 return _PyUnicode_WSTR(unicode); 3518} 3519 3520Py_UNICODE * 3521PyUnicode_AsUnicode(PyObject *unicode) 3522{ 3523 return PyUnicode_AsUnicodeAndSize(unicode, NULL); 3524} 3525 3526 3527Py_ssize_t 3528PyUnicode_GetSize(PyObject *unicode) 3529{ 3530 if (!PyUnicode_Check(unicode)) { 3531 PyErr_BadArgument(); 3532 goto onError; 3533 } 3534 return PyUnicode_GET_SIZE(unicode); 3535 3536 onError: 3537 return -1; 3538} 3539 3540Py_ssize_t 3541PyUnicode_GetLength(PyObject *unicode) 3542{ 3543 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) { 3544 PyErr_BadArgument(); 3545 return -1; 3546 } 3547 3548 return PyUnicode_GET_LENGTH(unicode); 3549} 3550 3551Py_UCS4 3552PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index) 3553{ 3554 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) { 3555 PyErr_BadArgument(); 3556 return (Py_UCS4)-1; 3557 } 3558 if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) { 3559 PyErr_SetString(PyExc_IndexError, "string index out of range"); 3560 return (Py_UCS4)-1; 3561 } 3562 return PyUnicode_READ_CHAR(unicode, index); 3563} 3564 3565int 3566PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch) 3567{ 3568 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) { 3569 PyErr_BadArgument(); 3570 return -1; 3571 } 3572 if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) { 3573 PyErr_SetString(PyExc_IndexError, "string index out of range"); 3574 return -1; 3575 } 3576 if (_PyUnicode_Dirty(unicode)) 3577 return -1; 3578 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode), 3579 index, ch); 3580 return 0; 3581} 3582 3583const char * 3584PyUnicode_GetDefaultEncoding(void) 3585{ 3586 return "utf-8"; 3587} 3588 3589/* create or adjust a UnicodeDecodeError */ 3590static void 3591make_decode_exception(PyObject 
**exceptionObject, 3592 const char *encoding, 3593 const char *input, Py_ssize_t length, 3594 Py_ssize_t startpos, Py_ssize_t endpos, 3595 const char *reason) 3596{ 3597 if (*exceptionObject == NULL) { 3598 *exceptionObject = PyUnicodeDecodeError_Create( 3599 encoding, input, length, startpos, endpos, reason); 3600 } 3601 else { 3602 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos)) 3603 goto onError; 3604 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos)) 3605 goto onError; 3606 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason)) 3607 goto onError; 3608 } 3609 return; 3610 3611onError: 3612 Py_DECREF(*exceptionObject); 3613 *exceptionObject = NULL; 3614} 3615 3616/* error handling callback helper: 3617 build arguments, call the callback and check the arguments, 3618 if no exception occurred, copy the replacement to the output 3619 and adjust various state variables. 3620 return 0 on success, -1 on error 3621*/ 3622 3623static int 3624unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler, 3625 const char *encoding, const char *reason, 3626 const char **input, const char **inend, Py_ssize_t *startinpos, 3627 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr, 3628 PyObject **output, Py_ssize_t *outpos) 3629{ 3630 static char *argparse = "O!n;decoding error handler must return (str, int) tuple"; 3631 3632 PyObject *restuple = NULL; 3633 PyObject *repunicode = NULL; 3634 Py_ssize_t outsize; 3635 Py_ssize_t insize; 3636 Py_ssize_t requiredsize; 3637 Py_ssize_t newpos; 3638 PyObject *inputobj = NULL; 3639 int res = -1; 3640 3641 if (_PyUnicode_KIND(*output) != PyUnicode_WCHAR_KIND) 3642 outsize = PyUnicode_GET_LENGTH(*output); 3643 else 3644 outsize = _PyUnicode_WSTR_LENGTH(*output); 3645 3646 if (*errorHandler == NULL) { 3647 *errorHandler = PyCodec_LookupError(errors); 3648 if (*errorHandler == NULL) 3649 goto onError; 3650 } 3651 3652 make_decode_exception(exceptionObject, 3653 encoding, 3654 
*input, *inend - *input, 3655 *startinpos, *endinpos, 3656 reason); 3657 if (*exceptionObject == NULL) 3658 goto onError; 3659 3660 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL); 3661 if (restuple == NULL) 3662 goto onError; 3663 if (!PyTuple_Check(restuple)) { 3664 PyErr_SetString(PyExc_TypeError, &argparse[4]); 3665 goto onError; 3666 } 3667 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos)) 3668 goto onError; 3669 if (PyUnicode_READY(repunicode) < 0) 3670 goto onError; 3671 3672 /* Copy back the bytes variables, which might have been modified by the 3673 callback */ 3674 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject); 3675 if (!inputobj) 3676 goto onError; 3677 if (!PyBytes_Check(inputobj)) { 3678 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes"); 3679 } 3680 *input = PyBytes_AS_STRING(inputobj); 3681 insize = PyBytes_GET_SIZE(inputobj); 3682 *inend = *input + insize; 3683 /* we can DECREF safely, as the exception has another reference, 3684 so the object won't go away. */ 3685 Py_DECREF(inputobj); 3686 3687 if (newpos<0) 3688 newpos = insize+newpos; 3689 if (newpos<0 || newpos>insize) { 3690 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos); 3691 goto onError; 3692 } 3693 3694 if (_PyUnicode_KIND(*output) != PyUnicode_WCHAR_KIND) { 3695 /* need more space? 
(at least enough for what we 3696 have+the replacement+the rest of the string (starting 3697 at the new input position), so we won't have to check space 3698 when there are no errors in the rest of the string) */ 3699 Py_ssize_t replen = PyUnicode_GET_LENGTH(repunicode); 3700 requiredsize = *outpos + replen + insize-newpos; 3701 if (requiredsize > outsize) { 3702 if (requiredsize<2*outsize) 3703 requiredsize = 2*outsize; 3704 if (unicode_resize(output, requiredsize) < 0) 3705 goto onError; 3706 } 3707 if (unicode_widen(output, PyUnicode_MAX_CHAR_VALUE(repunicode)) < 0) 3708 goto onError; 3709 copy_characters(*output, *outpos, repunicode, 0, replen); 3710 *outpos += replen; 3711 } 3712 else { 3713 wchar_t *repwstr; 3714 Py_ssize_t repwlen; 3715 repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen); 3716 if (repwstr == NULL) 3717 goto onError; 3718 /* need more space? (at least enough for what we 3719 have+the replacement+the rest of the string (starting 3720 at the new input position), so we won't have to check space 3721 when there are no errors in the rest of the string) */ 3722 requiredsize = *outpos + repwlen + insize-newpos; 3723 if (requiredsize > outsize) { 3724 if (requiredsize < 2*outsize) 3725 requiredsize = 2*outsize; 3726 if (unicode_resize(output, requiredsize) < 0) 3727 goto onError; 3728 } 3729 wcsncpy(_PyUnicode_WSTR(*output) + *outpos, repwstr, repwlen); 3730 *outpos += repwlen; 3731 } 3732 *endinpos = newpos; 3733 *inptr = *input + newpos; 3734 3735 /* we made it! */ 3736 res = 0; 3737 3738 onError: 3739 Py_XDECREF(restuple); 3740 return res; 3741} 3742 3743/* --- UTF-7 Codec -------------------------------------------------------- */ 3744 3745/* See RFC2152 for details. We encode conservatively and decode liberally. */ 3746 3747/* Three simple macros defining base-64. */ 3748 3749/* Is c a base-64 character? 
*/ 3750 3751#define IS_BASE64(c) \ 3752 (((c) >= 'A' && (c) <= 'Z') || \ 3753 ((c) >= 'a' && (c) <= 'z') || \ 3754 ((c) >= '0' && (c) <= '9') || \ 3755 (c) == '+' || (c) == '/') 3756 3757/* given that c is a base-64 character, what is its base-64 value? */ 3758 3759#define FROM_BASE64(c) \ 3760 (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' : \ 3761 ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 : \ 3762 ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 : \ 3763 (c) == '+' ? 62 : 63) 3764 3765/* What is the base-64 character of the bottom 6 bits of n? */ 3766 3767#define TO_BASE64(n) \ 3768 ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f]) 3769 3770/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be 3771 * decoded as itself. We are permissive on decoding; the only ASCII 3772 * byte not decoding to itself is the + which begins a base64 3773 * string. */ 3774 3775#define DECODE_DIRECT(c) \ 3776 ((c) <= 127 && (c) != '+') 3777 3778/* The UTF-7 encoder treats ASCII characters differently according to 3779 * whether they are Set D, Set O, Whitespace, or special (i.e. none of 3780 * the above). See RFC2152. This array identifies these different 3781 * sets: 3782 * 0 : "Set D" 3783 * alphanumeric and '(),-./:? 3784 * 1 : "Set O" 3785 * !"#$%&*;<=>@[]^_`{|} 3786 * 2 : "whitespace" 3787 * ht nl cr sp 3788 * 3 : special (must be base64 encoded) 3789 * everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127) 3790 */ 3791 3792static 3793char utf7_category[128] = { 3794/* nul soh stx etx eot enq ack bel bs ht nl vt np cr so si */ 3795 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3, 3796/* dle dc1 dc2 dc3 dc4 nak syn etb can em sub esc fs gs rs us */ 3797 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3798/* sp ! " # $ % & ' ( ) * + , - . / */ 3799 2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 0, 3800/* 0 1 2 3 4 5 6 7 8 9 : ; < = > ? 
*/ 3801 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 3802/* @ A B C D E F G H I J K L M N O */ 3803 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3804/* P Q R S T U V W X Y Z [ \ ] ^ _ */ 3805 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1, 3806/* ` a b c d e f g h i j k l m n o */ 3807 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3808/* p q r s t u v w x y z { | } ~ del */ 3809 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3, 3810}; 3811 3812/* ENCODE_DIRECT: this character should be encoded as itself. The 3813 * answer depends on whether we are encoding set O as itself, and also 3814 * on whether we are encoding whitespace as itself. RFC2152 makes it 3815 * clear that the answers to these questions vary between 3816 * applications, so this code needs to be flexible. */ 3817 3818#define ENCODE_DIRECT(c, directO, directWS) \ 3819 ((c) < 128 && (c) > 0 && \ 3820 ((utf7_category[(c)] == 0) || \ 3821 (directWS && (utf7_category[(c)] == 2)) || \ 3822 (directO && (utf7_category[(c)] == 1)))) 3823 3824PyObject * 3825PyUnicode_DecodeUTF7(const char *s, 3826 Py_ssize_t size, 3827 const char *errors) 3828{ 3829 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL); 3830} 3831 3832/* The decoder. The only state we preserve is our read position, 3833 * i.e. how many characters we have consumed. So if we end in the 3834 * middle of a shift sequence we have to back off the read position 3835 * and the output to the beginning of the sequence, otherwise we lose 3836 * all the shift state (seen bits, number of bits seen, high 3837 * surrogate). 
*/ 3838 3839PyObject * 3840PyUnicode_DecodeUTF7Stateful(const char *s, 3841 Py_ssize_t size, 3842 const char *errors, 3843 Py_ssize_t *consumed) 3844{ 3845 const char *starts = s; 3846 Py_ssize_t startinpos; 3847 Py_ssize_t endinpos; 3848 Py_ssize_t outpos; 3849 const char *e; 3850 PyObject *unicode; 3851 const char *errmsg = ""; 3852 int inShift = 0; 3853 Py_ssize_t shiftOutStart; 3854 unsigned int base64bits = 0; 3855 unsigned long base64buffer = 0; 3856 Py_UCS4 surrogate = 0; 3857 PyObject *errorHandler = NULL; 3858 PyObject *exc = NULL; 3859 3860 /* Start off assuming it's all ASCII. Widen later as necessary. */ 3861 unicode = PyUnicode_New(size, 127); 3862 if (!unicode) 3863 return NULL; 3864 if (size == 0) { 3865 if (consumed) 3866 *consumed = 0; 3867 return unicode; 3868 } 3869 3870 shiftOutStart = outpos = 0; 3871 e = s + size; 3872 3873 while (s < e) { 3874 Py_UCS4 ch; 3875 restart: 3876 ch = (unsigned char) *s; 3877 3878 if (inShift) { /* in a base-64 section */ 3879 if (IS_BASE64(ch)) { /* consume a base-64 character */ 3880 base64buffer = (base64buffer << 6) | FROM_BASE64(ch); 3881 base64bits += 6; 3882 s++; 3883 if (base64bits >= 16) { 3884 /* we have enough bits for a UTF-16 value */ 3885 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16)); 3886 base64bits -= 16; 3887 base64buffer &= (1 << base64bits) - 1; /* clear high bits */ 3888 if (surrogate) { 3889 /* expecting a second surrogate */ 3890 if (outCh >= 0xDC00 && outCh <= 0xDFFF) { 3891 Py_UCS4 ch2 = (((surrogate & 0x3FF)<<10) 3892 | (outCh & 0x3FF)) + 0x10000; 3893 if (unicode_putchar(&unicode, &outpos, ch2) < 0) 3894 goto onError; 3895 surrogate = 0; 3896 continue; 3897 } 3898 else { 3899 if (unicode_putchar(&unicode, &outpos, surrogate) < 0) 3900 goto onError; 3901 surrogate = 0; 3902 } 3903 } 3904 if (outCh >= 0xD800 && outCh <= 0xDBFF) { 3905 /* first surrogate */ 3906 surrogate = outCh; 3907 } 3908 else { 3909 if (unicode_putchar(&unicode, &outpos, outCh) < 0) 3910 goto onError; 3911 } 
3912 } 3913 } 3914 else { /* now leaving a base-64 section */ 3915 inShift = 0; 3916 s++; 3917 if (surrogate) { 3918 if (unicode_putchar(&unicode, &outpos, surrogate) < 0) 3919 goto onError; 3920 surrogate = 0; 3921 } 3922 if (base64bits > 0) { /* left-over bits */ 3923 if (base64bits >= 6) { 3924 /* We've seen at least one base-64 character */ 3925 errmsg = "partial character in shift sequence"; 3926 goto utf7Error; 3927 } 3928 else { 3929 /* Some bits remain; they should be zero */ 3930 if (base64buffer != 0) { 3931 errmsg = "non-zero padding bits in shift sequence"; 3932 goto utf7Error; 3933 } 3934 } 3935 } 3936 if (ch != '-') { 3937 /* '-' is absorbed; other terminating 3938 characters are preserved */ 3939 if (unicode_putchar(&unicode, &outpos, ch) < 0) 3940 goto onError; 3941 } 3942 } 3943 } 3944 else if ( ch == '+' ) { 3945 startinpos = s-starts; 3946 s++; /* consume '+' */ 3947 if (s < e && *s == '-') { /* '+-' encodes '+' */ 3948 s++; 3949 if (unicode_putchar(&unicode, &outpos, '+') < 0) 3950 goto onError; 3951 } 3952 else { /* begin base64-encoded section */ 3953 inShift = 1; 3954 shiftOutStart = outpos; 3955 base64bits = 0; 3956 } 3957 } 3958 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */ 3959 if (unicode_putchar(&unicode, &outpos, ch) < 0) 3960 goto onError; 3961 s++; 3962 } 3963 else { 3964 startinpos = s-starts; 3965 s++; 3966 errmsg = "unexpected special character"; 3967 goto utf7Error; 3968 } 3969 continue; 3970utf7Error: 3971 endinpos = s-starts; 3972 if (unicode_decode_call_errorhandler( 3973 errors, &errorHandler, 3974 "utf7", errmsg, 3975 &starts, &e, &startinpos, &endinpos, &exc, &s, 3976 &unicode, &outpos)) 3977 goto onError; 3978 } 3979 3980 /* end of string */ 3981 3982 if (inShift && !consumed) { /* in shift sequence, no more to follow */ 3983 /* if we're in an inconsistent state, that's an error */ 3984 if (surrogate || 3985 (base64bits >= 6) || 3986 (base64bits > 0 && base64buffer != 0)) { 3987 endinpos = size; 3988 if 
(unicode_decode_call_errorhandler( 3989 errors, &errorHandler, 3990 "utf7", "unterminated shift sequence", 3991 &starts, &e, &startinpos, &endinpos, &exc, &s, 3992 &unicode, &outpos)) 3993 goto onError; 3994 if (s < e) 3995 goto restart; 3996 } 3997 } 3998 3999 /* return state */ 4000 if (consumed) { 4001 if (inShift) { 4002 outpos = shiftOutStart; /* back off output */ 4003 *consumed = startinpos; 4004 } 4005 else { 4006 *consumed = s-starts; 4007 } 4008 } 4009 4010 if (unicode_resize(&unicode, outpos) < 0) 4011 goto onError; 4012 4013 Py_XDECREF(errorHandler); 4014 Py_XDECREF(exc); 4015#ifndef DONT_MAKE_RESULT_READY 4016 if (_PyUnicode_READY_REPLACE(&unicode)) { 4017 Py_DECREF(unicode); 4018 return NULL; 4019 } 4020#endif 4021 assert(_PyUnicode_CheckConsistency(unicode, 1)); 4022 return unicode; 4023 4024 onError: 4025 Py_XDECREF(errorHandler); 4026 Py_XDECREF(exc); 4027 Py_DECREF(unicode); 4028 return NULL; 4029} 4030 4031 4032PyObject * 4033_PyUnicode_EncodeUTF7(PyObject *str, 4034 int base64SetO, 4035 int base64WhiteSpace, 4036 const char *errors) 4037{ 4038 int kind; 4039 void *data; 4040 Py_ssize_t len; 4041 PyObject *v; 4042 Py_ssize_t allocated; 4043 int inShift = 0; 4044 Py_ssize_t i; 4045 unsigned int base64bits = 0; 4046 unsigned long base64buffer = 0; 4047 char * out; 4048 char * start; 4049 4050 if (PyUnicode_READY(str) < 0) 4051 return NULL; 4052 kind = PyUnicode_KIND(str); 4053 data = PyUnicode_DATA(str); 4054 len = PyUnicode_GET_LENGTH(str); 4055 4056 if (len == 0) 4057 return PyBytes_FromStringAndSize(NULL, 0); 4058 4059 /* It might be possible to tighten this worst case */ 4060 allocated = 8 * len; 4061 if (allocated / 8 != len) 4062 return PyErr_NoMemory(); 4063 4064 v = PyBytes_FromStringAndSize(NULL, allocated); 4065 if (v == NULL) 4066 return NULL; 4067 4068 start = out = PyBytes_AS_STRING(v); 4069 for (i = 0; i < len; ++i) { 4070 Py_UCS4 ch = PyUnicode_READ(kind, data, i); 4071 4072 if (inShift) { 4073 if (ENCODE_DIRECT(ch, !base64SetO, 
!base64WhiteSpace)) { 4074 /* shifting out */ 4075 if (base64bits) { /* output remaining bits */ 4076 *out++ = TO_BASE64(base64buffer << (6-base64bits)); 4077 base64buffer = 0; 4078 base64bits = 0; 4079 } 4080 inShift = 0; 4081 /* Characters not in the BASE64 set implicitly unshift the sequence 4082 so no '-' is required, except if the character is itself a '-' */ 4083 if (IS_BASE64(ch) || ch == '-') { 4084 *out++ = '-'; 4085 } 4086 *out++ = (char) ch; 4087 } 4088 else { 4089 goto encode_char; 4090 } 4091 } 4092 else { /* not in a shift sequence */ 4093 if (ch == '+') { 4094 *out++ = '+'; 4095 *out++ = '-'; 4096 } 4097 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) { 4098 *out++ = (char) ch; 4099 } 4100 else { 4101 *out++ = '+'; 4102 inShift = 1; 4103 goto encode_char; 4104 } 4105 } 4106 continue; 4107encode_char: 4108 if (ch >= 0x10000) { 4109 /* code first surrogate */ 4110 base64bits += 16; 4111 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10); 4112 while (base64bits >= 6) { 4113 *out++ = TO_BASE64(base64buffer >> (base64bits-6)); 4114 base64bits -= 6; 4115 } 4116 /* prepare second surrogate */ 4117 ch = 0xDC00 | ((ch-0x10000) & 0x3FF); 4118 } 4119 base64bits += 16; 4120 base64buffer = (base64buffer << 16) | ch; 4121 while (base64bits >= 6) { 4122 *out++ = TO_BASE64(base64buffer >> (base64bits-6)); 4123 base64bits -= 6; 4124 } 4125 } 4126 if (base64bits) 4127 *out++= TO_BASE64(base64buffer << (6-base64bits) ); 4128 if (inShift) 4129 *out++ = '-'; 4130 if (_PyBytes_Resize(&v, out - start) < 0) 4131 return NULL; 4132 return v; 4133} 4134PyObject * 4135PyUnicode_EncodeUTF7(const Py_UNICODE *s, 4136 Py_ssize_t size, 4137 int base64SetO, 4138 int base64WhiteSpace, 4139 const char *errors) 4140{ 4141 PyObject *result; 4142 PyObject *tmp = PyUnicode_FromUnicode(s, size); 4143 if (tmp == NULL) 4144 return NULL; 4145 result = _PyUnicode_EncodeUTF7(tmp, base64SetO, 4146 base64WhiteSpace, errors); 4147 Py_DECREF(tmp); 4148 return result; 
4149} 4150 4151#undef IS_BASE64 4152#undef FROM_BASE64 4153#undef TO_BASE64 4154#undef DECODE_DIRECT 4155#undef ENCODE_DIRECT 4156 4157/* --- UTF-8 Codec -------------------------------------------------------- */ 4158 4159static 4160char utf8_code_length[256] = { 4161 /* Map UTF-8 encoded prefix byte to sequence length. Zero means 4162 illegal prefix. See RFC 3629 for details */ 4163 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */ 4164 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4165 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4166 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4167 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4168 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4169 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4170 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */ 4171 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */ 4172 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4173 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4174 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */ 4175 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */ 4176 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */ 4177 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */ 4178 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 /* F0-F4 + F5-FF */ 4179}; 4180 4181PyObject * 4182PyUnicode_DecodeUTF8(const char *s, 4183 Py_ssize_t size, 4184 const char *errors) 4185{ 4186 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL); 4187} 4188 4189/* Mask to check or force alignment of a pointer to C 'long' boundaries */ 4190#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1) 4191 4192/* Mask to quickly check whether a C 'long' contains a 4193 non-ASCII, UTF8-encoded char. */ 4194#if (SIZEOF_LONG == 8) 4195# define ASCII_CHAR_MASK 0x8080808080808080L 4196#elif (SIZEOF_LONG == 4) 4197# define ASCII_CHAR_MASK 0x80808080L 4198#else 4199# error C 'long' size should be either 4 or 8! 
#endif

/* Scans a UTF-8 string and returns the maximum character to be expected,
   the size of the decoded unicode string and if any major errors were
   encountered.

   This function does check basic UTF-8 sanity, it does however NOT CHECK
   if the string contains surrogates, and if all continuation bytes are
   within the correct ranges, these checks are performed in
   PyUnicode_DecodeUTF8Stateful.

   If it sets has_errors to 1, it means the value of unicode_size and max_char
   will be bogus and you should not rely on useful information in them.

   Parameters:
     s, string_size  the bytes to scan.
     unicode_size    if not NULL, receives the number of characters counted.
     consumed        only tested against NULL: with a NULL pointer a sequence
                     cut off by the end of the input counts as an error,
                     otherwise the scan simply stops in front of it.
     has_errors      if not NULL, receives 1 if an error was seen, else 0.

   The return value is an upper bound chosen per sequence length (127, a
   value derived from the lead byte for 2-byte sequences, 65535, 65537),
   not necessarily a character that occurs in the input.
   */
static Py_UCS4
utf8_max_char_size_and_has_errors(const char *s, Py_ssize_t string_size,
                                  Py_ssize_t *unicode_size, Py_ssize_t* consumed,
                                  int *has_errors)
{
    Py_ssize_t n;
    Py_ssize_t char_count = 0;
    Py_UCS4 max_char = 127, new_max;
    Py_UCS4 upper_bound;
    const unsigned char *p = (const unsigned char *)s;
    const unsigned char *end = p + string_size;
    const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
    int err = 0;

    for (; p < end && !err; ++p, ++char_count) {
        /* Only check value if it's not a ASCII char... */
        if (*p < 0x80) {
            /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
               an explanation. */
            if (!((size_t) p & LONG_PTR_MASK)) {
                /* Help register allocation */
                register const unsigned char *_p = p;
                while (_p < aligned_end) {
                    unsigned long value = *(unsigned long *) _p;
                    if (value & ASCII_CHAR_MASK)
                        break;
                    /* a whole long of ASCII: count all its bytes at once */
                    _p += SIZEOF_LONG;
                    char_count += SIZEOF_LONG;
                }
                p = _p;
                /* leave without the loop increment: every byte up to end
                   has already been counted above */
                if (p == end)
                    break;
            }
        }
        if (*p >= 0x80) {
            n = utf8_code_length[*p];
            new_max = max_char;
            switch (n) {
            /* invalid start byte */
            case 0:
                err = 1;
                break;
            case 2:
                /* Code points between 0x0080 and 0x07FF inclusive.
                   Approximate the upper bound of the code point,
                   if this flips over 255 we can be sure it will be more
                   than 255 and the string will need 2 bytes per code point,
                   if it stays under or equal to 255, we can be sure 1 byte
                   is enough.
                   ((*p & 0b00011111) << 6) | 0b00111111 */
                upper_bound = ((*p & 0x1F) << 6) | 0x3F;
                if (max_char < upper_bound)
                    new_max = upper_bound;
                /* Ensure we track at least that we left ASCII space. */
                if (new_max < 128)
                    new_max = 128;
                break;
            case 3:
                /* Between 0x0800 and 0xFFFF inclusive, so values are
                   always > 255 and <= 65535 and will always need 2 bytes. */
                if (max_char < 65535)
                    new_max = 65535;
                break;
            case 4:
                /* Code point will be above 0xFFFF for sure in this case. */
                new_max = 65537;
                break;
            /* Internal error, this should be caught by the first if */
            case 1:
            default:
                assert(0 && "Impossible case in utf8_max_char_and_size");
                err = 1;
            }
            /* Instead of number of overall bytes for this code point,
               n contains the number of following bytes: */
            --n;
            /* Check if the follow up chars are all valid continuation bytes */
            if (n >= 1) {
                const unsigned char *cont;
                if ((p + n) >= end) {
                    if (consumed == 0)
                        /* incomplete data, non-incremental decoding */
                        err = 1;
                    /* unconditional: this leaves the enclosing for loop, so
                       the truncated sequence is not counted and max_char is
                       not updated for it */
                    break;
                }
                for (cont = p + 1; cont <= (p + n); ++cont) {
                    if ((*cont & 0xc0) != 0x80) {
                        err = 1;
                        break;
                    }
                }
                /* skip the continuation bytes; the loop increment then
                   steps over the last byte of the sequence */
                p += n;
            }
            else
                err = 1;
            max_char = new_max;
        }
    }

    if (unicode_size)
        *unicode_size = char_count;
    if (has_errors)
        *has_errors = err;
    return max_char;
}

/* Similar to PyUnicode_WRITE but may attempt to widen and resize the string
   in case of errors. Implicit parameters: unicode, kind, data, has_errors,
   onError. Potential resizing overallocates, so the result needs to shrink
   at the end.
4324*/ 4325#define WRITE_MAYBE_FAIL(index, value) \ 4326 do { \ 4327 if (has_errors) { \ 4328 Py_ssize_t pos = index; \ 4329 if (pos > PyUnicode_GET_LENGTH(unicode) && \ 4330 unicode_resize(&unicode, pos + pos/8) < 0) \ 4331 goto onError; \ 4332 if (unicode_putchar(&unicode, &pos, value) < 0) \ 4333 goto onError; \ 4334 } \ 4335 else \ 4336 PyUnicode_WRITE(kind, data, index, value); \ 4337 } while (0) 4338 4339PyObject * 4340PyUnicode_DecodeUTF8Stateful(const char *s, 4341 Py_ssize_t size, 4342 const char *errors, 4343 Py_ssize_t *consumed) 4344{ 4345 const char *starts = s; 4346 int n; 4347 int k; 4348 Py_ssize_t startinpos; 4349 Py_ssize_t endinpos; 4350 const char *e, *aligned_end; 4351 PyObject *unicode; 4352 const char *errmsg = ""; 4353 PyObject *errorHandler = NULL; 4354 PyObject *exc = NULL; 4355 Py_UCS4 maxchar = 0; 4356 Py_ssize_t unicode_size; 4357 Py_ssize_t i; 4358 int kind; 4359 void *data; 4360 int has_errors; 4361 4362 if (size == 0) { 4363 if (consumed) 4364 *consumed = 0; 4365 return (PyObject *)PyUnicode_New(0, 0); 4366 } 4367 maxchar = utf8_max_char_size_and_has_errors(s, size, &unicode_size, 4368 consumed, &has_errors); 4369 if (has_errors) 4370 /* maxchar and size computation might be incorrect; 4371 code below widens and resizes as necessary. */ 4372 unicode = PyUnicode_New(size, 127); 4373 else 4374 unicode = PyUnicode_New(unicode_size, maxchar); 4375 if (!unicode) 4376 return NULL; 4377 /* When the string is ASCII only, just use memcpy and return. 4378 unicode_size may be != size if there is an incomplete UTF-8 4379 sequence at the end of the ASCII block. 
*/ 4380 if (!has_errors && maxchar < 128 && size == unicode_size) { 4381 Py_MEMCPY(PyUnicode_1BYTE_DATA(unicode), s, unicode_size); 4382 return unicode; 4383 } 4384 kind = PyUnicode_KIND(unicode); 4385 data = PyUnicode_DATA(unicode); 4386 /* Unpack UTF-8 encoded data */ 4387 i = 0; 4388 e = s + size; 4389 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK); 4390 4391 while (s < e) { 4392 Py_UCS4 ch = (unsigned char)*s; 4393 4394 if (ch < 0x80) { 4395 /* Fast path for runs of ASCII characters. Given that common UTF-8 4396 input will consist of an overwhelming majority of ASCII 4397 characters, we try to optimize for this case by checking 4398 as many characters as a C 'long' can contain. 4399 First, check if we can do an aligned read, as most CPUs have 4400 a penalty for unaligned reads. 4401 */ 4402 if (!((size_t) s & LONG_PTR_MASK)) { 4403 /* Help register allocation */ 4404 register const char *_s = s; 4405 register Py_ssize_t _i = i; 4406 while (_s < aligned_end) { 4407 /* Read a whole long at a time (either 4 or 8 bytes), 4408 and do a fast unrolled copy if it only contains ASCII 4409 characters. 
*/ 4410 unsigned long value = *(unsigned long *) _s; 4411 if (value & ASCII_CHAR_MASK) 4412 break; 4413 WRITE_MAYBE_FAIL(_i+0, _s[0]); 4414 WRITE_MAYBE_FAIL(_i+1, _s[1]); 4415 WRITE_MAYBE_FAIL(_i+2, _s[2]); 4416 WRITE_MAYBE_FAIL(_i+3, _s[3]); 4417#if (SIZEOF_LONG == 8) 4418 WRITE_MAYBE_FAIL(_i+4, _s[4]); 4419 WRITE_MAYBE_FAIL(_i+5, _s[5]); 4420 WRITE_MAYBE_FAIL(_i+6, _s[6]); 4421 WRITE_MAYBE_FAIL(_i+7, _s[7]); 4422#endif 4423 _s += SIZEOF_LONG; 4424 _i += SIZEOF_LONG; 4425 } 4426 s = _s; 4427 i = _i; 4428 if (s == e) 4429 break; 4430 ch = (unsigned char)*s; 4431 } 4432 } 4433 4434 if (ch < 0x80) { 4435 WRITE_MAYBE_FAIL(i++, ch); 4436 s++; 4437 continue; 4438 } 4439 4440 n = utf8_code_length[ch]; 4441 4442 if (s + n > e) { 4443 if (consumed) 4444 break; 4445 else { 4446 errmsg = "unexpected end of data"; 4447 startinpos = s-starts; 4448 endinpos = startinpos+1; 4449 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++) 4450 endinpos++; 4451 goto utf8Error; 4452 } 4453 } 4454 4455 switch (n) { 4456 4457 case 0: 4458 errmsg = "invalid start byte"; 4459 startinpos = s-starts; 4460 endinpos = startinpos+1; 4461 goto utf8Error; 4462 4463 case 1: 4464 errmsg = "internal error"; 4465 startinpos = s-starts; 4466 endinpos = startinpos+1; 4467 goto utf8Error; 4468 4469 case 2: 4470 if ((s[1] & 0xc0) != 0x80) { 4471 errmsg = "invalid continuation byte"; 4472 startinpos = s-starts; 4473 endinpos = startinpos + 1; 4474 goto utf8Error; 4475 } 4476 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f); 4477 assert ((ch > 0x007F) && (ch <= 0x07FF)); 4478 WRITE_MAYBE_FAIL(i++, ch); 4479 break; 4480 4481 case 3: 4482 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf 4483 will result in surrogates in range d800-dfff. Surrogates are 4484 not valid UTF-8 so they are rejected. 
4485 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf 4486 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */ 4487 if ((s[1] & 0xc0) != 0x80 || 4488 (s[2] & 0xc0) != 0x80 || 4489 ((unsigned char)s[0] == 0xE0 && 4490 (unsigned char)s[1] < 0xA0) || 4491 ((unsigned char)s[0] == 0xED && 4492 (unsigned char)s[1] > 0x9F)) { 4493 errmsg = "invalid continuation byte"; 4494 startinpos = s-starts; 4495 endinpos = startinpos + 1; 4496 4497 /* if s[1] first two bits are 1 and 0, then the invalid 4498 continuation byte is s[2], so increment endinpos by 1, 4499 if not, s[1] is invalid and endinpos doesn't need to 4500 be incremented. */ 4501 if ((s[1] & 0xC0) == 0x80) 4502 endinpos++; 4503 goto utf8Error; 4504 } 4505 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f); 4506 assert ((ch > 0x07FF) && (ch <= 0xFFFF)); 4507 WRITE_MAYBE_FAIL(i++, ch); 4508 break; 4509 4510 case 4: 4511 if ((s[1] & 0xc0) != 0x80 || 4512 (s[2] & 0xc0) != 0x80 || 4513 (s[3] & 0xc0) != 0x80 || 4514 ((unsigned char)s[0] == 0xF0 && 4515 (unsigned char)s[1] < 0x90) || 4516 ((unsigned char)s[0] == 0xF4 && 4517 (unsigned char)s[1] > 0x8F)) { 4518 errmsg = "invalid continuation byte"; 4519 startinpos = s-starts; 4520 endinpos = startinpos + 1; 4521 if ((s[1] & 0xC0) == 0x80) { 4522 endinpos++; 4523 if ((s[2] & 0xC0) == 0x80) 4524 endinpos++; 4525 } 4526 goto utf8Error; 4527 } 4528 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) + 4529 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f); 4530 assert ((ch > 0xFFFF) && (ch <= 0x10ffff)); 4531 4532 WRITE_MAYBE_FAIL(i++, ch); 4533 break; 4534 } 4535 s += n; 4536 continue; 4537 4538 utf8Error: 4539 if (!has_errors) { 4540 PyObject *tmp; 4541 Py_ssize_t k; 4542 /* We encountered some error that wasn't detected in the original scan, 4543 e.g. an encoded surrogate character. The original maxchar computation may 4544 have been incorrect, so redo it now. 
*/ 4545 for (k = 0, maxchar = 0; k < i; k++) 4546 maxchar = Py_MAX(maxchar, PyUnicode_READ(kind, data, k)); 4547 tmp = PyUnicode_New(PyUnicode_GET_LENGTH(unicode), maxchar); 4548 if (tmp == NULL) 4549 goto onError; 4550 PyUnicode_CopyCharacters(tmp, 0, unicode, 0, i); 4551 Py_DECREF(unicode); 4552 unicode = tmp; 4553 has_errors = 1; 4554 } 4555 if (unicode_decode_call_errorhandler( 4556 errors, &errorHandler, 4557 "utf8", errmsg, 4558 &starts, &e, &startinpos, &endinpos, &exc, &s, 4559 &unicode, &i)) 4560 goto onError; 4561 /* Update data because unicode_decode_call_errorhandler might have 4562 re-created or resized the unicode object. */ 4563 data = PyUnicode_DATA(unicode); 4564 kind = PyUnicode_KIND(unicode); 4565 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK); 4566 } 4567 /* Ensure the unicode_size calculation above was correct: */ 4568 assert(has_errors || i == unicode_size); 4569 4570 if (consumed) 4571 *consumed = s-starts; 4572 4573 /* Adjust length and ready string when it contained errors and 4574 is of the old resizable kind. */ 4575 if (has_errors) { 4576 if (PyUnicode_Resize(&unicode, i) < 0) 4577 goto onError; 4578 } 4579 4580 Py_XDECREF(errorHandler); 4581 Py_XDECREF(exc); 4582 assert(_PyUnicode_CheckConsistency(unicode, 1)); 4583 return unicode; 4584 4585 onError: 4586 Py_XDECREF(errorHandler); 4587 Py_XDECREF(exc); 4588 Py_DECREF(unicode); 4589 return NULL; 4590} 4591 4592#undef WRITE_MAYBE_FAIL 4593 4594#ifdef __APPLE__ 4595 4596/* Simplified UTF-8 decoder using surrogateescape error handler, 4597 used to decode the command line arguments on Mac OS X. 
*/ 4598 4599wchar_t* 4600_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size) 4601{ 4602 int n; 4603 const char *e; 4604 wchar_t *unicode, *p; 4605 4606 /* Note: size will always be longer than the resulting Unicode 4607 character count */ 4608 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) { 4609 PyErr_NoMemory(); 4610 return NULL; 4611 } 4612 unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t)); 4613 if (!unicode) 4614 return NULL; 4615 4616 /* Unpack UTF-8 encoded data */ 4617 p = unicode; 4618 e = s + size; 4619 while (s < e) { 4620 Py_UCS4 ch = (unsigned char)*s; 4621 4622 if (ch < 0x80) { 4623 *p++ = (wchar_t)ch; 4624 s++; 4625 continue; 4626 } 4627 4628 n = utf8_code_length[ch]; 4629 if (s + n > e) { 4630 goto surrogateescape; 4631 } 4632 4633 switch (n) { 4634 case 0: 4635 case 1: 4636 goto surrogateescape; 4637 4638 case 2: 4639 if ((s[1] & 0xc0) != 0x80) 4640 goto surrogateescape; 4641 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f); 4642 assert ((ch > 0x007F) && (ch <= 0x07FF)); 4643 *p++ = (wchar_t)ch; 4644 break; 4645 4646 case 3: 4647 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf 4648 will result in surrogates in range d800-dfff. Surrogates are 4649 not valid UTF-8 so they are rejected. 
4650 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf 4651 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */ 4652 if ((s[1] & 0xc0) != 0x80 || 4653 (s[2] & 0xc0) != 0x80 || 4654 ((unsigned char)s[0] == 0xE0 && 4655 (unsigned char)s[1] < 0xA0) || 4656 ((unsigned char)s[0] == 0xED && 4657 (unsigned char)s[1] > 0x9F)) { 4658 4659 goto surrogateescape; 4660 } 4661 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f); 4662 assert ((ch > 0x07FF) && (ch <= 0xFFFF)); 4663 *p++ = (wchar_t)ch; 4664 break; 4665 4666 case 4: 4667 if ((s[1] & 0xc0) != 0x80 || 4668 (s[2] & 0xc0) != 0x80 || 4669 (s[3] & 0xc0) != 0x80 || 4670 ((unsigned char)s[0] == 0xF0 && 4671 (unsigned char)s[1] < 0x90) || 4672 ((unsigned char)s[0] == 0xF4 && 4673 (unsigned char)s[1] > 0x8F)) { 4674 goto surrogateescape; 4675 } 4676 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) + 4677 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f); 4678 assert ((ch > 0xFFFF) && (ch <= 0x10ffff)); 4679 4680#if SIZEOF_WCHAR_T == 4 4681 *p++ = (wchar_t)ch; 4682#else 4683 /* compute and append the two surrogates: */ 4684 4685 /* translate from 10000..10FFFF to 0..FFFF */ 4686 ch -= 0x10000; 4687 4688 /* high surrogate = top 10 bits added to D800 */ 4689 *p++ = (wchar_t)(0xD800 + (ch >> 10)); 4690 4691 /* low surrogate = bottom 10 bits added to DC00 */ 4692 *p++ = (wchar_t)(0xDC00 + (ch & 0x03FF)); 4693#endif 4694 break; 4695 } 4696 s += n; 4697 continue; 4698 4699 surrogateescape: 4700 *p++ = 0xDC00 + ch; 4701 s++; 4702 } 4703 *p = L'\0'; 4704 return unicode; 4705} 4706 4707#endif /* __APPLE__ */ 4708 4709/* Primary internal function which creates utf8 encoded bytes objects. 4710 4711 Allocation strategy: if the string is short, convert into a stack buffer 4712 and allocate exactly as much space needed at the end. Else allocate the 4713 maximum possible needed (4 result bytes per Unicode character), and return 4714 the excess memory at the end. 
4715*/ 4716PyObject * 4717_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors) 4718{ 4719#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */ 4720 4721 Py_ssize_t i; /* index into s of next input byte */ 4722 PyObject *result; /* result string object */ 4723 char *p; /* next free byte in output buffer */ 4724 Py_ssize_t nallocated; /* number of result bytes allocated */ 4725 Py_ssize_t nneeded; /* number of result bytes needed */ 4726 char stackbuf[MAX_SHORT_UNICHARS * 4]; 4727 PyObject *errorHandler = NULL; 4728 PyObject *exc = NULL; 4729 int kind; 4730 void *data; 4731 Py_ssize_t size; 4732 PyObject *rep = NULL; 4733 4734 if (!PyUnicode_Check(unicode)) { 4735 PyErr_BadArgument(); 4736 return NULL; 4737 } 4738 4739 if (PyUnicode_READY(unicode) == -1) 4740 return NULL; 4741 4742 if (PyUnicode_UTF8(unicode)) 4743 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode), 4744 PyUnicode_UTF8_LENGTH(unicode)); 4745 4746 kind = PyUnicode_KIND(unicode); 4747 data = PyUnicode_DATA(unicode); 4748 size = PyUnicode_GET_LENGTH(unicode); 4749 4750 assert(size >= 0); 4751 4752 if (size <= MAX_SHORT_UNICHARS) { 4753 /* Write into the stack buffer; nallocated can't overflow. 4754 * At the end, we'll allocate exactly as much heap space as it 4755 * turns out we need. 4756 */ 4757 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int); 4758 result = NULL; /* will allocate after we're done */ 4759 p = stackbuf; 4760 } 4761 else { 4762 /* Overallocate on the heap, and give the excess back at the end. */ 4763 nallocated = size * 4; 4764 if (nallocated / 4 != size) /* overflow! 
*/ 4765 return PyErr_NoMemory(); 4766 result = PyBytes_FromStringAndSize(NULL, nallocated); 4767 if (result == NULL) 4768 return NULL; 4769 p = PyBytes_AS_STRING(result); 4770 } 4771 4772 for (i = 0; i < size;) { 4773 Py_UCS4 ch = PyUnicode_READ(kind, data, i++); 4774 4775 if (ch < 0x80) 4776 /* Encode ASCII */ 4777 *p++ = (char) ch; 4778 4779 else if (ch < 0x0800) { 4780 /* Encode Latin-1 */ 4781 *p++ = (char)(0xc0 | (ch >> 6)); 4782 *p++ = (char)(0x80 | (ch & 0x3f)); 4783 } else if (0xD800 <= ch && ch <= 0xDFFF) { 4784 Py_ssize_t newpos; 4785 Py_ssize_t repsize, k, startpos; 4786 startpos = i-1; 4787 rep = unicode_encode_call_errorhandler( 4788 errors, &errorHandler, "utf-8", "surrogates not allowed", 4789 unicode, &exc, startpos, startpos+1, &newpos); 4790 if (!rep) 4791 goto error; 4792 4793 if (PyBytes_Check(rep)) 4794 repsize = PyBytes_GET_SIZE(rep); 4795 else 4796 repsize = PyUnicode_GET_SIZE(rep); 4797 4798 if (repsize > 4) { 4799 Py_ssize_t offset; 4800 4801 if (result == NULL) 4802 offset = p - stackbuf; 4803 else 4804 offset = p - PyBytes_AS_STRING(result); 4805 4806 if (nallocated > PY_SSIZE_T_MAX - repsize + 4) { 4807 /* integer overflow */ 4808 PyErr_NoMemory(); 4809 goto error; 4810 } 4811 nallocated += repsize - 4; 4812 if (result != NULL) { 4813 if (_PyBytes_Resize(&result, nallocated) < 0) 4814 goto error; 4815 } else { 4816 result = PyBytes_FromStringAndSize(NULL, nallocated); 4817 if (result == NULL) 4818 goto error; 4819 Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset); 4820 } 4821 p = PyBytes_AS_STRING(result) + offset; 4822 } 4823 4824 if (PyBytes_Check(rep)) { 4825 char *prep = PyBytes_AS_STRING(rep); 4826 for(k = repsize; k > 0; k--) 4827 *p++ = *prep++; 4828 } else /* rep is unicode */ { 4829 enum PyUnicode_Kind repkind; 4830 void *repdata; 4831 4832 if (PyUnicode_READY(rep) < 0) 4833 goto error; 4834 repkind = PyUnicode_KIND(rep); 4835 repdata = PyUnicode_DATA(rep); 4836 4837 for(k=0; k<repsize; k++) { 4838 Py_UCS4 c = 
PyUnicode_READ(repkind, repdata, k); 4839 if (0x80 <= c) { 4840 raise_encode_exception(&exc, "utf-8", 4841 unicode, 4842 i-1, i, 4843 "surrogates not allowed"); 4844 goto error; 4845 } 4846 *p++ = (char)c; 4847 } 4848 } 4849 Py_CLEAR(rep); 4850 } else if (ch < 0x10000) { 4851 *p++ = (char)(0xe0 | (ch >> 12)); 4852 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); 4853 *p++ = (char)(0x80 | (ch & 0x3f)); 4854 } else /* ch >= 0x10000 */ { 4855 /* Encode UCS4 Unicode ordinals */ 4856 *p++ = (char)(0xf0 | (ch >> 18)); 4857 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f)); 4858 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); 4859 *p++ = (char)(0x80 | (ch & 0x3f)); 4860 } 4861 } 4862 4863 if (result == NULL) { 4864 /* This was stack allocated. */ 4865 nneeded = p - stackbuf; 4866 assert(nneeded <= nallocated); 4867 result = PyBytes_FromStringAndSize(stackbuf, nneeded); 4868 } 4869 else { 4870 /* Cut back to size actually needed. */ 4871 nneeded = p - PyBytes_AS_STRING(result); 4872 assert(nneeded <= nallocated); 4873 _PyBytes_Resize(&result, nneeded); 4874 } 4875 4876 Py_XDECREF(errorHandler); 4877 Py_XDECREF(exc); 4878 return result; 4879 error: 4880 Py_XDECREF(rep); 4881 Py_XDECREF(errorHandler); 4882 Py_XDECREF(exc); 4883 Py_XDECREF(result); 4884 return NULL; 4885 4886#undef MAX_SHORT_UNICHARS 4887} 4888 4889PyObject * 4890PyUnicode_EncodeUTF8(const Py_UNICODE *s, 4891 Py_ssize_t size, 4892 const char *errors) 4893{ 4894 PyObject *v, *unicode; 4895 4896 unicode = PyUnicode_FromUnicode(s, size); 4897 if (unicode == NULL) 4898 return NULL; 4899 v = _PyUnicode_AsUTF8String(unicode, errors); 4900 Py_DECREF(unicode); 4901 return v; 4902} 4903 4904PyObject * 4905PyUnicode_AsUTF8String(PyObject *unicode) 4906{ 4907 return _PyUnicode_AsUTF8String(unicode, NULL); 4908} 4909 4910/* --- UTF-32 Codec ------------------------------------------------------- */ 4911 4912PyObject * 4913PyUnicode_DecodeUTF32(const char *s, 4914 Py_ssize_t size, 4915 const char *errors, 4916 int *byteorder) 4917{ 4918 
return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL); 4919} 4920 4921PyObject * 4922PyUnicode_DecodeUTF32Stateful(const char *s, 4923 Py_ssize_t size, 4924 const char *errors, 4925 int *byteorder, 4926 Py_ssize_t *consumed) 4927{ 4928 const char *starts = s; 4929 Py_ssize_t startinpos; 4930 Py_ssize_t endinpos; 4931 Py_ssize_t outpos; 4932 PyObject *unicode; 4933 const unsigned char *q, *e; 4934 int bo = 0; /* assume native ordering by default */ 4935 const char *errmsg = ""; 4936 /* Offsets from q for retrieving bytes in the right order. */ 4937#ifdef BYTEORDER_IS_LITTLE_ENDIAN 4938 int iorder[] = {0, 1, 2, 3}; 4939#else 4940 int iorder[] = {3, 2, 1, 0}; 4941#endif 4942 PyObject *errorHandler = NULL; 4943 PyObject *exc = NULL; 4944 4945 q = (unsigned char *)s; 4946 e = q + size; 4947 4948 if (byteorder) 4949 bo = *byteorder; 4950 4951 /* Check for BOM marks (U+FEFF) in the input and adjust current 4952 byte order setting accordingly. In native mode, the leading BOM 4953 mark is skipped, in all other modes, it is copied to the output 4954 stream as-is (giving a ZWNBSP character). 
*/ 4955 if (bo == 0) { 4956 if (size >= 4) { 4957 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) | 4958 (q[iorder[1]] << 8) | q[iorder[0]]; 4959#ifdef BYTEORDER_IS_LITTLE_ENDIAN 4960 if (bom == 0x0000FEFF) { 4961 q += 4; 4962 bo = -1; 4963 } 4964 else if (bom == 0xFFFE0000) { 4965 q += 4; 4966 bo = 1; 4967 } 4968#else 4969 if (bom == 0x0000FEFF) { 4970 q += 4; 4971 bo = 1; 4972 } 4973 else if (bom == 0xFFFE0000) { 4974 q += 4; 4975 bo = -1; 4976 } 4977#endif 4978 } 4979 } 4980 4981 if (bo == -1) { 4982 /* force LE */ 4983 iorder[0] = 0; 4984 iorder[1] = 1; 4985 iorder[2] = 2; 4986 iorder[3] = 3; 4987 } 4988 else if (bo == 1) { 4989 /* force BE */ 4990 iorder[0] = 3; 4991 iorder[1] = 2; 4992 iorder[2] = 1; 4993 iorder[3] = 0; 4994 } 4995 4996 /* This might be one to much, because of a BOM */ 4997 unicode = PyUnicode_New((size+3)/4, 127); 4998 if (!unicode) 4999 return NULL; 5000 if (size == 0) 5001 return unicode; 5002 outpos = 0; 5003 5004 while (q < e) { 5005 Py_UCS4 ch; 5006 /* remaining bytes at the end? 
(size should be divisible by 4) */ 5007 if (e-q<4) { 5008 if (consumed) 5009 break; 5010 errmsg = "truncated data"; 5011 startinpos = ((const char *)q)-starts; 5012 endinpos = ((const char *)e)-starts; 5013 goto utf32Error; 5014 /* The remaining input chars are ignored if the callback 5015 chooses to skip the input */ 5016 } 5017 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) | 5018 (q[iorder[1]] << 8) | q[iorder[0]]; 5019 5020 if (ch >= 0x110000) 5021 { 5022 errmsg = "codepoint not in range(0x110000)"; 5023 startinpos = ((const char *)q)-starts; 5024 endinpos = startinpos+4; 5025 goto utf32Error; 5026 } 5027 if (unicode_putchar(&unicode, &outpos, ch) < 0) 5028 goto onError; 5029 q += 4; 5030 continue; 5031 utf32Error: 5032 if (unicode_decode_call_errorhandler( 5033 errors, &errorHandler, 5034 "utf32", errmsg, 5035 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q, 5036 &unicode, &outpos)) 5037 goto onError; 5038 } 5039 5040 if (byteorder) 5041 *byteorder = bo; 5042 5043 if (consumed) 5044 *consumed = (const char *)q-starts; 5045 5046 /* Adjust length */ 5047 if (PyUnicode_Resize(&unicode, outpos) < 0) 5048 goto onError; 5049 5050 Py_XDECREF(errorHandler); 5051 Py_XDECREF(exc); 5052#ifndef DONT_MAKE_RESULT_READY 5053 if (_PyUnicode_READY_REPLACE(&unicode)) { 5054 Py_DECREF(unicode); 5055 return NULL; 5056 } 5057#endif 5058 assert(_PyUnicode_CheckConsistency(unicode, 1)); 5059 return unicode; 5060 5061 onError: 5062 Py_DECREF(unicode); 5063 Py_XDECREF(errorHandler); 5064 Py_XDECREF(exc); 5065 return NULL; 5066} 5067 5068PyObject * 5069_PyUnicode_EncodeUTF32(PyObject *str, 5070 const char *errors, 5071 int byteorder) 5072{ 5073 int kind; 5074 void *data; 5075 Py_ssize_t len; 5076 PyObject *v; 5077 unsigned char *p; 5078 Py_ssize_t nsize, bytesize, i; 5079 /* Offsets from p for storing byte pairs in the right order. 
*/ 5080#ifdef BYTEORDER_IS_LITTLE_ENDIAN 5081 int iorder[] = {0, 1, 2, 3}; 5082#else 5083 int iorder[] = {3, 2, 1, 0}; 5084#endif 5085 5086#define STORECHAR(CH) \ 5087 do { \ 5088 p[iorder[3]] = ((CH) >> 24) & 0xff; \ 5089 p[iorder[2]] = ((CH) >> 16) & 0xff; \ 5090 p[iorder[1]] = ((CH) >> 8) & 0xff; \ 5091 p[iorder[0]] = (CH) & 0xff; \ 5092 p += 4; \ 5093 } while(0) 5094 5095 if (!PyUnicode_Check(str)) { 5096 PyErr_BadArgument(); 5097 return NULL; 5098 } 5099 if (PyUnicode_READY(str) < 0) 5100 return NULL; 5101 kind = PyUnicode_KIND(str); 5102 data = PyUnicode_DATA(str); 5103 len = PyUnicode_GET_LENGTH(str); 5104 5105 nsize = len + (byteorder == 0); 5106 bytesize = nsize * 4; 5107 if (bytesize / 4 != nsize) 5108 return PyErr_NoMemory(); 5109 v = PyBytes_FromStringAndSize(NULL, bytesize); 5110 if (v == NULL) 5111 return NULL; 5112 5113 p = (unsigned char *)PyBytes_AS_STRING(v); 5114 if (byteorder == 0) 5115 STORECHAR(0xFEFF); 5116 if (len == 0) 5117 goto done; 5118 5119 if (byteorder == -1) { 5120 /* force LE */ 5121 iorder[0] = 0; 5122 iorder[1] = 1; 5123 iorder[2] = 2; 5124 iorder[3] = 3; 5125 } 5126 else if (byteorder == 1) { 5127 /* force BE */ 5128 iorder[0] = 3; 5129 iorder[1] = 2; 5130 iorder[2] = 1; 5131 iorder[3] = 0; 5132 } 5133 5134 for (i = 0; i < len; i++) 5135 STORECHAR(PyUnicode_READ(kind, data, i)); 5136 5137 done: 5138 return v; 5139#undef STORECHAR 5140} 5141 5142PyObject * 5143PyUnicode_EncodeUTF32(const Py_UNICODE *s, 5144 Py_ssize_t size, 5145 const char *errors, 5146 int byteorder) 5147{ 5148 PyObject *result; 5149 PyObject *tmp = PyUnicode_FromUnicode(s, size); 5150 if (tmp == NULL) 5151 return NULL; 5152 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder); 5153 Py_DECREF(tmp); 5154 return result; 5155} 5156 5157PyObject * 5158PyUnicode_AsUTF32String(PyObject *unicode) 5159{ 5160 const Py_UNICODE *wstr; 5161 Py_ssize_t wlen; 5162 wstr = PyUnicode_AsUnicodeAndSize(unicode, &wlen); 5163 if (wstr == NULL) 5164 return NULL; 5165 return 
PyUnicode_EncodeUTF32(wstr, wlen, NULL, 0); 5166} 5167 5168/* --- UTF-16 Codec ------------------------------------------------------- */ 5169 5170PyObject * 5171PyUnicode_DecodeUTF16(const char *s, 5172 Py_ssize_t size, 5173 const char *errors, 5174 int *byteorder) 5175{ 5176 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL); 5177} 5178 5179/* Two masks for fast checking of whether a C 'long' may contain 5180 UTF16-encoded surrogate characters. This is an efficient heuristic, 5181 assuming that non-surrogate characters with a code point >= 0x8000 are 5182 rare in most input. 5183 FAST_CHAR_MASK is used when the input is in native byte ordering, 5184 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering. 5185*/ 5186#if (SIZEOF_LONG == 8) 5187# define FAST_CHAR_MASK 0x8000800080008000L 5188# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L 5189#elif (SIZEOF_LONG == 4) 5190# define FAST_CHAR_MASK 0x80008000L 5191# define SWAPPED_FAST_CHAR_MASK 0x00800080L 5192#else 5193# error C 'long' size should be either 4 or 8! 5194#endif 5195 5196PyObject * 5197PyUnicode_DecodeUTF16Stateful(const char *s, 5198 Py_ssize_t size, 5199 const char *errors, 5200 int *byteorder, 5201 Py_ssize_t *consumed) 5202{ 5203 const char *starts = s; 5204 Py_ssize_t startinpos; 5205 Py_ssize_t endinpos; 5206 Py_ssize_t outpos; 5207 PyObject *unicode; 5208 const unsigned char *q, *e, *aligned_end; 5209 int bo = 0; /* assume native ordering by default */ 5210 int native_ordering = 0; 5211 const char *errmsg = ""; 5212 /* Offsets from q for retrieving byte pairs in the right order. 
*/ 5213#ifdef BYTEORDER_IS_LITTLE_ENDIAN 5214 int ihi = 1, ilo = 0; 5215#else 5216 int ihi = 0, ilo = 1; 5217#endif 5218 PyObject *errorHandler = NULL; 5219 PyObject *exc = NULL; 5220 5221 /* Note: size will always be longer than the resulting Unicode 5222 character count */ 5223 unicode = PyUnicode_New(size, 127); 5224 if (!unicode) 5225 return NULL; 5226 if (size == 0) 5227 return unicode; 5228 outpos = 0; 5229 5230 q = (unsigned char *)s; 5231 e = q + size - 1; 5232 5233 if (byteorder) 5234 bo = *byteorder; 5235 5236 /* Check for BOM marks (U+FEFF) in the input and adjust current 5237 byte order setting accordingly. In native mode, the leading BOM 5238 mark is skipped, in all other modes, it is copied to the output 5239 stream as-is (giving a ZWNBSP character). */ 5240 if (bo == 0) { 5241 if (size >= 2) { 5242 const Py_UCS4 bom = (q[ihi] << 8) | q[ilo]; 5243#ifdef BYTEORDER_IS_LITTLE_ENDIAN 5244 if (bom == 0xFEFF) { 5245 q += 2; 5246 bo = -1; 5247 } 5248 else if (bom == 0xFFFE) { 5249 q += 2; 5250 bo = 1; 5251 } 5252#else 5253 if (bom == 0xFEFF) { 5254 q += 2; 5255 bo = 1; 5256 } 5257 else if (bom == 0xFFFE) { 5258 q += 2; 5259 bo = -1; 5260 } 5261#endif 5262 } 5263 } 5264 5265 if (bo == -1) { 5266 /* force LE */ 5267 ihi = 1; 5268 ilo = 0; 5269 } 5270 else if (bo == 1) { 5271 /* force BE */ 5272 ihi = 0; 5273 ilo = 1; 5274 } 5275#ifdef BYTEORDER_IS_LITTLE_ENDIAN 5276 native_ordering = ilo < ihi; 5277#else 5278 native_ordering = ilo > ihi; 5279#endif 5280 5281 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK); 5282 while (q < e) { 5283 Py_UCS4 ch; 5284 /* First check for possible aligned read of a C 'long'. Unaligned 5285 reads are more expensive, better to defer to another iteration. */ 5286 if (!((size_t) q & LONG_PTR_MASK)) { 5287 /* Fast path for runs of non-surrogate chars. 
*/ 5288 register const unsigned char *_q = q; 5289 int kind = PyUnicode_KIND(unicode); 5290 void *data = PyUnicode_DATA(unicode); 5291 while (_q < aligned_end) { 5292 unsigned long block = * (unsigned long *) _q; 5293 unsigned short *pblock = (unsigned short*)█ 5294 Py_UCS4 maxch; 5295 if (native_ordering) { 5296 /* Can use buffer directly */ 5297 if (block & FAST_CHAR_MASK) 5298 break; 5299 } 5300 else { 5301 /* Need to byte-swap */ 5302 unsigned char *_p = (unsigned char*)pblock; 5303 if (block & SWAPPED_FAST_CHAR_MASK) 5304 break; 5305 _p[0] = _q[1]; 5306 _p[1] = _q[0]; 5307 _p[2] = _q[3]; 5308 _p[3] = _q[2]; 5309#if (SIZEOF_LONG == 8) 5310 _p[4] = _q[5]; 5311 _p[5] = _q[4]; 5312 _p[6] = _q[7]; 5313 _p[7] = _q[6]; 5314#endif 5315 } 5316 maxch = Py_MAX(pblock[0], pblock[1]); 5317#if SIZEOF_LONG == 8 5318 maxch = Py_MAX(maxch, Py_MAX(pblock[2], pblock[3])); 5319#endif 5320 if (maxch > PyUnicode_MAX_CHAR_VALUE(unicode)) { 5321 if (unicode_widen(&unicode, maxch) < 0) 5322 goto onError; 5323 kind = PyUnicode_KIND(unicode); 5324 data = PyUnicode_DATA(unicode); 5325 } 5326 PyUnicode_WRITE(kind, data, outpos++, pblock[0]); 5327 PyUnicode_WRITE(kind, data, outpos++, pblock[1]); 5328#if SIZEOF_LONG == 8 5329 PyUnicode_WRITE(kind, data, outpos++, pblock[2]); 5330 PyUnicode_WRITE(kind, data, outpos++, pblock[3]); 5331#endif 5332 _q += SIZEOF_LONG; 5333 } 5334 q = _q; 5335 if (q >= e) 5336 break; 5337 } 5338 ch = (q[ihi] << 8) | q[ilo]; 5339 5340 q += 2; 5341 5342 if (ch < 0xD800 || ch > 0xDFFF) { 5343 if (unicode_putchar(&unicode, &outpos, ch) < 0) 5344 goto onError; 5345 continue; 5346 } 5347 5348 /* UTF-16 code pair: */ 5349 if (q > e) { 5350 errmsg = "unexpected end of data"; 5351 startinpos = (((const char *)q) - 2) - starts; 5352 endinpos = ((const char *)e) + 1 - starts; 5353 goto utf16Error; 5354 } 5355 if (0xD800 <= ch && ch <= 0xDBFF) { 5356 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo]; 5357 q += 2; 5358 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) { 5359 if 
(unicode_putchar(&unicode, &outpos, 5360 (((ch & 0x3FF)<<10) | 5361 (ch2 & 0x3FF)) + 0x10000) < 0) 5362 goto onError; 5363 continue; 5364 } 5365 else { 5366 errmsg = "illegal UTF-16 surrogate"; 5367 startinpos = (((const char *)q)-4)-starts; 5368 endinpos = startinpos+2; 5369 goto utf16Error; 5370 } 5371 5372 } 5373 errmsg = "illegal encoding"; 5374 startinpos = (((const char *)q)-2)-starts; 5375 endinpos = startinpos+2; 5376 /* Fall through to report the error */ 5377 5378 utf16Error: 5379 if (unicode_decode_call_errorhandler( 5380 errors, 5381 &errorHandler, 5382 "utf16", errmsg, 5383 &starts, 5384 (const char **)&e, 5385 &startinpos, 5386 &endinpos, 5387 &exc, 5388 (const char **)&q, 5389 &unicode, 5390 &outpos)) 5391 goto onError; 5392 } 5393 /* remaining byte at the end? (size should be even) */ 5394 if (e == q) { 5395 if (!consumed) { 5396 errmsg = "truncated data"; 5397 startinpos = ((const char *)q) - starts; 5398 endinpos = ((const char *)e) + 1 - starts; 5399 if (unicode_decode_call_errorhandler( 5400 errors, 5401 &errorHandler, 5402 "utf16", errmsg, 5403 &starts, 5404 (const char **)&e, 5405 &startinpos, 5406 &endinpos, 5407 &exc, 5408 (const char **)&q, 5409 &unicode, 5410 &outpos)) 5411 goto onError; 5412 /* The remaining input chars are ignored if the callback 5413 chooses to skip the input */ 5414 } 5415 } 5416 5417 if (byteorder) 5418 *byteorder = bo; 5419 5420 if (consumed) 5421 *consumed = (const char *)q-starts; 5422 5423 /* Adjust length */ 5424 if (PyUnicode_Resize(&unicode, outpos) < 0) 5425 goto onError; 5426 5427 Py_XDECREF(errorHandler); 5428 Py_XDECREF(exc); 5429 assert(_PyUnicode_CheckConsistency(unicode, 1)); 5430 return unicode; 5431 5432 onError: 5433 Py_DECREF(unicode); 5434 Py_XDECREF(errorHandler); 5435 Py_XDECREF(exc); 5436 return NULL; 5437} 5438 5439#undef FAST_CHAR_MASK 5440#undef SWAPPED_FAST_CHAR_MASK 5441 5442PyObject * 5443_PyUnicode_EncodeUTF16(PyObject *str, 5444 const char *errors, 5445 int byteorder) 5446{ 5447 int kind; 
5448 void *data; 5449 Py_ssize_t len; 5450 PyObject *v; 5451 unsigned char *p; 5452 Py_ssize_t nsize, bytesize; 5453 Py_ssize_t i, pairs; 5454 /* Offsets from p for storing byte pairs in the right order. */ 5455#ifdef BYTEORDER_IS_LITTLE_ENDIAN 5456 int ihi = 1, ilo = 0; 5457#else 5458 int ihi = 0, ilo = 1; 5459#endif 5460 5461#define STORECHAR(CH) \ 5462 do { \ 5463 p[ihi] = ((CH) >> 8) & 0xff; \ 5464 p[ilo] = (CH) & 0xff; \ 5465 p += 2; \ 5466 } while(0) 5467 5468 if (!PyUnicode_Check(str)) { 5469 PyErr_BadArgument(); 5470 return NULL; 5471 } 5472 if (PyUnicode_READY(str) < 0) 5473 return NULL; 5474 kind = PyUnicode_KIND(str); 5475 data = PyUnicode_DATA(str); 5476 len = PyUnicode_GET_LENGTH(str); 5477 5478 pairs = 0; 5479 if (kind == PyUnicode_4BYTE_KIND) 5480 for (i = 0; i < len; i++) 5481 if (PyUnicode_READ(kind, data, i) >= 0x10000) 5482 pairs++; 5483 /* 2 * (len + pairs + (byteorder == 0)) */ 5484 if (len > PY_SSIZE_T_MAX - pairs - (byteorder == 0)) 5485 return PyErr_NoMemory(); 5486 nsize = len + pairs + (byteorder == 0); 5487 bytesize = nsize * 2; 5488 if (bytesize / 2 != nsize) 5489 return PyErr_NoMemory(); 5490 v = PyBytes_FromStringAndSize(NULL, bytesize); 5491 if (v == NULL) 5492 return NULL; 5493 5494 p = (unsigned char *)PyBytes_AS_STRING(v); 5495 if (byteorder == 0) 5496 STORECHAR(0xFEFF); 5497 if (len == 0) 5498 goto done; 5499 5500 if (byteorder == -1) { 5501 /* force LE */ 5502 ihi = 1; 5503 ilo = 0; 5504 } 5505 else if (byteorder == 1) { 5506 /* force BE */ 5507 ihi = 0; 5508 ilo = 1; 5509 } 5510 5511 for (i = 0; i < len; i++) { 5512 Py_UCS4 ch = PyUnicode_READ(kind, data, i); 5513 Py_UCS4 ch2 = 0; 5514 if (ch >= 0x10000) { 5515 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF); 5516 ch = 0xD800 | ((ch-0x10000) >> 10); 5517 } 5518 STORECHAR(ch); 5519 if (ch2) 5520 STORECHAR(ch2); 5521 } 5522 5523 done: 5524 return v; 5525#undef STORECHAR 5526} 5527 5528PyObject * 5529PyUnicode_EncodeUTF16(const Py_UNICODE *s, 5530 Py_ssize_t size, 5531 const char *errors, 
5532 int byteorder) 5533{ 5534 PyObject *result; 5535 PyObject *tmp = PyUnicode_FromUnicode(s, size); 5536 if (tmp == NULL) 5537 return NULL; 5538 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder); 5539 Py_DECREF(tmp); 5540 return result; 5541} 5542 5543PyObject * 5544PyUnicode_AsUTF16String(PyObject *unicode) 5545{ 5546 return _PyUnicode_EncodeUTF16(unicode, NULL, 0); 5547} 5548 5549/* --- Unicode Escape Codec ----------------------------------------------- */ 5550 5551/* Helper function for PyUnicode_DecodeUnicodeEscape, determines 5552 if all the escapes in the string make it still a valid ASCII string. 5553 Returns -1 if any escapes were found which cause the string to 5554 pop out of ASCII range. Otherwise returns the length of the 5555 required buffer to hold the string. 5556 */ 5557static Py_ssize_t 5558length_of_escaped_ascii_string(const char *s, Py_ssize_t size) 5559{ 5560 const unsigned char *p = (const unsigned char *)s; 5561 const unsigned char *end = p + size; 5562 Py_ssize_t length = 0; 5563 5564 if (size < 0) 5565 return -1; 5566 5567 for (; p < end; ++p) { 5568 if (*p > 127) { 5569 /* Non-ASCII */ 5570 return -1; 5571 } 5572 else if (*p != '\\') { 5573 /* Normal character */ 5574 ++length; 5575 } 5576 else { 5577 /* Backslash-escape, check next char */ 5578 ++p; 5579 /* Escape sequence reaches till end of string or 5580 non-ASCII follow-up. 
*/ 5581 if (p >= end || *p > 127) 5582 return -1; 5583 switch (*p) { 5584 case '\n': 5585 /* backslash + \n result in zero characters */ 5586 break; 5587 case '\\': case '\'': case '\"': 5588 case 'b': case 'f': case 't': 5589 case 'n': case 'r': case 'v': case 'a': 5590 ++length; 5591 break; 5592 case '0': case '1': case '2': case '3': 5593 case '4': case '5': case '6': case '7': 5594 case 'x': case 'u': case 'U': case 'N': 5595 /* these do not guarantee ASCII characters */ 5596 return -1; 5597 default: 5598 /* count the backslash + the other character */ 5599 length += 2; 5600 } 5601 } 5602 } 5603 return length; 5604} 5605 5606/* Similar to PyUnicode_WRITE but either write into wstr field 5607 or treat string as ASCII. */ 5608#define WRITE_ASCII_OR_WSTR(kind, buf, index, value) \ 5609 do { \ 5610 if ((kind) != PyUnicode_WCHAR_KIND) \ 5611 ((unsigned char *)(buf))[(index)] = (unsigned char)(value); \ 5612 else \ 5613 ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value); \ 5614 } while (0) 5615 5616#define WRITE_WSTR(buf, index, value) \ 5617 assert(kind == PyUnicode_WCHAR_KIND), \ 5618 ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value) 5619 5620 5621static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL; 5622 5623PyObject * 5624PyUnicode_DecodeUnicodeEscape(const char *s, 5625 Py_ssize_t size, 5626 const char *errors) 5627{ 5628 const char *starts = s; 5629 Py_ssize_t startinpos; 5630 Py_ssize_t endinpos; 5631 int j; 5632 PyObject *v; 5633 const char *end; 5634 char* message; 5635 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */ 5636 PyObject *errorHandler = NULL; 5637 PyObject *exc = NULL; 5638 Py_ssize_t len; 5639 Py_ssize_t i; 5640 5641 len = length_of_escaped_ascii_string(s, size); 5642 5643 /* After length_of_escaped_ascii_string() there are two alternatives, 5644 either the string is pure ASCII with named escapes like \n, etc. 5645 and we determined it's exact size (common case) 5646 or it contains \x, \u, ... escape sequences. 
then we create a 5647 legacy wchar string and resize it at the end of this function. */ 5648 if (len >= 0) { 5649 v = PyUnicode_New(len, 127); 5650 if (!v) 5651 goto onError; 5652 assert(PyUnicode_KIND(v) == PyUnicode_1BYTE_KIND); 5653 } 5654 else { 5655 /* Escaped strings will always be longer than the resulting 5656 Unicode string, so we start with size here and then reduce the 5657 length after conversion to the true value. 5658 (but if the error callback returns a long replacement string 5659 we'll have to allocate more space) */ 5660 v = PyUnicode_New(size, 127); 5661 if (!v) 5662 goto onError; 5663 len = size; 5664 } 5665 5666 if (size == 0) 5667 return v; 5668 i = 0; 5669 end = s + size; 5670 5671 while (s < end) { 5672 unsigned char c; 5673 Py_UCS4 x; 5674 int digits; 5675 5676 /* The only case in which i == ascii_length is a backslash 5677 followed by a newline. */ 5678 assert(i <= len); 5679 5680 /* Non-escape characters are interpreted as Unicode ordinals */ 5681 if (*s != '\\') { 5682 if (unicode_putchar(&v, &i, (unsigned char) *s++) < 0) 5683 goto onError; 5684 continue; 5685 } 5686 5687 startinpos = s-starts; 5688 /* \ - Escapes */ 5689 s++; 5690 c = *s++; 5691 if (s > end) 5692 c = '\0'; /* Invalid after \ */ 5693 5694 /* The only case in which i == ascii_length is a backslash 5695 followed by a newline. 
*/ 5696 assert(i < len || (i == len && c == '\n')); 5697 5698 switch (c) { 5699 5700 /* \x escapes */ 5701#define WRITECHAR(ch) \ 5702 do { \ 5703 if (unicode_putchar(&v, &i, ch) < 0) \ 5704 goto onError; \ 5705 }while(0) 5706 5707 case '\n': break; 5708 case '\\': WRITECHAR('\\'); break; 5709 case '\'': WRITECHAR('\''); break; 5710 case '\"': WRITECHAR('\"'); break; 5711 case 'b': WRITECHAR('\b'); break; 5712 /* FF */ 5713 case 'f': WRITECHAR('\014'); break; 5714 case 't': WRITECHAR('\t'); break; 5715 case 'n': WRITECHAR('\n'); break; 5716 case 'r': WRITECHAR('\r'); break; 5717 /* VT */ 5718 case 'v': WRITECHAR('\013'); break; 5719 /* BEL, not classic C */ 5720 case 'a': WRITECHAR('\007'); break; 5721 5722 /* \OOO (octal) escapes */ 5723 case '0': case '1': case '2': case '3': 5724 case '4': case '5': case '6': case '7': 5725 x = s[-1] - '0'; 5726 if (s < end && '0' <= *s && *s <= '7') { 5727 x = (x<<3) + *s++ - '0'; 5728 if (s < end && '0' <= *s && *s <= '7') 5729 x = (x<<3) + *s++ - '0'; 5730 } 5731 WRITECHAR(x); 5732 break; 5733 5734 /* hex escapes */ 5735 /* \xXX */ 5736 case 'x': 5737 digits = 2; 5738 message = "truncated \\xXX escape"; 5739 goto hexescape; 5740 5741 /* \uXXXX */ 5742 case 'u': 5743 digits = 4; 5744 message = "truncated \\uXXXX escape"; 5745 goto hexescape; 5746 5747 /* \UXXXXXXXX */ 5748 case 'U': 5749 digits = 8; 5750 message = "truncated \\UXXXXXXXX escape"; 5751 hexescape: 5752 chr = 0; 5753 if (s+digits>end) { 5754 endinpos = size; 5755 if (unicode_decode_call_errorhandler( 5756 errors, &errorHandler, 5757 "unicodeescape", "end of string in escape sequence", 5758 &starts, &end, &startinpos, &endinpos, &exc, &s, 5759 &v, &i)) 5760 goto onError; 5761 goto nextByte; 5762 } 5763 for (j = 0; j < digits; ++j) { 5764 c = (unsigned char) s[j]; 5765 if (!Py_ISXDIGIT(c)) { 5766 endinpos = (s+j+1)-starts; 5767 if (unicode_decode_call_errorhandler( 5768 errors, &errorHandler, 5769 "unicodeescape", message, 5770 &starts, &end, &startinpos, &endinpos, 
&exc, &s, 5771 &v, &i)) 5772 goto onError; 5773 len = PyUnicode_GET_LENGTH(v); 5774 goto nextByte; 5775 } 5776 chr = (chr<<4) & ~0xF; 5777 if (c >= '0' && c <= '9') 5778 chr += c - '0'; 5779 else if (c >= 'a' && c <= 'f') 5780 chr += 10 + c - 'a'; 5781 else 5782 chr += 10 + c - 'A'; 5783 } 5784 s += j; 5785 if (chr == 0xffffffff && PyErr_Occurred()) 5786 /* _decoding_error will have already written into the 5787 target buffer. */ 5788 break; 5789 store: 5790 /* when we get here, chr is a 32-bit unicode character */ 5791 if (chr <= 0x10ffff) { 5792 WRITECHAR(chr); 5793 } else { 5794 endinpos = s-starts; 5795 if (unicode_decode_call_errorhandler( 5796 errors, &errorHandler, 5797 "unicodeescape", "illegal Unicode character", 5798 &starts, &end, &startinpos, &endinpos, &exc, &s, 5799 &v, &i)) 5800 goto onError; 5801 } 5802 break; 5803 5804 /* \N{name} */ 5805 case 'N': 5806 message = "malformed \\N character escape"; 5807 if (ucnhash_CAPI == NULL) { 5808 /* load the unicode data module */ 5809 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import( 5810 PyUnicodeData_CAPSULE_NAME, 1); 5811 if (ucnhash_CAPI == NULL) 5812 goto ucnhashError; 5813 } 5814 if (*s == '{') { 5815 const char *start = s+1; 5816 /* look for the closing brace */ 5817 while (*s != '}' && s < end) 5818 s++; 5819 if (s > start && s < end && *s == '}') { 5820 /* found a name. 
look it up in the unicode database */ 5821 message = "unknown Unicode character name"; 5822 s++; 5823 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), 5824 &chr, 0)) 5825 goto store; 5826 } 5827 } 5828 endinpos = s-starts; 5829 if (unicode_decode_call_errorhandler( 5830 errors, &errorHandler, 5831 "unicodeescape", message, 5832 &starts, &end, &startinpos, &endinpos, &exc, &s, 5833 &v, &i)) 5834 goto onError; 5835 break; 5836 5837 default: 5838 if (s > end) { 5839 message = "\\ at end of string"; 5840 s--; 5841 endinpos = s-starts; 5842 if (unicode_decode_call_errorhandler( 5843 errors, &errorHandler, 5844 "unicodeescape", message, 5845 &starts, &end, &startinpos, &endinpos, &exc, &s, 5846 &v, &i)) 5847 goto onError; 5848 } 5849 else { 5850 WRITECHAR('\\'); 5851 WRITECHAR(s[-1]); 5852 } 5853 break; 5854 } 5855 nextByte: 5856 ; 5857 } 5858#undef WRITECHAR 5859 5860 if (PyUnicode_Resize(&v, i) < 0) 5861 goto onError; 5862 Py_XDECREF(errorHandler); 5863 Py_XDECREF(exc); 5864#ifndef DONT_MAKE_RESULT_READY 5865 if (_PyUnicode_READY_REPLACE(&v)) { 5866 Py_DECREF(v); 5867 return NULL; 5868 } 5869#endif 5870 assert(_PyUnicode_CheckConsistency(v, 1)); 5871 return v; 5872 5873 ucnhashError: 5874 PyErr_SetString( 5875 PyExc_UnicodeError, 5876 "\\N escapes not supported (can't load unicodedata module)" 5877 ); 5878 Py_XDECREF(v); 5879 Py_XDECREF(errorHandler); 5880 Py_XDECREF(exc); 5881 return NULL; 5882 5883 onError: 5884 Py_XDECREF(v); 5885 Py_XDECREF(errorHandler); 5886 Py_XDECREF(exc); 5887 return NULL; 5888} 5889 5890#undef WRITE_ASCII_OR_WSTR 5891#undef WRITE_WSTR 5892 5893/* Return a Unicode-Escape string version of the Unicode object. 5894 5895 If quotes is true, the string is enclosed in u"" or u'' quotes as 5896 appropriate. 
5897 5898*/ 5899 5900PyObject * 5901PyUnicode_AsUnicodeEscapeString(PyObject *unicode) 5902{ 5903 Py_ssize_t i, len; 5904 PyObject *repr; 5905 char *p; 5906 int kind; 5907 void *data; 5908 Py_ssize_t expandsize = 0; 5909 5910 /* Initial allocation is based on the longest-possible unichr 5911 escape. 5912 5913 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source 5914 unichr, so in this case it's the longest unichr escape. In 5915 narrow (UTF-16) builds this is five chars per source unichr 5916 since there are two unichrs in the surrogate pair, so in narrow 5917 (UTF-16) builds it's not the longest unichr escape. 5918 5919 In wide or narrow builds '\uxxxx' is 6 chars per source unichr, 5920 so in the narrow (UTF-16) build case it's the longest unichr 5921 escape. 5922 */ 5923 5924 if (!PyUnicode_Check(unicode)) { 5925 PyErr_BadArgument(); 5926 return NULL; 5927 } 5928 if (PyUnicode_READY(unicode) < 0) 5929 return NULL; 5930 len = PyUnicode_GET_LENGTH(unicode); 5931 kind = PyUnicode_KIND(unicode); 5932 data = PyUnicode_DATA(unicode); 5933 switch(kind) { 5934 case PyUnicode_1BYTE_KIND: expandsize = 4; break; 5935 case PyUnicode_2BYTE_KIND: expandsize = 6; break; 5936 case PyUnicode_4BYTE_KIND: expandsize = 10; break; 5937 } 5938 5939 if (len == 0) 5940 return PyBytes_FromStringAndSize(NULL, 0); 5941 5942 if (len > (PY_SSIZE_T_MAX - 2 - 1) / expandsize) 5943 return PyErr_NoMemory(); 5944 5945 repr = PyBytes_FromStringAndSize(NULL, 5946 2 5947 + expandsize*len 5948 + 1); 5949 if (repr == NULL) 5950 return NULL; 5951 5952 p = PyBytes_AS_STRING(repr); 5953 5954 for (i = 0; i < len; i++) { 5955 Py_UCS4 ch = PyUnicode_READ(kind, data, i); 5956 5957 /* Escape backslashes */ 5958 if (ch == '\\') { 5959 *p++ = '\\'; 5960 *p++ = (char) ch; 5961 continue; 5962 } 5963 5964 /* Map 21-bit characters to '\U00xxxxxx' */ 5965 else if (ch >= 0x10000) { 5966 *p++ = '\\'; 5967 *p++ = 'U'; 5968 *p++ = Py_hexdigits[(ch >> 28) & 0x0000000F]; 5969 *p++ = Py_hexdigits[(ch >> 24) & 
0x0000000F]; 5970 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F]; 5971 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F]; 5972 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F]; 5973 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F]; 5974 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F]; 5975 *p++ = Py_hexdigits[ch & 0x0000000F]; 5976 continue; 5977 } 5978 5979 /* Map 16-bit characters to '\uxxxx' */ 5980 if (ch >= 256) { 5981 *p++ = '\\'; 5982 *p++ = 'u'; 5983 *p++ = Py_hexdigits[(ch >> 12) & 0x000F]; 5984 *p++ = Py_hexdigits[(ch >> 8) & 0x000F]; 5985 *p++ = Py_hexdigits[(ch >> 4) & 0x000F]; 5986 *p++ = Py_hexdigits[ch & 0x000F]; 5987 } 5988 5989 /* Map special whitespace to '\t', \n', '\r' */ 5990 else if (ch == '\t') { 5991 *p++ = '\\'; 5992 *p++ = 't'; 5993 } 5994 else if (ch == '\n') { 5995 *p++ = '\\'; 5996 *p++ = 'n'; 5997 } 5998 else if (ch == '\r') { 5999 *p++ = '\\'; 6000 *p++ = 'r'; 6001 } 6002 6003 /* Map non-printable US ASCII to '\xhh' */ 6004 else if (ch < ' ' || ch >= 0x7F) { 6005 *p++ = '\\'; 6006 *p++ = 'x'; 6007 *p++ = Py_hexdigits[(ch >> 4) & 0x000F]; 6008 *p++ = Py_hexdigits[ch & 0x000F]; 6009 } 6010 6011 /* Copy everything else as-is */ 6012 else 6013 *p++ = (char) ch; 6014 } 6015 6016 assert(p - PyBytes_AS_STRING(repr) > 0); 6017 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) 6018 return NULL; 6019 return repr; 6020} 6021 6022PyObject * 6023PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s, 6024 Py_ssize_t size) 6025{ 6026 PyObject *result; 6027 PyObject *tmp = PyUnicode_FromUnicode(s, size); 6028 if (tmp == NULL) 6029 return NULL; 6030 result = PyUnicode_AsUnicodeEscapeString(tmp); 6031 Py_DECREF(tmp); 6032 return result; 6033} 6034 6035/* --- Raw Unicode Escape Codec ------------------------------------------- */ 6036 6037PyObject * 6038PyUnicode_DecodeRawUnicodeEscape(const char *s, 6039 Py_ssize_t size, 6040 const char *errors) 6041{ 6042 const char *starts = s; 6043 Py_ssize_t startinpos; 6044 Py_ssize_t endinpos; 6045 Py_ssize_t outpos; 6046 
PyObject *v; 6047 const char *end; 6048 const char *bs; 6049 PyObject *errorHandler = NULL; 6050 PyObject *exc = NULL; 6051 6052 /* Escaped strings will always be longer than the resulting 6053 Unicode string, so we start with size here and then reduce the 6054 length after conversion to the true value. (But decoding error 6055 handler might have to resize the string) */ 6056 v = PyUnicode_New(size, 127); 6057 if (v == NULL) 6058 goto onError; 6059 if (size == 0) 6060 return v; 6061 outpos = 0; 6062 end = s + size; 6063 while (s < end) { 6064 unsigned char c; 6065 Py_UCS4 x; 6066 int i; 6067 int count; 6068 6069 /* Non-escape characters are interpreted as Unicode ordinals */ 6070 if (*s != '\\') { 6071 if (unicode_putchar(&v, &outpos, (unsigned char)*s++) < 0) 6072 goto onError; 6073 continue; 6074 } 6075 startinpos = s-starts; 6076 6077 /* \u-escapes are only interpreted iff the number of leading 6078 backslashes if odd */ 6079 bs = s; 6080 for (;s < end;) { 6081 if (*s != '\\') 6082 break; 6083 if (unicode_putchar(&v, &outpos, (unsigned char)*s++) < 0) 6084 goto onError; 6085 } 6086 if (((s - bs) & 1) == 0 || 6087 s >= end || 6088 (*s != 'u' && *s != 'U')) { 6089 continue; 6090 } 6091 outpos--; 6092 count = *s=='u' ? 
4 : 8; 6093 s++; 6094 6095 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */ 6096 for (x = 0, i = 0; i < count; ++i, ++s) { 6097 c = (unsigned char)*s; 6098 if (!Py_ISXDIGIT(c)) { 6099 endinpos = s-starts; 6100 if (unicode_decode_call_errorhandler( 6101 errors, &errorHandler, 6102 "rawunicodeescape", "truncated \\uXXXX", 6103 &starts, &end, &startinpos, &endinpos, &exc, &s, 6104 &v, &outpos)) 6105 goto onError; 6106 goto nextByte; 6107 } 6108 x = (x<<4) & ~0xF; 6109 if (c >= '0' && c <= '9') 6110 x += c - '0'; 6111 else if (c >= 'a' && c <= 'f') 6112 x += 10 + c - 'a'; 6113 else 6114 x += 10 + c - 'A'; 6115 } 6116 if (x <= 0x10ffff) { 6117 if (unicode_putchar(&v, &outpos, x) < 0) 6118 goto onError; 6119 } else { 6120 endinpos = s-starts; 6121 if (unicode_decode_call_errorhandler( 6122 errors, &errorHandler, 6123 "rawunicodeescape", "\\Uxxxxxxxx out of range", 6124 &starts, &end, &startinpos, &endinpos, &exc, &s, 6125 &v, &outpos)) 6126 goto onError; 6127 } 6128 nextByte: 6129 ; 6130 } 6131 if (PyUnicode_Resize(&v, outpos) < 0) 6132 goto onError; 6133 Py_XDECREF(errorHandler); 6134 Py_XDECREF(exc); 6135 assert(_PyUnicode_CheckConsistency(v, 1)); 6136 return v; 6137 6138 onError: 6139 Py_XDECREF(v); 6140 Py_XDECREF(errorHandler); 6141 Py_XDECREF(exc); 6142 return NULL; 6143} 6144 6145 6146PyObject * 6147PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode) 6148{ 6149 PyObject *repr; 6150 char *p; 6151 char *q; 6152 Py_ssize_t expandsize, pos; 6153 int kind; 6154 void *data; 6155 Py_ssize_t len; 6156 6157 if (!PyUnicode_Check(unicode)) { 6158 PyErr_BadArgument(); 6159 return NULL; 6160 } 6161 if (PyUnicode_READY(unicode) < 0) 6162 return NULL; 6163 kind = PyUnicode_KIND(unicode); 6164 data = PyUnicode_DATA(unicode); 6165 len = PyUnicode_GET_LENGTH(unicode); 6166 6167 switch(kind) { 6168 case PyUnicode_1BYTE_KIND: expandsize = 4; break; 6169 case PyUnicode_2BYTE_KIND: expandsize = 6; break; 6170 case PyUnicode_4BYTE_KIND: expandsize = 10; break; 6171 } 6172 6173 if 
(len > PY_SSIZE_T_MAX / expandsize) 6174 return PyErr_NoMemory(); 6175 6176 repr = PyBytes_FromStringAndSize(NULL, expandsize * len); 6177 if (repr == NULL) 6178 return NULL; 6179 if (len == 0) 6180 return repr; 6181 6182 p = q = PyBytes_AS_STRING(repr); 6183 for (pos = 0; pos < len; pos++) { 6184 Py_UCS4 ch = PyUnicode_READ(kind, data, pos); 6185 /* Map 32-bit characters to '\Uxxxxxxxx' */ 6186 if (ch >= 0x10000) { 6187 *p++ = '\\'; 6188 *p++ = 'U'; 6189 *p++ = Py_hexdigits[(ch >> 28) & 0xf]; 6190 *p++ = Py_hexdigits[(ch >> 24) & 0xf]; 6191 *p++ = Py_hexdigits[(ch >> 20) & 0xf]; 6192 *p++ = Py_hexdigits[(ch >> 16) & 0xf]; 6193 *p++ = Py_hexdigits[(ch >> 12) & 0xf]; 6194 *p++ = Py_hexdigits[(ch >> 8) & 0xf]; 6195 *p++ = Py_hexdigits[(ch >> 4) & 0xf]; 6196 *p++ = Py_hexdigits[ch & 15]; 6197 } 6198 /* Map 16-bit characters to '\uxxxx' */ 6199 else if (ch >= 256) { 6200 *p++ = '\\'; 6201 *p++ = 'u'; 6202 *p++ = Py_hexdigits[(ch >> 12) & 0xf]; 6203 *p++ = Py_hexdigits[(ch >> 8) & 0xf]; 6204 *p++ = Py_hexdigits[(ch >> 4) & 0xf]; 6205 *p++ = Py_hexdigits[ch & 15]; 6206 } 6207 /* Copy everything else as-is */ 6208 else 6209 *p++ = (char) ch; 6210 } 6211 6212 assert(p > q); 6213 if (_PyBytes_Resize(&repr, p - q) < 0) 6214 return NULL; 6215 return repr; 6216} 6217 6218PyObject * 6219PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s, 6220 Py_ssize_t size) 6221{ 6222 PyObject *result; 6223 PyObject *tmp = PyUnicode_FromUnicode(s, size); 6224 if (tmp == NULL) 6225 return NULL; 6226 result = PyUnicode_AsRawUnicodeEscapeString(tmp); 6227 Py_DECREF(tmp); 6228 return result; 6229} 6230 6231/* --- Unicode Internal Codec ------------------------------------------- */ 6232 6233PyObject * 6234_PyUnicode_DecodeUnicodeInternal(const char *s, 6235 Py_ssize_t size, 6236 const char *errors) 6237{ 6238 const char *starts = s; 6239 Py_ssize_t startinpos; 6240 Py_ssize_t endinpos; 6241 Py_ssize_t outpos; 6242 PyObject *v; 6243 const char *end; 6244 const char *reason; 6245 PyObject 
*errorHandler = NULL; 6246 PyObject *exc = NULL; 6247 6248 if (PyErr_WarnEx(PyExc_DeprecationWarning, 6249 "unicode_internal codec has been deprecated", 6250 1)) 6251 return NULL; 6252 6253 /* XXX overflow detection missing */ 6254 v = PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE, 127); 6255 if (v == NULL) 6256 goto onError; 6257 if (PyUnicode_GET_LENGTH(v) == 0) 6258 return v; 6259 outpos = 0; 6260 end = s + size; 6261 6262 while (s < end) { 6263 Py_UNICODE uch; 6264 Py_UCS4 ch; 6265 /* We copy the raw representation one byte at a time because the 6266 pointer may be unaligned (see test_codeccallbacks). */ 6267 ((char *) &uch)[0] = s[0]; 6268 ((char *) &uch)[1] = s[1]; 6269#ifdef Py_UNICODE_WIDE 6270 ((char *) &uch)[2] = s[2]; 6271 ((char *) &uch)[3] = s[3]; 6272#endif 6273 ch = uch; 6274 6275 /* We have to sanity check the raw data, otherwise doom looms for 6276 some malformed UCS-4 data. */ 6277 if ( 6278#ifdef Py_UNICODE_WIDE 6279 ch > 0x10ffff || 6280#endif 6281 end-s < Py_UNICODE_SIZE 6282 ) 6283 { 6284 startinpos = s - starts; 6285 if (end-s < Py_UNICODE_SIZE) { 6286 endinpos = end-starts; 6287 reason = "truncated input"; 6288 } 6289 else { 6290 endinpos = s - starts + Py_UNICODE_SIZE; 6291 reason = "illegal code point (> 0x10FFFF)"; 6292 } 6293 if (unicode_decode_call_errorhandler( 6294 errors, &errorHandler, 6295 "unicode_internal", reason, 6296 &starts, &end, &startinpos, &endinpos, &exc, &s, 6297 &v, &outpos)) 6298 goto onError; 6299 continue; 6300 } 6301 6302 s += Py_UNICODE_SIZE; 6303#ifndef Py_UNICODE_WIDE 6304 if (ch >= 0xD800 && ch <= 0xDBFF && s < end) 6305 { 6306 Py_UNICODE uch2; 6307 ((char *) &uch2)[0] = s[0]; 6308 ((char *) &uch2)[1] = s[1]; 6309 if (uch2 >= 0xDC00 && uch2 <= 0xDFFF) 6310 { 6311 ch = (((uch & 0x3FF)<<10) | (uch2 & 0x3FF)) + 0x10000; 6312 s += Py_UNICODE_SIZE; 6313 } 6314 } 6315#endif 6316 6317 if (unicode_putchar(&v, &outpos, ch) < 0) 6318 goto onError; 6319 } 6320 6321 if (PyUnicode_Resize(&v, outpos) < 0) 6322 goto 
onError; 6323 Py_XDECREF(errorHandler); 6324 Py_XDECREF(exc); 6325 assert(_PyUnicode_CheckConsistency(v, 1)); 6326 return v; 6327 6328 onError: 6329 Py_XDECREF(v); 6330 Py_XDECREF(errorHandler); 6331 Py_XDECREF(exc); 6332 return NULL; 6333} 6334 6335/* --- Latin-1 Codec ------------------------------------------------------ */ 6336 6337PyObject * 6338PyUnicode_DecodeLatin1(const char *s, 6339 Py_ssize_t size, 6340 const char *errors) 6341{ 6342 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */ 6343 return _PyUnicode_FromUCS1((unsigned char*)s, size); 6344} 6345 6346/* create or adjust a UnicodeEncodeError */ 6347static void 6348make_encode_exception(PyObject **exceptionObject, 6349 const char *encoding, 6350 PyObject *unicode, 6351 Py_ssize_t startpos, Py_ssize_t endpos, 6352 const char *reason) 6353{ 6354 if (*exceptionObject == NULL) { 6355 *exceptionObject = PyObject_CallFunction( 6356 PyExc_UnicodeEncodeError, "sOnns", 6357 encoding, unicode, startpos, endpos, reason); 6358 } 6359 else { 6360 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos)) 6361 goto onError; 6362 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos)) 6363 goto onError; 6364 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason)) 6365 goto onError; 6366 return; 6367 onError: 6368 Py_DECREF(*exceptionObject); 6369 *exceptionObject = NULL; 6370 } 6371} 6372 6373/* raises a UnicodeEncodeError */ 6374static void 6375raise_encode_exception(PyObject **exceptionObject, 6376 const char *encoding, 6377 PyObject *unicode, 6378 Py_ssize_t startpos, Py_ssize_t endpos, 6379 const char *reason) 6380{ 6381 make_encode_exception(exceptionObject, 6382 encoding, unicode, startpos, endpos, reason); 6383 if (*exceptionObject != NULL) 6384 PyCodec_StrictErrors(*exceptionObject); 6385} 6386 6387/* error handling callback helper: 6388 build arguments, call the callback and check the arguments, 6389 put the result into newpos and return the replacement string, which 6390 has to 
be freed by the caller */ 6391static PyObject * 6392unicode_encode_call_errorhandler(const char *errors, 6393 PyObject **errorHandler, 6394 const char *encoding, const char *reason, 6395 PyObject *unicode, PyObject **exceptionObject, 6396 Py_ssize_t startpos, Py_ssize_t endpos, 6397 Py_ssize_t *newpos) 6398{ 6399 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple"; 6400 Py_ssize_t len; 6401 PyObject *restuple; 6402 PyObject *resunicode; 6403 6404 if (*errorHandler == NULL) { 6405 *errorHandler = PyCodec_LookupError(errors); 6406 if (*errorHandler == NULL) 6407 return NULL; 6408 } 6409 6410 if (PyUnicode_READY(unicode) < 0) 6411 return NULL; 6412 len = PyUnicode_GET_LENGTH(unicode); 6413 6414 make_encode_exception(exceptionObject, 6415 encoding, unicode, startpos, endpos, reason); 6416 if (*exceptionObject == NULL) 6417 return NULL; 6418 6419 restuple = PyObject_CallFunctionObjArgs( 6420 *errorHandler, *exceptionObject, NULL); 6421 if (restuple == NULL) 6422 return NULL; 6423 if (!PyTuple_Check(restuple)) { 6424 PyErr_SetString(PyExc_TypeError, &argparse[3]); 6425 Py_DECREF(restuple); 6426 return NULL; 6427 } 6428 if (!PyArg_ParseTuple(restuple, argparse, 6429 &resunicode, newpos)) { 6430 Py_DECREF(restuple); 6431 return NULL; 6432 } 6433 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) { 6434 PyErr_SetString(PyExc_TypeError, &argparse[3]); 6435 Py_DECREF(restuple); 6436 return NULL; 6437 } 6438 if (*newpos<0) 6439 *newpos = len + *newpos; 6440 if (*newpos<0 || *newpos>len) { 6441 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos); 6442 Py_DECREF(restuple); 6443 return NULL; 6444 } 6445 Py_INCREF(resunicode); 6446 Py_DECREF(restuple); 6447 return resunicode; 6448} 6449 6450static PyObject * 6451unicode_encode_ucs1(PyObject *unicode, 6452 const char *errors, 6453 unsigned int limit) 6454{ 6455 /* input state */ 6456 Py_ssize_t pos=0, size; 6457 int kind; 6458 void *data; 6459 /* 
output object */ 6460 PyObject *res; 6461 /* pointer into the output */ 6462 char *str; 6463 /* current output position */ 6464 Py_ssize_t ressize; 6465 const char *encoding = (limit == 256) ? "latin-1" : "ascii"; 6466 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)"; 6467 PyObject *errorHandler = NULL; 6468 PyObject *exc = NULL; 6469 /* the following variable is used for caching string comparisons 6470 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */ 6471 int known_errorHandler = -1; 6472 6473 if (PyUnicode_READY(unicode) < 0) 6474 return NULL; 6475 size = PyUnicode_GET_LENGTH(unicode); 6476 kind = PyUnicode_KIND(unicode); 6477 data = PyUnicode_DATA(unicode); 6478 /* allocate enough for a simple encoding without 6479 replacements, if we need more, we'll resize */ 6480 if (size == 0) 6481 return PyBytes_FromStringAndSize(NULL, 0); 6482 res = PyBytes_FromStringAndSize(NULL, size); 6483 if (res == NULL) 6484 return NULL; 6485 str = PyBytes_AS_STRING(res); 6486 ressize = size; 6487 6488 while (pos < size) { 6489 Py_UCS4 c = PyUnicode_READ(kind, data, pos); 6490 6491 /* can we encode this? */ 6492 if (c<limit) { 6493 /* no overflow check, because we know that the space is enough */ 6494 *str++ = (char)c; 6495 ++pos; 6496 } 6497 else { 6498 Py_ssize_t requiredsize; 6499 PyObject *repunicode; 6500 Py_ssize_t repsize, newpos, respos, i; 6501 /* startpos for collecting unencodable chars */ 6502 Py_ssize_t collstart = pos; 6503 Py_ssize_t collend = pos; 6504 /* find all unecodable characters */ 6505 while ((collend < size) && (PyUnicode_READ(kind, data, collend)>=limit)) 6506 ++collend; 6507 /* cache callback name lookup (if not done yet, i.e. 
it's the first error) */ 6508 if (known_errorHandler==-1) { 6509 if ((errors==NULL) || (!strcmp(errors, "strict"))) 6510 known_errorHandler = 1; 6511 else if (!strcmp(errors, "replace")) 6512 known_errorHandler = 2; 6513 else if (!strcmp(errors, "ignore")) 6514 known_errorHandler = 3; 6515 else if (!strcmp(errors, "xmlcharrefreplace")) 6516 known_errorHandler = 4; 6517 else 6518 known_errorHandler = 0; 6519 } 6520 switch (known_errorHandler) { 6521 case 1: /* strict */ 6522 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason); 6523 goto onError; 6524 case 2: /* replace */ 6525 while (collstart++<collend) 6526 *str++ = '?'; /* fall through */ 6527 case 3: /* ignore */ 6528 pos = collend; 6529 break; 6530 case 4: /* xmlcharrefreplace */ 6531 respos = str - PyBytes_AS_STRING(res); 6532 /* determine replacement size */ 6533 for (i = collstart, repsize = 0; i < collend; ++i) { 6534 Py_UCS4 ch = PyUnicode_READ(kind, data, i); 6535 if (ch < 10) 6536 repsize += 2+1+1; 6537 else if (ch < 100) 6538 repsize += 2+2+1; 6539 else if (ch < 1000) 6540 repsize += 2+3+1; 6541 else if (ch < 10000) 6542 repsize += 2+4+1; 6543#ifndef Py_UNICODE_WIDE 6544 else 6545 repsize += 2+5+1; 6546#else 6547 else if (ch < 100000) 6548 repsize += 2+5+1; 6549 else if (ch < 1000000) 6550 repsize += 2+6+1; 6551 else 6552 repsize += 2+7+1; 6553#endif 6554 } 6555 requiredsize = respos+repsize+(size-collend); 6556 if (requiredsize > ressize) { 6557 if (requiredsize<2*ressize) 6558 requiredsize = 2*ressize; 6559 if (_PyBytes_Resize(&res, requiredsize)) 6560 goto onError; 6561 str = PyBytes_AS_STRING(res) + respos; 6562 ressize = requiredsize; 6563 } 6564 /* generate replacement */ 6565 for (i = collstart; i < collend; ++i) { 6566 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i)); 6567 } 6568 pos = collend; 6569 break; 6570 default: 6571 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler, 6572 encoding, reason, unicode, &exc, 6573 collstart, collend, 
&newpos); 6574 if (repunicode == NULL || (PyUnicode_Check(repunicode) && 6575 PyUnicode_READY(repunicode) < 0)) 6576 goto onError; 6577 if (PyBytes_Check(repunicode)) { 6578 /* Directly copy bytes result to output. */ 6579 repsize = PyBytes_Size(repunicode); 6580 if (repsize > 1) { 6581 /* Make room for all additional bytes. */ 6582 respos = str - PyBytes_AS_STRING(res); 6583 if (_PyBytes_Resize(&res, ressize+repsize-1)) { 6584 Py_DECREF(repunicode); 6585 goto onError; 6586 } 6587 str = PyBytes_AS_STRING(res) + respos; 6588 ressize += repsize-1; 6589 } 6590 memcpy(str, PyBytes_AsString(repunicode), repsize); 6591 str += repsize; 6592 pos = newpos; 6593 Py_DECREF(repunicode); 6594 break; 6595 } 6596 /* need more space? (at least enough for what we 6597 have+the replacement+the rest of the string, so 6598 we won't have to check space for encodable characters) */ 6599 respos = str - PyBytes_AS_STRING(res); 6600 repsize = PyUnicode_GET_LENGTH(repunicode); 6601 requiredsize = respos+repsize+(size-collend); 6602 if (requiredsize > ressize) { 6603 if (requiredsize<2*ressize) 6604 requiredsize = 2*ressize; 6605 if (_PyBytes_Resize(&res, requiredsize)) { 6606 Py_DECREF(repunicode); 6607 goto onError; 6608 } 6609 str = PyBytes_AS_STRING(res) + respos; 6610 ressize = requiredsize; 6611 } 6612 /* check if there is anything unencodable in the replacement 6613 and copy it to the output */ 6614 for (i = 0; repsize-->0; ++i, ++str) { 6615 c = PyUnicode_READ_CHAR(repunicode, i); 6616 if (c >= limit) { 6617 raise_encode_exception(&exc, encoding, unicode, 6618 pos, pos+1, reason); 6619 Py_DECREF(repunicode); 6620 goto onError; 6621 } 6622 *str = (char)c; 6623 } 6624 pos = newpos; 6625 Py_DECREF(repunicode); 6626 } 6627 } 6628 } 6629 /* Resize if we allocated to much */ 6630 size = str - PyBytes_AS_STRING(res); 6631 if (size < ressize) { /* If this falls res will be NULL */ 6632 assert(size >= 0); 6633 if (_PyBytes_Resize(&res, size) < 0) 6634 goto onError; 6635 } 6636 6637 
Py_XDECREF(errorHandler); 6638 Py_XDECREF(exc); 6639 return res; 6640 6641 onError: 6642 Py_XDECREF(res); 6643 Py_XDECREF(errorHandler); 6644 Py_XDECREF(exc); 6645 return NULL; 6646} 6647 6648/* Deprecated */ 6649PyObject * 6650PyUnicode_EncodeLatin1(const Py_UNICODE *p, 6651 Py_ssize_t size, 6652 const char *errors) 6653{ 6654 PyObject *result; 6655 PyObject *unicode = PyUnicode_FromUnicode(p, size); 6656 if (unicode == NULL) 6657 return NULL; 6658 result = unicode_encode_ucs1(unicode, errors, 256); 6659 Py_DECREF(unicode); 6660 return result; 6661} 6662 6663PyObject * 6664_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors) 6665{ 6666 if (!PyUnicode_Check(unicode)) { 6667 PyErr_BadArgument(); 6668 return NULL; 6669 } 6670 if (PyUnicode_READY(unicode) == -1) 6671 return NULL; 6672 /* Fast path: if it is a one-byte string, construct 6673 bytes object directly. */ 6674 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) 6675 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode), 6676 PyUnicode_GET_LENGTH(unicode)); 6677 /* Non-Latin-1 characters present. Defer to above function to 6678 raise the exception. 
*/ 6679 return unicode_encode_ucs1(unicode, errors, 256); 6680} 6681 6682PyObject* 6683PyUnicode_AsLatin1String(PyObject *unicode) 6684{ 6685 return _PyUnicode_AsLatin1String(unicode, NULL); 6686} 6687 6688/* --- 7-bit ASCII Codec -------------------------------------------------- */ 6689 6690PyObject * 6691PyUnicode_DecodeASCII(const char *s, 6692 Py_ssize_t size, 6693 const char *errors) 6694{ 6695 const char *starts = s; 6696 PyObject *v; 6697 int kind; 6698 void *data; 6699 Py_ssize_t startinpos; 6700 Py_ssize_t endinpos; 6701 Py_ssize_t outpos; 6702 const char *e; 6703 int has_error; 6704 const unsigned char *p = (const unsigned char *)s; 6705 const unsigned char *end = p + size; 6706 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK); 6707 PyObject *errorHandler = NULL; 6708 PyObject *exc = NULL; 6709 6710 /* ASCII is equivalent to the first 128 ordinals in Unicode. */ 6711 if (size == 1 && (unsigned char)s[0] < 128) 6712 return get_latin1_char((unsigned char)s[0]); 6713 6714 has_error = 0; 6715 while (p < end && !has_error) { 6716 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for 6717 an explanation. 
*/ 6718 if (!((size_t) p & LONG_PTR_MASK)) { 6719 /* Help register allocation */ 6720 register const unsigned char *_p = p; 6721 while (_p < aligned_end) { 6722 unsigned long value = *(unsigned long *) _p; 6723 if (value & ASCII_CHAR_MASK) { 6724 has_error = 1; 6725 break; 6726 } 6727 _p += SIZEOF_LONG; 6728 } 6729 if (_p == end) 6730 break; 6731 if (has_error) 6732 break; 6733 p = _p; 6734 } 6735 if (*p & 0x80) { 6736 has_error = 1; 6737 break; 6738 } 6739 else { 6740 ++p; 6741 } 6742 } 6743 if (!has_error) 6744 return unicode_fromascii((const unsigned char *)s, size); 6745 6746 v = PyUnicode_New(size, 127); 6747 if (v == NULL) 6748 goto onError; 6749 if (size == 0) 6750 return v; 6751 kind = PyUnicode_KIND(v); 6752 data = PyUnicode_DATA(v); 6753 outpos = 0; 6754 e = s + size; 6755 while (s < e) { 6756 register unsigned char c = (unsigned char)*s; 6757 if (c < 128) { 6758 PyUnicode_WRITE(kind, data, outpos++, c); 6759 ++s; 6760 } 6761 else { 6762 startinpos = s-starts; 6763 endinpos = startinpos + 1; 6764 if (unicode_decode_call_errorhandler( 6765 errors, &errorHandler, 6766 "ascii", "ordinal not in range(128)", 6767 &starts, &e, &startinpos, &endinpos, &exc, &s, 6768 &v, &outpos)) 6769 goto onError; 6770 kind = PyUnicode_KIND(v); 6771 data = PyUnicode_DATA(v); 6772 } 6773 } 6774 if (PyUnicode_Resize(&v, outpos) < 0) 6775 goto onError; 6776 Py_XDECREF(errorHandler); 6777 Py_XDECREF(exc); 6778 assert(_PyUnicode_CheckConsistency(v, 1)); 6779 return v; 6780 6781 onError: 6782 Py_XDECREF(v); 6783 Py_XDECREF(errorHandler); 6784 Py_XDECREF(exc); 6785 return NULL; 6786} 6787 6788/* Deprecated */ 6789PyObject * 6790PyUnicode_EncodeASCII(const Py_UNICODE *p, 6791 Py_ssize_t size, 6792 const char *errors) 6793{ 6794 PyObject *result; 6795 PyObject *unicode = PyUnicode_FromUnicode(p, size); 6796 if (unicode == NULL) 6797 return NULL; 6798 result = unicode_encode_ucs1(unicode, errors, 128); 6799 Py_DECREF(unicode); 6800 return result; 6801} 6802 6803PyObject * 
6804_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors) 6805{ 6806 if (!PyUnicode_Check(unicode)) { 6807 PyErr_BadArgument(); 6808 return NULL; 6809 } 6810 if (PyUnicode_READY(unicode) == -1) 6811 return NULL; 6812 /* Fast path: if it is an ASCII-only string, construct bytes object 6813 directly. Else defer to above function to raise the exception. */ 6814 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128) 6815 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode), 6816 PyUnicode_GET_LENGTH(unicode)); 6817 return unicode_encode_ucs1(unicode, errors, 128); 6818} 6819 6820PyObject * 6821PyUnicode_AsASCIIString(PyObject *unicode) 6822{ 6823 return _PyUnicode_AsASCIIString(unicode, NULL); 6824} 6825 6826#ifdef HAVE_MBCS 6827 6828/* --- MBCS codecs for Windows -------------------------------------------- */ 6829 6830#if SIZEOF_INT < SIZEOF_SIZE_T 6831#define NEED_RETRY 6832#endif 6833 6834#ifndef WC_ERR_INVALID_CHARS 6835# define WC_ERR_INVALID_CHARS 0x0080 6836#endif 6837 6838static char* 6839code_page_name(UINT code_page, PyObject **obj) 6840{ 6841 *obj = NULL; 6842 if (code_page == CP_ACP) 6843 return "mbcs"; 6844 if (code_page == CP_UTF7) 6845 return "CP_UTF7"; 6846 if (code_page == CP_UTF8) 6847 return "CP_UTF8"; 6848 6849 *obj = PyBytes_FromFormat("cp%u", code_page); 6850 if (*obj == NULL) 6851 return NULL; 6852 return PyBytes_AS_STRING(*obj); 6853} 6854 6855static int 6856is_dbcs_lead_byte(UINT code_page, const char *s, int offset) 6857{ 6858 const char *curr = s + offset; 6859 const char *prev; 6860 6861 if (!IsDBCSLeadByteEx(code_page, *curr)) 6862 return 0; 6863 6864 prev = CharPrevExA(code_page, s, curr, 0); 6865 if (prev == curr) 6866 return 1; 6867 /* FIXME: This code is limited to "true" double-byte encodings, 6868 as it assumes an incomplete character consists of a single 6869 byte. 
*/ 6870 if (curr - prev == 2) 6871 return 1; 6872 if (!IsDBCSLeadByteEx(code_page, *prev)) 6873 return 1; 6874 return 0; 6875} 6876 6877static DWORD 6878decode_code_page_flags(UINT code_page) 6879{ 6880 if (code_page == CP_UTF7) { 6881 /* The CP_UTF7 decoder only supports flags=0 */ 6882 return 0; 6883 } 6884 else 6885 return MB_ERR_INVALID_CHARS; 6886} 6887 6888/* 6889 * Decode a byte string from a Windows code page into unicode object in strict 6890 * mode. 6891 * 6892 * Returns consumed size if succeed, returns -2 on decode error, or raise a 6893 * WindowsError and returns -1 on other error. 6894 */ 6895static int 6896decode_code_page_strict(UINT code_page, 6897 PyObject **v, 6898 const char *in, 6899 int insize) 6900{ 6901 const DWORD flags = decode_code_page_flags(code_page); 6902 wchar_t *out; 6903 DWORD outsize; 6904 6905 /* First get the size of the result */ 6906 assert(insize > 0); 6907 outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0); 6908 if (outsize <= 0) 6909 goto error; 6910 6911 if (*v == NULL) { 6912 /* Create unicode object */ 6913 *v = (PyObject*)_PyUnicode_New(outsize); 6914 if (*v == NULL) 6915 return -1; 6916 out = PyUnicode_AS_UNICODE(*v); 6917 } 6918 else { 6919 /* Extend unicode object */ 6920 Py_ssize_t n = PyUnicode_GET_SIZE(*v); 6921 if (PyUnicode_Resize(v, n + outsize) < 0) 6922 return -1; 6923 out = PyUnicode_AS_UNICODE(*v) + n; 6924 } 6925 6926 /* Do the conversion */ 6927 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize); 6928 if (outsize <= 0) 6929 goto error; 6930 return insize; 6931 6932error: 6933 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION) 6934 return -2; 6935 PyErr_SetFromWindowsErr(0); 6936 return -1; 6937} 6938 6939/* 6940 * Decode a byte string from a code page into unicode object with an error 6941 * handler. 6942 * 6943 * Returns consumed size if succeed, or raise a WindowsError or 6944 * UnicodeDecodeError exception and returns -1 on error. 
6945 */ 6946static int 6947decode_code_page_errors(UINT code_page, 6948 PyObject **v, 6949 const char *in, const int size, 6950 const char *errors) 6951{ 6952 const char *startin = in; 6953 const char *endin = in + size; 6954 const DWORD flags = decode_code_page_flags(code_page); 6955 /* Ideally, we should get reason from FormatMessage. This is the Windows 6956 2000 English version of the message. */ 6957 const char *reason = "No mapping for the Unicode character exists " 6958 "in the target code page."; 6959 /* each step cannot decode more than 1 character, but a character can be 6960 represented as a surrogate pair */ 6961 wchar_t buffer[2], *startout, *out; 6962 int insize, outsize; 6963 PyObject *errorHandler = NULL; 6964 PyObject *exc = NULL; 6965 PyObject *encoding_obj = NULL; 6966 char *encoding; 6967 DWORD err; 6968 int ret = -1; 6969 6970 assert(size > 0); 6971 6972 encoding = code_page_name(code_page, &encoding_obj); 6973 if (encoding == NULL) 6974 return -1; 6975 6976 if (errors == NULL || strcmp(errors, "strict") == 0) { 6977 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a 6978 UnicodeDecodeError. 
*/ 6979 make_decode_exception(&exc, encoding, in, size, 0, 0, reason); 6980 if (exc != NULL) { 6981 PyCodec_StrictErrors(exc); 6982 Py_CLEAR(exc); 6983 } 6984 goto error; 6985 } 6986 6987 if (*v == NULL) { 6988 /* Create unicode object */ 6989 if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) { 6990 PyErr_NoMemory(); 6991 goto error; 6992 } 6993 *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer)); 6994 if (*v == NULL) 6995 goto error; 6996 startout = PyUnicode_AS_UNICODE(*v); 6997 } 6998 else { 6999 /* Extend unicode object */ 7000 Py_ssize_t n = PyUnicode_GET_SIZE(*v); 7001 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) { 7002 PyErr_NoMemory(); 7003 goto error; 7004 } 7005 if (PyUnicode_Resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0) 7006 goto error; 7007 startout = PyUnicode_AS_UNICODE(*v) + n; 7008 } 7009 7010 /* Decode the byte string character per character */ 7011 out = startout; 7012 while (in < endin) 7013 { 7014 /* Decode a character */ 7015 insize = 1; 7016 do 7017 { 7018 outsize = MultiByteToWideChar(code_page, flags, 7019 in, insize, 7020 buffer, Py_ARRAY_LENGTH(buffer)); 7021 if (outsize > 0) 7022 break; 7023 err = GetLastError(); 7024 if (err != ERROR_NO_UNICODE_TRANSLATION 7025 && err != ERROR_INSUFFICIENT_BUFFER) 7026 { 7027 PyErr_SetFromWindowsErr(0); 7028 goto error; 7029 } 7030 insize++; 7031 } 7032 /* 4=maximum length of a UTF-8 sequence */ 7033 while (insize <= 4 && (in + insize) <= endin); 7034 7035 if (outsize <= 0) { 7036 Py_ssize_t startinpos, endinpos, outpos; 7037 7038 startinpos = in - startin; 7039 endinpos = startinpos + 1; 7040 outpos = out - PyUnicode_AS_UNICODE(*v); 7041 if (unicode_decode_call_errorhandler( 7042 errors, &errorHandler, 7043 encoding, reason, 7044 &startin, &endin, &startinpos, &endinpos, &exc, &in, 7045 v, &outpos)) 7046 { 7047 goto error; 7048 } 7049 out = PyUnicode_AS_UNICODE(*v) + outpos; 7050 } 7051 else { 7052 in += insize; 7053 memcpy(out, buffer, outsize * 
sizeof(wchar_t)); 7054 out += outsize; 7055 } 7056 } 7057 7058 /* write a NUL character at the end */ 7059 *out = 0; 7060 7061 /* Extend unicode object */ 7062 outsize = out - startout; 7063 assert(outsize <= PyUnicode_WSTR_LENGTH(*v)); 7064 if (PyUnicode_Resize(v, outsize) < 0) 7065 goto error; 7066 ret = size; 7067 7068error: 7069 Py_XDECREF(encoding_obj); 7070 Py_XDECREF(errorHandler); 7071 Py_XDECREF(exc); 7072 return ret; 7073} 7074 7075static PyObject * 7076decode_code_page_stateful(int code_page, 7077 const char *s, Py_ssize_t size, 7078 const char *errors, Py_ssize_t *consumed) 7079{ 7080 PyObject *v = NULL; 7081 int chunk_size, final, converted, done; 7082 7083 if (code_page < 0) { 7084 PyErr_SetString(PyExc_ValueError, "invalid code page number"); 7085 return NULL; 7086 } 7087 7088 if (consumed) 7089 *consumed = 0; 7090 7091 do 7092 { 7093#ifdef NEED_RETRY 7094 if (size > INT_MAX) { 7095 chunk_size = INT_MAX; 7096 final = 0; 7097 done = 0; 7098 } 7099 else 7100#endif 7101 { 7102 chunk_size = (int)size; 7103 final = (consumed == NULL); 7104 done = 1; 7105 } 7106 7107 /* Skip trailing lead-byte unless 'final' is set */ 7108 if (!final && is_dbcs_lead_byte(code_page, s, chunk_size - 1)) 7109 --chunk_size; 7110 7111 if (chunk_size == 0 && done) { 7112 if (v != NULL) 7113 break; 7114 Py_INCREF(unicode_empty); 7115 return unicode_empty; 7116 } 7117 7118 7119 converted = decode_code_page_strict(code_page, &v, 7120 s, chunk_size); 7121 if (converted == -2) 7122 converted = decode_code_page_errors(code_page, &v, 7123 s, chunk_size, 7124 errors); 7125 assert(converted != 0); 7126 7127 if (converted < 0) { 7128 Py_XDECREF(v); 7129 return NULL; 7130 } 7131 7132 if (consumed) 7133 *consumed += converted; 7134 7135 s += converted; 7136 size -= converted; 7137 } while (!done); 7138 7139#ifndef DONT_MAKE_RESULT_READY 7140 if (_PyUnicode_READY_REPLACE(&v)) { 7141 Py_DECREF(v); 7142 return NULL; 7143 } 7144#endif 7145 assert(_PyUnicode_CheckConsistency(v, 1)); 7146 return 
v; 7147} 7148 7149PyObject * 7150PyUnicode_DecodeCodePageStateful(int code_page, 7151 const char *s, 7152 Py_ssize_t size, 7153 const char *errors, 7154 Py_ssize_t *consumed) 7155{ 7156 return decode_code_page_stateful(code_page, s, size, errors, consumed); 7157} 7158 7159PyObject * 7160PyUnicode_DecodeMBCSStateful(const char *s, 7161 Py_ssize_t size, 7162 const char *errors, 7163 Py_ssize_t *consumed) 7164{ 7165 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed); 7166} 7167 7168PyObject * 7169PyUnicode_DecodeMBCS(const char *s, 7170 Py_ssize_t size, 7171 const char *errors) 7172{ 7173 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL); 7174} 7175 7176static DWORD 7177encode_code_page_flags(UINT code_page, const char *errors) 7178{ 7179 if (code_page == CP_UTF8) { 7180 if (winver.dwMajorVersion >= 6) 7181 /* CP_UTF8 supports WC_ERR_INVALID_CHARS on Windows Vista 7182 and later */ 7183 return WC_ERR_INVALID_CHARS; 7184 else 7185 /* CP_UTF8 only supports flags=0 on Windows older than Vista */ 7186 return 0; 7187 } 7188 else if (code_page == CP_UTF7) { 7189 /* CP_UTF7 only supports flags=0 */ 7190 return 0; 7191 } 7192 else { 7193 if (errors != NULL && strcmp(errors, "replace") == 0) 7194 return 0; 7195 else 7196 return WC_NO_BEST_FIT_CHARS; 7197 } 7198} 7199 7200/* 7201 * Encode a Unicode string to a Windows code page into a byte string in strict 7202 * mode. 7203 * 7204 * Returns consumed characters if succeed, returns -2 on encode error, or raise 7205 * a WindowsError and returns -1 on other error. 
7206 */ 7207static int 7208encode_code_page_strict(UINT code_page, PyObject **outbytes, 7209 PyObject *unicode, Py_ssize_t offset, int len, 7210 const char* errors) 7211{ 7212 BOOL usedDefaultChar = FALSE; 7213 BOOL *pusedDefaultChar = &usedDefaultChar; 7214 int outsize; 7215 PyObject *exc = NULL; 7216 wchar_t *p; 7217 Py_ssize_t size; 7218 const DWORD flags = encode_code_page_flags(code_page, NULL); 7219 char *out; 7220 /* Create a substring so that we can get the UTF-16 representation 7221 of just the slice under consideration. */ 7222 PyObject *substring; 7223 7224 assert(len > 0); 7225 7226 if (code_page != CP_UTF8 && code_page != CP_UTF7) 7227 pusedDefaultChar = &usedDefaultChar; 7228 else 7229 pusedDefaultChar = NULL; 7230 7231 substring = PyUnicode_Substring(unicode, offset, offset+len); 7232 if (substring == NULL) 7233 return -1; 7234 p = PyUnicode_AsUnicodeAndSize(substring, &size); 7235 if (p == NULL) { 7236 Py_DECREF(substring); 7237 return -1; 7238 } 7239 7240 /* First get the size of the result */ 7241 outsize = WideCharToMultiByte(code_page, flags, 7242 p, size, 7243 NULL, 0, 7244 NULL, pusedDefaultChar); 7245 if (outsize <= 0) 7246 goto error; 7247 /* If we used a default char, then we failed! 
*/ 7248 if (pusedDefaultChar && *pusedDefaultChar) { 7249 Py_DECREF(substring); 7250 return -2; 7251 } 7252 7253 if (*outbytes == NULL) { 7254 /* Create string object */ 7255 *outbytes = PyBytes_FromStringAndSize(NULL, outsize); 7256 if (*outbytes == NULL) { 7257 Py_DECREF(substring); 7258 return -1; 7259 } 7260 out = PyBytes_AS_STRING(*outbytes); 7261 } 7262 else { 7263 /* Extend string object */ 7264 const Py_ssize_t n = PyBytes_Size(*outbytes); 7265 if (outsize > PY_SSIZE_T_MAX - n) { 7266 PyErr_NoMemory(); 7267 Py_DECREF(substring); 7268 return -1; 7269 } 7270 if (_PyBytes_Resize(outbytes, n + outsize) < 0) { 7271 Py_DECREF(substring); 7272 return -1; 7273 } 7274 out = PyBytes_AS_STRING(*outbytes) + n; 7275 } 7276 7277 /* Do the conversion */ 7278 outsize = WideCharToMultiByte(code_page, flags, 7279 p, size, 7280 out, outsize, 7281 NULL, pusedDefaultChar); 7282 Py_CLEAR(substring); 7283 if (outsize <= 0) 7284 goto error; 7285 if (pusedDefaultChar && *pusedDefaultChar) 7286 return -2; 7287 return 0; 7288 7289error: 7290 Py_XDECREF(substring); 7291 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION) 7292 return -2; 7293 PyErr_SetFromWindowsErr(0); 7294 return -1; 7295} 7296 7297/* 7298 * Encode a Unicode string to a Windows code page into a byte string using a 7299 * error handler. 7300 * 7301 * Returns consumed characters if succeed, or raise a WindowsError and returns 7302 * -1 on other error. 7303 */ 7304static int 7305encode_code_page_errors(UINT code_page, PyObject **outbytes, 7306 PyObject *unicode, Py_ssize_t unicode_offset, 7307 Py_ssize_t insize, const char* errors) 7308{ 7309 const DWORD flags = encode_code_page_flags(code_page, errors); 7310 Py_ssize_t pos = unicode_offset; 7311 Py_ssize_t endin = unicode_offset + insize; 7312 /* Ideally, we should get reason from FormatMessage. This is the Windows 7313 2000 English version of the message. 
*/ 7314 const char *reason = "invalid character"; 7315 /* 4=maximum length of a UTF-8 sequence */ 7316 char buffer[4]; 7317 BOOL usedDefaultChar = FALSE, *pusedDefaultChar; 7318 Py_ssize_t outsize; 7319 char *out; 7320 PyObject *errorHandler = NULL; 7321 PyObject *exc = NULL; 7322 PyObject *encoding_obj = NULL; 7323 char *encoding; 7324 Py_ssize_t newpos, newoutsize; 7325 PyObject *rep; 7326 int ret = -1; 7327 7328 assert(insize > 0); 7329 7330 encoding = code_page_name(code_page, &encoding_obj); 7331 if (encoding == NULL) 7332 return -1; 7333 7334 if (errors == NULL || strcmp(errors, "strict") == 0) { 7335 /* The last error was ERROR_NO_UNICODE_TRANSLATION, 7336 then we raise a UnicodeEncodeError. */ 7337 make_encode_exception(&exc, encoding, unicode, 0, 0, reason); 7338 if (exc != NULL) { 7339 PyCodec_StrictErrors(exc); 7340 Py_DECREF(exc); 7341 } 7342 Py_XDECREF(encoding_obj); 7343 return -1; 7344 } 7345 7346 if (code_page != CP_UTF8 && code_page != CP_UTF7) 7347 pusedDefaultChar = &usedDefaultChar; 7348 else 7349 pusedDefaultChar = NULL; 7350 7351 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) { 7352 PyErr_NoMemory(); 7353 goto error; 7354 } 7355 outsize = insize * Py_ARRAY_LENGTH(buffer); 7356 7357 if (*outbytes == NULL) { 7358 /* Create string object */ 7359 *outbytes = PyBytes_FromStringAndSize(NULL, outsize); 7360 if (*outbytes == NULL) 7361 goto error; 7362 out = PyBytes_AS_STRING(*outbytes); 7363 } 7364 else { 7365 /* Extend string object */ 7366 Py_ssize_t n = PyBytes_Size(*outbytes); 7367 if (n > PY_SSIZE_T_MAX - outsize) { 7368 PyErr_NoMemory(); 7369 goto error; 7370 } 7371 if (_PyBytes_Resize(outbytes, n + outsize) < 0) 7372 goto error; 7373 out = PyBytes_AS_STRING(*outbytes) + n; 7374 } 7375 7376 /* Encode the string character per character */ 7377 while (pos < endin) 7378 { 7379 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos); 7380 wchar_t chars[2]; 7381 int charsize; 7382 if (ch < 0x10000) { 7383 chars[0] = (wchar_t)ch; 7384 charsize = 1; 
7385 } 7386 else { 7387 ch -= 0x10000; 7388 chars[0] = 0xd800 + (ch >> 10); 7389 chars[1] = 0xdc00 + (ch & 0x3ff); 7390 charsize = 2; 7391 } 7392 7393 outsize = WideCharToMultiByte(code_page, flags, 7394 chars, charsize, 7395 buffer, Py_ARRAY_LENGTH(buffer), 7396 NULL, pusedDefaultChar); 7397 if (outsize > 0) { 7398 if (pusedDefaultChar == NULL || !(*pusedDefaultChar)) 7399 { 7400 pos++; 7401 memcpy(out, buffer, outsize); 7402 out += outsize; 7403 continue; 7404 } 7405 } 7406 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) { 7407 PyErr_SetFromWindowsErr(0); 7408 goto error; 7409 } 7410 7411 rep = unicode_encode_call_errorhandler( 7412 errors, &errorHandler, encoding, reason, 7413 unicode, &exc, 7414 pos, pos + 1, &newpos); 7415 if (rep == NULL) 7416 goto error; 7417 pos = newpos; 7418 7419 if (PyBytes_Check(rep)) { 7420 outsize = PyBytes_GET_SIZE(rep); 7421 if (outsize != 1) { 7422 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes); 7423 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1); 7424 if (_PyBytes_Resize(outbytes, newoutsize) < 0) { 7425 Py_DECREF(rep); 7426 goto error; 7427 } 7428 out = PyBytes_AS_STRING(*outbytes) + offset; 7429 } 7430 memcpy(out, PyBytes_AS_STRING(rep), outsize); 7431 out += outsize; 7432 } 7433 else { 7434 Py_ssize_t i; 7435 enum PyUnicode_Kind kind; 7436 void *data; 7437 7438 if (PyUnicode_READY(rep) < 0) { 7439 Py_DECREF(rep); 7440 goto error; 7441 } 7442 7443 outsize = PyUnicode_GET_LENGTH(rep); 7444 if (outsize != 1) { 7445 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes); 7446 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1); 7447 if (_PyBytes_Resize(outbytes, newoutsize) < 0) { 7448 Py_DECREF(rep); 7449 goto error; 7450 } 7451 out = PyBytes_AS_STRING(*outbytes) + offset; 7452 } 7453 kind = PyUnicode_KIND(rep); 7454 data = PyUnicode_DATA(rep); 7455 for (i=0; i < outsize; i++) { 7456 Py_UCS4 ch = PyUnicode_READ(kind, data, i); 7457 if (ch > 127) { 7458 raise_encode_exception(&exc, 7459 encoding, 
unicode, 7460 pos, pos + 1, 7461 "unable to encode error handler result to ASCII"); 7462 Py_DECREF(rep); 7463 goto error; 7464 } 7465 *out = (unsigned char)ch; 7466 out++; 7467 } 7468 } 7469 Py_DECREF(rep); 7470 } 7471 /* write a NUL byte */ 7472 *out = 0; 7473 outsize = out - PyBytes_AS_STRING(*outbytes); 7474 assert(outsize <= PyBytes_GET_SIZE(*outbytes)); 7475 if (_PyBytes_Resize(outbytes, outsize) < 0) 7476 goto error; 7477 ret = 0; 7478 7479error: 7480 Py_XDECREF(encoding_obj); 7481 Py_XDECREF(errorHandler); 7482 Py_XDECREF(exc); 7483 return ret; 7484} 7485 7486static PyObject * 7487encode_code_page(int code_page, 7488 PyObject *unicode, 7489 const char *errors) 7490{ 7491 Py_ssize_t len; 7492 PyObject *outbytes = NULL; 7493 Py_ssize_t offset; 7494 int chunk_len, ret, done; 7495 7496 if (PyUnicode_READY(unicode) < 0) 7497 return NULL; 7498 len = PyUnicode_GET_LENGTH(unicode); 7499 7500 if (code_page < 0) { 7501 PyErr_SetString(PyExc_ValueError, "invalid code page number"); 7502 return NULL; 7503 } 7504 7505 if (len == 0) 7506 return PyBytes_FromStringAndSize(NULL, 0); 7507 7508 offset = 0; 7509 do 7510 { 7511#ifdef NEED_RETRY 7512 /* UTF-16 encoding may double the size, so use only INT_MAX/2 7513 chunks. 
*/ 7514 if (len > INT_MAX/2) { 7515 chunk_len = INT_MAX/2; 7516 done = 0; 7517 } 7518 else 7519#endif 7520 { 7521 chunk_len = (int)len; 7522 done = 1; 7523 } 7524 7525 ret = encode_code_page_strict(code_page, &outbytes, 7526 unicode, offset, chunk_len, 7527 errors); 7528 if (ret == -2) 7529 ret = encode_code_page_errors(code_page, &outbytes, 7530 unicode, offset, 7531 chunk_len, errors); 7532 if (ret < 0) { 7533 Py_XDECREF(outbytes); 7534 return NULL; 7535 } 7536 7537 offset += chunk_len; 7538 len -= chunk_len; 7539 } while (!done); 7540 7541 return outbytes; 7542} 7543 7544PyObject * 7545PyUnicode_EncodeMBCS(const Py_UNICODE *p, 7546 Py_ssize_t size, 7547 const char *errors) 7548{ 7549 PyObject *unicode, *res; 7550 unicode = PyUnicode_FromUnicode(p, size); 7551 if (unicode == NULL) 7552 return NULL; 7553 res = encode_code_page(CP_ACP, unicode, errors); 7554 Py_DECREF(unicode); 7555 return res; 7556} 7557 7558PyObject * 7559PyUnicode_EncodeCodePage(int code_page, 7560 PyObject *unicode, 7561 const char *errors) 7562{ 7563 return encode_code_page(code_page, unicode, errors); 7564} 7565 7566PyObject * 7567PyUnicode_AsMBCSString(PyObject *unicode) 7568{ 7569 if (!PyUnicode_Check(unicode)) { 7570 PyErr_BadArgument(); 7571 return NULL; 7572 } 7573 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL); 7574} 7575 7576#undef NEED_RETRY 7577 7578#endif /* HAVE_MBCS */ 7579 7580/* --- Character Mapping Codec -------------------------------------------- */ 7581 7582PyObject * 7583PyUnicode_DecodeCharmap(const char *s, 7584 Py_ssize_t size, 7585 PyObject *mapping, 7586 const char *errors) 7587{ 7588 const char *starts = s; 7589 Py_ssize_t startinpos; 7590 Py_ssize_t endinpos; 7591 Py_ssize_t outpos; 7592 const char *e; 7593 PyObject *v; 7594 Py_ssize_t extrachars = 0; 7595 PyObject *errorHandler = NULL; 7596 PyObject *exc = NULL; 7597 7598 /* Default to Latin-1 */ 7599 if (mapping == NULL) 7600 return PyUnicode_DecodeLatin1(s, size, errors); 7601 7602 v = PyUnicode_New(size, 
127); 7603 if (v == NULL) 7604 goto onError; 7605 if (size == 0) 7606 return v; 7607 outpos = 0; 7608 e = s + size; 7609 if (PyUnicode_CheckExact(mapping)) { 7610 Py_ssize_t maplen; 7611 enum PyUnicode_Kind kind; 7612 void *data; 7613 Py_UCS4 x; 7614 7615 if (PyUnicode_READY(mapping) < 0) 7616 return NULL; 7617 7618 maplen = PyUnicode_GET_LENGTH(mapping); 7619 data = PyUnicode_DATA(mapping); 7620 kind = PyUnicode_KIND(mapping); 7621 while (s < e) { 7622 unsigned char ch = *s; 7623 7624 if (ch < maplen) 7625 x = PyUnicode_READ(kind, data, ch); 7626 else 7627 x = 0xfffe; /* invalid value */ 7628 7629 if (x == 0xfffe) 7630 { 7631 /* undefined mapping */ 7632 startinpos = s-starts; 7633 endinpos = startinpos+1; 7634 if (unicode_decode_call_errorhandler( 7635 errors, &errorHandler, 7636 "charmap", "character maps to <undefined>", 7637 &starts, &e, &startinpos, &endinpos, &exc, &s, 7638 &v, &outpos)) { 7639 goto onError; 7640 } 7641 continue; 7642 } 7643 7644 if (unicode_putchar(&v, &outpos, x) < 0) 7645 goto onError; 7646 ++s; 7647 } 7648 } 7649 else { 7650 while (s < e) { 7651 unsigned char ch = *s; 7652 PyObject *w, *x; 7653 7654 /* Get mapping (char ordinal -> integer, Unicode char or None) */ 7655 w = PyLong_FromLong((long)ch); 7656 if (w == NULL) 7657 goto onError; 7658 x = PyObject_GetItem(mapping, w); 7659 Py_DECREF(w); 7660 if (x == NULL) { 7661 if (PyErr_ExceptionMatches(PyExc_LookupError)) { 7662 /* No mapping found means: mapping is undefined. 
*/ 7663 PyErr_Clear(); 7664 x = Py_None; 7665 Py_INCREF(x); 7666 } else 7667 goto onError; 7668 } 7669 7670 /* Apply mapping */ 7671 if (PyLong_Check(x)) { 7672 long value = PyLong_AS_LONG(x); 7673 if (value < 0 || value > 65535) { 7674 PyErr_SetString(PyExc_TypeError, 7675 "character mapping must be in range(65536)"); 7676 Py_DECREF(x); 7677 goto onError; 7678 } 7679 if (unicode_putchar(&v, &outpos, value) < 0) 7680 goto onError; 7681 } 7682 else if (x == Py_None) { 7683 /* undefined mapping */ 7684 startinpos = s-starts; 7685 endinpos = startinpos+1; 7686 if (unicode_decode_call_errorhandler( 7687 errors, &errorHandler, 7688 "charmap", "character maps to <undefined>", 7689 &starts, &e, &startinpos, &endinpos, &exc, &s, 7690 &v, &outpos)) { 7691 Py_DECREF(x); 7692 goto onError; 7693 } 7694 Py_DECREF(x); 7695 continue; 7696 } 7697 else if (PyUnicode_Check(x)) { 7698 Py_ssize_t targetsize; 7699 7700 if (PyUnicode_READY(x) < 0) 7701 goto onError; 7702 targetsize = PyUnicode_GET_LENGTH(x); 7703 7704 if (targetsize == 1) { 7705 /* 1-1 mapping */ 7706 if (unicode_putchar(&v, &outpos, 7707 PyUnicode_READ_CHAR(x, 0)) < 0) 7708 goto onError; 7709 } 7710 else if (targetsize > 1) { 7711 /* 1-n mapping */ 7712 if (targetsize > extrachars) { 7713 /* resize first */ 7714 Py_ssize_t needed = (targetsize - extrachars) + \ 7715 (targetsize << 2); 7716 extrachars += needed; 7717 /* XXX overflow detection missing */ 7718 if (PyUnicode_Resize(&v, 7719 PyUnicode_GET_LENGTH(v) + needed) < 0) { 7720 Py_DECREF(x); 7721 goto onError; 7722 } 7723 } 7724 if (unicode_widen(&v, PyUnicode_MAX_CHAR_VALUE(x)) < 0) 7725 goto onError; 7726 PyUnicode_CopyCharacters(v, outpos, x, 0, targetsize); 7727 outpos += targetsize; 7728 extrachars -= targetsize; 7729 } 7730 /* 1-0 mapping: skip the character */ 7731 } 7732 else { 7733 /* wrong return value */ 7734 PyErr_SetString(PyExc_TypeError, 7735 "character mapping must return integer, None or str"); 7736 Py_DECREF(x); 7737 goto onError; 7738 } 7739 
Py_DECREF(x); 7740 ++s; 7741 } 7742 } 7743 if (PyUnicode_Resize(&v, outpos) < 0) 7744 goto onError; 7745 Py_XDECREF(errorHandler); 7746 Py_XDECREF(exc); 7747 assert(_PyUnicode_CheckConsistency(v, 1)); 7748 return v; 7749 7750 onError: 7751 Py_XDECREF(errorHandler); 7752 Py_XDECREF(exc); 7753 Py_XDECREF(v); 7754 return NULL; 7755} 7756 7757/* Charmap encoding: the lookup table */ 7758 7759struct encoding_map { 7760 PyObject_HEAD 7761 unsigned char level1[32]; 7762 int count2, count3; 7763 unsigned char level23[1]; 7764}; 7765 7766static PyObject* 7767encoding_map_size(PyObject *obj, PyObject* args) 7768{ 7769 struct encoding_map *map = (struct encoding_map*)obj; 7770 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 + 7771 128*map->count3); 7772} 7773 7774static PyMethodDef encoding_map_methods[] = { 7775 {"size", encoding_map_size, METH_NOARGS, 7776 PyDoc_STR("Return the size (in bytes) of this object") }, 7777 { 0 } 7778}; 7779 7780static void 7781encoding_map_dealloc(PyObject* o) 7782{ 7783 PyObject_FREE(o); 7784} 7785 7786static PyTypeObject EncodingMapType = { 7787 PyVarObject_HEAD_INIT(NULL, 0) 7788 "EncodingMap", /*tp_name*/ 7789 sizeof(struct encoding_map), /*tp_basicsize*/ 7790 0, /*tp_itemsize*/ 7791 /* methods */ 7792 encoding_map_dealloc, /*tp_dealloc*/ 7793 0, /*tp_print*/ 7794 0, /*tp_getattr*/ 7795 0, /*tp_setattr*/ 7796 0, /*tp_reserved*/ 7797 0, /*tp_repr*/ 7798 0, /*tp_as_number*/ 7799 0, /*tp_as_sequence*/ 7800 0, /*tp_as_mapping*/ 7801 0, /*tp_hash*/ 7802 0, /*tp_call*/ 7803 0, /*tp_str*/ 7804 0, /*tp_getattro*/ 7805 0, /*tp_setattro*/ 7806 0, /*tp_as_buffer*/ 7807 Py_TPFLAGS_DEFAULT, /*tp_flags*/ 7808 0, /*tp_doc*/ 7809 0, /*tp_traverse*/ 7810 0, /*tp_clear*/ 7811 0, /*tp_richcompare*/ 7812 0, /*tp_weaklistoffset*/ 7813 0, /*tp_iter*/ 7814 0, /*tp_iternext*/ 7815 encoding_map_methods, /*tp_methods*/ 7816 0, /*tp_members*/ 7817 0, /*tp_getset*/ 7818 0, /*tp_base*/ 7819 0, /*tp_dict*/ 7820 0, /*tp_descr_get*/ 7821 0, /*tp_descr_set*/ 7822 
0, /*tp_dictoffset*/ 7823 0, /*tp_init*/ 7824 0, /*tp_alloc*/ 7825 0, /*tp_new*/ 7826 0, /*tp_free*/ 7827 0, /*tp_is_gc*/ 7828}; 7829 7830PyObject* 7831PyUnicode_BuildEncodingMap(PyObject* string) 7832{ 7833 PyObject *result; 7834 struct encoding_map *mresult; 7835 int i; 7836 int need_dict = 0; 7837 unsigned char level1[32]; 7838 unsigned char level2[512]; 7839 unsigned char *mlevel1, *mlevel2, *mlevel3; 7840 int count2 = 0, count3 = 0; 7841 int kind; 7842 void *data; 7843 Py_UCS4 ch; 7844 7845 if (!PyUnicode_Check(string) || PyUnicode_GET_LENGTH(string) != 256) { 7846 PyErr_BadArgument(); 7847 return NULL; 7848 } 7849 kind = PyUnicode_KIND(string); 7850 data = PyUnicode_DATA(string); 7851 memset(level1, 0xFF, sizeof level1); 7852 memset(level2, 0xFF, sizeof level2); 7853 7854 /* If there isn't a one-to-one mapping of NULL to \0, 7855 or if there are non-BMP characters, we need to use 7856 a mapping dictionary. */ 7857 if (PyUnicode_READ(kind, data, 0) != 0) 7858 need_dict = 1; 7859 for (i = 1; i < 256; i++) { 7860 int l1, l2; 7861 ch = PyUnicode_READ(kind, data, i); 7862 if (ch == 0 || ch > 0xFFFF) { 7863 need_dict = 1; 7864 break; 7865 } 7866 if (ch == 0xFFFE) 7867 /* unmapped character */ 7868 continue; 7869 l1 = ch >> 11; 7870 l2 = ch >> 7; 7871 if (level1[l1] == 0xFF) 7872 level1[l1] = count2++; 7873 if (level2[l2] == 0xFF) 7874 level2[l2] = count3++; 7875 } 7876 7877 if (count2 >= 0xFF || count3 >= 0xFF) 7878 need_dict = 1; 7879 7880 if (need_dict) { 7881 PyObject *result = PyDict_New(); 7882 PyObject *key, *value; 7883 if (!result) 7884 return NULL; 7885 for (i = 0; i < 256; i++) { 7886 key = PyLong_FromLong(PyUnicode_READ(kind, data, i)); 7887 value = PyLong_FromLong(i); 7888 if (!key || !value) 7889 goto failed1; 7890 if (PyDict_SetItem(result, key, value) == -1) 7891 goto failed1; 7892 Py_DECREF(key); 7893 Py_DECREF(value); 7894 } 7895 return result; 7896 failed1: 7897 Py_XDECREF(key); 7898 Py_XDECREF(value); 7899 Py_DECREF(result); 7900 return NULL; 
7901 } 7902 7903 /* Create a three-level trie */ 7904 result = PyObject_MALLOC(sizeof(struct encoding_map) + 7905 16*count2 + 128*count3 - 1); 7906 if (!result) 7907 return PyErr_NoMemory(); 7908 PyObject_Init(result, &EncodingMapType); 7909 mresult = (struct encoding_map*)result; 7910 mresult->count2 = count2; 7911 mresult->count3 = count3; 7912 mlevel1 = mresult->level1; 7913 mlevel2 = mresult->level23; 7914 mlevel3 = mresult->level23 + 16*count2; 7915 memcpy(mlevel1, level1, 32); 7916 memset(mlevel2, 0xFF, 16*count2); 7917 memset(mlevel3, 0, 128*count3); 7918 count3 = 0; 7919 for (i = 1; i < 256; i++) { 7920 int o1, o2, o3, i2, i3; 7921 if (PyUnicode_READ(kind, data, i) == 0xFFFE) 7922 /* unmapped character */ 7923 continue; 7924 o1 = PyUnicode_READ(kind, data, i)>>11; 7925 o2 = (PyUnicode_READ(kind, data, i)>>7) & 0xF; 7926 i2 = 16*mlevel1[o1] + o2; 7927 if (mlevel2[i2] == 0xFF) 7928 mlevel2[i2] = count3++; 7929 o3 = PyUnicode_READ(kind, data, i) & 0x7F; 7930 i3 = 128*mlevel2[i2] + o3; 7931 mlevel3[i3] = i; 7932 } 7933 return result; 7934} 7935 7936static int 7937encoding_map_lookup(Py_UNICODE c, PyObject *mapping) 7938{ 7939 struct encoding_map *map = (struct encoding_map*)mapping; 7940 int l1 = c>>11; 7941 int l2 = (c>>7) & 0xF; 7942 int l3 = c & 0x7F; 7943 int i; 7944 7945#ifdef Py_UNICODE_WIDE 7946 if (c > 0xFFFF) { 7947 return -1; 7948 } 7949#endif 7950 if (c == 0) 7951 return 0; 7952 /* level 1*/ 7953 i = map->level1[l1]; 7954 if (i == 0xFF) { 7955 return -1; 7956 } 7957 /* level 2*/ 7958 i = map->level23[16*i+l2]; 7959 if (i == 0xFF) { 7960 return -1; 7961 } 7962 /* level 3 */ 7963 i = map->level23[16*map->count2 + 128*i + l3]; 7964 if (i == 0) { 7965 return -1; 7966 } 7967 return i; 7968} 7969 7970/* Lookup the character ch in the mapping. If the character 7971 can't be found, Py_None is returned (or NULL, if another 7972 error occurred). 
*/ 7973static PyObject * 7974charmapencode_lookup(Py_UNICODE c, PyObject *mapping) 7975{ 7976 PyObject *w = PyLong_FromLong((long)c); 7977 PyObject *x; 7978 7979 if (w == NULL) 7980 return NULL; 7981 x = PyObject_GetItem(mapping, w); 7982 Py_DECREF(w); 7983 if (x == NULL) { 7984 if (PyErr_ExceptionMatches(PyExc_LookupError)) { 7985 /* No mapping found means: mapping is undefined. */ 7986 PyErr_Clear(); 7987 x = Py_None; 7988 Py_INCREF(x); 7989 return x; 7990 } else 7991 return NULL; 7992 } 7993 else if (x == Py_None) 7994 return x; 7995 else if (PyLong_Check(x)) { 7996 long value = PyLong_AS_LONG(x); 7997 if (value < 0 || value > 255) { 7998 PyErr_SetString(PyExc_TypeError, 7999 "character mapping must be in range(256)"); 8000 Py_DECREF(x); 8001 return NULL; 8002 } 8003 return x; 8004 } 8005 else if (PyBytes_Check(x)) 8006 return x; 8007 else { 8008 /* wrong return value */ 8009 PyErr_Format(PyExc_TypeError, 8010 "character mapping must return integer, bytes or None, not %.400s", 8011 x->ob_type->tp_name); 8012 Py_DECREF(x); 8013 return NULL; 8014 } 8015} 8016 8017static int 8018charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize) 8019{ 8020 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj); 8021 /* exponentially overallocate to minimize reallocations */ 8022 if (requiredsize < 2*outsize) 8023 requiredsize = 2*outsize; 8024 if (_PyBytes_Resize(outobj, requiredsize)) 8025 return -1; 8026 return 0; 8027} 8028 8029typedef enum charmapencode_result { 8030 enc_SUCCESS, enc_FAILED, enc_EXCEPTION 8031} charmapencode_result; 8032/* lookup the character, put the result in the output string and adjust 8033 various state variables. Resize the output bytes object if not enough 8034 space is available. Return a new reference to the object that 8035 was put in the output buffer, or Py_None, if the mapping was undefined 8036 (in which case no character was written) or NULL, if a 8037 reallocation error occurred. 
The caller must decref the result */ 8038static charmapencode_result 8039charmapencode_output(Py_UNICODE c, PyObject *mapping, 8040 PyObject **outobj, Py_ssize_t *outpos) 8041{ 8042 PyObject *rep; 8043 char *outstart; 8044 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj); 8045 8046 if (Py_TYPE(mapping) == &EncodingMapType) { 8047 int res = encoding_map_lookup(c, mapping); 8048 Py_ssize_t requiredsize = *outpos+1; 8049 if (res == -1) 8050 return enc_FAILED; 8051 if (outsize<requiredsize) 8052 if (charmapencode_resize(outobj, outpos, requiredsize)) 8053 return enc_EXCEPTION; 8054 outstart = PyBytes_AS_STRING(*outobj); 8055 outstart[(*outpos)++] = (char)res; 8056 return enc_SUCCESS; 8057 } 8058 8059 rep = charmapencode_lookup(c, mapping); 8060 if (rep==NULL) 8061 return enc_EXCEPTION; 8062 else if (rep==Py_None) { 8063 Py_DECREF(rep); 8064 return enc_FAILED; 8065 } else { 8066 if (PyLong_Check(rep)) { 8067 Py_ssize_t requiredsize = *outpos+1; 8068 if (outsize<requiredsize) 8069 if (charmapencode_resize(outobj, outpos, requiredsize)) { 8070 Py_DECREF(rep); 8071 return enc_EXCEPTION; 8072 } 8073 outstart = PyBytes_AS_STRING(*outobj); 8074 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep); 8075 } 8076 else { 8077 const char *repchars = PyBytes_AS_STRING(rep); 8078 Py_ssize_t repsize = PyBytes_GET_SIZE(rep); 8079 Py_ssize_t requiredsize = *outpos+repsize; 8080 if (outsize<requiredsize) 8081 if (charmapencode_resize(outobj, outpos, requiredsize)) { 8082 Py_DECREF(rep); 8083 return enc_EXCEPTION; 8084 } 8085 outstart = PyBytes_AS_STRING(*outobj); 8086 memcpy(outstart + *outpos, repchars, repsize); 8087 *outpos += repsize; 8088 } 8089 } 8090 Py_DECREF(rep); 8091 return enc_SUCCESS; 8092} 8093 8094/* handle an error in PyUnicode_EncodeCharmap 8095 Return 0 on success, -1 on error */ 8096static int 8097charmap_encoding_error( 8098 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping, 8099 PyObject **exceptionObject, 8100 int *known_errorHandler, PyObject **errorHandler, const 
char *errors, 8101 PyObject **res, Py_ssize_t *respos) 8102{ 8103 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */ 8104 Py_ssize_t size, repsize; 8105 Py_ssize_t newpos; 8106 Py_UNICODE *uni2; 8107 /* startpos for collecting unencodable chars */ 8108 Py_ssize_t collstartpos = *inpos; 8109 Py_ssize_t collendpos = *inpos+1; 8110 Py_ssize_t collpos; 8111 char *encoding = "charmap"; 8112 char *reason = "character maps to <undefined>"; 8113 charmapencode_result x; 8114 Py_UCS4 ch; 8115 int val; 8116 8117 if (PyUnicode_READY(unicode) < 0) 8118 return -1; 8119 size = PyUnicode_GET_LENGTH(unicode); 8120 /* find all unencodable characters */ 8121 while (collendpos < size) { 8122 PyObject *rep; 8123 if (Py_TYPE(mapping) == &EncodingMapType) { 8124 ch = PyUnicode_READ_CHAR(unicode, collendpos); 8125 val = encoding_map_lookup(ch, mapping); 8126 if (val != -1) 8127 break; 8128 ++collendpos; 8129 continue; 8130 } 8131 8132 ch = PyUnicode_READ_CHAR(unicode, collendpos); 8133 rep = charmapencode_lookup(ch, mapping); 8134 if (rep==NULL) 8135 return -1; 8136 else if (rep!=Py_None) { 8137 Py_DECREF(rep); 8138 break; 8139 } 8140 Py_DECREF(rep); 8141 ++collendpos; 8142 } 8143 /* cache callback name lookup 8144 * (if not done yet, i.e. 
it's the first error) */ 8145 if (*known_errorHandler==-1) { 8146 if ((errors==NULL) || (!strcmp(errors, "strict"))) 8147 *known_errorHandler = 1; 8148 else if (!strcmp(errors, "replace")) 8149 *known_errorHandler = 2; 8150 else if (!strcmp(errors, "ignore")) 8151 *known_errorHandler = 3; 8152 else if (!strcmp(errors, "xmlcharrefreplace")) 8153 *known_errorHandler = 4; 8154 else 8155 *known_errorHandler = 0; 8156 } 8157 switch (*known_errorHandler) { 8158 case 1: /* strict */ 8159 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason); 8160 return -1; 8161 case 2: /* replace */ 8162 for (collpos = collstartpos; collpos<collendpos; ++collpos) { 8163 x = charmapencode_output('?', mapping, res, respos); 8164 if (x==enc_EXCEPTION) { 8165 return -1; 8166 } 8167 else if (x==enc_FAILED) { 8168 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason); 8169 return -1; 8170 } 8171 } 8172 /* fall through */ 8173 case 3: /* ignore */ 8174 *inpos = collendpos; 8175 break; 8176 case 4: /* xmlcharrefreplace */ 8177 /* generate replacement (temporarily (mis)uses p) */ 8178 for (collpos = collstartpos; collpos < collendpos; ++collpos) { 8179 char buffer[2+29+1+1]; 8180 char *cp; 8181 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos)); 8182 for (cp = buffer; *cp; ++cp) { 8183 x = charmapencode_output(*cp, mapping, res, respos); 8184 if (x==enc_EXCEPTION) 8185 return -1; 8186 else if (x==enc_FAILED) { 8187 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason); 8188 return -1; 8189 } 8190 } 8191 } 8192 *inpos = collendpos; 8193 break; 8194 default: 8195 repunicode = unicode_encode_call_errorhandler(errors, errorHandler, 8196 encoding, reason, unicode, exceptionObject, 8197 collstartpos, collendpos, &newpos); 8198 if (repunicode == NULL) 8199 return -1; 8200 if (PyBytes_Check(repunicode)) { 8201 /* Directly copy bytes result to output. 
*/ 8202 Py_ssize_t outsize = PyBytes_Size(*res); 8203 Py_ssize_t requiredsize; 8204 repsize = PyBytes_Size(repunicode); 8205 requiredsize = *respos + repsize; 8206 if (requiredsize > outsize) 8207 /* Make room for all additional bytes. */ 8208 if (charmapencode_resize(res, respos, requiredsize)) { 8209 Py_DECREF(repunicode); 8210 return -1; 8211 } 8212 memcpy(PyBytes_AsString(*res) + *respos, 8213 PyBytes_AsString(repunicode), repsize); 8214 *respos += repsize; 8215 *inpos = newpos; 8216 Py_DECREF(repunicode); 8217 break; 8218 } 8219 /* generate replacement */ 8220 repsize = PyUnicode_GET_SIZE(repunicode); 8221 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) { 8222 x = charmapencode_output(*uni2, mapping, res, respos); 8223 if (x==enc_EXCEPTION) { 8224 return -1; 8225 } 8226 else if (x==enc_FAILED) { 8227 Py_DECREF(repunicode); 8228 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason); 8229 return -1; 8230 } 8231 } 8232 *inpos = newpos; 8233 Py_DECREF(repunicode); 8234 } 8235 return 0; 8236} 8237 8238PyObject * 8239_PyUnicode_EncodeCharmap(PyObject *unicode, 8240 PyObject *mapping, 8241 const char *errors) 8242{ 8243 /* output object */ 8244 PyObject *res = NULL; 8245 /* current input position */ 8246 Py_ssize_t inpos = 0; 8247 Py_ssize_t size; 8248 /* current output position */ 8249 Py_ssize_t respos = 0; 8250 PyObject *errorHandler = NULL; 8251 PyObject *exc = NULL; 8252 /* the following variable is used for caching string comparisons 8253 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 8254 * 3=ignore, 4=xmlcharrefreplace */ 8255 int known_errorHandler = -1; 8256 8257 if (PyUnicode_READY(unicode) < 0) 8258 return NULL; 8259 size = PyUnicode_GET_LENGTH(unicode); 8260 8261 /* Default to Latin-1 */ 8262 if (mapping == NULL) 8263 return unicode_encode_ucs1(unicode, errors, 256); 8264 8265 /* allocate enough for a simple encoding without 8266 replacements, if we need more, we'll resize */ 8267 res = 
PyBytes_FromStringAndSize(NULL, size); 8268 if (res == NULL) 8269 goto onError; 8270 if (size == 0) 8271 return res; 8272 8273 while (inpos<size) { 8274 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, inpos); 8275 /* try to encode it */ 8276 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos); 8277 if (x==enc_EXCEPTION) /* error */ 8278 goto onError; 8279 if (x==enc_FAILED) { /* unencodable character */ 8280 if (charmap_encoding_error(unicode, &inpos, mapping, 8281 &exc, 8282 &known_errorHandler, &errorHandler, errors, 8283 &res, &respos)) { 8284 goto onError; 8285 } 8286 } 8287 else 8288 /* done with this character => adjust input position */ 8289 ++inpos; 8290 } 8291 8292 /* Resize if we allocated to much */ 8293 if (respos<PyBytes_GET_SIZE(res)) 8294 if (_PyBytes_Resize(&res, respos) < 0) 8295 goto onError; 8296 8297 Py_XDECREF(exc); 8298 Py_XDECREF(errorHandler); 8299 return res; 8300 8301 onError: 8302 Py_XDECREF(res); 8303 Py_XDECREF(exc); 8304 Py_XDECREF(errorHandler); 8305 return NULL; 8306} 8307 8308/* Deprecated */ 8309PyObject * 8310PyUnicode_EncodeCharmap(const Py_UNICODE *p, 8311 Py_ssize_t size, 8312 PyObject *mapping, 8313 const char *errors) 8314{ 8315 PyObject *result; 8316 PyObject *unicode = PyUnicode_FromUnicode(p, size); 8317 if (unicode == NULL) 8318 return NULL; 8319 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors); 8320 Py_DECREF(unicode); 8321 return result; 8322} 8323 8324PyObject * 8325PyUnicode_AsCharmapString(PyObject *unicode, 8326 PyObject *mapping) 8327{ 8328 if (!PyUnicode_Check(unicode) || mapping == NULL) { 8329 PyErr_BadArgument(); 8330 return NULL; 8331 } 8332 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL); 8333} 8334 8335/* create or adjust a UnicodeTranslateError */ 8336static void 8337make_translate_exception(PyObject **exceptionObject, 8338 PyObject *unicode, 8339 Py_ssize_t startpos, Py_ssize_t endpos, 8340 const char *reason) 8341{ 8342 if (*exceptionObject == NULL) { 8343 
*exceptionObject = _PyUnicodeTranslateError_Create( 8344 unicode, startpos, endpos, reason); 8345 } 8346 else { 8347 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos)) 8348 goto onError; 8349 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos)) 8350 goto onError; 8351 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason)) 8352 goto onError; 8353 return; 8354 onError: 8355 Py_DECREF(*exceptionObject); 8356 *exceptionObject = NULL; 8357 } 8358} 8359 8360/* raises a UnicodeTranslateError */ 8361static void 8362raise_translate_exception(PyObject **exceptionObject, 8363 PyObject *unicode, 8364 Py_ssize_t startpos, Py_ssize_t endpos, 8365 const char *reason) 8366{ 8367 make_translate_exception(exceptionObject, 8368 unicode, startpos, endpos, reason); 8369 if (*exceptionObject != NULL) 8370 PyCodec_StrictErrors(*exceptionObject); 8371} 8372 8373/* error handling callback helper: 8374 build arguments, call the callback and check the arguments, 8375 put the result into newpos and return the replacement string, which 8376 has to be freed by the caller */ 8377static PyObject * 8378unicode_translate_call_errorhandler(const char *errors, 8379 PyObject **errorHandler, 8380 const char *reason, 8381 PyObject *unicode, PyObject **exceptionObject, 8382 Py_ssize_t startpos, Py_ssize_t endpos, 8383 Py_ssize_t *newpos) 8384{ 8385 static char *argparse = "O!n;translating error handler must return (str, int) tuple"; 8386 8387 Py_ssize_t i_newpos; 8388 PyObject *restuple; 8389 PyObject *resunicode; 8390 8391 if (*errorHandler == NULL) { 8392 *errorHandler = PyCodec_LookupError(errors); 8393 if (*errorHandler == NULL) 8394 return NULL; 8395 } 8396 8397 make_translate_exception(exceptionObject, 8398 unicode, startpos, endpos, reason); 8399 if (*exceptionObject == NULL) 8400 return NULL; 8401 8402 restuple = PyObject_CallFunctionObjArgs( 8403 *errorHandler, *exceptionObject, NULL); 8404 if (restuple == NULL) 8405 return NULL; 8406 if 
(!PyTuple_Check(restuple)) { 8407 PyErr_SetString(PyExc_TypeError, &argparse[4]); 8408 Py_DECREF(restuple); 8409 return NULL; 8410 } 8411 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, 8412 &resunicode, &i_newpos)) { 8413 Py_DECREF(restuple); 8414 return NULL; 8415 } 8416 if (i_newpos<0) 8417 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos; 8418 else 8419 *newpos = i_newpos; 8420 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) { 8421 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos); 8422 Py_DECREF(restuple); 8423 return NULL; 8424 } 8425 Py_INCREF(resunicode); 8426 Py_DECREF(restuple); 8427 return resunicode; 8428} 8429 8430/* Lookup the character ch in the mapping and put the result in result, 8431 which must be decrefed by the caller. 8432 Return 0 on success, -1 on error */ 8433static int 8434charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result) 8435{ 8436 PyObject *w = PyLong_FromLong((long)c); 8437 PyObject *x; 8438 8439 if (w == NULL) 8440 return -1; 8441 x = PyObject_GetItem(mapping, w); 8442 Py_DECREF(w); 8443 if (x == NULL) { 8444 if (PyErr_ExceptionMatches(PyExc_LookupError)) { 8445 /* No mapping found means: use 1:1 mapping. 
*/ 8446 PyErr_Clear(); 8447 *result = NULL; 8448 return 0; 8449 } else 8450 return -1; 8451 } 8452 else if (x == Py_None) { 8453 *result = x; 8454 return 0; 8455 } 8456 else if (PyLong_Check(x)) { 8457 long value = PyLong_AS_LONG(x); 8458 long max = PyUnicode_GetMax(); 8459 if (value < 0 || value > max) { 8460 PyErr_Format(PyExc_TypeError, 8461 "character mapping must be in range(0x%x)", max+1); 8462 Py_DECREF(x); 8463 return -1; 8464 } 8465 *result = x; 8466 return 0; 8467 } 8468 else if (PyUnicode_Check(x)) { 8469 *result = x; 8470 return 0; 8471 } 8472 else { 8473 /* wrong return value */ 8474 PyErr_SetString(PyExc_TypeError, 8475 "character mapping must return integer, None or str"); 8476 Py_DECREF(x); 8477 return -1; 8478 } 8479} 8480/* ensure that *outobj is at least requiredsize characters long, 8481 if not reallocate and adjust various state variables. 8482 Return 0 on success, -1 on error */ 8483static int 8484charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize, 8485 Py_ssize_t requiredsize) 8486{ 8487 Py_ssize_t oldsize = *psize; 8488 if (requiredsize > oldsize) { 8489 /* exponentially overallocate to minimize reallocations */ 8490 if (requiredsize < 2 * oldsize) 8491 requiredsize = 2 * oldsize; 8492 *outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4)); 8493 if (*outobj == 0) 8494 return -1; 8495 *psize = requiredsize; 8496 } 8497 return 0; 8498} 8499/* lookup the character, put the result in the output string and adjust 8500 various state variables. Return a new reference to the object that 8501 was put in the output buffer in *result, or Py_None, if the mapping was 8502 undefined (in which case no character was written). 8503 The called must decref result. 8504 Return 0 on success, -1 on error. 
*/ 8505static int 8506charmaptranslate_output(PyObject *input, Py_ssize_t ipos, 8507 PyObject *mapping, Py_UCS4 **output, 8508 Py_ssize_t *osize, Py_ssize_t *opos, 8509 PyObject **res) 8510{ 8511 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos); 8512 if (charmaptranslate_lookup(curinp, mapping, res)) 8513 return -1; 8514 if (*res==NULL) { 8515 /* not found => default to 1:1 mapping */ 8516 (*output)[(*opos)++] = curinp; 8517 } 8518 else if (*res==Py_None) 8519 ; 8520 else if (PyLong_Check(*res)) { 8521 /* no overflow check, because we know that the space is enough */ 8522 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res); 8523 } 8524 else if (PyUnicode_Check(*res)) { 8525 Py_ssize_t repsize; 8526 if (PyUnicode_READY(*res) == -1) 8527 return -1; 8528 repsize = PyUnicode_GET_LENGTH(*res); 8529 if (repsize==1) { 8530 /* no overflow check, because we know that the space is enough */ 8531 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0); 8532 } 8533 else if (repsize!=0) { 8534 /* more than one character */ 8535 Py_ssize_t requiredsize = *opos + 8536 (PyUnicode_GET_LENGTH(input) - ipos) + 8537 repsize - 1; 8538 Py_ssize_t i; 8539 if (charmaptranslate_makespace(output, osize, requiredsize)) 8540 return -1; 8541 for(i = 0; i < repsize; i++) 8542 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i); 8543 } 8544 } 8545 else 8546 return -1; 8547 return 0; 8548} 8549 8550PyObject * 8551_PyUnicode_TranslateCharmap(PyObject *input, 8552 PyObject *mapping, 8553 const char *errors) 8554{ 8555 /* input object */ 8556 char *idata; 8557 Py_ssize_t size, i; 8558 int kind; 8559 /* output buffer */ 8560 Py_UCS4 *output = NULL; 8561 Py_ssize_t osize; 8562 PyObject *res; 8563 /* current output position */ 8564 Py_ssize_t opos; 8565 char *reason = "character maps to <undefined>"; 8566 PyObject *errorHandler = NULL; 8567 PyObject *exc = NULL; 8568 /* the following variable is used for caching string comparisons 8569 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 8570 * 
3=ignore, 4=xmlcharrefreplace */ 8571 int known_errorHandler = -1; 8572 8573 if (mapping == NULL) { 8574 PyErr_BadArgument(); 8575 return NULL; 8576 } 8577 8578 if (PyUnicode_READY(input) == -1) 8579 return NULL; 8580 idata = (char*)PyUnicode_DATA(input); 8581 kind = PyUnicode_KIND(input); 8582 size = PyUnicode_GET_LENGTH(input); 8583 i = 0; 8584 8585 if (size == 0) { 8586 Py_INCREF(input); 8587 return input; 8588 } 8589 8590 /* allocate enough for a simple 1:1 translation without 8591 replacements, if we need more, we'll resize */ 8592 osize = size; 8593 output = PyMem_Malloc(osize * sizeof(Py_UCS4)); 8594 opos = 0; 8595 if (output == NULL) { 8596 PyErr_NoMemory(); 8597 goto onError; 8598 } 8599 8600 while (i<size) { 8601 /* try to encode it */ 8602 PyObject *x = NULL; 8603 if (charmaptranslate_output(input, i, mapping, 8604 &output, &osize, &opos, &x)) { 8605 Py_XDECREF(x); 8606 goto onError; 8607 } 8608 Py_XDECREF(x); 8609 if (x!=Py_None) /* it worked => adjust input pointer */ 8610 ++i; 8611 else { /* untranslatable character */ 8612 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */ 8613 Py_ssize_t repsize; 8614 Py_ssize_t newpos; 8615 Py_ssize_t uni2; 8616 /* startpos for collecting untranslatable chars */ 8617 Py_ssize_t collstart = i; 8618 Py_ssize_t collend = i+1; 8619 Py_ssize_t coll; 8620 8621 /* find all untranslatable characters */ 8622 while (collend < size) { 8623 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x)) 8624 goto onError; 8625 Py_XDECREF(x); 8626 if (x!=Py_None) 8627 break; 8628 ++collend; 8629 } 8630 /* cache callback name lookup 8631 * (if not done yet, i.e. 
it's the first error) */ 8632 if (known_errorHandler==-1) { 8633 if ((errors==NULL) || (!strcmp(errors, "strict"))) 8634 known_errorHandler = 1; 8635 else if (!strcmp(errors, "replace")) 8636 known_errorHandler = 2; 8637 else if (!strcmp(errors, "ignore")) 8638 known_errorHandler = 3; 8639 else if (!strcmp(errors, "xmlcharrefreplace")) 8640 known_errorHandler = 4; 8641 else 8642 known_errorHandler = 0; 8643 } 8644 switch (known_errorHandler) { 8645 case 1: /* strict */ 8646 raise_translate_exception(&exc, input, collstart, 8647 collend, reason); 8648 goto onError; 8649 case 2: /* replace */ 8650 /* No need to check for space, this is a 1:1 replacement */ 8651 for (coll = collstart; coll<collend; coll++) 8652 output[opos++] = '?'; 8653 /* fall through */ 8654 case 3: /* ignore */ 8655 i = collend; 8656 break; 8657 case 4: /* xmlcharrefreplace */ 8658 /* generate replacement (temporarily (mis)uses i) */ 8659 for (i = collstart; i < collend; ++i) { 8660 char buffer[2+29+1+1]; 8661 char *cp; 8662 sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i)); 8663 if (charmaptranslate_makespace(&output, &osize, 8664 opos+strlen(buffer)+(size-collend))) 8665 goto onError; 8666 for (cp = buffer; *cp; ++cp) 8667 output[opos++] = *cp; 8668 } 8669 i = collend; 8670 break; 8671 default: 8672 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler, 8673 reason, input, &exc, 8674 collstart, collend, &newpos); 8675 if (repunicode == NULL || _PyUnicode_READY_REPLACE(&repunicode)) 8676 goto onError; 8677 /* generate replacement */ 8678 repsize = PyUnicode_GET_LENGTH(repunicode); 8679 if (charmaptranslate_makespace(&output, &osize, 8680 opos+repsize+(size-collend))) { 8681 Py_DECREF(repunicode); 8682 goto onError; 8683 } 8684 for (uni2 = 0; repsize-->0; ++uni2) 8685 output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2); 8686 i = newpos; 8687 Py_DECREF(repunicode); 8688 } 8689 } 8690 } 8691 res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos); 8692 if 
(!res) 8693 goto onError; 8694 PyMem_Free(output); 8695 Py_XDECREF(exc); 8696 Py_XDECREF(errorHandler); 8697 return res; 8698 8699 onError: 8700 PyMem_Free(output); 8701 Py_XDECREF(exc); 8702 Py_XDECREF(errorHandler); 8703 return NULL; 8704} 8705 8706/* Deprecated. Use PyUnicode_Translate instead. */ 8707PyObject * 8708PyUnicode_TranslateCharmap(const Py_UNICODE *p, 8709 Py_ssize_t size, 8710 PyObject *mapping, 8711 const char *errors) 8712{ 8713 PyObject *unicode = PyUnicode_FromUnicode(p, size); 8714 if (!unicode) 8715 return NULL; 8716 return _PyUnicode_TranslateCharmap(unicode, mapping, errors); 8717} 8718 8719PyObject * 8720PyUnicode_Translate(PyObject *str, 8721 PyObject *mapping, 8722 const char *errors) 8723{ 8724 PyObject *result; 8725 8726 str = PyUnicode_FromObject(str); 8727 if (str == NULL) 8728 goto onError; 8729 result = _PyUnicode_TranslateCharmap(str, mapping, errors); 8730 Py_DECREF(str); 8731 return result; 8732 8733 onError: 8734 Py_XDECREF(str); 8735 return NULL; 8736} 8737 8738static Py_UCS4 8739fix_decimal_and_space_to_ascii(PyObject *self) 8740{ 8741 /* No need to call PyUnicode_READY(self) because this function is only 8742 called as a callback from fixup() which does it already. 
*/ 8743 const Py_ssize_t len = PyUnicode_GET_LENGTH(self); 8744 const int kind = PyUnicode_KIND(self); 8745 void *data = PyUnicode_DATA(self); 8746 Py_UCS4 maxchar = 0, ch, fixed; 8747 Py_ssize_t i; 8748 8749 for (i = 0; i < len; ++i) { 8750 ch = PyUnicode_READ(kind, data, i); 8751 fixed = 0; 8752 if (ch > 127) { 8753 if (Py_UNICODE_ISSPACE(ch)) 8754 fixed = ' '; 8755 else { 8756 const int decimal = Py_UNICODE_TODECIMAL(ch); 8757 if (decimal >= 0) 8758 fixed = '0' + decimal; 8759 } 8760 if (fixed != 0) { 8761 if (fixed > maxchar) 8762 maxchar = fixed; 8763 PyUnicode_WRITE(kind, data, i, fixed); 8764 } 8765 else if (ch > maxchar) 8766 maxchar = ch; 8767 } 8768 else if (ch > maxchar) 8769 maxchar = ch; 8770 } 8771 8772 return maxchar; 8773} 8774 8775PyObject * 8776_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode) 8777{ 8778 if (!PyUnicode_Check(unicode)) { 8779 PyErr_BadInternalCall(); 8780 return NULL; 8781 } 8782 if (PyUnicode_READY(unicode) == -1) 8783 return NULL; 8784 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) { 8785 /* If the string is already ASCII, just return the same string */ 8786 Py_INCREF(unicode); 8787 return unicode; 8788 } 8789 return fixup(unicode, fix_decimal_and_space_to_ascii); 8790} 8791 8792PyObject * 8793PyUnicode_TransformDecimalToASCII(Py_UNICODE *s, 8794 Py_ssize_t length) 8795{ 8796 PyObject *result; 8797 Py_UNICODE *p; /* write pointer into result */ 8798 Py_ssize_t i; 8799 /* Copy to a new string */ 8800 result = (PyObject *)_PyUnicode_New(length); 8801 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(result), s, length); 8802 if (result == NULL) 8803 return result; 8804 p = PyUnicode_AS_UNICODE(result); 8805 /* Iterate over code points */ 8806 for (i = 0; i < length; i++) { 8807 Py_UNICODE ch =s[i]; 8808 if (ch > 127) { 8809 int decimal = Py_UNICODE_TODECIMAL(ch); 8810 if (decimal >= 0) 8811 p[i] = '0' + decimal; 8812 } 8813 } 8814#ifndef DONT_MAKE_RESULT_READY 8815 if (_PyUnicode_READY_REPLACE(&result)) { 8816 Py_DECREF(result); 8817 
return NULL; 8818 } 8819#endif 8820 assert(_PyUnicode_CheckConsistency(result, 1)); 8821 return result; 8822} 8823/* --- Decimal Encoder ---------------------------------------------------- */ 8824 8825int 8826PyUnicode_EncodeDecimal(Py_UNICODE *s, 8827 Py_ssize_t length, 8828 char *output, 8829 const char *errors) 8830{ 8831 Py_UNICODE *p, *end; 8832 PyObject *errorHandler = NULL; 8833 PyObject *exc = NULL; 8834 PyObject *unicode; 8835 const char *encoding = "decimal"; 8836 const char *reason = "invalid decimal Unicode string"; 8837 /* the following variable is used for caching string comparisons 8838 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */ 8839 int known_errorHandler = -1; 8840 8841 if (output == NULL) { 8842 PyErr_BadArgument(); 8843 return -1; 8844 } 8845 8846 p = s; 8847 end = s + length; 8848 while (p < end) { 8849 register Py_UNICODE ch = *p; 8850 int decimal; 8851 PyObject *repunicode; 8852 Py_ssize_t repsize; 8853 Py_ssize_t newpos; 8854 Py_UNICODE *uni2; 8855 Py_UNICODE *collstart; 8856 Py_UNICODE *collend; 8857 8858 if (Py_UNICODE_ISSPACE(ch)) { 8859 *output++ = ' '; 8860 ++p; 8861 continue; 8862 } 8863 decimal = Py_UNICODE_TODECIMAL(ch); 8864 if (decimal >= 0) { 8865 *output++ = '0' + decimal; 8866 ++p; 8867 continue; 8868 } 8869 if (0 < ch && ch < 256) { 8870 *output++ = (char)ch; 8871 ++p; 8872 continue; 8873 } 8874 /* All other characters are considered unencodable */ 8875 collstart = p; 8876 collend = p+1; 8877 while (collend < end) { 8878 if ((0 < *collend && *collend < 256) || 8879 !Py_UNICODE_ISSPACE(*collend) || 8880 Py_UNICODE_TODECIMAL(*collend)) 8881 break; 8882 } 8883 /* cache callback name lookup 8884 * (if not done yet, i.e. 
it's the first error) */ 8885 if (known_errorHandler==-1) { 8886 if ((errors==NULL) || (!strcmp(errors, "strict"))) 8887 known_errorHandler = 1; 8888 else if (!strcmp(errors, "replace")) 8889 known_errorHandler = 2; 8890 else if (!strcmp(errors, "ignore")) 8891 known_errorHandler = 3; 8892 else if (!strcmp(errors, "xmlcharrefreplace")) 8893 known_errorHandler = 4; 8894 else 8895 known_errorHandler = 0; 8896 } 8897 switch (known_errorHandler) { 8898 case 1: /* strict */ 8899 unicode = PyUnicode_FromUnicode(s, length); 8900 if (unicode == NULL) 8901 goto onError; 8902 raise_encode_exception(&exc, encoding, unicode, collstart-s, collend-s, reason); 8903 Py_DECREF(unicode); 8904 goto onError; 8905 case 2: /* replace */ 8906 for (p = collstart; p < collend; ++p) 8907 *output++ = '?'; 8908 /* fall through */ 8909 case 3: /* ignore */ 8910 p = collend; 8911 break; 8912 case 4: /* xmlcharrefreplace */ 8913 /* generate replacement (temporarily (mis)uses p) */ 8914 for (p = collstart; p < collend; ++p) 8915 output += sprintf(output, "&#%d;", (int)*p); 8916 p = collend; 8917 break; 8918 default: 8919 unicode = PyUnicode_FromUnicode(s, length); 8920 if (unicode == NULL) 8921 goto onError; 8922 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler, 8923 encoding, reason, unicode, &exc, 8924 collstart-s, collend-s, &newpos); 8925 Py_DECREF(unicode); 8926 if (repunicode == NULL) 8927 goto onError; 8928 if (!PyUnicode_Check(repunicode)) { 8929 /* Byte results not supported, since they have no decimal property. 
*/ 8930 PyErr_SetString(PyExc_TypeError, "error handler should return unicode"); 8931 Py_DECREF(repunicode); 8932 goto onError; 8933 } 8934 /* generate replacement */ 8935 repsize = PyUnicode_GET_SIZE(repunicode); 8936 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) { 8937 Py_UNICODE ch = *uni2; 8938 if (Py_UNICODE_ISSPACE(ch)) 8939 *output++ = ' '; 8940 else { 8941 decimal = Py_UNICODE_TODECIMAL(ch); 8942 if (decimal >= 0) 8943 *output++ = '0' + decimal; 8944 else if (0 < ch && ch < 256) 8945 *output++ = (char)ch; 8946 else { 8947 Py_DECREF(repunicode); 8948 unicode = PyUnicode_FromUnicode(s, length); 8949 if (unicode == NULL) 8950 goto onError; 8951 raise_encode_exception(&exc, encoding, 8952 unicode, collstart-s, collend-s, reason); 8953 Py_DECREF(unicode); 8954 goto onError; 8955 } 8956 } 8957 } 8958 p = s + newpos; 8959 Py_DECREF(repunicode); 8960 } 8961 } 8962 /* 0-terminate the output string */ 8963 *output++ = '\0'; 8964 Py_XDECREF(exc); 8965 Py_XDECREF(errorHandler); 8966 return 0; 8967 8968 onError: 8969 Py_XDECREF(exc); 8970 Py_XDECREF(errorHandler); 8971 return -1; 8972} 8973 8974/* --- Helpers ------------------------------------------------------------ */ 8975 8976static Py_ssize_t 8977any_find_slice(int direction, PyObject* s1, PyObject* s2, 8978 Py_ssize_t start, 8979 Py_ssize_t end) 8980{ 8981 int kind1, kind2, kind; 8982 void *buf1, *buf2; 8983 Py_ssize_t len1, len2, result; 8984 8985 kind1 = PyUnicode_KIND(s1); 8986 kind2 = PyUnicode_KIND(s2); 8987 kind = kind1 > kind2 ? 
kind1 : kind2; 8988 buf1 = PyUnicode_DATA(s1); 8989 buf2 = PyUnicode_DATA(s2); 8990 if (kind1 != kind) 8991 buf1 = _PyUnicode_AsKind(s1, kind); 8992 if (!buf1) 8993 return -2; 8994 if (kind2 != kind) 8995 buf2 = _PyUnicode_AsKind(s2, kind); 8996 if (!buf2) { 8997 if (kind1 != kind) PyMem_Free(buf1); 8998 return -2; 8999 } 9000 len1 = PyUnicode_GET_LENGTH(s1); 9001 len2 = PyUnicode_GET_LENGTH(s2); 9002 9003 if (direction > 0) { 9004 switch(kind) { 9005 case PyUnicode_1BYTE_KIND: 9006 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2)) 9007 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end); 9008 else 9009 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end); 9010 break; 9011 case PyUnicode_2BYTE_KIND: 9012 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end); 9013 break; 9014 case PyUnicode_4BYTE_KIND: 9015 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end); 9016 break; 9017 default: 9018 assert(0); result = -2; 9019 } 9020 } 9021 else { 9022 switch(kind) { 9023 case PyUnicode_1BYTE_KIND: 9024 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2)) 9025 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end); 9026 else 9027 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end); 9028 break; 9029 case PyUnicode_2BYTE_KIND: 9030 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end); 9031 break; 9032 case PyUnicode_4BYTE_KIND: 9033 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end); 9034 break; 9035 default: 9036 assert(0); result = -2; 9037 } 9038 } 9039 9040 if (kind1 != kind) 9041 PyMem_Free(buf1); 9042 if (kind2 != kind) 9043 PyMem_Free(buf2); 9044 9045 return result; 9046} 9047 9048Py_ssize_t 9049_PyUnicode_InsertThousandsGrouping(PyObject *unicode, int kind, void *data, 9050 Py_ssize_t n_buffer, 9051 void *digits, Py_ssize_t n_digits, 9052 Py_ssize_t min_width, 9053 const char *grouping, 9054 const char *thousands_sep) 9055{ 9056 switch(kind) { 9057 case 
PyUnicode_1BYTE_KIND: 9058 if (unicode != NULL && PyUnicode_IS_ASCII(unicode)) 9059 return _PyUnicode_ascii_InsertThousandsGrouping( 9060 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits, 9061 min_width, grouping, thousands_sep); 9062 else 9063 return _PyUnicode_ucs1_InsertThousandsGrouping( 9064 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits, 9065 min_width, grouping, thousands_sep); 9066 case PyUnicode_2BYTE_KIND: 9067 return _PyUnicode_ucs2_InsertThousandsGrouping( 9068 (Py_UCS2*)data, n_buffer, (Py_UCS2*)digits, n_digits, 9069 min_width, grouping, thousands_sep); 9070 case PyUnicode_4BYTE_KIND: 9071 return _PyUnicode_ucs4_InsertThousandsGrouping( 9072 (Py_UCS4*)data, n_buffer, (Py_UCS4*)digits, n_digits, 9073 min_width, grouping, thousands_sep); 9074 } 9075 assert(0); 9076 return -1; 9077} 9078 9079 9080/* helper macro to fixup start/end slice values */ 9081#define ADJUST_INDICES(start, end, len) \ 9082 if (end > len) \ 9083 end = len; \ 9084 else if (end < 0) { \ 9085 end += len; \ 9086 if (end < 0) \ 9087 end = 0; \ 9088 } \ 9089 if (start < 0) { \ 9090 start += len; \ 9091 if (start < 0) \ 9092 start = 0; \ 9093 } 9094 9095Py_ssize_t 9096PyUnicode_Count(PyObject *str, 9097 PyObject *substr, 9098 Py_ssize_t start, 9099 Py_ssize_t end) 9100{ 9101 Py_ssize_t result; 9102 PyObject* str_obj; 9103 PyObject* sub_obj; 9104 int kind1, kind2, kind; 9105 void *buf1 = NULL, *buf2 = NULL; 9106 Py_ssize_t len1, len2; 9107 9108 str_obj = PyUnicode_FromObject(str); 9109 if (!str_obj || PyUnicode_READY(str_obj) == -1) 9110 return -1; 9111 sub_obj = PyUnicode_FromObject(substr); 9112 if (!sub_obj || PyUnicode_READY(sub_obj) == -1) { 9113 Py_DECREF(str_obj); 9114 return -1; 9115 } 9116 9117 kind1 = PyUnicode_KIND(str_obj); 9118 kind2 = PyUnicode_KIND(sub_obj); 9119 kind = kind1 > kind2 ? 
kind1 : kind2; 9120 buf1 = PyUnicode_DATA(str_obj); 9121 if (kind1 != kind) 9122 buf1 = _PyUnicode_AsKind(str_obj, kind); 9123 if (!buf1) 9124 goto onError; 9125 buf2 = PyUnicode_DATA(sub_obj); 9126 if (kind2 != kind) 9127 buf2 = _PyUnicode_AsKind(sub_obj, kind); 9128 if (!buf2) 9129 goto onError; 9130 len1 = PyUnicode_GET_LENGTH(str_obj); 9131 len2 = PyUnicode_GET_LENGTH(sub_obj); 9132 9133 ADJUST_INDICES(start, end, len1); 9134 switch(kind) { 9135 case PyUnicode_1BYTE_KIND: 9136 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sub_obj)) 9137 result = asciilib_count( 9138 ((Py_UCS1*)buf1) + start, end - start, 9139 buf2, len2, PY_SSIZE_T_MAX 9140 ); 9141 else 9142 result = ucs1lib_count( 9143 ((Py_UCS1*)buf1) + start, end - start, 9144 buf2, len2, PY_SSIZE_T_MAX 9145 ); 9146 break; 9147 case PyUnicode_2BYTE_KIND: 9148 result = ucs2lib_count( 9149 ((Py_UCS2*)buf1) + start, end - start, 9150 buf2, len2, PY_SSIZE_T_MAX 9151 ); 9152 break; 9153 case PyUnicode_4BYTE_KIND: 9154 result = ucs4lib_count( 9155 ((Py_UCS4*)buf1) + start, end - start, 9156 buf2, len2, PY_SSIZE_T_MAX 9157 ); 9158 break; 9159 default: 9160 assert(0); result = 0; 9161 } 9162 9163 Py_DECREF(sub_obj); 9164 Py_DECREF(str_obj); 9165 9166 if (kind1 != kind) 9167 PyMem_Free(buf1); 9168 if (kind2 != kind) 9169 PyMem_Free(buf2); 9170 9171 return result; 9172 onError: 9173 Py_DECREF(sub_obj); 9174 Py_DECREF(str_obj); 9175 if (kind1 != kind && buf1) 9176 PyMem_Free(buf1); 9177 if (kind2 != kind && buf2) 9178 PyMem_Free(buf2); 9179 return -1; 9180} 9181 9182Py_ssize_t 9183PyUnicode_Find(PyObject *str, 9184 PyObject *sub, 9185 Py_ssize_t start, 9186 Py_ssize_t end, 9187 int direction) 9188{ 9189 Py_ssize_t result; 9190 9191 str = PyUnicode_FromObject(str); 9192 if (!str || PyUnicode_READY(str) == -1) 9193 return -2; 9194 sub = PyUnicode_FromObject(sub); 9195 if (!sub || PyUnicode_READY(sub) == -1) { 9196 Py_DECREF(str); 9197 return -2; 9198 } 9199 9200 result = any_find_slice(direction, 9201 str, sub, 
start, end 9202 ); 9203 9204 Py_DECREF(str); 9205 Py_DECREF(sub); 9206 9207 return result; 9208} 9209 9210Py_ssize_t 9211PyUnicode_FindChar(PyObject *str, Py_UCS4 ch, 9212 Py_ssize_t start, Py_ssize_t end, 9213 int direction) 9214{ 9215 int kind; 9216 Py_ssize_t result; 9217 if (PyUnicode_READY(str) == -1) 9218 return -2; 9219 if (start < 0 || end < 0) { 9220 PyErr_SetString(PyExc_IndexError, "string index out of range"); 9221 return -2; 9222 } 9223 if (end > PyUnicode_GET_LENGTH(str)) 9224 end = PyUnicode_GET_LENGTH(str); 9225 kind = PyUnicode_KIND(str); 9226 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start, 9227 kind, end-start, ch, direction); 9228 if (result == -1) 9229 return -1; 9230 else 9231 return start + result; 9232} 9233 9234static int 9235tailmatch(PyObject *self, 9236 PyObject *substring, 9237 Py_ssize_t start, 9238 Py_ssize_t end, 9239 int direction) 9240{ 9241 int kind_self; 9242 int kind_sub; 9243 void *data_self; 9244 void *data_sub; 9245 Py_ssize_t offset; 9246 Py_ssize_t i; 9247 Py_ssize_t end_sub; 9248 9249 if (PyUnicode_READY(self) == -1 || 9250 PyUnicode_READY(substring) == -1) 9251 return 0; 9252 9253 if (PyUnicode_GET_LENGTH(substring) == 0) 9254 return 1; 9255 9256 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self)); 9257 end -= PyUnicode_GET_LENGTH(substring); 9258 if (end < start) 9259 return 0; 9260 9261 kind_self = PyUnicode_KIND(self); 9262 data_self = PyUnicode_DATA(self); 9263 kind_sub = PyUnicode_KIND(substring); 9264 data_sub = PyUnicode_DATA(substring); 9265 end_sub = PyUnicode_GET_LENGTH(substring) - 1; 9266 9267 if (direction > 0) 9268 offset = end; 9269 else 9270 offset = start; 9271 9272 if (PyUnicode_READ(kind_self, data_self, offset) == 9273 PyUnicode_READ(kind_sub, data_sub, 0) && 9274 PyUnicode_READ(kind_self, data_self, offset + end_sub) == 9275 PyUnicode_READ(kind_sub, data_sub, end_sub)) { 9276 /* If both are of the same kind, memcmp is sufficient */ 9277 if (kind_self == kind_sub) { 9278 return ! 
memcmp((char *)data_self + 9279 (offset * PyUnicode_KIND(substring)), 9280 data_sub, 9281 PyUnicode_GET_LENGTH(substring) * 9282 PyUnicode_KIND(substring)); 9283 } 9284 /* otherwise we have to compare each character by first accesing it */ 9285 else { 9286 /* We do not need to compare 0 and len(substring)-1 because 9287 the if statement above ensured already that they are equal 9288 when we end up here. */ 9289 // TODO: honor direction and do a forward or backwards search 9290 for (i = 1; i < end_sub; ++i) { 9291 if (PyUnicode_READ(kind_self, data_self, offset + i) != 9292 PyUnicode_READ(kind_sub, data_sub, i)) 9293 return 0; 9294 } 9295 return 1; 9296 } 9297 } 9298 9299 return 0; 9300} 9301 9302Py_ssize_t 9303PyUnicode_Tailmatch(PyObject *str, 9304 PyObject *substr, 9305 Py_ssize_t start, 9306 Py_ssize_t end, 9307 int direction) 9308{ 9309 Py_ssize_t result; 9310 9311 str = PyUnicode_FromObject(str); 9312 if (str == NULL) 9313 return -1; 9314 substr = PyUnicode_FromObject(substr); 9315 if (substr == NULL) { 9316 Py_DECREF(str); 9317 return -1; 9318 } 9319 9320 result = tailmatch(str, substr, 9321 start, end, direction); 9322 Py_DECREF(str); 9323 Py_DECREF(substr); 9324 return result; 9325} 9326 9327/* Apply fixfct filter to the Unicode object self and return a 9328 reference to the modified object */ 9329 9330static PyObject * 9331fixup(PyObject *self, 9332 Py_UCS4 (*fixfct)(PyObject *s)) 9333{ 9334 PyObject *u; 9335 Py_UCS4 maxchar_old, maxchar_new = 0; 9336 9337 if (PyUnicode_READY(self) == -1) 9338 return NULL; 9339 maxchar_old = PyUnicode_MAX_CHAR_VALUE(self); 9340 u = PyUnicode_New(PyUnicode_GET_LENGTH(self), 9341 maxchar_old); 9342 if (u == NULL) 9343 return NULL; 9344 9345 Py_MEMCPY(PyUnicode_1BYTE_DATA(u), PyUnicode_1BYTE_DATA(self), 9346 PyUnicode_GET_LENGTH(u) * PyUnicode_KIND(u)); 9347 9348 /* fix functions return the new maximum character in a string, 9349 if the kind of the resulting unicode object does not change, 9350 everything is fine. 
Otherwise we need to change the string kind 9351 and re-run the fix function. */ 9352 maxchar_new = fixfct(u); 9353 if (maxchar_new == 0) 9354 /* do nothing, keep maxchar_new at 0 which means no changes. */; 9355 else if (maxchar_new <= 127) 9356 maxchar_new = 127; 9357 else if (maxchar_new <= 255) 9358 maxchar_new = 255; 9359 else if (maxchar_new <= 65535) 9360 maxchar_new = 65535; 9361 else 9362 maxchar_new = 1114111; /* 0x10ffff */ 9363 9364 if (!maxchar_new && PyUnicode_CheckExact(self)) { 9365 /* fixfct should return TRUE if it modified the buffer. If 9366 FALSE, return a reference to the original buffer instead 9367 (to save space, not time) */ 9368 Py_INCREF(self); 9369 Py_DECREF(u); 9370 return self; 9371 } 9372 else if (maxchar_new == maxchar_old) { 9373 return u; 9374 } 9375 else { 9376 /* In case the maximum character changed, we need to 9377 convert the string to the new category. */ 9378 PyObject *v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new); 9379 if (v == NULL) { 9380 Py_DECREF(u); 9381 return NULL; 9382 } 9383 if (maxchar_new > maxchar_old) { 9384 /* If the maxchar increased so that the kind changed, not all 9385 characters are representable anymore and we need to fix the 9386 string again. This only happens in very few cases. */ 9387 copy_characters(v, 0, self, 0, PyUnicode_GET_LENGTH(self)); 9388 maxchar_old = fixfct(v); 9389 assert(maxchar_old > 0 && maxchar_old <= maxchar_new); 9390 } 9391 else { 9392 copy_characters(v, 0, u, 0, PyUnicode_GET_LENGTH(self)); 9393 } 9394 9395 Py_DECREF(u); 9396 assert(_PyUnicode_CheckConsistency(v, 1)); 9397 return v; 9398 } 9399} 9400 9401static Py_UCS4 9402fixupper(PyObject *self) 9403{ 9404 /* No need to call PyUnicode_READY(self) because this function is only 9405 called as a callback from fixup() which does it already. 
*/ 9406 const Py_ssize_t len = PyUnicode_GET_LENGTH(self); 9407 const int kind = PyUnicode_KIND(self); 9408 void *data = PyUnicode_DATA(self); 9409 int touched = 0; 9410 Py_UCS4 maxchar = 0; 9411 Py_ssize_t i; 9412 9413 for (i = 0; i < len; ++i) { 9414 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); 9415 const Py_UCS4 up = Py_UNICODE_TOUPPER(ch); 9416 if (up != ch) { 9417 if (up > maxchar) 9418 maxchar = up; 9419 PyUnicode_WRITE(kind, data, i, up); 9420 touched = 1; 9421 } 9422 else if (ch > maxchar) 9423 maxchar = ch; 9424 } 9425 9426 if (touched) 9427 return maxchar; 9428 else 9429 return 0; 9430} 9431 9432static Py_UCS4 9433fixlower(PyObject *self) 9434{ 9435 /* No need to call PyUnicode_READY(self) because fixup() which does it. */ 9436 const Py_ssize_t len = PyUnicode_GET_LENGTH(self); 9437 const int kind = PyUnicode_KIND(self); 9438 void *data = PyUnicode_DATA(self); 9439 int touched = 0; 9440 Py_UCS4 maxchar = 0; 9441 Py_ssize_t i; 9442 9443 for(i = 0; i < len; ++i) { 9444 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); 9445 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch); 9446 if (lo != ch) { 9447 if (lo > maxchar) 9448 maxchar = lo; 9449 PyUnicode_WRITE(kind, data, i, lo); 9450 touched = 1; 9451 } 9452 else if (ch > maxchar) 9453 maxchar = ch; 9454 } 9455 9456 if (touched) 9457 return maxchar; 9458 else 9459 return 0; 9460} 9461 9462static Py_UCS4 9463fixswapcase(PyObject *self) 9464{ 9465 /* No need to call PyUnicode_READY(self) because fixup() which does it. 
*/ 9466 const Py_ssize_t len = PyUnicode_GET_LENGTH(self); 9467 const int kind = PyUnicode_KIND(self); 9468 void *data = PyUnicode_DATA(self); 9469 int touched = 0; 9470 Py_UCS4 maxchar = 0; 9471 Py_ssize_t i; 9472 9473 for(i = 0; i < len; ++i) { 9474 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); 9475 Py_UCS4 nu = 0; 9476 9477 if (Py_UNICODE_ISUPPER(ch)) 9478 nu = Py_UNICODE_TOLOWER(ch); 9479 else if (Py_UNICODE_ISLOWER(ch)) 9480 nu = Py_UNICODE_TOUPPER(ch); 9481 9482 if (nu != 0) { 9483 if (nu > maxchar) 9484 maxchar = nu; 9485 PyUnicode_WRITE(kind, data, i, nu); 9486 touched = 1; 9487 } 9488 else if (ch > maxchar) 9489 maxchar = ch; 9490 } 9491 9492 if (touched) 9493 return maxchar; 9494 else 9495 return 0; 9496} 9497 9498static Py_UCS4 9499fixcapitalize(PyObject *self) 9500{ 9501 /* No need to call PyUnicode_READY(self) because fixup() which does it. */ 9502 const Py_ssize_t len = PyUnicode_GET_LENGTH(self); 9503 const int kind = PyUnicode_KIND(self); 9504 void *data = PyUnicode_DATA(self); 9505 int touched = 0; 9506 Py_UCS4 maxchar = 0; 9507 Py_ssize_t i = 0; 9508 Py_UCS4 ch; 9509 9510 if (len == 0) 9511 return 0; 9512 9513 ch = PyUnicode_READ(kind, data, i); 9514 if (!Py_UNICODE_ISUPPER(ch)) { 9515 maxchar = Py_UNICODE_TOUPPER(ch); 9516 PyUnicode_WRITE(kind, data, i, maxchar); 9517 touched = 1; 9518 } 9519 ++i; 9520 for(; i < len; ++i) { 9521 ch = PyUnicode_READ(kind, data, i); 9522 if (!Py_UNICODE_ISLOWER(ch)) { 9523 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch); 9524 if (lo > maxchar) 9525 maxchar = lo; 9526 PyUnicode_WRITE(kind, data, i, lo); 9527 touched = 1; 9528 } 9529 else if (ch > maxchar) 9530 maxchar = ch; 9531 } 9532 9533 if (touched) 9534 return maxchar; 9535 else 9536 return 0; 9537} 9538 9539static Py_UCS4 9540fixtitle(PyObject *self) 9541{ 9542 /* No need to call PyUnicode_READY(self) because fixup() which does it. 
*/ 9543 const Py_ssize_t len = PyUnicode_GET_LENGTH(self); 9544 const int kind = PyUnicode_KIND(self); 9545 void *data = PyUnicode_DATA(self); 9546 Py_UCS4 maxchar = 0; 9547 Py_ssize_t i = 0; 9548 int previous_is_cased; 9549 9550 /* Shortcut for single character strings */ 9551 if (len == 1) { 9552 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); 9553 const Py_UCS4 ti = Py_UNICODE_TOTITLE(ch); 9554 if (ti != ch) { 9555 PyUnicode_WRITE(kind, data, i, ti); 9556 return ti; 9557 } 9558 else 9559 return 0; 9560 } 9561 previous_is_cased = 0; 9562 for(; i < len; ++i) { 9563 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); 9564 Py_UCS4 nu; 9565 9566 if (previous_is_cased) 9567 nu = Py_UNICODE_TOLOWER(ch); 9568 else 9569 nu = Py_UNICODE_TOTITLE(ch); 9570 9571 if (nu > maxchar) 9572 maxchar = nu; 9573 PyUnicode_WRITE(kind, data, i, nu); 9574 9575 if (Py_UNICODE_ISLOWER(ch) || 9576 Py_UNICODE_ISUPPER(ch) || 9577 Py_UNICODE_ISTITLE(ch)) 9578 previous_is_cased = 1; 9579 else 9580 previous_is_cased = 0; 9581 } 9582 return maxchar; 9583} 9584 9585PyObject * 9586PyUnicode_Join(PyObject *separator, PyObject *seq) 9587{ 9588 PyObject *sep = NULL; 9589 Py_ssize_t seplen; 9590 PyObject *res = NULL; /* the result */ 9591 PyObject *fseq; /* PySequence_Fast(seq) */ 9592 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */ 9593 PyObject **items; 9594 PyObject *item; 9595 Py_ssize_t sz, i, res_offset; 9596 Py_UCS4 maxchar; 9597 Py_UCS4 item_maxchar; 9598 int use_memcpy; 9599 unsigned char *res_data = NULL, *sep_data = NULL; 9600 PyObject *last_obj; 9601 unsigned int kind = 0; 9602 9603 fseq = PySequence_Fast(seq, ""); 9604 if (fseq == NULL) { 9605 return NULL; 9606 } 9607 9608 /* NOTE: the following code can't call back into Python code, 9609 * so we are sure that fseq won't be mutated. 9610 */ 9611 9612 seqlen = PySequence_Fast_GET_SIZE(fseq); 9613 /* If empty sequence, return u"". 
*/ 9614 if (seqlen == 0) { 9615 Py_DECREF(fseq); 9616 Py_INCREF(unicode_empty); 9617 res = unicode_empty; 9618 return res; 9619 } 9620 9621 /* If singleton sequence with an exact Unicode, return that. */ 9622 last_obj = NULL; 9623 items = PySequence_Fast_ITEMS(fseq); 9624 if (seqlen == 1) { 9625 if (PyUnicode_CheckExact(items[0])) { 9626 res = items[0]; 9627 Py_INCREF(res); 9628 Py_DECREF(fseq); 9629 return res; 9630 } 9631 seplen = 0; 9632 maxchar = 0; 9633 } 9634 else { 9635 /* Set up sep and seplen */ 9636 if (separator == NULL) { 9637 /* fall back to a blank space separator */ 9638 sep = PyUnicode_FromOrdinal(' '); 9639 if (!sep) 9640 goto onError; 9641 seplen = 1; 9642 maxchar = 32; 9643 } 9644 else { 9645 if (!PyUnicode_Check(separator)) { 9646 PyErr_Format(PyExc_TypeError, 9647 "separator: expected str instance," 9648 " %.80s found", 9649 Py_TYPE(separator)->tp_name); 9650 goto onError; 9651 } 9652 if (PyUnicode_READY(separator)) 9653 goto onError; 9654 sep = separator; 9655 seplen = PyUnicode_GET_LENGTH(separator); 9656 maxchar = PyUnicode_MAX_CHAR_VALUE(separator); 9657 /* inc refcount to keep this code path symmetric with the 9658 above case of a blank separator */ 9659 Py_INCREF(sep); 9660 } 9661 last_obj = sep; 9662 } 9663 9664 /* There are at least two things to join, or else we have a subclass 9665 * of str in the sequence. 9666 * Do a pre-pass to figure out the total amount of space we'll 9667 * need (sz), and see whether all argument are strings. 
9668 */ 9669 sz = 0; 9670#ifdef Py_DEBUG 9671 use_memcpy = 0; 9672#else 9673 use_memcpy = 1; 9674#endif 9675 for (i = 0; i < seqlen; i++) { 9676 const Py_ssize_t old_sz = sz; 9677 item = items[i]; 9678 if (!PyUnicode_Check(item)) { 9679 PyErr_Format(PyExc_TypeError, 9680 "sequence item %zd: expected str instance," 9681 " %.80s found", 9682 i, Py_TYPE(item)->tp_name); 9683 goto onError; 9684 } 9685 if (PyUnicode_READY(item) == -1) 9686 goto onError; 9687 sz += PyUnicode_GET_LENGTH(item); 9688 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item); 9689 maxchar = Py_MAX(maxchar, item_maxchar); 9690 if (i != 0) 9691 sz += seplen; 9692 if (sz < old_sz || sz > PY_SSIZE_T_MAX) { 9693 PyErr_SetString(PyExc_OverflowError, 9694 "join() result is too long for a Python string"); 9695 goto onError; 9696 } 9697 if (use_memcpy && last_obj != NULL) { 9698 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item)) 9699 use_memcpy = 0; 9700 } 9701 last_obj = item; 9702 } 9703 9704 res = PyUnicode_New(sz, maxchar); 9705 if (res == NULL) 9706 goto onError; 9707 9708 /* Catenate everything. */ 9709#ifdef Py_DEBUG 9710 use_memcpy = 0; 9711#else 9712 if (use_memcpy) { 9713 res_data = PyUnicode_1BYTE_DATA(res); 9714 kind = PyUnicode_KIND(res); 9715 if (seplen != 0) 9716 sep_data = PyUnicode_1BYTE_DATA(sep); 9717 } 9718#endif 9719 for (i = 0, res_offset = 0; i < seqlen; ++i) { 9720 Py_ssize_t itemlen; 9721 item = items[i]; 9722 /* Copy item, and maybe the separator. 
*/ 9723 if (i && seplen != 0) { 9724 if (use_memcpy) { 9725 Py_MEMCPY(res_data, 9726 sep_data, 9727 kind * seplen); 9728 res_data += kind * seplen; 9729 } 9730 else { 9731 copy_characters(res, res_offset, sep, 0, seplen); 9732 res_offset += seplen; 9733 } 9734 } 9735 itemlen = PyUnicode_GET_LENGTH(item); 9736 if (itemlen != 0) { 9737 if (use_memcpy) { 9738 Py_MEMCPY(res_data, 9739 PyUnicode_DATA(item), 9740 kind * itemlen); 9741 res_data += kind * itemlen; 9742 } 9743 else { 9744 copy_characters(res, res_offset, item, 0, itemlen); 9745 res_offset += itemlen; 9746 } 9747 } 9748 } 9749 if (use_memcpy) 9750 assert(res_data == PyUnicode_1BYTE_DATA(res) 9751 + kind * PyUnicode_GET_LENGTH(res)); 9752 else 9753 assert(res_offset == PyUnicode_GET_LENGTH(res)); 9754 9755 Py_DECREF(fseq); 9756 Py_XDECREF(sep); 9757 assert(_PyUnicode_CheckConsistency(res, 1)); 9758 return res; 9759 9760 onError: 9761 Py_DECREF(fseq); 9762 Py_XDECREF(sep); 9763 Py_XDECREF(res); 9764 return NULL; 9765} 9766 9767#define FILL(kind, data, value, start, length) \ 9768 do { \ 9769 Py_ssize_t i_ = 0; \ 9770 assert(kind != PyUnicode_WCHAR_KIND); \ 9771 switch ((kind)) { \ 9772 case PyUnicode_1BYTE_KIND: { \ 9773 unsigned char * to_ = (unsigned char *)((data)) + (start); \ 9774 memset(to_, (unsigned char)value, length); \ 9775 break; \ 9776 } \ 9777 case PyUnicode_2BYTE_KIND: { \ 9778 Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \ 9779 for (; i_ < (length); ++i_, ++to_) *to_ = (value); \ 9780 break; \ 9781 } \ 9782 default: { \ 9783 Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \ 9784 for (; i_ < (length); ++i_, ++to_) *to_ = (value); \ 9785 break; \ 9786 } \ 9787 } \ 9788 } while (0) 9789 9790static PyObject * 9791pad(PyObject *self, 9792 Py_ssize_t left, 9793 Py_ssize_t right, 9794 Py_UCS4 fill) 9795{ 9796 PyObject *u; 9797 Py_UCS4 maxchar; 9798 int kind; 9799 void *data; 9800 9801 if (left < 0) 9802 left = 0; 9803 if (right < 0) 9804 right = 0; 9805 9806 if (left == 0 && right == 0 && 
PyUnicode_CheckExact(self)) { 9807 Py_INCREF(self); 9808 return self; 9809 } 9810 9811 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) || 9812 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) { 9813 PyErr_SetString(PyExc_OverflowError, "padded string is too long"); 9814 return NULL; 9815 } 9816 maxchar = PyUnicode_MAX_CHAR_VALUE(self); 9817 if (fill > maxchar) 9818 maxchar = fill; 9819 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar); 9820 if (!u) 9821 return NULL; 9822 9823 kind = PyUnicode_KIND(u); 9824 data = PyUnicode_DATA(u); 9825 if (left) 9826 FILL(kind, data, fill, 0, left); 9827 if (right) 9828 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right); 9829 copy_characters(u, left, self, 0, _PyUnicode_LENGTH(self)); 9830 assert(_PyUnicode_CheckConsistency(u, 1)); 9831 return u; 9832} 9833#undef FILL 9834 9835PyObject * 9836PyUnicode_Splitlines(PyObject *string, int keepends) 9837{ 9838 PyObject *list; 9839 9840 string = PyUnicode_FromObject(string); 9841 if (string == NULL || PyUnicode_READY(string) == -1) 9842 return NULL; 9843 9844 switch(PyUnicode_KIND(string)) { 9845 case PyUnicode_1BYTE_KIND: 9846 if (PyUnicode_IS_ASCII(string)) 9847 list = asciilib_splitlines( 9848 string, PyUnicode_1BYTE_DATA(string), 9849 PyUnicode_GET_LENGTH(string), keepends); 9850 else 9851 list = ucs1lib_splitlines( 9852 string, PyUnicode_1BYTE_DATA(string), 9853 PyUnicode_GET_LENGTH(string), keepends); 9854 break; 9855 case PyUnicode_2BYTE_KIND: 9856 list = ucs2lib_splitlines( 9857 string, PyUnicode_2BYTE_DATA(string), 9858 PyUnicode_GET_LENGTH(string), keepends); 9859 break; 9860 case PyUnicode_4BYTE_KIND: 9861 list = ucs4lib_splitlines( 9862 string, PyUnicode_4BYTE_DATA(string), 9863 PyUnicode_GET_LENGTH(string), keepends); 9864 break; 9865 default: 9866 assert(0); 9867 list = 0; 9868 } 9869 Py_DECREF(string); 9870 return list; 9871} 9872 9873static PyObject * 9874split(PyObject *self, 9875 PyObject *substring, 9876 Py_ssize_t maxcount) 
9877{ 9878 int kind1, kind2, kind; 9879 void *buf1, *buf2; 9880 Py_ssize_t len1, len2; 9881 PyObject* out; 9882 9883 if (maxcount < 0) 9884 maxcount = PY_SSIZE_T_MAX; 9885 9886 if (PyUnicode_READY(self) == -1) 9887 return NULL; 9888 9889 if (substring == NULL) 9890 switch(PyUnicode_KIND(self)) { 9891 case PyUnicode_1BYTE_KIND: 9892 if (PyUnicode_IS_ASCII(self)) 9893 return asciilib_split_whitespace( 9894 self, PyUnicode_1BYTE_DATA(self), 9895 PyUnicode_GET_LENGTH(self), maxcount 9896 ); 9897 else 9898 return ucs1lib_split_whitespace( 9899 self, PyUnicode_1BYTE_DATA(self), 9900 PyUnicode_GET_LENGTH(self), maxcount 9901 ); 9902 case PyUnicode_2BYTE_KIND: 9903 return ucs2lib_split_whitespace( 9904 self, PyUnicode_2BYTE_DATA(self), 9905 PyUnicode_GET_LENGTH(self), maxcount 9906 ); 9907 case PyUnicode_4BYTE_KIND: 9908 return ucs4lib_split_whitespace( 9909 self, PyUnicode_4BYTE_DATA(self), 9910 PyUnicode_GET_LENGTH(self), maxcount 9911 ); 9912 default: 9913 assert(0); 9914 return NULL; 9915 } 9916 9917 if (PyUnicode_READY(substring) == -1) 9918 return NULL; 9919 9920 kind1 = PyUnicode_KIND(self); 9921 kind2 = PyUnicode_KIND(substring); 9922 kind = kind1 > kind2 ? 
kind1 : kind2; 9923 buf1 = PyUnicode_DATA(self); 9924 buf2 = PyUnicode_DATA(substring); 9925 if (kind1 != kind) 9926 buf1 = _PyUnicode_AsKind(self, kind); 9927 if (!buf1) 9928 return NULL; 9929 if (kind2 != kind) 9930 buf2 = _PyUnicode_AsKind(substring, kind); 9931 if (!buf2) { 9932 if (kind1 != kind) PyMem_Free(buf1); 9933 return NULL; 9934 } 9935 len1 = PyUnicode_GET_LENGTH(self); 9936 len2 = PyUnicode_GET_LENGTH(substring); 9937 9938 switch(kind) { 9939 case PyUnicode_1BYTE_KIND: 9940 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring)) 9941 out = asciilib_split( 9942 self, buf1, len1, buf2, len2, maxcount); 9943 else 9944 out = ucs1lib_split( 9945 self, buf1, len1, buf2, len2, maxcount); 9946 break; 9947 case PyUnicode_2BYTE_KIND: 9948 out = ucs2lib_split( 9949 self, buf1, len1, buf2, len2, maxcount); 9950 break; 9951 case PyUnicode_4BYTE_KIND: 9952 out = ucs4lib_split( 9953 self, buf1, len1, buf2, len2, maxcount); 9954 break; 9955 default: 9956 out = NULL; 9957 } 9958 if (kind1 != kind) 9959 PyMem_Free(buf1); 9960 if (kind2 != kind) 9961 PyMem_Free(buf2); 9962 return out; 9963} 9964 9965static PyObject * 9966rsplit(PyObject *self, 9967 PyObject *substring, 9968 Py_ssize_t maxcount) 9969{ 9970 int kind1, kind2, kind; 9971 void *buf1, *buf2; 9972 Py_ssize_t len1, len2; 9973 PyObject* out; 9974 9975 if (maxcount < 0) 9976 maxcount = PY_SSIZE_T_MAX; 9977 9978 if (PyUnicode_READY(self) == -1) 9979 return NULL; 9980 9981 if (substring == NULL) 9982 switch(PyUnicode_KIND(self)) { 9983 case PyUnicode_1BYTE_KIND: 9984 if (PyUnicode_IS_ASCII(self)) 9985 return asciilib_rsplit_whitespace( 9986 self, PyUnicode_1BYTE_DATA(self), 9987 PyUnicode_GET_LENGTH(self), maxcount 9988 ); 9989 else 9990 return ucs1lib_rsplit_whitespace( 9991 self, PyUnicode_1BYTE_DATA(self), 9992 PyUnicode_GET_LENGTH(self), maxcount 9993 ); 9994 case PyUnicode_2BYTE_KIND: 9995 return ucs2lib_rsplit_whitespace( 9996 self, PyUnicode_2BYTE_DATA(self), 9997 PyUnicode_GET_LENGTH(self), maxcount 
9998 ); 9999 case PyUnicode_4BYTE_KIND: 10000 return ucs4lib_rsplit_whitespace( 10001 self, PyUnicode_4BYTE_DATA(self), 10002 PyUnicode_GET_LENGTH(self), maxcount 10003 ); 10004 default: 10005 assert(0); 10006 return NULL; 10007 } 10008 10009 if (PyUnicode_READY(substring) == -1) 10010 return NULL; 10011 10012 kind1 = PyUnicode_KIND(self); 10013 kind2 = PyUnicode_KIND(substring); 10014 kind = kind1 > kind2 ? kind1 : kind2; 10015 buf1 = PyUnicode_DATA(self); 10016 buf2 = PyUnicode_DATA(substring); 10017 if (kind1 != kind) 10018 buf1 = _PyUnicode_AsKind(self, kind); 10019 if (!buf1) 10020 return NULL; 10021 if (kind2 != kind) 10022 buf2 = _PyUnicode_AsKind(substring, kind); 10023 if (!buf2) { 10024 if (kind1 != kind) PyMem_Free(buf1); 10025 return NULL; 10026 } 10027 len1 = PyUnicode_GET_LENGTH(self); 10028 len2 = PyUnicode_GET_LENGTH(substring); 10029 10030 switch(kind) { 10031 case PyUnicode_1BYTE_KIND: 10032 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring)) 10033 out = asciilib_rsplit( 10034 self, buf1, len1, buf2, len2, maxcount); 10035 else 10036 out = ucs1lib_rsplit( 10037 self, buf1, len1, buf2, len2, maxcount); 10038 break; 10039 case PyUnicode_2BYTE_KIND: 10040 out = ucs2lib_rsplit( 10041 self, buf1, len1, buf2, len2, maxcount); 10042 break; 10043 case PyUnicode_4BYTE_KIND: 10044 out = ucs4lib_rsplit( 10045 self, buf1, len1, buf2, len2, maxcount); 10046 break; 10047 default: 10048 out = NULL; 10049 } 10050 if (kind1 != kind) 10051 PyMem_Free(buf1); 10052 if (kind2 != kind) 10053 PyMem_Free(buf2); 10054 return out; 10055} 10056 10057static Py_ssize_t 10058anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1, 10059 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset) 10060{ 10061 switch(kind) { 10062 case PyUnicode_1BYTE_KIND: 10063 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2)) 10064 return asciilib_find(buf1, len1, buf2, len2, offset); 10065 else 10066 return ucs1lib_find(buf1, len1, buf2, len2, offset); 10067 
case PyUnicode_2BYTE_KIND: 10068 return ucs2lib_find(buf1, len1, buf2, len2, offset); 10069 case PyUnicode_4BYTE_KIND: 10070 return ucs4lib_find(buf1, len1, buf2, len2, offset); 10071 } 10072 assert(0); 10073 return -1; 10074} 10075 10076static Py_ssize_t 10077anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen, 10078 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount) 10079{ 10080 switch(kind) { 10081 case PyUnicode_1BYTE_KIND: 10082 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1)) 10083 return asciilib_count(sbuf, slen, buf1, len1, maxcount); 10084 else 10085 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount); 10086 case PyUnicode_2BYTE_KIND: 10087 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount); 10088 case PyUnicode_4BYTE_KIND: 10089 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount); 10090 } 10091 assert(0); 10092 return 0; 10093} 10094 10095static PyObject * 10096replace(PyObject *self, PyObject *str1, 10097 PyObject *str2, Py_ssize_t maxcount) 10098{ 10099 PyObject *u; 10100 char *sbuf = PyUnicode_DATA(self); 10101 char *buf1 = PyUnicode_DATA(str1); 10102 char *buf2 = PyUnicode_DATA(str2); 10103 int srelease = 0, release1 = 0, release2 = 0; 10104 int skind = PyUnicode_KIND(self); 10105 int kind1 = PyUnicode_KIND(str1); 10106 int kind2 = PyUnicode_KIND(str2); 10107 Py_ssize_t slen = PyUnicode_GET_LENGTH(self); 10108 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1); 10109 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2); 10110 int mayshrink; 10111 Py_UCS4 maxchar, maxchar_str2; 10112 10113 if (maxcount < 0) 10114 maxcount = PY_SSIZE_T_MAX; 10115 else if (maxcount == 0 || slen == 0) 10116 goto nothing; 10117 10118 if (str1 == str2) 10119 goto nothing; 10120 if (skind < kind1) 10121 /* substring too wide to be present */ 10122 goto nothing; 10123 10124 maxchar = PyUnicode_MAX_CHAR_VALUE(self); 10125 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2); 10126 /* Replacing str1 with str2 may cause a maxchar reduction in the 
10127 result string. */ 10128 mayshrink = (maxchar_str2 < maxchar); 10129 maxchar = Py_MAX(maxchar, maxchar_str2); 10130 10131 if (len1 == len2) { 10132 Py_ssize_t i; 10133 /* same length */ 10134 if (len1 == 0) 10135 goto nothing; 10136 if (len1 == 1) { 10137 /* replace characters */ 10138 Py_UCS4 u1, u2; 10139 int rkind; 10140 u1 = PyUnicode_READ_CHAR(str1, 0); 10141 if (findchar(sbuf, PyUnicode_KIND(self), 10142 slen, u1, 1) < 0) 10143 goto nothing; 10144 u2 = PyUnicode_READ_CHAR(str2, 0); 10145 u = PyUnicode_New(slen, maxchar); 10146 if (!u) 10147 goto error; 10148 copy_characters(u, 0, self, 0, slen); 10149 rkind = PyUnicode_KIND(u); 10150 for (i = 0; i < PyUnicode_GET_LENGTH(u); i++) 10151 if (PyUnicode_READ(rkind, PyUnicode_DATA(u), i) == u1) { 10152 if (--maxcount < 0) 10153 break; 10154 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), i, u2); 10155 } 10156 } 10157 else { 10158 int rkind = skind; 10159 char *res; 10160 10161 if (kind1 < rkind) { 10162 /* widen substring */ 10163 buf1 = _PyUnicode_AsKind(str1, rkind); 10164 if (!buf1) goto error; 10165 release1 = 1; 10166 } 10167 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0); 10168 if (i < 0) 10169 goto nothing; 10170 if (rkind > kind2) { 10171 /* widen replacement */ 10172 buf2 = _PyUnicode_AsKind(str2, rkind); 10173 if (!buf2) goto error; 10174 release2 = 1; 10175 } 10176 else if (rkind < kind2) { 10177 /* widen self and buf1 */ 10178 rkind = kind2; 10179 if (release1) PyMem_Free(buf1); 10180 sbuf = _PyUnicode_AsKind(self, rkind); 10181 if (!sbuf) goto error; 10182 srelease = 1; 10183 buf1 = _PyUnicode_AsKind(str1, rkind); 10184 if (!buf1) goto error; 10185 release1 = 1; 10186 } 10187 u = PyUnicode_New(slen, maxchar); 10188 if (!u) 10189 goto error; 10190 assert(PyUnicode_KIND(u) == rkind); 10191 res = PyUnicode_DATA(u); 10192 10193 memcpy(res, sbuf, rkind * slen); 10194 /* change everything in-place, starting with this one */ 10195 memcpy(res + rkind * i, 10196 buf2, 10197 rkind * len2); 10198 i 
+= len1; 10199 10200 while ( --maxcount > 0) { 10201 i = anylib_find(rkind, self, 10202 sbuf+rkind*i, slen-i, 10203 str1, buf1, len1, i); 10204 if (i == -1) 10205 break; 10206 memcpy(res + rkind * i, 10207 buf2, 10208 rkind * len2); 10209 i += len1; 10210 } 10211 } 10212 } 10213 else { 10214 Py_ssize_t n, i, j, ires; 10215 Py_ssize_t product, new_size; 10216 int rkind = skind; 10217 char *res; 10218 10219 if (kind1 < rkind) { 10220 /* widen substring */ 10221 buf1 = _PyUnicode_AsKind(str1, rkind); 10222 if (!buf1) goto error; 10223 release1 = 1; 10224 } 10225 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount); 10226 if (n == 0) 10227 goto nothing; 10228 if (kind2 < rkind) { 10229 /* widen replacement */ 10230 buf2 = _PyUnicode_AsKind(str2, rkind); 10231 if (!buf2) goto error; 10232 release2 = 1; 10233 } 10234 else if (kind2 > rkind) { 10235 /* widen self and buf1 */ 10236 rkind = kind2; 10237 sbuf = _PyUnicode_AsKind(self, rkind); 10238 if (!sbuf) goto error; 10239 srelease = 1; 10240 if (release1) PyMem_Free(buf1); 10241 buf1 = _PyUnicode_AsKind(str1, rkind); 10242 if (!buf1) goto error; 10243 release1 = 1; 10244 } 10245 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) - 10246 PyUnicode_GET_LENGTH(str1))); */ 10247 product = n * (len2-len1); 10248 if ((product / (len2-len1)) != n) { 10249 PyErr_SetString(PyExc_OverflowError, 10250 "replace string is too long"); 10251 goto error; 10252 } 10253 new_size = slen + product; 10254 if (new_size == 0) { 10255 Py_INCREF(unicode_empty); 10256 u = unicode_empty; 10257 goto done; 10258 } 10259 if (new_size < 0 || new_size > (PY_SSIZE_T_MAX >> (rkind-1))) { 10260 PyErr_SetString(PyExc_OverflowError, 10261 "replace string is too long"); 10262 goto error; 10263 } 10264 u = PyUnicode_New(new_size, maxchar); 10265 if (!u) 10266 goto error; 10267 assert(PyUnicode_KIND(u) == rkind); 10268 res = PyUnicode_DATA(u); 10269 ires = i = 0; 10270 if (len1 > 0) { 10271 while (n-- > 0) { 10272 /* 
look for next match */ 10273 j = anylib_find(rkind, self, 10274 sbuf + rkind * i, slen-i, 10275 str1, buf1, len1, i); 10276 if (j == -1) 10277 break; 10278 else if (j > i) { 10279 /* copy unchanged part [i:j] */ 10280 memcpy(res + rkind * ires, 10281 sbuf + rkind * i, 10282 rkind * (j-i)); 10283 ires += j - i; 10284 } 10285 /* copy substitution string */ 10286 if (len2 > 0) { 10287 memcpy(res + rkind * ires, 10288 buf2, 10289 rkind * len2); 10290 ires += len2; 10291 } 10292 i = j + len1; 10293 } 10294 if (i < slen) 10295 /* copy tail [i:] */ 10296 memcpy(res + rkind * ires, 10297 sbuf + rkind * i, 10298 rkind * (slen-i)); 10299 } 10300 else { 10301 /* interleave */ 10302 while (n > 0) { 10303 memcpy(res + rkind * ires, 10304 buf2, 10305 rkind * len2); 10306 ires += len2; 10307 if (--n <= 0) 10308 break; 10309 memcpy(res + rkind * ires, 10310 sbuf + rkind * i, 10311 rkind); 10312 ires++; 10313 i++; 10314 } 10315 memcpy(res + rkind * ires, 10316 sbuf + rkind * i, 10317 rkind * (slen-i)); 10318 } 10319 } 10320 10321 if (mayshrink) { 10322 unicode_adjust_maxchar(&u); 10323 if (u == NULL) 10324 goto error; 10325 } 10326 10327 done: 10328 if (srelease) 10329 PyMem_FREE(sbuf); 10330 if (release1) 10331 PyMem_FREE(buf1); 10332 if (release2) 10333 PyMem_FREE(buf2); 10334 assert(_PyUnicode_CheckConsistency(u, 1)); 10335 return u; 10336 10337 nothing: 10338 /* nothing to replace; return original string (when possible) */ 10339 if (srelease) 10340 PyMem_FREE(sbuf); 10341 if (release1) 10342 PyMem_FREE(buf1); 10343 if (release2) 10344 PyMem_FREE(buf2); 10345 if (PyUnicode_CheckExact(self)) { 10346 Py_INCREF(self); 10347 return self; 10348 } 10349 return PyUnicode_Copy(self); 10350 error: 10351 if (srelease && sbuf) 10352 PyMem_FREE(sbuf); 10353 if (release1 && buf1) 10354 PyMem_FREE(buf1); 10355 if (release2 && buf2) 10356 PyMem_FREE(buf2); 10357 return NULL; 10358} 10359 10360/* --- Unicode Object Methods --------------------------------------------- */ 10361 
10362PyDoc_STRVAR(title__doc__, 10363 "S.title() -> str\n\ 10364\n\ 10365Return a titlecased version of S, i.e. words start with title case\n\ 10366characters, all remaining cased characters have lower case."); 10367 10368static PyObject* 10369unicode_title(PyObject *self) 10370{ 10371 return fixup(self, fixtitle); 10372} 10373 10374PyDoc_STRVAR(capitalize__doc__, 10375 "S.capitalize() -> str\n\ 10376\n\ 10377Return a capitalized version of S, i.e. make the first character\n\ 10378have upper case and the rest lower case."); 10379 10380static PyObject* 10381unicode_capitalize(PyObject *self) 10382{ 10383 return fixup(self, fixcapitalize); 10384} 10385 10386#if 0 10387PyDoc_STRVAR(capwords__doc__, 10388 "S.capwords() -> str\n\ 10389\n\ 10390Apply .capitalize() to all words in S and return the result with\n\ 10391normalized whitespace (all whitespace strings are replaced by ' ')."); 10392 10393static PyObject* 10394unicode_capwords(PyObject *self) 10395{ 10396 PyObject *list; 10397 PyObject *item; 10398 Py_ssize_t i; 10399 10400 /* Split into words */ 10401 list = split(self, NULL, -1); 10402 if (!list) 10403 return NULL; 10404 10405 /* Capitalize each word */ 10406 for (i = 0; i < PyList_GET_SIZE(list); i++) { 10407 item = fixup(PyList_GET_ITEM(list, i), 10408 fixcapitalize); 10409 if (item == NULL) 10410 goto onError; 10411 Py_DECREF(PyList_GET_ITEM(list, i)); 10412 PyList_SET_ITEM(list, i, item); 10413 } 10414 10415 /* Join the words to form a new string */ 10416 item = PyUnicode_Join(NULL, list); 10417 10418 onError: 10419 Py_DECREF(list); 10420 return item; 10421} 10422#endif 10423 10424/* Argument converter. 
Coerces to a single unicode character */ 10425 10426static int 10427convert_uc(PyObject *obj, void *addr) 10428{ 10429 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr; 10430 PyObject *uniobj; 10431 10432 uniobj = PyUnicode_FromObject(obj); 10433 if (uniobj == NULL) { 10434 PyErr_SetString(PyExc_TypeError, 10435 "The fill character cannot be converted to Unicode"); 10436 return 0; 10437 } 10438 if (PyUnicode_GET_LENGTH(uniobj) != 1) { 10439 PyErr_SetString(PyExc_TypeError, 10440 "The fill character must be exactly one character long"); 10441 Py_DECREF(uniobj); 10442 return 0; 10443 } 10444 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0); 10445 Py_DECREF(uniobj); 10446 return 1; 10447} 10448 10449PyDoc_STRVAR(center__doc__, 10450 "S.center(width[, fillchar]) -> str\n\ 10451\n\ 10452Return S centered in a string of length width. Padding is\n\ 10453done using the specified fill character (default is a space)"); 10454 10455static PyObject * 10456unicode_center(PyObject *self, PyObject *args) 10457{ 10458 Py_ssize_t marg, left; 10459 Py_ssize_t width; 10460 Py_UCS4 fillchar = ' '; 10461 10462 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar)) 10463 return NULL; 10464 10465 if (PyUnicode_READY(self) == -1) 10466 return NULL; 10467 10468 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) { 10469 Py_INCREF(self); 10470 return self; 10471 } 10472 10473 marg = width - _PyUnicode_LENGTH(self); 10474 left = marg / 2 + (marg & width & 1); 10475 10476 return pad(self, left, marg - left, fillchar); 10477} 10478 10479/* This function assumes that str1 and str2 are readied by the caller. 
*/ 10480 10481static int 10482unicode_compare(PyObject *str1, PyObject *str2) 10483{ 10484 int kind1, kind2; 10485 void *data1, *data2; 10486 Py_ssize_t len1, len2, i; 10487 10488 kind1 = PyUnicode_KIND(str1); 10489 kind2 = PyUnicode_KIND(str2); 10490 data1 = PyUnicode_DATA(str1); 10491 data2 = PyUnicode_DATA(str2); 10492 len1 = PyUnicode_GET_LENGTH(str1); 10493 len2 = PyUnicode_GET_LENGTH(str2); 10494 10495 for (i = 0; i < len1 && i < len2; ++i) { 10496 Py_UCS4 c1, c2; 10497 c1 = PyUnicode_READ(kind1, data1, i); 10498 c2 = PyUnicode_READ(kind2, data2, i); 10499 10500 if (c1 != c2) 10501 return (c1 < c2) ? -1 : 1; 10502 } 10503 10504 return (len1 < len2) ? -1 : (len1 != len2); 10505} 10506 10507int 10508PyUnicode_Compare(PyObject *left, PyObject *right) 10509{ 10510 if (PyUnicode_Check(left) && PyUnicode_Check(right)) { 10511 if (PyUnicode_READY(left) == -1 || 10512 PyUnicode_READY(right) == -1) 10513 return -1; 10514 return unicode_compare(left, right); 10515 } 10516 PyErr_Format(PyExc_TypeError, 10517 "Can't compare %.100s and %.100s", 10518 left->ob_type->tp_name, 10519 right->ob_type->tp_name); 10520 return -1; 10521} 10522 10523int 10524PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str) 10525{ 10526 Py_ssize_t i; 10527 int kind; 10528 void *data; 10529 Py_UCS4 chr; 10530 10531 assert(_PyUnicode_CHECK(uni)); 10532 if (PyUnicode_READY(uni) == -1) 10533 return -1; 10534 kind = PyUnicode_KIND(uni); 10535 data = PyUnicode_DATA(uni); 10536 /* Compare Unicode string and source character set string */ 10537 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++) 10538 if (chr != str[i]) 10539 return (chr < (unsigned char)(str[i])) ? -1 : 1; 10540 /* This check keeps Python strings that end in '\0' from comparing equal 10541 to C strings identical up to that point. 
*/ 10542 if (PyUnicode_GET_LENGTH(uni) != i || chr) 10543 return 1; /* uni is longer */ 10544 if (str[i]) 10545 return -1; /* str is longer */ 10546 return 0; 10547} 10548 10549 10550#define TEST_COND(cond) \ 10551 ((cond) ? Py_True : Py_False) 10552 10553PyObject * 10554PyUnicode_RichCompare(PyObject *left, PyObject *right, int op) 10555{ 10556 int result; 10557 10558 if (PyUnicode_Check(left) && PyUnicode_Check(right)) { 10559 PyObject *v; 10560 if (PyUnicode_READY(left) == -1 || 10561 PyUnicode_READY(right) == -1) 10562 return NULL; 10563 if (PyUnicode_GET_LENGTH(left) != PyUnicode_GET_LENGTH(right) || 10564 PyUnicode_KIND(left) != PyUnicode_KIND(right)) { 10565 if (op == Py_EQ) { 10566 Py_INCREF(Py_False); 10567 return Py_False; 10568 } 10569 if (op == Py_NE) { 10570 Py_INCREF(Py_True); 10571 return Py_True; 10572 } 10573 } 10574 if (left == right) 10575 result = 0; 10576 else 10577 result = unicode_compare(left, right); 10578 10579 /* Convert the return value to a Boolean */ 10580 switch (op) { 10581 case Py_EQ: 10582 v = TEST_COND(result == 0); 10583 break; 10584 case Py_NE: 10585 v = TEST_COND(result != 0); 10586 break; 10587 case Py_LE: 10588 v = TEST_COND(result <= 0); 10589 break; 10590 case Py_GE: 10591 v = TEST_COND(result >= 0); 10592 break; 10593 case Py_LT: 10594 v = TEST_COND(result == -1); 10595 break; 10596 case Py_GT: 10597 v = TEST_COND(result == 1); 10598 break; 10599 default: 10600 PyErr_BadArgument(); 10601 return NULL; 10602 } 10603 Py_INCREF(v); 10604 return v; 10605 } 10606 10607 Py_RETURN_NOTIMPLEMENTED; 10608} 10609 10610int 10611PyUnicode_Contains(PyObject *container, PyObject *element) 10612{ 10613 PyObject *str, *sub; 10614 int kind1, kind2, kind; 10615 void *buf1, *buf2; 10616 Py_ssize_t len1, len2; 10617 int result; 10618 10619 /* Coerce the two arguments */ 10620 sub = PyUnicode_FromObject(element); 10621 if (!sub) { 10622 PyErr_Format(PyExc_TypeError, 10623 "'in <string>' requires string as left operand, not %s", 10624 
element->ob_type->tp_name); 10625 return -1; 10626 } 10627 if (PyUnicode_READY(sub) == -1) 10628 return -1; 10629 10630 str = PyUnicode_FromObject(container); 10631 if (!str || PyUnicode_READY(str) == -1) { 10632 Py_DECREF(sub); 10633 return -1; 10634 } 10635 10636 kind1 = PyUnicode_KIND(str); 10637 kind2 = PyUnicode_KIND(sub); 10638 kind = kind1 > kind2 ? kind1 : kind2; 10639 buf1 = PyUnicode_DATA(str); 10640 buf2 = PyUnicode_DATA(sub); 10641 if (kind1 != kind) 10642 buf1 = _PyUnicode_AsKind(str, kind); 10643 if (!buf1) { 10644 Py_DECREF(sub); 10645 return -1; 10646 } 10647 if (kind2 != kind) 10648 buf2 = _PyUnicode_AsKind(sub, kind); 10649 if (!buf2) { 10650 Py_DECREF(sub); 10651 if (kind1 != kind) PyMem_Free(buf1); 10652 return -1; 10653 } 10654 len1 = PyUnicode_GET_LENGTH(str); 10655 len2 = PyUnicode_GET_LENGTH(sub); 10656 10657 switch(kind) { 10658 case PyUnicode_1BYTE_KIND: 10659 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1; 10660 break; 10661 case PyUnicode_2BYTE_KIND: 10662 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1; 10663 break; 10664 case PyUnicode_4BYTE_KIND: 10665 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1; 10666 break; 10667 default: 10668 result = -1; 10669 assert(0); 10670 } 10671 10672 Py_DECREF(str); 10673 Py_DECREF(sub); 10674 10675 if (kind1 != kind) 10676 PyMem_Free(buf1); 10677 if (kind2 != kind) 10678 PyMem_Free(buf2); 10679 10680 return result; 10681} 10682 10683/* Concat to string or Unicode object giving a new Unicode object. 
*/ 10684 10685PyObject * 10686PyUnicode_Concat(PyObject *left, PyObject *right) 10687{ 10688 PyObject *u = NULL, *v = NULL, *w; 10689 Py_UCS4 maxchar, maxchar2; 10690 10691 /* Coerce the two arguments */ 10692 u = PyUnicode_FromObject(left); 10693 if (u == NULL) 10694 goto onError; 10695 v = PyUnicode_FromObject(right); 10696 if (v == NULL) 10697 goto onError; 10698 10699 /* Shortcuts */ 10700 if (v == unicode_empty) { 10701 Py_DECREF(v); 10702 return u; 10703 } 10704 if (u == unicode_empty) { 10705 Py_DECREF(u); 10706 return v; 10707 } 10708 10709 maxchar = PyUnicode_MAX_CHAR_VALUE(u); 10710 maxchar2 = PyUnicode_MAX_CHAR_VALUE(v); 10711 maxchar = Py_MAX(maxchar, maxchar2); 10712 10713 /* Concat the two Unicode strings */ 10714 w = PyUnicode_New( 10715 PyUnicode_GET_LENGTH(u) + PyUnicode_GET_LENGTH(v), 10716 maxchar); 10717 if (w == NULL) 10718 goto onError; 10719 copy_characters(w, 0, u, 0, PyUnicode_GET_LENGTH(u)); 10720 copy_characters(w, PyUnicode_GET_LENGTH(u), v, 0, PyUnicode_GET_LENGTH(v)); 10721 Py_DECREF(u); 10722 Py_DECREF(v); 10723 assert(_PyUnicode_CheckConsistency(w, 1)); 10724 return w; 10725 10726 onError: 10727 Py_XDECREF(u); 10728 Py_XDECREF(v); 10729 return NULL; 10730} 10731 10732static void 10733unicode_append_inplace(PyObject **p_left, PyObject *right) 10734{ 10735 Py_ssize_t left_len, right_len, new_len; 10736 10737 assert(PyUnicode_IS_READY(*p_left)); 10738 assert(PyUnicode_IS_READY(right)); 10739 10740 left_len = PyUnicode_GET_LENGTH(*p_left); 10741 right_len = PyUnicode_GET_LENGTH(right); 10742 if (left_len > PY_SSIZE_T_MAX - right_len) { 10743 PyErr_SetString(PyExc_OverflowError, 10744 "strings are too large to concat"); 10745 goto error; 10746 } 10747 new_len = left_len + right_len; 10748 10749 /* Now we own the last reference to 'left', so we can resize it 10750 * in-place. 
10751 */ 10752 if (unicode_resize(p_left, new_len) != 0) { 10753 /* XXX if _PyUnicode_Resize() fails, 'left' has been 10754 * deallocated so it cannot be put back into 10755 * 'variable'. The MemoryError is raised when there 10756 * is no value in 'variable', which might (very 10757 * remotely) be a cause of incompatibilities. 10758 */ 10759 goto error; 10760 } 10761 /* copy 'right' into the newly allocated area of 'left' */ 10762 copy_characters(*p_left, left_len, right, 0, right_len); 10763 _PyUnicode_DIRTY(*p_left); 10764 return; 10765 10766error: 10767 Py_DECREF(*p_left); 10768 *p_left = NULL; 10769} 10770 10771void 10772PyUnicode_Append(PyObject **p_left, PyObject *right) 10773{ 10774 PyObject *left, *res; 10775 10776 if (p_left == NULL) { 10777 if (!PyErr_Occurred()) 10778 PyErr_BadInternalCall(); 10779 return; 10780 } 10781 left = *p_left; 10782 if (right == NULL || !PyUnicode_Check(left)) { 10783 if (!PyErr_Occurred()) 10784 PyErr_BadInternalCall(); 10785 goto error; 10786 } 10787 10788 if (PyUnicode_READY(left)) 10789 goto error; 10790 if (PyUnicode_READY(right)) 10791 goto error; 10792 10793 if (PyUnicode_CheckExact(left) && left != unicode_empty 10794 && PyUnicode_CheckExact(right) && right != unicode_empty 10795 && unicode_resizable(left) 10796 && (_PyUnicode_KIND(right) <= _PyUnicode_KIND(left) 10797 || _PyUnicode_WSTR(left) != NULL)) 10798 { 10799 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires 10800 to change the structure size, but characters are stored just after 10801 the structure, and so it requires to move all characters which is 10802 not so different than duplicating the string. 
*/ 10803 if (!(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right))) 10804 { 10805 unicode_append_inplace(p_left, right); 10806 if (p_left != NULL) 10807 assert(_PyUnicode_CheckConsistency(*p_left, 1)); 10808 return; 10809 } 10810 } 10811 10812 res = PyUnicode_Concat(left, right); 10813 if (res == NULL) 10814 goto error; 10815 Py_DECREF(left); 10816 *p_left = res; 10817 return; 10818 10819error: 10820 Py_DECREF(*p_left); 10821 *p_left = NULL; 10822} 10823 10824void 10825PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right) 10826{ 10827 PyUnicode_Append(pleft, right); 10828 Py_XDECREF(right); 10829} 10830 10831PyDoc_STRVAR(count__doc__, 10832 "S.count(sub[, start[, end]]) -> int\n\ 10833\n\ 10834Return the number of non-overlapping occurrences of substring sub in\n\ 10835string S[start:end]. Optional arguments start and end are\n\ 10836interpreted as in slice notation."); 10837 10838static PyObject * 10839unicode_count(PyObject *self, PyObject *args) 10840{ 10841 PyObject *substring; 10842 Py_ssize_t start = 0; 10843 Py_ssize_t end = PY_SSIZE_T_MAX; 10844 PyObject *result; 10845 int kind1, kind2, kind; 10846 void *buf1, *buf2; 10847 Py_ssize_t len1, len2, iresult; 10848 10849 if (!stringlib_parse_args_finds_unicode("count", args, &substring, 10850 &start, &end)) 10851 return NULL; 10852 10853 kind1 = PyUnicode_KIND(self); 10854 kind2 = PyUnicode_KIND(substring); 10855 kind = kind1 > kind2 ? 
kind1 : kind2; 10856 buf1 = PyUnicode_DATA(self); 10857 buf2 = PyUnicode_DATA(substring); 10858 if (kind1 != kind) 10859 buf1 = _PyUnicode_AsKind(self, kind); 10860 if (!buf1) { 10861 Py_DECREF(substring); 10862 return NULL; 10863 } 10864 if (kind2 != kind) 10865 buf2 = _PyUnicode_AsKind(substring, kind); 10866 if (!buf2) { 10867 Py_DECREF(substring); 10868 if (kind1 != kind) PyMem_Free(buf1); 10869 return NULL; 10870 } 10871 len1 = PyUnicode_GET_LENGTH(self); 10872 len2 = PyUnicode_GET_LENGTH(substring); 10873 10874 ADJUST_INDICES(start, end, len1); 10875 switch(kind) { 10876 case PyUnicode_1BYTE_KIND: 10877 iresult = ucs1lib_count( 10878 ((Py_UCS1*)buf1) + start, end - start, 10879 buf2, len2, PY_SSIZE_T_MAX 10880 ); 10881 break; 10882 case PyUnicode_2BYTE_KIND: 10883 iresult = ucs2lib_count( 10884 ((Py_UCS2*)buf1) + start, end - start, 10885 buf2, len2, PY_SSIZE_T_MAX 10886 ); 10887 break; 10888 case PyUnicode_4BYTE_KIND: 10889 iresult = ucs4lib_count( 10890 ((Py_UCS4*)buf1) + start, end - start, 10891 buf2, len2, PY_SSIZE_T_MAX 10892 ); 10893 break; 10894 default: 10895 assert(0); iresult = 0; 10896 } 10897 10898 result = PyLong_FromSsize_t(iresult); 10899 10900 if (kind1 != kind) 10901 PyMem_Free(buf1); 10902 if (kind2 != kind) 10903 PyMem_Free(buf2); 10904 10905 Py_DECREF(substring); 10906 10907 return result; 10908} 10909 10910PyDoc_STRVAR(encode__doc__, 10911 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\ 10912\n\ 10913Encode S using the codec registered for encoding. Default encoding\n\ 10914is 'utf-8'. errors may be given to set a different error\n\ 10915handling scheme. Default is 'strict' meaning that encoding errors raise\n\ 10916a UnicodeEncodeError. 
Other possible values are 'ignore', 'replace' and\n\ 10917'xmlcharrefreplace' as well as any other name registered with\n\ 10918codecs.register_error that can handle UnicodeEncodeErrors."); 10919 10920static PyObject * 10921unicode_encode(PyObject *self, PyObject *args, PyObject *kwargs) 10922{ 10923 static char *kwlist[] = {"encoding", "errors", 0}; 10924 char *encoding = NULL; 10925 char *errors = NULL; 10926 10927 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode", 10928 kwlist, &encoding, &errors)) 10929 return NULL; 10930 return PyUnicode_AsEncodedString(self, encoding, errors); 10931} 10932 10933PyDoc_STRVAR(expandtabs__doc__, 10934 "S.expandtabs([tabsize]) -> str\n\ 10935\n\ 10936Return a copy of S where all tab characters are expanded using spaces.\n\ 10937If tabsize is not given, a tab size of 8 characters is assumed."); 10938 10939static PyObject* 10940unicode_expandtabs(PyObject *self, PyObject *args) 10941{ 10942 Py_ssize_t i, j, line_pos, src_len, incr; 10943 Py_UCS4 ch; 10944 PyObject *u; 10945 void *src_data, *dest_data; 10946 int tabsize = 8; 10947 int kind; 10948 int found; 10949 10950 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize)) 10951 return NULL; 10952 10953 if (PyUnicode_READY(self) == -1) 10954 return NULL; 10955 10956 /* First pass: determine size of output string */ 10957 src_len = PyUnicode_GET_LENGTH(self); 10958 i = j = line_pos = 0; 10959 kind = PyUnicode_KIND(self); 10960 src_data = PyUnicode_DATA(self); 10961 found = 0; 10962 for (; i < src_len; i++) { 10963 ch = PyUnicode_READ(kind, src_data, i); 10964 if (ch == '\t') { 10965 found = 1; 10966 if (tabsize > 0) { 10967 incr = tabsize - (line_pos % tabsize); /* cannot overflow */ 10968 if (j > PY_SSIZE_T_MAX - incr) 10969 goto overflow; 10970 line_pos += incr; 10971 j += incr; 10972 } 10973 } 10974 else { 10975 if (j > PY_SSIZE_T_MAX - 1) 10976 goto overflow; 10977 line_pos++; 10978 j++; 10979 if (ch == '\n' || ch == '\r') 10980 line_pos = 0; 10981 } 10982 } 10983 if 
(!found && PyUnicode_CheckExact(self)) { 10984 Py_INCREF(self); 10985 return self; 10986 } 10987 10988 /* Second pass: create output string and fill it */ 10989 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self)); 10990 if (!u) 10991 return NULL; 10992 dest_data = PyUnicode_DATA(u); 10993 10994 i = j = line_pos = 0; 10995 10996 for (; i < src_len; i++) { 10997 ch = PyUnicode_READ(kind, src_data, i); 10998 if (ch == '\t') { 10999 if (tabsize > 0) { 11000 incr = tabsize - (line_pos % tabsize); 11001 line_pos += incr; 11002 while (incr--) { 11003 PyUnicode_WRITE(kind, dest_data, j, ' '); 11004 j++; 11005 } 11006 } 11007 } 11008 else { 11009 line_pos++; 11010 PyUnicode_WRITE(kind, dest_data, j, ch); 11011 j++; 11012 if (ch == '\n' || ch == '\r') 11013 line_pos = 0; 11014 } 11015 } 11016 assert (j == PyUnicode_GET_LENGTH(u)); 11017#ifndef DONT_MAKE_RESULT_READY 11018 if (_PyUnicode_READY_REPLACE(&u)) { 11019 Py_DECREF(u); 11020 return NULL; 11021 } 11022#endif 11023 assert(_PyUnicode_CheckConsistency(u, 1)); 11024 return u; 11025 11026 overflow: 11027 PyErr_SetString(PyExc_OverflowError, "new string is too long"); 11028 return NULL; 11029} 11030 11031PyDoc_STRVAR(find__doc__, 11032 "S.find(sub[, start[, end]]) -> int\n\ 11033\n\ 11034Return the lowest index in S where substring sub is found,\n\ 11035such that sub is contained within S[start:end]. 
Optional\n\ 11036arguments start and end are interpreted as in slice notation.\n\ 11037\n\ 11038Return -1 on failure."); 11039 11040static PyObject * 11041unicode_find(PyObject *self, PyObject *args) 11042{ 11043 PyObject *substring; 11044 Py_ssize_t start; 11045 Py_ssize_t end; 11046 Py_ssize_t result; 11047 11048 if (!stringlib_parse_args_finds_unicode("find", args, &substring, 11049 &start, &end)) 11050 return NULL; 11051 11052 if (PyUnicode_READY(self) == -1) 11053 return NULL; 11054 if (PyUnicode_READY(substring) == -1) 11055 return NULL; 11056 11057 result = any_find_slice(1, self, substring, start, end); 11058 11059 Py_DECREF(substring); 11060 11061 if (result == -2) 11062 return NULL; 11063 11064 return PyLong_FromSsize_t(result); 11065} 11066 11067static PyObject * 11068unicode_getitem(PyObject *self, Py_ssize_t index) 11069{ 11070 Py_UCS4 ch = PyUnicode_ReadChar(self, index); 11071 if (ch == (Py_UCS4)-1) 11072 return NULL; 11073 return PyUnicode_FromOrdinal(ch); 11074} 11075 11076/* Believe it or not, this produces the same value for ASCII strings 11077 as bytes_hash(). */ 11078static Py_hash_t 11079unicode_hash(PyObject *self) 11080{ 11081 Py_ssize_t len; 11082 Py_uhash_t x; 11083 11084 if (_PyUnicode_HASH(self) != -1) 11085 return _PyUnicode_HASH(self); 11086 if (PyUnicode_READY(self) == -1) 11087 return -1; 11088 len = PyUnicode_GET_LENGTH(self); 11089 11090 /* The hash function as a macro, gets expanded three times below. 
*/ 11091#define HASH(P) \ 11092 x = (Py_uhash_t)*P << 7; \ 11093 while (--len >= 0) \ 11094 x = (1000003*x) ^ (Py_uhash_t)*P++; 11095 11096 switch (PyUnicode_KIND(self)) { 11097 case PyUnicode_1BYTE_KIND: { 11098 const unsigned char *c = PyUnicode_1BYTE_DATA(self); 11099 HASH(c); 11100 break; 11101 } 11102 case PyUnicode_2BYTE_KIND: { 11103 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self); 11104 HASH(s); 11105 break; 11106 } 11107 default: { 11108 Py_UCS4 *l; 11109 assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND && 11110 "Impossible switch case in unicode_hash"); 11111 l = PyUnicode_4BYTE_DATA(self); 11112 HASH(l); 11113 break; 11114 } 11115 } 11116 x ^= (Py_uhash_t)PyUnicode_GET_LENGTH(self); 11117 11118 if (x == -1) 11119 x = -2; 11120 _PyUnicode_HASH(self) = x; 11121 return x; 11122} 11123#undef HASH 11124 11125PyDoc_STRVAR(index__doc__, 11126 "S.index(sub[, start[, end]]) -> int\n\ 11127\n\ 11128Like S.find() but raise ValueError when the substring is not found."); 11129 11130static PyObject * 11131unicode_index(PyObject *self, PyObject *args) 11132{ 11133 Py_ssize_t result; 11134 PyObject *substring; 11135 Py_ssize_t start; 11136 Py_ssize_t end; 11137 11138 if (!stringlib_parse_args_finds_unicode("index", args, &substring, 11139 &start, &end)) 11140 return NULL; 11141 11142 if (PyUnicode_READY(self) == -1) 11143 return NULL; 11144 if (PyUnicode_READY(substring) == -1) 11145 return NULL; 11146 11147 result = any_find_slice(1, self, substring, start, end); 11148 11149 Py_DECREF(substring); 11150 11151 if (result == -2) 11152 return NULL; 11153 11154 if (result < 0) { 11155 PyErr_SetString(PyExc_ValueError, "substring not found"); 11156 return NULL; 11157 } 11158 11159 return PyLong_FromSsize_t(result); 11160} 11161 11162PyDoc_STRVAR(islower__doc__, 11163 "S.islower() -> bool\n\ 11164\n\ 11165Return True if all cased characters in S are lowercase and there is\n\ 11166at least one cased character in S, False otherwise."); 11167 11168static PyObject* 
11169unicode_islower(PyObject *self) 11170{ 11171 Py_ssize_t i, length; 11172 int kind; 11173 void *data; 11174 int cased; 11175 11176 if (PyUnicode_READY(self) == -1) 11177 return NULL; 11178 length = PyUnicode_GET_LENGTH(self); 11179 kind = PyUnicode_KIND(self); 11180 data = PyUnicode_DATA(self); 11181 11182 /* Shortcut for single character strings */ 11183 if (length == 1) 11184 return PyBool_FromLong( 11185 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0))); 11186 11187 /* Special case for empty strings */ 11188 if (length == 0) 11189 return PyBool_FromLong(0); 11190 11191 cased = 0; 11192 for (i = 0; i < length; i++) { 11193 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); 11194 11195 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) 11196 return PyBool_FromLong(0); 11197 else if (!cased && Py_UNICODE_ISLOWER(ch)) 11198 cased = 1; 11199 } 11200 return PyBool_FromLong(cased); 11201} 11202 11203PyDoc_STRVAR(isupper__doc__, 11204 "S.isupper() -> bool\n\ 11205\n\ 11206Return True if all cased characters in S are uppercase and there is\n\ 11207at least one cased character in S, False otherwise."); 11208 11209static PyObject* 11210unicode_isupper(PyObject *self) 11211{ 11212 Py_ssize_t i, length; 11213 int kind; 11214 void *data; 11215 int cased; 11216 11217 if (PyUnicode_READY(self) == -1) 11218 return NULL; 11219 length = PyUnicode_GET_LENGTH(self); 11220 kind = PyUnicode_KIND(self); 11221 data = PyUnicode_DATA(self); 11222 11223 /* Shortcut for single character strings */ 11224 if (length == 1) 11225 return PyBool_FromLong( 11226 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0); 11227 11228 /* Special case for empty strings */ 11229 if (length == 0) 11230 return PyBool_FromLong(0); 11231 11232 cased = 0; 11233 for (i = 0; i < length; i++) { 11234 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); 11235 11236 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch)) 11237 return PyBool_FromLong(0); 11238 else if (!cased && Py_UNICODE_ISUPPER(ch)) 11239 
cased = 1; 11240 } 11241 return PyBool_FromLong(cased); 11242} 11243 11244PyDoc_STRVAR(istitle__doc__, 11245 "S.istitle() -> bool\n\ 11246\n\ 11247Return True if S is a titlecased string and there is at least one\n\ 11248character in S, i.e. upper- and titlecase characters may only\n\ 11249follow uncased characters and lowercase characters only cased ones.\n\ 11250Return False otherwise."); 11251 11252static PyObject* 11253unicode_istitle(PyObject *self) 11254{ 11255 Py_ssize_t i, length; 11256 int kind; 11257 void *data; 11258 int cased, previous_is_cased; 11259 11260 if (PyUnicode_READY(self) == -1) 11261 return NULL; 11262 length = PyUnicode_GET_LENGTH(self); 11263 kind = PyUnicode_KIND(self); 11264 data = PyUnicode_DATA(self); 11265 11266 /* Shortcut for single character strings */ 11267 if (length == 1) { 11268 Py_UCS4 ch = PyUnicode_READ(kind, data, 0); 11269 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) || 11270 (Py_UNICODE_ISUPPER(ch) != 0)); 11271 } 11272 11273 /* Special case for empty strings */ 11274 if (length == 0) 11275 return PyBool_FromLong(0); 11276 11277 cased = 0; 11278 previous_is_cased = 0; 11279 for (i = 0; i < length; i++) { 11280 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); 11281 11282 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) { 11283 if (previous_is_cased) 11284 return PyBool_FromLong(0); 11285 previous_is_cased = 1; 11286 cased = 1; 11287 } 11288 else if (Py_UNICODE_ISLOWER(ch)) { 11289 if (!previous_is_cased) 11290 return PyBool_FromLong(0); 11291 previous_is_cased = 1; 11292 cased = 1; 11293 } 11294 else 11295 previous_is_cased = 0; 11296 } 11297 return PyBool_FromLong(cased); 11298} 11299 11300PyDoc_STRVAR(isspace__doc__, 11301 "S.isspace() -> bool\n\ 11302\n\ 11303Return True if all characters in S are whitespace\n\ 11304and there is at least one character in S, False otherwise."); 11305 11306static PyObject* 11307unicode_isspace(PyObject *self) 11308{ 11309 Py_ssize_t i, length; 11310 int kind; 11311 void 
*data; 11312 11313 if (PyUnicode_READY(self) == -1) 11314 return NULL; 11315 length = PyUnicode_GET_LENGTH(self); 11316 kind = PyUnicode_KIND(self); 11317 data = PyUnicode_DATA(self); 11318 11319 /* Shortcut for single character strings */ 11320 if (length == 1) 11321 return PyBool_FromLong( 11322 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0))); 11323 11324 /* Special case for empty strings */ 11325 if (length == 0) 11326 return PyBool_FromLong(0); 11327 11328 for (i = 0; i < length; i++) { 11329 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); 11330 if (!Py_UNICODE_ISSPACE(ch)) 11331 return PyBool_FromLong(0); 11332 } 11333 return PyBool_FromLong(1); 11334} 11335 11336PyDoc_STRVAR(isalpha__doc__, 11337 "S.isalpha() -> bool\n\ 11338\n\ 11339Return True if all characters in S are alphabetic\n\ 11340and there is at least one character in S, False otherwise."); 11341 11342static PyObject* 11343unicode_isalpha(PyObject *self) 11344{ 11345 Py_ssize_t i, length; 11346 int kind; 11347 void *data; 11348 11349 if (PyUnicode_READY(self) == -1) 11350 return NULL; 11351 length = PyUnicode_GET_LENGTH(self); 11352 kind = PyUnicode_KIND(self); 11353 data = PyUnicode_DATA(self); 11354 11355 /* Shortcut for single character strings */ 11356 if (length == 1) 11357 return PyBool_FromLong( 11358 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0))); 11359 11360 /* Special case for empty strings */ 11361 if (length == 0) 11362 return PyBool_FromLong(0); 11363 11364 for (i = 0; i < length; i++) { 11365 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i))) 11366 return PyBool_FromLong(0); 11367 } 11368 return PyBool_FromLong(1); 11369} 11370 11371PyDoc_STRVAR(isalnum__doc__, 11372 "S.isalnum() -> bool\n\ 11373\n\ 11374Return True if all characters in S are alphanumeric\n\ 11375and there is at least one character in S, False otherwise."); 11376 11377static PyObject* 11378unicode_isalnum(PyObject *self) 11379{ 11380 int kind; 11381 void *data; 11382 Py_ssize_t len, i; 11383 11384 if 
(PyUnicode_READY(self) == -1) 11385 return NULL; 11386 11387 kind = PyUnicode_KIND(self); 11388 data = PyUnicode_DATA(self); 11389 len = PyUnicode_GET_LENGTH(self); 11390 11391 /* Shortcut for single character strings */ 11392 if (len == 1) { 11393 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0); 11394 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch)); 11395 } 11396 11397 /* Special case for empty strings */ 11398 if (len == 0) 11399 return PyBool_FromLong(0); 11400 11401 for (i = 0; i < len; i++) { 11402 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); 11403 if (!Py_UNICODE_ISALNUM(ch)) 11404 return PyBool_FromLong(0); 11405 } 11406 return PyBool_FromLong(1); 11407} 11408 11409PyDoc_STRVAR(isdecimal__doc__, 11410 "S.isdecimal() -> bool\n\ 11411\n\ 11412Return True if there are only decimal characters in S,\n\ 11413False otherwise."); 11414 11415static PyObject* 11416unicode_isdecimal(PyObject *self) 11417{ 11418 Py_ssize_t i, length; 11419 int kind; 11420 void *data; 11421 11422 if (PyUnicode_READY(self) == -1) 11423 return NULL; 11424 length = PyUnicode_GET_LENGTH(self); 11425 kind = PyUnicode_KIND(self); 11426 data = PyUnicode_DATA(self); 11427 11428 /* Shortcut for single character strings */ 11429 if (length == 1) 11430 return PyBool_FromLong( 11431 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0))); 11432 11433 /* Special case for empty strings */ 11434 if (length == 0) 11435 return PyBool_FromLong(0); 11436 11437 for (i = 0; i < length; i++) { 11438 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i))) 11439 return PyBool_FromLong(0); 11440 } 11441 return PyBool_FromLong(1); 11442} 11443 11444PyDoc_STRVAR(isdigit__doc__, 11445 "S.isdigit() -> bool\n\ 11446\n\ 11447Return True if all characters in S are digits\n\ 11448and there is at least one character in S, False otherwise."); 11449 11450static PyObject* 11451unicode_isdigit(PyObject *self) 11452{ 11453 Py_ssize_t i, length; 11454 int kind; 11455 void *data; 11456 11457 if (PyUnicode_READY(self) == -1) 
11458 return NULL; 11459 length = PyUnicode_GET_LENGTH(self); 11460 kind = PyUnicode_KIND(self); 11461 data = PyUnicode_DATA(self); 11462 11463 /* Shortcut for single character strings */ 11464 if (length == 1) { 11465 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0); 11466 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch)); 11467 } 11468 11469 /* Special case for empty strings */ 11470 if (length == 0) 11471 return PyBool_FromLong(0); 11472 11473 for (i = 0; i < length; i++) { 11474 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i))) 11475 return PyBool_FromLong(0); 11476 } 11477 return PyBool_FromLong(1); 11478} 11479 11480PyDoc_STRVAR(isnumeric__doc__, 11481 "S.isnumeric() -> bool\n\ 11482\n\ 11483Return True if there are only numeric characters in S,\n\ 11484False otherwise."); 11485 11486static PyObject* 11487unicode_isnumeric(PyObject *self) 11488{ 11489 Py_ssize_t i, length; 11490 int kind; 11491 void *data; 11492 11493 if (PyUnicode_READY(self) == -1) 11494 return NULL; 11495 length = PyUnicode_GET_LENGTH(self); 11496 kind = PyUnicode_KIND(self); 11497 data = PyUnicode_DATA(self); 11498 11499 /* Shortcut for single character strings */ 11500 if (length == 1) 11501 return PyBool_FromLong( 11502 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0))); 11503 11504 /* Special case for empty strings */ 11505 if (length == 0) 11506 return PyBool_FromLong(0); 11507 11508 for (i = 0; i < length; i++) { 11509 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i))) 11510 return PyBool_FromLong(0); 11511 } 11512 return PyBool_FromLong(1); 11513} 11514 11515int 11516PyUnicode_IsIdentifier(PyObject *self) 11517{ 11518 int kind; 11519 void *data; 11520 Py_ssize_t i; 11521 Py_UCS4 first; 11522 11523 if (PyUnicode_READY(self) == -1) { 11524 Py_FatalError("identifier not ready"); 11525 return 0; 11526 } 11527 11528 /* Special case for empty strings */ 11529 if (PyUnicode_GET_LENGTH(self) == 0) 11530 return 0; 11531 kind = PyUnicode_KIND(self); 11532 data = 
PyUnicode_DATA(self); 11533 11534 /* PEP 3131 says that the first character must be in 11535 XID_Start and subsequent characters in XID_Continue, 11536 and for the ASCII range, the 2.x rules apply (i.e 11537 start with letters and underscore, continue with 11538 letters, digits, underscore). However, given the current 11539 definition of XID_Start and XID_Continue, it is sufficient 11540 to check just for these, except that _ must be allowed 11541 as starting an identifier. */ 11542 first = PyUnicode_READ(kind, data, 0); 11543 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */) 11544 return 0; 11545 11546 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++) 11547 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i))) 11548 return 0; 11549 return 1; 11550} 11551 11552PyDoc_STRVAR(isidentifier__doc__, 11553 "S.isidentifier() -> bool\n\ 11554\n\ 11555Return True if S is a valid identifier according\n\ 11556to the language definition."); 11557 11558static PyObject* 11559unicode_isidentifier(PyObject *self) 11560{ 11561 return PyBool_FromLong(PyUnicode_IsIdentifier(self)); 11562} 11563 11564PyDoc_STRVAR(isprintable__doc__, 11565 "S.isprintable() -> bool\n\ 11566\n\ 11567Return True if all characters in S are considered\n\ 11568printable in repr() or S is empty, False otherwise."); 11569 11570static PyObject* 11571unicode_isprintable(PyObject *self) 11572{ 11573 Py_ssize_t i, length; 11574 int kind; 11575 void *data; 11576 11577 if (PyUnicode_READY(self) == -1) 11578 return NULL; 11579 length = PyUnicode_GET_LENGTH(self); 11580 kind = PyUnicode_KIND(self); 11581 data = PyUnicode_DATA(self); 11582 11583 /* Shortcut for single character strings */ 11584 if (length == 1) 11585 return PyBool_FromLong( 11586 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0))); 11587 11588 for (i = 0; i < length; i++) { 11589 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) { 11590 Py_RETURN_FALSE; 11591 } 11592 } 11593 Py_RETURN_TRUE; 11594} 11595 
11596PyDoc_STRVAR(join__doc__, 11597 "S.join(iterable) -> str\n\ 11598\n\ 11599Return a string which is the concatenation of the strings in the\n\ 11600iterable. The separator between elements is S."); 11601 11602static PyObject* 11603unicode_join(PyObject *self, PyObject *data) 11604{ 11605 return PyUnicode_Join(self, data); 11606} 11607 11608static Py_ssize_t 11609unicode_length(PyObject *self) 11610{ 11611 if (PyUnicode_READY(self) == -1) 11612 return -1; 11613 return PyUnicode_GET_LENGTH(self); 11614} 11615 11616PyDoc_STRVAR(ljust__doc__, 11617 "S.ljust(width[, fillchar]) -> str\n\ 11618\n\ 11619Return S left-justified in a Unicode string of length width. Padding is\n\ 11620done using the specified fill character (default is a space)."); 11621 11622static PyObject * 11623unicode_ljust(PyObject *self, PyObject *args) 11624{ 11625 Py_ssize_t width; 11626 Py_UCS4 fillchar = ' '; 11627 11628 if (PyUnicode_READY(self) == -1) 11629 return NULL; 11630 11631 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar)) 11632 return NULL; 11633 11634 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) { 11635 Py_INCREF(self); 11636 return self; 11637 } 11638 11639 return pad(self, 0, width - _PyUnicode_LENGTH(self), fillchar); 11640} 11641 11642PyDoc_STRVAR(lower__doc__, 11643 "S.lower() -> str\n\ 11644\n\ 11645Return a copy of the string S converted to lowercase."); 11646 11647static PyObject* 11648unicode_lower(PyObject *self) 11649{ 11650 return fixup(self, fixlower); 11651} 11652 11653#define LEFTSTRIP 0 11654#define RIGHTSTRIP 1 11655#define BOTHSTRIP 2 11656 11657/* Arrays indexed by above */ 11658static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"}; 11659 11660#define STRIPNAME(i) (stripformat[i]+3) 11661 11662/* externally visible for str.strip(unicode) */ 11663PyObject * 11664_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj) 11665{ 11666 void *data; 11667 int kind; 11668 Py_ssize_t i, j, len; 
11669 BLOOM_MASK sepmask; 11670 11671 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1) 11672 return NULL; 11673 11674 kind = PyUnicode_KIND(self); 11675 data = PyUnicode_DATA(self); 11676 len = PyUnicode_GET_LENGTH(self); 11677 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj), 11678 PyUnicode_DATA(sepobj), 11679 PyUnicode_GET_LENGTH(sepobj)); 11680 11681 i = 0; 11682 if (striptype != RIGHTSTRIP) { 11683 while (i < len && 11684 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) { 11685 i++; 11686 } 11687 } 11688 11689 j = len; 11690 if (striptype != LEFTSTRIP) { 11691 do { 11692 j--; 11693 } while (j >= i && 11694 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj)); 11695 j++; 11696 } 11697 11698 return PyUnicode_Substring(self, i, j); 11699} 11700 11701PyObject* 11702PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end) 11703{ 11704 unsigned char *data; 11705 int kind; 11706 Py_ssize_t length; 11707 11708 if (PyUnicode_READY(self) == -1) 11709 return NULL; 11710 11711 end = Py_MIN(end, PyUnicode_GET_LENGTH(self)); 11712 11713 if (start == 0 && end == PyUnicode_GET_LENGTH(self)) 11714 { 11715 if (PyUnicode_CheckExact(self)) { 11716 Py_INCREF(self); 11717 return self; 11718 } 11719 else 11720 return PyUnicode_Copy(self); 11721 } 11722 11723 length = end - start; 11724 if (length == 1) 11725 return unicode_getitem(self, start); 11726 11727 if (start < 0 || end < 0) { 11728 PyErr_SetString(PyExc_IndexError, "string index out of range"); 11729 return NULL; 11730 } 11731 11732 if (PyUnicode_IS_ASCII(self)) { 11733 kind = PyUnicode_KIND(self); 11734 data = PyUnicode_1BYTE_DATA(self); 11735 return unicode_fromascii(data + start, length); 11736 } 11737 else { 11738 kind = PyUnicode_KIND(self); 11739 data = PyUnicode_1BYTE_DATA(self); 11740 return PyUnicode_FromKindAndData(kind, 11741 data + kind * start, 11742 length); 11743 } 11744} 11745 11746static PyObject * 11747do_strip(PyObject *self, int striptype) 11748{ 11749 
int kind; 11750 void *data; 11751 Py_ssize_t len, i, j; 11752 11753 if (PyUnicode_READY(self) == -1) 11754 return NULL; 11755 11756 kind = PyUnicode_KIND(self); 11757 data = PyUnicode_DATA(self); 11758 len = PyUnicode_GET_LENGTH(self); 11759 11760 i = 0; 11761 if (striptype != RIGHTSTRIP) { 11762 while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) { 11763 i++; 11764 } 11765 } 11766 11767 j = len; 11768 if (striptype != LEFTSTRIP) { 11769 do { 11770 j--; 11771 } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j))); 11772 j++; 11773 } 11774 11775 return PyUnicode_Substring(self, i, j); 11776} 11777 11778 11779static PyObject * 11780do_argstrip(PyObject *self, int striptype, PyObject *args) 11781{ 11782 PyObject *sep = NULL; 11783 11784 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep)) 11785 return NULL; 11786 11787 if (sep != NULL && sep != Py_None) { 11788 if (PyUnicode_Check(sep)) 11789 return _PyUnicode_XStrip(self, striptype, sep); 11790 else { 11791 PyErr_Format(PyExc_TypeError, 11792 "%s arg must be None or str", 11793 STRIPNAME(striptype)); 11794 return NULL; 11795 } 11796 } 11797 11798 return do_strip(self, striptype); 11799} 11800 11801 11802PyDoc_STRVAR(strip__doc__, 11803 "S.strip([chars]) -> str\n\ 11804\n\ 11805Return a copy of the string S with leading and trailing\n\ 11806whitespace removed.\n\ 11807If chars is given and not None, remove characters in chars instead."); 11808 11809static PyObject * 11810unicode_strip(PyObject *self, PyObject *args) 11811{ 11812 if (PyTuple_GET_SIZE(args) == 0) 11813 return do_strip(self, BOTHSTRIP); /* Common case */ 11814 else 11815 return do_argstrip(self, BOTHSTRIP, args); 11816} 11817 11818 11819PyDoc_STRVAR(lstrip__doc__, 11820 "S.lstrip([chars]) -> str\n\ 11821\n\ 11822Return a copy of the string S with leading whitespace removed.\n\ 11823If chars is given and not None, remove characters in chars instead."); 11824 11825static PyObject * 11826unicode_lstrip(PyObject 
*self, PyObject *args) 11827{ 11828 if (PyTuple_GET_SIZE(args) == 0) 11829 return do_strip(self, LEFTSTRIP); /* Common case */ 11830 else 11831 return do_argstrip(self, LEFTSTRIP, args); 11832} 11833 11834 11835PyDoc_STRVAR(rstrip__doc__, 11836 "S.rstrip([chars]) -> str\n\ 11837\n\ 11838Return a copy of the string S with trailing whitespace removed.\n\ 11839If chars is given and not None, remove characters in chars instead."); 11840 11841static PyObject * 11842unicode_rstrip(PyObject *self, PyObject *args) 11843{ 11844 if (PyTuple_GET_SIZE(args) == 0) 11845 return do_strip(self, RIGHTSTRIP); /* Common case */ 11846 else 11847 return do_argstrip(self, RIGHTSTRIP, args); 11848} 11849 11850 11851static PyObject* 11852unicode_repeat(PyObject *str, Py_ssize_t len) 11853{ 11854 PyObject *u; 11855 Py_ssize_t nchars, n; 11856 11857 if (len < 1) { 11858 Py_INCREF(unicode_empty); 11859 return unicode_empty; 11860 } 11861 11862 if (len == 1 && PyUnicode_CheckExact(str)) { 11863 /* no repeat, return original string */ 11864 Py_INCREF(str); 11865 return str; 11866 } 11867 11868 if (PyUnicode_READY(str) == -1) 11869 return NULL; 11870 11871 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) { 11872 PyErr_SetString(PyExc_OverflowError, 11873 "repeated string is too long"); 11874 return NULL; 11875 } 11876 nchars = len * PyUnicode_GET_LENGTH(str); 11877 11878 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str)); 11879 if (!u) 11880 return NULL; 11881 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str)); 11882 11883 if (PyUnicode_GET_LENGTH(str) == 1) { 11884 const int kind = PyUnicode_KIND(str); 11885 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0); 11886 void *to = PyUnicode_DATA(u); 11887 if (kind == PyUnicode_1BYTE_KIND) 11888 memset(to, (unsigned char)fill_char, len); 11889 else { 11890 for (n = 0; n < len; ++n) 11891 PyUnicode_WRITE(kind, to, n, fill_char); 11892 } 11893 } 11894 else { 11895 /* number of characters copied this far */ 11896 Py_ssize_t 
done = PyUnicode_GET_LENGTH(str); 11897 const Py_ssize_t char_size = PyUnicode_KIND(str); 11898 char *to = (char *) PyUnicode_DATA(u); 11899 Py_MEMCPY(to, PyUnicode_DATA(str), 11900 PyUnicode_GET_LENGTH(str) * char_size); 11901 while (done < nchars) { 11902 n = (done <= nchars-done) ? done : nchars-done; 11903 Py_MEMCPY(to + (done * char_size), to, n * char_size); 11904 done += n; 11905 } 11906 } 11907 11908 assert(_PyUnicode_CheckConsistency(u, 1)); 11909 return u; 11910} 11911 11912PyObject * 11913PyUnicode_Replace(PyObject *obj, 11914 PyObject *subobj, 11915 PyObject *replobj, 11916 Py_ssize_t maxcount) 11917{ 11918 PyObject *self; 11919 PyObject *str1; 11920 PyObject *str2; 11921 PyObject *result; 11922 11923 self = PyUnicode_FromObject(obj); 11924 if (self == NULL || PyUnicode_READY(self) == -1) 11925 return NULL; 11926 str1 = PyUnicode_FromObject(subobj); 11927 if (str1 == NULL || PyUnicode_READY(str1) == -1) { 11928 Py_DECREF(self); 11929 return NULL; 11930 } 11931 str2 = PyUnicode_FromObject(replobj); 11932 if (str2 == NULL || PyUnicode_READY(str2)) { 11933 Py_DECREF(self); 11934 Py_DECREF(str1); 11935 return NULL; 11936 } 11937 result = replace(self, str1, str2, maxcount); 11938 Py_DECREF(self); 11939 Py_DECREF(str1); 11940 Py_DECREF(str2); 11941 return result; 11942} 11943 11944PyDoc_STRVAR(replace__doc__, 11945 "S.replace(old, new[, count]) -> str\n\ 11946\n\ 11947Return a copy of S with all occurrences of substring\n\ 11948old replaced by new. 
If the optional argument count is\n\ 11949given, only the first count occurrences are replaced."); 11950 11951static PyObject* 11952unicode_replace(PyObject *self, PyObject *args) 11953{ 11954 PyObject *str1; 11955 PyObject *str2; 11956 Py_ssize_t maxcount = -1; 11957 PyObject *result; 11958 11959 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount)) 11960 return NULL; 11961 if (!PyUnicode_READY(self) == -1) 11962 return NULL; 11963 str1 = PyUnicode_FromObject(str1); 11964 if (str1 == NULL || PyUnicode_READY(str1) == -1) 11965 return NULL; 11966 str2 = PyUnicode_FromObject(str2); 11967 if (str2 == NULL || PyUnicode_READY(str2) == -1) { 11968 Py_DECREF(str1); 11969 return NULL; 11970 } 11971 11972 result = replace(self, str1, str2, maxcount); 11973 11974 Py_DECREF(str1); 11975 Py_DECREF(str2); 11976 return result; 11977} 11978 11979static PyObject * 11980unicode_repr(PyObject *unicode) 11981{ 11982 PyObject *repr; 11983 Py_ssize_t isize; 11984 Py_ssize_t osize, squote, dquote, i, o; 11985 Py_UCS4 max, quote; 11986 int ikind, okind; 11987 void *idata, *odata; 11988 11989 if (PyUnicode_READY(unicode) == -1) 11990 return NULL; 11991 11992 isize = PyUnicode_GET_LENGTH(unicode); 11993 idata = PyUnicode_DATA(unicode); 11994 11995 /* Compute length of output, quote characters, and 11996 maximum character */ 11997 osize = 2; /* quotes */ 11998 max = 127; 11999 squote = dquote = 0; 12000 ikind = PyUnicode_KIND(unicode); 12001 for (i = 0; i < isize; i++) { 12002 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i); 12003 switch (ch) { 12004 case '\'': squote++; osize++; break; 12005 case '"': dquote++; osize++; break; 12006 case '\\': case '\t': case '\r': case '\n': 12007 osize += 2; break; 12008 default: 12009 /* Fast-path ASCII */ 12010 if (ch < ' ' || ch == 0x7f) 12011 osize += 4; /* \xHH */ 12012 else if (ch < 0x7f) 12013 osize++; 12014 else if (Py_UNICODE_ISPRINTABLE(ch)) { 12015 osize++; 12016 max = ch > max ? 
ch : max; 12017 } 12018 else if (ch < 0x100) 12019 osize += 4; /* \xHH */ 12020 else if (ch < 0x10000) 12021 osize += 6; /* \uHHHH */ 12022 else 12023 osize += 10; /* \uHHHHHHHH */ 12024 } 12025 } 12026 12027 quote = '\''; 12028 if (squote) { 12029 if (dquote) 12030 /* Both squote and dquote present. Use squote, 12031 and escape them */ 12032 osize += squote; 12033 else 12034 quote = '"'; 12035 } 12036 12037 repr = PyUnicode_New(osize, max); 12038 if (repr == NULL) 12039 return NULL; 12040 okind = PyUnicode_KIND(repr); 12041 odata = PyUnicode_DATA(repr); 12042 12043 PyUnicode_WRITE(okind, odata, 0, quote); 12044 PyUnicode_WRITE(okind, odata, osize-1, quote); 12045 12046 for (i = 0, o = 1; i < isize; i++) { 12047 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i); 12048 12049 /* Escape quotes and backslashes */ 12050 if ((ch == quote) || (ch == '\\')) { 12051 PyUnicode_WRITE(okind, odata, o++, '\\'); 12052 PyUnicode_WRITE(okind, odata, o++, ch); 12053 continue; 12054 } 12055 12056 /* Map special whitespace to '\t', \n', '\r' */ 12057 if (ch == '\t') { 12058 PyUnicode_WRITE(okind, odata, o++, '\\'); 12059 PyUnicode_WRITE(okind, odata, o++, 't'); 12060 } 12061 else if (ch == '\n') { 12062 PyUnicode_WRITE(okind, odata, o++, '\\'); 12063 PyUnicode_WRITE(okind, odata, o++, 'n'); 12064 } 12065 else if (ch == '\r') { 12066 PyUnicode_WRITE(okind, odata, o++, '\\'); 12067 PyUnicode_WRITE(okind, odata, o++, 'r'); 12068 } 12069 12070 /* Map non-printable US ASCII to '\xhh' */ 12071 else if (ch < ' ' || ch == 0x7F) { 12072 PyUnicode_WRITE(okind, odata, o++, '\\'); 12073 PyUnicode_WRITE(okind, odata, o++, 'x'); 12074 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]); 12075 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]); 12076 } 12077 12078 /* Copy ASCII characters as-is */ 12079 else if (ch < 0x7F) { 12080 PyUnicode_WRITE(okind, odata, o++, ch); 12081 } 12082 12083 /* Non-ASCII characters */ 12084 else { 12085 /* Map Unicode whitespace and control 
characters 12086 (categories Z* and C* except ASCII space) 12087 */ 12088 if (!Py_UNICODE_ISPRINTABLE(ch)) { 12089 /* Map 8-bit characters to '\xhh' */ 12090 if (ch <= 0xff) { 12091 PyUnicode_WRITE(okind, odata, o++, '\\'); 12092 PyUnicode_WRITE(okind, odata, o++, 'x'); 12093 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]); 12094 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]); 12095 } 12096 /* Map 21-bit characters to '\U00xxxxxx' */ 12097 else if (ch >= 0x10000) { 12098 PyUnicode_WRITE(okind, odata, o++, '\\'); 12099 PyUnicode_WRITE(okind, odata, o++, 'U'); 12100 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]); 12101 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]); 12102 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]); 12103 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]); 12104 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]); 12105 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]); 12106 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]); 12107 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]); 12108 } 12109 /* Map 16-bit characters to '\uxxxx' */ 12110 else { 12111 PyUnicode_WRITE(okind, odata, o++, '\\'); 12112 PyUnicode_WRITE(okind, odata, o++, 'u'); 12113 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]); 12114 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]); 12115 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]); 12116 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]); 12117 } 12118 } 12119 /* Copy characters as-is */ 12120 else { 12121 PyUnicode_WRITE(okind, odata, o++, ch); 12122 } 12123 } 12124 } 12125 /* Closing quote already added at the beginning */ 12126 assert(_PyUnicode_CheckConsistency(repr, 1)); 12127 return repr; 12128} 12129 12130PyDoc_STRVAR(rfind__doc__, 12131 "S.rfind(sub[, start[, end]]) -> int\n\ 12132\n\ 
12133Return the highest index in S where substring sub is found,\n\ 12134such that sub is contained within S[start:end]. Optional\n\ 12135arguments start and end are interpreted as in slice notation.\n\ 12136\n\ 12137Return -1 on failure."); 12138 12139static PyObject * 12140unicode_rfind(PyObject *self, PyObject *args) 12141{ 12142 PyObject *substring; 12143 Py_ssize_t start; 12144 Py_ssize_t end; 12145 Py_ssize_t result; 12146 12147 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring, 12148 &start, &end)) 12149 return NULL; 12150 12151 if (PyUnicode_READY(self) == -1) 12152 return NULL; 12153 if (PyUnicode_READY(substring) == -1) 12154 return NULL; 12155 12156 result = any_find_slice(-1, self, substring, start, end); 12157 12158 Py_DECREF(substring); 12159 12160 if (result == -2) 12161 return NULL; 12162 12163 return PyLong_FromSsize_t(result); 12164} 12165 12166PyDoc_STRVAR(rindex__doc__, 12167 "S.rindex(sub[, start[, end]]) -> int\n\ 12168\n\ 12169Like S.rfind() but raise ValueError when the substring is not found."); 12170 12171static PyObject * 12172unicode_rindex(PyObject *self, PyObject *args) 12173{ 12174 PyObject *substring; 12175 Py_ssize_t start; 12176 Py_ssize_t end; 12177 Py_ssize_t result; 12178 12179 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring, 12180 &start, &end)) 12181 return NULL; 12182 12183 if (PyUnicode_READY(self) == -1) 12184 return NULL; 12185 if (PyUnicode_READY(substring) == -1) 12186 return NULL; 12187 12188 result = any_find_slice(-1, self, substring, start, end); 12189 12190 Py_DECREF(substring); 12191 12192 if (result == -2) 12193 return NULL; 12194 12195 if (result < 0) { 12196 PyErr_SetString(PyExc_ValueError, "substring not found"); 12197 return NULL; 12198 } 12199 12200 return PyLong_FromSsize_t(result); 12201} 12202 12203PyDoc_STRVAR(rjust__doc__, 12204 "S.rjust(width[, fillchar]) -> str\n\ 12205\n\ 12206Return S right-justified in a string of length width. 
Padding is\n\ 12207done using the specified fill character (default is a space)."); 12208 12209static PyObject * 12210unicode_rjust(PyObject *self, PyObject *args) 12211{ 12212 Py_ssize_t width; 12213 Py_UCS4 fillchar = ' '; 12214 12215 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar)) 12216 return NULL; 12217 12218 if (PyUnicode_READY(self) == -1) 12219 return NULL; 12220 12221 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) { 12222 Py_INCREF(self); 12223 return self; 12224 } 12225 12226 return pad(self, width - _PyUnicode_LENGTH(self), 0, fillchar); 12227} 12228 12229PyObject * 12230PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit) 12231{ 12232 PyObject *result; 12233 12234 s = PyUnicode_FromObject(s); 12235 if (s == NULL) 12236 return NULL; 12237 if (sep != NULL) { 12238 sep = PyUnicode_FromObject(sep); 12239 if (sep == NULL) { 12240 Py_DECREF(s); 12241 return NULL; 12242 } 12243 } 12244 12245 result = split(s, sep, maxsplit); 12246 12247 Py_DECREF(s); 12248 Py_XDECREF(sep); 12249 return result; 12250} 12251 12252PyDoc_STRVAR(split__doc__, 12253 "S.split([sep[, maxsplit]]) -> list of strings\n\ 12254\n\ 12255Return a list of the words in S, using sep as the\n\ 12256delimiter string. If maxsplit is given, at most maxsplit\n\ 12257splits are done. 
If sep is not specified or is None, any\n\ 12258whitespace string is a separator and empty strings are\n\ 12259removed from the result."); 12260 12261static PyObject* 12262unicode_split(PyObject *self, PyObject *args) 12263{ 12264 PyObject *substring = Py_None; 12265 Py_ssize_t maxcount = -1; 12266 12267 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount)) 12268 return NULL; 12269 12270 if (substring == Py_None) 12271 return split(self, NULL, maxcount); 12272 else if (PyUnicode_Check(substring)) 12273 return split(self, substring, maxcount); 12274 else 12275 return PyUnicode_Split(self, substring, maxcount); 12276} 12277 12278PyObject * 12279PyUnicode_Partition(PyObject *str_in, PyObject *sep_in) 12280{ 12281 PyObject* str_obj; 12282 PyObject* sep_obj; 12283 PyObject* out; 12284 int kind1, kind2, kind; 12285 void *buf1 = NULL, *buf2 = NULL; 12286 Py_ssize_t len1, len2; 12287 12288 str_obj = PyUnicode_FromObject(str_in); 12289 if (!str_obj || PyUnicode_READY(str_obj) == -1) 12290 return NULL; 12291 sep_obj = PyUnicode_FromObject(sep_in); 12292 if (!sep_obj || PyUnicode_READY(sep_obj) == -1) { 12293 Py_DECREF(str_obj); 12294 return NULL; 12295 } 12296 12297 kind1 = PyUnicode_KIND(str_obj); 12298 kind2 = PyUnicode_KIND(sep_obj); 12299 kind = Py_MAX(kind1, kind2); 12300 buf1 = PyUnicode_DATA(str_obj); 12301 if (kind1 != kind) 12302 buf1 = _PyUnicode_AsKind(str_obj, kind); 12303 if (!buf1) 12304 goto onError; 12305 buf2 = PyUnicode_DATA(sep_obj); 12306 if (kind2 != kind) 12307 buf2 = _PyUnicode_AsKind(sep_obj, kind); 12308 if (!buf2) 12309 goto onError; 12310 len1 = PyUnicode_GET_LENGTH(str_obj); 12311 len2 = PyUnicode_GET_LENGTH(sep_obj); 12312 12313 switch(PyUnicode_KIND(str_obj)) { 12314 case PyUnicode_1BYTE_KIND: 12315 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj)) 12316 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2); 12317 else 12318 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2); 12319 break; 
12320 case PyUnicode_2BYTE_KIND: 12321 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2); 12322 break; 12323 case PyUnicode_4BYTE_KIND: 12324 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2); 12325 break; 12326 default: 12327 assert(0); 12328 out = 0; 12329 } 12330 12331 Py_DECREF(sep_obj); 12332 Py_DECREF(str_obj); 12333 if (kind1 != kind) 12334 PyMem_Free(buf1); 12335 if (kind2 != kind) 12336 PyMem_Free(buf2); 12337 12338 return out; 12339 onError: 12340 Py_DECREF(sep_obj); 12341 Py_DECREF(str_obj); 12342 if (kind1 != kind && buf1) 12343 PyMem_Free(buf1); 12344 if (kind2 != kind && buf2) 12345 PyMem_Free(buf2); 12346 return NULL; 12347} 12348 12349 12350PyObject * 12351PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in) 12352{ 12353 PyObject* str_obj; 12354 PyObject* sep_obj; 12355 PyObject* out; 12356 int kind1, kind2, kind; 12357 void *buf1 = NULL, *buf2 = NULL; 12358 Py_ssize_t len1, len2; 12359 12360 str_obj = PyUnicode_FromObject(str_in); 12361 if (!str_obj) 12362 return NULL; 12363 sep_obj = PyUnicode_FromObject(sep_in); 12364 if (!sep_obj) { 12365 Py_DECREF(str_obj); 12366 return NULL; 12367 } 12368 12369 kind1 = PyUnicode_KIND(str_in); 12370 kind2 = PyUnicode_KIND(sep_obj); 12371 kind = Py_MAX(kind1, kind2); 12372 buf1 = PyUnicode_DATA(str_in); 12373 if (kind1 != kind) 12374 buf1 = _PyUnicode_AsKind(str_in, kind); 12375 if (!buf1) 12376 goto onError; 12377 buf2 = PyUnicode_DATA(sep_obj); 12378 if (kind2 != kind) 12379 buf2 = _PyUnicode_AsKind(sep_obj, kind); 12380 if (!buf2) 12381 goto onError; 12382 len1 = PyUnicode_GET_LENGTH(str_obj); 12383 len2 = PyUnicode_GET_LENGTH(sep_obj); 12384 12385 switch(PyUnicode_KIND(str_in)) { 12386 case PyUnicode_1BYTE_KIND: 12387 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj)) 12388 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2); 12389 else 12390 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2); 12391 break; 12392 case 
PyUnicode_2BYTE_KIND: 12393 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2); 12394 break; 12395 case PyUnicode_4BYTE_KIND: 12396 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2); 12397 break; 12398 default: 12399 assert(0); 12400 out = 0; 12401 } 12402 12403 Py_DECREF(sep_obj); 12404 Py_DECREF(str_obj); 12405 if (kind1 != kind) 12406 PyMem_Free(buf1); 12407 if (kind2 != kind) 12408 PyMem_Free(buf2); 12409 12410 return out; 12411 onError: 12412 Py_DECREF(sep_obj); 12413 Py_DECREF(str_obj); 12414 if (kind1 != kind && buf1) 12415 PyMem_Free(buf1); 12416 if (kind2 != kind && buf2) 12417 PyMem_Free(buf2); 12418 return NULL; 12419} 12420 12421PyDoc_STRVAR(partition__doc__, 12422 "S.partition(sep) -> (head, sep, tail)\n\ 12423\n\ 12424Search for the separator sep in S, and return the part before it,\n\ 12425the separator itself, and the part after it. If the separator is not\n\ 12426found, return S and two empty strings."); 12427 12428static PyObject* 12429unicode_partition(PyObject *self, PyObject *separator) 12430{ 12431 return PyUnicode_Partition(self, separator); 12432} 12433 12434PyDoc_STRVAR(rpartition__doc__, 12435 "S.rpartition(sep) -> (head, sep, tail)\n\ 12436\n\ 12437Search for the separator sep in S, starting at the end of S, and return\n\ 12438the part before it, the separator itself, and the part after it. 
If the\n\ 12439separator is not found, return two empty strings and S."); 12440 12441static PyObject* 12442unicode_rpartition(PyObject *self, PyObject *separator) 12443{ 12444 return PyUnicode_RPartition(self, separator); 12445} 12446 12447PyObject * 12448PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit) 12449{ 12450 PyObject *result; 12451 12452 s = PyUnicode_FromObject(s); 12453 if (s == NULL) 12454 return NULL; 12455 if (sep != NULL) { 12456 sep = PyUnicode_FromObject(sep); 12457 if (sep == NULL) { 12458 Py_DECREF(s); 12459 return NULL; 12460 } 12461 } 12462 12463 result = rsplit(s, sep, maxsplit); 12464 12465 Py_DECREF(s); 12466 Py_XDECREF(sep); 12467 return result; 12468} 12469 12470PyDoc_STRVAR(rsplit__doc__, 12471 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\ 12472\n\ 12473Return a list of the words in S, using sep as the\n\ 12474delimiter string, starting at the end of the string and\n\ 12475working to the front. If maxsplit is given, at most maxsplit\n\ 12476splits are done. 
If sep is not specified, any whitespace string\n\ 12477is a separator."); 12478 12479static PyObject* 12480unicode_rsplit(PyObject *self, PyObject *args) 12481{ 12482 PyObject *substring = Py_None; 12483 Py_ssize_t maxcount = -1; 12484 12485 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount)) 12486 return NULL; 12487 12488 if (substring == Py_None) 12489 return rsplit(self, NULL, maxcount); 12490 else if (PyUnicode_Check(substring)) 12491 return rsplit(self, substring, maxcount); 12492 else 12493 return PyUnicode_RSplit(self, substring, maxcount); 12494} 12495 12496PyDoc_STRVAR(splitlines__doc__, 12497 "S.splitlines([keepends]) -> list of strings\n\ 12498\n\ 12499Return a list of the lines in S, breaking at line boundaries.\n\ 12500Line breaks are not included in the resulting list unless keepends\n\ 12501is given and true."); 12502 12503static PyObject* 12504unicode_splitlines(PyObject *self, PyObject *args, PyObject *kwds) 12505{ 12506 static char *kwlist[] = {"keepends", 0}; 12507 int keepends = 0; 12508 12509 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines", 12510 kwlist, &keepends)) 12511 return NULL; 12512 12513 return PyUnicode_Splitlines(self, keepends); 12514} 12515 12516static 12517PyObject *unicode_str(PyObject *self) 12518{ 12519 if (PyUnicode_CheckExact(self)) { 12520 Py_INCREF(self); 12521 return self; 12522 } else 12523 /* Subtype -- return genuine unicode string with the same value. 
*/ 12524 return PyUnicode_Copy(self); 12525} 12526 12527PyDoc_STRVAR(swapcase__doc__, 12528 "S.swapcase() -> str\n\ 12529\n\ 12530Return a copy of S with uppercase characters converted to lowercase\n\ 12531and vice versa."); 12532 12533static PyObject* 12534unicode_swapcase(PyObject *self) 12535{ 12536 return fixup(self, fixswapcase); 12537} 12538 12539PyDoc_STRVAR(maketrans__doc__, 12540 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\ 12541\n\ 12542Return a translation table usable for str.translate().\n\ 12543If there is only one argument, it must be a dictionary mapping Unicode\n\ 12544ordinals (integers) or characters to Unicode ordinals, strings or None.\n\ 12545Character keys will be then converted to ordinals.\n\ 12546If there are two arguments, they must be strings of equal length, and\n\ 12547in the resulting dictionary, each character in x will be mapped to the\n\ 12548character at the same position in y. If there is a third argument, it\n\ 12549must be a string, whose characters will be mapped to None in the result."); 12550 12551static PyObject* 12552unicode_maketrans(PyObject *null, PyObject *args) 12553{ 12554 PyObject *x, *y = NULL, *z = NULL; 12555 PyObject *new = NULL, *key, *value; 12556 Py_ssize_t i = 0; 12557 int res; 12558 12559 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z)) 12560 return NULL; 12561 new = PyDict_New(); 12562 if (!new) 12563 return NULL; 12564 if (y != NULL) { 12565 int x_kind, y_kind, z_kind; 12566 void *x_data, *y_data, *z_data; 12567 12568 /* x must be a string too, of equal length */ 12569 if (!PyUnicode_Check(x)) { 12570 PyErr_SetString(PyExc_TypeError, "first maketrans argument must " 12571 "be a string if there is a second argument"); 12572 goto err; 12573 } 12574 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) { 12575 PyErr_SetString(PyExc_ValueError, "the first two maketrans " 12576 "arguments must have equal length"); 12577 goto err; 12578 } 12579 /* create entries for translating chars in x 
to those in y */ 12580 x_kind = PyUnicode_KIND(x); 12581 y_kind = PyUnicode_KIND(y); 12582 x_data = PyUnicode_DATA(x); 12583 y_data = PyUnicode_DATA(y); 12584 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) { 12585 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i)); 12586 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i)); 12587 if (!key || !value) 12588 goto err; 12589 res = PyDict_SetItem(new, key, value); 12590 Py_DECREF(key); 12591 Py_DECREF(value); 12592 if (res < 0) 12593 goto err; 12594 } 12595 /* create entries for deleting chars in z */ 12596 if (z != NULL) { 12597 z_kind = PyUnicode_KIND(z); 12598 z_data = PyUnicode_DATA(z); 12599 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) { 12600 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i)); 12601 if (!key) 12602 goto err; 12603 res = PyDict_SetItem(new, key, Py_None); 12604 Py_DECREF(key); 12605 if (res < 0) 12606 goto err; 12607 } 12608 } 12609 } else { 12610 int kind; 12611 void *data; 12612 12613 /* x must be a dict */ 12614 if (!PyDict_CheckExact(x)) { 12615 PyErr_SetString(PyExc_TypeError, "if you give only one argument " 12616 "to maketrans it must be a dict"); 12617 goto err; 12618 } 12619 /* copy entries into the new dict, converting string keys to int keys */ 12620 while (PyDict_Next(x, &i, &key, &value)) { 12621 if (PyUnicode_Check(key)) { 12622 /* convert string keys to integer keys */ 12623 PyObject *newkey; 12624 if (PyUnicode_GET_LENGTH(key) != 1) { 12625 PyErr_SetString(PyExc_ValueError, "string keys in translate " 12626 "table must be of length 1"); 12627 goto err; 12628 } 12629 kind = PyUnicode_KIND(key); 12630 data = PyUnicode_DATA(key); 12631 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0)); 12632 if (!newkey) 12633 goto err; 12634 res = PyDict_SetItem(new, newkey, value); 12635 Py_DECREF(newkey); 12636 if (res < 0) 12637 goto err; 12638 } else if (PyLong_Check(key)) { 12639 /* just keep integer keys */ 12640 if (PyDict_SetItem(new, key, value) < 0) 12641 goto err; 
12642 } else { 12643 PyErr_SetString(PyExc_TypeError, "keys in translate table must " 12644 "be strings or integers"); 12645 goto err; 12646 } 12647 } 12648 } 12649 return new; 12650 err: 12651 Py_DECREF(new); 12652 return NULL; 12653} 12654 12655PyDoc_STRVAR(translate__doc__, 12656 "S.translate(table) -> str\n\ 12657\n\ 12658Return a copy of the string S, where all characters have been mapped\n\ 12659through the given translation table, which must be a mapping of\n\ 12660Unicode ordinals to Unicode ordinals, strings, or None.\n\ 12661Unmapped characters are left untouched. Characters mapped to None\n\ 12662are deleted."); 12663 12664static PyObject* 12665unicode_translate(PyObject *self, PyObject *table) 12666{ 12667 return _PyUnicode_TranslateCharmap(self, table, "ignore"); 12668} 12669 12670PyDoc_STRVAR(upper__doc__, 12671 "S.upper() -> str\n\ 12672\n\ 12673Return a copy of S converted to uppercase."); 12674 12675static PyObject* 12676unicode_upper(PyObject *self) 12677{ 12678 return fixup(self, fixupper); 12679} 12680 12681PyDoc_STRVAR(zfill__doc__, 12682 "S.zfill(width) -> str\n\ 12683\n\ 12684Pad a numeric string S with zeros on the left, to fill a field\n\ 12685of the specified width. 
The string S is never truncated."); 12686 12687static PyObject * 12688unicode_zfill(PyObject *self, PyObject *args) 12689{ 12690 Py_ssize_t fill; 12691 PyObject *u; 12692 Py_ssize_t width; 12693 int kind; 12694 void *data; 12695 Py_UCS4 chr; 12696 12697 if (PyUnicode_READY(self) == -1) 12698 return NULL; 12699 12700 if (!PyArg_ParseTuple(args, "n:zfill", &width)) 12701 return NULL; 12702 12703 if (PyUnicode_GET_LENGTH(self) >= width) { 12704 if (PyUnicode_CheckExact(self)) { 12705 Py_INCREF(self); 12706 return self; 12707 } 12708 else 12709 return PyUnicode_Copy(self); 12710 } 12711 12712 fill = width - _PyUnicode_LENGTH(self); 12713 12714 u = pad(self, fill, 0, '0'); 12715 12716 if (u == NULL) 12717 return NULL; 12718 12719 kind = PyUnicode_KIND(u); 12720 data = PyUnicode_DATA(u); 12721 chr = PyUnicode_READ(kind, data, fill); 12722 12723 if (chr == '+' || chr == '-') { 12724 /* move sign to beginning of string */ 12725 PyUnicode_WRITE(kind, data, 0, chr); 12726 PyUnicode_WRITE(kind, data, fill, '0'); 12727 } 12728 12729 assert(_PyUnicode_CheckConsistency(u, 1)); 12730 return u; 12731} 12732 12733#if 0 12734static PyObject * 12735unicode__decimal2ascii(PyObject *self) 12736{ 12737 return PyUnicode_TransformDecimalAndSpaceToASCII(self); 12738} 12739#endif 12740 12741PyDoc_STRVAR(startswith__doc__, 12742 "S.startswith(prefix[, start[, end]]) -> bool\n\ 12743\n\ 12744Return True if S starts with the specified prefix, False otherwise.\n\ 12745With optional start, test S beginning at that position.\n\ 12746With optional end, stop comparing S at that position.\n\ 12747prefix can also be a tuple of strings to try."); 12748 12749static PyObject * 12750unicode_startswith(PyObject *self, 12751 PyObject *args) 12752{ 12753 PyObject *subobj; 12754 PyObject *substring; 12755 Py_ssize_t start = 0; 12756 Py_ssize_t end = PY_SSIZE_T_MAX; 12757 int result; 12758 12759 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end)) 12760 return NULL; 12761 if 
(PyTuple_Check(subobj)) { 12762 Py_ssize_t i; 12763 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) { 12764 substring = PyUnicode_FromObject(PyTuple_GET_ITEM(subobj, i)); 12765 if (substring == NULL) 12766 return NULL; 12767 result = tailmatch(self, substring, start, end, -1); 12768 Py_DECREF(substring); 12769 if (result) { 12770 Py_RETURN_TRUE; 12771 } 12772 } 12773 /* nothing matched */ 12774 Py_RETURN_FALSE; 12775 } 12776 substring = PyUnicode_FromObject(subobj); 12777 if (substring == NULL) { 12778 if (PyErr_ExceptionMatches(PyExc_TypeError)) 12779 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or " 12780 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name); 12781 return NULL; 12782 } 12783 result = tailmatch(self, substring, start, end, -1); 12784 Py_DECREF(substring); 12785 return PyBool_FromLong(result); 12786} 12787 12788 12789PyDoc_STRVAR(endswith__doc__, 12790 "S.endswith(suffix[, start[, end]]) -> bool\n\ 12791\n\ 12792Return True if S ends with the specified suffix, False otherwise.\n\ 12793With optional start, test S beginning at that position.\n\ 12794With optional end, stop comparing S at that position.\n\ 12795suffix can also be a tuple of strings to try."); 12796 12797static PyObject * 12798unicode_endswith(PyObject *self, 12799 PyObject *args) 12800{ 12801 PyObject *subobj; 12802 PyObject *substring; 12803 Py_ssize_t start = 0; 12804 Py_ssize_t end = PY_SSIZE_T_MAX; 12805 int result; 12806 12807 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end)) 12808 return NULL; 12809 if (PyTuple_Check(subobj)) { 12810 Py_ssize_t i; 12811 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) { 12812 substring = PyUnicode_FromObject( 12813 PyTuple_GET_ITEM(subobj, i)); 12814 if (substring == NULL) 12815 return NULL; 12816 result = tailmatch(self, substring, start, end, +1); 12817 Py_DECREF(substring); 12818 if (result) { 12819 Py_RETURN_TRUE; 12820 } 12821 } 12822 Py_RETURN_FALSE; 12823 } 12824 substring = 
PyUnicode_FromObject(subobj); 12825 if (substring == NULL) { 12826 if (PyErr_ExceptionMatches(PyExc_TypeError)) 12827 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or " 12828 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name); 12829 return NULL; 12830 } 12831 result = tailmatch(self, substring, start, end, +1); 12832 Py_DECREF(substring); 12833 return PyBool_FromLong(result); 12834} 12835 12836#include "stringlib/unicode_format.h" 12837 12838PyDoc_STRVAR(format__doc__, 12839 "S.format(*args, **kwargs) -> str\n\ 12840\n\ 12841Return a formatted version of S, using substitutions from args and kwargs.\n\ 12842The substitutions are identified by braces ('{' and '}')."); 12843 12844PyDoc_STRVAR(format_map__doc__, 12845 "S.format_map(mapping) -> str\n\ 12846\n\ 12847Return a formatted version of S, using substitutions from mapping.\n\ 12848The substitutions are identified by braces ('{' and '}')."); 12849 12850static PyObject * 12851unicode__format__(PyObject* self, PyObject* args) 12852{ 12853 PyObject *format_spec, *out; 12854 12855 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec)) 12856 return NULL; 12857 12858 out = _PyUnicode_FormatAdvanced(self, format_spec, 0, 12859 PyUnicode_GET_LENGTH(format_spec)); 12860 return out; 12861} 12862 12863PyDoc_STRVAR(p_format__doc__, 12864 "S.__format__(format_spec) -> str\n\ 12865\n\ 12866Return a formatted version of S as described by format_spec."); 12867 12868static PyObject * 12869unicode__sizeof__(PyObject *v) 12870{ 12871 Py_ssize_t size; 12872 12873 /* If it's a compact object, account for base structure + 12874 character data. */ 12875 if (PyUnicode_IS_COMPACT_ASCII(v)) 12876 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1; 12877 else if (PyUnicode_IS_COMPACT(v)) 12878 size = sizeof(PyCompactUnicodeObject) + 12879 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_KIND(v); 12880 else { 12881 /* If it is a two-block object, account for base object, and 12882 for character block if present. 
*/ 12883 size = sizeof(PyUnicodeObject); 12884 if (_PyUnicode_DATA_ANY(v)) 12885 size += (PyUnicode_GET_LENGTH(v) + 1) * 12886 PyUnicode_KIND(v); 12887 } 12888 /* If the wstr pointer is present, account for it unless it is shared 12889 with the data pointer. Check if the data is not shared. */ 12890 if (_PyUnicode_HAS_WSTR_MEMORY(v)) 12891 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t); 12892 if (_PyUnicode_HAS_UTF8_MEMORY(v)) 12893 size += PyUnicode_UTF8_LENGTH(v) + 1; 12894 12895 return PyLong_FromSsize_t(size); 12896} 12897 12898PyDoc_STRVAR(sizeof__doc__, 12899 "S.__sizeof__() -> size of S in memory, in bytes"); 12900 12901static PyObject * 12902unicode_getnewargs(PyObject *v) 12903{ 12904 PyObject *copy = PyUnicode_Copy(v); 12905 if (!copy) 12906 return NULL; 12907 return Py_BuildValue("(N)", copy); 12908} 12909 12910static PyMethodDef unicode_methods[] = { 12911 12912 /* Order is according to common usage: often used methods should 12913 appear first, since lookup is done sequentially. 
*/ 12914 12915 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__}, 12916 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__}, 12917 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__}, 12918 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__}, 12919 {"join", (PyCFunction) unicode_join, METH_O, join__doc__}, 12920 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__}, 12921 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__}, 12922 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__}, 12923 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__}, 12924 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__}, 12925 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__}, 12926 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__}, 12927 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__}, 12928 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__}, 12929 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__}, 12930 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__}, 12931 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__}, 12932 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__}, 12933 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__}, 12934 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__}, 12935 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__}, 12936 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__}, 12937 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__}, 12938 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__}, 12939 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__}, 12940 {"upper", 
(PyCFunction) unicode_upper, METH_NOARGS, upper__doc__}, 12941 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__}, 12942 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__}, 12943 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__}, 12944 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__}, 12945 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__}, 12946 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__}, 12947 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__}, 12948 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__}, 12949 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__}, 12950 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__}, 12951 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__}, 12952 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__}, 12953 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__}, 12954 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__}, 12955 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__}, 12956 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__}, 12957 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__}, 12958 {"maketrans", (PyCFunction) unicode_maketrans, 12959 METH_VARARGS | METH_STATIC, maketrans__doc__}, 12960 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__}, 12961#if 0 12962 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__}, 12963#endif 12964 12965#if 0 12966 /* These methods are just used for debugging the implementation. 
*/ 12967 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS}, 12968#endif 12969 12970 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS}, 12971 {NULL, NULL} 12972}; 12973 12974static PyObject * 12975unicode_mod(PyObject *v, PyObject *w) 12976{ 12977 if (!PyUnicode_Check(v)) 12978 Py_RETURN_NOTIMPLEMENTED; 12979 return PyUnicode_Format(v, w); 12980} 12981 12982static PyNumberMethods unicode_as_number = { 12983 0, /*nb_add*/ 12984 0, /*nb_subtract*/ 12985 0, /*nb_multiply*/ 12986 unicode_mod, /*nb_remainder*/ 12987}; 12988 12989static PySequenceMethods unicode_as_sequence = { 12990 (lenfunc) unicode_length, /* sq_length */ 12991 PyUnicode_Concat, /* sq_concat */ 12992 (ssizeargfunc) unicode_repeat, /* sq_repeat */ 12993 (ssizeargfunc) unicode_getitem, /* sq_item */ 12994 0, /* sq_slice */ 12995 0, /* sq_ass_item */ 12996 0, /* sq_ass_slice */ 12997 PyUnicode_Contains, /* sq_contains */ 12998}; 12999 13000static PyObject* 13001unicode_subscript(PyObject* self, PyObject* item) 13002{ 13003 if (PyUnicode_READY(self) == -1) 13004 return NULL; 13005 13006 if (PyIndex_Check(item)) { 13007 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError); 13008 if (i == -1 && PyErr_Occurred()) 13009 return NULL; 13010 if (i < 0) 13011 i += PyUnicode_GET_LENGTH(self); 13012 return unicode_getitem(self, i); 13013 } else if (PySlice_Check(item)) { 13014 Py_ssize_t start, stop, step, slicelength, cur, i; 13015 PyObject *result; 13016 void *src_data, *dest_data; 13017 int src_kind, dest_kind; 13018 Py_UCS4 ch, max_char, kind_limit; 13019 13020 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self), 13021 &start, &stop, &step, &slicelength) < 0) { 13022 return NULL; 13023 } 13024 13025 if (slicelength <= 0) { 13026 return PyUnicode_New(0, 0); 13027 } else if (start == 0 && step == 1 && 13028 slicelength == PyUnicode_GET_LENGTH(self) && 13029 PyUnicode_CheckExact(self)) { 13030 Py_INCREF(self); 13031 return self; 13032 } else if (step == 1) { 13033 
return PyUnicode_Substring(self, 13034 start, start + slicelength); 13035 } 13036 /* General case */ 13037 src_kind = PyUnicode_KIND(self); 13038 src_data = PyUnicode_DATA(self); 13039 if (!PyUnicode_IS_ASCII(self)) { 13040 kind_limit = kind_maxchar_limit(src_kind); 13041 max_char = 0; 13042 for (cur = start, i = 0; i < slicelength; cur += step, i++) { 13043 ch = PyUnicode_READ(src_kind, src_data, cur); 13044 if (ch > max_char) { 13045 max_char = ch; 13046 if (max_char >= kind_limit) 13047 break; 13048 } 13049 } 13050 } 13051 else 13052 max_char = 127; 13053 result = PyUnicode_New(slicelength, max_char); 13054 if (result == NULL) 13055 return NULL; 13056 dest_kind = PyUnicode_KIND(result); 13057 dest_data = PyUnicode_DATA(result); 13058 13059 for (cur = start, i = 0; i < slicelength; cur += step, i++) { 13060 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur); 13061 PyUnicode_WRITE(dest_kind, dest_data, i, ch); 13062 } 13063 assert(_PyUnicode_CheckConsistency(result, 1)); 13064 return result; 13065 } else { 13066 PyErr_SetString(PyExc_TypeError, "string indices must be integers"); 13067 return NULL; 13068 } 13069} 13070 13071static PyMappingMethods unicode_as_mapping = { 13072 (lenfunc)unicode_length, /* mp_length */ 13073 (binaryfunc)unicode_subscript, /* mp_subscript */ 13074 (objobjargproc)0, /* mp_ass_subscript */ 13075}; 13076 13077 13078/* Helpers for PyUnicode_Format() */ 13079 13080static PyObject * 13081getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx) 13082{ 13083 Py_ssize_t argidx = *p_argidx; 13084 if (argidx < arglen) { 13085 (*p_argidx)++; 13086 if (arglen < 0) 13087 return args; 13088 else 13089 return PyTuple_GetItem(args, argidx); 13090 } 13091 PyErr_SetString(PyExc_TypeError, 13092 "not enough arguments for format string"); 13093 return NULL; 13094} 13095 13096/* Returns a new reference to a PyUnicode object, or NULL on failure. 
*/ 13097 13098static PyObject * 13099formatfloat(PyObject *v, int flags, int prec, int type) 13100{ 13101 char *p; 13102 PyObject *result; 13103 double x; 13104 13105 x = PyFloat_AsDouble(v); 13106 if (x == -1.0 && PyErr_Occurred()) 13107 return NULL; 13108 13109 if (prec < 0) 13110 prec = 6; 13111 13112 p = PyOS_double_to_string(x, type, prec, 13113 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL); 13114 if (p == NULL) 13115 return NULL; 13116 result = PyUnicode_DecodeASCII(p, strlen(p), NULL); 13117 PyMem_Free(p); 13118 return result; 13119} 13120 13121static PyObject* 13122formatlong(PyObject *val, int flags, int prec, int type) 13123{ 13124 char *buf; 13125 int len; 13126 PyObject *str; /* temporary string object. */ 13127 PyObject *result; 13128 13129 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len); 13130 if (!str) 13131 return NULL; 13132 result = PyUnicode_DecodeASCII(buf, len, NULL); 13133 Py_DECREF(str); 13134 return result; 13135} 13136 13137static Py_UCS4 13138formatchar(PyObject *v) 13139{ 13140 /* presume that the buffer is at least 3 characters long */ 13141 if (PyUnicode_Check(v)) { 13142 if (PyUnicode_GET_LENGTH(v) == 1) { 13143 return PyUnicode_READ_CHAR(v, 0); 13144 } 13145 goto onError; 13146 } 13147 else { 13148 /* Integer input truncated to a character */ 13149 long x; 13150 x = PyLong_AsLong(v); 13151 if (x == -1 && PyErr_Occurred()) 13152 goto onError; 13153 13154 if (x < 0 || x > 0x10ffff) { 13155 PyErr_SetString(PyExc_OverflowError, 13156 "%c arg not in range(0x110000)"); 13157 return (Py_UCS4) -1; 13158 } 13159 13160 return (Py_UCS4) x; 13161 } 13162 13163 onError: 13164 PyErr_SetString(PyExc_TypeError, 13165 "%c requires int or char"); 13166 return (Py_UCS4) -1; 13167} 13168 13169static int 13170repeat_accumulate(_PyAccu *acc, PyObject *obj, Py_ssize_t count) 13171{ 13172 int r; 13173 assert(count > 0); 13174 assert(PyUnicode_Check(obj)); 13175 if (count > 5) { 13176 PyObject *repeated = unicode_repeat(obj, count); 13177 if 
(repeated == NULL) 13178 return -1; 13179 r = _PyAccu_Accumulate(acc, repeated); 13180 Py_DECREF(repeated); 13181 return r; 13182 } 13183 else { 13184 do { 13185 if (_PyAccu_Accumulate(acc, obj)) 13186 return -1; 13187 } while (--count); 13188 return 0; 13189 } 13190} 13191 13192PyObject * 13193PyUnicode_Format(PyObject *format, PyObject *args) 13194{ 13195 void *fmt; 13196 int fmtkind; 13197 PyObject *result; 13198 int kind; 13199 int r; 13200 Py_ssize_t fmtcnt, fmtpos, arglen, argidx; 13201 int args_owned = 0; 13202 PyObject *dict = NULL; 13203 PyObject *temp = NULL; 13204 PyObject *second = NULL; 13205 PyObject *uformat; 13206 _PyAccu acc; 13207 static PyObject *plus, *minus, *blank, *zero, *percent; 13208 13209 if (!plus && !(plus = get_latin1_char('+'))) 13210 return NULL; 13211 if (!minus && !(minus = get_latin1_char('-'))) 13212 return NULL; 13213 if (!blank && !(blank = get_latin1_char(' '))) 13214 return NULL; 13215 if (!zero && !(zero = get_latin1_char('0'))) 13216 return NULL; 13217 if (!percent && !(percent = get_latin1_char('%'))) 13218 return NULL; 13219 13220 if (format == NULL || args == NULL) { 13221 PyErr_BadInternalCall(); 13222 return NULL; 13223 } 13224 uformat = PyUnicode_FromObject(format); 13225 if (uformat == NULL || PyUnicode_READY(uformat) == -1) 13226 return NULL; 13227 if (_PyAccu_Init(&acc)) 13228 goto onError; 13229 fmt = PyUnicode_DATA(uformat); 13230 fmtkind = PyUnicode_KIND(uformat); 13231 fmtcnt = PyUnicode_GET_LENGTH(uformat); 13232 fmtpos = 0; 13233 13234 if (PyTuple_Check(args)) { 13235 arglen = PyTuple_Size(args); 13236 argidx = 0; 13237 } 13238 else { 13239 arglen = -1; 13240 argidx = -2; 13241 } 13242 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) && 13243 !PyUnicode_Check(args)) 13244 dict = args; 13245 13246 while (--fmtcnt >= 0) { 13247 if (PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') { 13248 PyObject *nonfmt; 13249 Py_ssize_t nonfmtpos; 13250 nonfmtpos = fmtpos++; 13251 while (fmtcnt >= 0 && 13252 
PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') { 13253 fmtpos++; 13254 fmtcnt--; 13255 } 13256 nonfmt = PyUnicode_Substring(uformat, nonfmtpos, fmtpos); 13257 if (nonfmt == NULL) 13258 goto onError; 13259 r = _PyAccu_Accumulate(&acc, nonfmt); 13260 Py_DECREF(nonfmt); 13261 if (r) 13262 goto onError; 13263 } 13264 else { 13265 /* Got a format specifier */ 13266 int flags = 0; 13267 Py_ssize_t width = -1; 13268 int prec = -1; 13269 Py_UCS4 c = '\0'; 13270 Py_UCS4 fill, sign; 13271 int isnumok; 13272 PyObject *v = NULL; 13273 void *pbuf = NULL; 13274 Py_ssize_t pindex, len; 13275 PyObject *signobj = NULL, *fillobj = NULL; 13276 13277 fmtpos++; 13278 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(') { 13279 Py_ssize_t keystart; 13280 Py_ssize_t keylen; 13281 PyObject *key; 13282 int pcount = 1; 13283 13284 if (dict == NULL) { 13285 PyErr_SetString(PyExc_TypeError, 13286 "format requires a mapping"); 13287 goto onError; 13288 } 13289 ++fmtpos; 13290 --fmtcnt; 13291 keystart = fmtpos; 13292 /* Skip over balanced parentheses */ 13293 while (pcount > 0 && --fmtcnt >= 0) { 13294 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == ')') 13295 --pcount; 13296 else if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(') 13297 ++pcount; 13298 fmtpos++; 13299 } 13300 keylen = fmtpos - keystart - 1; 13301 if (fmtcnt < 0 || pcount > 0) { 13302 PyErr_SetString(PyExc_ValueError, 13303 "incomplete format key"); 13304 goto onError; 13305 } 13306 key = PyUnicode_Substring(uformat, 13307 keystart, keystart + keylen); 13308 if (key == NULL) 13309 goto onError; 13310 if (args_owned) { 13311 Py_DECREF(args); 13312 args_owned = 0; 13313 } 13314 args = PyObject_GetItem(dict, key); 13315 Py_DECREF(key); 13316 if (args == NULL) { 13317 goto onError; 13318 } 13319 args_owned = 1; 13320 arglen = -1; 13321 argidx = -2; 13322 } 13323 while (--fmtcnt >= 0) { 13324 switch (c = PyUnicode_READ(fmtkind, fmt, fmtpos++)) { 13325 case '-': flags |= F_LJUST; continue; 13326 case '+': flags |= F_SIGN; continue; 13327 case ' 
': flags |= F_BLANK; continue; 13328 case '#': flags |= F_ALT; continue; 13329 case '0': flags |= F_ZERO; continue; 13330 } 13331 break; 13332 } 13333 if (c == '*') { 13334 v = getnextarg(args, arglen, &argidx); 13335 if (v == NULL) 13336 goto onError; 13337 if (!PyLong_Check(v)) { 13338 PyErr_SetString(PyExc_TypeError, 13339 "* wants int"); 13340 goto onError; 13341 } 13342 width = PyLong_AsLong(v); 13343 if (width == -1 && PyErr_Occurred()) 13344 goto onError; 13345 if (width < 0) { 13346 flags |= F_LJUST; 13347 width = -width; 13348 } 13349 if (--fmtcnt >= 0) 13350 c = PyUnicode_READ(fmtkind, fmt, fmtpos++); 13351 } 13352 else if (c >= '0' && c <= '9') { 13353 width = c - '0'; 13354 while (--fmtcnt >= 0) { 13355 c = PyUnicode_READ(fmtkind, fmt, fmtpos++); 13356 if (c < '0' || c > '9') 13357 break; 13358 if ((width*10) / 10 != width) { 13359 PyErr_SetString(PyExc_ValueError, 13360 "width too big"); 13361 goto onError; 13362 } 13363 width = width*10 + (c - '0'); 13364 } 13365 } 13366 if (c == '.') { 13367 prec = 0; 13368 if (--fmtcnt >= 0) 13369 c = PyUnicode_READ(fmtkind, fmt, fmtpos++); 13370 if (c == '*') { 13371 v = getnextarg(args, arglen, &argidx); 13372 if (v == NULL) 13373 goto onError; 13374 if (!PyLong_Check(v)) { 13375 PyErr_SetString(PyExc_TypeError, 13376 "* wants int"); 13377 goto onError; 13378 } 13379 prec = PyLong_AsLong(v); 13380 if (prec == -1 && PyErr_Occurred()) 13381 goto onError; 13382 if (prec < 0) 13383 prec = 0; 13384 if (--fmtcnt >= 0) 13385 c = PyUnicode_READ(fmtkind, fmt, fmtpos++); 13386 } 13387 else if (c >= '0' && c <= '9') { 13388 prec = c - '0'; 13389 while (--fmtcnt >= 0) { 13390 c = PyUnicode_READ(fmtkind, fmt, fmtpos++); 13391 if (c < '0' || c > '9') 13392 break; 13393 if ((prec*10) / 10 != prec) { 13394 PyErr_SetString(PyExc_ValueError, 13395 "prec too big"); 13396 goto onError; 13397 } 13398 prec = prec*10 + (c - '0'); 13399 } 13400 } 13401 } /* prec */ 13402 if (fmtcnt >= 0) { 13403 if (c == 'h' || c == 'l' || c == 'L') { 
13404 if (--fmtcnt >= 0) 13405 c = PyUnicode_READ(fmtkind, fmt, fmtpos++); 13406 } 13407 } 13408 if (fmtcnt < 0) { 13409 PyErr_SetString(PyExc_ValueError, 13410 "incomplete format"); 13411 goto onError; 13412 } 13413 if (c != '%') { 13414 v = getnextarg(args, arglen, &argidx); 13415 if (v == NULL) 13416 goto onError; 13417 } 13418 sign = 0; 13419 fill = ' '; 13420 fillobj = blank; 13421 switch (c) { 13422 13423 case '%': 13424 _PyAccu_Accumulate(&acc, percent); 13425 continue; 13426 13427 case 's': 13428 case 'r': 13429 case 'a': 13430 if (PyUnicode_CheckExact(v) && c == 's') { 13431 temp = v; 13432 Py_INCREF(temp); 13433 } 13434 else { 13435 if (c == 's') 13436 temp = PyObject_Str(v); 13437 else if (c == 'r') 13438 temp = PyObject_Repr(v); 13439 else 13440 temp = PyObject_ASCII(v); 13441 if (temp == NULL) 13442 goto onError; 13443 if (PyUnicode_Check(temp)) 13444 /* nothing to do */; 13445 else { 13446 Py_DECREF(temp); 13447 PyErr_SetString(PyExc_TypeError, 13448 "%s argument has non-string str()"); 13449 goto onError; 13450 } 13451 } 13452 if (PyUnicode_READY(temp) == -1) { 13453 Py_CLEAR(temp); 13454 goto onError; 13455 } 13456 pbuf = PyUnicode_DATA(temp); 13457 kind = PyUnicode_KIND(temp); 13458 len = PyUnicode_GET_LENGTH(temp); 13459 if (prec >= 0 && len > prec) 13460 len = prec; 13461 break; 13462 13463 case 'i': 13464 case 'd': 13465 case 'u': 13466 case 'o': 13467 case 'x': 13468 case 'X': 13469 isnumok = 0; 13470 if (PyNumber_Check(v)) { 13471 PyObject *iobj=NULL; 13472 13473 if (PyLong_Check(v)) { 13474 iobj = v; 13475 Py_INCREF(iobj); 13476 } 13477 else { 13478 iobj = PyNumber_Long(v); 13479 } 13480 if (iobj!=NULL) { 13481 if (PyLong_Check(iobj)) { 13482 isnumok = 1; 13483 temp = formatlong(iobj, flags, prec, (c == 'i'? 
'd': c)); 13484 Py_DECREF(iobj); 13485 if (!temp) 13486 goto onError; 13487 if (PyUnicode_READY(temp) == -1) { 13488 Py_CLEAR(temp); 13489 goto onError; 13490 } 13491 pbuf = PyUnicode_DATA(temp); 13492 kind = PyUnicode_KIND(temp); 13493 len = PyUnicode_GET_LENGTH(temp); 13494 sign = 1; 13495 } 13496 else { 13497 Py_DECREF(iobj); 13498 } 13499 } 13500 } 13501 if (!isnumok) { 13502 PyErr_Format(PyExc_TypeError, 13503 "%%%c format: a number is required, " 13504 "not %.200s", (char)c, Py_TYPE(v)->tp_name); 13505 goto onError; 13506 } 13507 if (flags & F_ZERO) { 13508 fill = '0'; 13509 fillobj = zero; 13510 } 13511 break; 13512 13513 case 'e': 13514 case 'E': 13515 case 'f': 13516 case 'F': 13517 case 'g': 13518 case 'G': 13519 temp = formatfloat(v, flags, prec, c); 13520 if (!temp) 13521 goto onError; 13522 if (PyUnicode_READY(temp) == -1) { 13523 Py_CLEAR(temp); 13524 goto onError; 13525 } 13526 pbuf = PyUnicode_DATA(temp); 13527 kind = PyUnicode_KIND(temp); 13528 len = PyUnicode_GET_LENGTH(temp); 13529 sign = 1; 13530 if (flags & F_ZERO) { 13531 fill = '0'; 13532 fillobj = zero; 13533 } 13534 break; 13535 13536 case 'c': 13537 { 13538 Py_UCS4 ch = formatchar(v); 13539 if (ch == (Py_UCS4) -1) 13540 goto onError; 13541 temp = _PyUnicode_FromUCS4(&ch, 1); 13542 if (temp == NULL) 13543 goto onError; 13544 pbuf = PyUnicode_DATA(temp); 13545 kind = PyUnicode_KIND(temp); 13546 len = PyUnicode_GET_LENGTH(temp); 13547 break; 13548 } 13549 13550 default: 13551 PyErr_Format(PyExc_ValueError, 13552 "unsupported format character '%c' (0x%x) " 13553 "at index %zd", 13554 (31<=c && c<=126) ? (char)c : '?', 13555 (int)c, 13556 fmtpos - 1); 13557 goto onError; 13558 } 13559 /* pbuf is initialized here. 
*/ 13560 pindex = 0; 13561 if (sign) { 13562 if (PyUnicode_READ(kind, pbuf, pindex) == '-') { 13563 signobj = minus; 13564 len--; 13565 pindex++; 13566 } 13567 else if (PyUnicode_READ(kind, pbuf, pindex) == '+') { 13568 signobj = plus; 13569 len--; 13570 pindex++; 13571 } 13572 else if (flags & F_SIGN) 13573 signobj = plus; 13574 else if (flags & F_BLANK) 13575 signobj = blank; 13576 else 13577 sign = 0; 13578 } 13579 if (width < len) 13580 width = len; 13581 if (sign) { 13582 if (fill != ' ') { 13583 assert(signobj != NULL); 13584 if (_PyAccu_Accumulate(&acc, signobj)) 13585 goto onError; 13586 } 13587 if (width > len) 13588 width--; 13589 } 13590 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) { 13591 assert(PyUnicode_READ(kind, pbuf, pindex) == '0'); 13592 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == c); 13593 if (fill != ' ') { 13594 second = get_latin1_char( 13595 PyUnicode_READ(kind, pbuf, pindex + 1)); 13596 pindex += 2; 13597 if (second == NULL || 13598 _PyAccu_Accumulate(&acc, zero) || 13599 _PyAccu_Accumulate(&acc, second)) 13600 goto onError; 13601 Py_CLEAR(second); 13602 } 13603 width -= 2; 13604 if (width < 0) 13605 width = 0; 13606 len -= 2; 13607 } 13608 if (width > len && !(flags & F_LJUST)) { 13609 assert(fillobj != NULL); 13610 if (repeat_accumulate(&acc, fillobj, width - len)) 13611 goto onError; 13612 width = len; 13613 } 13614 if (fill == ' ') { 13615 if (sign) { 13616 assert(signobj != NULL); 13617 if (_PyAccu_Accumulate(&acc, signobj)) 13618 goto onError; 13619 } 13620 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) { 13621 assert(PyUnicode_READ(kind, pbuf, pindex) == '0'); 13622 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c); 13623 second = get_latin1_char( 13624 PyUnicode_READ(kind, pbuf, pindex + 1)); 13625 pindex += 2; 13626 if (second == NULL || 13627 _PyAccu_Accumulate(&acc, zero) || 13628 _PyAccu_Accumulate(&acc, second)) 13629 goto onError; 13630 Py_CLEAR(second); 13631 } 13632 } 13633 /* Copy all 
characters, preserving len */ 13634 if (temp != NULL) { 13635 assert(pbuf == PyUnicode_DATA(temp)); 13636 v = PyUnicode_Substring(temp, pindex, pindex + len); 13637 } 13638 else { 13639 const char *p = (const char *) pbuf; 13640 assert(pbuf != NULL); 13641 p += kind * pindex; 13642 v = PyUnicode_FromKindAndData(kind, p, len); 13643 } 13644 if (v == NULL) 13645 goto onError; 13646 r = _PyAccu_Accumulate(&acc, v); 13647 Py_DECREF(v); 13648 if (r) 13649 goto onError; 13650 if (width > len && repeat_accumulate(&acc, blank, width - len)) 13651 goto onError; 13652 if (dict && (argidx < arglen) && c != '%') { 13653 PyErr_SetString(PyExc_TypeError, 13654 "not all arguments converted during string formatting"); 13655 goto onError; 13656 } 13657 Py_CLEAR(temp); 13658 } /* '%' */ 13659 } /* until end */ 13660 if (argidx < arglen && !dict) { 13661 PyErr_SetString(PyExc_TypeError, 13662 "not all arguments converted during string formatting"); 13663 goto onError; 13664 } 13665 13666 result = _PyAccu_Finish(&acc); 13667 if (args_owned) { 13668 Py_DECREF(args); 13669 } 13670 Py_DECREF(uformat); 13671 Py_XDECREF(temp); 13672 Py_XDECREF(second); 13673 return result; 13674 13675 onError: 13676 Py_DECREF(uformat); 13677 Py_XDECREF(temp); 13678 Py_XDECREF(second); 13679 _PyAccu_Destroy(&acc); 13680 if (args_owned) { 13681 Py_DECREF(args); 13682 } 13683 return NULL; 13684} 13685 13686static PyObject * 13687unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds); 13688 13689static PyObject * 13690unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds) 13691{ 13692 PyObject *x = NULL; 13693 static char *kwlist[] = {"object", "encoding", "errors", 0}; 13694 char *encoding = NULL; 13695 char *errors = NULL; 13696 13697 if (type != &PyUnicode_Type) 13698 return unicode_subtype_new(type, args, kwds); 13699 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str", 13700 kwlist, &x, &encoding, &errors)) 13701 return NULL; 13702 if (x == NULL) 13703 return PyUnicode_New(0, 
0); 13704 if (encoding == NULL && errors == NULL) 13705 return PyObject_Str(x); 13706 else 13707 return PyUnicode_FromEncodedObject(x, encoding, errors); 13708} 13709 13710static PyObject * 13711unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds) 13712{ 13713 PyObject *unicode, *self; 13714 Py_ssize_t length, char_size; 13715 int share_wstr, share_utf8; 13716 unsigned int kind; 13717 void *data; 13718 13719 assert(PyType_IsSubtype(type, &PyUnicode_Type)); 13720 13721 unicode = unicode_new(&PyUnicode_Type, args, kwds); 13722 if (unicode == NULL) 13723 return NULL; 13724 assert(_PyUnicode_CHECK(unicode)); 13725 if (PyUnicode_READY(unicode)) 13726 return NULL; 13727 13728 self = type->tp_alloc(type, 0); 13729 if (self == NULL) { 13730 Py_DECREF(unicode); 13731 return NULL; 13732 } 13733 kind = PyUnicode_KIND(unicode); 13734 length = PyUnicode_GET_LENGTH(unicode); 13735 13736 _PyUnicode_LENGTH(self) = length; 13737#ifdef Py_DEBUG 13738 _PyUnicode_HASH(self) = -1; 13739#else 13740 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode); 13741#endif 13742 _PyUnicode_STATE(self).interned = 0; 13743 _PyUnicode_STATE(self).kind = kind; 13744 _PyUnicode_STATE(self).compact = 0; 13745 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii; 13746 _PyUnicode_STATE(self).ready = 1; 13747 _PyUnicode_WSTR(self) = NULL; 13748 _PyUnicode_UTF8_LENGTH(self) = 0; 13749 _PyUnicode_UTF8(self) = NULL; 13750 _PyUnicode_WSTR_LENGTH(self) = 0; 13751 _PyUnicode_DATA_ANY(self) = NULL; 13752 13753 share_utf8 = 0; 13754 share_wstr = 0; 13755 if (kind == PyUnicode_1BYTE_KIND) { 13756 char_size = 1; 13757 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128) 13758 share_utf8 = 1; 13759 } 13760 else if (kind == PyUnicode_2BYTE_KIND) { 13761 char_size = 2; 13762 if (sizeof(wchar_t) == 2) 13763 share_wstr = 1; 13764 } 13765 else { 13766 assert(kind == PyUnicode_4BYTE_KIND); 13767 char_size = 4; 13768 if (sizeof(wchar_t) == 4) 13769 share_wstr = 1; 13770 } 13771 13772 /* Ensure we won't 
overflow the length. */ 13773 if (length > (PY_SSIZE_T_MAX / char_size - 1)) { 13774 PyErr_NoMemory(); 13775 goto onError; 13776 } 13777 data = PyObject_MALLOC((length + 1) * char_size); 13778 if (data == NULL) { 13779 PyErr_NoMemory(); 13780 goto onError; 13781 } 13782 13783 _PyUnicode_DATA_ANY(self) = data; 13784 if (share_utf8) { 13785 _PyUnicode_UTF8_LENGTH(self) = length; 13786 _PyUnicode_UTF8(self) = data; 13787 } 13788 if (share_wstr) { 13789 _PyUnicode_WSTR_LENGTH(self) = length; 13790 _PyUnicode_WSTR(self) = (wchar_t *)data; 13791 } 13792 13793 Py_MEMCPY(data, PyUnicode_DATA(unicode), 13794 kind * (length + 1)); 13795 assert(_PyUnicode_CheckConsistency(self, 1)); 13796#ifdef Py_DEBUG 13797 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode); 13798#endif 13799 Py_DECREF(unicode); 13800 return self; 13801 13802onError: 13803 Py_DECREF(unicode); 13804 Py_DECREF(self); 13805 return NULL; 13806} 13807 13808PyDoc_STRVAR(unicode_doc, 13809 "str(string[, encoding[, errors]]) -> str\n\ 13810\n\ 13811Create a new string object from the given encoded string.\n\ 13812encoding defaults to the current default string encoding.\n\ 13813errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'."); 13814 13815static PyObject *unicode_iter(PyObject *seq); 13816 13817PyTypeObject PyUnicode_Type = { 13818 PyVarObject_HEAD_INIT(&PyType_Type, 0) 13819 "str", /* tp_name */ 13820 sizeof(PyUnicodeObject), /* tp_size */ 13821 0, /* tp_itemsize */ 13822 /* Slots */ 13823 (destructor)unicode_dealloc, /* tp_dealloc */ 13824 0, /* tp_print */ 13825 0, /* tp_getattr */ 13826 0, /* tp_setattr */ 13827 0, /* tp_reserved */ 13828 unicode_repr, /* tp_repr */ 13829 &unicode_as_number, /* tp_as_number */ 13830 &unicode_as_sequence, /* tp_as_sequence */ 13831 &unicode_as_mapping, /* tp_as_mapping */ 13832 (hashfunc) unicode_hash, /* tp_hash*/ 13833 0, /* tp_call*/ 13834 (reprfunc) unicode_str, /* tp_str */ 13835 PyObject_GenericGetAttr, /* tp_getattro */ 13836 0, /* tp_setattro */ 
13837 0, /* tp_as_buffer */ 13838 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE | 13839 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */ 13840 unicode_doc, /* tp_doc */ 13841 0, /* tp_traverse */ 13842 0, /* tp_clear */ 13843 PyUnicode_RichCompare, /* tp_richcompare */ 13844 0, /* tp_weaklistoffset */ 13845 unicode_iter, /* tp_iter */ 13846 0, /* tp_iternext */ 13847 unicode_methods, /* tp_methods */ 13848 0, /* tp_members */ 13849 0, /* tp_getset */ 13850 &PyBaseObject_Type, /* tp_base */ 13851 0, /* tp_dict */ 13852 0, /* tp_descr_get */ 13853 0, /* tp_descr_set */ 13854 0, /* tp_dictoffset */ 13855 0, /* tp_init */ 13856 0, /* tp_alloc */ 13857 unicode_new, /* tp_new */ 13858 PyObject_Del, /* tp_free */ 13859}; 13860 13861/* Initialize the Unicode implementation */ 13862 13863int _PyUnicode_Init(void) 13864{ 13865 int i; 13866 13867 /* XXX - move this array to unicodectype.c ? */ 13868 Py_UCS2 linebreak[] = { 13869 0x000A, /* LINE FEED */ 13870 0x000D, /* CARRIAGE RETURN */ 13871 0x001C, /* FILE SEPARATOR */ 13872 0x001D, /* GROUP SEPARATOR */ 13873 0x001E, /* RECORD SEPARATOR */ 13874 0x0085, /* NEXT LINE */ 13875 0x2028, /* LINE SEPARATOR */ 13876 0x2029, /* PARAGRAPH SEPARATOR */ 13877 }; 13878 13879 /* Init the implementation */ 13880 unicode_empty = PyUnicode_New(0, 0); 13881 assert(_PyUnicode_CheckConsistency(unicode_empty, 1)); 13882 if (!unicode_empty) 13883 Py_FatalError("Can't create empty string"); 13884 13885 for (i = 0; i < 256; i++) 13886 unicode_latin1[i] = NULL; 13887 if (PyType_Ready(&PyUnicode_Type) < 0) 13888 Py_FatalError("Can't initialize 'unicode'"); 13889 13890 /* initialize the linebreak bloom filter */ 13891 bloom_linebreak = make_bloom_mask( 13892 PyUnicode_2BYTE_KIND, linebreak, 13893 Py_ARRAY_LENGTH(linebreak)); 13894 13895 PyType_Ready(&EncodingMapType); 13896 13897#ifdef HAVE_MBCS 13898 winver.dwOSVersionInfoSize = sizeof(winver); 13899 if (!GetVersionEx((OSVERSIONINFO*)&winver)) { 13900 PyErr_SetFromWindowsErr(0); 13901 return -1; 13902 } 
13903#endif 13904 return 0; 13905} 13906 13907/* Finalize the Unicode implementation */ 13908 13909int 13910PyUnicode_ClearFreeList(void) 13911{ 13912 return 0; 13913} 13914 13915void 13916_PyUnicode_Fini(void) 13917{ 13918 int i; 13919 13920 Py_XDECREF(unicode_empty); 13921 unicode_empty = NULL; 13922 13923 for (i = 0; i < 256; i++) { 13924 if (unicode_latin1[i]) { 13925 Py_DECREF(unicode_latin1[i]); 13926 unicode_latin1[i] = NULL; 13927 } 13928 } 13929 _PyUnicode_ClearStaticStrings(); 13930 (void)PyUnicode_ClearFreeList(); 13931} 13932 13933void 13934PyUnicode_InternInPlace(PyObject **p) 13935{ 13936 register PyObject *s = *p; 13937 PyObject *t; 13938#ifdef Py_DEBUG 13939 assert(s != NULL); 13940 assert(_PyUnicode_CHECK(s)); 13941#else 13942 if (s == NULL || !PyUnicode_Check(s)) 13943 return; 13944#endif 13945 /* If it's a subclass, we don't really know what putting 13946 it in the interned dict might do. */ 13947 if (!PyUnicode_CheckExact(s)) 13948 return; 13949 if (PyUnicode_CHECK_INTERNED(s)) 13950 return; 13951 if (_PyUnicode_READY_REPLACE(p)) { 13952 assert(0 && "_PyUnicode_READY_REPLACE fail in PyUnicode_InternInPlace"); 13953 return; 13954 } 13955 s = *p; 13956 if (interned == NULL) { 13957 interned = PyDict_New(); 13958 if (interned == NULL) { 13959 PyErr_Clear(); /* Don't leave an exception */ 13960 return; 13961 } 13962 } 13963 /* It might be that the GetItem call fails even 13964 though the key is present in the dictionary, 13965 namely when this happens during a stack overflow. 
*/ 13966 Py_ALLOW_RECURSION 13967 t = PyDict_GetItem(interned, s); 13968 Py_END_ALLOW_RECURSION 13969 13970 if (t) { 13971 Py_INCREF(t); 13972 Py_DECREF(*p); 13973 *p = t; 13974 return; 13975 } 13976 13977 PyThreadState_GET()->recursion_critical = 1; 13978 if (PyDict_SetItem(interned, s, s) < 0) { 13979 PyErr_Clear(); 13980 PyThreadState_GET()->recursion_critical = 0; 13981 return; 13982 } 13983 PyThreadState_GET()->recursion_critical = 0; 13984 /* The two references in interned are not counted by refcnt. 13985 The deallocator will take care of this */ 13986 Py_REFCNT(s) -= 2; 13987 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL; 13988} 13989 13990void 13991PyUnicode_InternImmortal(PyObject **p) 13992{ 13993 PyUnicode_InternInPlace(p); 13994 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) { 13995 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL; 13996 Py_INCREF(*p); 13997 } 13998} 13999 14000PyObject * 14001PyUnicode_InternFromString(const char *cp) 14002{ 14003 PyObject *s = PyUnicode_FromString(cp); 14004 if (s == NULL) 14005 return NULL; 14006 PyUnicode_InternInPlace(&s); 14007 return s; 14008} 14009 14010void 14011_Py_ReleaseInternedUnicodeStrings(void) 14012{ 14013 PyObject *keys; 14014 PyObject *s; 14015 Py_ssize_t i, n; 14016 Py_ssize_t immortal_size = 0, mortal_size = 0; 14017 14018 if (interned == NULL || !PyDict_Check(interned)) 14019 return; 14020 keys = PyDict_Keys(interned); 14021 if (keys == NULL || !PyList_Check(keys)) { 14022 PyErr_Clear(); 14023 return; 14024 } 14025 14026 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak 14027 detector, interned unicode strings are not forcibly deallocated; 14028 rather, we give them their stolen references back, and then clear 14029 and DECREF the interned dict. 
*/ 14030 14031 n = PyList_GET_SIZE(keys); 14032 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n", 14033 n); 14034 for (i = 0; i < n; i++) { 14035 s = PyList_GET_ITEM(keys, i); 14036 if (PyUnicode_READY(s) == -1) { 14037 assert(0 && "could not ready string"); 14038 fprintf(stderr, "could not ready string\n"); 14039 } 14040 switch (PyUnicode_CHECK_INTERNED(s)) { 14041 case SSTATE_NOT_INTERNED: 14042 /* XXX Shouldn't happen */ 14043 break; 14044 case SSTATE_INTERNED_IMMORTAL: 14045 Py_REFCNT(s) += 1; 14046 immortal_size += PyUnicode_GET_LENGTH(s); 14047 break; 14048 case SSTATE_INTERNED_MORTAL: 14049 Py_REFCNT(s) += 2; 14050 mortal_size += PyUnicode_GET_LENGTH(s); 14051 break; 14052 default: 14053 Py_FatalError("Inconsistent interned string state."); 14054 } 14055 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED; 14056 } 14057 fprintf(stderr, "total size of all interned strings: " 14058 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d " 14059 "mortal/immortal\n", mortal_size, immortal_size); 14060 Py_DECREF(keys); 14061 PyDict_Clear(interned); 14062 Py_DECREF(interned); 14063 interned = NULL; 14064} 14065 14066 14067/********************* Unicode Iterator **************************/ 14068 14069typedef struct { 14070 PyObject_HEAD 14071 Py_ssize_t it_index; 14072 PyObject *it_seq; /* Set to NULL when iterator is exhausted */ 14073} unicodeiterobject; 14074 14075static void 14076unicodeiter_dealloc(unicodeiterobject *it) 14077{ 14078 _PyObject_GC_UNTRACK(it); 14079 Py_XDECREF(it->it_seq); 14080 PyObject_GC_Del(it); 14081} 14082 14083static int 14084unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg) 14085{ 14086 Py_VISIT(it->it_seq); 14087 return 0; 14088} 14089 14090static PyObject * 14091unicodeiter_next(unicodeiterobject *it) 14092{ 14093 PyObject *seq, *item; 14094 14095 assert(it != NULL); 14096 seq = it->it_seq; 14097 if (seq == NULL) 14098 return NULL; 14099 assert(_PyUnicode_CHECK(seq)); 14100 14101 if (it->it_index < 
PyUnicode_GET_LENGTH(seq)) { 14102 int kind = PyUnicode_KIND(seq); 14103 void *data = PyUnicode_DATA(seq); 14104 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index); 14105 item = PyUnicode_FromOrdinal(chr); 14106 if (item != NULL) 14107 ++it->it_index; 14108 return item; 14109 } 14110 14111 Py_DECREF(seq); 14112 it->it_seq = NULL; 14113 return NULL; 14114} 14115 14116static PyObject * 14117unicodeiter_len(unicodeiterobject *it) 14118{ 14119 Py_ssize_t len = 0; 14120 if (it->it_seq) 14121 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index; 14122 return PyLong_FromSsize_t(len); 14123} 14124 14125PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it))."); 14126 14127static PyMethodDef unicodeiter_methods[] = { 14128 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS, 14129 length_hint_doc}, 14130 {NULL, NULL} /* sentinel */ 14131}; 14132 14133PyTypeObject PyUnicodeIter_Type = { 14134 PyVarObject_HEAD_INIT(&PyType_Type, 0) 14135 "str_iterator", /* tp_name */ 14136 sizeof(unicodeiterobject), /* tp_basicsize */ 14137 0, /* tp_itemsize */ 14138 /* methods */ 14139 (destructor)unicodeiter_dealloc, /* tp_dealloc */ 14140 0, /* tp_print */ 14141 0, /* tp_getattr */ 14142 0, /* tp_setattr */ 14143 0, /* tp_reserved */ 14144 0, /* tp_repr */ 14145 0, /* tp_as_number */ 14146 0, /* tp_as_sequence */ 14147 0, /* tp_as_mapping */ 14148 0, /* tp_hash */ 14149 0, /* tp_call */ 14150 0, /* tp_str */ 14151 PyObject_GenericGetAttr, /* tp_getattro */ 14152 0, /* tp_setattro */ 14153 0, /* tp_as_buffer */ 14154 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */ 14155 0, /* tp_doc */ 14156 (traverseproc)unicodeiter_traverse, /* tp_traverse */ 14157 0, /* tp_clear */ 14158 0, /* tp_richcompare */ 14159 0, /* tp_weaklistoffset */ 14160 PyObject_SelfIter, /* tp_iter */ 14161 (iternextfunc)unicodeiter_next, /* tp_iternext */ 14162 unicodeiter_methods, /* tp_methods */ 14163 0, 14164}; 14165 14166static PyObject * 14167unicode_iter(PyObject 
*seq) 14168{ 14169 unicodeiterobject *it; 14170 14171 if (!PyUnicode_Check(seq)) { 14172 PyErr_BadInternalCall(); 14173 return NULL; 14174 } 14175 if (PyUnicode_READY(seq) == -1) 14176 return NULL; 14177 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type); 14178 if (it == NULL) 14179 return NULL; 14180 it->it_index = 0; 14181 Py_INCREF(seq); 14182 it->it_seq = seq; 14183 _PyObject_GC_TRACK(it); 14184 return (PyObject *)it; 14185} 14186 14187 14188size_t 14189Py_UNICODE_strlen(const Py_UNICODE *u) 14190{ 14191 int res = 0; 14192 while(*u++) 14193 res++; 14194 return res; 14195} 14196 14197Py_UNICODE* 14198Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2) 14199{ 14200 Py_UNICODE *u = s1; 14201 while ((*u++ = *s2++)); 14202 return s1; 14203} 14204 14205Py_UNICODE* 14206Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n) 14207{ 14208 Py_UNICODE *u = s1; 14209 while ((*u++ = *s2++)) 14210 if (n-- == 0) 14211 break; 14212 return s1; 14213} 14214 14215Py_UNICODE* 14216Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2) 14217{ 14218 Py_UNICODE *u1 = s1; 14219 u1 += Py_UNICODE_strlen(u1); 14220 Py_UNICODE_strcpy(u1, s2); 14221 return s1; 14222} 14223 14224int 14225Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2) 14226{ 14227 while (*s1 && *s2 && *s1 == *s2) 14228 s1++, s2++; 14229 if (*s1 && *s2) 14230 return (*s1 < *s2) ? -1 : +1; 14231 if (*s1) 14232 return 1; 14233 if (*s2) 14234 return -1; 14235 return 0; 14236} 14237 14238int 14239Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n) 14240{ 14241 register Py_UNICODE u1, u2; 14242 for (; n != 0; n--) { 14243 u1 = *s1; 14244 u2 = *s2; 14245 if (u1 != u2) 14246 return (u1 < u2) ? 
-1 : +1; 14247 if (u1 == '\0') 14248 return 0; 14249 s1++; 14250 s2++; 14251 } 14252 return 0; 14253} 14254 14255Py_UNICODE* 14256Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c) 14257{ 14258 const Py_UNICODE *p; 14259 for (p = s; *p; p++) 14260 if (*p == c) 14261 return (Py_UNICODE*)p; 14262 return NULL; 14263} 14264 14265Py_UNICODE* 14266Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c) 14267{ 14268 const Py_UNICODE *p; 14269 p = s + Py_UNICODE_strlen(s); 14270 while (p != s) { 14271 p--; 14272 if (*p == c) 14273 return (Py_UNICODE*)p; 14274 } 14275 return NULL; 14276} 14277 14278Py_UNICODE* 14279PyUnicode_AsUnicodeCopy(PyObject *unicode) 14280{ 14281 Py_UNICODE *u, *copy; 14282 Py_ssize_t len, size; 14283 14284 if (!PyUnicode_Check(unicode)) { 14285 PyErr_BadArgument(); 14286 return NULL; 14287 } 14288 u = PyUnicode_AsUnicodeAndSize(unicode, &len); 14289 if (u == NULL) 14290 return NULL; 14291 /* Ensure we won't overflow the size. */ 14292 if (len > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) { 14293 PyErr_NoMemory(); 14294 return NULL; 14295 } 14296 size = len + 1; /* copy the null character */ 14297 size *= sizeof(Py_UNICODE); 14298 copy = PyMem_Malloc(size); 14299 if (copy == NULL) { 14300 PyErr_NoMemory(); 14301 return NULL; 14302 } 14303 memcpy(copy, u, size); 14304 return copy; 14305} 14306 14307/* A _string module, to export formatter_parser and formatter_field_name_split 14308 to the string.Formatter class implemented in Python. 
*/ 14309 14310static PyMethodDef _string_methods[] = { 14311 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split, 14312 METH_O, PyDoc_STR("split the argument as a field name")}, 14313 {"formatter_parser", (PyCFunction) formatter_parser, 14314 METH_O, PyDoc_STR("parse the argument as a format string")}, 14315 {NULL, NULL} 14316}; 14317 14318static struct PyModuleDef _string_module = { 14319 PyModuleDef_HEAD_INIT, 14320 "_string", 14321 PyDoc_STR("string helper module"), 14322 0, 14323 _string_methods, 14324 NULL, 14325 NULL, 14326 NULL, 14327 NULL 14328}; 14329 14330PyMODINIT_FUNC 14331PyInit__string(void) 14332{ 14333 return PyModule_Create(&_string_module); 14334} 14335 14336 14337#ifdef __cplusplus 14338} 14339#endif 14340