unicodeobject.c revision 12be46ca8418593fb2716234912b6a8a8d262966
1/* 2 3Unicode implementation based on original code by Fredrik Lundh, 4modified by Marc-Andre Lemburg <mal@lemburg.com>. 5 6Major speed upgrades to the method implementations at the Reykjavik 7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke. 8 9Copyright (c) Corporation for National Research Initiatives. 10 11-------------------------------------------------------------------- 12The original string type implementation is: 13 14 Copyright (c) 1999 by Secret Labs AB 15 Copyright (c) 1999 by Fredrik Lundh 16 17By obtaining, using, and/or copying this software and/or its 18associated documentation, you agree that you have read, understood, 19and will comply with the following terms and conditions: 20 21Permission to use, copy, modify, and distribute this software and its 22associated documentation for any purpose and without fee is hereby 23granted, provided that the above copyright notice appears in all 24copies, and that both that copyright notice and this permission notice 25appear in supporting documentation, and that the name of Secret Labs 26AB or the author not be used in advertising or publicity pertaining to 27distribution of the software without specific, written prior 28permission. 29 30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO 31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND 32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR 33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT 36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 37-------------------------------------------------------------------- 38 39*/ 40 41#define PY_SSIZE_T_CLEAN 42#include "Python.h" 43#include "ucnhash.h" 44 45#ifdef MS_WINDOWS 46#include <windows.h> 47#endif 48 49#ifdef Py_DEBUG 50# define DONT_MAKE_RESULT_READY 51#endif 52 53/* Endianness switches; defaults to little endian */ 54 55#ifdef WORDS_BIGENDIAN 56# define BYTEORDER_IS_BIG_ENDIAN 57#else 58# define BYTEORDER_IS_LITTLE_ENDIAN 59#endif 60 61/* --- Globals ------------------------------------------------------------ 62 63 The globals are initialized by the _PyUnicode_Init() API and should 64 not be used before calling that API. 65 66*/ 67 68 69#ifdef __cplusplus 70extern "C" { 71#endif 72 73#ifdef Py_DEBUG 74# define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0) 75#else 76# define _PyUnicode_CHECK(op) PyUnicode_Check(op) 77#endif 78 79#define _PyUnicode_UTF8(op) \ 80 (((PyCompactUnicodeObject*)(op))->utf8) 81#define PyUnicode_UTF8(op) \ 82 (assert(_PyUnicode_CHECK(op)), \ 83 assert(PyUnicode_IS_READY(op)), \ 84 PyUnicode_IS_COMPACT_ASCII(op) ? \ 85 ((char*)((PyASCIIObject*)(op) + 1)) : \ 86 _PyUnicode_UTF8(op)) 87#define _PyUnicode_UTF8_LENGTH(op) \ 88 (((PyCompactUnicodeObject*)(op))->utf8_length) 89#define PyUnicode_UTF8_LENGTH(op) \ 90 (assert(_PyUnicode_CHECK(op)), \ 91 assert(PyUnicode_IS_READY(op)), \ 92 PyUnicode_IS_COMPACT_ASCII(op) ? \ 93 ((PyASCIIObject*)(op))->length : \ 94 _PyUnicode_UTF8_LENGTH(op)) 95#define _PyUnicode_WSTR(op) \ 96 (((PyASCIIObject*)(op))->wstr) 97#define _PyUnicode_WSTR_LENGTH(op) \ 98 (((PyCompactUnicodeObject*)(op))->wstr_length) 99#define _PyUnicode_LENGTH(op) \ 100 (((PyASCIIObject *)(op))->length) 101#define _PyUnicode_STATE(op) \ 102 (((PyASCIIObject *)(op))->state) 103#define _PyUnicode_HASH(op) \ 104 (((PyASCIIObject *)(op))->hash) 105#define _PyUnicode_KIND(op) \ 106 (assert(_PyUnicode_CHECK(op)), \ 107 ((PyASCIIObject *)(op))->state.kind) 108#define _PyUnicode_GET_LENGTH(op) \ 109 (assert(_PyUnicode_CHECK(op)), \ 110 ((PyASCIIObject *)(op))->length) 111#define _PyUnicode_DATA_ANY(op) \ 112 (((PyUnicodeObject*)(op))->data.any) 113 114#undef PyUnicode_READY 115#define PyUnicode_READY(op) \ 116 (assert(_PyUnicode_CHECK(op)), \ 117 (PyUnicode_IS_READY(op) ? \ 118 0 : \ 119 _PyUnicode_Ready(op))) 120 121#define _PyUnicode_READY_REPLACE(p_obj) \ 122 (assert(_PyUnicode_CHECK(*p_obj)), \ 123 (PyUnicode_IS_READY(*p_obj) ? \ 124 0 : _PyUnicode_ReadyReplace((PyObject **)(p_obj)))) 125 126#define _PyUnicode_SHARE_UTF8(op) \ 127 (assert(_PyUnicode_CHECK(op)), \ 128 assert(!PyUnicode_IS_COMPACT_ASCII(op)), \ 129 (_PyUnicode_UTF8(op) == PyUnicode_DATA(op))) 130#define _PyUnicode_SHARE_WSTR(op) \ 131 (assert(_PyUnicode_CHECK(op)), \ 132 (_PyUnicode_WSTR(unicode) == PyUnicode_DATA(op))) 133 134/* true if the Unicode object has an allocated UTF-8 memory block 135 (not shared with other data) */ 136#define _PyUnicode_HAS_UTF8_MEMORY(op) \ 137 (assert(_PyUnicode_CHECK(op)), \ 138 (!PyUnicode_IS_COMPACT_ASCII(op) \ 139 && _PyUnicode_UTF8(op) \ 140 && _PyUnicode_UTF8(op) != PyUnicode_DATA(op))) 141 142/* true if the Unicode object has an allocated wstr memory block 143 (not shared with other data) */ 144#define _PyUnicode_HAS_WSTR_MEMORY(op) \ 145 (assert(_PyUnicode_CHECK(op)), \ 146 (_PyUnicode_WSTR(op) && \ 147 (!PyUnicode_IS_READY(op) || \ 148 _PyUnicode_WSTR(op) != PyUnicode_DATA(op)))) 149 150/* Generic helper macro to convert characters of different types. 151 from_type and to_type have to be valid type names, begin and end 152 are pointers to the source characters which should be of type 153 "from_type *". to is a pointer of type "to_type *" and points to the 154 buffer where the result characters are written to. */ 155#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \ 156 do { \ 157 to_type *_to = (to_type *) to; \ 158 const from_type *_iter = (begin); \ 159 const from_type *_end = (end); \ 160 Py_ssize_t n = (_end) - (_iter); \ 161 const from_type *_unrolled_end = \ 162 _iter + (n & ~ (Py_ssize_t) 3); \ 163 while (_iter < (_unrolled_end)) { \ 164 _to[0] = (to_type) _iter[0]; \ 165 _to[1] = (to_type) _iter[1]; \ 166 _to[2] = (to_type) _iter[2]; \ 167 _to[3] = (to_type) _iter[3]; \ 168 _iter += 4; _to += 4; \ 169 } \ 170 while (_iter < (_end)) \ 171 *_to++ = (to_type) *_iter++; \ 172 } while (0) 173 174/* The Unicode string has been modified: reset the hash */ 175#define _PyUnicode_DIRTY(op) do { _PyUnicode_HASH(op) = -1; } while (0) 176 177/* This dictionary holds all interned unicode strings. Note that references 178 to strings in this dictionary are *not* counted in the string's ob_refcnt. 179 When the interned string reaches a refcnt of 0 the string deallocation 180 function will delete the reference from this dictionary. 181 182 Another way to look at this is that to say that the actual reference 183 count of a string is: s->ob_refcnt + (s->state ? 2 : 0) 184*/ 185static PyObject *interned; 186 187/* The empty Unicode object is shared to improve performance. */ 188static PyObject *unicode_empty; 189 190/* List of static strings. */ 191static _Py_Identifier *static_strings; 192 193/* Single character Unicode strings in the Latin-1 range are being 194 shared as well. */ 195static PyObject *unicode_latin1[256]; 196 197/* Fast detection of the most frequent whitespace characters */ 198const unsigned char _Py_ascii_whitespace[] = { 199 0, 0, 0, 0, 0, 0, 0, 0, 200/* case 0x0009: * CHARACTER TABULATION */ 201/* case 0x000A: * LINE FEED */ 202/* case 0x000B: * LINE TABULATION */ 203/* case 0x000C: * FORM FEED */ 204/* case 0x000D: * CARRIAGE RETURN */ 205 0, 1, 1, 1, 1, 1, 0, 0, 206 0, 0, 0, 0, 0, 0, 0, 0, 207/* case 0x001C: * FILE SEPARATOR */ 208/* case 0x001D: * GROUP SEPARATOR */ 209/* case 0x001E: * RECORD SEPARATOR */ 210/* case 0x001F: * UNIT SEPARATOR */ 211 0, 0, 0, 0, 1, 1, 1, 1, 212/* case 0x0020: * SPACE */ 213 1, 0, 0, 0, 0, 0, 0, 0, 214 0, 0, 0, 0, 0, 0, 0, 0, 215 0, 0, 0, 0, 0, 0, 0, 0, 216 0, 0, 0, 0, 0, 0, 0, 0, 217 218 0, 0, 0, 0, 0, 0, 0, 0, 219 0, 0, 0, 0, 0, 0, 0, 0, 220 0, 0, 0, 0, 0, 0, 0, 0, 221 0, 0, 0, 0, 0, 0, 0, 0, 222 0, 0, 0, 0, 0, 0, 0, 0, 223 0, 0, 0, 0, 0, 0, 0, 0, 224 0, 0, 0, 0, 0, 0, 0, 0, 225 0, 0, 0, 0, 0, 0, 0, 0 226}; 227 228/* forward */ 229static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length); 230static PyObject* get_latin1_char(unsigned char ch); 231static void copy_characters( 232 PyObject *to, Py_ssize_t to_start, 233 PyObject *from, Py_ssize_t from_start, 234 Py_ssize_t how_many); 235#ifdef Py_DEBUG 236static int unicode_is_singleton(PyObject *unicode); 237#endif 238 239static PyObject * 240unicode_fromascii(const unsigned char *s, Py_ssize_t size); 241static PyObject * 242_PyUnicode_FromUCS1(const unsigned char *s, Py_ssize_t size); 243static PyObject * 244_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size); 245static PyObject * 246_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size); 247 248static PyObject * 249unicode_encode_call_errorhandler(const char *errors, 250 PyObject **errorHandler,const char *encoding, const char *reason, 251 PyObject *unicode, PyObject **exceptionObject, 252 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos); 253 254static void 255raise_encode_exception(PyObject **exceptionObject, 256 const char *encoding, 257 PyObject *unicode, 258 Py_ssize_t startpos, Py_ssize_t endpos, 259 const char *reason); 260 261/* Same for linebreaks */ 262static unsigned char ascii_linebreak[] = { 263 0, 0, 0, 0, 0, 0, 0, 0, 264/* 0x000A, * LINE FEED */ 265/* 0x000B, * LINE TABULATION */ 266/* 0x000C, * FORM FEED */ 267/* 0x000D, * CARRIAGE RETURN */ 268 0, 0, 1, 1, 1, 1, 0, 0, 269 0, 0, 0, 0, 0, 0, 0, 0, 270/* 0x001C, * FILE SEPARATOR */ 271/* 0x001D, * GROUP SEPARATOR */ 272/* 0x001E, * RECORD SEPARATOR */ 273 0, 0, 0, 0, 1, 1, 1, 0, 274 0, 0, 0, 0, 0, 0, 0, 0, 275 0, 0, 0, 0, 0, 0, 0, 0, 276 0, 0, 0, 0, 0, 0, 0, 0, 277 0, 0, 0, 0, 0, 0, 0, 0, 278 279 0, 0, 0, 0, 0, 0, 0, 0, 280 0, 0, 0, 0, 0, 0, 0, 0, 281 0, 0, 0, 0, 0, 0, 0, 0, 282 0, 0, 0, 0, 0, 0, 0, 0, 283 0, 0, 0, 0, 0, 0, 0, 0, 284 0, 0, 0, 0, 0, 0, 0, 0, 285 0, 0, 0, 0, 0, 0, 0, 0, 286 0, 0, 0, 0, 0, 0, 0, 0 287}; 288 289/* The max unicode value is always 0x10FFFF while using the PEP-393 API. 290 This function is kept for backward compatibility with the old API. */ 291Py_UNICODE 292PyUnicode_GetMax(void) 293{ 294#ifdef Py_UNICODE_WIDE 295 return 0x10FFFF; 296#else 297 /* This is actually an illegal character, so it should 298 not be passed to unichr. */ 299 return 0xFFFF; 300#endif 301} 302 303#ifdef Py_DEBUG 304int 305_PyUnicode_CheckConsistency(PyObject *op, int check_content) 306{ 307 PyASCIIObject *ascii; 308 unsigned int kind; 309 310 assert(PyUnicode_Check(op)); 311 312 ascii = (PyASCIIObject *)op; 313 kind = ascii->state.kind; 314 315 if (ascii->state.ascii == 1 && ascii->state.compact == 1) { 316 assert(kind == PyUnicode_1BYTE_KIND); 317 assert(ascii->state.ready == 1); 318 } 319 else { 320 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op; 321 void *data; 322 323 if (ascii->state.compact == 1) { 324 data = compact + 1; 325 assert(kind == PyUnicode_1BYTE_KIND 326 || kind == PyUnicode_2BYTE_KIND 327 || kind == PyUnicode_4BYTE_KIND); 328 assert(ascii->state.ascii == 0); 329 assert(ascii->state.ready == 1); 330 assert (compact->utf8 != data); 331 } else { 332 PyUnicodeObject *unicode = (PyUnicodeObject *)op; 333 334 data = unicode->data.any; 335 if (kind == PyUnicode_WCHAR_KIND) { 336 assert(ascii->state.compact == 0); 337 assert(ascii->state.ascii == 0); 338 assert(ascii->state.ready == 0); 339 assert(ascii->wstr != NULL); 340 assert(data == NULL); 341 assert(compact->utf8 == NULL); 342 assert(ascii->state.interned == SSTATE_NOT_INTERNED); 343 } 344 else { 345 assert(kind == PyUnicode_1BYTE_KIND 346 || kind == PyUnicode_2BYTE_KIND 347 || kind == PyUnicode_4BYTE_KIND); 348 assert(ascii->state.compact == 0); 349 assert(ascii->state.ready == 1); 350 assert(data != NULL); 351 if (ascii->state.ascii) { 352 assert (compact->utf8 == data); 353 assert (compact->utf8_length == ascii->length); 354 } 355 else 356 assert (compact->utf8 != data); 357 } 358 } 359 if (kind != PyUnicode_WCHAR_KIND) { 360 if ( 361#if SIZEOF_WCHAR_T == 2 362 kind == PyUnicode_2BYTE_KIND 363#else 364 kind == PyUnicode_4BYTE_KIND 365#endif 366 ) 367 { 368 assert(ascii->wstr == data); 369 assert(compact->wstr_length == ascii->length); 370 } else 371 assert(ascii->wstr != data); 372 } 373 374 if (compact->utf8 == NULL) 375 assert(compact->utf8_length == 0); 376 if (ascii->wstr == NULL) 377 assert(compact->wstr_length == 0); 378 } 379 /* check that the best kind is used */ 380 if (check_content && kind != PyUnicode_WCHAR_KIND) 381 { 382 Py_ssize_t i; 383 Py_UCS4 maxchar = 0; 384 void *data = PyUnicode_DATA(ascii); 385 for (i=0; i < ascii->length; i++) 386 { 387 Py_UCS4 ch = PyUnicode_READ(kind, data, i); 388 if (ch > maxchar) 389 maxchar = ch; 390 } 391 if (kind == PyUnicode_1BYTE_KIND) { 392 if (ascii->state.ascii == 0) 393 assert(maxchar >= 128); 394 else 395 assert(maxchar < 128); 396 } 397 else if (kind == PyUnicode_2BYTE_KIND) 398 assert(maxchar >= 0x100); 399 else 400 assert(maxchar >= 0x10000); 401 } 402 if (check_content && !unicode_is_singleton(op)) 403 assert(ascii->hash == -1); 404 return 1; 405} 406#endif 407 408#ifdef HAVE_MBCS 409static OSVERSIONINFOEX winver; 410#endif 411 412/* --- Bloom Filters ----------------------------------------------------- */ 413 414/* stuff to implement simple "bloom filters" for Unicode characters. 415 to keep things simple, we use a single bitmask, using the least 5 416 bits from each unicode characters as the bit index. */ 417 418/* the linebreak mask is set up by Unicode_Init below */ 419 420#if LONG_BIT >= 128 421#define BLOOM_WIDTH 128 422#elif LONG_BIT >= 64 423#define BLOOM_WIDTH 64 424#elif LONG_BIT >= 32 425#define BLOOM_WIDTH 32 426#else 427#error "LONG_BIT is smaller than 32" 428#endif 429 430#define BLOOM_MASK unsigned long 431 432static BLOOM_MASK bloom_linebreak; 433 434#define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1))))) 435#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1))))) 436 437#define BLOOM_LINEBREAK(ch) \ 438 ((ch) < 128U ? ascii_linebreak[(ch)] : \ 439 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch))) 440 441Py_LOCAL_INLINE(BLOOM_MASK) 442make_bloom_mask(int kind, void* ptr, Py_ssize_t len) 443{ 444 /* calculate simple bloom-style bitmask for a given unicode string */ 445 446 BLOOM_MASK mask; 447 Py_ssize_t i; 448 449 mask = 0; 450 for (i = 0; i < len; i++) 451 BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i)); 452 453 return mask; 454} 455 456#define BLOOM_MEMBER(mask, chr, str) \ 457 (BLOOM(mask, chr) \ 458 && (PyUnicode_FindChar(str, chr, 0, PyUnicode_GET_LENGTH(str), 1) >= 0)) 459 460/* Compilation of templated routines */ 461 462#include "stringlib/asciilib.h" 463#include "stringlib/fastsearch.h" 464#include "stringlib/partition.h" 465#include "stringlib/split.h" 466#include "stringlib/count.h" 467#include "stringlib/find.h" 468#include "stringlib/find_max_char.h" 469#include "stringlib/localeutil.h" 470#include "stringlib/undef.h" 471 472#include "stringlib/ucs1lib.h" 473#include "stringlib/fastsearch.h" 474#include "stringlib/partition.h" 475#include "stringlib/split.h" 476#include "stringlib/count.h" 477#include "stringlib/find.h" 478#include "stringlib/find_max_char.h" 479#include "stringlib/localeutil.h" 480#include "stringlib/undef.h" 481 482#include "stringlib/ucs2lib.h" 483#include "stringlib/fastsearch.h" 484#include "stringlib/partition.h" 485#include "stringlib/split.h" 486#include "stringlib/count.h" 487#include "stringlib/find.h" 488#include "stringlib/find_max_char.h" 489#include "stringlib/localeutil.h" 490#include "stringlib/undef.h" 491 492#include "stringlib/ucs4lib.h" 493#include "stringlib/fastsearch.h" 494#include "stringlib/partition.h" 495#include "stringlib/split.h" 496#include "stringlib/count.h" 497#include "stringlib/find.h" 498#include "stringlib/find_max_char.h" 499#include "stringlib/localeutil.h" 500#include "stringlib/undef.h" 501 502#include "stringlib/unicodedefs.h" 503#include "stringlib/fastsearch.h" 504#include "stringlib/count.h" 505#include "stringlib/find.h" 506 507/* --- Unicode Object ----------------------------------------------------- */ 508 509static PyObject * 510fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s)); 511 512Py_LOCAL_INLINE(Py_ssize_t) findchar(void *s, int kind, 513 Py_ssize_t size, Py_UCS4 ch, 514 int direction) 515{ 516 int mode = (direction == 1) ? FAST_SEARCH : FAST_RSEARCH; 517 518 switch (kind) { 519 case PyUnicode_1BYTE_KIND: 520 { 521 Py_UCS1 ch1 = (Py_UCS1) ch; 522 if (ch1 == ch) 523 return ucs1lib_fastsearch((Py_UCS1 *) s, size, &ch1, 1, 0, mode); 524 else 525 return -1; 526 } 527 case PyUnicode_2BYTE_KIND: 528 { 529 Py_UCS2 ch2 = (Py_UCS2) ch; 530 if (ch2 == ch) 531 return ucs2lib_fastsearch((Py_UCS2 *) s, size, &ch2, 1, 0, mode); 532 else 533 return -1; 534 } 535 case PyUnicode_4BYTE_KIND: 536 return ucs4lib_fastsearch((Py_UCS4 *) s, size, &ch, 1, 0, mode); 537 default: 538 assert(0); 539 return -1; 540 } 541} 542 543static PyObject* 544resize_compact(PyObject *unicode, Py_ssize_t length) 545{ 546 Py_ssize_t char_size; 547 Py_ssize_t struct_size; 548 Py_ssize_t new_size; 549 int share_wstr; 550 551 assert(PyUnicode_IS_READY(unicode)); 552 char_size = PyUnicode_KIND(unicode); 553 if (PyUnicode_IS_COMPACT_ASCII(unicode)) 554 struct_size = sizeof(PyASCIIObject); 555 else 556 struct_size = sizeof(PyCompactUnicodeObject); 557 share_wstr = _PyUnicode_SHARE_WSTR(unicode); 558 559 _Py_DEC_REFTOTAL; 560 _Py_ForgetReference(unicode); 561 562 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) { 563 PyErr_NoMemory(); 564 return NULL; 565 } 566 new_size = (struct_size + (length + 1) * char_size); 567 568 unicode = (PyObject *)PyObject_REALLOC((char *)unicode, new_size); 569 if (unicode == NULL) { 570 PyObject_Del(unicode); 571 PyErr_NoMemory(); 572 return NULL; 573 } 574 _Py_NewReference(unicode); 575 _PyUnicode_LENGTH(unicode) = length; 576 if (share_wstr) { 577 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode); 578 if (!PyUnicode_IS_COMPACT_ASCII(unicode)) 579 _PyUnicode_WSTR_LENGTH(unicode) = length; 580 } 581 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode), 582 length, 0); 583 return unicode; 584} 585 586static int 587resize_inplace(PyObject *unicode, Py_ssize_t length) 588{ 589 wchar_t *wstr; 590 assert(!PyUnicode_IS_COMPACT(unicode)); 591 assert(Py_REFCNT(unicode) == 1); 592 593 _PyUnicode_DIRTY(unicode); 594 595 if (PyUnicode_IS_READY(unicode)) { 596 Py_ssize_t char_size; 597 Py_ssize_t new_size; 598 int share_wstr, share_utf8; 599 void *data; 600 601 data = _PyUnicode_DATA_ANY(unicode); 602 assert(data != NULL); 603 char_size = PyUnicode_KIND(unicode); 604 share_wstr = _PyUnicode_SHARE_WSTR(unicode); 605 share_utf8 = _PyUnicode_SHARE_UTF8(unicode); 606 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode)) 607 { 608 PyObject_DEL(_PyUnicode_UTF8(unicode)); 609 _PyUnicode_UTF8(unicode) = NULL; 610 _PyUnicode_UTF8_LENGTH(unicode) = 0; 611 } 612 613 if (length > (PY_SSIZE_T_MAX / char_size - 1)) { 614 PyErr_NoMemory(); 615 return -1; 616 } 617 new_size = (length + 1) * char_size; 618 619 data = (PyObject *)PyObject_REALLOC(data, new_size); 620 if (data == NULL) { 621 PyErr_NoMemory(); 622 return -1; 623 } 624 _PyUnicode_DATA_ANY(unicode) = data; 625 if (share_wstr) { 626 _PyUnicode_WSTR(unicode) = data; 627 _PyUnicode_WSTR_LENGTH(unicode) = length; 628 } 629 if (share_utf8) { 630 _PyUnicode_UTF8(unicode) = data; 631 _PyUnicode_UTF8_LENGTH(unicode) = length; 632 } 633 _PyUnicode_LENGTH(unicode) = length; 634 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0); 635 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) { 636 assert(_PyUnicode_CheckConsistency(unicode, 0)); 637 return 0; 638 } 639 } 640 assert(_PyUnicode_WSTR(unicode) != NULL); 641 642 /* check for integer overflow */ 643 if (length > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) { 644 PyErr_NoMemory(); 645 return -1; 646 } 647 wstr = _PyUnicode_WSTR(unicode); 648 wstr = PyObject_REALLOC(wstr, sizeof(wchar_t) * (length + 1)); 649 if (!wstr) { 650 PyErr_NoMemory(); 651 return -1; 652 } 653 _PyUnicode_WSTR(unicode) = wstr; 654 _PyUnicode_WSTR(unicode)[length] = 0; 655 _PyUnicode_WSTR_LENGTH(unicode) = length; 656 assert(_PyUnicode_CheckConsistency(unicode, 0)); 657 return 0; 658} 659 660static PyObject* 661resize_copy(PyObject *unicode, Py_ssize_t length) 662{ 663 Py_ssize_t copy_length; 664 if (PyUnicode_IS_COMPACT(unicode)) { 665 PyObject *copy; 666 assert(PyUnicode_IS_READY(unicode)); 667 668 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode)); 669 if (copy == NULL) 670 return NULL; 671 672 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode)); 673 copy_characters(copy, 0, unicode, 0, copy_length); 674 return copy; 675 } 676 else { 677 PyObject *w; 678 assert(_PyUnicode_WSTR(unicode) != NULL); 679 assert(_PyUnicode_DATA_ANY(unicode) == NULL); 680 w = (PyObject*)_PyUnicode_New(length); 681 if (w == NULL) 682 return NULL; 683 copy_length = _PyUnicode_WSTR_LENGTH(unicode); 684 copy_length = Py_MIN(copy_length, length); 685 Py_UNICODE_COPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode), 686 copy_length); 687 return w; 688 } 689} 690 691/* We allocate one more byte to make sure the string is 692 Ux0000 terminated; some code (e.g. new_identifier) 693 relies on that. 694 695 XXX This allocator could further be enhanced by assuring that the 696 free list never reduces its size below 1. 697 698*/ 699 700#ifdef Py_DEBUG 701static int unicode_old_new_calls = 0; 702#endif 703 704static PyUnicodeObject * 705_PyUnicode_New(Py_ssize_t length) 706{ 707 register PyUnicodeObject *unicode; 708 size_t new_size; 709 710 /* Optimization for empty strings */ 711 if (length == 0 && unicode_empty != NULL) { 712 Py_INCREF(unicode_empty); 713 return (PyUnicodeObject*)unicode_empty; 714 } 715 716 /* Ensure we won't overflow the size. */ 717 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) { 718 return (PyUnicodeObject *)PyErr_NoMemory(); 719 } 720 if (length < 0) { 721 PyErr_SetString(PyExc_SystemError, 722 "Negative size passed to _PyUnicode_New"); 723 return NULL; 724 } 725 726#ifdef Py_DEBUG 727 ++unicode_old_new_calls; 728#endif 729 730 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type); 731 if (unicode == NULL) 732 return NULL; 733 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1); 734 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size); 735 if (!_PyUnicode_WSTR(unicode)) { 736 PyErr_NoMemory(); 737 goto onError; 738 } 739 740 /* Initialize the first element to guard against cases where 741 * the caller fails before initializing str -- unicode_resize() 742 * reads str[0], and the Keep-Alive optimization can keep memory 743 * allocated for str alive across a call to unicode_dealloc(unicode). 744 * We don't want unicode_resize to read uninitialized memory in 745 * that case. 746 */ 747 _PyUnicode_WSTR(unicode)[0] = 0; 748 _PyUnicode_WSTR(unicode)[length] = 0; 749 _PyUnicode_WSTR_LENGTH(unicode) = length; 750 _PyUnicode_HASH(unicode) = -1; 751 _PyUnicode_STATE(unicode).interned = 0; 752 _PyUnicode_STATE(unicode).kind = 0; 753 _PyUnicode_STATE(unicode).compact = 0; 754 _PyUnicode_STATE(unicode).ready = 0; 755 _PyUnicode_STATE(unicode).ascii = 0; 756 _PyUnicode_DATA_ANY(unicode) = NULL; 757 _PyUnicode_LENGTH(unicode) = 0; 758 _PyUnicode_UTF8(unicode) = NULL; 759 _PyUnicode_UTF8_LENGTH(unicode) = 0; 760 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0)); 761 return unicode; 762 763 onError: 764 /* XXX UNREF/NEWREF interface should be more symmetrical */ 765 _Py_DEC_REFTOTAL; 766 _Py_ForgetReference((PyObject *)unicode); 767 PyObject_Del(unicode); 768 return NULL; 769} 770 771static const char* 772unicode_kind_name(PyObject *unicode) 773{ 774 /* don't check consistency: unicode_kind_name() is called from 775 _PyUnicode_Dump() */ 776 if (!PyUnicode_IS_COMPACT(unicode)) 777 { 778 if (!PyUnicode_IS_READY(unicode)) 779 return "wstr"; 780 switch(PyUnicode_KIND(unicode)) 781 { 782 case PyUnicode_1BYTE_KIND: 783 if (PyUnicode_IS_ASCII(unicode)) 784 return "legacy ascii"; 785 else 786 return "legacy latin1"; 787 case PyUnicode_2BYTE_KIND: 788 return "legacy UCS2"; 789 case PyUnicode_4BYTE_KIND: 790 return "legacy UCS4"; 791 default: 792 return "<legacy invalid kind>"; 793 } 794 } 795 assert(PyUnicode_IS_READY(unicode)); 796 switch(PyUnicode_KIND(unicode)) 797 { 798 case PyUnicode_1BYTE_KIND: 799 if (PyUnicode_IS_ASCII(unicode)) 800 return "ascii"; 801 else 802 return "latin1"; 803 case PyUnicode_2BYTE_KIND: 804 return "UCS2"; 805 case PyUnicode_4BYTE_KIND: 806 return "UCS4"; 807 default: 808 return "<invalid compact kind>"; 809 } 810} 811 812#ifdef Py_DEBUG 813static int unicode_new_new_calls = 0; 814 815/* Functions wrapping macros for use in debugger */ 816char *_PyUnicode_utf8(void *unicode){ 817 return PyUnicode_UTF8(unicode); 818} 819 820void *_PyUnicode_compact_data(void *unicode) { 821 return _PyUnicode_COMPACT_DATA(unicode); 822} 823void *_PyUnicode_data(void *unicode){ 824 printf("obj %p\n", unicode); 825 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode)); 826 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode)); 827 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1))); 828 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1))); 829 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode)); 830 return PyUnicode_DATA(unicode); 831} 832 833void 834_PyUnicode_Dump(PyObject *op) 835{ 836 PyASCIIObject *ascii = (PyASCIIObject *)op; 837 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op; 838 PyUnicodeObject *unicode = (PyUnicodeObject *)op; 839 void *data; 840 841 if (ascii->state.compact) 842 { 843 if (ascii->state.ascii) 844 data = (ascii + 1); 845 else 846 data = (compact + 1); 847 } 848 else 849 data = unicode->data.any; 850 printf("%s: len=%zu, ",unicode_kind_name(op), ascii->length); 851 852 if (ascii->wstr == data) 853 printf("shared "); 854 printf("wstr=%p", ascii->wstr); 855 856 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) { 857 printf(" (%zu), ", compact->wstr_length); 858 if (!ascii->state.compact && compact->utf8 == unicode->data.any) 859 printf("shared "); 860 printf("utf8=%p (%zu)", compact->utf8, compact->utf8_length); 861 } 862 printf(", data=%p\n", data); 863} 864#endif 865 866PyObject * 867PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar) 868{ 869 PyObject *obj; 870 PyCompactUnicodeObject *unicode; 871 void *data; 872 int kind_state; 873 int is_sharing, is_ascii; 874 Py_ssize_t char_size; 875 Py_ssize_t struct_size; 876 877 /* Optimization for empty strings */ 878 if (size == 0 && unicode_empty != NULL) { 879 Py_INCREF(unicode_empty); 880 return unicode_empty; 881 } 882 883#ifdef Py_DEBUG 884 ++unicode_new_new_calls; 885#endif 886 887 is_ascii = 0; 888 is_sharing = 0; 889 struct_size = sizeof(PyCompactUnicodeObject); 890 if (maxchar < 128) { 891 kind_state = PyUnicode_1BYTE_KIND; 892 char_size = 1; 893 is_ascii = 1; 894 struct_size = sizeof(PyASCIIObject); 895 } 896 else if (maxchar < 256) { 897 kind_state = PyUnicode_1BYTE_KIND; 898 char_size = 1; 899 } 900 else if (maxchar < 65536) { 901 kind_state = PyUnicode_2BYTE_KIND; 902 char_size = 2; 903 if (sizeof(wchar_t) == 2) 904 is_sharing = 1; 905 } 906 else { 907 kind_state = PyUnicode_4BYTE_KIND; 908 char_size = 4; 909 if (sizeof(wchar_t) == 4) 910 is_sharing = 1; 911 } 912 913 /* Ensure we won't overflow the size. */ 914 if (size < 0) { 915 PyErr_SetString(PyExc_SystemError, 916 "Negative size passed to PyUnicode_New"); 917 return NULL; 918 } 919 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) 920 return PyErr_NoMemory(); 921 922 /* Duplicated allocation code from _PyObject_New() instead of a call to 923 * PyObject_New() so we are able to allocate space for the object and 924 * it's data buffer. 925 */ 926 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size); 927 if (obj == NULL) 928 return PyErr_NoMemory(); 929 obj = PyObject_INIT(obj, &PyUnicode_Type); 930 if (obj == NULL) 931 return NULL; 932 933 unicode = (PyCompactUnicodeObject *)obj; 934 if (is_ascii) 935 data = ((PyASCIIObject*)obj) + 1; 936 else 937 data = unicode + 1; 938 _PyUnicode_LENGTH(unicode) = size; 939 _PyUnicode_HASH(unicode) = -1; 940 _PyUnicode_STATE(unicode).interned = 0; 941 _PyUnicode_STATE(unicode).kind = kind_state; 942 _PyUnicode_STATE(unicode).compact = 1; 943 _PyUnicode_STATE(unicode).ready = 1; 944 _PyUnicode_STATE(unicode).ascii = is_ascii; 945 if (is_ascii) { 946 ((char*)data)[size] = 0; 947 _PyUnicode_WSTR(unicode) = NULL; 948 } 949 else if (kind_state == PyUnicode_1BYTE_KIND) { 950 ((char*)data)[size] = 0; 951 _PyUnicode_WSTR(unicode) = NULL; 952 _PyUnicode_WSTR_LENGTH(unicode) = 0; 953 unicode->utf8 = NULL; 954 unicode->utf8_length = 0; 955 } 956 else { 957 unicode->utf8 = NULL; 958 unicode->utf8_length = 0; 959 if (kind_state == PyUnicode_2BYTE_KIND) 960 ((Py_UCS2*)data)[size] = 0; 961 else /* kind_state == PyUnicode_4BYTE_KIND */ 962 ((Py_UCS4*)data)[size] = 0; 963 if (is_sharing) { 964 _PyUnicode_WSTR_LENGTH(unicode) = size; 965 _PyUnicode_WSTR(unicode) = (wchar_t *)data; 966 } 967 else { 968 _PyUnicode_WSTR_LENGTH(unicode) = 0; 969 _PyUnicode_WSTR(unicode) = NULL; 970 } 971 } 972 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0)); 973 return obj; 974} 975 976#if SIZEOF_WCHAR_T == 2 977/* Helper function to convert a 16-bits wchar_t representation to UCS4, this 978 will decode surrogate pairs, the other conversions are implemented as macros 979 for efficiency. 980 981 This function assumes that unicode can hold one more code point than wstr 982 characters for a terminating null character. */ 983static void 984unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end, 985 PyObject *unicode) 986{ 987 const wchar_t *iter; 988 Py_UCS4 *ucs4_out; 989 990 assert(unicode != NULL); 991 assert(_PyUnicode_CHECK(unicode)); 992 assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND); 993 ucs4_out = PyUnicode_4BYTE_DATA(unicode); 994 995 for (iter = begin; iter < end; ) { 996 assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) + 997 _PyUnicode_GET_LENGTH(unicode))); 998 if (*iter >= 0xD800 && *iter <= 0xDBFF 999 && (iter+1) < end && iter[1] >= 0xDC00 && iter[1] <= 0xDFFF) 1000 { 1001 *ucs4_out++ = (((iter[0] & 0x3FF)<<10) | (iter[1] & 0x3FF)) + 0x10000; 1002 iter += 2; 1003 } 1004 else { 1005 *ucs4_out++ = *iter; 1006 iter++; 1007 } 1008 } 1009 assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) + 1010 _PyUnicode_GET_LENGTH(unicode))); 1011 1012} 1013#endif 1014 1015static int 1016_PyUnicode_Dirty(PyObject *unicode) 1017{ 1018 assert(_PyUnicode_CHECK(unicode)); 1019 if (Py_REFCNT(unicode) != 1) { 1020 PyErr_SetString(PyExc_SystemError, 1021 "Cannot modify a string having more than 1 reference"); 1022 return -1; 1023 } 1024 _PyUnicode_DIRTY(unicode); 1025 return 0; 1026} 1027 1028static int 1029_copy_characters(PyObject *to, Py_ssize_t to_start, 1030 PyObject *from, Py_ssize_t from_start, 1031 Py_ssize_t how_many, int check_maxchar) 1032{ 1033 unsigned int from_kind, to_kind; 1034 void *from_data, *to_data; 1035 int fast; 1036 1037 assert(PyUnicode_Check(from)); 1038 assert(PyUnicode_Check(to)); 1039 assert(PyUnicode_IS_READY(from)); 1040 assert(PyUnicode_IS_READY(to)); 1041 1042 assert(PyUnicode_GET_LENGTH(from) >= how_many); 1043 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to)); 1044 assert(0 <= how_many); 1045 1046 if (how_many == 0) 1047 return 0; 1048 1049 from_kind = PyUnicode_KIND(from); 1050 from_data = PyUnicode_DATA(from); 1051 to_kind = PyUnicode_KIND(to); 1052 to_data = PyUnicode_DATA(to); 1053 1054#ifdef Py_DEBUG 1055 if (!check_maxchar 1056 && (from_kind > to_kind 1057 || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))) 1058 { 1059 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to); 1060 Py_UCS4 ch; 1061 Py_ssize_t i; 1062 for (i=0; i < how_many; i++) { 1063 ch = PyUnicode_READ(from_kind, from_data, from_start + i); 1064 assert(ch <= to_maxchar); 1065 } 1066 } 1067#endif 1068 fast = (from_kind == to_kind); 1069 if (check_maxchar 1070 && (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))) 1071 { 1072 /* deny latin1 => ascii */ 1073 fast = 0; 1074 } 1075 1076 if (fast) { 1077 Py_MEMCPY((char*)to_data + to_kind * to_start, 1078 (char*)from_data + from_kind * from_start, 1079 to_kind * how_many); 1080 } 1081 else if (from_kind == PyUnicode_1BYTE_KIND 1082 && to_kind == PyUnicode_2BYTE_KIND) 1083 { 1084 _PyUnicode_CONVERT_BYTES( 1085 Py_UCS1, Py_UCS2, 1086 PyUnicode_1BYTE_DATA(from) + from_start, 1087 PyUnicode_1BYTE_DATA(from) + from_start + how_many, 1088 PyUnicode_2BYTE_DATA(to) + to_start 1089 ); 1090 } 1091 else if (from_kind == PyUnicode_1BYTE_KIND 1092 && to_kind == PyUnicode_4BYTE_KIND) 1093 { 1094 _PyUnicode_CONVERT_BYTES( 1095 Py_UCS1, Py_UCS4, 1096 PyUnicode_1BYTE_DATA(from) + from_start, 1097 PyUnicode_1BYTE_DATA(from) + from_start + how_many, 1098 PyUnicode_4BYTE_DATA(to) + to_start 1099 ); 1100 } 1101 else if (from_kind == PyUnicode_2BYTE_KIND 1102 && to_kind == PyUnicode_4BYTE_KIND) 1103 { 1104 _PyUnicode_CONVERT_BYTES( 1105 Py_UCS2, Py_UCS4, 1106 PyUnicode_2BYTE_DATA(from) + from_start, 1107 PyUnicode_2BYTE_DATA(from) + from_start + how_many, 1108 PyUnicode_4BYTE_DATA(to) + to_start 1109 ); 1110 } 1111 else { 1112 /* check if max_char(from substring) <= max_char(to) */ 1113 if (from_kind > to_kind 1114 /* latin1 => ascii */ 1115 || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))) 1116 { 1117 /* slow path to check for character overflow */ 1118 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to); 1119 Py_UCS4 ch; 1120 Py_ssize_t i; 1121 1122#ifdef Py_DEBUG 1123 for (i=0; i < how_many; i++) { 1124 ch = PyUnicode_READ(from_kind, from_data, from_start + i); 1125 assert(ch <= to_maxchar); 1126 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch); 1127 } 1128#else 1129 if (!check_maxchar) { 1130 for (i=0; i < how_many; i++) { 1131 ch = PyUnicode_READ(from_kind, from_data, from_start + i); 1132 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch); 1133 } 1134 } 1135 else { 1136 for (i=0; i < how_many; i++) { 1137 ch = PyUnicode_READ(from_kind, from_data, from_start + i); 1138 if (ch > to_maxchar) 1139 return 1; 1140 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch); 1141 } 1142 } 1143#endif 1144 } 1145 else { 1146 assert(0 && "inconsistent state"); 1147 return 1; 1148 } 1149 } 1150 return 0; 1151} 1152 1153static void 1154copy_characters(PyObject *to, Py_ssize_t to_start, 1155 PyObject *from, Py_ssize_t from_start, 1156 Py_ssize_t how_many) 1157{ 1158 (void)_copy_characters(to, to_start, from, from_start, how_many, 0); 1159} 1160 1161Py_ssize_t 1162PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start, 1163 PyObject *from, Py_ssize_t from_start, 1164 Py_ssize_t how_many) 1165{ 1166 int err; 1167 1168 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) { 1169 PyErr_BadInternalCall(); 1170 return -1; 1171 } 1172 1173 if (PyUnicode_READY(from)) 1174 return -1; 1175 if (PyUnicode_READY(to)) 1176 return -1; 1177 1178 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many); 1179 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) { 1180 PyErr_Format(PyExc_SystemError, 1181 "Cannot write %zi characters at %zi " 1182 "in a string of %zi characters", 1183 how_many, to_start, PyUnicode_GET_LENGTH(to)); 1184 return -1; 1185 } 1186 1187 if (how_many == 0) 1188 return 0; 1189 1190 if (_PyUnicode_Dirty(to)) 1191 return -1; 1192 1193 err = _copy_characters(to, to_start, from, from_start, how_many, 1); 1194 if (err) { 1195 PyErr_Format(PyExc_SystemError, 1196 "Cannot copy %s characters " 1197 "into a string of %s characters", 1198 unicode_kind_name(from), 1199 unicode_kind_name(to)); 1200 return -1; 1201 } 1202 return how_many; 1203} 1204 1205/* Find the maximum code point and count the number of surrogate pairs so a 1206 correct string length can be computed before converting a string to UCS4. 1207 This function counts single surrogates as a character and not as a pair. 1208 1209 Return 0 on success, or -1 on error. */ 1210static int 1211find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end, 1212 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates) 1213{ 1214 const wchar_t *iter; 1215 1216 assert(num_surrogates != NULL && maxchar != NULL); 1217 *num_surrogates = 0; 1218 *maxchar = 0; 1219 1220 for (iter = begin; iter < end; ) { 1221 if (*iter > *maxchar) { 1222 *maxchar = *iter; 1223#if SIZEOF_WCHAR_T != 2 1224 if (*maxchar >= 0x10000) 1225 return 0; 1226#endif 1227 } 1228#if SIZEOF_WCHAR_T == 2 1229 if (*iter >= 0xD800 && *iter <= 0xDBFF 1230 && (iter+1) < end && iter[1] >= 0xDC00 && iter[1] <= 0xDFFF) 1231 { 1232 Py_UCS4 surrogate_val; 1233 surrogate_val = (((iter[0] & 0x3FF)<<10) 1234 | (iter[1] & 0x3FF)) + 0x10000; 1235 ++(*num_surrogates); 1236 if (surrogate_val > *maxchar) 1237 *maxchar = surrogate_val; 1238 iter += 2; 1239 } 1240 else 1241 iter++; 1242#else 1243 iter++; 1244#endif 1245 } 1246 return 0; 1247} 1248 1249#ifdef Py_DEBUG 1250static int unicode_ready_calls = 0; 1251#endif 1252 1253static int 1254unicode_ready(PyObject **p_obj, int replace) 1255{ 1256 PyObject *unicode; 1257 wchar_t *end; 1258 Py_UCS4 maxchar = 0; 1259 Py_ssize_t num_surrogates; 1260#if SIZEOF_WCHAR_T == 2 1261 Py_ssize_t length_wo_surrogates; 1262#endif 1263 1264 assert(p_obj != NULL); 1265 unicode = *p_obj; 1266 1267 /* _PyUnicode_Ready() is only intended for old-style API usage where 1268 strings were created using _PyObject_New() and where no canonical 1269 representation (the str field) has been set yet aka strings 1270 which are not yet ready. */ 1271 assert(_PyUnicode_CHECK(unicode)); 1272 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND); 1273 assert(_PyUnicode_WSTR(unicode) != NULL); 1274 assert(_PyUnicode_DATA_ANY(unicode) == NULL); 1275 assert(_PyUnicode_UTF8(unicode) == NULL); 1276 /* Actually, it should neither be interned nor be anything else: */ 1277 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED); 1278 1279#ifdef Py_DEBUG 1280 ++unicode_ready_calls; 1281#endif 1282 1283#ifdef Py_DEBUG 1284 assert(!replace || Py_REFCNT(unicode) == 1); 1285#else 1286 if (replace && Py_REFCNT(unicode) != 1) 1287 replace = 0; 1288#endif 1289 if (replace) { 1290 Py_ssize_t len = _PyUnicode_WSTR_LENGTH(unicode); 1291 wchar_t *wstr = _PyUnicode_WSTR(unicode); 1292 /* Optimization for empty strings */ 1293 if (len == 0) { 1294 Py_INCREF(unicode_empty); 1295 Py_DECREF(*p_obj); 1296 *p_obj = unicode_empty; 1297 return 0; 1298 } 1299 if (len == 1 && wstr[0] < 256) { 1300 PyObject *latin1_char = get_latin1_char((unsigned char)wstr[0]); 1301 if (latin1_char == NULL) 1302 return -1; 1303 Py_DECREF(*p_obj); 1304 *p_obj = latin1_char; 1305 return 0; 1306 } 1307 } 1308 1309 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode); 1310 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end, 1311 &maxchar, &num_surrogates) == -1) 1312 return -1; 1313 1314 if (maxchar < 256) { 1315 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1); 1316 if (!_PyUnicode_DATA_ANY(unicode)) { 1317 PyErr_NoMemory(); 1318 return -1; 1319 } 1320 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char, 1321 _PyUnicode_WSTR(unicode), end, 1322 PyUnicode_1BYTE_DATA(unicode)); 1323 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0'; 1324 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode); 1325 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND; 1326 if (maxchar < 128) { 1327 _PyUnicode_STATE(unicode).ascii = 1; 1328 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode); 1329 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode); 1330 } 1331 else { 1332 _PyUnicode_STATE(unicode).ascii = 0; 1333 _PyUnicode_UTF8(unicode) = NULL; 1334 _PyUnicode_UTF8_LENGTH(unicode) = 0; 1335 } 1336 PyObject_FREE(_PyUnicode_WSTR(unicode)); 1337 _PyUnicode_WSTR(unicode) = NULL; 1338 _PyUnicode_WSTR_LENGTH(unicode) = 0; 1339 } 1340 /* In this case we might have to convert down from 4-byte native 1341 wchar_t to 2-byte unicode. */ 1342 else if (maxchar < 65536) { 1343 assert(num_surrogates == 0 && 1344 "FindMaxCharAndNumSurrogatePairs() messed up"); 1345 1346#if SIZEOF_WCHAR_T == 2 1347 /* We can share representations and are done. */ 1348 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode); 1349 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0'; 1350 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode); 1351 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND; 1352 _PyUnicode_UTF8(unicode) = NULL; 1353 _PyUnicode_UTF8_LENGTH(unicode) = 0; 1354#else 1355 /* sizeof(wchar_t) == 4 */ 1356 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC( 1357 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1)); 1358 if (!_PyUnicode_DATA_ANY(unicode)) { 1359 PyErr_NoMemory(); 1360 return -1; 1361 } 1362 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2, 1363 _PyUnicode_WSTR(unicode), end, 1364 PyUnicode_2BYTE_DATA(unicode)); 1365 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0'; 1366 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode); 1367 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND; 1368 _PyUnicode_UTF8(unicode) = NULL; 1369 _PyUnicode_UTF8_LENGTH(unicode) = 0; 1370 PyObject_FREE(_PyUnicode_WSTR(unicode)); 1371 _PyUnicode_WSTR(unicode) = NULL; 1372 _PyUnicode_WSTR_LENGTH(unicode) = 0; 1373#endif 1374 } 1375 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */ 1376 else { 1377#if SIZEOF_WCHAR_T == 2 1378 /* in case the native representation is 2-bytes, we need to allocate a 1379 new normalized 4-byte version. */ 1380 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates; 1381 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1)); 1382 if (!_PyUnicode_DATA_ANY(unicode)) { 1383 PyErr_NoMemory(); 1384 return -1; 1385 } 1386 _PyUnicode_LENGTH(unicode) = length_wo_surrogates; 1387 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND; 1388 _PyUnicode_UTF8(unicode) = NULL; 1389 _PyUnicode_UTF8_LENGTH(unicode) = 0; 1390 /* unicode_convert_wchar_to_ucs4() requires a ready string */ 1391 _PyUnicode_STATE(unicode).ready = 1; 1392 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode); 1393 PyObject_FREE(_PyUnicode_WSTR(unicode)); 1394 _PyUnicode_WSTR(unicode) = NULL; 1395 _PyUnicode_WSTR_LENGTH(unicode) = 0; 1396#else 1397 assert(num_surrogates == 0); 1398 1399 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode); 1400 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode); 1401 _PyUnicode_UTF8(unicode) = NULL; 1402 _PyUnicode_UTF8_LENGTH(unicode) = 0; 1403 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND; 1404#endif 1405 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0'; 1406 } 1407 _PyUnicode_STATE(unicode).ready = 1; 1408 assert(_PyUnicode_CheckConsistency(unicode, 1)); 1409 return 0; 1410} 1411 1412int 1413_PyUnicode_ReadyReplace(PyObject **op) 1414{ 1415 return unicode_ready(op, 1); 1416} 1417 1418int 1419_PyUnicode_Ready(PyObject *op) 1420{ 1421 return unicode_ready(&op, 0); 1422} 1423 1424static void 1425unicode_dealloc(register PyObject *unicode) 1426{ 1427 switch (PyUnicode_CHECK_INTERNED(unicode)) { 1428 case SSTATE_NOT_INTERNED: 1429 break; 1430 1431 case SSTATE_INTERNED_MORTAL: 1432 /* revive dead object temporarily for DelItem */ 1433 Py_REFCNT(unicode) = 3; 1434 if (PyDict_DelItem(interned, unicode) != 0) 1435 Py_FatalError( 1436 "deletion of interned string failed"); 1437 break; 1438 1439 case SSTATE_INTERNED_IMMORTAL: 1440 Py_FatalError("Immortal interned string died."); 1441 1442 default: 1443 Py_FatalError("Inconsistent interned string state."); 1444 } 1445 1446 if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) 1447 PyObject_DEL(_PyUnicode_WSTR(unicode)); 1448 if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) 1449 PyObject_DEL(_PyUnicode_UTF8(unicode)); 1450 1451 if (PyUnicode_IS_COMPACT(unicode)) { 1452 Py_TYPE(unicode)->tp_free(unicode); 1453 } 1454 else { 1455 if (_PyUnicode_DATA_ANY(unicode)) 1456 PyObject_DEL(_PyUnicode_DATA_ANY(unicode)); 1457 Py_TYPE(unicode)->tp_free(unicode); 1458 } 1459} 1460 1461#ifdef Py_DEBUG 1462static int 1463unicode_is_singleton(PyObject *unicode) 1464{ 1465 PyASCIIObject *ascii = (PyASCIIObject *)unicode; 1466 if (unicode == unicode_empty) 1467 return 1; 1468 if (ascii->state.kind != PyUnicode_WCHAR_KIND && ascii->length == 1) 1469 { 1470 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0); 1471 if (ch < 256 && unicode_latin1[ch] == unicode) 1472 return 1; 1473 } 1474 return 0; 1475} 1476#endif 1477 1478static int 1479unicode_resizable(PyObject *unicode) 1480{ 1481 if (Py_REFCNT(unicode) != 1) 1482 return 0; 1483 if (PyUnicode_CHECK_INTERNED(unicode)) 1484 return 0; 1485#ifdef Py_DEBUG 1486 /* singleton refcount is greater than 1 */ 1487 assert(!unicode_is_singleton(unicode)); 1488#endif 1489 return 1; 1490} 1491 1492static int 1493unicode_resize(PyObject **p_unicode, Py_ssize_t length) 1494{ 1495 PyObject *unicode; 1496 Py_ssize_t old_length; 1497 1498 assert(p_unicode != NULL); 1499 unicode = *p_unicode; 1500 1501 assert(unicode != NULL); 1502 assert(PyUnicode_Check(unicode)); 1503 assert(0 <= length); 1504 1505 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND) 1506 old_length = PyUnicode_WSTR_LENGTH(unicode); 1507 else 1508 old_length = PyUnicode_GET_LENGTH(unicode); 1509 if (old_length == length) 1510 return 0; 1511 1512 if (!unicode_resizable(unicode)) { 1513 PyObject *copy = resize_copy(unicode, length); 1514 if (copy == NULL) 1515 return -1; 1516 Py_DECREF(*p_unicode); 1517 *p_unicode = copy; 1518 return 0; 1519 } 1520 1521 if (PyUnicode_IS_COMPACT(unicode)) { 1522 *p_unicode = resize_compact(unicode, length); 1523 if (*p_unicode == NULL) 1524 return -1; 1525 assert(_PyUnicode_CheckConsistency(*p_unicode, 0)); 1526 return 0; 1527 } 1528 return resize_inplace(unicode, length); 1529} 1530 1531int 1532PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length) 1533{ 1534 PyObject *unicode; 1535 if (p_unicode == NULL) { 1536 PyErr_BadInternalCall(); 1537 return -1; 1538 } 1539 unicode = *p_unicode; 1540 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0 1541 || _PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) 1542 { 1543 PyErr_BadInternalCall(); 1544 return -1; 1545 } 1546 return unicode_resize(p_unicode, length); 1547} 1548 1549static PyObject* 1550get_latin1_char(unsigned char ch) 1551{ 1552 PyObject *unicode = unicode_latin1[ch]; 1553 if (!unicode) { 1554 unicode = PyUnicode_New(1, ch); 1555 if (!unicode) 1556 return NULL; 1557 PyUnicode_1BYTE_DATA(unicode)[0] = ch; 1558 assert(_PyUnicode_CheckConsistency(unicode, 1)); 1559 unicode_latin1[ch] = unicode; 1560 } 1561 Py_INCREF(unicode); 1562 return unicode; 1563} 1564 1565PyObject * 1566PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size) 1567{ 1568 PyObject *unicode; 1569 Py_UCS4 maxchar = 0; 1570 Py_ssize_t num_surrogates; 1571 1572 if (u == NULL) 1573 return (PyObject*)_PyUnicode_New(size); 1574 1575 /* If the Unicode data is known at construction time, we can apply 1576 some optimizations which share commonly used objects. */ 1577 1578 /* Optimization for empty strings */ 1579 if (size == 0 && unicode_empty != NULL) { 1580 Py_INCREF(unicode_empty); 1581 return unicode_empty; 1582 } 1583 1584 /* Single character Unicode objects in the Latin-1 range are 1585 shared when using this constructor */ 1586 if (size == 1 && *u < 256) 1587 return get_latin1_char((unsigned char)*u); 1588 1589 /* If not empty and not single character, copy the Unicode data 1590 into the new object */ 1591 if (find_maxchar_surrogates(u, u + size, 1592 &maxchar, &num_surrogates) == -1) 1593 return NULL; 1594 1595 unicode = PyUnicode_New(size - num_surrogates, 1596 maxchar); 1597 if (!unicode) 1598 return NULL; 1599 1600 switch (PyUnicode_KIND(unicode)) { 1601 case PyUnicode_1BYTE_KIND: 1602 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char, 1603 u, u + size, PyUnicode_1BYTE_DATA(unicode)); 1604 break; 1605 case PyUnicode_2BYTE_KIND: 1606#if Py_UNICODE_SIZE == 2 1607 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2); 1608#else 1609 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2, 1610 u, u + size, PyUnicode_2BYTE_DATA(unicode)); 1611#endif 1612 break; 1613 case PyUnicode_4BYTE_KIND: 1614#if SIZEOF_WCHAR_T == 2 1615 /* This is the only case which has to process surrogates, thus 1616 a simple copy loop is not enough and we need a function. */ 1617 unicode_convert_wchar_to_ucs4(u, u + size, unicode); 1618#else 1619 assert(num_surrogates == 0); 1620 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4); 1621#endif 1622 break; 1623 default: 1624 assert(0 && "Impossible state"); 1625 } 1626 1627 assert(_PyUnicode_CheckConsistency(unicode, 1)); 1628 return unicode; 1629} 1630 1631PyObject * 1632PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size) 1633{ 1634 if (size < 0) { 1635 PyErr_SetString(PyExc_SystemError, 1636 "Negative size passed to PyUnicode_FromStringAndSize"); 1637 return NULL; 1638 } 1639 1640 /* If the Unicode data is known at construction time, we can apply 1641 some optimizations which share commonly used objects. 1642 Also, this means the input must be UTF-8, so fall back to the 1643 UTF-8 decoder at the end. */ 1644 if (u != NULL) { 1645 1646 /* Optimization for empty strings */ 1647 if (size == 0 && unicode_empty != NULL) { 1648 Py_INCREF(unicode_empty); 1649 return unicode_empty; 1650 } 1651 1652 /* Single characters are shared when using this constructor. 1653 Restrict to ASCII, since the input must be UTF-8. */ 1654 if (size == 1 && (unsigned char)*u < 128) 1655 return get_latin1_char((unsigned char)*u); 1656 1657 return PyUnicode_DecodeUTF8(u, size, NULL); 1658 } 1659 1660 return (PyObject *)_PyUnicode_New(size); 1661} 1662 1663PyObject * 1664PyUnicode_FromString(const char *u) 1665{ 1666 size_t size = strlen(u); 1667 if (size > PY_SSIZE_T_MAX) { 1668 PyErr_SetString(PyExc_OverflowError, "input too long"); 1669 return NULL; 1670 } 1671 1672 return PyUnicode_FromStringAndSize(u, size); 1673} 1674 1675PyObject * 1676_PyUnicode_FromId(_Py_Identifier *id) 1677{ 1678 if (!id->object) { 1679 id->object = PyUnicode_FromString(id->string); 1680 if (!id->object) 1681 return NULL; 1682 PyUnicode_InternInPlace(&id->object); 1683 assert(!id->next); 1684 id->next = static_strings; 1685 static_strings = id; 1686 } 1687 Py_INCREF(id->object); 1688 return id->object; 1689} 1690 1691void 1692_PyUnicode_ClearStaticStrings() 1693{ 1694 _Py_Identifier *i; 1695 for (i = static_strings; i; i = i->next) { 1696 Py_DECREF(i->object); 1697 i->object = NULL; 1698 i->next = NULL; 1699 } 1700} 1701 1702static PyObject* 1703unicode_fromascii(const unsigned char* s, Py_ssize_t size) 1704{ 1705 PyObject *res; 1706#ifdef Py_DEBUG 1707 const unsigned char *p; 1708 const unsigned char *end = s + size; 1709 for (p=s; p < end; p++) { 1710 assert(*p < 128); 1711 } 1712#endif 1713 if (size == 1) 1714 return get_latin1_char(s[0]); 1715 res = PyUnicode_New(size, 127); 1716 if (!res) 1717 return NULL; 1718 memcpy(PyUnicode_1BYTE_DATA(res), s, size); 1719 return res; 1720} 1721 1722static Py_UCS4 1723kind_maxchar_limit(unsigned int kind) 1724{ 1725 switch(kind) { 1726 case PyUnicode_1BYTE_KIND: 1727 return 0x80; 1728 case PyUnicode_2BYTE_KIND: 1729 return 0x100; 1730 case PyUnicode_4BYTE_KIND: 1731 return 0x10000; 1732 default: 1733 assert(0 && "invalid kind"); 1734 return 0x10ffff; 1735 } 1736} 1737 1738static PyObject* 1739_PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size) 1740{ 1741 PyObject *res; 1742 unsigned char max_char = 127; 1743 1744 assert(size >= 0); 1745 if (size == 1) 1746 return get_latin1_char(u[0]); 1747 max_char = ucs1lib_find_max_char(u, u + size); 1748 res = PyUnicode_New(size, max_char); 1749 if (!res) 1750 return NULL; 1751 memcpy(PyUnicode_1BYTE_DATA(res), u, size); 1752 assert(_PyUnicode_CheckConsistency(res, 1)); 1753 return res; 1754} 1755 1756static PyObject* 1757_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size) 1758{ 1759 PyObject *res; 1760 Py_UCS2 max_char = 0; 1761 1762 assert(size >= 0); 1763 if (size == 1 && u[0] < 256) 1764 return get_latin1_char((unsigned char)u[0]); 1765 max_char = ucs2lib_find_max_char(u, u + size); 1766 res = PyUnicode_New(size, max_char); 1767 if (!res) 1768 return NULL; 1769 if (max_char >= 256) 1770 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size); 1771 else { 1772 _PyUnicode_CONVERT_BYTES( 1773 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res)); 1774 } 1775 assert(_PyUnicode_CheckConsistency(res, 1)); 1776 return res; 1777} 1778 1779static PyObject* 1780_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size) 1781{ 1782 PyObject *res; 1783 Py_UCS4 max_char = 0; 1784 1785 assert(size >= 0); 1786 if (size == 1 && u[0] < 256) 1787 return get_latin1_char(u[0]); 1788 max_char = ucs4lib_find_max_char(u, u + size); 1789 res = PyUnicode_New(size, max_char); 1790 if (!res) 1791 return NULL; 1792 if (max_char < 256) 1793 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size, 1794 PyUnicode_1BYTE_DATA(res)); 1795 else if (max_char < 0x10000) 1796 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size, 1797 PyUnicode_2BYTE_DATA(res)); 1798 else 1799 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size); 1800 assert(_PyUnicode_CheckConsistency(res, 1)); 1801 return res; 1802} 1803 1804PyObject* 1805PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size) 1806{ 1807 switch(kind) { 1808 case PyUnicode_1BYTE_KIND: 1809 return _PyUnicode_FromUCS1(buffer, size); 1810 case PyUnicode_2BYTE_KIND: 1811 return _PyUnicode_FromUCS2(buffer, size); 1812 case PyUnicode_4BYTE_KIND: 1813 return _PyUnicode_FromUCS4(buffer, size); 1814 default: 1815 assert(0 && "invalid kind"); 1816 PyErr_SetString(PyExc_SystemError, "invalid kind"); 1817 return NULL; 1818 } 1819} 1820 1821/* Ensure that a string uses the most efficient storage, if it is not the 1822 case: create a new string with of the right kind. Write NULL into *p_unicode 1823 on error. */ 1824static void 1825unicode_adjust_maxchar(PyObject **p_unicode) 1826{ 1827 PyObject *unicode, *copy; 1828 Py_UCS4 max_char; 1829 Py_ssize_t len; 1830 unsigned int kind; 1831 1832 assert(p_unicode != NULL); 1833 unicode = *p_unicode; 1834 assert(PyUnicode_IS_READY(unicode)); 1835 if (PyUnicode_IS_ASCII(unicode)) 1836 return; 1837 1838 len = PyUnicode_GET_LENGTH(unicode); 1839 kind = PyUnicode_KIND(unicode); 1840 if (kind == PyUnicode_1BYTE_KIND) { 1841 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode); 1842 max_char = ucs1lib_find_max_char(u, u + len); 1843 if (max_char >= 128) 1844 return; 1845 } 1846 else if (kind == PyUnicode_2BYTE_KIND) { 1847 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode); 1848 max_char = ucs2lib_find_max_char(u, u + len); 1849 if (max_char >= 256) 1850 return; 1851 } 1852 else { 1853 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode); 1854 assert(kind == PyUnicode_4BYTE_KIND); 1855 max_char = ucs4lib_find_max_char(u, u + len); 1856 if (max_char >= 0x10000) 1857 return; 1858 } 1859 copy = PyUnicode_New(len, max_char); 1860 copy_characters(copy, 0, unicode, 0, len); 1861 Py_DECREF(unicode); 1862 *p_unicode = copy; 1863} 1864 1865PyObject* 1866PyUnicode_Copy(PyObject *unicode) 1867{ 1868 Py_ssize_t size; 1869 PyObject *copy; 1870 void *data; 1871 1872 if (!PyUnicode_Check(unicode)) { 1873 PyErr_BadInternalCall(); 1874 return NULL; 1875 } 1876 if (PyUnicode_READY(unicode)) 1877 return NULL; 1878 1879 size = PyUnicode_GET_LENGTH(unicode); 1880 copy = PyUnicode_New(size, PyUnicode_MAX_CHAR_VALUE(unicode)); 1881 if (!copy) 1882 return NULL; 1883 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode)); 1884 1885 data = PyUnicode_DATA(unicode); 1886 switch (PyUnicode_KIND(unicode)) 1887 { 1888 case PyUnicode_1BYTE_KIND: 1889 memcpy(PyUnicode_1BYTE_DATA(copy), data, size); 1890 break; 1891 case PyUnicode_2BYTE_KIND: 1892 memcpy(PyUnicode_2BYTE_DATA(copy), data, sizeof(Py_UCS2) * size); 1893 break; 1894 case PyUnicode_4BYTE_KIND: 1895 memcpy(PyUnicode_4BYTE_DATA(copy), data, sizeof(Py_UCS4) * size); 1896 break; 1897 default: 1898 assert(0); 1899 break; 1900 } 1901 assert(_PyUnicode_CheckConsistency(copy, 1)); 1902 return copy; 1903} 1904 1905 1906/* Widen Unicode objects to larger buffers. Don't write terminating null 1907 character. Return NULL on error. */ 1908 1909void* 1910_PyUnicode_AsKind(PyObject *s, unsigned int kind) 1911{ 1912 Py_ssize_t len; 1913 void *result; 1914 unsigned int skind; 1915 1916 if (PyUnicode_READY(s)) 1917 return NULL; 1918 1919 len = PyUnicode_GET_LENGTH(s); 1920 skind = PyUnicode_KIND(s); 1921 if (skind >= kind) { 1922 PyErr_SetString(PyExc_SystemError, "invalid widening attempt"); 1923 return NULL; 1924 } 1925 switch(kind) { 1926 case PyUnicode_2BYTE_KIND: 1927 result = PyMem_Malloc(len * sizeof(Py_UCS2)); 1928 if (!result) 1929 return PyErr_NoMemory(); 1930 assert(skind == PyUnicode_1BYTE_KIND); 1931 _PyUnicode_CONVERT_BYTES( 1932 Py_UCS1, Py_UCS2, 1933 PyUnicode_1BYTE_DATA(s), 1934 PyUnicode_1BYTE_DATA(s) + len, 1935 result); 1936 return result; 1937 case PyUnicode_4BYTE_KIND: 1938 result = PyMem_Malloc(len * sizeof(Py_UCS4)); 1939 if (!result) 1940 return PyErr_NoMemory(); 1941 if (skind == PyUnicode_2BYTE_KIND) { 1942 _PyUnicode_CONVERT_BYTES( 1943 Py_UCS2, Py_UCS4, 1944 PyUnicode_2BYTE_DATA(s), 1945 PyUnicode_2BYTE_DATA(s) + len, 1946 result); 1947 } 1948 else { 1949 assert(skind == PyUnicode_1BYTE_KIND); 1950 _PyUnicode_CONVERT_BYTES( 1951 Py_UCS1, Py_UCS4, 1952 PyUnicode_1BYTE_DATA(s), 1953 PyUnicode_1BYTE_DATA(s) + len, 1954 result); 1955 } 1956 return result; 1957 default: 1958 break; 1959 } 1960 PyErr_SetString(PyExc_SystemError, "invalid kind"); 1961 return NULL; 1962} 1963 1964static Py_UCS4* 1965as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize, 1966 int copy_null) 1967{ 1968 int kind; 1969 void *data; 1970 Py_ssize_t len, targetlen; 1971 if (PyUnicode_READY(string) == -1) 1972 return NULL; 1973 kind = PyUnicode_KIND(string); 1974 data = PyUnicode_DATA(string); 1975 len = PyUnicode_GET_LENGTH(string); 1976 targetlen = len; 1977 if (copy_null) 1978 targetlen++; 1979 if (!target) { 1980 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) { 1981 PyErr_NoMemory(); 1982 return NULL; 1983 } 1984 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4)); 1985 if (!target) { 1986 PyErr_NoMemory(); 1987 return NULL; 1988 } 1989 } 1990 else { 1991 if (targetsize < targetlen) { 1992 PyErr_Format(PyExc_SystemError, 1993 "string is longer than the buffer"); 1994 if (copy_null && 0 < targetsize) 1995 target[0] = 0; 1996 return NULL; 1997 } 1998 } 1999 if (kind == PyUnicode_1BYTE_KIND) { 2000 Py_UCS1 *start = (Py_UCS1 *) data; 2001 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target); 2002 } 2003 else if (kind == PyUnicode_2BYTE_KIND) { 2004 Py_UCS2 *start = (Py_UCS2 *) data; 2005 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target); 2006 } 2007 else { 2008 assert(kind == PyUnicode_4BYTE_KIND); 2009 Py_MEMCPY(target, data, len * sizeof(Py_UCS4)); 2010 } 2011 if (copy_null) 2012 target[len] = 0; 2013 return target; 2014} 2015 2016Py_UCS4* 2017PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize, 2018 int copy_null) 2019{ 2020 if (target == NULL || targetsize < 1) { 2021 PyErr_BadInternalCall(); 2022 return NULL; 2023 } 2024 return as_ucs4(string, target, targetsize, copy_null); 2025} 2026 2027Py_UCS4* 2028PyUnicode_AsUCS4Copy(PyObject *string) 2029{ 2030 return as_ucs4(string, NULL, 0, 1); 2031} 2032 2033#ifdef HAVE_WCHAR_H 2034 2035PyObject * 2036PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size) 2037{ 2038 if (w == NULL) { 2039 if (size == 0) 2040 return PyUnicode_New(0, 0); 2041 PyErr_BadInternalCall(); 2042 return NULL; 2043 } 2044 2045 if (size == -1) { 2046 size = wcslen(w); 2047 } 2048 2049 return PyUnicode_FromUnicode(w, size); 2050} 2051 2052#endif /* HAVE_WCHAR_H */ 2053 2054static void 2055makefmt(char *fmt, int longflag, int longlongflag, int size_tflag, 2056 int zeropad, int width, int precision, char c) 2057{ 2058 *fmt++ = '%'; 2059 if (width) { 2060 if (zeropad) 2061 *fmt++ = '0'; 2062 fmt += sprintf(fmt, "%d", width); 2063 } 2064 if (precision) 2065 fmt += sprintf(fmt, ".%d", precision); 2066 if (longflag) 2067 *fmt++ = 'l'; 2068 else if (longlongflag) { 2069 /* longlongflag should only ever be nonzero on machines with 2070 HAVE_LONG_LONG defined */ 2071#ifdef HAVE_LONG_LONG 2072 char *f = PY_FORMAT_LONG_LONG; 2073 while (*f) 2074 *fmt++ = *f++; 2075#else 2076 /* we shouldn't ever get here */ 2077 assert(0); 2078 *fmt++ = 'l'; 2079#endif 2080 } 2081 else if (size_tflag) { 2082 char *f = PY_FORMAT_SIZE_T; 2083 while (*f) 2084 *fmt++ = *f++; 2085 } 2086 *fmt++ = c; 2087 *fmt = '\0'; 2088} 2089 2090/* helper for PyUnicode_FromFormatV() */ 2091 2092static const char* 2093parse_format_flags(const char *f, 2094 int *p_width, int *p_precision, 2095 int *p_longflag, int *p_longlongflag, int *p_size_tflag) 2096{ 2097 int width, precision, longflag, longlongflag, size_tflag; 2098 2099 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */ 2100 f++; 2101 width = 0; 2102 while (Py_ISDIGIT((unsigned)*f)) 2103 width = (width*10) + *f++ - '0'; 2104 precision = 0; 2105 if (*f == '.') { 2106 f++; 2107 while (Py_ISDIGIT((unsigned)*f)) 2108 precision = (precision*10) + *f++ - '0'; 2109 if (*f == '%') { 2110 /* "%.3%s" => f points to "3" */ 2111 f--; 2112 } 2113 } 2114 if (*f == '\0') { 2115 /* bogus format "%.1" => go backward, f points to "1" */ 2116 f--; 2117 } 2118 if (p_width != NULL) 2119 *p_width = width; 2120 if (p_precision != NULL) 2121 *p_precision = precision; 2122 2123 /* Handle %ld, %lu, %lld and %llu. */ 2124 longflag = 0; 2125 longlongflag = 0; 2126 size_tflag = 0; 2127 2128 if (*f == 'l') { 2129 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') { 2130 longflag = 1; 2131 ++f; 2132 } 2133#ifdef HAVE_LONG_LONG 2134 else if (f[1] == 'l' && 2135 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) { 2136 longlongflag = 1; 2137 f += 2; 2138 } 2139#endif 2140 } 2141 /* handle the size_t flag. */ 2142 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) { 2143 size_tflag = 1; 2144 ++f; 2145 } 2146 if (p_longflag != NULL) 2147 *p_longflag = longflag; 2148 if (p_longlongflag != NULL) 2149 *p_longlongflag = longlongflag; 2150 if (p_size_tflag != NULL) 2151 *p_size_tflag = size_tflag; 2152 return f; 2153} 2154 2155/* maximum number of characters required for output of %ld. 21 characters 2156 allows for 64-bit integers (in decimal) and an optional sign. */ 2157#define MAX_LONG_CHARS 21 2158/* maximum number of characters required for output of %lld. 2159 We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits, 2160 plus 1 for the sign. 53/22 is an upper bound for log10(256). */ 2161#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22) 2162 2163PyObject * 2164PyUnicode_FromFormatV(const char *format, va_list vargs) 2165{ 2166 va_list count; 2167 Py_ssize_t callcount = 0; 2168 PyObject **callresults = NULL; 2169 PyObject **callresult = NULL; 2170 Py_ssize_t n = 0; 2171 int width = 0; 2172 int precision = 0; 2173 int zeropad; 2174 const char* f; 2175 PyObject *string; 2176 /* used by sprintf */ 2177 char fmt[61]; /* should be enough for %0width.precisionlld */ 2178 Py_UCS4 maxchar = 127; /* result is ASCII by default */ 2179 Py_UCS4 argmaxchar; 2180 Py_ssize_t numbersize = 0; 2181 char *numberresults = NULL; 2182 char *numberresult = NULL; 2183 Py_ssize_t i; 2184 int kind; 2185 void *data; 2186 2187 Py_VA_COPY(count, vargs); 2188 /* step 1: count the number of %S/%R/%A/%s format specifications 2189 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/ 2190 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the 2191 * result in an array) 2192 * also estimate a upper bound for all the number formats in the string, 2193 * numbers will be formatted in step 3 and be kept in a '\0'-separated 2194 * buffer before putting everything together. */ 2195 for (f = format; *f; f++) { 2196 if (*f == '%') { 2197 int longlongflag; 2198 /* skip width or width.precision (eg. "1.2" of "%1.2f") */ 2199 f = parse_format_flags(f, &width, NULL, NULL, &longlongflag, NULL); 2200 if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V') 2201 ++callcount; 2202 2203 else if (*f == 'd' || *f=='u' || *f=='i' || *f=='x' || *f=='p') { 2204#ifdef HAVE_LONG_LONG 2205 if (longlongflag) { 2206 if (width < MAX_LONG_LONG_CHARS) 2207 width = MAX_LONG_LONG_CHARS; 2208 } 2209 else 2210#endif 2211 /* MAX_LONG_CHARS is enough to hold a 64-bit integer, 2212 including sign. Decimal takes the most space. This 2213 isn't enough for octal. If a width is specified we 2214 need more (which we allocate later). */ 2215 if (width < MAX_LONG_CHARS) 2216 width = MAX_LONG_CHARS; 2217 2218 /* account for the size + '\0' to separate numbers 2219 inside of the numberresults buffer */ 2220 numbersize += (width + 1); 2221 } 2222 } 2223 else if ((unsigned char)*f > 127) { 2224 PyErr_Format(PyExc_ValueError, 2225 "PyUnicode_FromFormatV() expects an ASCII-encoded format " 2226 "string, got a non-ASCII byte: 0x%02x", 2227 (unsigned char)*f); 2228 return NULL; 2229 } 2230 } 2231 /* step 2: allocate memory for the results of 2232 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */ 2233 if (callcount) { 2234 callresults = PyObject_Malloc(sizeof(PyObject *) * callcount); 2235 if (!callresults) { 2236 PyErr_NoMemory(); 2237 return NULL; 2238 } 2239 callresult = callresults; 2240 } 2241 /* step 2.5: allocate memory for the results of formating numbers */ 2242 if (numbersize) { 2243 numberresults = PyObject_Malloc(numbersize); 2244 if (!numberresults) { 2245 PyErr_NoMemory(); 2246 goto fail; 2247 } 2248 numberresult = numberresults; 2249 } 2250 2251 /* step 3: format numbers and figure out how large a buffer we need */ 2252 for (f = format; *f; f++) { 2253 if (*f == '%') { 2254 const char* p; 2255 int longflag; 2256 int longlongflag; 2257 int size_tflag; 2258 int numprinted; 2259 2260 p = f; 2261 zeropad = (f[1] == '0'); 2262 f = parse_format_flags(f, &width, &precision, 2263 &longflag, &longlongflag, &size_tflag); 2264 switch (*f) { 2265 case 'c': 2266 { 2267 Py_UCS4 ordinal = va_arg(count, int); 2268 maxchar = Py_MAX(maxchar, ordinal); 2269 n++; 2270 break; 2271 } 2272 case '%': 2273 n++; 2274 break; 2275 case 'i': 2276 case 'd': 2277 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad, 2278 width, precision, *f); 2279 if (longflag) 2280 numprinted = sprintf(numberresult, fmt, 2281 va_arg(count, long)); 2282#ifdef HAVE_LONG_LONG 2283 else if (longlongflag) 2284 numprinted = sprintf(numberresult, fmt, 2285 va_arg(count, PY_LONG_LONG)); 2286#endif 2287 else if (size_tflag) 2288 numprinted = sprintf(numberresult, fmt, 2289 va_arg(count, Py_ssize_t)); 2290 else 2291 numprinted = sprintf(numberresult, fmt, 2292 va_arg(count, int)); 2293 n += numprinted; 2294 /* advance by +1 to skip over the '\0' */ 2295 numberresult += (numprinted + 1); 2296 assert(*(numberresult - 1) == '\0'); 2297 assert(*(numberresult - 2) != '\0'); 2298 assert(numprinted >= 0); 2299 assert(numberresult <= numberresults + numbersize); 2300 break; 2301 case 'u': 2302 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad, 2303 width, precision, 'u'); 2304 if (longflag) 2305 numprinted = sprintf(numberresult, fmt, 2306 va_arg(count, unsigned long)); 2307#ifdef HAVE_LONG_LONG 2308 else if (longlongflag) 2309 numprinted = sprintf(numberresult, fmt, 2310 va_arg(count, unsigned PY_LONG_LONG)); 2311#endif 2312 else if (size_tflag) 2313 numprinted = sprintf(numberresult, fmt, 2314 va_arg(count, size_t)); 2315 else 2316 numprinted = sprintf(numberresult, fmt, 2317 va_arg(count, unsigned int)); 2318 n += numprinted; 2319 numberresult += (numprinted + 1); 2320 assert(*(numberresult - 1) == '\0'); 2321 assert(*(numberresult - 2) != '\0'); 2322 assert(numprinted >= 0); 2323 assert(numberresult <= numberresults + numbersize); 2324 break; 2325 case 'x': 2326 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x'); 2327 numprinted = sprintf(numberresult, fmt, va_arg(count, int)); 2328 n += numprinted; 2329 numberresult += (numprinted + 1); 2330 assert(*(numberresult - 1) == '\0'); 2331 assert(*(numberresult - 2) != '\0'); 2332 assert(numprinted >= 0); 2333 assert(numberresult <= numberresults + numbersize); 2334 break; 2335 case 'p': 2336 numprinted = sprintf(numberresult, "%p", va_arg(count, void*)); 2337 /* %p is ill-defined: ensure leading 0x. */ 2338 if (numberresult[1] == 'X') 2339 numberresult[1] = 'x'; 2340 else if (numberresult[1] != 'x') { 2341 memmove(numberresult + 2, numberresult, 2342 strlen(numberresult) + 1); 2343 numberresult[0] = '0'; 2344 numberresult[1] = 'x'; 2345 numprinted += 2; 2346 } 2347 n += numprinted; 2348 numberresult += (numprinted + 1); 2349 assert(*(numberresult - 1) == '\0'); 2350 assert(*(numberresult - 2) != '\0'); 2351 assert(numprinted >= 0); 2352 assert(numberresult <= numberresults + numbersize); 2353 break; 2354 case 's': 2355 { 2356 /* UTF-8 */ 2357 const char *s = va_arg(count, const char*); 2358 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace"); 2359 if (!str) 2360 goto fail; 2361 /* since PyUnicode_DecodeUTF8 returns already flexible 2362 unicode objects, there is no need to call ready on them */ 2363 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str); 2364 maxchar = Py_MAX(maxchar, argmaxchar); 2365 n += PyUnicode_GET_LENGTH(str); 2366 /* Remember the str and switch to the next slot */ 2367 *callresult++ = str; 2368 break; 2369 } 2370 case 'U': 2371 { 2372 PyObject *obj = va_arg(count, PyObject *); 2373 assert(obj && _PyUnicode_CHECK(obj)); 2374 if (PyUnicode_READY(obj) == -1) 2375 goto fail; 2376 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj); 2377 maxchar = Py_MAX(maxchar, argmaxchar); 2378 n += PyUnicode_GET_LENGTH(obj); 2379 break; 2380 } 2381 case 'V': 2382 { 2383 PyObject *obj = va_arg(count, PyObject *); 2384 const char *str = va_arg(count, const char *); 2385 PyObject *str_obj; 2386 assert(obj || str); 2387 assert(!obj || _PyUnicode_CHECK(obj)); 2388 if (obj) { 2389 if (PyUnicode_READY(obj) == -1) 2390 goto fail; 2391 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj); 2392 maxchar = Py_MAX(maxchar, argmaxchar); 2393 n += PyUnicode_GET_LENGTH(obj); 2394 *callresult++ = NULL; 2395 } 2396 else { 2397 str_obj = PyUnicode_DecodeUTF8(str, strlen(str), "replace"); 2398 if (!str_obj) 2399 goto fail; 2400 if (PyUnicode_READY(str_obj)) { 2401 Py_DECREF(str_obj); 2402 goto fail; 2403 } 2404 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str_obj); 2405 maxchar = Py_MAX(maxchar, argmaxchar); 2406 n += PyUnicode_GET_LENGTH(str_obj); 2407 *callresult++ = str_obj; 2408 } 2409 break; 2410 } 2411 case 'S': 2412 { 2413 PyObject *obj = va_arg(count, PyObject *); 2414 PyObject *str; 2415 assert(obj); 2416 str = PyObject_Str(obj); 2417 if (!str || PyUnicode_READY(str) == -1) 2418 goto fail; 2419 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str); 2420 maxchar = Py_MAX(maxchar, argmaxchar); 2421 n += PyUnicode_GET_LENGTH(str); 2422 /* Remember the str and switch to the next slot */ 2423 *callresult++ = str; 2424 break; 2425 } 2426 case 'R': 2427 { 2428 PyObject *obj = va_arg(count, PyObject *); 2429 PyObject *repr; 2430 assert(obj); 2431 repr = PyObject_Repr(obj); 2432 if (!repr || PyUnicode_READY(repr) == -1) 2433 goto fail; 2434 argmaxchar = PyUnicode_MAX_CHAR_VALUE(repr); 2435 maxchar = Py_MAX(maxchar, argmaxchar); 2436 n += PyUnicode_GET_LENGTH(repr); 2437 /* Remember the repr and switch to the next slot */ 2438 *callresult++ = repr; 2439 break; 2440 } 2441 case 'A': 2442 { 2443 PyObject *obj = va_arg(count, PyObject *); 2444 PyObject *ascii; 2445 assert(obj); 2446 ascii = PyObject_ASCII(obj); 2447 if (!ascii || PyUnicode_READY(ascii) == -1) 2448 goto fail; 2449 argmaxchar = PyUnicode_MAX_CHAR_VALUE(ascii); 2450 maxchar = Py_MAX(maxchar, argmaxchar); 2451 n += PyUnicode_GET_LENGTH(ascii); 2452 /* Remember the repr and switch to the next slot */ 2453 *callresult++ = ascii; 2454 break; 2455 } 2456 default: 2457 /* if we stumble upon an unknown 2458 formatting code, copy the rest of 2459 the format string to the output 2460 string. (we cannot just skip the 2461 code, since there's no way to know 2462 what's in the argument list) */ 2463 n += strlen(p); 2464 goto expand; 2465 } 2466 } else 2467 n++; 2468 } 2469 expand: 2470 /* step 4: fill the buffer */ 2471 /* Since we've analyzed how much space we need, 2472 we don't have to resize the string. 2473 There can be no errors beyond this point. */ 2474 string = PyUnicode_New(n, maxchar); 2475 if (!string) 2476 goto fail; 2477 kind = PyUnicode_KIND(string); 2478 data = PyUnicode_DATA(string); 2479 callresult = callresults; 2480 numberresult = numberresults; 2481 2482 for (i = 0, f = format; *f; f++) { 2483 if (*f == '%') { 2484 const char* p; 2485 2486 p = f; 2487 f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL); 2488 /* checking for == because the last argument could be a empty 2489 string, which causes i to point to end, the assert at the end of 2490 the loop */ 2491 assert(i <= PyUnicode_GET_LENGTH(string)); 2492 2493 switch (*f) { 2494 case 'c': 2495 { 2496 const int ordinal = va_arg(vargs, int); 2497 PyUnicode_WRITE(kind, data, i++, ordinal); 2498 break; 2499 } 2500 case 'i': 2501 case 'd': 2502 case 'u': 2503 case 'x': 2504 case 'p': 2505 /* unused, since we already have the result */ 2506 if (*f == 'p') 2507 (void) va_arg(vargs, void *); 2508 else 2509 (void) va_arg(vargs, int); 2510 /* extract the result from numberresults and append. */ 2511 for (; *numberresult; ++i, ++numberresult) 2512 PyUnicode_WRITE(kind, data, i, *numberresult); 2513 /* skip over the separating '\0' */ 2514 assert(*numberresult == '\0'); 2515 numberresult++; 2516 assert(numberresult <= numberresults + numbersize); 2517 break; 2518 case 's': 2519 { 2520 /* unused, since we already have the result */ 2521 Py_ssize_t size; 2522 (void) va_arg(vargs, char *); 2523 size = PyUnicode_GET_LENGTH(*callresult); 2524 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string)); 2525 copy_characters(string, i, *callresult, 0, size); 2526 i += size; 2527 /* We're done with the unicode()/repr() => forget it */ 2528 Py_DECREF(*callresult); 2529 /* switch to next unicode()/repr() result */ 2530 ++callresult; 2531 break; 2532 } 2533 case 'U': 2534 { 2535 PyObject *obj = va_arg(vargs, PyObject *); 2536 Py_ssize_t size; 2537 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string)); 2538 size = PyUnicode_GET_LENGTH(obj); 2539 copy_characters(string, i, obj, 0, size); 2540 i += size; 2541 break; 2542 } 2543 case 'V': 2544 { 2545 Py_ssize_t size; 2546 PyObject *obj = va_arg(vargs, PyObject *); 2547 va_arg(vargs, const char *); 2548 if (obj) { 2549 size = PyUnicode_GET_LENGTH(obj); 2550 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string)); 2551 copy_characters(string, i, obj, 0, size); 2552 i += size; 2553 } else { 2554 size = PyUnicode_GET_LENGTH(*callresult); 2555 assert(PyUnicode_KIND(*callresult) <= 2556 PyUnicode_KIND(string)); 2557 copy_characters(string, i, *callresult, 0, size); 2558 i += size; 2559 Py_DECREF(*callresult); 2560 } 2561 ++callresult; 2562 break; 2563 } 2564 case 'S': 2565 case 'R': 2566 case 'A': 2567 { 2568 Py_ssize_t size = PyUnicode_GET_LENGTH(*callresult); 2569 /* unused, since we already have the result */ 2570 (void) va_arg(vargs, PyObject *); 2571 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string)); 2572 copy_characters(string, i, *callresult, 0, size); 2573 i += size; 2574 /* We're done with the unicode()/repr() => forget it */ 2575 Py_DECREF(*callresult); 2576 /* switch to next unicode()/repr() result */ 2577 ++callresult; 2578 break; 2579 } 2580 case '%': 2581 PyUnicode_WRITE(kind, data, i++, '%'); 2582 break; 2583 default: 2584 for (; *p; ++p, ++i) 2585 PyUnicode_WRITE(kind, data, i, *p); 2586 assert(i == PyUnicode_GET_LENGTH(string)); 2587 goto end; 2588 } 2589 } 2590 else { 2591 assert(i < PyUnicode_GET_LENGTH(string)); 2592 PyUnicode_WRITE(kind, data, i++, *f); 2593 } 2594 } 2595 assert(i == PyUnicode_GET_LENGTH(string)); 2596 2597 end: 2598 if (callresults) 2599 PyObject_Free(callresults); 2600 if (numberresults) 2601 PyObject_Free(numberresults); 2602 assert(_PyUnicode_CheckConsistency(string, 1)); 2603 return string; 2604 fail: 2605 if (callresults) { 2606 PyObject **callresult2 = callresults; 2607 while (callresult2 < callresult) { 2608 Py_XDECREF(*callresult2); 2609 ++callresult2; 2610 } 2611 PyObject_Free(callresults); 2612 } 2613 if (numberresults) 2614 PyObject_Free(numberresults); 2615 return NULL; 2616} 2617 2618PyObject * 2619PyUnicode_FromFormat(const char *format, ...) 2620{ 2621 PyObject* ret; 2622 va_list vargs; 2623 2624#ifdef HAVE_STDARG_PROTOTYPES 2625 va_start(vargs, format); 2626#else 2627 va_start(vargs); 2628#endif 2629 ret = PyUnicode_FromFormatV(format, vargs); 2630 va_end(vargs); 2631 return ret; 2632} 2633 2634#ifdef HAVE_WCHAR_H 2635 2636/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString(): 2637 convert a Unicode object to a wide character string. 2638 2639 - If w is NULL: return the number of wide characters (including the null 2640 character) required to convert the unicode object. Ignore size argument. 2641 2642 - Otherwise: return the number of wide characters (excluding the null 2643 character) written into w. Write at most size wide characters (including 2644 the null character). */ 2645static Py_ssize_t 2646unicode_aswidechar(PyObject *unicode, 2647 wchar_t *w, 2648 Py_ssize_t size) 2649{ 2650 Py_ssize_t res; 2651 const wchar_t *wstr; 2652 2653 wstr = PyUnicode_AsUnicodeAndSize(unicode, &res); 2654 if (wstr == NULL) 2655 return -1; 2656 2657 if (w != NULL) { 2658 if (size > res) 2659 size = res + 1; 2660 else 2661 res = size; 2662 Py_MEMCPY(w, wstr, size * sizeof(wchar_t)); 2663 return res; 2664 } 2665 else 2666 return res + 1; 2667} 2668 2669Py_ssize_t 2670PyUnicode_AsWideChar(PyObject *unicode, 2671 wchar_t *w, 2672 Py_ssize_t size) 2673{ 2674 if (unicode == NULL) { 2675 PyErr_BadInternalCall(); 2676 return -1; 2677 } 2678 return unicode_aswidechar(unicode, w, size); 2679} 2680 2681wchar_t* 2682PyUnicode_AsWideCharString(PyObject *unicode, 2683 Py_ssize_t *size) 2684{ 2685 wchar_t* buffer; 2686 Py_ssize_t buflen; 2687 2688 if (unicode == NULL) { 2689 PyErr_BadInternalCall(); 2690 return NULL; 2691 } 2692 2693 buflen = unicode_aswidechar(unicode, NULL, 0); 2694 if (buflen == -1) 2695 return NULL; 2696 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) { 2697 PyErr_NoMemory(); 2698 return NULL; 2699 } 2700 2701 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t)); 2702 if (buffer == NULL) { 2703 PyErr_NoMemory(); 2704 return NULL; 2705 } 2706 buflen = unicode_aswidechar(unicode, buffer, buflen); 2707 if (buflen == -1) 2708 return NULL; 2709 if (size != NULL) 2710 *size = buflen; 2711 return buffer; 2712} 2713 2714#endif /* HAVE_WCHAR_H */ 2715 2716PyObject * 2717PyUnicode_FromOrdinal(int ordinal) 2718{ 2719 PyObject *v; 2720 if (ordinal < 0 || ordinal > 0x10ffff) { 2721 PyErr_SetString(PyExc_ValueError, 2722 "chr() arg not in range(0x110000)"); 2723 return NULL; 2724 } 2725 2726 if (ordinal < 256) 2727 return get_latin1_char(ordinal); 2728 2729 v = PyUnicode_New(1, ordinal); 2730 if (v == NULL) 2731 return NULL; 2732 PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal); 2733 assert(_PyUnicode_CheckConsistency(v, 1)); 2734 return v; 2735} 2736 2737PyObject * 2738PyUnicode_FromObject(register PyObject *obj) 2739{ 2740 /* XXX Perhaps we should make this API an alias of 2741 PyObject_Str() instead ?! */ 2742 if (PyUnicode_CheckExact(obj)) { 2743 if (PyUnicode_READY(obj)) 2744 return NULL; 2745 Py_INCREF(obj); 2746 return obj; 2747 } 2748 if (PyUnicode_Check(obj)) { 2749 /* For a Unicode subtype that's not a Unicode object, 2750 return a true Unicode object with the same data. */ 2751 return PyUnicode_Copy(obj); 2752 } 2753 PyErr_Format(PyExc_TypeError, 2754 "Can't convert '%.100s' object to str implicitly", 2755 Py_TYPE(obj)->tp_name); 2756 return NULL; 2757} 2758 2759PyObject * 2760PyUnicode_FromEncodedObject(register PyObject *obj, 2761 const char *encoding, 2762 const char *errors) 2763{ 2764 Py_buffer buffer; 2765 PyObject *v; 2766 2767 if (obj == NULL) { 2768 PyErr_BadInternalCall(); 2769 return NULL; 2770 } 2771 2772 /* Decoding bytes objects is the most common case and should be fast */ 2773 if (PyBytes_Check(obj)) { 2774 if (PyBytes_GET_SIZE(obj) == 0) { 2775 Py_INCREF(unicode_empty); 2776 v = unicode_empty; 2777 } 2778 else { 2779 v = PyUnicode_Decode( 2780 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj), 2781 encoding, errors); 2782 } 2783 return v; 2784 } 2785 2786 if (PyUnicode_Check(obj)) { 2787 PyErr_SetString(PyExc_TypeError, 2788 "decoding str is not supported"); 2789 return NULL; 2790 } 2791 2792 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */ 2793 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) { 2794 PyErr_Format(PyExc_TypeError, 2795 "coercing to str: need bytes, bytearray " 2796 "or buffer-like object, %.80s found", 2797 Py_TYPE(obj)->tp_name); 2798 return NULL; 2799 } 2800 2801 if (buffer.len == 0) { 2802 Py_INCREF(unicode_empty); 2803 v = unicode_empty; 2804 } 2805 else 2806 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors); 2807 2808 PyBuffer_Release(&buffer); 2809 return v; 2810} 2811 2812/* Convert encoding to lower case and replace '_' with '-' in order to 2813 catch e.g. UTF_8. Return 0 on error (encoding is longer than lower_len-1), 2814 1 on success. */ 2815static int 2816normalize_encoding(const char *encoding, 2817 char *lower, 2818 size_t lower_len) 2819{ 2820 const char *e; 2821 char *l; 2822 char *l_end; 2823 2824 if (encoding == NULL) { 2825 strcpy(lower, "utf-8"); 2826 return 1; 2827 } 2828 e = encoding; 2829 l = lower; 2830 l_end = &lower[lower_len - 1]; 2831 while (*e) { 2832 if (l == l_end) 2833 return 0; 2834 if (Py_ISUPPER(*e)) { 2835 *l++ = Py_TOLOWER(*e++); 2836 } 2837 else if (*e == '_') { 2838 *l++ = '-'; 2839 e++; 2840 } 2841 else { 2842 *l++ = *e++; 2843 } 2844 } 2845 *l = '\0'; 2846 return 1; 2847} 2848 2849PyObject * 2850PyUnicode_Decode(const char *s, 2851 Py_ssize_t size, 2852 const char *encoding, 2853 const char *errors) 2854{ 2855 PyObject *buffer = NULL, *unicode; 2856 Py_buffer info; 2857 char lower[11]; /* Enough for any encoding shortcut */ 2858 2859 /* Shortcuts for common default encodings */ 2860 if (normalize_encoding(encoding, lower, sizeof(lower))) { 2861 if ((strcmp(lower, "utf-8") == 0) || 2862 (strcmp(lower, "utf8") == 0)) 2863 return PyUnicode_DecodeUTF8(s, size, errors); 2864 else if ((strcmp(lower, "latin-1") == 0) || 2865 (strcmp(lower, "latin1") == 0) || 2866 (strcmp(lower, "iso-8859-1") == 0)) 2867 return PyUnicode_DecodeLatin1(s, size, errors); 2868#ifdef HAVE_MBCS 2869 else if (strcmp(lower, "mbcs") == 0) 2870 return PyUnicode_DecodeMBCS(s, size, errors); 2871#endif 2872 else if (strcmp(lower, "ascii") == 0) 2873 return PyUnicode_DecodeASCII(s, size, errors); 2874 else if (strcmp(lower, "utf-16") == 0) 2875 return PyUnicode_DecodeUTF16(s, size, errors, 0); 2876 else if (strcmp(lower, "utf-32") == 0) 2877 return PyUnicode_DecodeUTF32(s, size, errors, 0); 2878 } 2879 2880 /* Decode via the codec registry */ 2881 buffer = NULL; 2882 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0) 2883 goto onError; 2884 buffer = PyMemoryView_FromBuffer(&info); 2885 if (buffer == NULL) 2886 goto onError; 2887 unicode = PyCodec_Decode(buffer, encoding, errors); 2888 if (unicode == NULL) 2889 goto onError; 2890 if (!PyUnicode_Check(unicode)) { 2891 PyErr_Format(PyExc_TypeError, 2892 "decoder did not return a str object (type=%.400s)", 2893 Py_TYPE(unicode)->tp_name); 2894 Py_DECREF(unicode); 2895 goto onError; 2896 } 2897 Py_DECREF(buffer); 2898#ifndef DONT_MAKE_RESULT_READY 2899 if (_PyUnicode_READY_REPLACE(&unicode)) { 2900 Py_DECREF(unicode); 2901 return NULL; 2902 } 2903#endif 2904 assert(_PyUnicode_CheckConsistency(unicode, 1)); 2905 return unicode; 2906 2907 onError: 2908 Py_XDECREF(buffer); 2909 return NULL; 2910} 2911 2912PyObject * 2913PyUnicode_AsDecodedObject(PyObject *unicode, 2914 const char *encoding, 2915 const char *errors) 2916{ 2917 PyObject *v; 2918 2919 if (!PyUnicode_Check(unicode)) { 2920 PyErr_BadArgument(); 2921 goto onError; 2922 } 2923 2924 if (encoding == NULL) 2925 encoding = PyUnicode_GetDefaultEncoding(); 2926 2927 /* Decode via the codec registry */ 2928 v = PyCodec_Decode(unicode, encoding, errors); 2929 if (v == NULL) 2930 goto onError; 2931 assert(_PyUnicode_CheckConsistency(v, 1)); 2932 return v; 2933 2934 onError: 2935 return NULL; 2936} 2937 2938PyObject * 2939PyUnicode_AsDecodedUnicode(PyObject *unicode, 2940 const char *encoding, 2941 const char *errors) 2942{ 2943 PyObject *v; 2944 2945 if (!PyUnicode_Check(unicode)) { 2946 PyErr_BadArgument(); 2947 goto onError; 2948 } 2949 2950 if (encoding == NULL) 2951 encoding = PyUnicode_GetDefaultEncoding(); 2952 2953 /* Decode via the codec registry */ 2954 v = PyCodec_Decode(unicode, encoding, errors); 2955 if (v == NULL) 2956 goto onError; 2957 if (!PyUnicode_Check(v)) { 2958 PyErr_Format(PyExc_TypeError, 2959 "decoder did not return a str object (type=%.400s)", 2960 Py_TYPE(v)->tp_name); 2961 Py_DECREF(v); 2962 goto onError; 2963 } 2964 assert(_PyUnicode_CheckConsistency(v, 1)); 2965 return v; 2966 2967 onError: 2968 return NULL; 2969} 2970 2971PyObject * 2972PyUnicode_Encode(const Py_UNICODE *s, 2973 Py_ssize_t size, 2974 const char *encoding, 2975 const char *errors) 2976{ 2977 PyObject *v, *unicode; 2978 2979 unicode = PyUnicode_FromUnicode(s, size); 2980 if (unicode == NULL) 2981 return NULL; 2982 v = PyUnicode_AsEncodedString(unicode, encoding, errors); 2983 Py_DECREF(unicode); 2984 return v; 2985} 2986 2987PyObject * 2988PyUnicode_AsEncodedObject(PyObject *unicode, 2989 const char *encoding, 2990 const char *errors) 2991{ 2992 PyObject *v; 2993 2994 if (!PyUnicode_Check(unicode)) { 2995 PyErr_BadArgument(); 2996 goto onError; 2997 } 2998 2999 if (encoding == NULL) 3000 encoding = PyUnicode_GetDefaultEncoding(); 3001 3002 /* Encode via the codec registry */ 3003 v = PyCodec_Encode(unicode, encoding, errors); 3004 if (v == NULL) 3005 goto onError; 3006 return v; 3007 3008 onError: 3009 return NULL; 3010} 3011 3012PyObject * 3013PyUnicode_EncodeFSDefault(PyObject *unicode) 3014{ 3015#ifdef HAVE_MBCS 3016 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode), 3017 PyUnicode_GET_SIZE(unicode), 3018 NULL); 3019#elif defined(__APPLE__) 3020 return _PyUnicode_AsUTF8String(unicode, "surrogateescape"); 3021#else 3022 PyInterpreterState *interp = PyThreadState_GET()->interp; 3023 /* Bootstrap check: if the filesystem codec is implemented in Python, we 3024 cannot use it to encode and decode filenames before it is loaded. Load 3025 the Python codec requires to encode at least its own filename. Use the C 3026 version of the locale codec until the codec registry is initialized and 3027 the Python codec is loaded. 3028 3029 Py_FileSystemDefaultEncoding is shared between all interpreters, we 3030 cannot only rely on it: check also interp->fscodec_initialized for 3031 subinterpreters. */ 3032 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) { 3033 return PyUnicode_AsEncodedString(unicode, 3034 Py_FileSystemDefaultEncoding, 3035 "surrogateescape"); 3036 } 3037 else { 3038 /* locale encoding with surrogateescape */ 3039 wchar_t *wchar; 3040 char *bytes; 3041 PyObject *bytes_obj; 3042 size_t error_pos; 3043 3044 wchar = PyUnicode_AsWideCharString(unicode, NULL); 3045 if (wchar == NULL) 3046 return NULL; 3047 bytes = _Py_wchar2char(wchar, &error_pos); 3048 if (bytes == NULL) { 3049 if (error_pos != (size_t)-1) { 3050 char *errmsg = strerror(errno); 3051 PyObject *exc = NULL; 3052 if (errmsg == NULL) 3053 errmsg = "Py_wchar2char() failed"; 3054 raise_encode_exception(&exc, 3055 "filesystemencoding", unicode, 3056 error_pos, error_pos+1, 3057 errmsg); 3058 Py_XDECREF(exc); 3059 } 3060 else 3061 PyErr_NoMemory(); 3062 PyMem_Free(wchar); 3063 return NULL; 3064 } 3065 PyMem_Free(wchar); 3066 3067 bytes_obj = PyBytes_FromString(bytes); 3068 PyMem_Free(bytes); 3069 return bytes_obj; 3070 } 3071#endif 3072} 3073 3074PyObject * 3075PyUnicode_AsEncodedString(PyObject *unicode, 3076 const char *encoding, 3077 const char *errors) 3078{ 3079 PyObject *v; 3080 char lower[11]; /* Enough for any encoding shortcut */ 3081 3082 if (!PyUnicode_Check(unicode)) { 3083 PyErr_BadArgument(); 3084 return NULL; 3085 } 3086 3087 /* Shortcuts for common default encodings */ 3088 if (normalize_encoding(encoding, lower, sizeof(lower))) { 3089 if ((strcmp(lower, "utf-8") == 0) || 3090 (strcmp(lower, "utf8") == 0)) 3091 { 3092 if (errors == NULL || strcmp(errors, "strict") == 0) 3093 return _PyUnicode_AsUTF8String(unicode, NULL); 3094 else 3095 return _PyUnicode_AsUTF8String(unicode, errors); 3096 } 3097 else if ((strcmp(lower, "latin-1") == 0) || 3098 (strcmp(lower, "latin1") == 0) || 3099 (strcmp(lower, "iso-8859-1") == 0)) 3100 return _PyUnicode_AsLatin1String(unicode, errors); 3101#ifdef HAVE_MBCS 3102 else if (strcmp(lower, "mbcs") == 0) 3103 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode), 3104 PyUnicode_GET_SIZE(unicode), 3105 errors); 3106#endif 3107 else if (strcmp(lower, "ascii") == 0) 3108 return _PyUnicode_AsASCIIString(unicode, errors); 3109 } 3110 3111 /* Encode via the codec registry */ 3112 v = PyCodec_Encode(unicode, encoding, errors); 3113 if (v == NULL) 3114 return NULL; 3115 3116 /* The normal path */ 3117 if (PyBytes_Check(v)) 3118 return v; 3119 3120 /* If the codec returns a buffer, raise a warning and convert to bytes */ 3121 if (PyByteArray_Check(v)) { 3122 int error; 3123 PyObject *b; 3124 3125 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1, 3126 "encoder %s returned bytearray instead of bytes", 3127 encoding); 3128 if (error) { 3129 Py_DECREF(v); 3130 return NULL; 3131 } 3132 3133 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v)); 3134 Py_DECREF(v); 3135 return b; 3136 } 3137 3138 PyErr_Format(PyExc_TypeError, 3139 "encoder did not return a bytes object (type=%.400s)", 3140 Py_TYPE(v)->tp_name); 3141 Py_DECREF(v); 3142 return NULL; 3143} 3144 3145PyObject * 3146PyUnicode_AsEncodedUnicode(PyObject *unicode, 3147 const char *encoding, 3148 const char *errors) 3149{ 3150 PyObject *v; 3151 3152 if (!PyUnicode_Check(unicode)) { 3153 PyErr_BadArgument(); 3154 goto onError; 3155 } 3156 3157 if (encoding == NULL) 3158 encoding = PyUnicode_GetDefaultEncoding(); 3159 3160 /* Encode via the codec registry */ 3161 v = PyCodec_Encode(unicode, encoding, errors); 3162 if (v == NULL) 3163 goto onError; 3164 if (!PyUnicode_Check(v)) { 3165 PyErr_Format(PyExc_TypeError, 3166 "encoder did not return an str object (type=%.400s)", 3167 Py_TYPE(v)->tp_name); 3168 Py_DECREF(v); 3169 goto onError; 3170 } 3171 return v; 3172 3173 onError: 3174 return NULL; 3175} 3176 3177PyObject* 3178PyUnicode_DecodeFSDefault(const char *s) { 3179 Py_ssize_t size = (Py_ssize_t)strlen(s); 3180 return PyUnicode_DecodeFSDefaultAndSize(s, size); 3181} 3182 3183PyObject* 3184PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size) 3185{ 3186#ifdef HAVE_MBCS 3187 return PyUnicode_DecodeMBCS(s, size, NULL); 3188#elif defined(__APPLE__) 3189 return PyUnicode_DecodeUTF8(s, size, "surrogateescape"); 3190#else 3191 PyInterpreterState *interp = PyThreadState_GET()->interp; 3192 /* Bootstrap check: if the filesystem codec is implemented in Python, we 3193 cannot use it to encode and decode filenames before it is loaded. Load 3194 the Python codec requires to encode at least its own filename. Use the C 3195 version of the locale codec until the codec registry is initialized and 3196 the Python codec is loaded. 3197 3198 Py_FileSystemDefaultEncoding is shared between all interpreters, we 3199 cannot only rely on it: check also interp->fscodec_initialized for 3200 subinterpreters. */ 3201 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) { 3202 return PyUnicode_Decode(s, size, 3203 Py_FileSystemDefaultEncoding, 3204 "surrogateescape"); 3205 } 3206 else { 3207 /* locale encoding with surrogateescape */ 3208 wchar_t *wchar; 3209 PyObject *unicode; 3210 size_t len; 3211 3212 if (s[size] != '\0' || size != strlen(s)) { 3213 PyErr_SetString(PyExc_TypeError, "embedded NUL character"); 3214 return NULL; 3215 } 3216 3217 wchar = _Py_char2wchar(s, &len); 3218 if (wchar == NULL) 3219 return PyErr_NoMemory(); 3220 3221 unicode = PyUnicode_FromWideChar(wchar, len); 3222 PyMem_Free(wchar); 3223 return unicode; 3224 } 3225#endif 3226} 3227 3228 3229int 3230PyUnicode_FSConverter(PyObject* arg, void* addr) 3231{ 3232 PyObject *output = NULL; 3233 Py_ssize_t size; 3234 void *data; 3235 if (arg == NULL) { 3236 Py_DECREF(*(PyObject**)addr); 3237 return 1; 3238 } 3239 if (PyBytes_Check(arg)) { 3240 output = arg; 3241 Py_INCREF(output); 3242 } 3243 else { 3244 arg = PyUnicode_FromObject(arg); 3245 if (!arg) 3246 return 0; 3247 output = PyUnicode_EncodeFSDefault(arg); 3248 Py_DECREF(arg); 3249 if (!output) 3250 return 0; 3251 if (!PyBytes_Check(output)) { 3252 Py_DECREF(output); 3253 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes"); 3254 return 0; 3255 } 3256 } 3257 size = PyBytes_GET_SIZE(output); 3258 data = PyBytes_AS_STRING(output); 3259 if (size != strlen(data)) { 3260 PyErr_SetString(PyExc_TypeError, "embedded NUL character"); 3261 Py_DECREF(output); 3262 return 0; 3263 } 3264 *(PyObject**)addr = output; 3265 return Py_CLEANUP_SUPPORTED; 3266} 3267 3268 3269int 3270PyUnicode_FSDecoder(PyObject* arg, void* addr) 3271{ 3272 PyObject *output = NULL; 3273 if (arg == NULL) { 3274 Py_DECREF(*(PyObject**)addr); 3275 return 1; 3276 } 3277 if (PyUnicode_Check(arg)) { 3278 if (PyUnicode_READY(arg)) 3279 return 0; 3280 output = arg; 3281 Py_INCREF(output); 3282 } 3283 else { 3284 arg = PyBytes_FromObject(arg); 3285 if (!arg) 3286 return 0; 3287 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg), 3288 PyBytes_GET_SIZE(arg)); 3289 Py_DECREF(arg); 3290 if (!output) 3291 return 0; 3292 if (!PyUnicode_Check(output)) { 3293 Py_DECREF(output); 3294 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode"); 3295 return 0; 3296 } 3297 } 3298 if (PyUnicode_READY(output) < 0) { 3299 Py_DECREF(output); 3300 return 0; 3301 } 3302 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output), 3303 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) { 3304 PyErr_SetString(PyExc_TypeError, "embedded NUL character"); 3305 Py_DECREF(output); 3306 return 0; 3307 } 3308 *(PyObject**)addr = output; 3309 return Py_CLEANUP_SUPPORTED; 3310} 3311 3312 3313char* 3314PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize) 3315{ 3316 PyObject *bytes; 3317 3318 if (!PyUnicode_Check(unicode)) { 3319 PyErr_BadArgument(); 3320 return NULL; 3321 } 3322 if (PyUnicode_READY(unicode) == -1) 3323 return NULL; 3324 3325 if (PyUnicode_UTF8(unicode) == NULL) { 3326 assert(!PyUnicode_IS_COMPACT_ASCII(unicode)); 3327 bytes = _PyUnicode_AsUTF8String(unicode, "strict"); 3328 if (bytes == NULL) 3329 return NULL; 3330 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1); 3331 if (_PyUnicode_UTF8(unicode) == NULL) { 3332 Py_DECREF(bytes); 3333 return NULL; 3334 } 3335 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes); 3336 Py_MEMCPY(_PyUnicode_UTF8(unicode), 3337 PyBytes_AS_STRING(bytes), 3338 _PyUnicode_UTF8_LENGTH(unicode) + 1); 3339 Py_DECREF(bytes); 3340 } 3341 3342 if (psize) 3343 *psize = PyUnicode_UTF8_LENGTH(unicode); 3344 return PyUnicode_UTF8(unicode); 3345} 3346 3347char* 3348PyUnicode_AsUTF8(PyObject *unicode) 3349{ 3350 return PyUnicode_AsUTF8AndSize(unicode, NULL); 3351} 3352 3353#ifdef Py_DEBUG 3354static int unicode_as_unicode_calls = 0; 3355#endif 3356 3357 3358Py_UNICODE * 3359PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size) 3360{ 3361 const unsigned char *one_byte; 3362#if SIZEOF_WCHAR_T == 4 3363 const Py_UCS2 *two_bytes; 3364#else 3365 const Py_UCS4 *four_bytes; 3366 const Py_UCS4 *ucs4_end; 3367 Py_ssize_t num_surrogates; 3368#endif 3369 wchar_t *w; 3370 wchar_t *wchar_end; 3371 3372 if (!PyUnicode_Check(unicode)) { 3373 PyErr_BadArgument(); 3374 return NULL; 3375 } 3376 if (_PyUnicode_WSTR(unicode) == NULL) { 3377 /* Non-ASCII compact unicode object */ 3378 assert(_PyUnicode_KIND(unicode) != 0); 3379 assert(PyUnicode_IS_READY(unicode)); 3380 3381#ifdef Py_DEBUG 3382 ++unicode_as_unicode_calls; 3383#endif 3384 3385 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) { 3386#if SIZEOF_WCHAR_T == 2 3387 four_bytes = PyUnicode_4BYTE_DATA(unicode); 3388 ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode); 3389 num_surrogates = 0; 3390 3391 for (; four_bytes < ucs4_end; ++four_bytes) { 3392 if (*four_bytes > 0xFFFF) 3393 ++num_surrogates; 3394 } 3395 3396 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC( 3397 sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates)); 3398 if (!_PyUnicode_WSTR(unicode)) { 3399 PyErr_NoMemory(); 3400 return NULL; 3401 } 3402 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates; 3403 3404 w = _PyUnicode_WSTR(unicode); 3405 wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode); 3406 four_bytes = PyUnicode_4BYTE_DATA(unicode); 3407 for (; four_bytes < ucs4_end; ++four_bytes, ++w) { 3408 if (*four_bytes > 0xFFFF) { 3409 /* encode surrogate pair in this case */ 3410 *w++ = 0xD800 | ((*four_bytes - 0x10000) >> 10); 3411 *w = 0xDC00 | ((*four_bytes - 0x10000) & 0x3FF); 3412 } 3413 else 3414 *w = *four_bytes; 3415 3416 if (w > wchar_end) { 3417 assert(0 && "Miscalculated string end"); 3418 } 3419 } 3420 *w = 0; 3421#else 3422 /* sizeof(wchar_t) == 4 */ 3423 Py_FatalError("Impossible unicode object state, wstr and str " 3424 "should share memory already."); 3425 return NULL; 3426#endif 3427 } 3428 else { 3429 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) * 3430 (_PyUnicode_LENGTH(unicode) + 1)); 3431 if (!_PyUnicode_WSTR(unicode)) { 3432 PyErr_NoMemory(); 3433 return NULL; 3434 } 3435 if (!PyUnicode_IS_COMPACT_ASCII(unicode)) 3436 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode); 3437 w = _PyUnicode_WSTR(unicode); 3438 wchar_end = w + _PyUnicode_LENGTH(unicode); 3439 3440 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) { 3441 one_byte = PyUnicode_1BYTE_DATA(unicode); 3442 for (; w < wchar_end; ++one_byte, ++w) 3443 *w = *one_byte; 3444 /* null-terminate the wstr */ 3445 *w = 0; 3446 } 3447 else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) { 3448#if SIZEOF_WCHAR_T == 4 3449 two_bytes = PyUnicode_2BYTE_DATA(unicode); 3450 for (; w < wchar_end; ++two_bytes, ++w) 3451 *w = *two_bytes; 3452 /* null-terminate the wstr */ 3453 *w = 0; 3454#else 3455 /* sizeof(wchar_t) == 2 */ 3456 PyObject_FREE(_PyUnicode_WSTR(unicode)); 3457 _PyUnicode_WSTR(unicode) = NULL; 3458 Py_FatalError("Impossible unicode object state, wstr " 3459 "and str should share memory already."); 3460 return NULL; 3461#endif 3462 } 3463 else { 3464 assert(0 && "This should never happen."); 3465 } 3466 } 3467 } 3468 if (size != NULL) 3469 *size = PyUnicode_WSTR_LENGTH(unicode); 3470 return _PyUnicode_WSTR(unicode); 3471} 3472 3473Py_UNICODE * 3474PyUnicode_AsUnicode(PyObject *unicode) 3475{ 3476 return PyUnicode_AsUnicodeAndSize(unicode, NULL); 3477} 3478 3479 3480Py_ssize_t 3481PyUnicode_GetSize(PyObject *unicode) 3482{ 3483 if (!PyUnicode_Check(unicode)) { 3484 PyErr_BadArgument(); 3485 goto onError; 3486 } 3487 return PyUnicode_GET_SIZE(unicode); 3488 3489 onError: 3490 return -1; 3491} 3492 3493Py_ssize_t 3494PyUnicode_GetLength(PyObject *unicode) 3495{ 3496 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) { 3497 PyErr_BadArgument(); 3498 return -1; 3499 } 3500 3501 return PyUnicode_GET_LENGTH(unicode); 3502} 3503 3504Py_UCS4 3505PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index) 3506{ 3507 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) { 3508 PyErr_BadArgument(); 3509 return (Py_UCS4)-1; 3510 } 3511 if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) { 3512 PyErr_SetString(PyExc_IndexError, "string index out of range"); 3513 return (Py_UCS4)-1; 3514 } 3515 return PyUnicode_READ_CHAR(unicode, index); 3516} 3517 3518int 3519PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch) 3520{ 3521 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) { 3522 PyErr_BadArgument(); 3523 return -1; 3524 } 3525 if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) { 3526 PyErr_SetString(PyExc_IndexError, "string index out of range"); 3527 return -1; 3528 } 3529 if (_PyUnicode_Dirty(unicode)) 3530 return -1; 3531 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode), 3532 index, ch); 3533 return 0; 3534} 3535 3536const char * 3537PyUnicode_GetDefaultEncoding(void) 3538{ 3539 return "utf-8"; 3540} 3541 3542/* create or adjust a UnicodeDecodeError */ 3543static void 3544make_decode_exception(PyObject **exceptionObject, 3545 const char *encoding, 3546 const char *input, Py_ssize_t length, 3547 Py_ssize_t startpos, Py_ssize_t endpos, 3548 const char *reason) 3549{ 3550 if (*exceptionObject == NULL) { 3551 *exceptionObject = PyUnicodeDecodeError_Create( 3552 encoding, input, length, startpos, endpos, reason); 3553 } 3554 else { 3555 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos)) 3556 goto onError; 3557 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos)) 3558 goto onError; 3559 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason)) 3560 goto onError; 3561 } 3562 return; 3563 3564onError: 3565 Py_DECREF(*exceptionObject); 3566 *exceptionObject = NULL; 3567} 3568 3569/* error handling callback helper: 3570 build arguments, call the callback and check the arguments, 3571 if no exception occurred, copy the replacement to the output 3572 and adjust various state variables. 3573 return 0 on success, -1 on error 3574*/ 3575 3576static int 3577unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler, 3578 const char *encoding, const char *reason, 3579 const char **input, const char **inend, Py_ssize_t *startinpos, 3580 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr, 3581 PyObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr) 3582{ 3583 static char *argparse = "O!n;decoding error handler must return (str, int) tuple"; 3584 3585 PyObject *restuple = NULL; 3586 PyObject *repunicode = NULL; 3587 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output); 3588 Py_ssize_t insize; 3589 Py_ssize_t requiredsize; 3590 Py_ssize_t newpos; 3591 const Py_UNICODE *repptr; 3592 PyObject *inputobj = NULL; 3593 Py_ssize_t repsize; 3594 int res = -1; 3595 3596 if (*errorHandler == NULL) { 3597 *errorHandler = PyCodec_LookupError(errors); 3598 if (*errorHandler == NULL) 3599 goto onError; 3600 } 3601 3602 make_decode_exception(exceptionObject, 3603 encoding, 3604 *input, *inend - *input, 3605 *startinpos, *endinpos, 3606 reason); 3607 if (*exceptionObject == NULL) 3608 goto onError; 3609 3610 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL); 3611 if (restuple == NULL) 3612 goto onError; 3613 if (!PyTuple_Check(restuple)) { 3614 PyErr_SetString(PyExc_TypeError, &argparse[4]); 3615 goto onError; 3616 } 3617 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos)) 3618 goto onError; 3619 3620 /* Copy back the bytes variables, which might have been modified by the 3621 callback */ 3622 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject); 3623 if (!inputobj) 3624 goto onError; 3625 if (!PyBytes_Check(inputobj)) { 3626 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes"); 3627 } 3628 *input = PyBytes_AS_STRING(inputobj); 3629 insize = PyBytes_GET_SIZE(inputobj); 3630 *inend = *input + insize; 3631 /* we can DECREF safely, as the exception has another reference, 3632 so the object won't go away. */ 3633 Py_DECREF(inputobj); 3634 3635 if (newpos<0) 3636 newpos = insize+newpos; 3637 if (newpos<0 || newpos>insize) { 3638 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos); 3639 goto onError; 3640 } 3641 3642 /* need more space? (at least enough for what we 3643 have+the replacement+the rest of the string (starting 3644 at the new input position), so we won't have to check space 3645 when there are no errors in the rest of the string) */ 3646 repptr = PyUnicode_AS_UNICODE(repunicode); 3647 repsize = PyUnicode_GET_SIZE(repunicode); 3648 requiredsize = *outpos + repsize + insize-newpos; 3649 if (requiredsize > outsize) { 3650 if (requiredsize<2*outsize) 3651 requiredsize = 2*outsize; 3652 if (PyUnicode_Resize(output, requiredsize) < 0) 3653 goto onError; 3654 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos; 3655 } 3656 *endinpos = newpos; 3657 *inptr = *input + newpos; 3658 Py_UNICODE_COPY(*outptr, repptr, repsize); 3659 *outptr += repsize; 3660 *outpos += repsize; 3661 3662 /* we made it! */ 3663 res = 0; 3664 3665 onError: 3666 Py_XDECREF(restuple); 3667 return res; 3668} 3669 3670/* --- UTF-7 Codec -------------------------------------------------------- */ 3671 3672/* See RFC2152 for details. We encode conservatively and decode liberally. */ 3673 3674/* Three simple macros defining base-64. */ 3675 3676/* Is c a base-64 character? */ 3677 3678#define IS_BASE64(c) \ 3679 (((c) >= 'A' && (c) <= 'Z') || \ 3680 ((c) >= 'a' && (c) <= 'z') || \ 3681 ((c) >= '0' && (c) <= '9') || \ 3682 (c) == '+' || (c) == '/') 3683 3684/* given that c is a base-64 character, what is its base-64 value? */ 3685 3686#define FROM_BASE64(c) \ 3687 (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' : \ 3688 ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 : \ 3689 ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 : \ 3690 (c) == '+' ? 62 : 63) 3691 3692/* What is the base-64 character of the bottom 6 bits of n? */ 3693 3694#define TO_BASE64(n) \ 3695 ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f]) 3696 3697/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be 3698 * decoded as itself. We are permissive on decoding; the only ASCII 3699 * byte not decoding to itself is the + which begins a base64 3700 * string. */ 3701 3702#define DECODE_DIRECT(c) \ 3703 ((c) <= 127 && (c) != '+') 3704 3705/* The UTF-7 encoder treats ASCII characters differently according to 3706 * whether they are Set D, Set O, Whitespace, or special (i.e. none of 3707 * the above). See RFC2152. This array identifies these different 3708 * sets: 3709 * 0 : "Set D" 3710 * alphanumeric and '(),-./:? 3711 * 1 : "Set O" 3712 * !"#$%&*;<=>@[]^_`{|} 3713 * 2 : "whitespace" 3714 * ht nl cr sp 3715 * 3 : special (must be base64 encoded) 3716 * everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127) 3717 */ 3718 3719static 3720char utf7_category[128] = { 3721/* nul soh stx etx eot enq ack bel bs ht nl vt np cr so si */ 3722 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3, 3723/* dle dc1 dc2 dc3 dc4 nak syn etb can em sub esc fs gs rs us */ 3724 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3725/* sp ! " # $ % & ' ( ) * + , - . / */ 3726 2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 0, 3727/* 0 1 2 3 4 5 6 7 8 9 : ; < = > ? */ 3728 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 3729/* @ A B C D E F G H I J K L M N O */ 3730 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3731/* P Q R S T U V W X Y Z [ \ ] ^ _ */ 3732 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1, 3733/* ` a b c d e f g h i j k l m n o */ 3734 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3735/* p q r s t u v w x y z { | } ~ del */ 3736 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3, 3737}; 3738 3739/* ENCODE_DIRECT: this character should be encoded as itself. The 3740 * answer depends on whether we are encoding set O as itself, and also 3741 * on whether we are encoding whitespace as itself. RFC2152 makes it 3742 * clear that the answers to these questions vary between 3743 * applications, so this code needs to be flexible. */ 3744 3745#define ENCODE_DIRECT(c, directO, directWS) \ 3746 ((c) < 128 && (c) > 0 && \ 3747 ((utf7_category[(c)] == 0) || \ 3748 (directWS && (utf7_category[(c)] == 2)) || \ 3749 (directO && (utf7_category[(c)] == 1)))) 3750 3751PyObject * 3752PyUnicode_DecodeUTF7(const char *s, 3753 Py_ssize_t size, 3754 const char *errors) 3755{ 3756 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL); 3757} 3758 3759/* The decoder. The only state we preserve is our read position, 3760 * i.e. how many characters we have consumed. So if we end in the 3761 * middle of a shift sequence we have to back off the read position 3762 * and the output to the beginning of the sequence, otherwise we lose 3763 * all the shift state (seen bits, number of bits seen, high 3764 * surrogate). */ 3765 3766PyObject * 3767PyUnicode_DecodeUTF7Stateful(const char *s, 3768 Py_ssize_t size, 3769 const char *errors, 3770 Py_ssize_t *consumed) 3771{ 3772 const char *starts = s; 3773 Py_ssize_t startinpos; 3774 Py_ssize_t endinpos; 3775 Py_ssize_t outpos; 3776 const char *e; 3777 PyObject *unicode; 3778 Py_UNICODE *p; 3779 const char *errmsg = ""; 3780 int inShift = 0; 3781 Py_UNICODE *shiftOutStart; 3782 unsigned int base64bits = 0; 3783 unsigned long base64buffer = 0; 3784 Py_UNICODE surrogate = 0; 3785 PyObject *errorHandler = NULL; 3786 PyObject *exc = NULL; 3787 3788 unicode = (PyObject*)_PyUnicode_New(size); 3789 if (!unicode) 3790 return NULL; 3791 if (size == 0) { 3792 if (consumed) 3793 *consumed = 0; 3794 return unicode; 3795 } 3796 3797 p = PyUnicode_AS_UNICODE(unicode); 3798 shiftOutStart = p; 3799 e = s + size; 3800 3801 while (s < e) { 3802 Py_UNICODE ch; 3803 restart: 3804 ch = (unsigned char) *s; 3805 3806 if (inShift) { /* in a base-64 section */ 3807 if (IS_BASE64(ch)) { /* consume a base-64 character */ 3808 base64buffer = (base64buffer << 6) | FROM_BASE64(ch); 3809 base64bits += 6; 3810 s++; 3811 if (base64bits >= 16) { 3812 /* we have enough bits for a UTF-16 value */ 3813 Py_UNICODE outCh = (Py_UNICODE) 3814 (base64buffer >> (base64bits-16)); 3815 base64bits -= 16; 3816 base64buffer &= (1 << base64bits) - 1; /* clear high bits */ 3817 if (surrogate) { 3818 /* expecting a second surrogate */ 3819 if (outCh >= 0xDC00 && outCh <= 0xDFFF) { 3820#ifdef Py_UNICODE_WIDE 3821 *p++ = (((surrogate & 0x3FF)<<10) 3822 | (outCh & 0x3FF)) + 0x10000; 3823#else 3824 *p++ = surrogate; 3825 *p++ = outCh; 3826#endif 3827 surrogate = 0; 3828 } 3829 else { 3830 surrogate = 0; 3831 errmsg = "second surrogate missing"; 3832 goto utf7Error; 3833 } 3834 } 3835 else if (outCh >= 0xD800 && outCh <= 0xDBFF) { 3836 /* first surrogate */ 3837 surrogate = outCh; 3838 } 3839 else if (outCh >= 0xDC00 && outCh <= 0xDFFF) { 3840 errmsg = "unexpected second surrogate"; 3841 goto utf7Error; 3842 } 3843 else { 3844 *p++ = outCh; 3845 } 3846 } 3847 } 3848 else { /* now leaving a base-64 section */ 3849 inShift = 0; 3850 s++; 3851 if (surrogate) { 3852 errmsg = "second surrogate missing at end of shift sequence"; 3853 goto utf7Error; 3854 } 3855 if (base64bits > 0) { /* left-over bits */ 3856 if (base64bits >= 6) { 3857 /* We've seen at least one base-64 character */ 3858 errmsg = "partial character in shift sequence"; 3859 goto utf7Error; 3860 } 3861 else { 3862 /* Some bits remain; they should be zero */ 3863 if (base64buffer != 0) { 3864 errmsg = "non-zero padding bits in shift sequence"; 3865 goto utf7Error; 3866 } 3867 } 3868 } 3869 if (ch != '-') { 3870 /* '-' is absorbed; other terminating 3871 characters are preserved */ 3872 *p++ = ch; 3873 } 3874 } 3875 } 3876 else if ( ch == '+' ) { 3877 startinpos = s-starts; 3878 s++; /* consume '+' */ 3879 if (s < e && *s == '-') { /* '+-' encodes '+' */ 3880 s++; 3881 *p++ = '+'; 3882 } 3883 else { /* begin base64-encoded section */ 3884 inShift = 1; 3885 shiftOutStart = p; 3886 base64bits = 0; 3887 } 3888 } 3889 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */ 3890 *p++ = ch; 3891 s++; 3892 } 3893 else { 3894 startinpos = s-starts; 3895 s++; 3896 errmsg = "unexpected special character"; 3897 goto utf7Error; 3898 } 3899 continue; 3900utf7Error: 3901 outpos = p-PyUnicode_AS_UNICODE(unicode); 3902 endinpos = s-starts; 3903 if (unicode_decode_call_errorhandler( 3904 errors, &errorHandler, 3905 "utf7", errmsg, 3906 &starts, &e, &startinpos, &endinpos, &exc, &s, 3907 &unicode, &outpos, &p)) 3908 goto onError; 3909 } 3910 3911 /* end of string */ 3912 3913 if (inShift && !consumed) { /* in shift sequence, no more to follow */ 3914 /* if we're in an inconsistent state, that's an error */ 3915 if (surrogate || 3916 (base64bits >= 6) || 3917 (base64bits > 0 && base64buffer != 0)) { 3918 outpos = p-PyUnicode_AS_UNICODE(unicode); 3919 endinpos = size; 3920 if (unicode_decode_call_errorhandler( 3921 errors, &errorHandler, 3922 "utf7", "unterminated shift sequence", 3923 &starts, &e, &startinpos, &endinpos, &exc, &s, 3924 &unicode, &outpos, &p)) 3925 goto onError; 3926 if (s < e) 3927 goto restart; 3928 } 3929 } 3930 3931 /* return state */ 3932 if (consumed) { 3933 if (inShift) { 3934 p = shiftOutStart; /* back off output */ 3935 *consumed = startinpos; 3936 } 3937 else { 3938 *consumed = s-starts; 3939 } 3940 } 3941 3942 if (PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0) 3943 goto onError; 3944 3945 Py_XDECREF(errorHandler); 3946 Py_XDECREF(exc); 3947#ifndef DONT_MAKE_RESULT_READY 3948 if (_PyUnicode_READY_REPLACE(&unicode)) { 3949 Py_DECREF(unicode); 3950 return NULL; 3951 } 3952#endif 3953 assert(_PyUnicode_CheckConsistency(unicode, 1)); 3954 return unicode; 3955 3956 onError: 3957 Py_XDECREF(errorHandler); 3958 Py_XDECREF(exc); 3959 Py_DECREF(unicode); 3960 return NULL; 3961} 3962 3963 3964PyObject * 3965PyUnicode_EncodeUTF7(const Py_UNICODE *s, 3966 Py_ssize_t size, 3967 int base64SetO, 3968 int base64WhiteSpace, 3969 const char *errors) 3970{ 3971 PyObject *v; 3972 /* It might be possible to tighten this worst case */ 3973 Py_ssize_t allocated = 8 * size; 3974 int inShift = 0; 3975 Py_ssize_t i = 0; 3976 unsigned int base64bits = 0; 3977 unsigned long base64buffer = 0; 3978 char * out; 3979 char * start; 3980 3981 if (size == 0) 3982 return PyBytes_FromStringAndSize(NULL, 0); 3983 3984 if (allocated / 8 != size) 3985 return PyErr_NoMemory(); 3986 3987 v = PyBytes_FromStringAndSize(NULL, allocated); 3988 if (v == NULL) 3989 return NULL; 3990 3991 start = out = PyBytes_AS_STRING(v); 3992 for (;i < size; ++i) { 3993 Py_UNICODE ch = s[i]; 3994 3995 if (inShift) { 3996 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) { 3997 /* shifting out */ 3998 if (base64bits) { /* output remaining bits */ 3999 *out++ = TO_BASE64(base64buffer << (6-base64bits)); 4000 base64buffer = 0; 4001 base64bits = 0; 4002 } 4003 inShift = 0; 4004 /* Characters not in the BASE64 set implicitly unshift the sequence 4005 so no '-' is required, except if the character is itself a '-' */ 4006 if (IS_BASE64(ch) || ch == '-') { 4007 *out++ = '-'; 4008 } 4009 *out++ = (char) ch; 4010 } 4011 else { 4012 goto encode_char; 4013 } 4014 } 4015 else { /* not in a shift sequence */ 4016 if (ch == '+') { 4017 *out++ = '+'; 4018 *out++ = '-'; 4019 } 4020 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) { 4021 *out++ = (char) ch; 4022 } 4023 else { 4024 *out++ = '+'; 4025 inShift = 1; 4026 goto encode_char; 4027 } 4028 } 4029 continue; 4030encode_char: 4031#ifdef Py_UNICODE_WIDE 4032 if (ch >= 0x10000) { 4033 /* code first surrogate */ 4034 base64bits += 16; 4035 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10); 4036 while (base64bits >= 6) { 4037 *out++ = TO_BASE64(base64buffer >> (base64bits-6)); 4038 base64bits -= 6; 4039 } 4040 /* prepare second surrogate */ 4041 ch = 0xDC00 | ((ch-0x10000) & 0x3FF); 4042 } 4043#endif 4044 base64bits += 16; 4045 base64buffer = (base64buffer << 16) | ch; 4046 while (base64bits >= 6) { 4047 *out++ = TO_BASE64(base64buffer >> (base64bits-6)); 4048 base64bits -= 6; 4049 } 4050 } 4051 if (base64bits) 4052 *out++= TO_BASE64(base64buffer << (6-base64bits) ); 4053 if (inShift) 4054 *out++ = '-'; 4055 if (_PyBytes_Resize(&v, out - start) < 0) 4056 return NULL; 4057 return v; 4058} 4059 4060#undef IS_BASE64 4061#undef FROM_BASE64 4062#undef TO_BASE64 4063#undef DECODE_DIRECT 4064#undef ENCODE_DIRECT 4065 4066/* --- UTF-8 Codec -------------------------------------------------------- */ 4067 4068static 4069char utf8_code_length[256] = { 4070 /* Map UTF-8 encoded prefix byte to sequence length. Zero means 4071 illegal prefix. See RFC 3629 for details */ 4072 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */ 4073 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4074 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4075 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4076 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4077 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4078 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4079 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */ 4080 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */ 4081 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4082 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4083 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */ 4084 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */ 4085 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */ 4086 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */ 4087 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 /* F0-F4 + F5-FF */ 4088}; 4089 4090PyObject * 4091PyUnicode_DecodeUTF8(const char *s, 4092 Py_ssize_t size, 4093 const char *errors) 4094{ 4095 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL); 4096} 4097 4098/* Mask to check or force alignment of a pointer to C 'long' boundaries */ 4099#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1) 4100 4101/* Mask to quickly check whether a C 'long' contains a 4102 non-ASCII, UTF8-encoded char. */ 4103#if (SIZEOF_LONG == 8) 4104# define ASCII_CHAR_MASK 0x8080808080808080L 4105#elif (SIZEOF_LONG == 4) 4106# define ASCII_CHAR_MASK 0x80808080L 4107#else 4108# error C 'long' size should be either 4 or 8! 4109#endif 4110 4111/* Scans a UTF-8 string and returns the maximum character to be expected, 4112 the size of the decoded unicode string and if any major errors were 4113 encountered. 4114 4115 This function does check basic UTF-8 sanity, it does however NOT CHECK 4116 if the string contains surrogates, and if all continuation bytes are 4117 within the correct ranges, these checks are performed in 4118 PyUnicode_DecodeUTF8Stateful. 4119 4120 If it sets has_errors to 1, it means the value of unicode_size and max_char 4121 will be bogus and you should not rely on useful information in them. 4122 */ 4123static Py_UCS4 4124utf8_max_char_size_and_has_errors(const char *s, Py_ssize_t string_size, 4125 Py_ssize_t *unicode_size, Py_ssize_t* consumed, 4126 int *has_errors) 4127{ 4128 Py_ssize_t n; 4129 Py_ssize_t char_count = 0; 4130 Py_UCS4 max_char = 127, new_max; 4131 Py_UCS4 upper_bound; 4132 const unsigned char *p = (const unsigned char *)s; 4133 const unsigned char *end = p + string_size; 4134 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK); 4135 int err = 0; 4136 4137 for (; p < end && !err; ++p, ++char_count) { 4138 /* Only check value if it's not a ASCII char... */ 4139 if (*p < 0x80) { 4140 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for 4141 an explanation. */ 4142 if (!((size_t) p & LONG_PTR_MASK)) { 4143 /* Help register allocation */ 4144 register const unsigned char *_p = p; 4145 while (_p < aligned_end) { 4146 unsigned long value = *(unsigned long *) _p; 4147 if (value & ASCII_CHAR_MASK) 4148 break; 4149 _p += SIZEOF_LONG; 4150 char_count += SIZEOF_LONG; 4151 } 4152 p = _p; 4153 if (p == end) 4154 break; 4155 } 4156 } 4157 if (*p >= 0x80) { 4158 n = utf8_code_length[*p]; 4159 new_max = max_char; 4160 switch (n) { 4161 /* invalid start byte */ 4162 case 0: 4163 err = 1; 4164 break; 4165 case 2: 4166 /* Code points between 0x00FF and 0x07FF inclusive. 4167 Approximate the upper bound of the code point, 4168 if this flips over 255 we can be sure it will be more 4169 than 255 and the string will need 2 bytes per code coint, 4170 if it stays under or equal to 255, we can be sure 1 byte 4171 is enough. 4172 ((*p & 0b00011111) << 6) | 0b00111111 */ 4173 upper_bound = ((*p & 0x1F) << 6) | 0x3F; 4174 if (max_char < upper_bound) 4175 new_max = upper_bound; 4176 /* Ensure we track at least that we left ASCII space. */ 4177 if (new_max < 128) 4178 new_max = 128; 4179 break; 4180 case 3: 4181 /* Between 0x0FFF and 0xFFFF inclusive, so values are 4182 always > 255 and <= 65535 and will always need 2 bytes. */ 4183 if (max_char < 65535) 4184 new_max = 65535; 4185 break; 4186 case 4: 4187 /* Code point will be above 0xFFFF for sure in this case. */ 4188 new_max = 65537; 4189 break; 4190 /* Internal error, this should be caught by the first if */ 4191 case 1: 4192 default: 4193 assert(0 && "Impossible case in utf8_max_char_and_size"); 4194 err = 1; 4195 } 4196 /* Instead of number of overall bytes for this code point, 4197 n contains the number of following bytes: */ 4198 --n; 4199 /* Check if the follow up chars are all valid continuation bytes */ 4200 if (n >= 1) { 4201 const unsigned char *cont; 4202 if ((p + n) >= end) { 4203 if (consumed == 0) 4204 /* incomplete data, non-incremental decoding */ 4205 err = 1; 4206 break; 4207 } 4208 for (cont = p + 1; cont < (p + n); ++cont) { 4209 if ((*cont & 0xc0) != 0x80) { 4210 err = 1; 4211 break; 4212 } 4213 } 4214 p += n; 4215 } 4216 else 4217 err = 1; 4218 max_char = new_max; 4219 } 4220 } 4221 4222 if (unicode_size) 4223 *unicode_size = char_count; 4224 if (has_errors) 4225 *has_errors = err; 4226 return max_char; 4227} 4228 4229/* Similar to PyUnicode_WRITE but can also write into wstr field 4230 of the legacy unicode representation */ 4231#define WRITE_FLEXIBLE_OR_WSTR(kind, buf, index, value) \ 4232 do { \ 4233 const int k_ = (kind); \ 4234 if (k_ == PyUnicode_WCHAR_KIND) \ 4235 ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value); \ 4236 else if (k_ == PyUnicode_1BYTE_KIND) \ 4237 ((unsigned char *)(buf))[(index)] = (unsigned char)(value); \ 4238 else if (k_ == PyUnicode_2BYTE_KIND) \ 4239 ((Py_UCS2 *)(buf))[(index)] = (Py_UCS2)(value); \ 4240 else \ 4241 ((Py_UCS4 *)(buf))[(index)] = (Py_UCS4)(value); \ 4242 } while (0) 4243 4244PyObject * 4245PyUnicode_DecodeUTF8Stateful(const char *s, 4246 Py_ssize_t size, 4247 const char *errors, 4248 Py_ssize_t *consumed) 4249{ 4250 const char *starts = s; 4251 int n; 4252 int k; 4253 Py_ssize_t startinpos; 4254 Py_ssize_t endinpos; 4255 const char *e, *aligned_end; 4256 PyObject *unicode; 4257 const char *errmsg = ""; 4258 PyObject *errorHandler = NULL; 4259 PyObject *exc = NULL; 4260 Py_UCS4 maxchar = 0; 4261 Py_ssize_t unicode_size; 4262 Py_ssize_t i; 4263 int kind; 4264 void *data; 4265 int has_errors; 4266 Py_UNICODE *error_outptr; 4267#if SIZEOF_WCHAR_T == 2 4268 Py_ssize_t wchar_offset = 0; 4269#endif 4270 4271 if (size == 0) { 4272 if (consumed) 4273 *consumed = 0; 4274 return (PyObject *)PyUnicode_New(0, 0); 4275 } 4276 maxchar = utf8_max_char_size_and_has_errors(s, size, &unicode_size, 4277 consumed, &has_errors); 4278 if (has_errors) { 4279 unicode = (PyObject*)_PyUnicode_New(size); 4280 if (!unicode) 4281 return NULL; 4282 kind = PyUnicode_WCHAR_KIND; 4283 data = PyUnicode_AS_UNICODE(unicode); 4284 assert(data != NULL); 4285 } 4286 else { 4287 unicode = PyUnicode_New(unicode_size, maxchar); 4288 if (!unicode) 4289 return NULL; 4290 /* When the string is ASCII only, just use memcpy and return. 4291 unicode_size may be != size if there is an incomplete UTF-8 4292 sequence at the end of the ASCII block. */ 4293 if (maxchar < 128 && size == unicode_size) { 4294 Py_MEMCPY(PyUnicode_1BYTE_DATA(unicode), s, unicode_size); 4295 return unicode; 4296 } 4297 kind = PyUnicode_KIND(unicode); 4298 data = PyUnicode_DATA(unicode); 4299 } 4300 /* Unpack UTF-8 encoded data */ 4301 i = 0; 4302 e = s + size; 4303 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK); 4304 4305 while (s < e) { 4306 Py_UCS4 ch = (unsigned char)*s; 4307 4308 if (ch < 0x80) { 4309 /* Fast path for runs of ASCII characters. Given that common UTF-8 4310 input will consist of an overwhelming majority of ASCII 4311 characters, we try to optimize for this case by checking 4312 as many characters as a C 'long' can contain. 4313 First, check if we can do an aligned read, as most CPUs have 4314 a penalty for unaligned reads. 4315 */ 4316 if (!((size_t) s & LONG_PTR_MASK)) { 4317 /* Help register allocation */ 4318 register const char *_s = s; 4319 register Py_ssize_t _i = i; 4320 while (_s < aligned_end) { 4321 /* Read a whole long at a time (either 4 or 8 bytes), 4322 and do a fast unrolled copy if it only contains ASCII 4323 characters. */ 4324 unsigned long value = *(unsigned long *) _s; 4325 if (value & ASCII_CHAR_MASK) 4326 break; 4327 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+0, _s[0]); 4328 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+1, _s[1]); 4329 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+2, _s[2]); 4330 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+3, _s[3]); 4331#if (SIZEOF_LONG == 8) 4332 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+4, _s[4]); 4333 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+5, _s[5]); 4334 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+6, _s[6]); 4335 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+7, _s[7]); 4336#endif 4337 _s += SIZEOF_LONG; 4338 _i += SIZEOF_LONG; 4339 } 4340 s = _s; 4341 i = _i; 4342 if (s == e) 4343 break; 4344 ch = (unsigned char)*s; 4345 } 4346 } 4347 4348 if (ch < 0x80) { 4349 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch); 4350 s++; 4351 continue; 4352 } 4353 4354 n = utf8_code_length[ch]; 4355 4356 if (s + n > e) { 4357 if (consumed) 4358 break; 4359 else { 4360 errmsg = "unexpected end of data"; 4361 startinpos = s-starts; 4362 endinpos = startinpos+1; 4363 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++) 4364 endinpos++; 4365 goto utf8Error; 4366 } 4367 } 4368 4369 switch (n) { 4370 4371 case 0: 4372 errmsg = "invalid start byte"; 4373 startinpos = s-starts; 4374 endinpos = startinpos+1; 4375 goto utf8Error; 4376 4377 case 1: 4378 errmsg = "internal error"; 4379 startinpos = s-starts; 4380 endinpos = startinpos+1; 4381 goto utf8Error; 4382 4383 case 2: 4384 if ((s[1] & 0xc0) != 0x80) { 4385 errmsg = "invalid continuation byte"; 4386 startinpos = s-starts; 4387 endinpos = startinpos + 1; 4388 goto utf8Error; 4389 } 4390 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f); 4391 assert ((ch > 0x007F) && (ch <= 0x07FF)); 4392 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch); 4393 break; 4394 4395 case 3: 4396 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf 4397 will result in surrogates in range d800-dfff. Surrogates are 4398 not valid UTF-8 so they are rejected. 4399 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf 4400 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */ 4401 if ((s[1] & 0xc0) != 0x80 || 4402 (s[2] & 0xc0) != 0x80 || 4403 ((unsigned char)s[0] == 0xE0 && 4404 (unsigned char)s[1] < 0xA0) || 4405 ((unsigned char)s[0] == 0xED && 4406 (unsigned char)s[1] > 0x9F)) { 4407 errmsg = "invalid continuation byte"; 4408 startinpos = s-starts; 4409 endinpos = startinpos + 1; 4410 4411 /* if s[1] first two bits are 1 and 0, then the invalid 4412 continuation byte is s[2], so increment endinpos by 1, 4413 if not, s[1] is invalid and endinpos doesn't need to 4414 be incremented. */ 4415 if ((s[1] & 0xC0) == 0x80) 4416 endinpos++; 4417 goto utf8Error; 4418 } 4419 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f); 4420 assert ((ch > 0x07FF) && (ch <= 0xFFFF)); 4421 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch); 4422 break; 4423 4424 case 4: 4425 if ((s[1] & 0xc0) != 0x80 || 4426 (s[2] & 0xc0) != 0x80 || 4427 (s[3] & 0xc0) != 0x80 || 4428 ((unsigned char)s[0] == 0xF0 && 4429 (unsigned char)s[1] < 0x90) || 4430 ((unsigned char)s[0] == 0xF4 && 4431 (unsigned char)s[1] > 0x8F)) { 4432 errmsg = "invalid continuation byte"; 4433 startinpos = s-starts; 4434 endinpos = startinpos + 1; 4435 if ((s[1] & 0xC0) == 0x80) { 4436 endinpos++; 4437 if ((s[2] & 0xC0) == 0x80) 4438 endinpos++; 4439 } 4440 goto utf8Error; 4441 } 4442 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) + 4443 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f); 4444 assert ((ch > 0xFFFF) && (ch <= 0x10ffff)); 4445 4446 /* If the string is flexible or we have native UCS-4, write 4447 directly.. */ 4448 if (sizeof(Py_UNICODE) > 2 || kind != PyUnicode_WCHAR_KIND) 4449 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch); 4450 4451 else { 4452 /* compute and append the two surrogates: */ 4453 4454 /* translate from 10000..10FFFF to 0..FFFF */ 4455 ch -= 0x10000; 4456 4457 /* high surrogate = top 10 bits added to D800 */ 4458 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, 4459 (Py_UNICODE)(0xD800 + (ch >> 10))); 4460 4461 /* low surrogate = bottom 10 bits added to DC00 */ 4462 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, 4463 (Py_UNICODE)(0xDC00 + (ch & 0x03FF))); 4464 } 4465#if SIZEOF_WCHAR_T == 2 4466 wchar_offset++; 4467#endif 4468 break; 4469 } 4470 s += n; 4471 continue; 4472 4473 utf8Error: 4474 /* If this is not yet a resizable string, make it one.. */ 4475 if (kind != PyUnicode_WCHAR_KIND) { 4476 const Py_UNICODE *u; 4477 PyObject *new_unicode = (PyObject*)_PyUnicode_New(size); 4478 if (!new_unicode) 4479 goto onError; 4480 u = PyUnicode_AsUnicode(unicode); 4481 if (!u) 4482 goto onError; 4483#if SIZEOF_WCHAR_T == 2 4484 i += wchar_offset; 4485#endif 4486 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(new_unicode), u, i); 4487 Py_DECREF(unicode); 4488 unicode = new_unicode; 4489 kind = 0; 4490 data = PyUnicode_AS_UNICODE(new_unicode); 4491 assert(data != NULL); 4492 } 4493 error_outptr = PyUnicode_AS_UNICODE(unicode) + i; 4494 if (unicode_decode_call_errorhandler( 4495 errors, &errorHandler, 4496 "utf8", errmsg, 4497 &starts, &e, &startinpos, &endinpos, &exc, &s, 4498 &unicode, &i, &error_outptr)) 4499 goto onError; 4500 /* Update data because unicode_decode_call_errorhandler might have 4501 re-created or resized the unicode object. */ 4502 data = PyUnicode_AS_UNICODE(unicode); 4503 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK); 4504 } 4505 /* Ensure the unicode_size calculation above was correct: */ 4506 assert(kind == PyUnicode_WCHAR_KIND || i == unicode_size); 4507 4508 if (consumed) 4509 *consumed = s-starts; 4510 4511 /* Adjust length and ready string when it contained errors and 4512 is of the old resizable kind. */ 4513 if (kind == PyUnicode_WCHAR_KIND) { 4514 if (PyUnicode_Resize(&unicode, i) < 0) 4515 goto onError; 4516 } 4517 4518 Py_XDECREF(errorHandler); 4519 Py_XDECREF(exc); 4520#ifndef DONT_MAKE_RESULT_READY 4521 if (_PyUnicode_READY_REPLACE(&unicode)) { 4522 Py_DECREF(unicode); 4523 return NULL; 4524 } 4525#endif 4526 assert(_PyUnicode_CheckConsistency(unicode, 1)); 4527 return unicode; 4528 4529 onError: 4530 Py_XDECREF(errorHandler); 4531 Py_XDECREF(exc); 4532 Py_DECREF(unicode); 4533 return NULL; 4534} 4535 4536#undef WRITE_FLEXIBLE_OR_WSTR 4537 4538#ifdef __APPLE__ 4539 4540/* Simplified UTF-8 decoder using surrogateescape error handler, 4541 used to decode the command line arguments on Mac OS X. */ 4542 4543wchar_t* 4544_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size) 4545{ 4546 int n; 4547 const char *e; 4548 wchar_t *unicode, *p; 4549 4550 /* Note: size will always be longer than the resulting Unicode 4551 character count */ 4552 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) { 4553 PyErr_NoMemory(); 4554 return NULL; 4555 } 4556 unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t)); 4557 if (!unicode) 4558 return NULL; 4559 4560 /* Unpack UTF-8 encoded data */ 4561 p = unicode; 4562 e = s + size; 4563 while (s < e) { 4564 Py_UCS4 ch = (unsigned char)*s; 4565 4566 if (ch < 0x80) { 4567 *p++ = (wchar_t)ch; 4568 s++; 4569 continue; 4570 } 4571 4572 n = utf8_code_length[ch]; 4573 if (s + n > e) { 4574 goto surrogateescape; 4575 } 4576 4577 switch (n) { 4578 case 0: 4579 case 1: 4580 goto surrogateescape; 4581 4582 case 2: 4583 if ((s[1] & 0xc0) != 0x80) 4584 goto surrogateescape; 4585 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f); 4586 assert ((ch > 0x007F) && (ch <= 0x07FF)); 4587 *p++ = (wchar_t)ch; 4588 break; 4589 4590 case 3: 4591 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf 4592 will result in surrogates in range d800-dfff. Surrogates are 4593 not valid UTF-8 so they are rejected. 4594 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf 4595 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */ 4596 if ((s[1] & 0xc0) != 0x80 || 4597 (s[2] & 0xc0) != 0x80 || 4598 ((unsigned char)s[0] == 0xE0 && 4599 (unsigned char)s[1] < 0xA0) || 4600 ((unsigned char)s[0] == 0xED && 4601 (unsigned char)s[1] > 0x9F)) { 4602 4603 goto surrogateescape; 4604 } 4605 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f); 4606 assert ((ch > 0x07FF) && (ch <= 0xFFFF)); 4607 *p++ = (wchar_t)ch; 4608 break; 4609 4610 case 4: 4611 if ((s[1] & 0xc0) != 0x80 || 4612 (s[2] & 0xc0) != 0x80 || 4613 (s[3] & 0xc0) != 0x80 || 4614 ((unsigned char)s[0] == 0xF0 && 4615 (unsigned char)s[1] < 0x90) || 4616 ((unsigned char)s[0] == 0xF4 && 4617 (unsigned char)s[1] > 0x8F)) { 4618 goto surrogateescape; 4619 } 4620 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) + 4621 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f); 4622 assert ((ch > 0xFFFF) && (ch <= 0x10ffff)); 4623 4624#if SIZEOF_WCHAR_T == 4 4625 *p++ = (wchar_t)ch; 4626#else 4627 /* compute and append the two surrogates: */ 4628 4629 /* translate from 10000..10FFFF to 0..FFFF */ 4630 ch -= 0x10000; 4631 4632 /* high surrogate = top 10 bits added to D800 */ 4633 *p++ = (wchar_t)(0xD800 + (ch >> 10)); 4634 4635 /* low surrogate = bottom 10 bits added to DC00 */ 4636 *p++ = (wchar_t)(0xDC00 + (ch & 0x03FF)); 4637#endif 4638 break; 4639 } 4640 s += n; 4641 continue; 4642 4643 surrogateescape: 4644 *p++ = 0xDC00 + ch; 4645 s++; 4646 } 4647 *p = L'\0'; 4648 return unicode; 4649} 4650 4651#endif /* __APPLE__ */ 4652 4653/* Primary internal function which creates utf8 encoded bytes objects. 4654 4655 Allocation strategy: if the string is short, convert into a stack buffer 4656 and allocate exactly as much space needed at the end. Else allocate the 4657 maximum possible needed (4 result bytes per Unicode character), and return 4658 the excess memory at the end. 4659*/ 4660PyObject * 4661_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors) 4662{ 4663#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */ 4664 4665 Py_ssize_t i; /* index into s of next input byte */ 4666 PyObject *result; /* result string object */ 4667 char *p; /* next free byte in output buffer */ 4668 Py_ssize_t nallocated; /* number of result bytes allocated */ 4669 Py_ssize_t nneeded; /* number of result bytes needed */ 4670 char stackbuf[MAX_SHORT_UNICHARS * 4]; 4671 PyObject *errorHandler = NULL; 4672 PyObject *exc = NULL; 4673 int kind; 4674 void *data; 4675 Py_ssize_t size; 4676 4677 if (!PyUnicode_Check(unicode)) { 4678 PyErr_BadArgument(); 4679 return NULL; 4680 } 4681 4682 if (PyUnicode_READY(unicode) == -1) 4683 return NULL; 4684 4685 if (PyUnicode_UTF8(unicode)) 4686 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode), 4687 PyUnicode_UTF8_LENGTH(unicode)); 4688 4689 kind = PyUnicode_KIND(unicode); 4690 data = PyUnicode_DATA(unicode); 4691 size = PyUnicode_GET_LENGTH(unicode); 4692 4693 assert(size >= 0); 4694 4695 if (size <= MAX_SHORT_UNICHARS) { 4696 /* Write into the stack buffer; nallocated can't overflow. 4697 * At the end, we'll allocate exactly as much heap space as it 4698 * turns out we need. 4699 */ 4700 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int); 4701 result = NULL; /* will allocate after we're done */ 4702 p = stackbuf; 4703 } 4704 else { 4705 /* Overallocate on the heap, and give the excess back at the end. */ 4706 nallocated = size * 4; 4707 if (nallocated / 4 != size) /* overflow! */ 4708 return PyErr_NoMemory(); 4709 result = PyBytes_FromStringAndSize(NULL, nallocated); 4710 if (result == NULL) 4711 return NULL; 4712 p = PyBytes_AS_STRING(result); 4713 } 4714 4715 for (i = 0; i < size;) { 4716 Py_UCS4 ch = PyUnicode_READ(kind, data, i++); 4717 4718 if (ch < 0x80) 4719 /* Encode ASCII */ 4720 *p++ = (char) ch; 4721 4722 else if (ch < 0x0800) { 4723 /* Encode Latin-1 */ 4724 *p++ = (char)(0xc0 | (ch >> 6)); 4725 *p++ = (char)(0x80 | (ch & 0x3f)); 4726 } else if (0xD800 <= ch && ch <= 0xDFFF) { 4727 Py_ssize_t newpos; 4728 PyObject *rep; 4729 Py_ssize_t repsize, k, startpos; 4730 startpos = i-1; 4731 rep = unicode_encode_call_errorhandler( 4732 errors, &errorHandler, "utf-8", "surrogates not allowed", 4733 unicode, &exc, startpos, startpos+1, &newpos); 4734 if (!rep) 4735 goto error; 4736 4737 if (PyBytes_Check(rep)) 4738 repsize = PyBytes_GET_SIZE(rep); 4739 else 4740 repsize = PyUnicode_GET_SIZE(rep); 4741 4742 if (repsize > 4) { 4743 Py_ssize_t offset; 4744 4745 if (result == NULL) 4746 offset = p - stackbuf; 4747 else 4748 offset = p - PyBytes_AS_STRING(result); 4749 4750 if (nallocated > PY_SSIZE_T_MAX - repsize + 4) { 4751 /* integer overflow */ 4752 PyErr_NoMemory(); 4753 goto error; 4754 } 4755 nallocated += repsize - 4; 4756 if (result != NULL) { 4757 if (_PyBytes_Resize(&result, nallocated) < 0) 4758 goto error; 4759 } else { 4760 result = PyBytes_FromStringAndSize(NULL, nallocated); 4761 if (result == NULL) 4762 goto error; 4763 Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset); 4764 } 4765 p = PyBytes_AS_STRING(result) + offset; 4766 } 4767 4768 if (PyBytes_Check(rep)) { 4769 char *prep = PyBytes_AS_STRING(rep); 4770 for(k = repsize; k > 0; k--) 4771 *p++ = *prep++; 4772 } else /* rep is unicode */ { 4773 const Py_UNICODE *prep = PyUnicode_AS_UNICODE(rep); 4774 Py_UNICODE c; 4775 4776 for(k=0; k<repsize; k++) { 4777 c = prep[k]; 4778 if (0x80 <= c) { 4779 raise_encode_exception(&exc, "utf-8", 4780 unicode, 4781 i-1, i, 4782 "surrogates not allowed"); 4783 goto error; 4784 } 4785 *p++ = (char)prep[k]; 4786 } 4787 } 4788 Py_DECREF(rep); 4789 } else if (ch < 0x10000) { 4790 *p++ = (char)(0xe0 | (ch >> 12)); 4791 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); 4792 *p++ = (char)(0x80 | (ch & 0x3f)); 4793 } else /* ch >= 0x10000 */ { 4794 /* Encode UCS4 Unicode ordinals */ 4795 *p++ = (char)(0xf0 | (ch >> 18)); 4796 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f)); 4797 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); 4798 *p++ = (char)(0x80 | (ch & 0x3f)); 4799 } 4800 } 4801 4802 if (result == NULL) { 4803 /* This was stack allocated. */ 4804 nneeded = p - stackbuf; 4805 assert(nneeded <= nallocated); 4806 result = PyBytes_FromStringAndSize(stackbuf, nneeded); 4807 } 4808 else { 4809 /* Cut back to size actually needed. */ 4810 nneeded = p - PyBytes_AS_STRING(result); 4811 assert(nneeded <= nallocated); 4812 _PyBytes_Resize(&result, nneeded); 4813 } 4814 4815 Py_XDECREF(errorHandler); 4816 Py_XDECREF(exc); 4817 return result; 4818 error: 4819 Py_XDECREF(errorHandler); 4820 Py_XDECREF(exc); 4821 Py_XDECREF(result); 4822 return NULL; 4823 4824#undef MAX_SHORT_UNICHARS 4825} 4826 4827PyObject * 4828PyUnicode_EncodeUTF8(const Py_UNICODE *s, 4829 Py_ssize_t size, 4830 const char *errors) 4831{ 4832 PyObject *v, *unicode; 4833 4834 unicode = PyUnicode_FromUnicode(s, size); 4835 if (unicode == NULL) 4836 return NULL; 4837 v = _PyUnicode_AsUTF8String(unicode, errors); 4838 Py_DECREF(unicode); 4839 return v; 4840} 4841 4842PyObject * 4843PyUnicode_AsUTF8String(PyObject *unicode) 4844{ 4845 return _PyUnicode_AsUTF8String(unicode, NULL); 4846} 4847 4848/* --- UTF-32 Codec ------------------------------------------------------- */ 4849 4850PyObject * 4851PyUnicode_DecodeUTF32(const char *s, 4852 Py_ssize_t size, 4853 const char *errors, 4854 int *byteorder) 4855{ 4856 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL); 4857} 4858 4859PyObject * 4860PyUnicode_DecodeUTF32Stateful(const char *s, 4861 Py_ssize_t size, 4862 const char *errors, 4863 int *byteorder, 4864 Py_ssize_t *consumed) 4865{ 4866 const char *starts = s; 4867 Py_ssize_t startinpos; 4868 Py_ssize_t endinpos; 4869 Py_ssize_t outpos; 4870 PyObject *unicode; 4871 Py_UNICODE *p; 4872#ifndef Py_UNICODE_WIDE 4873 int pairs = 0; 4874 const unsigned char *qq; 4875#else 4876 const int pairs = 0; 4877#endif 4878 const unsigned char *q, *e; 4879 int bo = 0; /* assume native ordering by default */ 4880 const char *errmsg = ""; 4881 /* Offsets from q for retrieving bytes in the right order. */ 4882#ifdef BYTEORDER_IS_LITTLE_ENDIAN 4883 int iorder[] = {0, 1, 2, 3}; 4884#else 4885 int iorder[] = {3, 2, 1, 0}; 4886#endif 4887 PyObject *errorHandler = NULL; 4888 PyObject *exc = NULL; 4889 4890 q = (unsigned char *)s; 4891 e = q + size; 4892 4893 if (byteorder) 4894 bo = *byteorder; 4895 4896 /* Check for BOM marks (U+FEFF) in the input and adjust current 4897 byte order setting accordingly. In native mode, the leading BOM 4898 mark is skipped, in all other modes, it is copied to the output 4899 stream as-is (giving a ZWNBSP character). */ 4900 if (bo == 0) { 4901 if (size >= 4) { 4902 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) | 4903 (q[iorder[1]] << 8) | q[iorder[0]]; 4904#ifdef BYTEORDER_IS_LITTLE_ENDIAN 4905 if (bom == 0x0000FEFF) { 4906 q += 4; 4907 bo = -1; 4908 } 4909 else if (bom == 0xFFFE0000) { 4910 q += 4; 4911 bo = 1; 4912 } 4913#else 4914 if (bom == 0x0000FEFF) { 4915 q += 4; 4916 bo = 1; 4917 } 4918 else if (bom == 0xFFFE0000) { 4919 q += 4; 4920 bo = -1; 4921 } 4922#endif 4923 } 4924 } 4925 4926 if (bo == -1) { 4927 /* force LE */ 4928 iorder[0] = 0; 4929 iorder[1] = 1; 4930 iorder[2] = 2; 4931 iorder[3] = 3; 4932 } 4933 else if (bo == 1) { 4934 /* force BE */ 4935 iorder[0] = 3; 4936 iorder[1] = 2; 4937 iorder[2] = 1; 4938 iorder[3] = 0; 4939 } 4940 4941 /* On narrow builds we split characters outside the BMP into two 4942 codepoints => count how much extra space we need. */ 4943#ifndef Py_UNICODE_WIDE 4944 for (qq = q; qq < e; qq += 4) 4945 if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0) 4946 pairs++; 4947#endif 4948 4949 /* This might be one to much, because of a BOM */ 4950 unicode = (PyObject*)_PyUnicode_New((size+3)/4+pairs); 4951 if (!unicode) 4952 return NULL; 4953 if (size == 0) 4954 return unicode; 4955 4956 /* Unpack UTF-32 encoded data */ 4957 p = PyUnicode_AS_UNICODE(unicode); 4958 4959 while (q < e) { 4960 Py_UCS4 ch; 4961 /* remaining bytes at the end? (size should be divisible by 4) */ 4962 if (e-q<4) { 4963 if (consumed) 4964 break; 4965 errmsg = "truncated data"; 4966 startinpos = ((const char *)q)-starts; 4967 endinpos = ((const char *)e)-starts; 4968 goto utf32Error; 4969 /* The remaining input chars are ignored if the callback 4970 chooses to skip the input */ 4971 } 4972 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) | 4973 (q[iorder[1]] << 8) | q[iorder[0]]; 4974 4975 if (ch >= 0x110000) 4976 { 4977 errmsg = "codepoint not in range(0x110000)"; 4978 startinpos = ((const char *)q)-starts; 4979 endinpos = startinpos+4; 4980 goto utf32Error; 4981 } 4982#ifndef Py_UNICODE_WIDE 4983 if (ch >= 0x10000) 4984 { 4985 *p++ = 0xD800 | ((ch-0x10000) >> 10); 4986 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF); 4987 } 4988 else 4989#endif 4990 *p++ = ch; 4991 q += 4; 4992 continue; 4993 utf32Error: 4994 outpos = p-PyUnicode_AS_UNICODE(unicode); 4995 if (unicode_decode_call_errorhandler( 4996 errors, &errorHandler, 4997 "utf32", errmsg, 4998 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q, 4999 &unicode, &outpos, &p)) 5000 goto onError; 5001 } 5002 5003 if (byteorder) 5004 *byteorder = bo; 5005 5006 if (consumed) 5007 *consumed = (const char *)q-starts; 5008 5009 /* Adjust length */ 5010 if (PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0) 5011 goto onError; 5012 5013 Py_XDECREF(errorHandler); 5014 Py_XDECREF(exc); 5015#ifndef DONT_MAKE_RESULT_READY 5016 if (_PyUnicode_READY_REPLACE(&unicode)) { 5017 Py_DECREF(unicode); 5018 return NULL; 5019 } 5020#endif 5021 assert(_PyUnicode_CheckConsistency(unicode, 1)); 5022 return unicode; 5023 5024 onError: 5025 Py_DECREF(unicode); 5026 Py_XDECREF(errorHandler); 5027 Py_XDECREF(exc); 5028 return NULL; 5029} 5030 5031PyObject * 5032PyUnicode_EncodeUTF32(const Py_UNICODE *s, 5033 Py_ssize_t size, 5034 const char *errors, 5035 int byteorder) 5036{ 5037 PyObject *v; 5038 unsigned char *p; 5039 Py_ssize_t nsize, bytesize; 5040#ifndef Py_UNICODE_WIDE 5041 Py_ssize_t i, pairs; 5042#else 5043 const int pairs = 0; 5044#endif 5045 /* Offsets from p for storing byte pairs in the right order. */ 5046#ifdef BYTEORDER_IS_LITTLE_ENDIAN 5047 int iorder[] = {0, 1, 2, 3}; 5048#else 5049 int iorder[] = {3, 2, 1, 0}; 5050#endif 5051 5052#define STORECHAR(CH) \ 5053 do { \ 5054 p[iorder[3]] = ((CH) >> 24) & 0xff; \ 5055 p[iorder[2]] = ((CH) >> 16) & 0xff; \ 5056 p[iorder[1]] = ((CH) >> 8) & 0xff; \ 5057 p[iorder[0]] = (CH) & 0xff; \ 5058 p += 4; \ 5059 } while(0) 5060 5061 /* In narrow builds we can output surrogate pairs as one codepoint, 5062 so we need less space. */ 5063#ifndef Py_UNICODE_WIDE 5064 for (i = pairs = 0; i < size-1; i++) 5065 if (0xD800 <= s[i] && s[i] <= 0xDBFF && 5066 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF) 5067 pairs++; 5068#endif 5069 nsize = (size - pairs + (byteorder == 0)); 5070 bytesize = nsize * 4; 5071 if (bytesize / 4 != nsize) 5072 return PyErr_NoMemory(); 5073 v = PyBytes_FromStringAndSize(NULL, bytesize); 5074 if (v == NULL) 5075 return NULL; 5076 5077 p = (unsigned char *)PyBytes_AS_STRING(v); 5078 if (byteorder == 0) 5079 STORECHAR(0xFEFF); 5080 if (size == 0) 5081 goto done; 5082 5083 if (byteorder == -1) { 5084 /* force LE */ 5085 iorder[0] = 0; 5086 iorder[1] = 1; 5087 iorder[2] = 2; 5088 iorder[3] = 3; 5089 } 5090 else if (byteorder == 1) { 5091 /* force BE */ 5092 iorder[0] = 3; 5093 iorder[1] = 2; 5094 iorder[2] = 1; 5095 iorder[3] = 0; 5096 } 5097 5098 while (size-- > 0) { 5099 Py_UCS4 ch = *s++; 5100#ifndef Py_UNICODE_WIDE 5101 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) { 5102 Py_UCS4 ch2 = *s; 5103 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) { 5104 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000; 5105 s++; 5106 size--; 5107 } 5108 } 5109#endif 5110 STORECHAR(ch); 5111 } 5112 5113 done: 5114 return v; 5115#undef STORECHAR 5116} 5117 5118PyObject * 5119PyUnicode_AsUTF32String(PyObject *unicode) 5120{ 5121 if (!PyUnicode_Check(unicode)) { 5122 PyErr_BadArgument(); 5123 return NULL; 5124 } 5125 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode), 5126 PyUnicode_GET_SIZE(unicode), 5127 NULL, 5128 0); 5129} 5130 5131/* --- UTF-16 Codec ------------------------------------------------------- */ 5132 5133PyObject * 5134PyUnicode_DecodeUTF16(const char *s, 5135 Py_ssize_t size, 5136 const char *errors, 5137 int *byteorder) 5138{ 5139 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL); 5140} 5141 5142/* Two masks for fast checking of whether a C 'long' may contain 5143 UTF16-encoded surrogate characters. This is an efficient heuristic, 5144 assuming that non-surrogate characters with a code point >= 0x8000 are 5145 rare in most input. 5146 FAST_CHAR_MASK is used when the input is in native byte ordering, 5147 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering. 5148*/ 5149#if (SIZEOF_LONG == 8) 5150# define FAST_CHAR_MASK 0x8000800080008000L 5151# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L 5152#elif (SIZEOF_LONG == 4) 5153# define FAST_CHAR_MASK 0x80008000L 5154# define SWAPPED_FAST_CHAR_MASK 0x00800080L 5155#else 5156# error C 'long' size should be either 4 or 8! 5157#endif 5158 5159PyObject * 5160PyUnicode_DecodeUTF16Stateful(const char *s, 5161 Py_ssize_t size, 5162 const char *errors, 5163 int *byteorder, 5164 Py_ssize_t *consumed) 5165{ 5166 const char *starts = s; 5167 Py_ssize_t startinpos; 5168 Py_ssize_t endinpos; 5169 Py_ssize_t outpos; 5170 PyObject *unicode; 5171 Py_UNICODE *p; 5172 const unsigned char *q, *e, *aligned_end; 5173 int bo = 0; /* assume native ordering by default */ 5174 int native_ordering = 0; 5175 const char *errmsg = ""; 5176 /* Offsets from q for retrieving byte pairs in the right order. */ 5177#ifdef BYTEORDER_IS_LITTLE_ENDIAN 5178 int ihi = 1, ilo = 0; 5179#else 5180 int ihi = 0, ilo = 1; 5181#endif 5182 PyObject *errorHandler = NULL; 5183 PyObject *exc = NULL; 5184 5185 /* Note: size will always be longer than the resulting Unicode 5186 character count */ 5187 unicode = (PyObject*)_PyUnicode_New(size); 5188 if (!unicode) 5189 return NULL; 5190 if (size == 0) 5191 return unicode; 5192 5193 /* Unpack UTF-16 encoded data */ 5194 p = PyUnicode_AS_UNICODE(unicode); 5195 q = (unsigned char *)s; 5196 e = q + size - 1; 5197 5198 if (byteorder) 5199 bo = *byteorder; 5200 5201 /* Check for BOM marks (U+FEFF) in the input and adjust current 5202 byte order setting accordingly. In native mode, the leading BOM 5203 mark is skipped, in all other modes, it is copied to the output 5204 stream as-is (giving a ZWNBSP character). */ 5205 if (bo == 0) { 5206 if (size >= 2) { 5207 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo]; 5208#ifdef BYTEORDER_IS_LITTLE_ENDIAN 5209 if (bom == 0xFEFF) { 5210 q += 2; 5211 bo = -1; 5212 } 5213 else if (bom == 0xFFFE) { 5214 q += 2; 5215 bo = 1; 5216 } 5217#else 5218 if (bom == 0xFEFF) { 5219 q += 2; 5220 bo = 1; 5221 } 5222 else if (bom == 0xFFFE) { 5223 q += 2; 5224 bo = -1; 5225 } 5226#endif 5227 } 5228 } 5229 5230 if (bo == -1) { 5231 /* force LE */ 5232 ihi = 1; 5233 ilo = 0; 5234 } 5235 else if (bo == 1) { 5236 /* force BE */ 5237 ihi = 0; 5238 ilo = 1; 5239 } 5240#ifdef BYTEORDER_IS_LITTLE_ENDIAN 5241 native_ordering = ilo < ihi; 5242#else 5243 native_ordering = ilo > ihi; 5244#endif 5245 5246 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK); 5247 while (q < e) { 5248 Py_UNICODE ch; 5249 /* First check for possible aligned read of a C 'long'. Unaligned 5250 reads are more expensive, better to defer to another iteration. */ 5251 if (!((size_t) q & LONG_PTR_MASK)) { 5252 /* Fast path for runs of non-surrogate chars. */ 5253 register const unsigned char *_q = q; 5254 Py_UNICODE *_p = p; 5255 if (native_ordering) { 5256 /* Native ordering is simple: as long as the input cannot 5257 possibly contain a surrogate char, do an unrolled copy 5258 of several 16-bit code points to the target object. 5259 The non-surrogate check is done on several input bytes 5260 at a time (as many as a C 'long' can contain). */ 5261 while (_q < aligned_end) { 5262 unsigned long data = * (unsigned long *) _q; 5263 if (data & FAST_CHAR_MASK) 5264 break; 5265 _p[0] = ((unsigned short *) _q)[0]; 5266 _p[1] = ((unsigned short *) _q)[1]; 5267#if (SIZEOF_LONG == 8) 5268 _p[2] = ((unsigned short *) _q)[2]; 5269 _p[3] = ((unsigned short *) _q)[3]; 5270#endif 5271 _q += SIZEOF_LONG; 5272 _p += SIZEOF_LONG / 2; 5273 } 5274 } 5275 else { 5276 /* Byteswapped ordering is similar, but we must decompose 5277 the copy bytewise, and take care of zero'ing out the 5278 upper bytes if the target object is in 32-bit units 5279 (that is, in UCS-4 builds). */ 5280 while (_q < aligned_end) { 5281 unsigned long data = * (unsigned long *) _q; 5282 if (data & SWAPPED_FAST_CHAR_MASK) 5283 break; 5284 /* Zero upper bytes in UCS-4 builds */ 5285#if (Py_UNICODE_SIZE > 2) 5286 _p[0] = 0; 5287 _p[1] = 0; 5288#if (SIZEOF_LONG == 8) 5289 _p[2] = 0; 5290 _p[3] = 0; 5291#endif 5292#endif 5293 /* Issue #4916; UCS-4 builds on big endian machines must 5294 fill the two last bytes of each 4-byte unit. */ 5295#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2) 5296# define OFF 2 5297#else 5298# define OFF 0 5299#endif 5300 ((unsigned char *) _p)[OFF + 1] = _q[0]; 5301 ((unsigned char *) _p)[OFF + 0] = _q[1]; 5302 ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2]; 5303 ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3]; 5304#if (SIZEOF_LONG == 8) 5305 ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4]; 5306 ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5]; 5307 ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6]; 5308 ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7]; 5309#endif 5310#undef OFF 5311 _q += SIZEOF_LONG; 5312 _p += SIZEOF_LONG / 2; 5313 } 5314 } 5315 p = _p; 5316 q = _q; 5317 if (q >= e) 5318 break; 5319 } 5320 ch = (q[ihi] << 8) | q[ilo]; 5321 5322 q += 2; 5323 5324 if (ch < 0xD800 || ch > 0xDFFF) { 5325 *p++ = ch; 5326 continue; 5327 } 5328 5329 /* UTF-16 code pair: */ 5330 if (q > e) { 5331 errmsg = "unexpected end of data"; 5332 startinpos = (((const char *)q) - 2) - starts; 5333 endinpos = ((const char *)e) + 1 - starts; 5334 goto utf16Error; 5335 } 5336 if (0xD800 <= ch && ch <= 0xDBFF) { 5337 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo]; 5338 q += 2; 5339 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) { 5340#ifndef Py_UNICODE_WIDE 5341 *p++ = ch; 5342 *p++ = ch2; 5343#else 5344 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000; 5345#endif 5346 continue; 5347 } 5348 else { 5349 errmsg = "illegal UTF-16 surrogate"; 5350 startinpos = (((const char *)q)-4)-starts; 5351 endinpos = startinpos+2; 5352 goto utf16Error; 5353 } 5354 5355 } 5356 errmsg = "illegal encoding"; 5357 startinpos = (((const char *)q)-2)-starts; 5358 endinpos = startinpos+2; 5359 /* Fall through to report the error */ 5360 5361 utf16Error: 5362 outpos = p - PyUnicode_AS_UNICODE(unicode); 5363 if (unicode_decode_call_errorhandler( 5364 errors, 5365 &errorHandler, 5366 "utf16", errmsg, 5367 &starts, 5368 (const char **)&e, 5369 &startinpos, 5370 &endinpos, 5371 &exc, 5372 (const char **)&q, 5373 &unicode, 5374 &outpos, 5375 &p)) 5376 goto onError; 5377 } 5378 /* remaining byte at the end? (size should be even) */ 5379 if (e == q) { 5380 if (!consumed) { 5381 errmsg = "truncated data"; 5382 startinpos = ((const char *)q) - starts; 5383 endinpos = ((const char *)e) + 1 - starts; 5384 outpos = p - PyUnicode_AS_UNICODE(unicode); 5385 if (unicode_decode_call_errorhandler( 5386 errors, 5387 &errorHandler, 5388 "utf16", errmsg, 5389 &starts, 5390 (const char **)&e, 5391 &startinpos, 5392 &endinpos, 5393 &exc, 5394 (const char **)&q, 5395 &unicode, 5396 &outpos, 5397 &p)) 5398 goto onError; 5399 /* The remaining input chars are ignored if the callback 5400 chooses to skip the input */ 5401 } 5402 } 5403 5404 if (byteorder) 5405 *byteorder = bo; 5406 5407 if (consumed) 5408 *consumed = (const char *)q-starts; 5409 5410 /* Adjust length */ 5411 if (PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0) 5412 goto onError; 5413 5414 Py_XDECREF(errorHandler); 5415 Py_XDECREF(exc); 5416#ifndef DONT_MAKE_RESULT_READY 5417 if (_PyUnicode_READY_REPLACE(&unicode)) { 5418 Py_DECREF(unicode); 5419 return NULL; 5420 } 5421#endif 5422 assert(_PyUnicode_CheckConsistency(unicode, 1)); 5423 return unicode; 5424 5425 onError: 5426 Py_DECREF(unicode); 5427 Py_XDECREF(errorHandler); 5428 Py_XDECREF(exc); 5429 return NULL; 5430} 5431 5432#undef FAST_CHAR_MASK 5433#undef SWAPPED_FAST_CHAR_MASK 5434 5435PyObject * 5436PyUnicode_EncodeUTF16(const Py_UNICODE *s, 5437 Py_ssize_t size, 5438 const char *errors, 5439 int byteorder) 5440{ 5441 PyObject *v; 5442 unsigned char *p; 5443 Py_ssize_t nsize, bytesize; 5444#ifdef Py_UNICODE_WIDE 5445 Py_ssize_t i, pairs; 5446#else 5447 const int pairs = 0; 5448#endif 5449 /* Offsets from p for storing byte pairs in the right order. */ 5450#ifdef BYTEORDER_IS_LITTLE_ENDIAN 5451 int ihi = 1, ilo = 0; 5452#else 5453 int ihi = 0, ilo = 1; 5454#endif 5455 5456#define STORECHAR(CH) \ 5457 do { \ 5458 p[ihi] = ((CH) >> 8) & 0xff; \ 5459 p[ilo] = (CH) & 0xff; \ 5460 p += 2; \ 5461 } while(0) 5462 5463#ifdef Py_UNICODE_WIDE 5464 for (i = pairs = 0; i < size; i++) 5465 if (s[i] >= 0x10000) 5466 pairs++; 5467#endif 5468 /* 2 * (size + pairs + (byteorder == 0)) */ 5469 if (size > PY_SSIZE_T_MAX || 5470 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0)) 5471 return PyErr_NoMemory(); 5472 nsize = size + pairs + (byteorder == 0); 5473 bytesize = nsize * 2; 5474 if (bytesize / 2 != nsize) 5475 return PyErr_NoMemory(); 5476 v = PyBytes_FromStringAndSize(NULL, bytesize); 5477 if (v == NULL) 5478 return NULL; 5479 5480 p = (unsigned char *)PyBytes_AS_STRING(v); 5481 if (byteorder == 0) 5482 STORECHAR(0xFEFF); 5483 if (size == 0) 5484 goto done; 5485 5486 if (byteorder == -1) { 5487 /* force LE */ 5488 ihi = 1; 5489 ilo = 0; 5490 } 5491 else if (byteorder == 1) { 5492 /* force BE */ 5493 ihi = 0; 5494 ilo = 1; 5495 } 5496 5497 while (size-- > 0) { 5498 Py_UNICODE ch = *s++; 5499 Py_UNICODE ch2 = 0; 5500#ifdef Py_UNICODE_WIDE 5501 if (ch >= 0x10000) { 5502 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF); 5503 ch = 0xD800 | ((ch-0x10000) >> 10); 5504 } 5505#endif 5506 STORECHAR(ch); 5507 if (ch2) 5508 STORECHAR(ch2); 5509 } 5510 5511 done: 5512 return v; 5513#undef STORECHAR 5514} 5515 5516PyObject * 5517PyUnicode_AsUTF16String(PyObject *unicode) 5518{ 5519 if (!PyUnicode_Check(unicode)) { 5520 PyErr_BadArgument(); 5521 return NULL; 5522 } 5523 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode), 5524 PyUnicode_GET_SIZE(unicode), 5525 NULL, 5526 0); 5527} 5528 5529/* --- Unicode Escape Codec ----------------------------------------------- */ 5530 5531/* Helper function for PyUnicode_DecodeUnicodeEscape, determines 5532 if all the escapes in the string make it still a valid ASCII string. 5533 Returns -1 if any escapes were found which cause the string to 5534 pop out of ASCII range. Otherwise returns the length of the 5535 required buffer to hold the string. 5536 */ 5537static Py_ssize_t 5538length_of_escaped_ascii_string(const char *s, Py_ssize_t size) 5539{ 5540 const unsigned char *p = (const unsigned char *)s; 5541 const unsigned char *end = p + size; 5542 Py_ssize_t length = 0; 5543 5544 if (size < 0) 5545 return -1; 5546 5547 for (; p < end; ++p) { 5548 if (*p > 127) { 5549 /* Non-ASCII */ 5550 return -1; 5551 } 5552 else if (*p != '\\') { 5553 /* Normal character */ 5554 ++length; 5555 } 5556 else { 5557 /* Backslash-escape, check next char */ 5558 ++p; 5559 /* Escape sequence reaches till end of string or 5560 non-ASCII follow-up. */ 5561 if (p >= end || *p > 127) 5562 return -1; 5563 switch (*p) { 5564 case '\n': 5565 /* backslash + \n result in zero characters */ 5566 break; 5567 case '\\': case '\'': case '\"': 5568 case 'b': case 'f': case 't': 5569 case 'n': case 'r': case 'v': case 'a': 5570 ++length; 5571 break; 5572 case '0': case '1': case '2': case '3': 5573 case '4': case '5': case '6': case '7': 5574 case 'x': case 'u': case 'U': case 'N': 5575 /* these do not guarantee ASCII characters */ 5576 return -1; 5577 default: 5578 /* count the backslash + the other character */ 5579 length += 2; 5580 } 5581 } 5582 } 5583 return length; 5584} 5585 5586/* Similar to PyUnicode_WRITE but either write into wstr field 5587 or treat string as ASCII. */ 5588#define WRITE_ASCII_OR_WSTR(kind, buf, index, value) \ 5589 do { \ 5590 if ((kind) != PyUnicode_WCHAR_KIND) \ 5591 ((unsigned char *)(buf))[(index)] = (unsigned char)(value); \ 5592 else \ 5593 ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value); \ 5594 } while (0) 5595 5596#define WRITE_WSTR(buf, index, value) \ 5597 assert(kind == PyUnicode_WCHAR_KIND), \ 5598 ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value) 5599 5600 5601static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL; 5602 5603PyObject * 5604PyUnicode_DecodeUnicodeEscape(const char *s, 5605 Py_ssize_t size, 5606 const char *errors) 5607{ 5608 const char *starts = s; 5609 Py_ssize_t startinpos; 5610 Py_ssize_t endinpos; 5611 int j; 5612 PyObject *v; 5613 Py_UNICODE *p; 5614 const char *end; 5615 char* message; 5616 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */ 5617 PyObject *errorHandler = NULL; 5618 PyObject *exc = NULL; 5619 Py_ssize_t ascii_length; 5620 Py_ssize_t i; 5621 int kind; 5622 void *data; 5623 5624 ascii_length = length_of_escaped_ascii_string(s, size); 5625 5626 /* After length_of_escaped_ascii_string() there are two alternatives, 5627 either the string is pure ASCII with named escapes like \n, etc. 5628 and we determined it's exact size (common case) 5629 or it contains \x, \u, ... escape sequences. then we create a 5630 legacy wchar string and resize it at the end of this function. */ 5631 if (ascii_length >= 0) { 5632 v = PyUnicode_New(ascii_length, 127); 5633 if (!v) 5634 goto onError; 5635 assert(PyUnicode_KIND(v) == PyUnicode_1BYTE_KIND); 5636 kind = PyUnicode_1BYTE_KIND; 5637 data = PyUnicode_DATA(v); 5638 } 5639 else { 5640 /* Escaped strings will always be longer than the resulting 5641 Unicode string, so we start with size here and then reduce the 5642 length after conversion to the true value. 5643 (but if the error callback returns a long replacement string 5644 we'll have to allocate more space) */ 5645 v = (PyObject*)_PyUnicode_New(size); 5646 if (!v) 5647 goto onError; 5648 kind = PyUnicode_WCHAR_KIND; 5649 data = PyUnicode_AS_UNICODE(v); 5650 } 5651 5652 if (size == 0) 5653 return v; 5654 i = 0; 5655 end = s + size; 5656 5657 while (s < end) { 5658 unsigned char c; 5659 Py_UNICODE x; 5660 int digits; 5661 5662 if (kind == PyUnicode_WCHAR_KIND) { 5663 assert(i < _PyUnicode_WSTR_LENGTH(v)); 5664 } 5665 else { 5666 /* The only case in which i == ascii_length is a backslash 5667 followed by a newline. */ 5668 assert(i <= ascii_length); 5669 } 5670 5671 /* Non-escape characters are interpreted as Unicode ordinals */ 5672 if (*s != '\\') { 5673 WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char) *s++); 5674 continue; 5675 } 5676 5677 startinpos = s-starts; 5678 /* \ - Escapes */ 5679 s++; 5680 c = *s++; 5681 if (s > end) 5682 c = '\0'; /* Invalid after \ */ 5683 5684 if (kind == PyUnicode_WCHAR_KIND) { 5685 assert(i < _PyUnicode_WSTR_LENGTH(v)); 5686 } 5687 else { 5688 /* The only case in which i == ascii_length is a backslash 5689 followed by a newline. */ 5690 assert(i < ascii_length || (i == ascii_length && c == '\n')); 5691 } 5692 5693 switch (c) { 5694 5695 /* \x escapes */ 5696 case '\n': break; 5697 case '\\': WRITE_ASCII_OR_WSTR(kind, data, i++, '\\'); break; 5698 case '\'': WRITE_ASCII_OR_WSTR(kind, data, i++, '\''); break; 5699 case '\"': WRITE_ASCII_OR_WSTR(kind, data, i++, '\"'); break; 5700 case 'b': WRITE_ASCII_OR_WSTR(kind, data, i++, '\b'); break; 5701 /* FF */ 5702 case 'f': WRITE_ASCII_OR_WSTR(kind, data, i++, '\014'); break; 5703 case 't': WRITE_ASCII_OR_WSTR(kind, data, i++, '\t'); break; 5704 case 'n': WRITE_ASCII_OR_WSTR(kind, data, i++, '\n'); break; 5705 case 'r': WRITE_ASCII_OR_WSTR(kind, data, i++, '\r'); break; 5706 /* VT */ 5707 case 'v': WRITE_ASCII_OR_WSTR(kind, data, i++, '\013'); break; 5708 /* BEL, not classic C */ 5709 case 'a': WRITE_ASCII_OR_WSTR(kind, data, i++, '\007'); break; 5710 5711 /* \OOO (octal) escapes */ 5712 case '0': case '1': case '2': case '3': 5713 case '4': case '5': case '6': case '7': 5714 x = s[-1] - '0'; 5715 if (s < end && '0' <= *s && *s <= '7') { 5716 x = (x<<3) + *s++ - '0'; 5717 if (s < end && '0' <= *s && *s <= '7') 5718 x = (x<<3) + *s++ - '0'; 5719 } 5720 WRITE_WSTR(data, i++, x); 5721 break; 5722 5723 /* hex escapes */ 5724 /* \xXX */ 5725 case 'x': 5726 digits = 2; 5727 message = "truncated \\xXX escape"; 5728 goto hexescape; 5729 5730 /* \uXXXX */ 5731 case 'u': 5732 digits = 4; 5733 message = "truncated \\uXXXX escape"; 5734 goto hexescape; 5735 5736 /* \UXXXXXXXX */ 5737 case 'U': 5738 digits = 8; 5739 message = "truncated \\UXXXXXXXX escape"; 5740 hexescape: 5741 chr = 0; 5742 p = PyUnicode_AS_UNICODE(v) + i; 5743 if (s+digits>end) { 5744 endinpos = size; 5745 if (unicode_decode_call_errorhandler( 5746 errors, &errorHandler, 5747 "unicodeescape", "end of string in escape sequence", 5748 &starts, &end, &startinpos, &endinpos, &exc, &s, 5749 &v, &i, &p)) 5750 goto onError; 5751 data = PyUnicode_AS_UNICODE(v); 5752 goto nextByte; 5753 } 5754 for (j = 0; j < digits; ++j) { 5755 c = (unsigned char) s[j]; 5756 if (!Py_ISXDIGIT(c)) { 5757 endinpos = (s+j+1)-starts; 5758 p = PyUnicode_AS_UNICODE(v) + i; 5759 if (unicode_decode_call_errorhandler( 5760 errors, &errorHandler, 5761 "unicodeescape", message, 5762 &starts, &end, &startinpos, &endinpos, &exc, &s, 5763 &v, &i, &p)) 5764 goto onError; 5765 data = PyUnicode_AS_UNICODE(v); 5766 goto nextByte; 5767 } 5768 chr = (chr<<4) & ~0xF; 5769 if (c >= '0' && c <= '9') 5770 chr += c - '0'; 5771 else if (c >= 'a' && c <= 'f') 5772 chr += 10 + c - 'a'; 5773 else 5774 chr += 10 + c - 'A'; 5775 } 5776 s += j; 5777 if (chr == 0xffffffff && PyErr_Occurred()) 5778 /* _decoding_error will have already written into the 5779 target buffer. */ 5780 break; 5781 store: 5782 /* when we get here, chr is a 32-bit unicode character */ 5783 if (chr <= 0xffff) 5784 /* UCS-2 character */ 5785 WRITE_WSTR(data, i++, chr); 5786 else if (chr <= 0x10ffff) { 5787 /* UCS-4 character. Either store directly, or as 5788 surrogate pair. */ 5789#ifdef Py_UNICODE_WIDE 5790 WRITE_WSTR(data, i++, chr); 5791#else 5792 chr -= 0x10000L; 5793 WRITE_WSTR(data, i++, 0xD800 + (Py_UNICODE) (chr >> 10)); 5794 WRITE_WSTR(data, i++, 0xDC00 + (Py_UNICODE) (chr & 0x03FF)); 5795#endif 5796 } else { 5797 endinpos = s-starts; 5798 p = PyUnicode_AS_UNICODE(v) + i; 5799 if (unicode_decode_call_errorhandler( 5800 errors, &errorHandler, 5801 "unicodeescape", "illegal Unicode character", 5802 &starts, &end, &startinpos, &endinpos, &exc, &s, 5803 &v, &i, &p)) 5804 goto onError; 5805 data = PyUnicode_AS_UNICODE(v); 5806 } 5807 break; 5808 5809 /* \N{name} */ 5810 case 'N': 5811 message = "malformed \\N character escape"; 5812 if (ucnhash_CAPI == NULL) { 5813 /* load the unicode data module */ 5814 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import( 5815 PyUnicodeData_CAPSULE_NAME, 1); 5816 if (ucnhash_CAPI == NULL) 5817 goto ucnhashError; 5818 } 5819 if (*s == '{') { 5820 const char *start = s+1; 5821 /* look for the closing brace */ 5822 while (*s != '}' && s < end) 5823 s++; 5824 if (s > start && s < end && *s == '}') { 5825 /* found a name. look it up in the unicode database */ 5826 message = "unknown Unicode character name"; 5827 s++; 5828 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), 5829 &chr, 0)) 5830 goto store; 5831 } 5832 } 5833 endinpos = s-starts; 5834 p = PyUnicode_AS_UNICODE(v) + i; 5835 if (unicode_decode_call_errorhandler( 5836 errors, &errorHandler, 5837 "unicodeescape", message, 5838 &starts, &end, &startinpos, &endinpos, &exc, &s, 5839 &v, &i, &p)) 5840 goto onError; 5841 data = PyUnicode_AS_UNICODE(v); 5842 break; 5843 5844 default: 5845 if (s > end) { 5846 assert(kind == PyUnicode_WCHAR_KIND); 5847 message = "\\ at end of string"; 5848 s--; 5849 endinpos = s-starts; 5850 p = PyUnicode_AS_UNICODE(v) + i; 5851 if (unicode_decode_call_errorhandler( 5852 errors, &errorHandler, 5853 "unicodeescape", message, 5854 &starts, &end, &startinpos, &endinpos, &exc, &s, 5855 &v, &i, &p)) 5856 goto onError; 5857 data = PyUnicode_AS_UNICODE(v); 5858 } 5859 else { 5860 WRITE_ASCII_OR_WSTR(kind, data, i++, '\\'); 5861 WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char)s[-1]); 5862 } 5863 break; 5864 } 5865 nextByte: 5866 ; 5867 } 5868 /* Ensure the length prediction worked in case of ASCII strings */ 5869 assert(kind == PyUnicode_WCHAR_KIND || i == ascii_length); 5870 5871 if (kind == PyUnicode_WCHAR_KIND) 5872 { 5873 if (PyUnicode_Resize(&v, i) < 0) 5874 goto onError; 5875 } 5876 Py_XDECREF(errorHandler); 5877 Py_XDECREF(exc); 5878#ifndef DONT_MAKE_RESULT_READY 5879 if (_PyUnicode_READY_REPLACE(&v)) { 5880 Py_DECREF(v); 5881 return NULL; 5882 } 5883#endif 5884 assert(_PyUnicode_CheckConsistency(v, 1)); 5885 return v; 5886 5887 ucnhashError: 5888 PyErr_SetString( 5889 PyExc_UnicodeError, 5890 "\\N escapes not supported (can't load unicodedata module)" 5891 ); 5892 Py_XDECREF(v); 5893 Py_XDECREF(errorHandler); 5894 Py_XDECREF(exc); 5895 return NULL; 5896 5897 onError: 5898 Py_XDECREF(v); 5899 Py_XDECREF(errorHandler); 5900 Py_XDECREF(exc); 5901 return NULL; 5902} 5903 5904#undef WRITE_ASCII_OR_WSTR 5905#undef WRITE_WSTR 5906 5907/* Return a Unicode-Escape string version of the Unicode object. 5908 5909 If quotes is true, the string is enclosed in u"" or u'' quotes as 5910 appropriate. 5911 5912*/ 5913 5914PyObject * 5915PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s, 5916 Py_ssize_t size) 5917{ 5918 PyObject *repr; 5919 char *p; 5920 5921#ifdef Py_UNICODE_WIDE 5922 const Py_ssize_t expandsize = 10; 5923#else 5924 const Py_ssize_t expandsize = 6; 5925#endif 5926 5927 /* XXX(nnorwitz): rather than over-allocating, it would be 5928 better to choose a different scheme. Perhaps scan the 5929 first N-chars of the string and allocate based on that size. 5930 */ 5931 /* Initial allocation is based on the longest-possible unichr 5932 escape. 5933 5934 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source 5935 unichr, so in this case it's the longest unichr escape. In 5936 narrow (UTF-16) builds this is five chars per source unichr 5937 since there are two unichrs in the surrogate pair, so in narrow 5938 (UTF-16) builds it's not the longest unichr escape. 5939 5940 In wide or narrow builds '\uxxxx' is 6 chars per source unichr, 5941 so in the narrow (UTF-16) build case it's the longest unichr 5942 escape. 5943 */ 5944 5945 if (size == 0) 5946 return PyBytes_FromStringAndSize(NULL, 0); 5947 5948 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize) 5949 return PyErr_NoMemory(); 5950 5951 repr = PyBytes_FromStringAndSize(NULL, 5952 2 5953 + expandsize*size 5954 + 1); 5955 if (repr == NULL) 5956 return NULL; 5957 5958 p = PyBytes_AS_STRING(repr); 5959 5960 while (size-- > 0) { 5961 Py_UNICODE ch = *s++; 5962 5963 /* Escape backslashes */ 5964 if (ch == '\\') { 5965 *p++ = '\\'; 5966 *p++ = (char) ch; 5967 continue; 5968 } 5969 5970#ifdef Py_UNICODE_WIDE 5971 /* Map 21-bit characters to '\U00xxxxxx' */ 5972 else if (ch >= 0x10000) { 5973 *p++ = '\\'; 5974 *p++ = 'U'; 5975 *p++ = Py_hexdigits[(ch >> 28) & 0x0000000F]; 5976 *p++ = Py_hexdigits[(ch >> 24) & 0x0000000F]; 5977 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F]; 5978 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F]; 5979 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F]; 5980 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F]; 5981 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F]; 5982 *p++ = Py_hexdigits[ch & 0x0000000F]; 5983 continue; 5984 } 5985#else 5986 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */ 5987 else if (ch >= 0xD800 && ch < 0xDC00) { 5988 Py_UNICODE ch2; 5989 Py_UCS4 ucs; 5990 5991 ch2 = *s++; 5992 size--; 5993 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) { 5994 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000; 5995 *p++ = '\\'; 5996 *p++ = 'U'; 5997 *p++ = Py_hexdigits[(ucs >> 28) & 0x0000000F]; 5998 *p++ = Py_hexdigits[(ucs >> 24) & 0x0000000F]; 5999 *p++ = Py_hexdigits[(ucs >> 20) & 0x0000000F]; 6000 *p++ = Py_hexdigits[(ucs >> 16) & 0x0000000F]; 6001 *p++ = Py_hexdigits[(ucs >> 12) & 0x0000000F]; 6002 *p++ = Py_hexdigits[(ucs >> 8) & 0x0000000F]; 6003 *p++ = Py_hexdigits[(ucs >> 4) & 0x0000000F]; 6004 *p++ = Py_hexdigits[ucs & 0x0000000F]; 6005 continue; 6006 } 6007 /* Fall through: isolated surrogates are copied as-is */ 6008 s--; 6009 size++; 6010 } 6011#endif 6012 6013 /* Map 16-bit characters to '\uxxxx' */ 6014 if (ch >= 256) { 6015 *p++ = '\\'; 6016 *p++ = 'u'; 6017 *p++ = Py_hexdigits[(ch >> 12) & 0x000F]; 6018 *p++ = Py_hexdigits[(ch >> 8) & 0x000F]; 6019 *p++ = Py_hexdigits[(ch >> 4) & 0x000F]; 6020 *p++ = Py_hexdigits[ch & 0x000F]; 6021 } 6022 6023 /* Map special whitespace to '\t', \n', '\r' */ 6024 else if (ch == '\t') { 6025 *p++ = '\\'; 6026 *p++ = 't'; 6027 } 6028 else if (ch == '\n') { 6029 *p++ = '\\'; 6030 *p++ = 'n'; 6031 } 6032 else if (ch == '\r') { 6033 *p++ = '\\'; 6034 *p++ = 'r'; 6035 } 6036 6037 /* Map non-printable US ASCII to '\xhh' */ 6038 else if (ch < ' ' || ch >= 0x7F) { 6039 *p++ = '\\'; 6040 *p++ = 'x'; 6041 *p++ = Py_hexdigits[(ch >> 4) & 0x000F]; 6042 *p++ = Py_hexdigits[ch & 0x000F]; 6043 } 6044 6045 /* Copy everything else as-is */ 6046 else 6047 *p++ = (char) ch; 6048 } 6049 6050 assert(p - PyBytes_AS_STRING(repr) > 0); 6051 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) 6052 return NULL; 6053 return repr; 6054} 6055 6056PyObject * 6057PyUnicode_AsUnicodeEscapeString(PyObject *unicode) 6058{ 6059 PyObject *s; 6060 if (!PyUnicode_Check(unicode)) { 6061 PyErr_BadArgument(); 6062 return NULL; 6063 } 6064 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode), 6065 PyUnicode_GET_SIZE(unicode)); 6066 return s; 6067} 6068 6069/* --- Raw Unicode Escape Codec ------------------------------------------- */ 6070 6071PyObject * 6072PyUnicode_DecodeRawUnicodeEscape(const char *s, 6073 Py_ssize_t size, 6074 const char *errors) 6075{ 6076 const char *starts = s; 6077 Py_ssize_t startinpos; 6078 Py_ssize_t endinpos; 6079 Py_ssize_t outpos; 6080 PyObject *v; 6081 Py_UNICODE *p; 6082 const char *end; 6083 const char *bs; 6084 PyObject *errorHandler = NULL; 6085 PyObject *exc = NULL; 6086 6087 /* Escaped strings will always be longer than the resulting 6088 Unicode string, so we start with size here and then reduce the 6089 length after conversion to the true value. (But decoding error 6090 handler might have to resize the string) */ 6091 v = (PyObject*)_PyUnicode_New(size); 6092 if (v == NULL) 6093 goto onError; 6094 if (size == 0) 6095 return v; 6096 p = PyUnicode_AS_UNICODE(v); 6097 end = s + size; 6098 while (s < end) { 6099 unsigned char c; 6100 Py_UCS4 x; 6101 int i; 6102 int count; 6103 6104 /* Non-escape characters are interpreted as Unicode ordinals */ 6105 if (*s != '\\') { 6106 *p++ = (unsigned char)*s++; 6107 continue; 6108 } 6109 startinpos = s-starts; 6110 6111 /* \u-escapes are only interpreted iff the number of leading 6112 backslashes if odd */ 6113 bs = s; 6114 for (;s < end;) { 6115 if (*s != '\\') 6116 break; 6117 *p++ = (unsigned char)*s++; 6118 } 6119 if (((s - bs) & 1) == 0 || 6120 s >= end || 6121 (*s != 'u' && *s != 'U')) { 6122 continue; 6123 } 6124 p--; 6125 count = *s=='u' ? 4 : 8; 6126 s++; 6127 6128 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */ 6129 outpos = p-PyUnicode_AS_UNICODE(v); 6130 for (x = 0, i = 0; i < count; ++i, ++s) { 6131 c = (unsigned char)*s; 6132 if (!Py_ISXDIGIT(c)) { 6133 endinpos = s-starts; 6134 if (unicode_decode_call_errorhandler( 6135 errors, &errorHandler, 6136 "rawunicodeescape", "truncated \\uXXXX", 6137 &starts, &end, &startinpos, &endinpos, &exc, &s, 6138 &v, &outpos, &p)) 6139 goto onError; 6140 goto nextByte; 6141 } 6142 x = (x<<4) & ~0xF; 6143 if (c >= '0' && c <= '9') 6144 x += c - '0'; 6145 else if (c >= 'a' && c <= 'f') 6146 x += 10 + c - 'a'; 6147 else 6148 x += 10 + c - 'A'; 6149 } 6150 if (x <= 0xffff) 6151 /* UCS-2 character */ 6152 *p++ = (Py_UNICODE) x; 6153 else if (x <= 0x10ffff) { 6154 /* UCS-4 character. Either store directly, or as 6155 surrogate pair. */ 6156#ifdef Py_UNICODE_WIDE 6157 *p++ = (Py_UNICODE) x; 6158#else 6159 x -= 0x10000L; 6160 *p++ = 0xD800 + (Py_UNICODE) (x >> 10); 6161 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF); 6162#endif 6163 } else { 6164 endinpos = s-starts; 6165 outpos = p-PyUnicode_AS_UNICODE(v); 6166 if (unicode_decode_call_errorhandler( 6167 errors, &errorHandler, 6168 "rawunicodeescape", "\\Uxxxxxxxx out of range", 6169 &starts, &end, &startinpos, &endinpos, &exc, &s, 6170 &v, &outpos, &p)) 6171 goto onError; 6172 } 6173 nextByte: 6174 ; 6175 } 6176 if (PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0) 6177 goto onError; 6178 Py_XDECREF(errorHandler); 6179 Py_XDECREF(exc); 6180#ifndef DONT_MAKE_RESULT_READY 6181 if (_PyUnicode_READY_REPLACE(&v)) { 6182 Py_DECREF(v); 6183 return NULL; 6184 } 6185#endif 6186 assert(_PyUnicode_CheckConsistency(v, 1)); 6187 return v; 6188 6189 onError: 6190 Py_XDECREF(v); 6191 Py_XDECREF(errorHandler); 6192 Py_XDECREF(exc); 6193 return NULL; 6194} 6195 6196PyObject * 6197PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s, 6198 Py_ssize_t size) 6199{ 6200 PyObject *repr; 6201 char *p; 6202 char *q; 6203 6204#ifdef Py_UNICODE_WIDE 6205 const Py_ssize_t expandsize = 10; 6206#else 6207 const Py_ssize_t expandsize = 6; 6208#endif 6209 6210 if (size > PY_SSIZE_T_MAX / expandsize) 6211 return PyErr_NoMemory(); 6212 6213 repr = PyBytes_FromStringAndSize(NULL, expandsize * size); 6214 if (repr == NULL) 6215 return NULL; 6216 if (size == 0) 6217 return repr; 6218 6219 p = q = PyBytes_AS_STRING(repr); 6220 while (size-- > 0) { 6221 Py_UNICODE ch = *s++; 6222#ifdef Py_UNICODE_WIDE 6223 /* Map 32-bit characters to '\Uxxxxxxxx' */ 6224 if (ch >= 0x10000) { 6225 *p++ = '\\'; 6226 *p++ = 'U'; 6227 *p++ = Py_hexdigits[(ch >> 28) & 0xf]; 6228 *p++ = Py_hexdigits[(ch >> 24) & 0xf]; 6229 *p++ = Py_hexdigits[(ch >> 20) & 0xf]; 6230 *p++ = Py_hexdigits[(ch >> 16) & 0xf]; 6231 *p++ = Py_hexdigits[(ch >> 12) & 0xf]; 6232 *p++ = Py_hexdigits[(ch >> 8) & 0xf]; 6233 *p++ = Py_hexdigits[(ch >> 4) & 0xf]; 6234 *p++ = Py_hexdigits[ch & 15]; 6235 } 6236 else 6237#else 6238 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */ 6239 if (ch >= 0xD800 && ch < 0xDC00) { 6240 Py_UNICODE ch2; 6241 Py_UCS4 ucs; 6242 6243 ch2 = *s++; 6244 size--; 6245 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) { 6246 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000; 6247 *p++ = '\\'; 6248 *p++ = 'U'; 6249 *p++ = Py_hexdigits[(ucs >> 28) & 0xf]; 6250 *p++ = Py_hexdigits[(ucs >> 24) & 0xf]; 6251 *p++ = Py_hexdigits[(ucs >> 20) & 0xf]; 6252 *p++ = Py_hexdigits[(ucs >> 16) & 0xf]; 6253 *p++ = Py_hexdigits[(ucs >> 12) & 0xf]; 6254 *p++ = Py_hexdigits[(ucs >> 8) & 0xf]; 6255 *p++ = Py_hexdigits[(ucs >> 4) & 0xf]; 6256 *p++ = Py_hexdigits[ucs & 0xf]; 6257 continue; 6258 } 6259 /* Fall through: isolated surrogates are copied as-is */ 6260 s--; 6261 size++; 6262 } 6263#endif 6264 /* Map 16-bit characters to '\uxxxx' */ 6265 if (ch >= 256) { 6266 *p++ = '\\'; 6267 *p++ = 'u'; 6268 *p++ = Py_hexdigits[(ch >> 12) & 0xf]; 6269 *p++ = Py_hexdigits[(ch >> 8) & 0xf]; 6270 *p++ = Py_hexdigits[(ch >> 4) & 0xf]; 6271 *p++ = Py_hexdigits[ch & 15]; 6272 } 6273 /* Copy everything else as-is */ 6274 else 6275 *p++ = (char) ch; 6276 } 6277 size = p - q; 6278 6279 assert(size > 0); 6280 if (_PyBytes_Resize(&repr, size) < 0) 6281 return NULL; 6282 return repr; 6283} 6284 6285PyObject * 6286PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode) 6287{ 6288 PyObject *s; 6289 if (!PyUnicode_Check(unicode)) { 6290 PyErr_BadArgument(); 6291 return NULL; 6292 } 6293 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode), 6294 PyUnicode_GET_SIZE(unicode)); 6295 6296 return s; 6297} 6298 6299/* --- Unicode Internal Codec ------------------------------------------- */ 6300 6301PyObject * 6302_PyUnicode_DecodeUnicodeInternal(const char *s, 6303 Py_ssize_t size, 6304 const char *errors) 6305{ 6306 const char *starts = s; 6307 Py_ssize_t startinpos; 6308 Py_ssize_t endinpos; 6309 Py_ssize_t outpos; 6310 PyObject *v; 6311 Py_UNICODE *p; 6312 const char *end; 6313 const char *reason; 6314 PyObject *errorHandler = NULL; 6315 PyObject *exc = NULL; 6316 6317#ifdef Py_UNICODE_WIDE 6318 Py_UNICODE unimax = PyUnicode_GetMax(); 6319#endif 6320 6321 /* XXX overflow detection missing */ 6322 v = (PyObject*)_PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE); 6323 if (v == NULL) 6324 goto onError; 6325 /* Intentionally PyUnicode_GET_SIZE instead of PyUnicode_GET_LENGTH 6326 as string was created with the old API. */ 6327 if (PyUnicode_GET_SIZE(v) == 0) 6328 return v; 6329 p = PyUnicode_AS_UNICODE(v); 6330 end = s + size; 6331 6332 while (s < end) { 6333 memcpy(p, s, sizeof(Py_UNICODE)); 6334 /* We have to sanity check the raw data, otherwise doom looms for 6335 some malformed UCS-4 data. */ 6336 if ( 6337#ifdef Py_UNICODE_WIDE 6338 *p > unimax || *p < 0 || 6339#endif 6340 end-s < Py_UNICODE_SIZE 6341 ) 6342 { 6343 startinpos = s - starts; 6344 if (end-s < Py_UNICODE_SIZE) { 6345 endinpos = end-starts; 6346 reason = "truncated input"; 6347 } 6348 else { 6349 endinpos = s - starts + Py_UNICODE_SIZE; 6350 reason = "illegal code point (> 0x10FFFF)"; 6351 } 6352 outpos = p - PyUnicode_AS_UNICODE(v); 6353 if (unicode_decode_call_errorhandler( 6354 errors, &errorHandler, 6355 "unicode_internal", reason, 6356 &starts, &end, &startinpos, &endinpos, &exc, &s, 6357 &v, &outpos, &p)) { 6358 goto onError; 6359 } 6360 } 6361 else { 6362 p++; 6363 s += Py_UNICODE_SIZE; 6364 } 6365 } 6366 6367 if (PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0) 6368 goto onError; 6369 Py_XDECREF(errorHandler); 6370 Py_XDECREF(exc); 6371#ifndef DONT_MAKE_RESULT_READY 6372 if (_PyUnicode_READY_REPLACE(&v)) { 6373 Py_DECREF(v); 6374 return NULL; 6375 } 6376#endif 6377 assert(_PyUnicode_CheckConsistency(v, 1)); 6378 return v; 6379 6380 onError: 6381 Py_XDECREF(v); 6382 Py_XDECREF(errorHandler); 6383 Py_XDECREF(exc); 6384 return NULL; 6385} 6386 6387/* --- Latin-1 Codec ------------------------------------------------------ */ 6388 6389PyObject * 6390PyUnicode_DecodeLatin1(const char *s, 6391 Py_ssize_t size, 6392 const char *errors) 6393{ 6394 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */ 6395 return _PyUnicode_FromUCS1((unsigned char*)s, size); 6396} 6397 6398/* create or adjust a UnicodeEncodeError */ 6399static void 6400make_encode_exception(PyObject **exceptionObject, 6401 const char *encoding, 6402 PyObject *unicode, 6403 Py_ssize_t startpos, Py_ssize_t endpos, 6404 const char *reason) 6405{ 6406 if (*exceptionObject == NULL) { 6407 *exceptionObject = PyObject_CallFunction( 6408 PyExc_UnicodeEncodeError, "sOnns", 6409 encoding, unicode, startpos, endpos, reason); 6410 } 6411 else { 6412 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos)) 6413 goto onError; 6414 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos)) 6415 goto onError; 6416 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason)) 6417 goto onError; 6418 return; 6419 onError: 6420 Py_DECREF(*exceptionObject); 6421 *exceptionObject = NULL; 6422 } 6423} 6424 6425/* raises a UnicodeEncodeError */ 6426static void 6427raise_encode_exception(PyObject **exceptionObject, 6428 const char *encoding, 6429 PyObject *unicode, 6430 Py_ssize_t startpos, Py_ssize_t endpos, 6431 const char *reason) 6432{ 6433 make_encode_exception(exceptionObject, 6434 encoding, unicode, startpos, endpos, reason); 6435 if (*exceptionObject != NULL) 6436 PyCodec_StrictErrors(*exceptionObject); 6437} 6438 6439/* error handling callback helper: 6440 build arguments, call the callback and check the arguments, 6441 put the result into newpos and return the replacement string, which 6442 has to be freed by the caller */ 6443static PyObject * 6444unicode_encode_call_errorhandler(const char *errors, 6445 PyObject **errorHandler, 6446 const char *encoding, const char *reason, 6447 PyObject *unicode, PyObject **exceptionObject, 6448 Py_ssize_t startpos, Py_ssize_t endpos, 6449 Py_ssize_t *newpos) 6450{ 6451 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple"; 6452 Py_ssize_t len; 6453 PyObject *restuple; 6454 PyObject *resunicode; 6455 6456 if (*errorHandler == NULL) { 6457 *errorHandler = PyCodec_LookupError(errors); 6458 if (*errorHandler == NULL) 6459 return NULL; 6460 } 6461 6462 if (PyUnicode_READY(unicode) < 0) 6463 return NULL; 6464 len = PyUnicode_GET_LENGTH(unicode); 6465 6466 make_encode_exception(exceptionObject, 6467 encoding, unicode, startpos, endpos, reason); 6468 if (*exceptionObject == NULL) 6469 return NULL; 6470 6471 restuple = PyObject_CallFunctionObjArgs( 6472 *errorHandler, *exceptionObject, NULL); 6473 if (restuple == NULL) 6474 return NULL; 6475 if (!PyTuple_Check(restuple)) { 6476 PyErr_SetString(PyExc_TypeError, &argparse[3]); 6477 Py_DECREF(restuple); 6478 return NULL; 6479 } 6480 if (!PyArg_ParseTuple(restuple, argparse, 6481 &resunicode, newpos)) { 6482 Py_DECREF(restuple); 6483 return NULL; 6484 } 6485 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) { 6486 PyErr_SetString(PyExc_TypeError, &argparse[3]); 6487 Py_DECREF(restuple); 6488 return NULL; 6489 } 6490 if (*newpos<0) 6491 *newpos = len + *newpos; 6492 if (*newpos<0 || *newpos>len) { 6493 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos); 6494 Py_DECREF(restuple); 6495 return NULL; 6496 } 6497 Py_INCREF(resunicode); 6498 Py_DECREF(restuple); 6499 return resunicode; 6500} 6501 6502static PyObject * 6503unicode_encode_ucs1(PyObject *unicode, 6504 const char *errors, 6505 unsigned int limit) 6506{ 6507 /* input state */ 6508 Py_ssize_t pos=0, size; 6509 int kind; 6510 void *data; 6511 /* output object */ 6512 PyObject *res; 6513 /* pointer into the output */ 6514 char *str; 6515 /* current output position */ 6516 Py_ssize_t ressize; 6517 const char *encoding = (limit == 256) ? "latin-1" : "ascii"; 6518 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)"; 6519 PyObject *errorHandler = NULL; 6520 PyObject *exc = NULL; 6521 /* the following variable is used for caching string comparisons 6522 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */ 6523 int known_errorHandler = -1; 6524 6525 if (PyUnicode_READY(unicode) < 0) 6526 return NULL; 6527 size = PyUnicode_GET_LENGTH(unicode); 6528 kind = PyUnicode_KIND(unicode); 6529 data = PyUnicode_DATA(unicode); 6530 /* allocate enough for a simple encoding without 6531 replacements, if we need more, we'll resize */ 6532 if (size == 0) 6533 return PyBytes_FromStringAndSize(NULL, 0); 6534 res = PyBytes_FromStringAndSize(NULL, size); 6535 if (res == NULL) 6536 return NULL; 6537 str = PyBytes_AS_STRING(res); 6538 ressize = size; 6539 6540 while (pos < size) { 6541 Py_UCS4 c = PyUnicode_READ(kind, data, pos); 6542 6543 /* can we encode this? */ 6544 if (c<limit) { 6545 /* no overflow check, because we know that the space is enough */ 6546 *str++ = (char)c; 6547 ++pos; 6548 } 6549 else { 6550 Py_ssize_t requiredsize; 6551 PyObject *repunicode; 6552 Py_ssize_t repsize, newpos, respos, i; 6553 /* startpos for collecting unencodable chars */ 6554 Py_ssize_t collstart = pos; 6555 Py_ssize_t collend = pos; 6556 /* find all unecodable characters */ 6557 while ((collend < size) && (PyUnicode_READ(kind, data, collend)>=limit)) 6558 ++collend; 6559 /* cache callback name lookup (if not done yet, i.e. it's the first error) */ 6560 if (known_errorHandler==-1) { 6561 if ((errors==NULL) || (!strcmp(errors, "strict"))) 6562 known_errorHandler = 1; 6563 else if (!strcmp(errors, "replace")) 6564 known_errorHandler = 2; 6565 else if (!strcmp(errors, "ignore")) 6566 known_errorHandler = 3; 6567 else if (!strcmp(errors, "xmlcharrefreplace")) 6568 known_errorHandler = 4; 6569 else 6570 known_errorHandler = 0; 6571 } 6572 switch (known_errorHandler) { 6573 case 1: /* strict */ 6574 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason); 6575 goto onError; 6576 case 2: /* replace */ 6577 while (collstart++<collend) 6578 *str++ = '?'; /* fall through */ 6579 case 3: /* ignore */ 6580 pos = collend; 6581 break; 6582 case 4: /* xmlcharrefreplace */ 6583 respos = str - PyBytes_AS_STRING(res); 6584 /* determine replacement size */ 6585 for (i = collstart, repsize = 0; i < collend; ++i) { 6586 Py_UCS4 ch = PyUnicode_READ(kind, data, i); 6587 if (ch < 10) 6588 repsize += 2+1+1; 6589 else if (ch < 100) 6590 repsize += 2+2+1; 6591 else if (ch < 1000) 6592 repsize += 2+3+1; 6593 else if (ch < 10000) 6594 repsize += 2+4+1; 6595#ifndef Py_UNICODE_WIDE 6596 else 6597 repsize += 2+5+1; 6598#else 6599 else if (ch < 100000) 6600 repsize += 2+5+1; 6601 else if (ch < 1000000) 6602 repsize += 2+6+1; 6603 else 6604 repsize += 2+7+1; 6605#endif 6606 } 6607 requiredsize = respos+repsize+(size-collend); 6608 if (requiredsize > ressize) { 6609 if (requiredsize<2*ressize) 6610 requiredsize = 2*ressize; 6611 if (_PyBytes_Resize(&res, requiredsize)) 6612 goto onError; 6613 str = PyBytes_AS_STRING(res) + respos; 6614 ressize = requiredsize; 6615 } 6616 /* generate replacement */ 6617 for (i = collstart; i < collend; ++i) { 6618 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i)); 6619 } 6620 pos = collend; 6621 break; 6622 default: 6623 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler, 6624 encoding, reason, unicode, &exc, 6625 collstart, collend, &newpos); 6626 if (repunicode == NULL || (PyUnicode_Check(repunicode) && 6627 PyUnicode_READY(repunicode) < 0)) 6628 goto onError; 6629 if (PyBytes_Check(repunicode)) { 6630 /* Directly copy bytes result to output. */ 6631 repsize = PyBytes_Size(repunicode); 6632 if (repsize > 1) { 6633 /* Make room for all additional bytes. */ 6634 respos = str - PyBytes_AS_STRING(res); 6635 if (_PyBytes_Resize(&res, ressize+repsize-1)) { 6636 Py_DECREF(repunicode); 6637 goto onError; 6638 } 6639 str = PyBytes_AS_STRING(res) + respos; 6640 ressize += repsize-1; 6641 } 6642 memcpy(str, PyBytes_AsString(repunicode), repsize); 6643 str += repsize; 6644 pos = newpos; 6645 Py_DECREF(repunicode); 6646 break; 6647 } 6648 /* need more space? (at least enough for what we 6649 have+the replacement+the rest of the string, so 6650 we won't have to check space for encodable characters) */ 6651 respos = str - PyBytes_AS_STRING(res); 6652 repsize = PyUnicode_GET_LENGTH(repunicode); 6653 requiredsize = respos+repsize+(size-collend); 6654 if (requiredsize > ressize) { 6655 if (requiredsize<2*ressize) 6656 requiredsize = 2*ressize; 6657 if (_PyBytes_Resize(&res, requiredsize)) { 6658 Py_DECREF(repunicode); 6659 goto onError; 6660 } 6661 str = PyBytes_AS_STRING(res) + respos; 6662 ressize = requiredsize; 6663 } 6664 /* check if there is anything unencodable in the replacement 6665 and copy it to the output */ 6666 for (i = 0; repsize-->0; ++i, ++str) { 6667 c = PyUnicode_READ_CHAR(repunicode, i); 6668 if (c >= limit) { 6669 raise_encode_exception(&exc, encoding, unicode, 6670 pos, pos+1, reason); 6671 Py_DECREF(repunicode); 6672 goto onError; 6673 } 6674 *str = (char)c; 6675 } 6676 pos = newpos; 6677 Py_DECREF(repunicode); 6678 } 6679 } 6680 } 6681 /* Resize if we allocated to much */ 6682 size = str - PyBytes_AS_STRING(res); 6683 if (size < ressize) { /* If this falls res will be NULL */ 6684 assert(size >= 0); 6685 if (_PyBytes_Resize(&res, size) < 0) 6686 goto onError; 6687 } 6688 6689 Py_XDECREF(errorHandler); 6690 Py_XDECREF(exc); 6691 return res; 6692 6693 onError: 6694 Py_XDECREF(res); 6695 Py_XDECREF(errorHandler); 6696 Py_XDECREF(exc); 6697 return NULL; 6698} 6699 6700/* Deprecated */ 6701PyObject * 6702PyUnicode_EncodeLatin1(const Py_UNICODE *p, 6703 Py_ssize_t size, 6704 const char *errors) 6705{ 6706 PyObject *result; 6707 PyObject *unicode = PyUnicode_FromUnicode(p, size); 6708 if (unicode == NULL) 6709 return NULL; 6710 result = unicode_encode_ucs1(unicode, errors, 256); 6711 Py_DECREF(unicode); 6712 return result; 6713} 6714 6715PyObject * 6716_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors) 6717{ 6718 if (!PyUnicode_Check(unicode)) { 6719 PyErr_BadArgument(); 6720 return NULL; 6721 } 6722 if (PyUnicode_READY(unicode) == -1) 6723 return NULL; 6724 /* Fast path: if it is a one-byte string, construct 6725 bytes object directly. */ 6726 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) 6727 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode), 6728 PyUnicode_GET_LENGTH(unicode)); 6729 /* Non-Latin-1 characters present. Defer to above function to 6730 raise the exception. */ 6731 return unicode_encode_ucs1(unicode, errors, 256); 6732} 6733 6734PyObject* 6735PyUnicode_AsLatin1String(PyObject *unicode) 6736{ 6737 return _PyUnicode_AsLatin1String(unicode, NULL); 6738} 6739 6740/* --- 7-bit ASCII Codec -------------------------------------------------- */ 6741 6742PyObject * 6743PyUnicode_DecodeASCII(const char *s, 6744 Py_ssize_t size, 6745 const char *errors) 6746{ 6747 const char *starts = s; 6748 PyObject *v; 6749 Py_UNICODE *u; 6750 Py_ssize_t startinpos; 6751 Py_ssize_t endinpos; 6752 Py_ssize_t outpos; 6753 const char *e; 6754 int has_error; 6755 const unsigned char *p = (const unsigned char *)s; 6756 const unsigned char *end = p + size; 6757 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK); 6758 PyObject *errorHandler = NULL; 6759 PyObject *exc = NULL; 6760 6761 /* ASCII is equivalent to the first 128 ordinals in Unicode. */ 6762 if (size == 1 && (unsigned char)s[0] < 128) 6763 return get_latin1_char((unsigned char)s[0]); 6764 6765 has_error = 0; 6766 while (p < end && !has_error) { 6767 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for 6768 an explanation. */ 6769 if (!((size_t) p & LONG_PTR_MASK)) { 6770 /* Help register allocation */ 6771 register const unsigned char *_p = p; 6772 while (_p < aligned_end) { 6773 unsigned long value = *(unsigned long *) _p; 6774 if (value & ASCII_CHAR_MASK) { 6775 has_error = 1; 6776 break; 6777 } 6778 _p += SIZEOF_LONG; 6779 } 6780 if (_p == end) 6781 break; 6782 if (has_error) 6783 break; 6784 p = _p; 6785 } 6786 if (*p & 0x80) { 6787 has_error = 1; 6788 break; 6789 } 6790 else { 6791 ++p; 6792 } 6793 } 6794 if (!has_error) 6795 return unicode_fromascii((const unsigned char *)s, size); 6796 6797 v = (PyObject*)_PyUnicode_New(size); 6798 if (v == NULL) 6799 goto onError; 6800 if (size == 0) 6801 return v; 6802 u = PyUnicode_AS_UNICODE(v); 6803 e = s + size; 6804 while (s < e) { 6805 register unsigned char c = (unsigned char)*s; 6806 if (c < 128) { 6807 *u++ = c; 6808 ++s; 6809 } 6810 else { 6811 startinpos = s-starts; 6812 endinpos = startinpos + 1; 6813 outpos = u - (Py_UNICODE *)PyUnicode_AS_UNICODE(v); 6814 if (unicode_decode_call_errorhandler( 6815 errors, &errorHandler, 6816 "ascii", "ordinal not in range(128)", 6817 &starts, &e, &startinpos, &endinpos, &exc, &s, 6818 &v, &outpos, &u)) 6819 goto onError; 6820 } 6821 } 6822 if (u - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v)) 6823 if (PyUnicode_Resize(&v, u - PyUnicode_AS_UNICODE(v)) < 0) 6824 goto onError; 6825 Py_XDECREF(errorHandler); 6826 Py_XDECREF(exc); 6827#ifndef DONT_MAKE_RESULT_READY 6828 if (_PyUnicode_READY_REPLACE(&v)) { 6829 Py_DECREF(v); 6830 return NULL; 6831 } 6832#endif 6833 assert(_PyUnicode_CheckConsistency(v, 1)); 6834 return v; 6835 6836 onError: 6837 Py_XDECREF(v); 6838 Py_XDECREF(errorHandler); 6839 Py_XDECREF(exc); 6840 return NULL; 6841} 6842 6843/* Deprecated */ 6844PyObject * 6845PyUnicode_EncodeASCII(const Py_UNICODE *p, 6846 Py_ssize_t size, 6847 const char *errors) 6848{ 6849 PyObject *result; 6850 PyObject *unicode = PyUnicode_FromUnicode(p, size); 6851 if (unicode == NULL) 6852 return NULL; 6853 result = unicode_encode_ucs1(unicode, errors, 128); 6854 Py_DECREF(unicode); 6855 return result; 6856} 6857 6858PyObject * 6859_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors) 6860{ 6861 if (!PyUnicode_Check(unicode)) { 6862 PyErr_BadArgument(); 6863 return NULL; 6864 } 6865 if (PyUnicode_READY(unicode) == -1) 6866 return NULL; 6867 /* Fast path: if it is an ASCII-only string, construct bytes object 6868 directly. Else defer to above function to raise the exception. */ 6869 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128) 6870 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode), 6871 PyUnicode_GET_LENGTH(unicode)); 6872 return unicode_encode_ucs1(unicode, errors, 128); 6873} 6874 6875PyObject * 6876PyUnicode_AsASCIIString(PyObject *unicode) 6877{ 6878 return _PyUnicode_AsASCIIString(unicode, NULL); 6879} 6880 6881#ifdef HAVE_MBCS 6882 6883/* --- MBCS codecs for Windows -------------------------------------------- */ 6884 6885#if SIZEOF_INT < SIZEOF_SIZE_T 6886#define NEED_RETRY 6887#endif 6888 6889#ifndef WC_ERR_INVALID_CHARS 6890# define WC_ERR_INVALID_CHARS 0x0080 6891#endif 6892 6893static char* 6894code_page_name(UINT code_page, PyObject **obj) 6895{ 6896 *obj = NULL; 6897 if (code_page == CP_ACP) 6898 return "mbcs"; 6899 if (code_page == CP_UTF7) 6900 return "CP_UTF7"; 6901 if (code_page == CP_UTF8) 6902 return "CP_UTF8"; 6903 6904 *obj = PyBytes_FromFormat("cp%u", code_page); 6905 if (*obj == NULL) 6906 return NULL; 6907 return PyBytes_AS_STRING(*obj); 6908} 6909 6910static int 6911is_dbcs_lead_byte(UINT code_page, const char *s, int offset) 6912{ 6913 const char *curr = s + offset; 6914 const char *prev; 6915 6916 if (!IsDBCSLeadByteEx(code_page, *curr)) 6917 return 0; 6918 6919 prev = CharPrevExA(code_page, s, curr, 0); 6920 if (prev == curr) 6921 return 1; 6922 /* FIXME: This code is limited to "true" double-byte encodings, 6923 as it assumes an incomplete character consists of a single 6924 byte. */ 6925 if (curr - prev == 2) 6926 return 1; 6927 if (!IsDBCSLeadByteEx(code_page, *prev)) 6928 return 1; 6929 return 0; 6930} 6931 6932static DWORD 6933decode_code_page_flags(UINT code_page) 6934{ 6935 if (code_page == CP_UTF7) { 6936 /* The CP_UTF7 decoder only supports flags=0 */ 6937 return 0; 6938 } 6939 else 6940 return MB_ERR_INVALID_CHARS; 6941} 6942 6943/* 6944 * Decode a byte string from a Windows code page into unicode object in strict 6945 * mode. 6946 * 6947 * Returns consumed size if succeed, returns -2 on decode error, or raise a 6948 * WindowsError and returns -1 on other error. 6949 */ 6950static int 6951decode_code_page_strict(UINT code_page, 6952 PyObject **v, 6953 const char *in, 6954 int insize) 6955{ 6956 const DWORD flags = decode_code_page_flags(code_page); 6957 Py_UNICODE *out; 6958 DWORD outsize; 6959 6960 /* First get the size of the result */ 6961 assert(insize > 0); 6962 outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0); 6963 if (outsize <= 0) 6964 goto error; 6965 6966 if (*v == NULL) { 6967 /* Create unicode object */ 6968 *v = (PyObject*)_PyUnicode_New(outsize); 6969 if (*v == NULL) 6970 return -1; 6971 out = PyUnicode_AS_UNICODE(*v); 6972 } 6973 else { 6974 /* Extend unicode object */ 6975 Py_ssize_t n = PyUnicode_GET_SIZE(*v); 6976 if (PyUnicode_Resize(v, n + outsize) < 0) 6977 return -1; 6978 out = PyUnicode_AS_UNICODE(*v) + n; 6979 } 6980 6981 /* Do the conversion */ 6982 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize); 6983 if (outsize <= 0) 6984 goto error; 6985 return insize; 6986 6987error: 6988 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION) 6989 return -2; 6990 PyErr_SetFromWindowsErr(0); 6991 return -1; 6992} 6993 6994/* 6995 * Decode a byte string from a code page into unicode object with an error 6996 * handler. 6997 * 6998 * Returns consumed size if succeed, or raise a WindowsError or 6999 * UnicodeDecodeError exception and returns -1 on error. 7000 */ 7001static int 7002decode_code_page_errors(UINT code_page, 7003 PyObject **v, 7004 const char *in, const int size, 7005 const char *errors) 7006{ 7007 const char *startin = in; 7008 const char *endin = in + size; 7009 const DWORD flags = decode_code_page_flags(code_page); 7010 /* Ideally, we should get reason from FormatMessage. This is the Windows 7011 2000 English version of the message. */ 7012 const char *reason = "No mapping for the Unicode character exists " 7013 "in the target code page."; 7014 /* each step cannot decode more than 1 character, but a character can be 7015 represented as a surrogate pair */ 7016 wchar_t buffer[2], *startout, *out; 7017 int insize, outsize; 7018 PyObject *errorHandler = NULL; 7019 PyObject *exc = NULL; 7020 PyObject *encoding_obj = NULL; 7021 char *encoding; 7022 DWORD err; 7023 int ret = -1; 7024 7025 assert(size > 0); 7026 7027 encoding = code_page_name(code_page, &encoding_obj); 7028 if (encoding == NULL) 7029 return -1; 7030 7031 if (errors == NULL || strcmp(errors, "strict") == 0) { 7032 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a 7033 UnicodeDecodeError. */ 7034 make_decode_exception(&exc, encoding, in, size, 0, 0, reason); 7035 if (exc != NULL) { 7036 PyCodec_StrictErrors(exc); 7037 Py_CLEAR(exc); 7038 } 7039 goto error; 7040 } 7041 7042 if (*v == NULL) { 7043 /* Create unicode object */ 7044 if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) { 7045 PyErr_NoMemory(); 7046 goto error; 7047 } 7048 *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer)); 7049 if (*v == NULL) 7050 goto error; 7051 startout = PyUnicode_AS_UNICODE(*v); 7052 } 7053 else { 7054 /* Extend unicode object */ 7055 Py_ssize_t n = PyUnicode_GET_SIZE(*v); 7056 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) { 7057 PyErr_NoMemory(); 7058 goto error; 7059 } 7060 if (PyUnicode_Resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0) 7061 goto error; 7062 startout = PyUnicode_AS_UNICODE(*v) + n; 7063 } 7064 7065 /* Decode the byte string character per character */ 7066 out = startout; 7067 while (in < endin) 7068 { 7069 /* Decode a character */ 7070 insize = 1; 7071 do 7072 { 7073 outsize = MultiByteToWideChar(code_page, flags, 7074 in, insize, 7075 buffer, Py_ARRAY_LENGTH(buffer)); 7076 if (outsize > 0) 7077 break; 7078 err = GetLastError(); 7079 if (err != ERROR_NO_UNICODE_TRANSLATION 7080 && err != ERROR_INSUFFICIENT_BUFFER) 7081 { 7082 PyErr_SetFromWindowsErr(0); 7083 goto error; 7084 } 7085 insize++; 7086 } 7087 /* 4=maximum length of a UTF-8 sequence */ 7088 while (insize <= 4 && (in + insize) <= endin); 7089 7090 if (outsize <= 0) { 7091 Py_ssize_t startinpos, endinpos, outpos; 7092 7093 startinpos = in - startin; 7094 endinpos = startinpos + 1; 7095 outpos = out - PyUnicode_AS_UNICODE(*v); 7096 if (unicode_decode_call_errorhandler( 7097 errors, &errorHandler, 7098 encoding, reason, 7099 &startin, &endin, &startinpos, &endinpos, &exc, &in, 7100 v, &outpos, &out)) 7101 { 7102 goto error; 7103 } 7104 } 7105 else { 7106 in += insize; 7107 memcpy(out, buffer, outsize * sizeof(wchar_t)); 7108 out += outsize; 7109 } 7110 } 7111 7112 /* write a NUL character at the end */ 7113 *out = 0; 7114 7115 /* Extend unicode object */ 7116 outsize = out - startout; 7117 assert(outsize <= PyUnicode_WSTR_LENGTH(*v)); 7118 if (PyUnicode_Resize(v, outsize) < 0) 7119 goto error; 7120 ret = size; 7121 7122error: 7123 Py_XDECREF(encoding_obj); 7124 Py_XDECREF(errorHandler); 7125 Py_XDECREF(exc); 7126 return ret; 7127} 7128 7129static PyObject * 7130decode_code_page_stateful(int code_page, 7131 const char *s, Py_ssize_t size, 7132 const char *errors, Py_ssize_t *consumed) 7133{ 7134 PyObject *v = NULL; 7135 int chunk_size, final, converted, done; 7136 7137 if (code_page < 0) { 7138 PyErr_SetString(PyExc_ValueError, "invalid code page number"); 7139 return NULL; 7140 } 7141 7142 if (consumed) 7143 *consumed = 0; 7144 7145 do 7146 { 7147#ifdef NEED_RETRY 7148 if (size > INT_MAX) { 7149 chunk_size = INT_MAX; 7150 final = 0; 7151 done = 0; 7152 } 7153 else 7154#endif 7155 { 7156 chunk_size = (int)size; 7157 final = (consumed == NULL); 7158 done = 1; 7159 } 7160 7161 /* Skip trailing lead-byte unless 'final' is set */ 7162 if (!final && is_dbcs_lead_byte(code_page, s, chunk_size - 1)) 7163 --chunk_size; 7164 7165 if (chunk_size == 0 && done) { 7166 if (v != NULL) 7167 break; 7168 Py_INCREF(unicode_empty); 7169 return unicode_empty; 7170 } 7171 7172 7173 converted = decode_code_page_strict(code_page, &v, 7174 s, chunk_size); 7175 if (converted == -2) 7176 converted = decode_code_page_errors(code_page, &v, 7177 s, chunk_size, 7178 errors); 7179 assert(converted != 0); 7180 7181 if (converted < 0) { 7182 Py_XDECREF(v); 7183 return NULL; 7184 } 7185 7186 if (consumed) 7187 *consumed += converted; 7188 7189 s += converted; 7190 size -= converted; 7191 } while (!done); 7192 7193#ifndef DONT_MAKE_RESULT_READY 7194 if (_PyUnicode_READY_REPLACE(&v)) { 7195 Py_DECREF(v); 7196 return NULL; 7197 } 7198#endif 7199 assert(_PyUnicode_CheckConsistency(v, 1)); 7200 return v; 7201} 7202 7203PyObject * 7204PyUnicode_DecodeCodePageStateful(int code_page, 7205 const char *s, 7206 Py_ssize_t size, 7207 const char *errors, 7208 Py_ssize_t *consumed) 7209{ 7210 return decode_code_page_stateful(code_page, s, size, errors, consumed); 7211} 7212 7213PyObject * 7214PyUnicode_DecodeMBCSStateful(const char *s, 7215 Py_ssize_t size, 7216 const char *errors, 7217 Py_ssize_t *consumed) 7218{ 7219 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed); 7220} 7221 7222PyObject * 7223PyUnicode_DecodeMBCS(const char *s, 7224 Py_ssize_t size, 7225 const char *errors) 7226{ 7227 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL); 7228} 7229 7230static DWORD 7231encode_code_page_flags(UINT code_page, const char *errors) 7232{ 7233 if (code_page == CP_UTF8) { 7234 if (winver.dwMajorVersion >= 6) 7235 /* CP_UTF8 supports WC_ERR_INVALID_CHARS on Windows Vista 7236 and later */ 7237 return WC_ERR_INVALID_CHARS; 7238 else 7239 /* CP_UTF8 only supports flags=0 on Windows older than Vista */ 7240 return 0; 7241 } 7242 else if (code_page == CP_UTF7) { 7243 /* CP_UTF7 only supports flags=0 */ 7244 return 0; 7245 } 7246 else { 7247 if (errors != NULL && strcmp(errors, "replace") == 0) 7248 return 0; 7249 else 7250 return WC_NO_BEST_FIT_CHARS; 7251 } 7252} 7253 7254/* 7255 * Encode a Unicode string to a Windows code page into a byte string in strict 7256 * mode. 7257 * 7258 * Returns consumed characters if succeed, returns -2 on encode error, or raise 7259 * a WindowsError and returns -1 on other error. 7260 */ 7261static int 7262encode_code_page_strict(UINT code_page, PyObject **outbytes, 7263 PyObject *unicode, Py_ssize_t offset, int len, 7264 const char* errors) 7265{ 7266 BOOL usedDefaultChar = FALSE; 7267 BOOL *pusedDefaultChar = &usedDefaultChar; 7268 int outsize; 7269 PyObject *exc = NULL; 7270 Py_UNICODE *p; 7271 Py_ssize_t size; 7272 const DWORD flags = encode_code_page_flags(code_page, NULL); 7273 char *out; 7274 /* Create a substring so that we can get the UTF-16 representation 7275 of just the slice under consideration. */ 7276 PyObject *substring; 7277 7278 assert(len > 0); 7279 7280 if (code_page != CP_UTF8 && code_page != CP_UTF7) 7281 pusedDefaultChar = &usedDefaultChar; 7282 else 7283 pusedDefaultChar = NULL; 7284 7285 substring = PyUnicode_Substring(unicode, offset, offset+len); 7286 if (substring == NULL) 7287 return -1; 7288 p = PyUnicode_AsUnicodeAndSize(substring, &size); 7289 if (p == NULL) { 7290 Py_DECREF(substring); 7291 return -1; 7292 } 7293 7294 /* First get the size of the result */ 7295 outsize = WideCharToMultiByte(code_page, flags, 7296 p, size, 7297 NULL, 0, 7298 NULL, pusedDefaultChar); 7299 if (outsize <= 0) 7300 goto error; 7301 /* If we used a default char, then we failed! */ 7302 if (pusedDefaultChar && *pusedDefaultChar) { 7303 Py_DECREF(substring); 7304 return -2; 7305 } 7306 7307 if (*outbytes == NULL) { 7308 /* Create string object */ 7309 *outbytes = PyBytes_FromStringAndSize(NULL, outsize); 7310 if (*outbytes == NULL) { 7311 Py_DECREF(substring); 7312 return -1; 7313 } 7314 out = PyBytes_AS_STRING(*outbytes); 7315 } 7316 else { 7317 /* Extend string object */ 7318 const Py_ssize_t n = PyBytes_Size(*outbytes); 7319 if (outsize > PY_SSIZE_T_MAX - n) { 7320 PyErr_NoMemory(); 7321 Py_DECREF(substring); 7322 return -1; 7323 } 7324 if (_PyBytes_Resize(outbytes, n + outsize) < 0) { 7325 Py_DECREF(substring); 7326 return -1; 7327 } 7328 out = PyBytes_AS_STRING(*outbytes) + n; 7329 } 7330 7331 /* Do the conversion */ 7332 outsize = WideCharToMultiByte(code_page, flags, 7333 p, size, 7334 out, outsize, 7335 NULL, pusedDefaultChar); 7336 Py_CLEAR(substring); 7337 if (outsize <= 0) 7338 goto error; 7339 if (pusedDefaultChar && *pusedDefaultChar) 7340 return -2; 7341 return 0; 7342 7343error: 7344 Py_XDECREF(substring); 7345 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION) 7346 return -2; 7347 PyErr_SetFromWindowsErr(0); 7348 return -1; 7349} 7350 7351/* 7352 * Encode a Unicode string to a Windows code page into a byte string using a 7353 * error handler. 7354 * 7355 * Returns consumed characters if succeed, or raise a WindowsError and returns 7356 * -1 on other error. 7357 */ 7358static int 7359encode_code_page_errors(UINT code_page, PyObject **outbytes, 7360 PyObject *unicode, Py_ssize_t unicode_offset, 7361 Py_ssize_t insize, const char* errors) 7362{ 7363 const DWORD flags = encode_code_page_flags(code_page, errors); 7364 Py_ssize_t pos = unicode_offset; 7365 Py_ssize_t endin = unicode_offset + insize; 7366 /* Ideally, we should get reason from FormatMessage. This is the Windows 7367 2000 English version of the message. */ 7368 const char *reason = "invalid character"; 7369 /* 4=maximum length of a UTF-8 sequence */ 7370 char buffer[4]; 7371 BOOL usedDefaultChar = FALSE, *pusedDefaultChar; 7372 Py_ssize_t outsize; 7373 char *out; 7374 PyObject *errorHandler = NULL; 7375 PyObject *exc = NULL; 7376 PyObject *encoding_obj = NULL; 7377 char *encoding; 7378 Py_ssize_t newpos, newoutsize; 7379 PyObject *rep; 7380 int ret = -1; 7381 7382 assert(insize > 0); 7383 7384 encoding = code_page_name(code_page, &encoding_obj); 7385 if (encoding == NULL) 7386 return -1; 7387 7388 if (errors == NULL || strcmp(errors, "strict") == 0) { 7389 /* The last error was ERROR_NO_UNICODE_TRANSLATION, 7390 then we raise a UnicodeEncodeError. */ 7391 make_encode_exception(&exc, encoding, unicode, 0, 0, reason); 7392 if (exc != NULL) { 7393 PyCodec_StrictErrors(exc); 7394 Py_DECREF(exc); 7395 } 7396 Py_XDECREF(encoding_obj); 7397 return -1; 7398 } 7399 7400 if (code_page != CP_UTF8 && code_page != CP_UTF7) 7401 pusedDefaultChar = &usedDefaultChar; 7402 else 7403 pusedDefaultChar = NULL; 7404 7405 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) { 7406 PyErr_NoMemory(); 7407 goto error; 7408 } 7409 outsize = insize * Py_ARRAY_LENGTH(buffer); 7410 7411 if (*outbytes == NULL) { 7412 /* Create string object */ 7413 *outbytes = PyBytes_FromStringAndSize(NULL, outsize); 7414 if (*outbytes == NULL) 7415 goto error; 7416 out = PyBytes_AS_STRING(*outbytes); 7417 } 7418 else { 7419 /* Extend string object */ 7420 Py_ssize_t n = PyBytes_Size(*outbytes); 7421 if (n > PY_SSIZE_T_MAX - outsize) { 7422 PyErr_NoMemory(); 7423 goto error; 7424 } 7425 if (_PyBytes_Resize(outbytes, n + outsize) < 0) 7426 goto error; 7427 out = PyBytes_AS_STRING(*outbytes) + n; 7428 } 7429 7430 /* Encode the string character per character */ 7431 while (pos < endin) 7432 { 7433 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos); 7434 wchar_t chars[2]; 7435 int charsize; 7436 if (ch < 0x10000) { 7437 chars[0] = (wchar_t)ch; 7438 charsize = 1; 7439 } 7440 else { 7441 ch -= 0x10000; 7442 chars[0] = 0xd800 + (ch >> 10); 7443 chars[1] = 0xdc00 + (ch & 0x3ff); 7444 charsize = 2; 7445 } 7446 7447 outsize = WideCharToMultiByte(code_page, flags, 7448 chars, charsize, 7449 buffer, Py_ARRAY_LENGTH(buffer), 7450 NULL, pusedDefaultChar); 7451 if (outsize > 0) { 7452 if (pusedDefaultChar == NULL || !(*pusedDefaultChar)) 7453 { 7454 pos++; 7455 memcpy(out, buffer, outsize); 7456 out += outsize; 7457 continue; 7458 } 7459 } 7460 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) { 7461 PyErr_SetFromWindowsErr(0); 7462 goto error; 7463 } 7464 7465 rep = unicode_encode_call_errorhandler( 7466 errors, &errorHandler, encoding, reason, 7467 unicode, &exc, 7468 pos, pos + 1, &newpos); 7469 if (rep == NULL) 7470 goto error; 7471 pos = newpos; 7472 7473 if (PyBytes_Check(rep)) { 7474 outsize = PyBytes_GET_SIZE(rep); 7475 if (outsize != 1) { 7476 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes); 7477 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1); 7478 if (_PyBytes_Resize(outbytes, newoutsize) < 0) { 7479 Py_DECREF(rep); 7480 goto error; 7481 } 7482 out = PyBytes_AS_STRING(*outbytes) + offset; 7483 } 7484 memcpy(out, PyBytes_AS_STRING(rep), outsize); 7485 out += outsize; 7486 } 7487 else { 7488 Py_ssize_t i; 7489 enum PyUnicode_Kind kind; 7490 void *data; 7491 7492 if (PyUnicode_READY(rep) < 0) { 7493 Py_DECREF(rep); 7494 goto error; 7495 } 7496 7497 outsize = PyUnicode_GET_LENGTH(rep); 7498 if (outsize != 1) { 7499 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes); 7500 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1); 7501 if (_PyBytes_Resize(outbytes, newoutsize) < 0) { 7502 Py_DECREF(rep); 7503 goto error; 7504 } 7505 out = PyBytes_AS_STRING(*outbytes) + offset; 7506 } 7507 kind = PyUnicode_KIND(rep); 7508 data = PyUnicode_DATA(rep); 7509 for (i=0; i < outsize; i++) { 7510 Py_UCS4 ch = PyUnicode_READ(kind, data, i); 7511 if (ch > 127) { 7512 raise_encode_exception(&exc, 7513 encoding, unicode, 7514 pos, pos + 1, 7515 "unable to encode error handler result to ASCII"); 7516 Py_DECREF(rep); 7517 goto error; 7518 } 7519 *out = (unsigned char)ch; 7520 out++; 7521 } 7522 } 7523 Py_DECREF(rep); 7524 } 7525 /* write a NUL byte */ 7526 *out = 0; 7527 outsize = out - PyBytes_AS_STRING(*outbytes); 7528 assert(outsize <= PyBytes_GET_SIZE(*outbytes)); 7529 if (_PyBytes_Resize(outbytes, outsize) < 0) 7530 goto error; 7531 ret = 0; 7532 7533error: 7534 Py_XDECREF(encoding_obj); 7535 Py_XDECREF(errorHandler); 7536 Py_XDECREF(exc); 7537 return ret; 7538} 7539 7540static PyObject * 7541encode_code_page(int code_page, 7542 PyObject *unicode, 7543 const char *errors) 7544{ 7545 Py_ssize_t len; 7546 PyObject *outbytes = NULL; 7547 Py_ssize_t offset; 7548 int chunk_len, ret, done; 7549 7550 if (PyUnicode_READY(unicode) < 0) 7551 return NULL; 7552 len = PyUnicode_GET_LENGTH(unicode); 7553 7554 if (code_page < 0) { 7555 PyErr_SetString(PyExc_ValueError, "invalid code page number"); 7556 return NULL; 7557 } 7558 7559 if (len == 0) 7560 return PyBytes_FromStringAndSize(NULL, 0); 7561 7562 offset = 0; 7563 do 7564 { 7565#ifdef NEED_RETRY 7566 /* UTF-16 encoding may double the size, so use only INT_MAX/2 7567 chunks. */ 7568 if (len > INT_MAX/2) { 7569 chunk_len = INT_MAX/2; 7570 done = 0; 7571 } 7572 else 7573#endif 7574 { 7575 chunk_len = (int)len; 7576 done = 1; 7577 } 7578 7579 ret = encode_code_page_strict(code_page, &outbytes, 7580 unicode, offset, chunk_len, 7581 errors); 7582 if (ret == -2) 7583 ret = encode_code_page_errors(code_page, &outbytes, 7584 unicode, offset, 7585 chunk_len, errors); 7586 if (ret < 0) { 7587 Py_XDECREF(outbytes); 7588 return NULL; 7589 } 7590 7591 offset += chunk_len; 7592 len -= chunk_len; 7593 } while (!done); 7594 7595 return outbytes; 7596} 7597 7598PyObject * 7599PyUnicode_EncodeMBCS(const Py_UNICODE *p, 7600 Py_ssize_t size, 7601 const char *errors) 7602{ 7603 PyObject *unicode, *res; 7604 unicode = PyUnicode_FromUnicode(p, size); 7605 if (unicode == NULL) 7606 return NULL; 7607 res = encode_code_page(CP_ACP, unicode, errors); 7608 Py_DECREF(unicode); 7609 return res; 7610} 7611 7612PyObject * 7613PyUnicode_EncodeCodePage(int code_page, 7614 PyObject *unicode, 7615 const char *errors) 7616{ 7617 return encode_code_page(code_page, unicode, errors); 7618} 7619 7620PyObject * 7621PyUnicode_AsMBCSString(PyObject *unicode) 7622{ 7623 if (!PyUnicode_Check(unicode)) { 7624 PyErr_BadArgument(); 7625 return NULL; 7626 } 7627 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL); 7628} 7629 7630#undef NEED_RETRY 7631 7632#endif /* HAVE_MBCS */ 7633 7634/* --- Character Mapping Codec -------------------------------------------- */ 7635 7636PyObject * 7637PyUnicode_DecodeCharmap(const char *s, 7638 Py_ssize_t size, 7639 PyObject *mapping, 7640 const char *errors) 7641{ 7642 const char *starts = s; 7643 Py_ssize_t startinpos; 7644 Py_ssize_t endinpos; 7645 Py_ssize_t outpos; 7646 const char *e; 7647 PyObject *v; 7648 Py_UNICODE *p; 7649 Py_ssize_t extrachars = 0; 7650 PyObject *errorHandler = NULL; 7651 PyObject *exc = NULL; 7652 Py_UNICODE *mapstring = NULL; 7653 Py_ssize_t maplen = 0; 7654 7655 /* Default to Latin-1 */ 7656 if (mapping == NULL) 7657 return PyUnicode_DecodeLatin1(s, size, errors); 7658 7659 v = (PyObject*)_PyUnicode_New(size); 7660 if (v == NULL) 7661 goto onError; 7662 if (size == 0) 7663 return v; 7664 p = PyUnicode_AS_UNICODE(v); 7665 e = s + size; 7666 if (PyUnicode_CheckExact(mapping)) { 7667 mapstring = PyUnicode_AS_UNICODE(mapping); 7668 maplen = PyUnicode_GET_SIZE(mapping); 7669 while (s < e) { 7670 unsigned char ch = *s; 7671 Py_UNICODE x = 0xfffe; /* illegal value */ 7672 7673 if (ch < maplen) 7674 x = mapstring[ch]; 7675 7676 if (x == 0xfffe) { 7677 /* undefined mapping */ 7678 outpos = p-PyUnicode_AS_UNICODE(v); 7679 startinpos = s-starts; 7680 endinpos = startinpos+1; 7681 if (unicode_decode_call_errorhandler( 7682 errors, &errorHandler, 7683 "charmap", "character maps to <undefined>", 7684 &starts, &e, &startinpos, &endinpos, &exc, &s, 7685 &v, &outpos, &p)) { 7686 goto onError; 7687 } 7688 continue; 7689 } 7690 *p++ = x; 7691 ++s; 7692 } 7693 } 7694 else { 7695 while (s < e) { 7696 unsigned char ch = *s; 7697 PyObject *w, *x; 7698 7699 /* Get mapping (char ordinal -> integer, Unicode char or None) */ 7700 w = PyLong_FromLong((long)ch); 7701 if (w == NULL) 7702 goto onError; 7703 x = PyObject_GetItem(mapping, w); 7704 Py_DECREF(w); 7705 if (x == NULL) { 7706 if (PyErr_ExceptionMatches(PyExc_LookupError)) { 7707 /* No mapping found means: mapping is undefined. */ 7708 PyErr_Clear(); 7709 x = Py_None; 7710 Py_INCREF(x); 7711 } else 7712 goto onError; 7713 } 7714 7715 /* Apply mapping */ 7716 if (PyLong_Check(x)) { 7717 long value = PyLong_AS_LONG(x); 7718 if (value < 0 || value > 65535) { 7719 PyErr_SetString(PyExc_TypeError, 7720 "character mapping must be in range(65536)"); 7721 Py_DECREF(x); 7722 goto onError; 7723 } 7724 *p++ = (Py_UNICODE)value; 7725 } 7726 else if (x == Py_None) { 7727 /* undefined mapping */ 7728 outpos = p-PyUnicode_AS_UNICODE(v); 7729 startinpos = s-starts; 7730 endinpos = startinpos+1; 7731 if (unicode_decode_call_errorhandler( 7732 errors, &errorHandler, 7733 "charmap", "character maps to <undefined>", 7734 &starts, &e, &startinpos, &endinpos, &exc, &s, 7735 &v, &outpos, &p)) { 7736 Py_DECREF(x); 7737 goto onError; 7738 } 7739 Py_DECREF(x); 7740 continue; 7741 } 7742 else if (PyUnicode_Check(x)) { 7743 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x); 7744 7745 if (targetsize == 1) 7746 /* 1-1 mapping */ 7747 *p++ = *PyUnicode_AS_UNICODE(x); 7748 7749 else if (targetsize > 1) { 7750 /* 1-n mapping */ 7751 if (targetsize > extrachars) { 7752 /* resize first */ 7753 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v); 7754 Py_ssize_t needed = (targetsize - extrachars) + \ 7755 (targetsize << 2); 7756 extrachars += needed; 7757 /* XXX overflow detection missing */ 7758 if (PyUnicode_Resize(&v, 7759 PyUnicode_GET_SIZE(v) + needed) < 0) { 7760 Py_DECREF(x); 7761 goto onError; 7762 } 7763 p = PyUnicode_AS_UNICODE(v) + oldpos; 7764 } 7765 Py_UNICODE_COPY(p, 7766 PyUnicode_AS_UNICODE(x), 7767 targetsize); 7768 p += targetsize; 7769 extrachars -= targetsize; 7770 } 7771 /* 1-0 mapping: skip the character */ 7772 } 7773 else { 7774 /* wrong return value */ 7775 PyErr_SetString(PyExc_TypeError, 7776 "character mapping must return integer, None or str"); 7777 Py_DECREF(x); 7778 goto onError; 7779 } 7780 Py_DECREF(x); 7781 ++s; 7782 } 7783 } 7784 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v)) 7785 if (PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0) 7786 goto onError; 7787 Py_XDECREF(errorHandler); 7788 Py_XDECREF(exc); 7789#ifndef DONT_MAKE_RESULT_READY 7790 if (_PyUnicode_READY_REPLACE(&v)) { 7791 Py_DECREF(v); 7792 return NULL; 7793 } 7794#endif 7795 assert(_PyUnicode_CheckConsistency(v, 1)); 7796 return v; 7797 7798 onError: 7799 Py_XDECREF(errorHandler); 7800 Py_XDECREF(exc); 7801 Py_XDECREF(v); 7802 return NULL; 7803} 7804 7805/* Charmap encoding: the lookup table */ 7806 7807struct encoding_map { 7808 PyObject_HEAD 7809 unsigned char level1[32]; 7810 int count2, count3; 7811 unsigned char level23[1]; 7812}; 7813 7814static PyObject* 7815encoding_map_size(PyObject *obj, PyObject* args) 7816{ 7817 struct encoding_map *map = (struct encoding_map*)obj; 7818 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 + 7819 128*map->count3); 7820} 7821 7822static PyMethodDef encoding_map_methods[] = { 7823 {"size", encoding_map_size, METH_NOARGS, 7824 PyDoc_STR("Return the size (in bytes) of this object") }, 7825 { 0 } 7826}; 7827 7828static void 7829encoding_map_dealloc(PyObject* o) 7830{ 7831 PyObject_FREE(o); 7832} 7833 7834static PyTypeObject EncodingMapType = { 7835 PyVarObject_HEAD_INIT(NULL, 0) 7836 "EncodingMap", /*tp_name*/ 7837 sizeof(struct encoding_map), /*tp_basicsize*/ 7838 0, /*tp_itemsize*/ 7839 /* methods */ 7840 encoding_map_dealloc, /*tp_dealloc*/ 7841 0, /*tp_print*/ 7842 0, /*tp_getattr*/ 7843 0, /*tp_setattr*/ 7844 0, /*tp_reserved*/ 7845 0, /*tp_repr*/ 7846 0, /*tp_as_number*/ 7847 0, /*tp_as_sequence*/ 7848 0, /*tp_as_mapping*/ 7849 0, /*tp_hash*/ 7850 0, /*tp_call*/ 7851 0, /*tp_str*/ 7852 0, /*tp_getattro*/ 7853 0, /*tp_setattro*/ 7854 0, /*tp_as_buffer*/ 7855 Py_TPFLAGS_DEFAULT, /*tp_flags*/ 7856 0, /*tp_doc*/ 7857 0, /*tp_traverse*/ 7858 0, /*tp_clear*/ 7859 0, /*tp_richcompare*/ 7860 0, /*tp_weaklistoffset*/ 7861 0, /*tp_iter*/ 7862 0, /*tp_iternext*/ 7863 encoding_map_methods, /*tp_methods*/ 7864 0, /*tp_members*/ 7865 0, /*tp_getset*/ 7866 0, /*tp_base*/ 7867 0, /*tp_dict*/ 7868 0, /*tp_descr_get*/ 7869 0, /*tp_descr_set*/ 7870 0, /*tp_dictoffset*/ 7871 0, /*tp_init*/ 7872 0, /*tp_alloc*/ 7873 0, /*tp_new*/ 7874 0, /*tp_free*/ 7875 0, /*tp_is_gc*/ 7876}; 7877 7878PyObject* 7879PyUnicode_BuildEncodingMap(PyObject* string) 7880{ 7881 PyObject *result; 7882 struct encoding_map *mresult; 7883 int i; 7884 int need_dict = 0; 7885 unsigned char level1[32]; 7886 unsigned char level2[512]; 7887 unsigned char *mlevel1, *mlevel2, *mlevel3; 7888 int count2 = 0, count3 = 0; 7889 int kind; 7890 void *data; 7891 Py_UCS4 ch; 7892 7893 if (!PyUnicode_Check(string) || PyUnicode_GET_LENGTH(string) != 256) { 7894 PyErr_BadArgument(); 7895 return NULL; 7896 } 7897 kind = PyUnicode_KIND(string); 7898 data = PyUnicode_DATA(string); 7899 memset(level1, 0xFF, sizeof level1); 7900 memset(level2, 0xFF, sizeof level2); 7901 7902 /* If there isn't a one-to-one mapping of NULL to \0, 7903 or if there are non-BMP characters, we need to use 7904 a mapping dictionary. */ 7905 if (PyUnicode_READ(kind, data, 0) != 0) 7906 need_dict = 1; 7907 for (i = 1; i < 256; i++) { 7908 int l1, l2; 7909 ch = PyUnicode_READ(kind, data, i); 7910 if (ch == 0 || ch > 0xFFFF) { 7911 need_dict = 1; 7912 break; 7913 } 7914 if (ch == 0xFFFE) 7915 /* unmapped character */ 7916 continue; 7917 l1 = ch >> 11; 7918 l2 = ch >> 7; 7919 if (level1[l1] == 0xFF) 7920 level1[l1] = count2++; 7921 if (level2[l2] == 0xFF) 7922 level2[l2] = count3++; 7923 } 7924 7925 if (count2 >= 0xFF || count3 >= 0xFF) 7926 need_dict = 1; 7927 7928 if (need_dict) { 7929 PyObject *result = PyDict_New(); 7930 PyObject *key, *value; 7931 if (!result) 7932 return NULL; 7933 for (i = 0; i < 256; i++) { 7934 key = PyLong_FromLong(PyUnicode_READ(kind, data, i)); 7935 value = PyLong_FromLong(i); 7936 if (!key || !value) 7937 goto failed1; 7938 if (PyDict_SetItem(result, key, value) == -1) 7939 goto failed1; 7940 Py_DECREF(key); 7941 Py_DECREF(value); 7942 } 7943 return result; 7944 failed1: 7945 Py_XDECREF(key); 7946 Py_XDECREF(value); 7947 Py_DECREF(result); 7948 return NULL; 7949 } 7950 7951 /* Create a three-level trie */ 7952 result = PyObject_MALLOC(sizeof(struct encoding_map) + 7953 16*count2 + 128*count3 - 1); 7954 if (!result) 7955 return PyErr_NoMemory(); 7956 PyObject_Init(result, &EncodingMapType); 7957 mresult = (struct encoding_map*)result; 7958 mresult->count2 = count2; 7959 mresult->count3 = count3; 7960 mlevel1 = mresult->level1; 7961 mlevel2 = mresult->level23; 7962 mlevel3 = mresult->level23 + 16*count2; 7963 memcpy(mlevel1, level1, 32); 7964 memset(mlevel2, 0xFF, 16*count2); 7965 memset(mlevel3, 0, 128*count3); 7966 count3 = 0; 7967 for (i = 1; i < 256; i++) { 7968 int o1, o2, o3, i2, i3; 7969 if (PyUnicode_READ(kind, data, i) == 0xFFFE) 7970 /* unmapped character */ 7971 continue; 7972 o1 = PyUnicode_READ(kind, data, i)>>11; 7973 o2 = (PyUnicode_READ(kind, data, i)>>7) & 0xF; 7974 i2 = 16*mlevel1[o1] + o2; 7975 if (mlevel2[i2] == 0xFF) 7976 mlevel2[i2] = count3++; 7977 o3 = PyUnicode_READ(kind, data, i) & 0x7F; 7978 i3 = 128*mlevel2[i2] + o3; 7979 mlevel3[i3] = i; 7980 } 7981 return result; 7982} 7983 7984static int 7985encoding_map_lookup(Py_UNICODE c, PyObject *mapping) 7986{ 7987 struct encoding_map *map = (struct encoding_map*)mapping; 7988 int l1 = c>>11; 7989 int l2 = (c>>7) & 0xF; 7990 int l3 = c & 0x7F; 7991 int i; 7992 7993#ifdef Py_UNICODE_WIDE 7994 if (c > 0xFFFF) { 7995 return -1; 7996 } 7997#endif 7998 if (c == 0) 7999 return 0; 8000 /* level 1*/ 8001 i = map->level1[l1]; 8002 if (i == 0xFF) { 8003 return -1; 8004 } 8005 /* level 2*/ 8006 i = map->level23[16*i+l2]; 8007 if (i == 0xFF) { 8008 return -1; 8009 } 8010 /* level 3 */ 8011 i = map->level23[16*map->count2 + 128*i + l3]; 8012 if (i == 0) { 8013 return -1; 8014 } 8015 return i; 8016} 8017 8018/* Lookup the character ch in the mapping. If the character 8019 can't be found, Py_None is returned (or NULL, if another 8020 error occurred). */ 8021static PyObject * 8022charmapencode_lookup(Py_UNICODE c, PyObject *mapping) 8023{ 8024 PyObject *w = PyLong_FromLong((long)c); 8025 PyObject *x; 8026 8027 if (w == NULL) 8028 return NULL; 8029 x = PyObject_GetItem(mapping, w); 8030 Py_DECREF(w); 8031 if (x == NULL) { 8032 if (PyErr_ExceptionMatches(PyExc_LookupError)) { 8033 /* No mapping found means: mapping is undefined. */ 8034 PyErr_Clear(); 8035 x = Py_None; 8036 Py_INCREF(x); 8037 return x; 8038 } else 8039 return NULL; 8040 } 8041 else if (x == Py_None) 8042 return x; 8043 else if (PyLong_Check(x)) { 8044 long value = PyLong_AS_LONG(x); 8045 if (value < 0 || value > 255) { 8046 PyErr_SetString(PyExc_TypeError, 8047 "character mapping must be in range(256)"); 8048 Py_DECREF(x); 8049 return NULL; 8050 } 8051 return x; 8052 } 8053 else if (PyBytes_Check(x)) 8054 return x; 8055 else { 8056 /* wrong return value */ 8057 PyErr_Format(PyExc_TypeError, 8058 "character mapping must return integer, bytes or None, not %.400s", 8059 x->ob_type->tp_name); 8060 Py_DECREF(x); 8061 return NULL; 8062 } 8063} 8064 8065static int 8066charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize) 8067{ 8068 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj); 8069 /* exponentially overallocate to minimize reallocations */ 8070 if (requiredsize < 2*outsize) 8071 requiredsize = 2*outsize; 8072 if (_PyBytes_Resize(outobj, requiredsize)) 8073 return -1; 8074 return 0; 8075} 8076 8077typedef enum charmapencode_result { 8078 enc_SUCCESS, enc_FAILED, enc_EXCEPTION 8079} charmapencode_result; 8080/* lookup the character, put the result in the output string and adjust 8081 various state variables. Resize the output bytes object if not enough 8082 space is available. Return a new reference to the object that 8083 was put in the output buffer, or Py_None, if the mapping was undefined 8084 (in which case no character was written) or NULL, if a 8085 reallocation error occurred. The caller must decref the result */ 8086static charmapencode_result 8087charmapencode_output(Py_UNICODE c, PyObject *mapping, 8088 PyObject **outobj, Py_ssize_t *outpos) 8089{ 8090 PyObject *rep; 8091 char *outstart; 8092 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj); 8093 8094 if (Py_TYPE(mapping) == &EncodingMapType) { 8095 int res = encoding_map_lookup(c, mapping); 8096 Py_ssize_t requiredsize = *outpos+1; 8097 if (res == -1) 8098 return enc_FAILED; 8099 if (outsize<requiredsize) 8100 if (charmapencode_resize(outobj, outpos, requiredsize)) 8101 return enc_EXCEPTION; 8102 outstart = PyBytes_AS_STRING(*outobj); 8103 outstart[(*outpos)++] = (char)res; 8104 return enc_SUCCESS; 8105 } 8106 8107 rep = charmapencode_lookup(c, mapping); 8108 if (rep==NULL) 8109 return enc_EXCEPTION; 8110 else if (rep==Py_None) { 8111 Py_DECREF(rep); 8112 return enc_FAILED; 8113 } else { 8114 if (PyLong_Check(rep)) { 8115 Py_ssize_t requiredsize = *outpos+1; 8116 if (outsize<requiredsize) 8117 if (charmapencode_resize(outobj, outpos, requiredsize)) { 8118 Py_DECREF(rep); 8119 return enc_EXCEPTION; 8120 } 8121 outstart = PyBytes_AS_STRING(*outobj); 8122 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep); 8123 } 8124 else { 8125 const char *repchars = PyBytes_AS_STRING(rep); 8126 Py_ssize_t repsize = PyBytes_GET_SIZE(rep); 8127 Py_ssize_t requiredsize = *outpos+repsize; 8128 if (outsize<requiredsize) 8129 if (charmapencode_resize(outobj, outpos, requiredsize)) { 8130 Py_DECREF(rep); 8131 return enc_EXCEPTION; 8132 } 8133 outstart = PyBytes_AS_STRING(*outobj); 8134 memcpy(outstart + *outpos, repchars, repsize); 8135 *outpos += repsize; 8136 } 8137 } 8138 Py_DECREF(rep); 8139 return enc_SUCCESS; 8140} 8141 8142/* handle an error in PyUnicode_EncodeCharmap 8143 Return 0 on success, -1 on error */ 8144static int 8145charmap_encoding_error( 8146 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping, 8147 PyObject **exceptionObject, 8148 int *known_errorHandler, PyObject **errorHandler, const char *errors, 8149 PyObject **res, Py_ssize_t *respos) 8150{ 8151 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */ 8152 Py_ssize_t size, repsize; 8153 Py_ssize_t newpos; 8154 Py_UNICODE *uni2; 8155 /* startpos for collecting unencodable chars */ 8156 Py_ssize_t collstartpos = *inpos; 8157 Py_ssize_t collendpos = *inpos+1; 8158 Py_ssize_t collpos; 8159 char *encoding = "charmap"; 8160 char *reason = "character maps to <undefined>"; 8161 charmapencode_result x; 8162 Py_UCS4 ch; 8163 int val; 8164 8165 if (PyUnicode_READY(unicode) < 0) 8166 return -1; 8167 size = PyUnicode_GET_LENGTH(unicode); 8168 /* find all unencodable characters */ 8169 while (collendpos < size) { 8170 PyObject *rep; 8171 if (Py_TYPE(mapping) == &EncodingMapType) { 8172 ch = PyUnicode_READ_CHAR(unicode, collendpos); 8173 val = encoding_map_lookup(ch, mapping); 8174 if (val != -1) 8175 break; 8176 ++collendpos; 8177 continue; 8178 } 8179 8180 ch = PyUnicode_READ_CHAR(unicode, collendpos); 8181 rep = charmapencode_lookup(ch, mapping); 8182 if (rep==NULL) 8183 return -1; 8184 else if (rep!=Py_None) { 8185 Py_DECREF(rep); 8186 break; 8187 } 8188 Py_DECREF(rep); 8189 ++collendpos; 8190 } 8191 /* cache callback name lookup 8192 * (if not done yet, i.e. it's the first error) */ 8193 if (*known_errorHandler==-1) { 8194 if ((errors==NULL) || (!strcmp(errors, "strict"))) 8195 *known_errorHandler = 1; 8196 else if (!strcmp(errors, "replace")) 8197 *known_errorHandler = 2; 8198 else if (!strcmp(errors, "ignore")) 8199 *known_errorHandler = 3; 8200 else if (!strcmp(errors, "xmlcharrefreplace")) 8201 *known_errorHandler = 4; 8202 else 8203 *known_errorHandler = 0; 8204 } 8205 switch (*known_errorHandler) { 8206 case 1: /* strict */ 8207 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason); 8208 return -1; 8209 case 2: /* replace */ 8210 for (collpos = collstartpos; collpos<collendpos; ++collpos) { 8211 x = charmapencode_output('?', mapping, res, respos); 8212 if (x==enc_EXCEPTION) { 8213 return -1; 8214 } 8215 else if (x==enc_FAILED) { 8216 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason); 8217 return -1; 8218 } 8219 } 8220 /* fall through */ 8221 case 3: /* ignore */ 8222 *inpos = collendpos; 8223 break; 8224 case 4: /* xmlcharrefreplace */ 8225 /* generate replacement (temporarily (mis)uses p) */ 8226 for (collpos = collstartpos; collpos < collendpos; ++collpos) { 8227 char buffer[2+29+1+1]; 8228 char *cp; 8229 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos)); 8230 for (cp = buffer; *cp; ++cp) { 8231 x = charmapencode_output(*cp, mapping, res, respos); 8232 if (x==enc_EXCEPTION) 8233 return -1; 8234 else if (x==enc_FAILED) { 8235 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason); 8236 return -1; 8237 } 8238 } 8239 } 8240 *inpos = collendpos; 8241 break; 8242 default: 8243 repunicode = unicode_encode_call_errorhandler(errors, errorHandler, 8244 encoding, reason, unicode, exceptionObject, 8245 collstartpos, collendpos, &newpos); 8246 if (repunicode == NULL) 8247 return -1; 8248 if (PyBytes_Check(repunicode)) { 8249 /* Directly copy bytes result to output. */ 8250 Py_ssize_t outsize = PyBytes_Size(*res); 8251 Py_ssize_t requiredsize; 8252 repsize = PyBytes_Size(repunicode); 8253 requiredsize = *respos + repsize; 8254 if (requiredsize > outsize) 8255 /* Make room for all additional bytes. */ 8256 if (charmapencode_resize(res, respos, requiredsize)) { 8257 Py_DECREF(repunicode); 8258 return -1; 8259 } 8260 memcpy(PyBytes_AsString(*res) + *respos, 8261 PyBytes_AsString(repunicode), repsize); 8262 *respos += repsize; 8263 *inpos = newpos; 8264 Py_DECREF(repunicode); 8265 break; 8266 } 8267 /* generate replacement */ 8268 repsize = PyUnicode_GET_SIZE(repunicode); 8269 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) { 8270 x = charmapencode_output(*uni2, mapping, res, respos); 8271 if (x==enc_EXCEPTION) { 8272 return -1; 8273 } 8274 else if (x==enc_FAILED) { 8275 Py_DECREF(repunicode); 8276 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason); 8277 return -1; 8278 } 8279 } 8280 *inpos = newpos; 8281 Py_DECREF(repunicode); 8282 } 8283 return 0; 8284} 8285 8286PyObject * 8287_PyUnicode_EncodeCharmap(PyObject *unicode, 8288 PyObject *mapping, 8289 const char *errors) 8290{ 8291 /* output object */ 8292 PyObject *res = NULL; 8293 /* current input position */ 8294 Py_ssize_t inpos = 0; 8295 Py_ssize_t size; 8296 /* current output position */ 8297 Py_ssize_t respos = 0; 8298 PyObject *errorHandler = NULL; 8299 PyObject *exc = NULL; 8300 /* the following variable is used for caching string comparisons 8301 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 8302 * 3=ignore, 4=xmlcharrefreplace */ 8303 int known_errorHandler = -1; 8304 8305 if (PyUnicode_READY(unicode) < 0) 8306 return NULL; 8307 size = PyUnicode_GET_LENGTH(unicode); 8308 8309 /* Default to Latin-1 */ 8310 if (mapping == NULL) 8311 return unicode_encode_ucs1(unicode, errors, 256); 8312 8313 /* allocate enough for a simple encoding without 8314 replacements, if we need more, we'll resize */ 8315 res = PyBytes_FromStringAndSize(NULL, size); 8316 if (res == NULL) 8317 goto onError; 8318 if (size == 0) 8319 return res; 8320 8321 while (inpos<size) { 8322 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, inpos); 8323 /* try to encode it */ 8324 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos); 8325 if (x==enc_EXCEPTION) /* error */ 8326 goto onError; 8327 if (x==enc_FAILED) { /* unencodable character */ 8328 if (charmap_encoding_error(unicode, &inpos, mapping, 8329 &exc, 8330 &known_errorHandler, &errorHandler, errors, 8331 &res, &respos)) { 8332 goto onError; 8333 } 8334 } 8335 else 8336 /* done with this character => adjust input position */ 8337 ++inpos; 8338 } 8339 8340 /* Resize if we allocated to much */ 8341 if (respos<PyBytes_GET_SIZE(res)) 8342 if (_PyBytes_Resize(&res, respos) < 0) 8343 goto onError; 8344 8345 Py_XDECREF(exc); 8346 Py_XDECREF(errorHandler); 8347 return res; 8348 8349 onError: 8350 Py_XDECREF(res); 8351 Py_XDECREF(exc); 8352 Py_XDECREF(errorHandler); 8353 return NULL; 8354} 8355 8356/* Deprecated */ 8357PyObject * 8358PyUnicode_EncodeCharmap(const Py_UNICODE *p, 8359 Py_ssize_t size, 8360 PyObject *mapping, 8361 const char *errors) 8362{ 8363 PyObject *result; 8364 PyObject *unicode = PyUnicode_FromUnicode(p, size); 8365 if (unicode == NULL) 8366 return NULL; 8367 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors); 8368 Py_DECREF(unicode); 8369 return result; 8370} 8371 8372PyObject * 8373PyUnicode_AsCharmapString(PyObject *unicode, 8374 PyObject *mapping) 8375{ 8376 if (!PyUnicode_Check(unicode) || mapping == NULL) { 8377 PyErr_BadArgument(); 8378 return NULL; 8379 } 8380 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL); 8381} 8382 8383/* create or adjust a UnicodeTranslateError */ 8384static void 8385make_translate_exception(PyObject **exceptionObject, 8386 PyObject *unicode, 8387 Py_ssize_t startpos, Py_ssize_t endpos, 8388 const char *reason) 8389{ 8390 if (*exceptionObject == NULL) { 8391 *exceptionObject = _PyUnicodeTranslateError_Create( 8392 unicode, startpos, endpos, reason); 8393 } 8394 else { 8395 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos)) 8396 goto onError; 8397 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos)) 8398 goto onError; 8399 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason)) 8400 goto onError; 8401 return; 8402 onError: 8403 Py_DECREF(*exceptionObject); 8404 *exceptionObject = NULL; 8405 } 8406} 8407 8408/* raises a UnicodeTranslateError */ 8409static void 8410raise_translate_exception(PyObject **exceptionObject, 8411 PyObject *unicode, 8412 Py_ssize_t startpos, Py_ssize_t endpos, 8413 const char *reason) 8414{ 8415 make_translate_exception(exceptionObject, 8416 unicode, startpos, endpos, reason); 8417 if (*exceptionObject != NULL) 8418 PyCodec_StrictErrors(*exceptionObject); 8419} 8420 8421/* error handling callback helper: 8422 build arguments, call the callback and check the arguments, 8423 put the result into newpos and return the replacement string, which 8424 has to be freed by the caller */ 8425static PyObject * 8426unicode_translate_call_errorhandler(const char *errors, 8427 PyObject **errorHandler, 8428 const char *reason, 8429 PyObject *unicode, PyObject **exceptionObject, 8430 Py_ssize_t startpos, Py_ssize_t endpos, 8431 Py_ssize_t *newpos) 8432{ 8433 static char *argparse = "O!n;translating error handler must return (str, int) tuple"; 8434 8435 Py_ssize_t i_newpos; 8436 PyObject *restuple; 8437 PyObject *resunicode; 8438 8439 if (*errorHandler == NULL) { 8440 *errorHandler = PyCodec_LookupError(errors); 8441 if (*errorHandler == NULL) 8442 return NULL; 8443 } 8444 8445 make_translate_exception(exceptionObject, 8446 unicode, startpos, endpos, reason); 8447 if (*exceptionObject == NULL) 8448 return NULL; 8449 8450 restuple = PyObject_CallFunctionObjArgs( 8451 *errorHandler, *exceptionObject, NULL); 8452 if (restuple == NULL) 8453 return NULL; 8454 if (!PyTuple_Check(restuple)) { 8455 PyErr_SetString(PyExc_TypeError, &argparse[4]); 8456 Py_DECREF(restuple); 8457 return NULL; 8458 } 8459 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, 8460 &resunicode, &i_newpos)) { 8461 Py_DECREF(restuple); 8462 return NULL; 8463 } 8464 if (i_newpos<0) 8465 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos; 8466 else 8467 *newpos = i_newpos; 8468 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) { 8469 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos); 8470 Py_DECREF(restuple); 8471 return NULL; 8472 } 8473 Py_INCREF(resunicode); 8474 Py_DECREF(restuple); 8475 return resunicode; 8476} 8477 8478/* Lookup the character ch in the mapping and put the result in result, 8479 which must be decrefed by the caller. 8480 Return 0 on success, -1 on error */ 8481static int 8482charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result) 8483{ 8484 PyObject *w = PyLong_FromLong((long)c); 8485 PyObject *x; 8486 8487 if (w == NULL) 8488 return -1; 8489 x = PyObject_GetItem(mapping, w); 8490 Py_DECREF(w); 8491 if (x == NULL) { 8492 if (PyErr_ExceptionMatches(PyExc_LookupError)) { 8493 /* No mapping found means: use 1:1 mapping. */ 8494 PyErr_Clear(); 8495 *result = NULL; 8496 return 0; 8497 } else 8498 return -1; 8499 } 8500 else if (x == Py_None) { 8501 *result = x; 8502 return 0; 8503 } 8504 else if (PyLong_Check(x)) { 8505 long value = PyLong_AS_LONG(x); 8506 long max = PyUnicode_GetMax(); 8507 if (value < 0 || value > max) { 8508 PyErr_Format(PyExc_TypeError, 8509 "character mapping must be in range(0x%x)", max+1); 8510 Py_DECREF(x); 8511 return -1; 8512 } 8513 *result = x; 8514 return 0; 8515 } 8516 else if (PyUnicode_Check(x)) { 8517 *result = x; 8518 return 0; 8519 } 8520 else { 8521 /* wrong return value */ 8522 PyErr_SetString(PyExc_TypeError, 8523 "character mapping must return integer, None or str"); 8524 Py_DECREF(x); 8525 return -1; 8526 } 8527} 8528/* ensure that *outobj is at least requiredsize characters long, 8529 if not reallocate and adjust various state variables. 8530 Return 0 on success, -1 on error */ 8531static int 8532charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize, 8533 Py_ssize_t requiredsize) 8534{ 8535 Py_ssize_t oldsize = *psize; 8536 if (requiredsize > oldsize) { 8537 /* exponentially overallocate to minimize reallocations */ 8538 if (requiredsize < 2 * oldsize) 8539 requiredsize = 2 * oldsize; 8540 *outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4)); 8541 if (*outobj == 0) 8542 return -1; 8543 *psize = requiredsize; 8544 } 8545 return 0; 8546} 8547/* lookup the character, put the result in the output string and adjust 8548 various state variables. Return a new reference to the object that 8549 was put in the output buffer in *result, or Py_None, if the mapping was 8550 undefined (in which case no character was written). 8551 The called must decref result. 8552 Return 0 on success, -1 on error. */ 8553static int 8554charmaptranslate_output(PyObject *input, Py_ssize_t ipos, 8555 PyObject *mapping, Py_UCS4 **output, 8556 Py_ssize_t *osize, Py_ssize_t *opos, 8557 PyObject **res) 8558{ 8559 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos); 8560 if (charmaptranslate_lookup(curinp, mapping, res)) 8561 return -1; 8562 if (*res==NULL) { 8563 /* not found => default to 1:1 mapping */ 8564 (*output)[(*opos)++] = curinp; 8565 } 8566 else if (*res==Py_None) 8567 ; 8568 else if (PyLong_Check(*res)) { 8569 /* no overflow check, because we know that the space is enough */ 8570 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res); 8571 } 8572 else if (PyUnicode_Check(*res)) { 8573 Py_ssize_t repsize; 8574 if (PyUnicode_READY(*res) == -1) 8575 return -1; 8576 repsize = PyUnicode_GET_LENGTH(*res); 8577 if (repsize==1) { 8578 /* no overflow check, because we know that the space is enough */ 8579 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0); 8580 } 8581 else if (repsize!=0) { 8582 /* more than one character */ 8583 Py_ssize_t requiredsize = *opos + 8584 (PyUnicode_GET_LENGTH(input) - ipos) + 8585 repsize - 1; 8586 Py_ssize_t i; 8587 if (charmaptranslate_makespace(output, osize, requiredsize)) 8588 return -1; 8589 for(i = 0; i < repsize; i++) 8590 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i); 8591 } 8592 } 8593 else 8594 return -1; 8595 return 0; 8596} 8597 8598PyObject * 8599_PyUnicode_TranslateCharmap(PyObject *input, 8600 PyObject *mapping, 8601 const char *errors) 8602{ 8603 /* input object */ 8604 char *idata; 8605 Py_ssize_t size, i; 8606 int kind; 8607 /* output buffer */ 8608 Py_UCS4 *output = NULL; 8609 Py_ssize_t osize; 8610 PyObject *res; 8611 /* current output position */ 8612 Py_ssize_t opos; 8613 char *reason = "character maps to <undefined>"; 8614 PyObject *errorHandler = NULL; 8615 PyObject *exc = NULL; 8616 /* the following variable is used for caching string comparisons 8617 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 8618 * 3=ignore, 4=xmlcharrefreplace */ 8619 int known_errorHandler = -1; 8620 8621 if (mapping == NULL) { 8622 PyErr_BadArgument(); 8623 return NULL; 8624 } 8625 8626 if (PyUnicode_READY(input) == -1) 8627 return NULL; 8628 idata = (char*)PyUnicode_DATA(input); 8629 kind = PyUnicode_KIND(input); 8630 size = PyUnicode_GET_LENGTH(input); 8631 i = 0; 8632 8633 if (size == 0) { 8634 Py_INCREF(input); 8635 return input; 8636 } 8637 8638 /* allocate enough for a simple 1:1 translation without 8639 replacements, if we need more, we'll resize */ 8640 osize = size; 8641 output = PyMem_Malloc(osize * sizeof(Py_UCS4)); 8642 opos = 0; 8643 if (output == NULL) { 8644 PyErr_NoMemory(); 8645 goto onError; 8646 } 8647 8648 while (i<size) { 8649 /* try to encode it */ 8650 PyObject *x = NULL; 8651 if (charmaptranslate_output(input, i, mapping, 8652 &output, &osize, &opos, &x)) { 8653 Py_XDECREF(x); 8654 goto onError; 8655 } 8656 Py_XDECREF(x); 8657 if (x!=Py_None) /* it worked => adjust input pointer */ 8658 ++i; 8659 else { /* untranslatable character */ 8660 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */ 8661 Py_ssize_t repsize; 8662 Py_ssize_t newpos; 8663 Py_ssize_t uni2; 8664 /* startpos for collecting untranslatable chars */ 8665 Py_ssize_t collstart = i; 8666 Py_ssize_t collend = i+1; 8667 Py_ssize_t coll; 8668 8669 /* find all untranslatable characters */ 8670 while (collend < size) { 8671 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x)) 8672 goto onError; 8673 Py_XDECREF(x); 8674 if (x!=Py_None) 8675 break; 8676 ++collend; 8677 } 8678 /* cache callback name lookup 8679 * (if not done yet, i.e. it's the first error) */ 8680 if (known_errorHandler==-1) { 8681 if ((errors==NULL) || (!strcmp(errors, "strict"))) 8682 known_errorHandler = 1; 8683 else if (!strcmp(errors, "replace")) 8684 known_errorHandler = 2; 8685 else if (!strcmp(errors, "ignore")) 8686 known_errorHandler = 3; 8687 else if (!strcmp(errors, "xmlcharrefreplace")) 8688 known_errorHandler = 4; 8689 else 8690 known_errorHandler = 0; 8691 } 8692 switch (known_errorHandler) { 8693 case 1: /* strict */ 8694 raise_translate_exception(&exc, input, collstart, 8695 collend, reason); 8696 goto onError; 8697 case 2: /* replace */ 8698 /* No need to check for space, this is a 1:1 replacement */ 8699 for (coll = collstart; coll<collend; coll++) 8700 output[opos++] = '?'; 8701 /* fall through */ 8702 case 3: /* ignore */ 8703 i = collend; 8704 break; 8705 case 4: /* xmlcharrefreplace */ 8706 /* generate replacement (temporarily (mis)uses i) */ 8707 for (i = collstart; i < collend; ++i) { 8708 char buffer[2+29+1+1]; 8709 char *cp; 8710 sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i)); 8711 if (charmaptranslate_makespace(&output, &osize, 8712 opos+strlen(buffer)+(size-collend))) 8713 goto onError; 8714 for (cp = buffer; *cp; ++cp) 8715 output[opos++] = *cp; 8716 } 8717 i = collend; 8718 break; 8719 default: 8720 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler, 8721 reason, input, &exc, 8722 collstart, collend, &newpos); 8723 if (repunicode == NULL || _PyUnicode_READY_REPLACE(&repunicode)) 8724 goto onError; 8725 /* generate replacement */ 8726 repsize = PyUnicode_GET_LENGTH(repunicode); 8727 if (charmaptranslate_makespace(&output, &osize, 8728 opos+repsize+(size-collend))) { 8729 Py_DECREF(repunicode); 8730 goto onError; 8731 } 8732 for (uni2 = 0; repsize-->0; ++uni2) 8733 output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2); 8734 i = newpos; 8735 Py_DECREF(repunicode); 8736 } 8737 } 8738 } 8739 res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos); 8740 if (!res) 8741 goto onError; 8742 PyMem_Free(output); 8743 Py_XDECREF(exc); 8744 Py_XDECREF(errorHandler); 8745 return res; 8746 8747 onError: 8748 PyMem_Free(output); 8749 Py_XDECREF(exc); 8750 Py_XDECREF(errorHandler); 8751 return NULL; 8752} 8753 8754/* Deprecated. Use PyUnicode_Translate instead. */ 8755PyObject * 8756PyUnicode_TranslateCharmap(const Py_UNICODE *p, 8757 Py_ssize_t size, 8758 PyObject *mapping, 8759 const char *errors) 8760{ 8761 PyObject *unicode = PyUnicode_FromUnicode(p, size); 8762 if (!unicode) 8763 return NULL; 8764 return _PyUnicode_TranslateCharmap(unicode, mapping, errors); 8765} 8766 8767PyObject * 8768PyUnicode_Translate(PyObject *str, 8769 PyObject *mapping, 8770 const char *errors) 8771{ 8772 PyObject *result; 8773 8774 str = PyUnicode_FromObject(str); 8775 if (str == NULL) 8776 goto onError; 8777 result = _PyUnicode_TranslateCharmap(str, mapping, errors); 8778 Py_DECREF(str); 8779 return result; 8780 8781 onError: 8782 Py_XDECREF(str); 8783 return NULL; 8784} 8785 8786static Py_UCS4 8787fix_decimal_and_space_to_ascii(PyObject *self) 8788{ 8789 /* No need to call PyUnicode_READY(self) because this function is only 8790 called as a callback from fixup() which does it already. */ 8791 const Py_ssize_t len = PyUnicode_GET_LENGTH(self); 8792 const int kind = PyUnicode_KIND(self); 8793 void *data = PyUnicode_DATA(self); 8794 Py_UCS4 maxchar = 0, ch, fixed; 8795 Py_ssize_t i; 8796 8797 for (i = 0; i < len; ++i) { 8798 ch = PyUnicode_READ(kind, data, i); 8799 fixed = 0; 8800 if (ch > 127) { 8801 if (Py_UNICODE_ISSPACE(ch)) 8802 fixed = ' '; 8803 else { 8804 const int decimal = Py_UNICODE_TODECIMAL(ch); 8805 if (decimal >= 0) 8806 fixed = '0' + decimal; 8807 } 8808 if (fixed != 0) { 8809 if (fixed > maxchar) 8810 maxchar = fixed; 8811 PyUnicode_WRITE(kind, data, i, fixed); 8812 } 8813 else if (ch > maxchar) 8814 maxchar = ch; 8815 } 8816 else if (ch > maxchar) 8817 maxchar = ch; 8818 } 8819 8820 return maxchar; 8821} 8822 8823PyObject * 8824_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode) 8825{ 8826 if (!PyUnicode_Check(unicode)) { 8827 PyErr_BadInternalCall(); 8828 return NULL; 8829 } 8830 if (PyUnicode_READY(unicode) == -1) 8831 return NULL; 8832 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) { 8833 /* If the string is already ASCII, just return the same string */ 8834 Py_INCREF(unicode); 8835 return unicode; 8836 } 8837 return fixup(unicode, fix_decimal_and_space_to_ascii); 8838} 8839 8840PyObject * 8841PyUnicode_TransformDecimalToASCII(Py_UNICODE *s, 8842 Py_ssize_t length) 8843{ 8844 PyObject *result; 8845 Py_UNICODE *p; /* write pointer into result */ 8846 Py_ssize_t i; 8847 /* Copy to a new string */ 8848 result = (PyObject *)_PyUnicode_New(length); 8849 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(result), s, length); 8850 if (result == NULL) 8851 return result; 8852 p = PyUnicode_AS_UNICODE(result); 8853 /* Iterate over code points */ 8854 for (i = 0; i < length; i++) { 8855 Py_UNICODE ch =s[i]; 8856 if (ch > 127) { 8857 int decimal = Py_UNICODE_TODECIMAL(ch); 8858 if (decimal >= 0) 8859 p[i] = '0' + decimal; 8860 } 8861 } 8862#ifndef DONT_MAKE_RESULT_READY 8863 if (_PyUnicode_READY_REPLACE(&result)) { 8864 Py_DECREF(result); 8865 return NULL; 8866 } 8867#endif 8868 assert(_PyUnicode_CheckConsistency(result, 1)); 8869 return result; 8870} 8871/* --- Decimal Encoder ---------------------------------------------------- */ 8872 8873int 8874PyUnicode_EncodeDecimal(Py_UNICODE *s, 8875 Py_ssize_t length, 8876 char *output, 8877 const char *errors) 8878{ 8879 Py_UNICODE *p, *end; 8880 PyObject *errorHandler = NULL; 8881 PyObject *exc = NULL; 8882 PyObject *unicode; 8883 const char *encoding = "decimal"; 8884 const char *reason = "invalid decimal Unicode string"; 8885 /* the following variable is used for caching string comparisons 8886 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */ 8887 int known_errorHandler = -1; 8888 8889 if (output == NULL) { 8890 PyErr_BadArgument(); 8891 return -1; 8892 } 8893 8894 p = s; 8895 end = s + length; 8896 while (p < end) { 8897 register Py_UNICODE ch = *p; 8898 int decimal; 8899 PyObject *repunicode; 8900 Py_ssize_t repsize; 8901 Py_ssize_t newpos; 8902 Py_UNICODE *uni2; 8903 Py_UNICODE *collstart; 8904 Py_UNICODE *collend; 8905 8906 if (Py_UNICODE_ISSPACE(ch)) { 8907 *output++ = ' '; 8908 ++p; 8909 continue; 8910 } 8911 decimal = Py_UNICODE_TODECIMAL(ch); 8912 if (decimal >= 0) { 8913 *output++ = '0' + decimal; 8914 ++p; 8915 continue; 8916 } 8917 if (0 < ch && ch < 256) { 8918 *output++ = (char)ch; 8919 ++p; 8920 continue; 8921 } 8922 /* All other characters are considered unencodable */ 8923 collstart = p; 8924 collend = p+1; 8925 while (collend < end) { 8926 if ((0 < *collend && *collend < 256) || 8927 !Py_UNICODE_ISSPACE(*collend) || 8928 Py_UNICODE_TODECIMAL(*collend)) 8929 break; 8930 } 8931 /* cache callback name lookup 8932 * (if not done yet, i.e. it's the first error) */ 8933 if (known_errorHandler==-1) { 8934 if ((errors==NULL) || (!strcmp(errors, "strict"))) 8935 known_errorHandler = 1; 8936 else if (!strcmp(errors, "replace")) 8937 known_errorHandler = 2; 8938 else if (!strcmp(errors, "ignore")) 8939 known_errorHandler = 3; 8940 else if (!strcmp(errors, "xmlcharrefreplace")) 8941 known_errorHandler = 4; 8942 else 8943 known_errorHandler = 0; 8944 } 8945 switch (known_errorHandler) { 8946 case 1: /* strict */ 8947 unicode = PyUnicode_FromUnicode(s, length); 8948 if (unicode == NULL) 8949 goto onError; 8950 raise_encode_exception(&exc, encoding, unicode, collstart-s, collend-s, reason); 8951 Py_DECREF(unicode); 8952 goto onError; 8953 case 2: /* replace */ 8954 for (p = collstart; p < collend; ++p) 8955 *output++ = '?'; 8956 /* fall through */ 8957 case 3: /* ignore */ 8958 p = collend; 8959 break; 8960 case 4: /* xmlcharrefreplace */ 8961 /* generate replacement (temporarily (mis)uses p) */ 8962 for (p = collstart; p < collend; ++p) 8963 output += sprintf(output, "&#%d;", (int)*p); 8964 p = collend; 8965 break; 8966 default: 8967 unicode = PyUnicode_FromUnicode(s, length); 8968 if (unicode == NULL) 8969 goto onError; 8970 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler, 8971 encoding, reason, unicode, &exc, 8972 collstart-s, collend-s, &newpos); 8973 Py_DECREF(unicode); 8974 if (repunicode == NULL) 8975 goto onError; 8976 if (!PyUnicode_Check(repunicode)) { 8977 /* Byte results not supported, since they have no decimal property. */ 8978 PyErr_SetString(PyExc_TypeError, "error handler should return unicode"); 8979 Py_DECREF(repunicode); 8980 goto onError; 8981 } 8982 /* generate replacement */ 8983 repsize = PyUnicode_GET_SIZE(repunicode); 8984 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) { 8985 Py_UNICODE ch = *uni2; 8986 if (Py_UNICODE_ISSPACE(ch)) 8987 *output++ = ' '; 8988 else { 8989 decimal = Py_UNICODE_TODECIMAL(ch); 8990 if (decimal >= 0) 8991 *output++ = '0' + decimal; 8992 else if (0 < ch && ch < 256) 8993 *output++ = (char)ch; 8994 else { 8995 Py_DECREF(repunicode); 8996 unicode = PyUnicode_FromUnicode(s, length); 8997 if (unicode == NULL) 8998 goto onError; 8999 raise_encode_exception(&exc, encoding, 9000 unicode, collstart-s, collend-s, reason); 9001 Py_DECREF(unicode); 9002 goto onError; 9003 } 9004 } 9005 } 9006 p = s + newpos; 9007 Py_DECREF(repunicode); 9008 } 9009 } 9010 /* 0-terminate the output string */ 9011 *output++ = '\0'; 9012 Py_XDECREF(exc); 9013 Py_XDECREF(errorHandler); 9014 return 0; 9015 9016 onError: 9017 Py_XDECREF(exc); 9018 Py_XDECREF(errorHandler); 9019 return -1; 9020} 9021 9022/* --- Helpers ------------------------------------------------------------ */ 9023 9024static Py_ssize_t 9025any_find_slice(int direction, PyObject* s1, PyObject* s2, 9026 Py_ssize_t start, 9027 Py_ssize_t end) 9028{ 9029 int kind1, kind2, kind; 9030 void *buf1, *buf2; 9031 Py_ssize_t len1, len2, result; 9032 9033 kind1 = PyUnicode_KIND(s1); 9034 kind2 = PyUnicode_KIND(s2); 9035 kind = kind1 > kind2 ? kind1 : kind2; 9036 buf1 = PyUnicode_DATA(s1); 9037 buf2 = PyUnicode_DATA(s2); 9038 if (kind1 != kind) 9039 buf1 = _PyUnicode_AsKind(s1, kind); 9040 if (!buf1) 9041 return -2; 9042 if (kind2 != kind) 9043 buf2 = _PyUnicode_AsKind(s2, kind); 9044 if (!buf2) { 9045 if (kind1 != kind) PyMem_Free(buf1); 9046 return -2; 9047 } 9048 len1 = PyUnicode_GET_LENGTH(s1); 9049 len2 = PyUnicode_GET_LENGTH(s2); 9050 9051 if (direction > 0) { 9052 switch(kind) { 9053 case PyUnicode_1BYTE_KIND: 9054 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2)) 9055 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end); 9056 else 9057 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end); 9058 break; 9059 case PyUnicode_2BYTE_KIND: 9060 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end); 9061 break; 9062 case PyUnicode_4BYTE_KIND: 9063 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end); 9064 break; 9065 default: 9066 assert(0); result = -2; 9067 } 9068 } 9069 else { 9070 switch(kind) { 9071 case PyUnicode_1BYTE_KIND: 9072 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2)) 9073 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end); 9074 else 9075 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end); 9076 break; 9077 case PyUnicode_2BYTE_KIND: 9078 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end); 9079 break; 9080 case PyUnicode_4BYTE_KIND: 9081 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end); 9082 break; 9083 default: 9084 assert(0); result = -2; 9085 } 9086 } 9087 9088 if (kind1 != kind) 9089 PyMem_Free(buf1); 9090 if (kind2 != kind) 9091 PyMem_Free(buf2); 9092 9093 return result; 9094} 9095 9096Py_ssize_t 9097_PyUnicode_InsertThousandsGrouping(PyObject *unicode, int kind, void *data, 9098 Py_ssize_t n_buffer, 9099 void *digits, Py_ssize_t n_digits, 9100 Py_ssize_t min_width, 9101 const char *grouping, 9102 const char *thousands_sep) 9103{ 9104 switch(kind) { 9105 case PyUnicode_1BYTE_KIND: 9106 if (unicode != NULL && PyUnicode_IS_ASCII(unicode)) 9107 return _PyUnicode_ascii_InsertThousandsGrouping( 9108 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits, 9109 min_width, grouping, thousands_sep); 9110 else 9111 return _PyUnicode_ucs1_InsertThousandsGrouping( 9112 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits, 9113 min_width, grouping, thousands_sep); 9114 case PyUnicode_2BYTE_KIND: 9115 return _PyUnicode_ucs2_InsertThousandsGrouping( 9116 (Py_UCS2*)data, n_buffer, (Py_UCS2*)digits, n_digits, 9117 min_width, grouping, thousands_sep); 9118 case PyUnicode_4BYTE_KIND: 9119 return _PyUnicode_ucs4_InsertThousandsGrouping( 9120 (Py_UCS4*)data, n_buffer, (Py_UCS4*)digits, n_digits, 9121 min_width, grouping, thousands_sep); 9122 } 9123 assert(0); 9124 return -1; 9125} 9126 9127 9128/* helper macro to fixup start/end slice values */ 9129#define ADJUST_INDICES(start, end, len) \ 9130 if (end > len) \ 9131 end = len; \ 9132 else if (end < 0) { \ 9133 end += len; \ 9134 if (end < 0) \ 9135 end = 0; \ 9136 } \ 9137 if (start < 0) { \ 9138 start += len; \ 9139 if (start < 0) \ 9140 start = 0; \ 9141 } 9142 9143Py_ssize_t 9144PyUnicode_Count(PyObject *str, 9145 PyObject *substr, 9146 Py_ssize_t start, 9147 Py_ssize_t end) 9148{ 9149 Py_ssize_t result; 9150 PyObject* str_obj; 9151 PyObject* sub_obj; 9152 int kind1, kind2, kind; 9153 void *buf1 = NULL, *buf2 = NULL; 9154 Py_ssize_t len1, len2; 9155 9156 str_obj = PyUnicode_FromObject(str); 9157 if (!str_obj || PyUnicode_READY(str_obj) == -1) 9158 return -1; 9159 sub_obj = PyUnicode_FromObject(substr); 9160 if (!sub_obj || PyUnicode_READY(sub_obj) == -1) { 9161 Py_DECREF(str_obj); 9162 return -1; 9163 } 9164 9165 kind1 = PyUnicode_KIND(str_obj); 9166 kind2 = PyUnicode_KIND(sub_obj); 9167 kind = kind1 > kind2 ? kind1 : kind2; 9168 buf1 = PyUnicode_DATA(str_obj); 9169 if (kind1 != kind) 9170 buf1 = _PyUnicode_AsKind(str_obj, kind); 9171 if (!buf1) 9172 goto onError; 9173 buf2 = PyUnicode_DATA(sub_obj); 9174 if (kind2 != kind) 9175 buf2 = _PyUnicode_AsKind(sub_obj, kind); 9176 if (!buf2) 9177 goto onError; 9178 len1 = PyUnicode_GET_LENGTH(str_obj); 9179 len2 = PyUnicode_GET_LENGTH(sub_obj); 9180 9181 ADJUST_INDICES(start, end, len1); 9182 switch(kind) { 9183 case PyUnicode_1BYTE_KIND: 9184 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sub_obj)) 9185 result = asciilib_count( 9186 ((Py_UCS1*)buf1) + start, end - start, 9187 buf2, len2, PY_SSIZE_T_MAX 9188 ); 9189 else 9190 result = ucs1lib_count( 9191 ((Py_UCS1*)buf1) + start, end - start, 9192 buf2, len2, PY_SSIZE_T_MAX 9193 ); 9194 break; 9195 case PyUnicode_2BYTE_KIND: 9196 result = ucs2lib_count( 9197 ((Py_UCS2*)buf1) + start, end - start, 9198 buf2, len2, PY_SSIZE_T_MAX 9199 ); 9200 break; 9201 case PyUnicode_4BYTE_KIND: 9202 result = ucs4lib_count( 9203 ((Py_UCS4*)buf1) + start, end - start, 9204 buf2, len2, PY_SSIZE_T_MAX 9205 ); 9206 break; 9207 default: 9208 assert(0); result = 0; 9209 } 9210 9211 Py_DECREF(sub_obj); 9212 Py_DECREF(str_obj); 9213 9214 if (kind1 != kind) 9215 PyMem_Free(buf1); 9216 if (kind2 != kind) 9217 PyMem_Free(buf2); 9218 9219 return result; 9220 onError: 9221 Py_DECREF(sub_obj); 9222 Py_DECREF(str_obj); 9223 if (kind1 != kind && buf1) 9224 PyMem_Free(buf1); 9225 if (kind2 != kind && buf2) 9226 PyMem_Free(buf2); 9227 return -1; 9228} 9229 9230Py_ssize_t 9231PyUnicode_Find(PyObject *str, 9232 PyObject *sub, 9233 Py_ssize_t start, 9234 Py_ssize_t end, 9235 int direction) 9236{ 9237 Py_ssize_t result; 9238 9239 str = PyUnicode_FromObject(str); 9240 if (!str || PyUnicode_READY(str) == -1) 9241 return -2; 9242 sub = PyUnicode_FromObject(sub); 9243 if (!sub || PyUnicode_READY(sub) == -1) { 9244 Py_DECREF(str); 9245 return -2; 9246 } 9247 9248 result = any_find_slice(direction, 9249 str, sub, start, end 9250 ); 9251 9252 Py_DECREF(str); 9253 Py_DECREF(sub); 9254 9255 return result; 9256} 9257 9258Py_ssize_t 9259PyUnicode_FindChar(PyObject *str, Py_UCS4 ch, 9260 Py_ssize_t start, Py_ssize_t end, 9261 int direction) 9262{ 9263 int kind; 9264 Py_ssize_t result; 9265 if (PyUnicode_READY(str) == -1) 9266 return -2; 9267 if (start < 0 || end < 0) { 9268 PyErr_SetString(PyExc_IndexError, "string index out of range"); 9269 return -2; 9270 } 9271 if (end > PyUnicode_GET_LENGTH(str)) 9272 end = PyUnicode_GET_LENGTH(str); 9273 kind = PyUnicode_KIND(str); 9274 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start, 9275 kind, end-start, ch, direction); 9276 if (result == -1) 9277 return -1; 9278 else 9279 return start + result; 9280} 9281 9282static int 9283tailmatch(PyObject *self, 9284 PyObject *substring, 9285 Py_ssize_t start, 9286 Py_ssize_t end, 9287 int direction) 9288{ 9289 int kind_self; 9290 int kind_sub; 9291 void *data_self; 9292 void *data_sub; 9293 Py_ssize_t offset; 9294 Py_ssize_t i; 9295 Py_ssize_t end_sub; 9296 9297 if (PyUnicode_READY(self) == -1 || 9298 PyUnicode_READY(substring) == -1) 9299 return 0; 9300 9301 if (PyUnicode_GET_LENGTH(substring) == 0) 9302 return 1; 9303 9304 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self)); 9305 end -= PyUnicode_GET_LENGTH(substring); 9306 if (end < start) 9307 return 0; 9308 9309 kind_self = PyUnicode_KIND(self); 9310 data_self = PyUnicode_DATA(self); 9311 kind_sub = PyUnicode_KIND(substring); 9312 data_sub = PyUnicode_DATA(substring); 9313 end_sub = PyUnicode_GET_LENGTH(substring) - 1; 9314 9315 if (direction > 0) 9316 offset = end; 9317 else 9318 offset = start; 9319 9320 if (PyUnicode_READ(kind_self, data_self, offset) == 9321 PyUnicode_READ(kind_sub, data_sub, 0) && 9322 PyUnicode_READ(kind_self, data_self, offset + end_sub) == 9323 PyUnicode_READ(kind_sub, data_sub, end_sub)) { 9324 /* If both are of the same kind, memcmp is sufficient */ 9325 if (kind_self == kind_sub) { 9326 return ! memcmp((char *)data_self + 9327 (offset * PyUnicode_KIND(substring)), 9328 data_sub, 9329 PyUnicode_GET_LENGTH(substring) * 9330 PyUnicode_KIND(substring)); 9331 } 9332 /* otherwise we have to compare each character by first accesing it */ 9333 else { 9334 /* We do not need to compare 0 and len(substring)-1 because 9335 the if statement above ensured already that they are equal 9336 when we end up here. */ 9337 // TODO: honor direction and do a forward or backwards search 9338 for (i = 1; i < end_sub; ++i) { 9339 if (PyUnicode_READ(kind_self, data_self, offset + i) != 9340 PyUnicode_READ(kind_sub, data_sub, i)) 9341 return 0; 9342 } 9343 return 1; 9344 } 9345 } 9346 9347 return 0; 9348} 9349 9350Py_ssize_t 9351PyUnicode_Tailmatch(PyObject *str, 9352 PyObject *substr, 9353 Py_ssize_t start, 9354 Py_ssize_t end, 9355 int direction) 9356{ 9357 Py_ssize_t result; 9358 9359 str = PyUnicode_FromObject(str); 9360 if (str == NULL) 9361 return -1; 9362 substr = PyUnicode_FromObject(substr); 9363 if (substr == NULL) { 9364 Py_DECREF(str); 9365 return -1; 9366 } 9367 9368 result = tailmatch(str, substr, 9369 start, end, direction); 9370 Py_DECREF(str); 9371 Py_DECREF(substr); 9372 return result; 9373} 9374 9375/* Apply fixfct filter to the Unicode object self and return a 9376 reference to the modified object */ 9377 9378static PyObject * 9379fixup(PyObject *self, 9380 Py_UCS4 (*fixfct)(PyObject *s)) 9381{ 9382 PyObject *u; 9383 Py_UCS4 maxchar_old, maxchar_new = 0; 9384 9385 if (PyUnicode_READY(self) == -1) 9386 return NULL; 9387 maxchar_old = PyUnicode_MAX_CHAR_VALUE(self); 9388 u = PyUnicode_New(PyUnicode_GET_LENGTH(self), 9389 maxchar_old); 9390 if (u == NULL) 9391 return NULL; 9392 9393 Py_MEMCPY(PyUnicode_1BYTE_DATA(u), PyUnicode_1BYTE_DATA(self), 9394 PyUnicode_GET_LENGTH(u) * PyUnicode_KIND(u)); 9395 9396 /* fix functions return the new maximum character in a string, 9397 if the kind of the resulting unicode object does not change, 9398 everything is fine. Otherwise we need to change the string kind 9399 and re-run the fix function. */ 9400 maxchar_new = fixfct(u); 9401 if (maxchar_new == 0) 9402 /* do nothing, keep maxchar_new at 0 which means no changes. */; 9403 else if (maxchar_new <= 127) 9404 maxchar_new = 127; 9405 else if (maxchar_new <= 255) 9406 maxchar_new = 255; 9407 else if (maxchar_new <= 65535) 9408 maxchar_new = 65535; 9409 else 9410 maxchar_new = 1114111; /* 0x10ffff */ 9411 9412 if (!maxchar_new && PyUnicode_CheckExact(self)) { 9413 /* fixfct should return TRUE if it modified the buffer. If 9414 FALSE, return a reference to the original buffer instead 9415 (to save space, not time) */ 9416 Py_INCREF(self); 9417 Py_DECREF(u); 9418 return self; 9419 } 9420 else if (maxchar_new == maxchar_old) { 9421 return u; 9422 } 9423 else { 9424 /* In case the maximum character changed, we need to 9425 convert the string to the new category. */ 9426 PyObject *v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new); 9427 if (v == NULL) { 9428 Py_DECREF(u); 9429 return NULL; 9430 } 9431 if (maxchar_new > maxchar_old) { 9432 /* If the maxchar increased so that the kind changed, not all 9433 characters are representable anymore and we need to fix the 9434 string again. This only happens in very few cases. */ 9435 copy_characters(v, 0, self, 0, PyUnicode_GET_LENGTH(self)); 9436 maxchar_old = fixfct(v); 9437 assert(maxchar_old > 0 && maxchar_old <= maxchar_new); 9438 } 9439 else { 9440 copy_characters(v, 0, u, 0, PyUnicode_GET_LENGTH(self)); 9441 } 9442 9443 Py_DECREF(u); 9444 assert(_PyUnicode_CheckConsistency(v, 1)); 9445 return v; 9446 } 9447} 9448 9449static Py_UCS4 9450fixupper(PyObject *self) 9451{ 9452 /* No need to call PyUnicode_READY(self) because this function is only 9453 called as a callback from fixup() which does it already. */ 9454 const Py_ssize_t len = PyUnicode_GET_LENGTH(self); 9455 const int kind = PyUnicode_KIND(self); 9456 void *data = PyUnicode_DATA(self); 9457 int touched = 0; 9458 Py_UCS4 maxchar = 0; 9459 Py_ssize_t i; 9460 9461 for (i = 0; i < len; ++i) { 9462 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); 9463 const Py_UCS4 up = Py_UNICODE_TOUPPER(ch); 9464 if (up != ch) { 9465 if (up > maxchar) 9466 maxchar = up; 9467 PyUnicode_WRITE(kind, data, i, up); 9468 touched = 1; 9469 } 9470 else if (ch > maxchar) 9471 maxchar = ch; 9472 } 9473 9474 if (touched) 9475 return maxchar; 9476 else 9477 return 0; 9478} 9479 9480static Py_UCS4 9481fixlower(PyObject *self) 9482{ 9483 /* No need to call PyUnicode_READY(self) because fixup() which does it. */ 9484 const Py_ssize_t len = PyUnicode_GET_LENGTH(self); 9485 const int kind = PyUnicode_KIND(self); 9486 void *data = PyUnicode_DATA(self); 9487 int touched = 0; 9488 Py_UCS4 maxchar = 0; 9489 Py_ssize_t i; 9490 9491 for(i = 0; i < len; ++i) { 9492 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); 9493 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch); 9494 if (lo != ch) { 9495 if (lo > maxchar) 9496 maxchar = lo; 9497 PyUnicode_WRITE(kind, data, i, lo); 9498 touched = 1; 9499 } 9500 else if (ch > maxchar) 9501 maxchar = ch; 9502 } 9503 9504 if (touched) 9505 return maxchar; 9506 else 9507 return 0; 9508} 9509 9510static Py_UCS4 9511fixswapcase(PyObject *self) 9512{ 9513 /* No need to call PyUnicode_READY(self) because fixup() which does it. */ 9514 const Py_ssize_t len = PyUnicode_GET_LENGTH(self); 9515 const int kind = PyUnicode_KIND(self); 9516 void *data = PyUnicode_DATA(self); 9517 int touched = 0; 9518 Py_UCS4 maxchar = 0; 9519 Py_ssize_t i; 9520 9521 for(i = 0; i < len; ++i) { 9522 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); 9523 Py_UCS4 nu = 0; 9524 9525 if (Py_UNICODE_ISUPPER(ch)) 9526 nu = Py_UNICODE_TOLOWER(ch); 9527 else if (Py_UNICODE_ISLOWER(ch)) 9528 nu = Py_UNICODE_TOUPPER(ch); 9529 9530 if (nu != 0) { 9531 if (nu > maxchar) 9532 maxchar = nu; 9533 PyUnicode_WRITE(kind, data, i, nu); 9534 touched = 1; 9535 } 9536 else if (ch > maxchar) 9537 maxchar = ch; 9538 } 9539 9540 if (touched) 9541 return maxchar; 9542 else 9543 return 0; 9544} 9545 9546static Py_UCS4 9547fixcapitalize(PyObject *self) 9548{ 9549 /* No need to call PyUnicode_READY(self) because fixup() which does it. */ 9550 const Py_ssize_t len = PyUnicode_GET_LENGTH(self); 9551 const int kind = PyUnicode_KIND(self); 9552 void *data = PyUnicode_DATA(self); 9553 int touched = 0; 9554 Py_UCS4 maxchar = 0; 9555 Py_ssize_t i = 0; 9556 Py_UCS4 ch; 9557 9558 if (len == 0) 9559 return 0; 9560 9561 ch = PyUnicode_READ(kind, data, i); 9562 if (!Py_UNICODE_ISUPPER(ch)) { 9563 maxchar = Py_UNICODE_TOUPPER(ch); 9564 PyUnicode_WRITE(kind, data, i, maxchar); 9565 touched = 1; 9566 } 9567 ++i; 9568 for(; i < len; ++i) { 9569 ch = PyUnicode_READ(kind, data, i); 9570 if (!Py_UNICODE_ISLOWER(ch)) { 9571 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch); 9572 if (lo > maxchar) 9573 maxchar = lo; 9574 PyUnicode_WRITE(kind, data, i, lo); 9575 touched = 1; 9576 } 9577 else if (ch > maxchar) 9578 maxchar = ch; 9579 } 9580 9581 if (touched) 9582 return maxchar; 9583 else 9584 return 0; 9585} 9586 9587static Py_UCS4 9588fixtitle(PyObject *self) 9589{ 9590 /* No need to call PyUnicode_READY(self) because fixup() which does it. */ 9591 const Py_ssize_t len = PyUnicode_GET_LENGTH(self); 9592 const int kind = PyUnicode_KIND(self); 9593 void *data = PyUnicode_DATA(self); 9594 Py_UCS4 maxchar = 0; 9595 Py_ssize_t i = 0; 9596 int previous_is_cased; 9597 9598 /* Shortcut for single character strings */ 9599 if (len == 1) { 9600 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); 9601 const Py_UCS4 ti = Py_UNICODE_TOTITLE(ch); 9602 if (ti != ch) { 9603 PyUnicode_WRITE(kind, data, i, ti); 9604 return ti; 9605 } 9606 else 9607 return 0; 9608 } 9609 previous_is_cased = 0; 9610 for(; i < len; ++i) { 9611 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); 9612 Py_UCS4 nu; 9613 9614 if (previous_is_cased) 9615 nu = Py_UNICODE_TOLOWER(ch); 9616 else 9617 nu = Py_UNICODE_TOTITLE(ch); 9618 9619 if (nu > maxchar) 9620 maxchar = nu; 9621 PyUnicode_WRITE(kind, data, i, nu); 9622 9623 if (Py_UNICODE_ISLOWER(ch) || 9624 Py_UNICODE_ISUPPER(ch) || 9625 Py_UNICODE_ISTITLE(ch)) 9626 previous_is_cased = 1; 9627 else 9628 previous_is_cased = 0; 9629 } 9630 return maxchar; 9631} 9632 9633PyObject * 9634PyUnicode_Join(PyObject *separator, PyObject *seq) 9635{ 9636 PyObject *sep = NULL; 9637 Py_ssize_t seplen; 9638 PyObject *res = NULL; /* the result */ 9639 PyObject *fseq; /* PySequence_Fast(seq) */ 9640 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */ 9641 PyObject **items; 9642 PyObject *item; 9643 Py_ssize_t sz, i, res_offset; 9644 Py_UCS4 maxchar; 9645 Py_UCS4 item_maxchar; 9646 int use_memcpy; 9647 unsigned char *res_data = NULL, *sep_data = NULL; 9648 PyObject *last_obj; 9649 unsigned int kind = 0; 9650 9651 fseq = PySequence_Fast(seq, ""); 9652 if (fseq == NULL) { 9653 return NULL; 9654 } 9655 9656 /* NOTE: the following code can't call back into Python code, 9657 * so we are sure that fseq won't be mutated. 9658 */ 9659 9660 seqlen = PySequence_Fast_GET_SIZE(fseq); 9661 /* If empty sequence, return u"". */ 9662 if (seqlen == 0) { 9663 Py_DECREF(fseq); 9664 Py_INCREF(unicode_empty); 9665 res = unicode_empty; 9666 return res; 9667 } 9668 9669 /* If singleton sequence with an exact Unicode, return that. */ 9670 last_obj = NULL; 9671 items = PySequence_Fast_ITEMS(fseq); 9672 if (seqlen == 1) { 9673 if (PyUnicode_CheckExact(items[0])) { 9674 res = items[0]; 9675 Py_INCREF(res); 9676 Py_DECREF(fseq); 9677 return res; 9678 } 9679 seplen = 0; 9680 maxchar = 0; 9681 } 9682 else { 9683 /* Set up sep and seplen */ 9684 if (separator == NULL) { 9685 /* fall back to a blank space separator */ 9686 sep = PyUnicode_FromOrdinal(' '); 9687 if (!sep) 9688 goto onError; 9689 seplen = 1; 9690 maxchar = 32; 9691 } 9692 else { 9693 if (!PyUnicode_Check(separator)) { 9694 PyErr_Format(PyExc_TypeError, 9695 "separator: expected str instance," 9696 " %.80s found", 9697 Py_TYPE(separator)->tp_name); 9698 goto onError; 9699 } 9700 if (PyUnicode_READY(separator)) 9701 goto onError; 9702 sep = separator; 9703 seplen = PyUnicode_GET_LENGTH(separator); 9704 maxchar = PyUnicode_MAX_CHAR_VALUE(separator); 9705 /* inc refcount to keep this code path symmetric with the 9706 above case of a blank separator */ 9707 Py_INCREF(sep); 9708 } 9709 last_obj = sep; 9710 } 9711 9712 /* There are at least two things to join, or else we have a subclass 9713 * of str in the sequence. 9714 * Do a pre-pass to figure out the total amount of space we'll 9715 * need (sz), and see whether all argument are strings. 9716 */ 9717 sz = 0; 9718#ifdef Py_DEBUG 9719 use_memcpy = 0; 9720#else 9721 use_memcpy = 1; 9722#endif 9723 for (i = 0; i < seqlen; i++) { 9724 const Py_ssize_t old_sz = sz; 9725 item = items[i]; 9726 if (!PyUnicode_Check(item)) { 9727 PyErr_Format(PyExc_TypeError, 9728 "sequence item %zd: expected str instance," 9729 " %.80s found", 9730 i, Py_TYPE(item)->tp_name); 9731 goto onError; 9732 } 9733 if (PyUnicode_READY(item) == -1) 9734 goto onError; 9735 sz += PyUnicode_GET_LENGTH(item); 9736 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item); 9737 maxchar = Py_MAX(maxchar, item_maxchar); 9738 if (i != 0) 9739 sz += seplen; 9740 if (sz < old_sz || sz > PY_SSIZE_T_MAX) { 9741 PyErr_SetString(PyExc_OverflowError, 9742 "join() result is too long for a Python string"); 9743 goto onError; 9744 } 9745 if (use_memcpy && last_obj != NULL) { 9746 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item)) 9747 use_memcpy = 0; 9748 } 9749 last_obj = item; 9750 } 9751 9752 res = PyUnicode_New(sz, maxchar); 9753 if (res == NULL) 9754 goto onError; 9755 9756 /* Catenate everything. */ 9757#ifdef Py_DEBUG 9758 use_memcpy = 0; 9759#else 9760 if (use_memcpy) { 9761 res_data = PyUnicode_1BYTE_DATA(res); 9762 kind = PyUnicode_KIND(res); 9763 if (seplen != 0) 9764 sep_data = PyUnicode_1BYTE_DATA(sep); 9765 } 9766#endif 9767 for (i = 0, res_offset = 0; i < seqlen; ++i) { 9768 Py_ssize_t itemlen; 9769 item = items[i]; 9770 /* Copy item, and maybe the separator. */ 9771 if (i && seplen != 0) { 9772 if (use_memcpy) { 9773 Py_MEMCPY(res_data, 9774 sep_data, 9775 kind * seplen); 9776 res_data += kind * seplen; 9777 } 9778 else { 9779 copy_characters(res, res_offset, sep, 0, seplen); 9780 res_offset += seplen; 9781 } 9782 } 9783 itemlen = PyUnicode_GET_LENGTH(item); 9784 if (itemlen != 0) { 9785 if (use_memcpy) { 9786 Py_MEMCPY(res_data, 9787 PyUnicode_DATA(item), 9788 kind * itemlen); 9789 res_data += kind * itemlen; 9790 } 9791 else { 9792 copy_characters(res, res_offset, item, 0, itemlen); 9793 res_offset += itemlen; 9794 } 9795 } 9796 } 9797 if (use_memcpy) 9798 assert(res_data == PyUnicode_1BYTE_DATA(res) 9799 + kind * PyUnicode_GET_LENGTH(res)); 9800 else 9801 assert(res_offset == PyUnicode_GET_LENGTH(res)); 9802 9803 Py_DECREF(fseq); 9804 Py_XDECREF(sep); 9805 assert(_PyUnicode_CheckConsistency(res, 1)); 9806 return res; 9807 9808 onError: 9809 Py_DECREF(fseq); 9810 Py_XDECREF(sep); 9811 Py_XDECREF(res); 9812 return NULL; 9813} 9814 9815#define FILL(kind, data, value, start, length) \ 9816 do { \ 9817 Py_ssize_t i_ = 0; \ 9818 assert(kind != PyUnicode_WCHAR_KIND); \ 9819 switch ((kind)) { \ 9820 case PyUnicode_1BYTE_KIND: { \ 9821 unsigned char * to_ = (unsigned char *)((data)) + (start); \ 9822 memset(to_, (unsigned char)value, length); \ 9823 break; \ 9824 } \ 9825 case PyUnicode_2BYTE_KIND: { \ 9826 Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \ 9827 for (; i_ < (length); ++i_, ++to_) *to_ = (value); \ 9828 break; \ 9829 } \ 9830 default: { \ 9831 Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \ 9832 for (; i_ < (length); ++i_, ++to_) *to_ = (value); \ 9833 break; \ 9834 } \ 9835 } \ 9836 } while (0) 9837 9838static PyObject * 9839pad(PyObject *self, 9840 Py_ssize_t left, 9841 Py_ssize_t right, 9842 Py_UCS4 fill) 9843{ 9844 PyObject *u; 9845 Py_UCS4 maxchar; 9846 int kind; 9847 void *data; 9848 9849 if (left < 0) 9850 left = 0; 9851 if (right < 0) 9852 right = 0; 9853 9854 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) { 9855 Py_INCREF(self); 9856 return self; 9857 } 9858 9859 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) || 9860 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) { 9861 PyErr_SetString(PyExc_OverflowError, "padded string is too long"); 9862 return NULL; 9863 } 9864 maxchar = PyUnicode_MAX_CHAR_VALUE(self); 9865 if (fill > maxchar) 9866 maxchar = fill; 9867 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar); 9868 if (!u) 9869 return NULL; 9870 9871 kind = PyUnicode_KIND(u); 9872 data = PyUnicode_DATA(u); 9873 if (left) 9874 FILL(kind, data, fill, 0, left); 9875 if (right) 9876 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right); 9877 copy_characters(u, left, self, 0, _PyUnicode_LENGTH(self)); 9878 assert(_PyUnicode_CheckConsistency(u, 1)); 9879 return u; 9880} 9881#undef FILL 9882 9883PyObject * 9884PyUnicode_Splitlines(PyObject *string, int keepends) 9885{ 9886 PyObject *list; 9887 9888 string = PyUnicode_FromObject(string); 9889 if (string == NULL || PyUnicode_READY(string) == -1) 9890 return NULL; 9891 9892 switch(PyUnicode_KIND(string)) { 9893 case PyUnicode_1BYTE_KIND: 9894 if (PyUnicode_IS_ASCII(string)) 9895 list = asciilib_splitlines( 9896 string, PyUnicode_1BYTE_DATA(string), 9897 PyUnicode_GET_LENGTH(string), keepends); 9898 else 9899 list = ucs1lib_splitlines( 9900 string, PyUnicode_1BYTE_DATA(string), 9901 PyUnicode_GET_LENGTH(string), keepends); 9902 break; 9903 case PyUnicode_2BYTE_KIND: 9904 list = ucs2lib_splitlines( 9905 string, PyUnicode_2BYTE_DATA(string), 9906 PyUnicode_GET_LENGTH(string), keepends); 9907 break; 9908 case PyUnicode_4BYTE_KIND: 9909 list = ucs4lib_splitlines( 9910 string, PyUnicode_4BYTE_DATA(string), 9911 PyUnicode_GET_LENGTH(string), keepends); 9912 break; 9913 default: 9914 assert(0); 9915 list = 0; 9916 } 9917 Py_DECREF(string); 9918 return list; 9919} 9920 9921static PyObject * 9922split(PyObject *self, 9923 PyObject *substring, 9924 Py_ssize_t maxcount) 9925{ 9926 int kind1, kind2, kind; 9927 void *buf1, *buf2; 9928 Py_ssize_t len1, len2; 9929 PyObject* out; 9930 9931 if (maxcount < 0) 9932 maxcount = PY_SSIZE_T_MAX; 9933 9934 if (PyUnicode_READY(self) == -1) 9935 return NULL; 9936 9937 if (substring == NULL) 9938 switch(PyUnicode_KIND(self)) { 9939 case PyUnicode_1BYTE_KIND: 9940 if (PyUnicode_IS_ASCII(self)) 9941 return asciilib_split_whitespace( 9942 self, PyUnicode_1BYTE_DATA(self), 9943 PyUnicode_GET_LENGTH(self), maxcount 9944 ); 9945 else 9946 return ucs1lib_split_whitespace( 9947 self, PyUnicode_1BYTE_DATA(self), 9948 PyUnicode_GET_LENGTH(self), maxcount 9949 ); 9950 case PyUnicode_2BYTE_KIND: 9951 return ucs2lib_split_whitespace( 9952 self, PyUnicode_2BYTE_DATA(self), 9953 PyUnicode_GET_LENGTH(self), maxcount 9954 ); 9955 case PyUnicode_4BYTE_KIND: 9956 return ucs4lib_split_whitespace( 9957 self, PyUnicode_4BYTE_DATA(self), 9958 PyUnicode_GET_LENGTH(self), maxcount 9959 ); 9960 default: 9961 assert(0); 9962 return NULL; 9963 } 9964 9965 if (PyUnicode_READY(substring) == -1) 9966 return NULL; 9967 9968 kind1 = PyUnicode_KIND(self); 9969 kind2 = PyUnicode_KIND(substring); 9970 kind = kind1 > kind2 ? kind1 : kind2; 9971 buf1 = PyUnicode_DATA(self); 9972 buf2 = PyUnicode_DATA(substring); 9973 if (kind1 != kind) 9974 buf1 = _PyUnicode_AsKind(self, kind); 9975 if (!buf1) 9976 return NULL; 9977 if (kind2 != kind) 9978 buf2 = _PyUnicode_AsKind(substring, kind); 9979 if (!buf2) { 9980 if (kind1 != kind) PyMem_Free(buf1); 9981 return NULL; 9982 } 9983 len1 = PyUnicode_GET_LENGTH(self); 9984 len2 = PyUnicode_GET_LENGTH(substring); 9985 9986 switch(kind) { 9987 case PyUnicode_1BYTE_KIND: 9988 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring)) 9989 out = asciilib_split( 9990 self, buf1, len1, buf2, len2, maxcount); 9991 else 9992 out = ucs1lib_split( 9993 self, buf1, len1, buf2, len2, maxcount); 9994 break; 9995 case PyUnicode_2BYTE_KIND: 9996 out = ucs2lib_split( 9997 self, buf1, len1, buf2, len2, maxcount); 9998 break; 9999 case PyUnicode_4BYTE_KIND: 10000 out = ucs4lib_split( 10001 self, buf1, len1, buf2, len2, maxcount); 10002 break; 10003 default: 10004 out = NULL; 10005 } 10006 if (kind1 != kind) 10007 PyMem_Free(buf1); 10008 if (kind2 != kind) 10009 PyMem_Free(buf2); 10010 return out; 10011} 10012 10013static PyObject * 10014rsplit(PyObject *self, 10015 PyObject *substring, 10016 Py_ssize_t maxcount) 10017{ 10018 int kind1, kind2, kind; 10019 void *buf1, *buf2; 10020 Py_ssize_t len1, len2; 10021 PyObject* out; 10022 10023 if (maxcount < 0) 10024 maxcount = PY_SSIZE_T_MAX; 10025 10026 if (PyUnicode_READY(self) == -1) 10027 return NULL; 10028 10029 if (substring == NULL) 10030 switch(PyUnicode_KIND(self)) { 10031 case PyUnicode_1BYTE_KIND: 10032 if (PyUnicode_IS_ASCII(self)) 10033 return asciilib_rsplit_whitespace( 10034 self, PyUnicode_1BYTE_DATA(self), 10035 PyUnicode_GET_LENGTH(self), maxcount 10036 ); 10037 else 10038 return ucs1lib_rsplit_whitespace( 10039 self, PyUnicode_1BYTE_DATA(self), 10040 PyUnicode_GET_LENGTH(self), maxcount 10041 ); 10042 case PyUnicode_2BYTE_KIND: 10043 return ucs2lib_rsplit_whitespace( 10044 self, PyUnicode_2BYTE_DATA(self), 10045 PyUnicode_GET_LENGTH(self), maxcount 10046 ); 10047 case PyUnicode_4BYTE_KIND: 10048 return ucs4lib_rsplit_whitespace( 10049 self, PyUnicode_4BYTE_DATA(self), 10050 PyUnicode_GET_LENGTH(self), maxcount 10051 ); 10052 default: 10053 assert(0); 10054 return NULL; 10055 } 10056 10057 if (PyUnicode_READY(substring) == -1) 10058 return NULL; 10059 10060 kind1 = PyUnicode_KIND(self); 10061 kind2 = PyUnicode_KIND(substring); 10062 kind = kind1 > kind2 ? kind1 : kind2; 10063 buf1 = PyUnicode_DATA(self); 10064 buf2 = PyUnicode_DATA(substring); 10065 if (kind1 != kind) 10066 buf1 = _PyUnicode_AsKind(self, kind); 10067 if (!buf1) 10068 return NULL; 10069 if (kind2 != kind) 10070 buf2 = _PyUnicode_AsKind(substring, kind); 10071 if (!buf2) { 10072 if (kind1 != kind) PyMem_Free(buf1); 10073 return NULL; 10074 } 10075 len1 = PyUnicode_GET_LENGTH(self); 10076 len2 = PyUnicode_GET_LENGTH(substring); 10077 10078 switch(kind) { 10079 case PyUnicode_1BYTE_KIND: 10080 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring)) 10081 out = asciilib_rsplit( 10082 self, buf1, len1, buf2, len2, maxcount); 10083 else 10084 out = ucs1lib_rsplit( 10085 self, buf1, len1, buf2, len2, maxcount); 10086 break; 10087 case PyUnicode_2BYTE_KIND: 10088 out = ucs2lib_rsplit( 10089 self, buf1, len1, buf2, len2, maxcount); 10090 break; 10091 case PyUnicode_4BYTE_KIND: 10092 out = ucs4lib_rsplit( 10093 self, buf1, len1, buf2, len2, maxcount); 10094 break; 10095 default: 10096 out = NULL; 10097 } 10098 if (kind1 != kind) 10099 PyMem_Free(buf1); 10100 if (kind2 != kind) 10101 PyMem_Free(buf2); 10102 return out; 10103} 10104 10105static Py_ssize_t 10106anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1, 10107 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset) 10108{ 10109 switch(kind) { 10110 case PyUnicode_1BYTE_KIND: 10111 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2)) 10112 return asciilib_find(buf1, len1, buf2, len2, offset); 10113 else 10114 return ucs1lib_find(buf1, len1, buf2, len2, offset); 10115 case PyUnicode_2BYTE_KIND: 10116 return ucs2lib_find(buf1, len1, buf2, len2, offset); 10117 case PyUnicode_4BYTE_KIND: 10118 return ucs4lib_find(buf1, len1, buf2, len2, offset); 10119 } 10120 assert(0); 10121 return -1; 10122} 10123 10124static Py_ssize_t 10125anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen, 10126 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount) 10127{ 10128 switch(kind) { 10129 case PyUnicode_1BYTE_KIND: 10130 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1)) 10131 return asciilib_count(sbuf, slen, buf1, len1, maxcount); 10132 else 10133 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount); 10134 case PyUnicode_2BYTE_KIND: 10135 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount); 10136 case PyUnicode_4BYTE_KIND: 10137 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount); 10138 } 10139 assert(0); 10140 return 0; 10141} 10142 10143static PyObject * 10144replace(PyObject *self, PyObject *str1, 10145 PyObject *str2, Py_ssize_t maxcount) 10146{ 10147 PyObject *u; 10148 char *sbuf = PyUnicode_DATA(self); 10149 char *buf1 = PyUnicode_DATA(str1); 10150 char *buf2 = PyUnicode_DATA(str2); 10151 int srelease = 0, release1 = 0, release2 = 0; 10152 int skind = PyUnicode_KIND(self); 10153 int kind1 = PyUnicode_KIND(str1); 10154 int kind2 = PyUnicode_KIND(str2); 10155 Py_ssize_t slen = PyUnicode_GET_LENGTH(self); 10156 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1); 10157 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2); 10158 int mayshrink; 10159 Py_UCS4 maxchar, maxchar_str2; 10160 10161 if (maxcount < 0) 10162 maxcount = PY_SSIZE_T_MAX; 10163 else if (maxcount == 0 || slen == 0) 10164 goto nothing; 10165 10166 if (str1 == str2) 10167 goto nothing; 10168 if (skind < kind1) 10169 /* substring too wide to be present */ 10170 goto nothing; 10171 10172 maxchar = PyUnicode_MAX_CHAR_VALUE(self); 10173 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2); 10174 /* Replacing str1 with str2 may cause a maxchar reduction in the 10175 result string. */ 10176 mayshrink = (maxchar_str2 < maxchar); 10177 maxchar = Py_MAX(maxchar, maxchar_str2); 10178 10179 if (len1 == len2) { 10180 Py_ssize_t i; 10181 /* same length */ 10182 if (len1 == 0) 10183 goto nothing; 10184 if (len1 == 1) { 10185 /* replace characters */ 10186 Py_UCS4 u1, u2; 10187 int rkind; 10188 u1 = PyUnicode_READ_CHAR(str1, 0); 10189 if (findchar(sbuf, PyUnicode_KIND(self), 10190 slen, u1, 1) < 0) 10191 goto nothing; 10192 u2 = PyUnicode_READ_CHAR(str2, 0); 10193 u = PyUnicode_New(slen, maxchar); 10194 if (!u) 10195 goto error; 10196 copy_characters(u, 0, self, 0, slen); 10197 rkind = PyUnicode_KIND(u); 10198 for (i = 0; i < PyUnicode_GET_LENGTH(u); i++) 10199 if (PyUnicode_READ(rkind, PyUnicode_DATA(u), i) == u1) { 10200 if (--maxcount < 0) 10201 break; 10202 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), i, u2); 10203 } 10204 } 10205 else { 10206 int rkind = skind; 10207 char *res; 10208 10209 if (kind1 < rkind) { 10210 /* widen substring */ 10211 buf1 = _PyUnicode_AsKind(str1, rkind); 10212 if (!buf1) goto error; 10213 release1 = 1; 10214 } 10215 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0); 10216 if (i < 0) 10217 goto nothing; 10218 if (rkind > kind2) { 10219 /* widen replacement */ 10220 buf2 = _PyUnicode_AsKind(str2, rkind); 10221 if (!buf2) goto error; 10222 release2 = 1; 10223 } 10224 else if (rkind < kind2) { 10225 /* widen self and buf1 */ 10226 rkind = kind2; 10227 if (release1) PyMem_Free(buf1); 10228 sbuf = _PyUnicode_AsKind(self, rkind); 10229 if (!sbuf) goto error; 10230 srelease = 1; 10231 buf1 = _PyUnicode_AsKind(str1, rkind); 10232 if (!buf1) goto error; 10233 release1 = 1; 10234 } 10235 u = PyUnicode_New(slen, maxchar); 10236 if (!u) 10237 goto error; 10238 assert(PyUnicode_KIND(u) == rkind); 10239 res = PyUnicode_DATA(u); 10240 10241 memcpy(res, sbuf, rkind * slen); 10242 /* change everything in-place, starting with this one */ 10243 memcpy(res + rkind * i, 10244 buf2, 10245 rkind * len2); 10246 i += len1; 10247 10248 while ( --maxcount > 0) { 10249 i = anylib_find(rkind, self, 10250 sbuf+rkind*i, slen-i, 10251 str1, buf1, len1, i); 10252 if (i == -1) 10253 break; 10254 memcpy(res + rkind * i, 10255 buf2, 10256 rkind * len2); 10257 i += len1; 10258 } 10259 } 10260 } 10261 else { 10262 Py_ssize_t n, i, j, ires; 10263 Py_ssize_t product, new_size; 10264 int rkind = skind; 10265 char *res; 10266 10267 if (kind1 < rkind) { 10268 /* widen substring */ 10269 buf1 = _PyUnicode_AsKind(str1, rkind); 10270 if (!buf1) goto error; 10271 release1 = 1; 10272 } 10273 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount); 10274 if (n == 0) 10275 goto nothing; 10276 if (kind2 < rkind) { 10277 /* widen replacement */ 10278 buf2 = _PyUnicode_AsKind(str2, rkind); 10279 if (!buf2) goto error; 10280 release2 = 1; 10281 } 10282 else if (kind2 > rkind) { 10283 /* widen self and buf1 */ 10284 rkind = kind2; 10285 sbuf = _PyUnicode_AsKind(self, rkind); 10286 if (!sbuf) goto error; 10287 srelease = 1; 10288 if (release1) PyMem_Free(buf1); 10289 buf1 = _PyUnicode_AsKind(str1, rkind); 10290 if (!buf1) goto error; 10291 release1 = 1; 10292 } 10293 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) - 10294 PyUnicode_GET_LENGTH(str1))); */ 10295 product = n * (len2-len1); 10296 if ((product / (len2-len1)) != n) { 10297 PyErr_SetString(PyExc_OverflowError, 10298 "replace string is too long"); 10299 goto error; 10300 } 10301 new_size = slen + product; 10302 if (new_size == 0) { 10303 Py_INCREF(unicode_empty); 10304 u = unicode_empty; 10305 goto done; 10306 } 10307 if (new_size < 0 || new_size > (PY_SSIZE_T_MAX >> (rkind-1))) { 10308 PyErr_SetString(PyExc_OverflowError, 10309 "replace string is too long"); 10310 goto error; 10311 } 10312 u = PyUnicode_New(new_size, maxchar); 10313 if (!u) 10314 goto error; 10315 assert(PyUnicode_KIND(u) == rkind); 10316 res = PyUnicode_DATA(u); 10317 ires = i = 0; 10318 if (len1 > 0) { 10319 while (n-- > 0) { 10320 /* look for next match */ 10321 j = anylib_find(rkind, self, 10322 sbuf + rkind * i, slen-i, 10323 str1, buf1, len1, i); 10324 if (j == -1) 10325 break; 10326 else if (j > i) { 10327 /* copy unchanged part [i:j] */ 10328 memcpy(res + rkind * ires, 10329 sbuf + rkind * i, 10330 rkind * (j-i)); 10331 ires += j - i; 10332 } 10333 /* copy substitution string */ 10334 if (len2 > 0) { 10335 memcpy(res + rkind * ires, 10336 buf2, 10337 rkind * len2); 10338 ires += len2; 10339 } 10340 i = j + len1; 10341 } 10342 if (i < slen) 10343 /* copy tail [i:] */ 10344 memcpy(res + rkind * ires, 10345 sbuf + rkind * i, 10346 rkind * (slen-i)); 10347 } 10348 else { 10349 /* interleave */ 10350 while (n > 0) { 10351 memcpy(res + rkind * ires, 10352 buf2, 10353 rkind * len2); 10354 ires += len2; 10355 if (--n <= 0) 10356 break; 10357 memcpy(res + rkind * ires, 10358 sbuf + rkind * i, 10359 rkind); 10360 ires++; 10361 i++; 10362 } 10363 memcpy(res + rkind * ires, 10364 sbuf + rkind * i, 10365 rkind * (slen-i)); 10366 } 10367 } 10368 10369 if (mayshrink) { 10370 unicode_adjust_maxchar(&u); 10371 if (u == NULL) 10372 goto error; 10373 } 10374 10375 done: 10376 if (srelease) 10377 PyMem_FREE(sbuf); 10378 if (release1) 10379 PyMem_FREE(buf1); 10380 if (release2) 10381 PyMem_FREE(buf2); 10382 assert(_PyUnicode_CheckConsistency(u, 1)); 10383 return u; 10384 10385 nothing: 10386 /* nothing to replace; return original string (when possible) */ 10387 if (srelease) 10388 PyMem_FREE(sbuf); 10389 if (release1) 10390 PyMem_FREE(buf1); 10391 if (release2) 10392 PyMem_FREE(buf2); 10393 if (PyUnicode_CheckExact(self)) { 10394 Py_INCREF(self); 10395 return self; 10396 } 10397 return PyUnicode_Copy(self); 10398 error: 10399 if (srelease && sbuf) 10400 PyMem_FREE(sbuf); 10401 if (release1 && buf1) 10402 PyMem_FREE(buf1); 10403 if (release2 && buf2) 10404 PyMem_FREE(buf2); 10405 return NULL; 10406} 10407 10408/* --- Unicode Object Methods --------------------------------------------- */ 10409 10410PyDoc_STRVAR(title__doc__, 10411 "S.title() -> str\n\ 10412\n\ 10413Return a titlecased version of S, i.e. words start with title case\n\ 10414characters, all remaining cased characters have lower case."); 10415 10416static PyObject* 10417unicode_title(PyObject *self) 10418{ 10419 return fixup(self, fixtitle); 10420} 10421 10422PyDoc_STRVAR(capitalize__doc__, 10423 "S.capitalize() -> str\n\ 10424\n\ 10425Return a capitalized version of S, i.e. make the first character\n\ 10426have upper case and the rest lower case."); 10427 10428static PyObject* 10429unicode_capitalize(PyObject *self) 10430{ 10431 return fixup(self, fixcapitalize); 10432} 10433 10434#if 0 10435PyDoc_STRVAR(capwords__doc__, 10436 "S.capwords() -> str\n\ 10437\n\ 10438Apply .capitalize() to all words in S and return the result with\n\ 10439normalized whitespace (all whitespace strings are replaced by ' ')."); 10440 10441static PyObject* 10442unicode_capwords(PyObject *self) 10443{ 10444 PyObject *list; 10445 PyObject *item; 10446 Py_ssize_t i; 10447 10448 /* Split into words */ 10449 list = split(self, NULL, -1); 10450 if (!list) 10451 return NULL; 10452 10453 /* Capitalize each word */ 10454 for (i = 0; i < PyList_GET_SIZE(list); i++) { 10455 item = fixup(PyList_GET_ITEM(list, i), 10456 fixcapitalize); 10457 if (item == NULL) 10458 goto onError; 10459 Py_DECREF(PyList_GET_ITEM(list, i)); 10460 PyList_SET_ITEM(list, i, item); 10461 } 10462 10463 /* Join the words to form a new string */ 10464 item = PyUnicode_Join(NULL, list); 10465 10466 onError: 10467 Py_DECREF(list); 10468 return item; 10469} 10470#endif 10471 10472/* Argument converter. Coerces to a single unicode character */ 10473 10474static int 10475convert_uc(PyObject *obj, void *addr) 10476{ 10477 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr; 10478 PyObject *uniobj; 10479 10480 uniobj = PyUnicode_FromObject(obj); 10481 if (uniobj == NULL) { 10482 PyErr_SetString(PyExc_TypeError, 10483 "The fill character cannot be converted to Unicode"); 10484 return 0; 10485 } 10486 if (PyUnicode_GET_LENGTH(uniobj) != 1) { 10487 PyErr_SetString(PyExc_TypeError, 10488 "The fill character must be exactly one character long"); 10489 Py_DECREF(uniobj); 10490 return 0; 10491 } 10492 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0); 10493 Py_DECREF(uniobj); 10494 return 1; 10495} 10496 10497PyDoc_STRVAR(center__doc__, 10498 "S.center(width[, fillchar]) -> str\n\ 10499\n\ 10500Return S centered in a string of length width. Padding is\n\ 10501done using the specified fill character (default is a space)"); 10502 10503static PyObject * 10504unicode_center(PyObject *self, PyObject *args) 10505{ 10506 Py_ssize_t marg, left; 10507 Py_ssize_t width; 10508 Py_UCS4 fillchar = ' '; 10509 10510 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar)) 10511 return NULL; 10512 10513 if (PyUnicode_READY(self) == -1) 10514 return NULL; 10515 10516 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) { 10517 Py_INCREF(self); 10518 return self; 10519 } 10520 10521 marg = width - _PyUnicode_LENGTH(self); 10522 left = marg / 2 + (marg & width & 1); 10523 10524 return pad(self, left, marg - left, fillchar); 10525} 10526 10527/* This function assumes that str1 and str2 are readied by the caller. */ 10528 10529static int 10530unicode_compare(PyObject *str1, PyObject *str2) 10531{ 10532 int kind1, kind2; 10533 void *data1, *data2; 10534 Py_ssize_t len1, len2, i; 10535 10536 kind1 = PyUnicode_KIND(str1); 10537 kind2 = PyUnicode_KIND(str2); 10538 data1 = PyUnicode_DATA(str1); 10539 data2 = PyUnicode_DATA(str2); 10540 len1 = PyUnicode_GET_LENGTH(str1); 10541 len2 = PyUnicode_GET_LENGTH(str2); 10542 10543 for (i = 0; i < len1 && i < len2; ++i) { 10544 Py_UCS4 c1, c2; 10545 c1 = PyUnicode_READ(kind1, data1, i); 10546 c2 = PyUnicode_READ(kind2, data2, i); 10547 10548 if (c1 != c2) 10549 return (c1 < c2) ? -1 : 1; 10550 } 10551 10552 return (len1 < len2) ? -1 : (len1 != len2); 10553} 10554 10555int 10556PyUnicode_Compare(PyObject *left, PyObject *right) 10557{ 10558 if (PyUnicode_Check(left) && PyUnicode_Check(right)) { 10559 if (PyUnicode_READY(left) == -1 || 10560 PyUnicode_READY(right) == -1) 10561 return -1; 10562 return unicode_compare(left, right); 10563 } 10564 PyErr_Format(PyExc_TypeError, 10565 "Can't compare %.100s and %.100s", 10566 left->ob_type->tp_name, 10567 right->ob_type->tp_name); 10568 return -1; 10569} 10570 10571int 10572PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str) 10573{ 10574 Py_ssize_t i; 10575 int kind; 10576 void *data; 10577 Py_UCS4 chr; 10578 10579 assert(_PyUnicode_CHECK(uni)); 10580 if (PyUnicode_READY(uni) == -1) 10581 return -1; 10582 kind = PyUnicode_KIND(uni); 10583 data = PyUnicode_DATA(uni); 10584 /* Compare Unicode string and source character set string */ 10585 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++) 10586 if (chr != str[i]) 10587 return (chr < (unsigned char)(str[i])) ? -1 : 1; 10588 /* This check keeps Python strings that end in '\0' from comparing equal 10589 to C strings identical up to that point. */ 10590 if (PyUnicode_GET_LENGTH(uni) != i || chr) 10591 return 1; /* uni is longer */ 10592 if (str[i]) 10593 return -1; /* str is longer */ 10594 return 0; 10595} 10596 10597 10598#define TEST_COND(cond) \ 10599 ((cond) ? Py_True : Py_False) 10600 10601PyObject * 10602PyUnicode_RichCompare(PyObject *left, PyObject *right, int op) 10603{ 10604 int result; 10605 10606 if (PyUnicode_Check(left) && PyUnicode_Check(right)) { 10607 PyObject *v; 10608 if (PyUnicode_READY(left) == -1 || 10609 PyUnicode_READY(right) == -1) 10610 return NULL; 10611 if (PyUnicode_GET_LENGTH(left) != PyUnicode_GET_LENGTH(right) || 10612 PyUnicode_KIND(left) != PyUnicode_KIND(right)) { 10613 if (op == Py_EQ) { 10614 Py_INCREF(Py_False); 10615 return Py_False; 10616 } 10617 if (op == Py_NE) { 10618 Py_INCREF(Py_True); 10619 return Py_True; 10620 } 10621 } 10622 if (left == right) 10623 result = 0; 10624 else 10625 result = unicode_compare(left, right); 10626 10627 /* Convert the return value to a Boolean */ 10628 switch (op) { 10629 case Py_EQ: 10630 v = TEST_COND(result == 0); 10631 break; 10632 case Py_NE: 10633 v = TEST_COND(result != 0); 10634 break; 10635 case Py_LE: 10636 v = TEST_COND(result <= 0); 10637 break; 10638 case Py_GE: 10639 v = TEST_COND(result >= 0); 10640 break; 10641 case Py_LT: 10642 v = TEST_COND(result == -1); 10643 break; 10644 case Py_GT: 10645 v = TEST_COND(result == 1); 10646 break; 10647 default: 10648 PyErr_BadArgument(); 10649 return NULL; 10650 } 10651 Py_INCREF(v); 10652 return v; 10653 } 10654 10655 Py_RETURN_NOTIMPLEMENTED; 10656} 10657 10658int 10659PyUnicode_Contains(PyObject *container, PyObject *element) 10660{ 10661 PyObject *str, *sub; 10662 int kind1, kind2, kind; 10663 void *buf1, *buf2; 10664 Py_ssize_t len1, len2; 10665 int result; 10666 10667 /* Coerce the two arguments */ 10668 sub = PyUnicode_FromObject(element); 10669 if (!sub) { 10670 PyErr_Format(PyExc_TypeError, 10671 "'in <string>' requires string as left operand, not %s", 10672 element->ob_type->tp_name); 10673 return -1; 10674 } 10675 if (PyUnicode_READY(sub) == -1) 10676 return -1; 10677 10678 str = PyUnicode_FromObject(container); 10679 if (!str || PyUnicode_READY(str) == -1) { 10680 Py_DECREF(sub); 10681 return -1; 10682 } 10683 10684 kind1 = PyUnicode_KIND(str); 10685 kind2 = PyUnicode_KIND(sub); 10686 kind = kind1 > kind2 ? kind1 : kind2; 10687 buf1 = PyUnicode_DATA(str); 10688 buf2 = PyUnicode_DATA(sub); 10689 if (kind1 != kind) 10690 buf1 = _PyUnicode_AsKind(str, kind); 10691 if (!buf1) { 10692 Py_DECREF(sub); 10693 return -1; 10694 } 10695 if (kind2 != kind) 10696 buf2 = _PyUnicode_AsKind(sub, kind); 10697 if (!buf2) { 10698 Py_DECREF(sub); 10699 if (kind1 != kind) PyMem_Free(buf1); 10700 return -1; 10701 } 10702 len1 = PyUnicode_GET_LENGTH(str); 10703 len2 = PyUnicode_GET_LENGTH(sub); 10704 10705 switch(kind) { 10706 case PyUnicode_1BYTE_KIND: 10707 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1; 10708 break; 10709 case PyUnicode_2BYTE_KIND: 10710 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1; 10711 break; 10712 case PyUnicode_4BYTE_KIND: 10713 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1; 10714 break; 10715 default: 10716 result = -1; 10717 assert(0); 10718 } 10719 10720 Py_DECREF(str); 10721 Py_DECREF(sub); 10722 10723 if (kind1 != kind) 10724 PyMem_Free(buf1); 10725 if (kind2 != kind) 10726 PyMem_Free(buf2); 10727 10728 return result; 10729} 10730 10731/* Concat to string or Unicode object giving a new Unicode object. */ 10732 10733PyObject * 10734PyUnicode_Concat(PyObject *left, PyObject *right) 10735{ 10736 PyObject *u = NULL, *v = NULL, *w; 10737 Py_UCS4 maxchar, maxchar2; 10738 10739 /* Coerce the two arguments */ 10740 u = PyUnicode_FromObject(left); 10741 if (u == NULL) 10742 goto onError; 10743 v = PyUnicode_FromObject(right); 10744 if (v == NULL) 10745 goto onError; 10746 10747 /* Shortcuts */ 10748 if (v == unicode_empty) { 10749 Py_DECREF(v); 10750 return u; 10751 } 10752 if (u == unicode_empty) { 10753 Py_DECREF(u); 10754 return v; 10755 } 10756 10757 maxchar = PyUnicode_MAX_CHAR_VALUE(u); 10758 maxchar2 = PyUnicode_MAX_CHAR_VALUE(v); 10759 maxchar = Py_MAX(maxchar, maxchar2); 10760 10761 /* Concat the two Unicode strings */ 10762 w = PyUnicode_New( 10763 PyUnicode_GET_LENGTH(u) + PyUnicode_GET_LENGTH(v), 10764 maxchar); 10765 if (w == NULL) 10766 goto onError; 10767 copy_characters(w, 0, u, 0, PyUnicode_GET_LENGTH(u)); 10768 copy_characters(w, PyUnicode_GET_LENGTH(u), v, 0, PyUnicode_GET_LENGTH(v)); 10769 Py_DECREF(u); 10770 Py_DECREF(v); 10771 assert(_PyUnicode_CheckConsistency(w, 1)); 10772 return w; 10773 10774 onError: 10775 Py_XDECREF(u); 10776 Py_XDECREF(v); 10777 return NULL; 10778} 10779 10780static void 10781unicode_append_inplace(PyObject **p_left, PyObject *right) 10782{ 10783 Py_ssize_t left_len, right_len, new_len; 10784 10785 assert(PyUnicode_IS_READY(*p_left)); 10786 assert(PyUnicode_IS_READY(right)); 10787 10788 left_len = PyUnicode_GET_LENGTH(*p_left); 10789 right_len = PyUnicode_GET_LENGTH(right); 10790 if (left_len > PY_SSIZE_T_MAX - right_len) { 10791 PyErr_SetString(PyExc_OverflowError, 10792 "strings are too large to concat"); 10793 goto error; 10794 } 10795 new_len = left_len + right_len; 10796 10797 /* Now we own the last reference to 'left', so we can resize it 10798 * in-place. 10799 */ 10800 if (unicode_resize(p_left, new_len) != 0) { 10801 /* XXX if _PyUnicode_Resize() fails, 'left' has been 10802 * deallocated so it cannot be put back into 10803 * 'variable'. The MemoryError is raised when there 10804 * is no value in 'variable', which might (very 10805 * remotely) be a cause of incompatibilities. 10806 */ 10807 goto error; 10808 } 10809 /* copy 'right' into the newly allocated area of 'left' */ 10810 copy_characters(*p_left, left_len, right, 0, right_len); 10811 _PyUnicode_DIRTY(*p_left); 10812 return; 10813 10814error: 10815 Py_DECREF(*p_left); 10816 *p_left = NULL; 10817} 10818 10819void 10820PyUnicode_Append(PyObject **p_left, PyObject *right) 10821{ 10822 PyObject *left, *res; 10823 10824 if (p_left == NULL) { 10825 if (!PyErr_Occurred()) 10826 PyErr_BadInternalCall(); 10827 return; 10828 } 10829 left = *p_left; 10830 if (right == NULL || !PyUnicode_Check(left)) { 10831 if (!PyErr_Occurred()) 10832 PyErr_BadInternalCall(); 10833 goto error; 10834 } 10835 10836 if (PyUnicode_READY(left)) 10837 goto error; 10838 if (PyUnicode_READY(right)) 10839 goto error; 10840 10841 if (PyUnicode_CheckExact(left) && left != unicode_empty 10842 && PyUnicode_CheckExact(right) && right != unicode_empty 10843 && unicode_resizable(left) 10844 && (_PyUnicode_KIND(right) <= _PyUnicode_KIND(left) 10845 || _PyUnicode_WSTR(left) != NULL)) 10846 { 10847 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires 10848 to change the structure size, but characters are stored just after 10849 the structure, and so it requires to move all characters which is 10850 not so different than duplicating the string. */ 10851 if (!(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right))) 10852 { 10853 unicode_append_inplace(p_left, right); 10854 if (p_left != NULL) 10855 assert(_PyUnicode_CheckConsistency(*p_left, 1)); 10856 return; 10857 } 10858 } 10859 10860 res = PyUnicode_Concat(left, right); 10861 if (res == NULL) 10862 goto error; 10863 Py_DECREF(left); 10864 *p_left = res; 10865 return; 10866 10867error: 10868 Py_DECREF(*p_left); 10869 *p_left = NULL; 10870} 10871 10872void 10873PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right) 10874{ 10875 PyUnicode_Append(pleft, right); 10876 Py_XDECREF(right); 10877} 10878 10879PyDoc_STRVAR(count__doc__, 10880 "S.count(sub[, start[, end]]) -> int\n\ 10881\n\ 10882Return the number of non-overlapping occurrences of substring sub in\n\ 10883string S[start:end]. Optional arguments start and end are\n\ 10884interpreted as in slice notation."); 10885 10886static PyObject * 10887unicode_count(PyObject *self, PyObject *args) 10888{ 10889 PyObject *substring; 10890 Py_ssize_t start = 0; 10891 Py_ssize_t end = PY_SSIZE_T_MAX; 10892 PyObject *result; 10893 int kind1, kind2, kind; 10894 void *buf1, *buf2; 10895 Py_ssize_t len1, len2, iresult; 10896 10897 if (!stringlib_parse_args_finds_unicode("count", args, &substring, 10898 &start, &end)) 10899 return NULL; 10900 10901 kind1 = PyUnicode_KIND(self); 10902 kind2 = PyUnicode_KIND(substring); 10903 kind = kind1 > kind2 ? kind1 : kind2; 10904 buf1 = PyUnicode_DATA(self); 10905 buf2 = PyUnicode_DATA(substring); 10906 if (kind1 != kind) 10907 buf1 = _PyUnicode_AsKind(self, kind); 10908 if (!buf1) { 10909 Py_DECREF(substring); 10910 return NULL; 10911 } 10912 if (kind2 != kind) 10913 buf2 = _PyUnicode_AsKind(substring, kind); 10914 if (!buf2) { 10915 Py_DECREF(substring); 10916 if (kind1 != kind) PyMem_Free(buf1); 10917 return NULL; 10918 } 10919 len1 = PyUnicode_GET_LENGTH(self); 10920 len2 = PyUnicode_GET_LENGTH(substring); 10921 10922 ADJUST_INDICES(start, end, len1); 10923 switch(kind) { 10924 case PyUnicode_1BYTE_KIND: 10925 iresult = ucs1lib_count( 10926 ((Py_UCS1*)buf1) + start, end - start, 10927 buf2, len2, PY_SSIZE_T_MAX 10928 ); 10929 break; 10930 case PyUnicode_2BYTE_KIND: 10931 iresult = ucs2lib_count( 10932 ((Py_UCS2*)buf1) + start, end - start, 10933 buf2, len2, PY_SSIZE_T_MAX 10934 ); 10935 break; 10936 case PyUnicode_4BYTE_KIND: 10937 iresult = ucs4lib_count( 10938 ((Py_UCS4*)buf1) + start, end - start, 10939 buf2, len2, PY_SSIZE_T_MAX 10940 ); 10941 break; 10942 default: 10943 assert(0); iresult = 0; 10944 } 10945 10946 result = PyLong_FromSsize_t(iresult); 10947 10948 if (kind1 != kind) 10949 PyMem_Free(buf1); 10950 if (kind2 != kind) 10951 PyMem_Free(buf2); 10952 10953 Py_DECREF(substring); 10954 10955 return result; 10956} 10957 10958PyDoc_STRVAR(encode__doc__, 10959 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\ 10960\n\ 10961Encode S using the codec registered for encoding. Default encoding\n\ 10962is 'utf-8'. errors may be given to set a different error\n\ 10963handling scheme. Default is 'strict' meaning that encoding errors raise\n\ 10964a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\ 10965'xmlcharrefreplace' as well as any other name registered with\n\ 10966codecs.register_error that can handle UnicodeEncodeErrors."); 10967 10968static PyObject * 10969unicode_encode(PyObject *self, PyObject *args, PyObject *kwargs) 10970{ 10971 static char *kwlist[] = {"encoding", "errors", 0}; 10972 char *encoding = NULL; 10973 char *errors = NULL; 10974 10975 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode", 10976 kwlist, &encoding, &errors)) 10977 return NULL; 10978 return PyUnicode_AsEncodedString(self, encoding, errors); 10979} 10980 10981PyDoc_STRVAR(expandtabs__doc__, 10982 "S.expandtabs([tabsize]) -> str\n\ 10983\n\ 10984Return a copy of S where all tab characters are expanded using spaces.\n\ 10985If tabsize is not given, a tab size of 8 characters is assumed."); 10986 10987static PyObject* 10988unicode_expandtabs(PyObject *self, PyObject *args) 10989{ 10990 Py_ssize_t i, j, line_pos, src_len, incr; 10991 Py_UCS4 ch; 10992 PyObject *u; 10993 void *src_data, *dest_data; 10994 int tabsize = 8; 10995 int kind; 10996 int found; 10997 10998 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize)) 10999 return NULL; 11000 11001 if (PyUnicode_READY(self) == -1) 11002 return NULL; 11003 11004 /* First pass: determine size of output string */ 11005 src_len = PyUnicode_GET_LENGTH(self); 11006 i = j = line_pos = 0; 11007 kind = PyUnicode_KIND(self); 11008 src_data = PyUnicode_DATA(self); 11009 found = 0; 11010 for (; i < src_len; i++) { 11011 ch = PyUnicode_READ(kind, src_data, i); 11012 if (ch == '\t') { 11013 found = 1; 11014 if (tabsize > 0) { 11015 incr = tabsize - (line_pos % tabsize); /* cannot overflow */ 11016 if (j > PY_SSIZE_T_MAX - incr) 11017 goto overflow; 11018 line_pos += incr; 11019 j += incr; 11020 } 11021 } 11022 else { 11023 if (j > PY_SSIZE_T_MAX - 1) 11024 goto overflow; 11025 line_pos++; 11026 j++; 11027 if (ch == '\n' || ch == '\r') 11028 line_pos = 0; 11029 } 11030 } 11031 if (!found && PyUnicode_CheckExact(self)) { 11032 Py_INCREF(self); 11033 return self; 11034 } 11035 11036 /* Second pass: create output string and fill it */ 11037 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self)); 11038 if (!u) 11039 return NULL; 11040 dest_data = PyUnicode_DATA(u); 11041 11042 i = j = line_pos = 0; 11043 11044 for (; i < src_len; i++) { 11045 ch = PyUnicode_READ(kind, src_data, i); 11046 if (ch == '\t') { 11047 if (tabsize > 0) { 11048 incr = tabsize - (line_pos % tabsize); 11049 line_pos += incr; 11050 while (incr--) { 11051 PyUnicode_WRITE(kind, dest_data, j, ' '); 11052 j++; 11053 } 11054 } 11055 } 11056 else { 11057 line_pos++; 11058 PyUnicode_WRITE(kind, dest_data, j, ch); 11059 j++; 11060 if (ch == '\n' || ch == '\r') 11061 line_pos = 0; 11062 } 11063 } 11064 assert (j == PyUnicode_GET_LENGTH(u)); 11065#ifndef DONT_MAKE_RESULT_READY 11066 if (_PyUnicode_READY_REPLACE(&u)) { 11067 Py_DECREF(u); 11068 return NULL; 11069 } 11070#endif 11071 assert(_PyUnicode_CheckConsistency(u, 1)); 11072 return u; 11073 11074 overflow: 11075 PyErr_SetString(PyExc_OverflowError, "new string is too long"); 11076 return NULL; 11077} 11078 11079PyDoc_STRVAR(find__doc__, 11080 "S.find(sub[, start[, end]]) -> int\n\ 11081\n\ 11082Return the lowest index in S where substring sub is found,\n\ 11083such that sub is contained within S[start:end]. Optional\n\ 11084arguments start and end are interpreted as in slice notation.\n\ 11085\n\ 11086Return -1 on failure."); 11087 11088static PyObject * 11089unicode_find(PyObject *self, PyObject *args) 11090{ 11091 PyObject *substring; 11092 Py_ssize_t start; 11093 Py_ssize_t end; 11094 Py_ssize_t result; 11095 11096 if (!stringlib_parse_args_finds_unicode("find", args, &substring, 11097 &start, &end)) 11098 return NULL; 11099 11100 if (PyUnicode_READY(self) == -1) 11101 return NULL; 11102 if (PyUnicode_READY(substring) == -1) 11103 return NULL; 11104 11105 result = any_find_slice(1, self, substring, start, end); 11106 11107 Py_DECREF(substring); 11108 11109 if (result == -2) 11110 return NULL; 11111 11112 return PyLong_FromSsize_t(result); 11113} 11114 11115static PyObject * 11116unicode_getitem(PyObject *self, Py_ssize_t index) 11117{ 11118 Py_UCS4 ch = PyUnicode_ReadChar(self, index); 11119 if (ch == (Py_UCS4)-1) 11120 return NULL; 11121 return PyUnicode_FromOrdinal(ch); 11122} 11123 11124/* Believe it or not, this produces the same value for ASCII strings 11125 as bytes_hash(). */ 11126static Py_hash_t 11127unicode_hash(PyObject *self) 11128{ 11129 Py_ssize_t len; 11130 Py_uhash_t x; 11131 11132 if (_PyUnicode_HASH(self) != -1) 11133 return _PyUnicode_HASH(self); 11134 if (PyUnicode_READY(self) == -1) 11135 return -1; 11136 len = PyUnicode_GET_LENGTH(self); 11137 11138 /* The hash function as a macro, gets expanded three times below. */ 11139#define HASH(P) \ 11140 x = (Py_uhash_t)*P << 7; \ 11141 while (--len >= 0) \ 11142 x = (1000003*x) ^ (Py_uhash_t)*P++; 11143 11144 switch (PyUnicode_KIND(self)) { 11145 case PyUnicode_1BYTE_KIND: { 11146 const unsigned char *c = PyUnicode_1BYTE_DATA(self); 11147 HASH(c); 11148 break; 11149 } 11150 case PyUnicode_2BYTE_KIND: { 11151 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self); 11152 HASH(s); 11153 break; 11154 } 11155 default: { 11156 Py_UCS4 *l; 11157 assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND && 11158 "Impossible switch case in unicode_hash"); 11159 l = PyUnicode_4BYTE_DATA(self); 11160 HASH(l); 11161 break; 11162 } 11163 } 11164 x ^= (Py_uhash_t)PyUnicode_GET_LENGTH(self); 11165 11166 if (x == -1) 11167 x = -2; 11168 _PyUnicode_HASH(self) = x; 11169 return x; 11170} 11171#undef HASH 11172 11173PyDoc_STRVAR(index__doc__, 11174 "S.index(sub[, start[, end]]) -> int\n\ 11175\n\ 11176Like S.find() but raise ValueError when the substring is not found."); 11177 11178static PyObject * 11179unicode_index(PyObject *self, PyObject *args) 11180{ 11181 Py_ssize_t result; 11182 PyObject *substring; 11183 Py_ssize_t start; 11184 Py_ssize_t end; 11185 11186 if (!stringlib_parse_args_finds_unicode("index", args, &substring, 11187 &start, &end)) 11188 return NULL; 11189 11190 if (PyUnicode_READY(self) == -1) 11191 return NULL; 11192 if (PyUnicode_READY(substring) == -1) 11193 return NULL; 11194 11195 result = any_find_slice(1, self, substring, start, end); 11196 11197 Py_DECREF(substring); 11198 11199 if (result == -2) 11200 return NULL; 11201 11202 if (result < 0) { 11203 PyErr_SetString(PyExc_ValueError, "substring not found"); 11204 return NULL; 11205 } 11206 11207 return PyLong_FromSsize_t(result); 11208} 11209 11210PyDoc_STRVAR(islower__doc__, 11211 "S.islower() -> bool\n\ 11212\n\ 11213Return True if all cased characters in S are lowercase and there is\n\ 11214at least one cased character in S, False otherwise."); 11215 11216static PyObject* 11217unicode_islower(PyObject *self) 11218{ 11219 Py_ssize_t i, length; 11220 int kind; 11221 void *data; 11222 int cased; 11223 11224 if (PyUnicode_READY(self) == -1) 11225 return NULL; 11226 length = PyUnicode_GET_LENGTH(self); 11227 kind = PyUnicode_KIND(self); 11228 data = PyUnicode_DATA(self); 11229 11230 /* Shortcut for single character strings */ 11231 if (length == 1) 11232 return PyBool_FromLong( 11233 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0))); 11234 11235 /* Special case for empty strings */ 11236 if (length == 0) 11237 return PyBool_FromLong(0); 11238 11239 cased = 0; 11240 for (i = 0; i < length; i++) { 11241 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); 11242 11243 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) 11244 return PyBool_FromLong(0); 11245 else if (!cased && Py_UNICODE_ISLOWER(ch)) 11246 cased = 1; 11247 } 11248 return PyBool_FromLong(cased); 11249} 11250 11251PyDoc_STRVAR(isupper__doc__, 11252 "S.isupper() -> bool\n\ 11253\n\ 11254Return True if all cased characters in S are uppercase and there is\n\ 11255at least one cased character in S, False otherwise."); 11256 11257static PyObject* 11258unicode_isupper(PyObject *self) 11259{ 11260 Py_ssize_t i, length; 11261 int kind; 11262 void *data; 11263 int cased; 11264 11265 if (PyUnicode_READY(self) == -1) 11266 return NULL; 11267 length = PyUnicode_GET_LENGTH(self); 11268 kind = PyUnicode_KIND(self); 11269 data = PyUnicode_DATA(self); 11270 11271 /* Shortcut for single character strings */ 11272 if (length == 1) 11273 return PyBool_FromLong( 11274 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0); 11275 11276 /* Special case for empty strings */ 11277 if (length == 0) 11278 return PyBool_FromLong(0); 11279 11280 cased = 0; 11281 for (i = 0; i < length; i++) { 11282 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); 11283 11284 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch)) 11285 return PyBool_FromLong(0); 11286 else if (!cased && Py_UNICODE_ISUPPER(ch)) 11287 cased = 1; 11288 } 11289 return PyBool_FromLong(cased); 11290} 11291 11292PyDoc_STRVAR(istitle__doc__, 11293 "S.istitle() -> bool\n\ 11294\n\ 11295Return True if S is a titlecased string and there is at least one\n\ 11296character in S, i.e. upper- and titlecase characters may only\n\ 11297follow uncased characters and lowercase characters only cased ones.\n\ 11298Return False otherwise."); 11299 11300static PyObject* 11301unicode_istitle(PyObject *self) 11302{ 11303 Py_ssize_t i, length; 11304 int kind; 11305 void *data; 11306 int cased, previous_is_cased; 11307 11308 if (PyUnicode_READY(self) == -1) 11309 return NULL; 11310 length = PyUnicode_GET_LENGTH(self); 11311 kind = PyUnicode_KIND(self); 11312 data = PyUnicode_DATA(self); 11313 11314 /* Shortcut for single character strings */ 11315 if (length == 1) { 11316 Py_UCS4 ch = PyUnicode_READ(kind, data, 0); 11317 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) || 11318 (Py_UNICODE_ISUPPER(ch) != 0)); 11319 } 11320 11321 /* Special case for empty strings */ 11322 if (length == 0) 11323 return PyBool_FromLong(0); 11324 11325 cased = 0; 11326 previous_is_cased = 0; 11327 for (i = 0; i < length; i++) { 11328 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); 11329 11330 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) { 11331 if (previous_is_cased) 11332 return PyBool_FromLong(0); 11333 previous_is_cased = 1; 11334 cased = 1; 11335 } 11336 else if (Py_UNICODE_ISLOWER(ch)) { 11337 if (!previous_is_cased) 11338 return PyBool_FromLong(0); 11339 previous_is_cased = 1; 11340 cased = 1; 11341 } 11342 else 11343 previous_is_cased = 0; 11344 } 11345 return PyBool_FromLong(cased); 11346} 11347 11348PyDoc_STRVAR(isspace__doc__, 11349 "S.isspace() -> bool\n\ 11350\n\ 11351Return True if all characters in S are whitespace\n\ 11352and there is at least one character in S, False otherwise."); 11353 11354static PyObject* 11355unicode_isspace(PyObject *self) 11356{ 11357 Py_ssize_t i, length; 11358 int kind; 11359 void *data; 11360 11361 if (PyUnicode_READY(self) == -1) 11362 return NULL; 11363 length = PyUnicode_GET_LENGTH(self); 11364 kind = PyUnicode_KIND(self); 11365 data = PyUnicode_DATA(self); 11366 11367 /* Shortcut for single character strings */ 11368 if (length == 1) 11369 return PyBool_FromLong( 11370 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0))); 11371 11372 /* Special case for empty strings */ 11373 if (length == 0) 11374 return PyBool_FromLong(0); 11375 11376 for (i = 0; i < length; i++) { 11377 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); 11378 if (!Py_UNICODE_ISSPACE(ch)) 11379 return PyBool_FromLong(0); 11380 } 11381 return PyBool_FromLong(1); 11382} 11383 11384PyDoc_STRVAR(isalpha__doc__, 11385 "S.isalpha() -> bool\n\ 11386\n\ 11387Return True if all characters in S are alphabetic\n\ 11388and there is at least one character in S, False otherwise."); 11389 11390static PyObject* 11391unicode_isalpha(PyObject *self) 11392{ 11393 Py_ssize_t i, length; 11394 int kind; 11395 void *data; 11396 11397 if (PyUnicode_READY(self) == -1) 11398 return NULL; 11399 length = PyUnicode_GET_LENGTH(self); 11400 kind = PyUnicode_KIND(self); 11401 data = PyUnicode_DATA(self); 11402 11403 /* Shortcut for single character strings */ 11404 if (length == 1) 11405 return PyBool_FromLong( 11406 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0))); 11407 11408 /* Special case for empty strings */ 11409 if (length == 0) 11410 return PyBool_FromLong(0); 11411 11412 for (i = 0; i < length; i++) { 11413 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i))) 11414 return PyBool_FromLong(0); 11415 } 11416 return PyBool_FromLong(1); 11417} 11418 11419PyDoc_STRVAR(isalnum__doc__, 11420 "S.isalnum() -> bool\n\ 11421\n\ 11422Return True if all characters in S are alphanumeric\n\ 11423and there is at least one character in S, False otherwise."); 11424 11425static PyObject* 11426unicode_isalnum(PyObject *self) 11427{ 11428 int kind; 11429 void *data; 11430 Py_ssize_t len, i; 11431 11432 if (PyUnicode_READY(self) == -1) 11433 return NULL; 11434 11435 kind = PyUnicode_KIND(self); 11436 data = PyUnicode_DATA(self); 11437 len = PyUnicode_GET_LENGTH(self); 11438 11439 /* Shortcut for single character strings */ 11440 if (len == 1) { 11441 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0); 11442 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch)); 11443 } 11444 11445 /* Special case for empty strings */ 11446 if (len == 0) 11447 return PyBool_FromLong(0); 11448 11449 for (i = 0; i < len; i++) { 11450 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); 11451 if (!Py_UNICODE_ISALNUM(ch)) 11452 return PyBool_FromLong(0); 11453 } 11454 return PyBool_FromLong(1); 11455} 11456 11457PyDoc_STRVAR(isdecimal__doc__, 11458 "S.isdecimal() -> bool\n\ 11459\n\ 11460Return True if there are only decimal characters in S,\n\ 11461False otherwise."); 11462 11463static PyObject* 11464unicode_isdecimal(PyObject *self) 11465{ 11466 Py_ssize_t i, length; 11467 int kind; 11468 void *data; 11469 11470 if (PyUnicode_READY(self) == -1) 11471 return NULL; 11472 length = PyUnicode_GET_LENGTH(self); 11473 kind = PyUnicode_KIND(self); 11474 data = PyUnicode_DATA(self); 11475 11476 /* Shortcut for single character strings */ 11477 if (length == 1) 11478 return PyBool_FromLong( 11479 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0))); 11480 11481 /* Special case for empty strings */ 11482 if (length == 0) 11483 return PyBool_FromLong(0); 11484 11485 for (i = 0; i < length; i++) { 11486 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i))) 11487 return PyBool_FromLong(0); 11488 } 11489 return PyBool_FromLong(1); 11490} 11491 11492PyDoc_STRVAR(isdigit__doc__, 11493 "S.isdigit() -> bool\n\ 11494\n\ 11495Return True if all characters in S are digits\n\ 11496and there is at least one character in S, False otherwise."); 11497 11498static PyObject* 11499unicode_isdigit(PyObject *self) 11500{ 11501 Py_ssize_t i, length; 11502 int kind; 11503 void *data; 11504 11505 if (PyUnicode_READY(self) == -1) 11506 return NULL; 11507 length = PyUnicode_GET_LENGTH(self); 11508 kind = PyUnicode_KIND(self); 11509 data = PyUnicode_DATA(self); 11510 11511 /* Shortcut for single character strings */ 11512 if (length == 1) { 11513 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0); 11514 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch)); 11515 } 11516 11517 /* Special case for empty strings */ 11518 if (length == 0) 11519 return PyBool_FromLong(0); 11520 11521 for (i = 0; i < length; i++) { 11522 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i))) 11523 return PyBool_FromLong(0); 11524 } 11525 return PyBool_FromLong(1); 11526} 11527 11528PyDoc_STRVAR(isnumeric__doc__, 11529 "S.isnumeric() -> bool\n\ 11530\n\ 11531Return True if there are only numeric characters in S,\n\ 11532False otherwise."); 11533 11534static PyObject* 11535unicode_isnumeric(PyObject *self) 11536{ 11537 Py_ssize_t i, length; 11538 int kind; 11539 void *data; 11540 11541 if (PyUnicode_READY(self) == -1) 11542 return NULL; 11543 length = PyUnicode_GET_LENGTH(self); 11544 kind = PyUnicode_KIND(self); 11545 data = PyUnicode_DATA(self); 11546 11547 /* Shortcut for single character strings */ 11548 if (length == 1) 11549 return PyBool_FromLong( 11550 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0))); 11551 11552 /* Special case for empty strings */ 11553 if (length == 0) 11554 return PyBool_FromLong(0); 11555 11556 for (i = 0; i < length; i++) { 11557 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i))) 11558 return PyBool_FromLong(0); 11559 } 11560 return PyBool_FromLong(1); 11561} 11562 11563int 11564PyUnicode_IsIdentifier(PyObject *self) 11565{ 11566 int kind; 11567 void *data; 11568 Py_ssize_t i; 11569 Py_UCS4 first; 11570 11571 if (PyUnicode_READY(self) == -1) { 11572 Py_FatalError("identifier not ready"); 11573 return 0; 11574 } 11575 11576 /* Special case for empty strings */ 11577 if (PyUnicode_GET_LENGTH(self) == 0) 11578 return 0; 11579 kind = PyUnicode_KIND(self); 11580 data = PyUnicode_DATA(self); 11581 11582 /* PEP 3131 says that the first character must be in 11583 XID_Start and subsequent characters in XID_Continue, 11584 and for the ASCII range, the 2.x rules apply (i.e 11585 start with letters and underscore, continue with 11586 letters, digits, underscore). However, given the current 11587 definition of XID_Start and XID_Continue, it is sufficient 11588 to check just for these, except that _ must be allowed 11589 as starting an identifier. */ 11590 first = PyUnicode_READ(kind, data, 0); 11591 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */) 11592 return 0; 11593 11594 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++) 11595 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i))) 11596 return 0; 11597 return 1; 11598} 11599 11600PyDoc_STRVAR(isidentifier__doc__, 11601 "S.isidentifier() -> bool\n\ 11602\n\ 11603Return True if S is a valid identifier according\n\ 11604to the language definition."); 11605 11606static PyObject* 11607unicode_isidentifier(PyObject *self) 11608{ 11609 return PyBool_FromLong(PyUnicode_IsIdentifier(self)); 11610} 11611 11612PyDoc_STRVAR(isprintable__doc__, 11613 "S.isprintable() -> bool\n\ 11614\n\ 11615Return True if all characters in S are considered\n\ 11616printable in repr() or S is empty, False otherwise."); 11617 11618static PyObject* 11619unicode_isprintable(PyObject *self) 11620{ 11621 Py_ssize_t i, length; 11622 int kind; 11623 void *data; 11624 11625 if (PyUnicode_READY(self) == -1) 11626 return NULL; 11627 length = PyUnicode_GET_LENGTH(self); 11628 kind = PyUnicode_KIND(self); 11629 data = PyUnicode_DATA(self); 11630 11631 /* Shortcut for single character strings */ 11632 if (length == 1) 11633 return PyBool_FromLong( 11634 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0))); 11635 11636 for (i = 0; i < length; i++) { 11637 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) { 11638 Py_RETURN_FALSE; 11639 } 11640 } 11641 Py_RETURN_TRUE; 11642} 11643 11644PyDoc_STRVAR(join__doc__, 11645 "S.join(iterable) -> str\n\ 11646\n\ 11647Return a string which is the concatenation of the strings in the\n\ 11648iterable. The separator between elements is S."); 11649 11650static PyObject* 11651unicode_join(PyObject *self, PyObject *data) 11652{ 11653 return PyUnicode_Join(self, data); 11654} 11655 11656static Py_ssize_t 11657unicode_length(PyObject *self) 11658{ 11659 if (PyUnicode_READY(self) == -1) 11660 return -1; 11661 return PyUnicode_GET_LENGTH(self); 11662} 11663 11664PyDoc_STRVAR(ljust__doc__, 11665 "S.ljust(width[, fillchar]) -> str\n\ 11666\n\ 11667Return S left-justified in a Unicode string of length width. Padding is\n\ 11668done using the specified fill character (default is a space)."); 11669 11670static PyObject * 11671unicode_ljust(PyObject *self, PyObject *args) 11672{ 11673 Py_ssize_t width; 11674 Py_UCS4 fillchar = ' '; 11675 11676 if (PyUnicode_READY(self) == -1) 11677 return NULL; 11678 11679 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar)) 11680 return NULL; 11681 11682 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) { 11683 Py_INCREF(self); 11684 return self; 11685 } 11686 11687 return pad(self, 0, width - _PyUnicode_LENGTH(self), fillchar); 11688} 11689 11690PyDoc_STRVAR(lower__doc__, 11691 "S.lower() -> str\n\ 11692\n\ 11693Return a copy of the string S converted to lowercase."); 11694 11695static PyObject* 11696unicode_lower(PyObject *self) 11697{ 11698 return fixup(self, fixlower); 11699} 11700 11701#define LEFTSTRIP 0 11702#define RIGHTSTRIP 1 11703#define BOTHSTRIP 2 11704 11705/* Arrays indexed by above */ 11706static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"}; 11707 11708#define STRIPNAME(i) (stripformat[i]+3) 11709 11710/* externally visible for str.strip(unicode) */ 11711PyObject * 11712_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj) 11713{ 11714 void *data; 11715 int kind; 11716 Py_ssize_t i, j, len; 11717 BLOOM_MASK sepmask; 11718 11719 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1) 11720 return NULL; 11721 11722 kind = PyUnicode_KIND(self); 11723 data = PyUnicode_DATA(self); 11724 len = PyUnicode_GET_LENGTH(self); 11725 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj), 11726 PyUnicode_DATA(sepobj), 11727 PyUnicode_GET_LENGTH(sepobj)); 11728 11729 i = 0; 11730 if (striptype != RIGHTSTRIP) { 11731 while (i < len && 11732 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) { 11733 i++; 11734 } 11735 } 11736 11737 j = len; 11738 if (striptype != LEFTSTRIP) { 11739 do { 11740 j--; 11741 } while (j >= i && 11742 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj)); 11743 j++; 11744 } 11745 11746 return PyUnicode_Substring(self, i, j); 11747} 11748 11749PyObject* 11750PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end) 11751{ 11752 unsigned char *data; 11753 int kind; 11754 Py_ssize_t length; 11755 11756 if (PyUnicode_READY(self) == -1) 11757 return NULL; 11758 11759 end = Py_MIN(end, PyUnicode_GET_LENGTH(self)); 11760 11761 if (start == 0 && end == PyUnicode_GET_LENGTH(self)) 11762 { 11763 if (PyUnicode_CheckExact(self)) { 11764 Py_INCREF(self); 11765 return self; 11766 } 11767 else 11768 return PyUnicode_Copy(self); 11769 } 11770 11771 length = end - start; 11772 if (length == 1) 11773 return unicode_getitem(self, start); 11774 11775 if (start < 0 || end < 0) { 11776 PyErr_SetString(PyExc_IndexError, "string index out of range"); 11777 return NULL; 11778 } 11779 11780 if (PyUnicode_IS_ASCII(self)) { 11781 kind = PyUnicode_KIND(self); 11782 data = PyUnicode_1BYTE_DATA(self); 11783 return unicode_fromascii(data + start, length); 11784 } 11785 else { 11786 kind = PyUnicode_KIND(self); 11787 data = PyUnicode_1BYTE_DATA(self); 11788 return PyUnicode_FromKindAndData(kind, 11789 data + kind * start, 11790 length); 11791 } 11792} 11793 11794static PyObject * 11795do_strip(PyObject *self, int striptype) 11796{ 11797 int kind; 11798 void *data; 11799 Py_ssize_t len, i, j; 11800 11801 if (PyUnicode_READY(self) == -1) 11802 return NULL; 11803 11804 kind = PyUnicode_KIND(self); 11805 data = PyUnicode_DATA(self); 11806 len = PyUnicode_GET_LENGTH(self); 11807 11808 i = 0; 11809 if (striptype != RIGHTSTRIP) { 11810 while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) { 11811 i++; 11812 } 11813 } 11814 11815 j = len; 11816 if (striptype != LEFTSTRIP) { 11817 do { 11818 j--; 11819 } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j))); 11820 j++; 11821 } 11822 11823 return PyUnicode_Substring(self, i, j); 11824} 11825 11826 11827static PyObject * 11828do_argstrip(PyObject *self, int striptype, PyObject *args) 11829{ 11830 PyObject *sep = NULL; 11831 11832 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep)) 11833 return NULL; 11834 11835 if (sep != NULL && sep != Py_None) { 11836 if (PyUnicode_Check(sep)) 11837 return _PyUnicode_XStrip(self, striptype, sep); 11838 else { 11839 PyErr_Format(PyExc_TypeError, 11840 "%s arg must be None or str", 11841 STRIPNAME(striptype)); 11842 return NULL; 11843 } 11844 } 11845 11846 return do_strip(self, striptype); 11847} 11848 11849 11850PyDoc_STRVAR(strip__doc__, 11851 "S.strip([chars]) -> str\n\ 11852\n\ 11853Return a copy of the string S with leading and trailing\n\ 11854whitespace removed.\n\ 11855If chars is given and not None, remove characters in chars instead."); 11856 11857static PyObject * 11858unicode_strip(PyObject *self, PyObject *args) 11859{ 11860 if (PyTuple_GET_SIZE(args) == 0) 11861 return do_strip(self, BOTHSTRIP); /* Common case */ 11862 else 11863 return do_argstrip(self, BOTHSTRIP, args); 11864} 11865 11866 11867PyDoc_STRVAR(lstrip__doc__, 11868 "S.lstrip([chars]) -> str\n\ 11869\n\ 11870Return a copy of the string S with leading whitespace removed.\n\ 11871If chars is given and not None, remove characters in chars instead."); 11872 11873static PyObject * 11874unicode_lstrip(PyObject *self, PyObject *args) 11875{ 11876 if (PyTuple_GET_SIZE(args) == 0) 11877 return do_strip(self, LEFTSTRIP); /* Common case */ 11878 else 11879 return do_argstrip(self, LEFTSTRIP, args); 11880} 11881 11882 11883PyDoc_STRVAR(rstrip__doc__, 11884 "S.rstrip([chars]) -> str\n\ 11885\n\ 11886Return a copy of the string S with trailing whitespace removed.\n\ 11887If chars is given and not None, remove characters in chars instead."); 11888 11889static PyObject * 11890unicode_rstrip(PyObject *self, PyObject *args) 11891{ 11892 if (PyTuple_GET_SIZE(args) == 0) 11893 return do_strip(self, RIGHTSTRIP); /* Common case */ 11894 else 11895 return do_argstrip(self, RIGHTSTRIP, args); 11896} 11897 11898 11899static PyObject* 11900unicode_repeat(PyObject *str, Py_ssize_t len) 11901{ 11902 PyObject *u; 11903 Py_ssize_t nchars, n; 11904 11905 if (len < 1) { 11906 Py_INCREF(unicode_empty); 11907 return unicode_empty; 11908 } 11909 11910 if (len == 1 && PyUnicode_CheckExact(str)) { 11911 /* no repeat, return original string */ 11912 Py_INCREF(str); 11913 return str; 11914 } 11915 11916 if (PyUnicode_READY(str) == -1) 11917 return NULL; 11918 11919 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) { 11920 PyErr_SetString(PyExc_OverflowError, 11921 "repeated string is too long"); 11922 return NULL; 11923 } 11924 nchars = len * PyUnicode_GET_LENGTH(str); 11925 11926 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str)); 11927 if (!u) 11928 return NULL; 11929 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str)); 11930 11931 if (PyUnicode_GET_LENGTH(str) == 1) { 11932 const int kind = PyUnicode_KIND(str); 11933 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0); 11934 void *to = PyUnicode_DATA(u); 11935 if (kind == PyUnicode_1BYTE_KIND) 11936 memset(to, (unsigned char)fill_char, len); 11937 else { 11938 for (n = 0; n < len; ++n) 11939 PyUnicode_WRITE(kind, to, n, fill_char); 11940 } 11941 } 11942 else { 11943 /* number of characters copied this far */ 11944 Py_ssize_t done = PyUnicode_GET_LENGTH(str); 11945 const Py_ssize_t char_size = PyUnicode_KIND(str); 11946 char *to = (char *) PyUnicode_DATA(u); 11947 Py_MEMCPY(to, PyUnicode_DATA(str), 11948 PyUnicode_GET_LENGTH(str) * char_size); 11949 while (done < nchars) { 11950 n = (done <= nchars-done) ? done : nchars-done; 11951 Py_MEMCPY(to + (done * char_size), to, n * char_size); 11952 done += n; 11953 } 11954 } 11955 11956 assert(_PyUnicode_CheckConsistency(u, 1)); 11957 return u; 11958} 11959 11960PyObject * 11961PyUnicode_Replace(PyObject *obj, 11962 PyObject *subobj, 11963 PyObject *replobj, 11964 Py_ssize_t maxcount) 11965{ 11966 PyObject *self; 11967 PyObject *str1; 11968 PyObject *str2; 11969 PyObject *result; 11970 11971 self = PyUnicode_FromObject(obj); 11972 if (self == NULL || PyUnicode_READY(self) == -1) 11973 return NULL; 11974 str1 = PyUnicode_FromObject(subobj); 11975 if (str1 == NULL || PyUnicode_READY(str1) == -1) { 11976 Py_DECREF(self); 11977 return NULL; 11978 } 11979 str2 = PyUnicode_FromObject(replobj); 11980 if (str2 == NULL || PyUnicode_READY(str2)) { 11981 Py_DECREF(self); 11982 Py_DECREF(str1); 11983 return NULL; 11984 } 11985 result = replace(self, str1, str2, maxcount); 11986 Py_DECREF(self); 11987 Py_DECREF(str1); 11988 Py_DECREF(str2); 11989 return result; 11990} 11991 11992PyDoc_STRVAR(replace__doc__, 11993 "S.replace(old, new[, count]) -> str\n\ 11994\n\ 11995Return a copy of S with all occurrences of substring\n\ 11996old replaced by new. If the optional argument count is\n\ 11997given, only the first count occurrences are replaced."); 11998 11999static PyObject* 12000unicode_replace(PyObject *self, PyObject *args) 12001{ 12002 PyObject *str1; 12003 PyObject *str2; 12004 Py_ssize_t maxcount = -1; 12005 PyObject *result; 12006 12007 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount)) 12008 return NULL; 12009 if (!PyUnicode_READY(self) == -1) 12010 return NULL; 12011 str1 = PyUnicode_FromObject(str1); 12012 if (str1 == NULL || PyUnicode_READY(str1) == -1) 12013 return NULL; 12014 str2 = PyUnicode_FromObject(str2); 12015 if (str2 == NULL || PyUnicode_READY(str2) == -1) { 12016 Py_DECREF(str1); 12017 return NULL; 12018 } 12019 12020 result = replace(self, str1, str2, maxcount); 12021 12022 Py_DECREF(str1); 12023 Py_DECREF(str2); 12024 return result; 12025} 12026 12027static PyObject * 12028unicode_repr(PyObject *unicode) 12029{ 12030 PyObject *repr; 12031 Py_ssize_t isize; 12032 Py_ssize_t osize, squote, dquote, i, o; 12033 Py_UCS4 max, quote; 12034 int ikind, okind; 12035 void *idata, *odata; 12036 12037 if (PyUnicode_READY(unicode) == -1) 12038 return NULL; 12039 12040 isize = PyUnicode_GET_LENGTH(unicode); 12041 idata = PyUnicode_DATA(unicode); 12042 12043 /* Compute length of output, quote characters, and 12044 maximum character */ 12045 osize = 2; /* quotes */ 12046 max = 127; 12047 squote = dquote = 0; 12048 ikind = PyUnicode_KIND(unicode); 12049 for (i = 0; i < isize; i++) { 12050 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i); 12051 switch (ch) { 12052 case '\'': squote++; osize++; break; 12053 case '"': dquote++; osize++; break; 12054 case '\\': case '\t': case '\r': case '\n': 12055 osize += 2; break; 12056 default: 12057 /* Fast-path ASCII */ 12058 if (ch < ' ' || ch == 0x7f) 12059 osize += 4; /* \xHH */ 12060 else if (ch < 0x7f) 12061 osize++; 12062 else if (Py_UNICODE_ISPRINTABLE(ch)) { 12063 osize++; 12064 max = ch > max ? ch : max; 12065 } 12066 else if (ch < 0x100) 12067 osize += 4; /* \xHH */ 12068 else if (ch < 0x10000) 12069 osize += 6; /* \uHHHH */ 12070 else 12071 osize += 10; /* \uHHHHHHHH */ 12072 } 12073 } 12074 12075 quote = '\''; 12076 if (squote) { 12077 if (dquote) 12078 /* Both squote and dquote present. Use squote, 12079 and escape them */ 12080 osize += squote; 12081 else 12082 quote = '"'; 12083 } 12084 12085 repr = PyUnicode_New(osize, max); 12086 if (repr == NULL) 12087 return NULL; 12088 okind = PyUnicode_KIND(repr); 12089 odata = PyUnicode_DATA(repr); 12090 12091 PyUnicode_WRITE(okind, odata, 0, quote); 12092 PyUnicode_WRITE(okind, odata, osize-1, quote); 12093 12094 for (i = 0, o = 1; i < isize; i++) { 12095 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i); 12096 12097 /* Escape quotes and backslashes */ 12098 if ((ch == quote) || (ch == '\\')) { 12099 PyUnicode_WRITE(okind, odata, o++, '\\'); 12100 PyUnicode_WRITE(okind, odata, o++, ch); 12101 continue; 12102 } 12103 12104 /* Map special whitespace to '\t', \n', '\r' */ 12105 if (ch == '\t') { 12106 PyUnicode_WRITE(okind, odata, o++, '\\'); 12107 PyUnicode_WRITE(okind, odata, o++, 't'); 12108 } 12109 else if (ch == '\n') { 12110 PyUnicode_WRITE(okind, odata, o++, '\\'); 12111 PyUnicode_WRITE(okind, odata, o++, 'n'); 12112 } 12113 else if (ch == '\r') { 12114 PyUnicode_WRITE(okind, odata, o++, '\\'); 12115 PyUnicode_WRITE(okind, odata, o++, 'r'); 12116 } 12117 12118 /* Map non-printable US ASCII to '\xhh' */ 12119 else if (ch < ' ' || ch == 0x7F) { 12120 PyUnicode_WRITE(okind, odata, o++, '\\'); 12121 PyUnicode_WRITE(okind, odata, o++, 'x'); 12122 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]); 12123 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]); 12124 } 12125 12126 /* Copy ASCII characters as-is */ 12127 else if (ch < 0x7F) { 12128 PyUnicode_WRITE(okind, odata, o++, ch); 12129 } 12130 12131 /* Non-ASCII characters */ 12132 else { 12133 /* Map Unicode whitespace and control characters 12134 (categories Z* and C* except ASCII space) 12135 */ 12136 if (!Py_UNICODE_ISPRINTABLE(ch)) { 12137 /* Map 8-bit characters to '\xhh' */ 12138 if (ch <= 0xff) { 12139 PyUnicode_WRITE(okind, odata, o++, '\\'); 12140 PyUnicode_WRITE(okind, odata, o++, 'x'); 12141 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]); 12142 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]); 12143 } 12144 /* Map 21-bit characters to '\U00xxxxxx' */ 12145 else if (ch >= 0x10000) { 12146 PyUnicode_WRITE(okind, odata, o++, '\\'); 12147 PyUnicode_WRITE(okind, odata, o++, 'U'); 12148 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]); 12149 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]); 12150 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]); 12151 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]); 12152 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]); 12153 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]); 12154 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]); 12155 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]); 12156 } 12157 /* Map 16-bit characters to '\uxxxx' */ 12158 else { 12159 PyUnicode_WRITE(okind, odata, o++, '\\'); 12160 PyUnicode_WRITE(okind, odata, o++, 'u'); 12161 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]); 12162 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]); 12163 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]); 12164 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]); 12165 } 12166 } 12167 /* Copy characters as-is */ 12168 else { 12169 PyUnicode_WRITE(okind, odata, o++, ch); 12170 } 12171 } 12172 } 12173 /* Closing quote already added at the beginning */ 12174 assert(_PyUnicode_CheckConsistency(repr, 1)); 12175 return repr; 12176} 12177 12178PyDoc_STRVAR(rfind__doc__, 12179 "S.rfind(sub[, start[, end]]) -> int\n\ 12180\n\ 12181Return the highest index in S where substring sub is found,\n\ 12182such that sub is contained within S[start:end]. Optional\n\ 12183arguments start and end are interpreted as in slice notation.\n\ 12184\n\ 12185Return -1 on failure."); 12186 12187static PyObject * 12188unicode_rfind(PyObject *self, PyObject *args) 12189{ 12190 PyObject *substring; 12191 Py_ssize_t start; 12192 Py_ssize_t end; 12193 Py_ssize_t result; 12194 12195 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring, 12196 &start, &end)) 12197 return NULL; 12198 12199 if (PyUnicode_READY(self) == -1) 12200 return NULL; 12201 if (PyUnicode_READY(substring) == -1) 12202 return NULL; 12203 12204 result = any_find_slice(-1, self, substring, start, end); 12205 12206 Py_DECREF(substring); 12207 12208 if (result == -2) 12209 return NULL; 12210 12211 return PyLong_FromSsize_t(result); 12212} 12213 12214PyDoc_STRVAR(rindex__doc__, 12215 "S.rindex(sub[, start[, end]]) -> int\n\ 12216\n\ 12217Like S.rfind() but raise ValueError when the substring is not found."); 12218 12219static PyObject * 12220unicode_rindex(PyObject *self, PyObject *args) 12221{ 12222 PyObject *substring; 12223 Py_ssize_t start; 12224 Py_ssize_t end; 12225 Py_ssize_t result; 12226 12227 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring, 12228 &start, &end)) 12229 return NULL; 12230 12231 if (PyUnicode_READY(self) == -1) 12232 return NULL; 12233 if (PyUnicode_READY(substring) == -1) 12234 return NULL; 12235 12236 result = any_find_slice(-1, self, substring, start, end); 12237 12238 Py_DECREF(substring); 12239 12240 if (result == -2) 12241 return NULL; 12242 12243 if (result < 0) { 12244 PyErr_SetString(PyExc_ValueError, "substring not found"); 12245 return NULL; 12246 } 12247 12248 return PyLong_FromSsize_t(result); 12249} 12250 12251PyDoc_STRVAR(rjust__doc__, 12252 "S.rjust(width[, fillchar]) -> str\n\ 12253\n\ 12254Return S right-justified in a string of length width. Padding is\n\ 12255done using the specified fill character (default is a space)."); 12256 12257static PyObject * 12258unicode_rjust(PyObject *self, PyObject *args) 12259{ 12260 Py_ssize_t width; 12261 Py_UCS4 fillchar = ' '; 12262 12263 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar)) 12264 return NULL; 12265 12266 if (PyUnicode_READY(self) == -1) 12267 return NULL; 12268 12269 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) { 12270 Py_INCREF(self); 12271 return self; 12272 } 12273 12274 return pad(self, width - _PyUnicode_LENGTH(self), 0, fillchar); 12275} 12276 12277PyObject * 12278PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit) 12279{ 12280 PyObject *result; 12281 12282 s = PyUnicode_FromObject(s); 12283 if (s == NULL) 12284 return NULL; 12285 if (sep != NULL) { 12286 sep = PyUnicode_FromObject(sep); 12287 if (sep == NULL) { 12288 Py_DECREF(s); 12289 return NULL; 12290 } 12291 } 12292 12293 result = split(s, sep, maxsplit); 12294 12295 Py_DECREF(s); 12296 Py_XDECREF(sep); 12297 return result; 12298} 12299 12300PyDoc_STRVAR(split__doc__, 12301 "S.split([sep[, maxsplit]]) -> list of strings\n\ 12302\n\ 12303Return a list of the words in S, using sep as the\n\ 12304delimiter string. If maxsplit is given, at most maxsplit\n\ 12305splits are done. If sep is not specified or is None, any\n\ 12306whitespace string is a separator and empty strings are\n\ 12307removed from the result."); 12308 12309static PyObject* 12310unicode_split(PyObject *self, PyObject *args) 12311{ 12312 PyObject *substring = Py_None; 12313 Py_ssize_t maxcount = -1; 12314 12315 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount)) 12316 return NULL; 12317 12318 if (substring == Py_None) 12319 return split(self, NULL, maxcount); 12320 else if (PyUnicode_Check(substring)) 12321 return split(self, substring, maxcount); 12322 else 12323 return PyUnicode_Split(self, substring, maxcount); 12324} 12325 12326PyObject * 12327PyUnicode_Partition(PyObject *str_in, PyObject *sep_in) 12328{ 12329 PyObject* str_obj; 12330 PyObject* sep_obj; 12331 PyObject* out; 12332 int kind1, kind2, kind; 12333 void *buf1 = NULL, *buf2 = NULL; 12334 Py_ssize_t len1, len2; 12335 12336 str_obj = PyUnicode_FromObject(str_in); 12337 if (!str_obj || PyUnicode_READY(str_obj) == -1) 12338 return NULL; 12339 sep_obj = PyUnicode_FromObject(sep_in); 12340 if (!sep_obj || PyUnicode_READY(sep_obj) == -1) { 12341 Py_DECREF(str_obj); 12342 return NULL; 12343 } 12344 12345 kind1 = PyUnicode_KIND(str_obj); 12346 kind2 = PyUnicode_KIND(sep_obj); 12347 kind = Py_MAX(kind1, kind2); 12348 buf1 = PyUnicode_DATA(str_obj); 12349 if (kind1 != kind) 12350 buf1 = _PyUnicode_AsKind(str_obj, kind); 12351 if (!buf1) 12352 goto onError; 12353 buf2 = PyUnicode_DATA(sep_obj); 12354 if (kind2 != kind) 12355 buf2 = _PyUnicode_AsKind(sep_obj, kind); 12356 if (!buf2) 12357 goto onError; 12358 len1 = PyUnicode_GET_LENGTH(str_obj); 12359 len2 = PyUnicode_GET_LENGTH(sep_obj); 12360 12361 switch(PyUnicode_KIND(str_obj)) { 12362 case PyUnicode_1BYTE_KIND: 12363 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj)) 12364 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2); 12365 else 12366 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2); 12367 break; 12368 case PyUnicode_2BYTE_KIND: 12369 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2); 12370 break; 12371 case PyUnicode_4BYTE_KIND: 12372 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2); 12373 break; 12374 default: 12375 assert(0); 12376 out = 0; 12377 } 12378 12379 Py_DECREF(sep_obj); 12380 Py_DECREF(str_obj); 12381 if (kind1 != kind) 12382 PyMem_Free(buf1); 12383 if (kind2 != kind) 12384 PyMem_Free(buf2); 12385 12386 return out; 12387 onError: 12388 Py_DECREF(sep_obj); 12389 Py_DECREF(str_obj); 12390 if (kind1 != kind && buf1) 12391 PyMem_Free(buf1); 12392 if (kind2 != kind && buf2) 12393 PyMem_Free(buf2); 12394 return NULL; 12395} 12396 12397 12398PyObject * 12399PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in) 12400{ 12401 PyObject* str_obj; 12402 PyObject* sep_obj; 12403 PyObject* out; 12404 int kind1, kind2, kind; 12405 void *buf1 = NULL, *buf2 = NULL; 12406 Py_ssize_t len1, len2; 12407 12408 str_obj = PyUnicode_FromObject(str_in); 12409 if (!str_obj) 12410 return NULL; 12411 sep_obj = PyUnicode_FromObject(sep_in); 12412 if (!sep_obj) { 12413 Py_DECREF(str_obj); 12414 return NULL; 12415 } 12416 12417 kind1 = PyUnicode_KIND(str_in); 12418 kind2 = PyUnicode_KIND(sep_obj); 12419 kind = Py_MAX(kind1, kind2); 12420 buf1 = PyUnicode_DATA(str_in); 12421 if (kind1 != kind) 12422 buf1 = _PyUnicode_AsKind(str_in, kind); 12423 if (!buf1) 12424 goto onError; 12425 buf2 = PyUnicode_DATA(sep_obj); 12426 if (kind2 != kind) 12427 buf2 = _PyUnicode_AsKind(sep_obj, kind); 12428 if (!buf2) 12429 goto onError; 12430 len1 = PyUnicode_GET_LENGTH(str_obj); 12431 len2 = PyUnicode_GET_LENGTH(sep_obj); 12432 12433 switch(PyUnicode_KIND(str_in)) { 12434 case PyUnicode_1BYTE_KIND: 12435 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj)) 12436 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2); 12437 else 12438 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2); 12439 break; 12440 case PyUnicode_2BYTE_KIND: 12441 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2); 12442 break; 12443 case PyUnicode_4BYTE_KIND: 12444 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2); 12445 break; 12446 default: 12447 assert(0); 12448 out = 0; 12449 } 12450 12451 Py_DECREF(sep_obj); 12452 Py_DECREF(str_obj); 12453 if (kind1 != kind) 12454 PyMem_Free(buf1); 12455 if (kind2 != kind) 12456 PyMem_Free(buf2); 12457 12458 return out; 12459 onError: 12460 Py_DECREF(sep_obj); 12461 Py_DECREF(str_obj); 12462 if (kind1 != kind && buf1) 12463 PyMem_Free(buf1); 12464 if (kind2 != kind && buf2) 12465 PyMem_Free(buf2); 12466 return NULL; 12467} 12468 12469PyDoc_STRVAR(partition__doc__, 12470 "S.partition(sep) -> (head, sep, tail)\n\ 12471\n\ 12472Search for the separator sep in S, and return the part before it,\n\ 12473the separator itself, and the part after it. If the separator is not\n\ 12474found, return S and two empty strings."); 12475 12476static PyObject* 12477unicode_partition(PyObject *self, PyObject *separator) 12478{ 12479 return PyUnicode_Partition(self, separator); 12480} 12481 12482PyDoc_STRVAR(rpartition__doc__, 12483 "S.rpartition(sep) -> (head, sep, tail)\n\ 12484\n\ 12485Search for the separator sep in S, starting at the end of S, and return\n\ 12486the part before it, the separator itself, and the part after it. If the\n\ 12487separator is not found, return two empty strings and S."); 12488 12489static PyObject* 12490unicode_rpartition(PyObject *self, PyObject *separator) 12491{ 12492 return PyUnicode_RPartition(self, separator); 12493} 12494 12495PyObject * 12496PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit) 12497{ 12498 PyObject *result; 12499 12500 s = PyUnicode_FromObject(s); 12501 if (s == NULL) 12502 return NULL; 12503 if (sep != NULL) { 12504 sep = PyUnicode_FromObject(sep); 12505 if (sep == NULL) { 12506 Py_DECREF(s); 12507 return NULL; 12508 } 12509 } 12510 12511 result = rsplit(s, sep, maxsplit); 12512 12513 Py_DECREF(s); 12514 Py_XDECREF(sep); 12515 return result; 12516} 12517 12518PyDoc_STRVAR(rsplit__doc__, 12519 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\ 12520\n\ 12521Return a list of the words in S, using sep as the\n\ 12522delimiter string, starting at the end of the string and\n\ 12523working to the front. If maxsplit is given, at most maxsplit\n\ 12524splits are done. If sep is not specified, any whitespace string\n\ 12525is a separator."); 12526 12527static PyObject* 12528unicode_rsplit(PyObject *self, PyObject *args) 12529{ 12530 PyObject *substring = Py_None; 12531 Py_ssize_t maxcount = -1; 12532 12533 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount)) 12534 return NULL; 12535 12536 if (substring == Py_None) 12537 return rsplit(self, NULL, maxcount); 12538 else if (PyUnicode_Check(substring)) 12539 return rsplit(self, substring, maxcount); 12540 else 12541 return PyUnicode_RSplit(self, substring, maxcount); 12542} 12543 12544PyDoc_STRVAR(splitlines__doc__, 12545 "S.splitlines([keepends]) -> list of strings\n\ 12546\n\ 12547Return a list of the lines in S, breaking at line boundaries.\n\ 12548Line breaks are not included in the resulting list unless keepends\n\ 12549is given and true."); 12550 12551static PyObject* 12552unicode_splitlines(PyObject *self, PyObject *args, PyObject *kwds) 12553{ 12554 static char *kwlist[] = {"keepends", 0}; 12555 int keepends = 0; 12556 12557 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines", 12558 kwlist, &keepends)) 12559 return NULL; 12560 12561 return PyUnicode_Splitlines(self, keepends); 12562} 12563 12564static 12565PyObject *unicode_str(PyObject *self) 12566{ 12567 if (PyUnicode_CheckExact(self)) { 12568 Py_INCREF(self); 12569 return self; 12570 } else 12571 /* Subtype -- return genuine unicode string with the same value. */ 12572 return PyUnicode_Copy(self); 12573} 12574 12575PyDoc_STRVAR(swapcase__doc__, 12576 "S.swapcase() -> str\n\ 12577\n\ 12578Return a copy of S with uppercase characters converted to lowercase\n\ 12579and vice versa."); 12580 12581static PyObject* 12582unicode_swapcase(PyObject *self) 12583{ 12584 return fixup(self, fixswapcase); 12585} 12586 12587PyDoc_STRVAR(maketrans__doc__, 12588 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\ 12589\n\ 12590Return a translation table usable for str.translate().\n\ 12591If there is only one argument, it must be a dictionary mapping Unicode\n\ 12592ordinals (integers) or characters to Unicode ordinals, strings or None.\n\ 12593Character keys will be then converted to ordinals.\n\ 12594If there are two arguments, they must be strings of equal length, and\n\ 12595in the resulting dictionary, each character in x will be mapped to the\n\ 12596character at the same position in y. If there is a third argument, it\n\ 12597must be a string, whose characters will be mapped to None in the result."); 12598 12599static PyObject* 12600unicode_maketrans(PyObject *null, PyObject *args) 12601{ 12602 PyObject *x, *y = NULL, *z = NULL; 12603 PyObject *new = NULL, *key, *value; 12604 Py_ssize_t i = 0; 12605 int res; 12606 12607 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z)) 12608 return NULL; 12609 new = PyDict_New(); 12610 if (!new) 12611 return NULL; 12612 if (y != NULL) { 12613 int x_kind, y_kind, z_kind; 12614 void *x_data, *y_data, *z_data; 12615 12616 /* x must be a string too, of equal length */ 12617 if (!PyUnicode_Check(x)) { 12618 PyErr_SetString(PyExc_TypeError, "first maketrans argument must " 12619 "be a string if there is a second argument"); 12620 goto err; 12621 } 12622 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) { 12623 PyErr_SetString(PyExc_ValueError, "the first two maketrans " 12624 "arguments must have equal length"); 12625 goto err; 12626 } 12627 /* create entries for translating chars in x to those in y */ 12628 x_kind = PyUnicode_KIND(x); 12629 y_kind = PyUnicode_KIND(y); 12630 x_data = PyUnicode_DATA(x); 12631 y_data = PyUnicode_DATA(y); 12632 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) { 12633 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i)); 12634 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i)); 12635 if (!key || !value) 12636 goto err; 12637 res = PyDict_SetItem(new, key, value); 12638 Py_DECREF(key); 12639 Py_DECREF(value); 12640 if (res < 0) 12641 goto err; 12642 } 12643 /* create entries for deleting chars in z */ 12644 if (z != NULL) { 12645 z_kind = PyUnicode_KIND(z); 12646 z_data = PyUnicode_DATA(z); 12647 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) { 12648 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i)); 12649 if (!key) 12650 goto err; 12651 res = PyDict_SetItem(new, key, Py_None); 12652 Py_DECREF(key); 12653 if (res < 0) 12654 goto err; 12655 } 12656 } 12657 } else { 12658 int kind; 12659 void *data; 12660 12661 /* x must be a dict */ 12662 if (!PyDict_CheckExact(x)) { 12663 PyErr_SetString(PyExc_TypeError, "if you give only one argument " 12664 "to maketrans it must be a dict"); 12665 goto err; 12666 } 12667 /* copy entries into the new dict, converting string keys to int keys */ 12668 while (PyDict_Next(x, &i, &key, &value)) { 12669 if (PyUnicode_Check(key)) { 12670 /* convert string keys to integer keys */ 12671 PyObject *newkey; 12672 if (PyUnicode_GET_LENGTH(key) != 1) { 12673 PyErr_SetString(PyExc_ValueError, "string keys in translate " 12674 "table must be of length 1"); 12675 goto err; 12676 } 12677 kind = PyUnicode_KIND(key); 12678 data = PyUnicode_DATA(key); 12679 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0)); 12680 if (!newkey) 12681 goto err; 12682 res = PyDict_SetItem(new, newkey, value); 12683 Py_DECREF(newkey); 12684 if (res < 0) 12685 goto err; 12686 } else if (PyLong_Check(key)) { 12687 /* just keep integer keys */ 12688 if (PyDict_SetItem(new, key, value) < 0) 12689 goto err; 12690 } else { 12691 PyErr_SetString(PyExc_TypeError, "keys in translate table must " 12692 "be strings or integers"); 12693 goto err; 12694 } 12695 } 12696 } 12697 return new; 12698 err: 12699 Py_DECREF(new); 12700 return NULL; 12701} 12702 12703PyDoc_STRVAR(translate__doc__, 12704 "S.translate(table) -> str\n\ 12705\n\ 12706Return a copy of the string S, where all characters have been mapped\n\ 12707through the given translation table, which must be a mapping of\n\ 12708Unicode ordinals to Unicode ordinals, strings, or None.\n\ 12709Unmapped characters are left untouched. Characters mapped to None\n\ 12710are deleted."); 12711 12712static PyObject* 12713unicode_translate(PyObject *self, PyObject *table) 12714{ 12715 return _PyUnicode_TranslateCharmap(self, table, "ignore"); 12716} 12717 12718PyDoc_STRVAR(upper__doc__, 12719 "S.upper() -> str\n\ 12720\n\ 12721Return a copy of S converted to uppercase."); 12722 12723static PyObject* 12724unicode_upper(PyObject *self) 12725{ 12726 return fixup(self, fixupper); 12727} 12728 12729PyDoc_STRVAR(zfill__doc__, 12730 "S.zfill(width) -> str\n\ 12731\n\ 12732Pad a numeric string S with zeros on the left, to fill a field\n\ 12733of the specified width. The string S is never truncated."); 12734 12735static PyObject * 12736unicode_zfill(PyObject *self, PyObject *args) 12737{ 12738 Py_ssize_t fill; 12739 PyObject *u; 12740 Py_ssize_t width; 12741 int kind; 12742 void *data; 12743 Py_UCS4 chr; 12744 12745 if (PyUnicode_READY(self) == -1) 12746 return NULL; 12747 12748 if (!PyArg_ParseTuple(args, "n:zfill", &width)) 12749 return NULL; 12750 12751 if (PyUnicode_GET_LENGTH(self) >= width) { 12752 if (PyUnicode_CheckExact(self)) { 12753 Py_INCREF(self); 12754 return self; 12755 } 12756 else 12757 return PyUnicode_Copy(self); 12758 } 12759 12760 fill = width - _PyUnicode_LENGTH(self); 12761 12762 u = pad(self, fill, 0, '0'); 12763 12764 if (u == NULL) 12765 return NULL; 12766 12767 kind = PyUnicode_KIND(u); 12768 data = PyUnicode_DATA(u); 12769 chr = PyUnicode_READ(kind, data, fill); 12770 12771 if (chr == '+' || chr == '-') { 12772 /* move sign to beginning of string */ 12773 PyUnicode_WRITE(kind, data, 0, chr); 12774 PyUnicode_WRITE(kind, data, fill, '0'); 12775 } 12776 12777 assert(_PyUnicode_CheckConsistency(u, 1)); 12778 return u; 12779} 12780 12781#if 0 12782static PyObject * 12783unicode__decimal2ascii(PyObject *self) 12784{ 12785 return PyUnicode_TransformDecimalAndSpaceToASCII(self); 12786} 12787#endif 12788 12789PyDoc_STRVAR(startswith__doc__, 12790 "S.startswith(prefix[, start[, end]]) -> bool\n\ 12791\n\ 12792Return True if S starts with the specified prefix, False otherwise.\n\ 12793With optional start, test S beginning at that position.\n\ 12794With optional end, stop comparing S at that position.\n\ 12795prefix can also be a tuple of strings to try."); 12796 12797static PyObject * 12798unicode_startswith(PyObject *self, 12799 PyObject *args) 12800{ 12801 PyObject *subobj; 12802 PyObject *substring; 12803 Py_ssize_t start = 0; 12804 Py_ssize_t end = PY_SSIZE_T_MAX; 12805 int result; 12806 12807 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end)) 12808 return NULL; 12809 if (PyTuple_Check(subobj)) { 12810 Py_ssize_t i; 12811 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) { 12812 substring = PyUnicode_FromObject(PyTuple_GET_ITEM(subobj, i)); 12813 if (substring == NULL) 12814 return NULL; 12815 result = tailmatch(self, substring, start, end, -1); 12816 Py_DECREF(substring); 12817 if (result) { 12818 Py_RETURN_TRUE; 12819 } 12820 } 12821 /* nothing matched */ 12822 Py_RETURN_FALSE; 12823 } 12824 substring = PyUnicode_FromObject(subobj); 12825 if (substring == NULL) { 12826 if (PyErr_ExceptionMatches(PyExc_TypeError)) 12827 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or " 12828 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name); 12829 return NULL; 12830 } 12831 result = tailmatch(self, substring, start, end, -1); 12832 Py_DECREF(substring); 12833 return PyBool_FromLong(result); 12834} 12835 12836 12837PyDoc_STRVAR(endswith__doc__, 12838 "S.endswith(suffix[, start[, end]]) -> bool\n\ 12839\n\ 12840Return True if S ends with the specified suffix, False otherwise.\n\ 12841With optional start, test S beginning at that position.\n\ 12842With optional end, stop comparing S at that position.\n\ 12843suffix can also be a tuple of strings to try."); 12844 12845static PyObject * 12846unicode_endswith(PyObject *self, 12847 PyObject *args) 12848{ 12849 PyObject *subobj; 12850 PyObject *substring; 12851 Py_ssize_t start = 0; 12852 Py_ssize_t end = PY_SSIZE_T_MAX; 12853 int result; 12854 12855 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end)) 12856 return NULL; 12857 if (PyTuple_Check(subobj)) { 12858 Py_ssize_t i; 12859 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) { 12860 substring = PyUnicode_FromObject( 12861 PyTuple_GET_ITEM(subobj, i)); 12862 if (substring == NULL) 12863 return NULL; 12864 result = tailmatch(self, substring, start, end, +1); 12865 Py_DECREF(substring); 12866 if (result) { 12867 Py_RETURN_TRUE; 12868 } 12869 } 12870 Py_RETURN_FALSE; 12871 } 12872 substring = PyUnicode_FromObject(subobj); 12873 if (substring == NULL) { 12874 if (PyErr_ExceptionMatches(PyExc_TypeError)) 12875 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or " 12876 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name); 12877 return NULL; 12878 } 12879 result = tailmatch(self, substring, start, end, +1); 12880 Py_DECREF(substring); 12881 return PyBool_FromLong(result); 12882} 12883 12884#include "stringlib/unicode_format.h" 12885 12886PyDoc_STRVAR(format__doc__, 12887 "S.format(*args, **kwargs) -> str\n\ 12888\n\ 12889Return a formatted version of S, using substitutions from args and kwargs.\n\ 12890The substitutions are identified by braces ('{' and '}')."); 12891 12892PyDoc_STRVAR(format_map__doc__, 12893 "S.format_map(mapping) -> str\n\ 12894\n\ 12895Return a formatted version of S, using substitutions from mapping.\n\ 12896The substitutions are identified by braces ('{' and '}')."); 12897 12898static PyObject * 12899unicode__format__(PyObject* self, PyObject* args) 12900{ 12901 PyObject *format_spec, *out; 12902 12903 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec)) 12904 return NULL; 12905 12906 out = _PyUnicode_FormatAdvanced(self, format_spec, 0, 12907 PyUnicode_GET_LENGTH(format_spec)); 12908 return out; 12909} 12910 12911PyDoc_STRVAR(p_format__doc__, 12912 "S.__format__(format_spec) -> str\n\ 12913\n\ 12914Return a formatted version of S as described by format_spec."); 12915 12916static PyObject * 12917unicode__sizeof__(PyObject *v) 12918{ 12919 Py_ssize_t size; 12920 12921 /* If it's a compact object, account for base structure + 12922 character data. */ 12923 if (PyUnicode_IS_COMPACT_ASCII(v)) 12924 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1; 12925 else if (PyUnicode_IS_COMPACT(v)) 12926 size = sizeof(PyCompactUnicodeObject) + 12927 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_KIND(v); 12928 else { 12929 /* If it is a two-block object, account for base object, and 12930 for character block if present. */ 12931 size = sizeof(PyUnicodeObject); 12932 if (_PyUnicode_DATA_ANY(v)) 12933 size += (PyUnicode_GET_LENGTH(v) + 1) * 12934 PyUnicode_KIND(v); 12935 } 12936 /* If the wstr pointer is present, account for it unless it is shared 12937 with the data pointer. Check if the data is not shared. */ 12938 if (_PyUnicode_HAS_WSTR_MEMORY(v)) 12939 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t); 12940 if (_PyUnicode_HAS_UTF8_MEMORY(v)) 12941 size += PyUnicode_UTF8_LENGTH(v) + 1; 12942 12943 return PyLong_FromSsize_t(size); 12944} 12945 12946PyDoc_STRVAR(sizeof__doc__, 12947 "S.__sizeof__() -> size of S in memory, in bytes"); 12948 12949static PyObject * 12950unicode_getnewargs(PyObject *v) 12951{ 12952 PyObject *copy = PyUnicode_Copy(v); 12953 if (!copy) 12954 return NULL; 12955 return Py_BuildValue("(N)", copy); 12956} 12957 12958static PyMethodDef unicode_methods[] = { 12959 12960 /* Order is according to common usage: often used methods should 12961 appear first, since lookup is done sequentially. */ 12962 12963 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__}, 12964 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__}, 12965 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__}, 12966 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__}, 12967 {"join", (PyCFunction) unicode_join, METH_O, join__doc__}, 12968 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__}, 12969 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__}, 12970 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__}, 12971 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__}, 12972 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__}, 12973 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__}, 12974 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__}, 12975 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__}, 12976 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__}, 12977 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__}, 12978 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__}, 12979 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__}, 12980 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__}, 12981 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__}, 12982 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__}, 12983 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__}, 12984 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__}, 12985 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__}, 12986 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__}, 12987 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__}, 12988 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__}, 12989 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__}, 12990 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__}, 12991 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__}, 12992 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__}, 12993 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__}, 12994 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__}, 12995 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__}, 12996 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__}, 12997 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__}, 12998 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__}, 12999 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__}, 13000 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__}, 13001 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__}, 13002 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__}, 13003 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__}, 13004 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__}, 13005 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__}, 13006 {"maketrans", (PyCFunction) unicode_maketrans, 13007 METH_VARARGS | METH_STATIC, maketrans__doc__}, 13008 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__}, 13009#if 0 13010 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__}, 13011#endif 13012 13013#if 0 13014 /* These methods are just used for debugging the implementation. */ 13015 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS}, 13016#endif 13017 13018 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS}, 13019 {NULL, NULL} 13020}; 13021 13022static PyObject * 13023unicode_mod(PyObject *v, PyObject *w) 13024{ 13025 if (!PyUnicode_Check(v)) 13026 Py_RETURN_NOTIMPLEMENTED; 13027 return PyUnicode_Format(v, w); 13028} 13029 13030static PyNumberMethods unicode_as_number = { 13031 0, /*nb_add*/ 13032 0, /*nb_subtract*/ 13033 0, /*nb_multiply*/ 13034 unicode_mod, /*nb_remainder*/ 13035}; 13036 13037static PySequenceMethods unicode_as_sequence = { 13038 (lenfunc) unicode_length, /* sq_length */ 13039 PyUnicode_Concat, /* sq_concat */ 13040 (ssizeargfunc) unicode_repeat, /* sq_repeat */ 13041 (ssizeargfunc) unicode_getitem, /* sq_item */ 13042 0, /* sq_slice */ 13043 0, /* sq_ass_item */ 13044 0, /* sq_ass_slice */ 13045 PyUnicode_Contains, /* sq_contains */ 13046}; 13047 13048static PyObject* 13049unicode_subscript(PyObject* self, PyObject* item) 13050{ 13051 if (PyUnicode_READY(self) == -1) 13052 return NULL; 13053 13054 if (PyIndex_Check(item)) { 13055 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError); 13056 if (i == -1 && PyErr_Occurred()) 13057 return NULL; 13058 if (i < 0) 13059 i += PyUnicode_GET_LENGTH(self); 13060 return unicode_getitem(self, i); 13061 } else if (PySlice_Check(item)) { 13062 Py_ssize_t start, stop, step, slicelength, cur, i; 13063 PyObject *result; 13064 void *src_data, *dest_data; 13065 int src_kind, dest_kind; 13066 Py_UCS4 ch, max_char, kind_limit; 13067 13068 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self), 13069 &start, &stop, &step, &slicelength) < 0) { 13070 return NULL; 13071 } 13072 13073 if (slicelength <= 0) { 13074 return PyUnicode_New(0, 0); 13075 } else if (start == 0 && step == 1 && 13076 slicelength == PyUnicode_GET_LENGTH(self) && 13077 PyUnicode_CheckExact(self)) { 13078 Py_INCREF(self); 13079 return self; 13080 } else if (step == 1) { 13081 return PyUnicode_Substring(self, 13082 start, start + slicelength); 13083 } 13084 /* General case */ 13085 src_kind = PyUnicode_KIND(self); 13086 src_data = PyUnicode_DATA(self); 13087 if (!PyUnicode_IS_ASCII(self)) { 13088 kind_limit = kind_maxchar_limit(src_kind); 13089 max_char = 0; 13090 for (cur = start, i = 0; i < slicelength; cur += step, i++) { 13091 ch = PyUnicode_READ(src_kind, src_data, cur); 13092 if (ch > max_char) { 13093 max_char = ch; 13094 if (max_char >= kind_limit) 13095 break; 13096 } 13097 } 13098 } 13099 else 13100 max_char = 127; 13101 result = PyUnicode_New(slicelength, max_char); 13102 if (result == NULL) 13103 return NULL; 13104 dest_kind = PyUnicode_KIND(result); 13105 dest_data = PyUnicode_DATA(result); 13106 13107 for (cur = start, i = 0; i < slicelength; cur += step, i++) { 13108 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur); 13109 PyUnicode_WRITE(dest_kind, dest_data, i, ch); 13110 } 13111 assert(_PyUnicode_CheckConsistency(result, 1)); 13112 return result; 13113 } else { 13114 PyErr_SetString(PyExc_TypeError, "string indices must be integers"); 13115 return NULL; 13116 } 13117} 13118 13119static PyMappingMethods unicode_as_mapping = { 13120 (lenfunc)unicode_length, /* mp_length */ 13121 (binaryfunc)unicode_subscript, /* mp_subscript */ 13122 (objobjargproc)0, /* mp_ass_subscript */ 13123}; 13124 13125 13126/* Helpers for PyUnicode_Format() */ 13127 13128static PyObject * 13129getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx) 13130{ 13131 Py_ssize_t argidx = *p_argidx; 13132 if (argidx < arglen) { 13133 (*p_argidx)++; 13134 if (arglen < 0) 13135 return args; 13136 else 13137 return PyTuple_GetItem(args, argidx); 13138 } 13139 PyErr_SetString(PyExc_TypeError, 13140 "not enough arguments for format string"); 13141 return NULL; 13142} 13143 13144/* Returns a new reference to a PyUnicode object, or NULL on failure. */ 13145 13146static PyObject * 13147formatfloat(PyObject *v, int flags, int prec, int type) 13148{ 13149 char *p; 13150 PyObject *result; 13151 double x; 13152 13153 x = PyFloat_AsDouble(v); 13154 if (x == -1.0 && PyErr_Occurred()) 13155 return NULL; 13156 13157 if (prec < 0) 13158 prec = 6; 13159 13160 p = PyOS_double_to_string(x, type, prec, 13161 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL); 13162 if (p == NULL) 13163 return NULL; 13164 result = PyUnicode_DecodeASCII(p, strlen(p), NULL); 13165 PyMem_Free(p); 13166 return result; 13167} 13168 13169static PyObject* 13170formatlong(PyObject *val, int flags, int prec, int type) 13171{ 13172 char *buf; 13173 int len; 13174 PyObject *str; /* temporary string object. */ 13175 PyObject *result; 13176 13177 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len); 13178 if (!str) 13179 return NULL; 13180 result = PyUnicode_DecodeASCII(buf, len, NULL); 13181 Py_DECREF(str); 13182 return result; 13183} 13184 13185static Py_UCS4 13186formatchar(PyObject *v) 13187{ 13188 /* presume that the buffer is at least 3 characters long */ 13189 if (PyUnicode_Check(v)) { 13190 if (PyUnicode_GET_LENGTH(v) == 1) { 13191 return PyUnicode_READ_CHAR(v, 0); 13192 } 13193 goto onError; 13194 } 13195 else { 13196 /* Integer input truncated to a character */ 13197 long x; 13198 x = PyLong_AsLong(v); 13199 if (x == -1 && PyErr_Occurred()) 13200 goto onError; 13201 13202 if (x < 0 || x > 0x10ffff) { 13203 PyErr_SetString(PyExc_OverflowError, 13204 "%c arg not in range(0x110000)"); 13205 return (Py_UCS4) -1; 13206 } 13207 13208 return (Py_UCS4) x; 13209 } 13210 13211 onError: 13212 PyErr_SetString(PyExc_TypeError, 13213 "%c requires int or char"); 13214 return (Py_UCS4) -1; 13215} 13216 13217static int 13218repeat_accumulate(_PyAccu *acc, PyObject *obj, Py_ssize_t count) 13219{ 13220 int r; 13221 assert(count > 0); 13222 assert(PyUnicode_Check(obj)); 13223 if (count > 5) { 13224 PyObject *repeated = unicode_repeat(obj, count); 13225 if (repeated == NULL) 13226 return -1; 13227 r = _PyAccu_Accumulate(acc, repeated); 13228 Py_DECREF(repeated); 13229 return r; 13230 } 13231 else { 13232 do { 13233 if (_PyAccu_Accumulate(acc, obj)) 13234 return -1; 13235 } while (--count); 13236 return 0; 13237 } 13238} 13239 13240PyObject * 13241PyUnicode_Format(PyObject *format, PyObject *args) 13242{ 13243 void *fmt; 13244 int fmtkind; 13245 PyObject *result; 13246 int kind; 13247 int r; 13248 Py_ssize_t fmtcnt, fmtpos, arglen, argidx; 13249 int args_owned = 0; 13250 PyObject *dict = NULL; 13251 PyObject *temp = NULL; 13252 PyObject *second = NULL; 13253 PyObject *uformat; 13254 _PyAccu acc; 13255 static PyObject *plus, *minus, *blank, *zero, *percent; 13256 13257 if (!plus && !(plus = get_latin1_char('+'))) 13258 return NULL; 13259 if (!minus && !(minus = get_latin1_char('-'))) 13260 return NULL; 13261 if (!blank && !(blank = get_latin1_char(' '))) 13262 return NULL; 13263 if (!zero && !(zero = get_latin1_char('0'))) 13264 return NULL; 13265 if (!percent && !(percent = get_latin1_char('%'))) 13266 return NULL; 13267 13268 if (format == NULL || args == NULL) { 13269 PyErr_BadInternalCall(); 13270 return NULL; 13271 } 13272 uformat = PyUnicode_FromObject(format); 13273 if (uformat == NULL || PyUnicode_READY(uformat) == -1) 13274 return NULL; 13275 if (_PyAccu_Init(&acc)) 13276 goto onError; 13277 fmt = PyUnicode_DATA(uformat); 13278 fmtkind = PyUnicode_KIND(uformat); 13279 fmtcnt = PyUnicode_GET_LENGTH(uformat); 13280 fmtpos = 0; 13281 13282 if (PyTuple_Check(args)) { 13283 arglen = PyTuple_Size(args); 13284 argidx = 0; 13285 } 13286 else { 13287 arglen = -1; 13288 argidx = -2; 13289 } 13290 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) && 13291 !PyUnicode_Check(args)) 13292 dict = args; 13293 13294 while (--fmtcnt >= 0) { 13295 if (PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') { 13296 PyObject *nonfmt; 13297 Py_ssize_t nonfmtpos; 13298 nonfmtpos = fmtpos++; 13299 while (fmtcnt >= 0 && 13300 PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') { 13301 fmtpos++; 13302 fmtcnt--; 13303 } 13304 nonfmt = PyUnicode_Substring(uformat, nonfmtpos, fmtpos); 13305 if (nonfmt == NULL) 13306 goto onError; 13307 r = _PyAccu_Accumulate(&acc, nonfmt); 13308 Py_DECREF(nonfmt); 13309 if (r) 13310 goto onError; 13311 } 13312 else { 13313 /* Got a format specifier */ 13314 int flags = 0; 13315 Py_ssize_t width = -1; 13316 int prec = -1; 13317 Py_UCS4 c = '\0'; 13318 Py_UCS4 fill, sign; 13319 int isnumok; 13320 PyObject *v = NULL; 13321 void *pbuf = NULL; 13322 Py_ssize_t pindex, len; 13323 PyObject *signobj = NULL, *fillobj = NULL; 13324 13325 fmtpos++; 13326 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(') { 13327 Py_ssize_t keystart; 13328 Py_ssize_t keylen; 13329 PyObject *key; 13330 int pcount = 1; 13331 13332 if (dict == NULL) { 13333 PyErr_SetString(PyExc_TypeError, 13334 "format requires a mapping"); 13335 goto onError; 13336 } 13337 ++fmtpos; 13338 --fmtcnt; 13339 keystart = fmtpos; 13340 /* Skip over balanced parentheses */ 13341 while (pcount > 0 && --fmtcnt >= 0) { 13342 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == ')') 13343 --pcount; 13344 else if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(') 13345 ++pcount; 13346 fmtpos++; 13347 } 13348 keylen = fmtpos - keystart - 1; 13349 if (fmtcnt < 0 || pcount > 0) { 13350 PyErr_SetString(PyExc_ValueError, 13351 "incomplete format key"); 13352 goto onError; 13353 } 13354 key = PyUnicode_Substring(uformat, 13355 keystart, keystart + keylen); 13356 if (key == NULL) 13357 goto onError; 13358 if (args_owned) { 13359 Py_DECREF(args); 13360 args_owned = 0; 13361 } 13362 args = PyObject_GetItem(dict, key); 13363 Py_DECREF(key); 13364 if (args == NULL) { 13365 goto onError; 13366 } 13367 args_owned = 1; 13368 arglen = -1; 13369 argidx = -2; 13370 } 13371 while (--fmtcnt >= 0) { 13372 switch (c = PyUnicode_READ(fmtkind, fmt, fmtpos++)) { 13373 case '-': flags |= F_LJUST; continue; 13374 case '+': flags |= F_SIGN; continue; 13375 case ' ': flags |= F_BLANK; continue; 13376 case '#': flags |= F_ALT; continue; 13377 case '0': flags |= F_ZERO; continue; 13378 } 13379 break; 13380 } 13381 if (c == '*') { 13382 v = getnextarg(args, arglen, &argidx); 13383 if (v == NULL) 13384 goto onError; 13385 if (!PyLong_Check(v)) { 13386 PyErr_SetString(PyExc_TypeError, 13387 "* wants int"); 13388 goto onError; 13389 } 13390 width = PyLong_AsLong(v); 13391 if (width == -1 && PyErr_Occurred()) 13392 goto onError; 13393 if (width < 0) { 13394 flags |= F_LJUST; 13395 width = -width; 13396 } 13397 if (--fmtcnt >= 0) 13398 c = PyUnicode_READ(fmtkind, fmt, fmtpos++); 13399 } 13400 else if (c >= '0' && c <= '9') { 13401 width = c - '0'; 13402 while (--fmtcnt >= 0) { 13403 c = PyUnicode_READ(fmtkind, fmt, fmtpos++); 13404 if (c < '0' || c > '9') 13405 break; 13406 if ((width*10) / 10 != width) { 13407 PyErr_SetString(PyExc_ValueError, 13408 "width too big"); 13409 goto onError; 13410 } 13411 width = width*10 + (c - '0'); 13412 } 13413 } 13414 if (c == '.') { 13415 prec = 0; 13416 if (--fmtcnt >= 0) 13417 c = PyUnicode_READ(fmtkind, fmt, fmtpos++); 13418 if (c == '*') { 13419 v = getnextarg(args, arglen, &argidx); 13420 if (v == NULL) 13421 goto onError; 13422 if (!PyLong_Check(v)) { 13423 PyErr_SetString(PyExc_TypeError, 13424 "* wants int"); 13425 goto onError; 13426 } 13427 prec = PyLong_AsLong(v); 13428 if (prec == -1 && PyErr_Occurred()) 13429 goto onError; 13430 if (prec < 0) 13431 prec = 0; 13432 if (--fmtcnt >= 0) 13433 c = PyUnicode_READ(fmtkind, fmt, fmtpos++); 13434 } 13435 else if (c >= '0' && c <= '9') { 13436 prec = c - '0'; 13437 while (--fmtcnt >= 0) { 13438 c = PyUnicode_READ(fmtkind, fmt, fmtpos++); 13439 if (c < '0' || c > '9') 13440 break; 13441 if ((prec*10) / 10 != prec) { 13442 PyErr_SetString(PyExc_ValueError, 13443 "prec too big"); 13444 goto onError; 13445 } 13446 prec = prec*10 + (c - '0'); 13447 } 13448 } 13449 } /* prec */ 13450 if (fmtcnt >= 0) { 13451 if (c == 'h' || c == 'l' || c == 'L') { 13452 if (--fmtcnt >= 0) 13453 c = PyUnicode_READ(fmtkind, fmt, fmtpos++); 13454 } 13455 } 13456 if (fmtcnt < 0) { 13457 PyErr_SetString(PyExc_ValueError, 13458 "incomplete format"); 13459 goto onError; 13460 } 13461 if (c != '%') { 13462 v = getnextarg(args, arglen, &argidx); 13463 if (v == NULL) 13464 goto onError; 13465 } 13466 sign = 0; 13467 fill = ' '; 13468 fillobj = blank; 13469 switch (c) { 13470 13471 case '%': 13472 _PyAccu_Accumulate(&acc, percent); 13473 continue; 13474 13475 case 's': 13476 case 'r': 13477 case 'a': 13478 if (PyUnicode_CheckExact(v) && c == 's') { 13479 temp = v; 13480 Py_INCREF(temp); 13481 } 13482 else { 13483 if (c == 's') 13484 temp = PyObject_Str(v); 13485 else if (c == 'r') 13486 temp = PyObject_Repr(v); 13487 else 13488 temp = PyObject_ASCII(v); 13489 if (temp == NULL) 13490 goto onError; 13491 if (PyUnicode_Check(temp)) 13492 /* nothing to do */; 13493 else { 13494 Py_DECREF(temp); 13495 PyErr_SetString(PyExc_TypeError, 13496 "%s argument has non-string str()"); 13497 goto onError; 13498 } 13499 } 13500 if (PyUnicode_READY(temp) == -1) { 13501 Py_CLEAR(temp); 13502 goto onError; 13503 } 13504 pbuf = PyUnicode_DATA(temp); 13505 kind = PyUnicode_KIND(temp); 13506 len = PyUnicode_GET_LENGTH(temp); 13507 if (prec >= 0 && len > prec) 13508 len = prec; 13509 break; 13510 13511 case 'i': 13512 case 'd': 13513 case 'u': 13514 case 'o': 13515 case 'x': 13516 case 'X': 13517 isnumok = 0; 13518 if (PyNumber_Check(v)) { 13519 PyObject *iobj=NULL; 13520 13521 if (PyLong_Check(v)) { 13522 iobj = v; 13523 Py_INCREF(iobj); 13524 } 13525 else { 13526 iobj = PyNumber_Long(v); 13527 } 13528 if (iobj!=NULL) { 13529 if (PyLong_Check(iobj)) { 13530 isnumok = 1; 13531 temp = formatlong(iobj, flags, prec, (c == 'i'? 'd': c)); 13532 Py_DECREF(iobj); 13533 if (!temp) 13534 goto onError; 13535 if (PyUnicode_READY(temp) == -1) { 13536 Py_CLEAR(temp); 13537 goto onError; 13538 } 13539 pbuf = PyUnicode_DATA(temp); 13540 kind = PyUnicode_KIND(temp); 13541 len = PyUnicode_GET_LENGTH(temp); 13542 sign = 1; 13543 } 13544 else { 13545 Py_DECREF(iobj); 13546 } 13547 } 13548 } 13549 if (!isnumok) { 13550 PyErr_Format(PyExc_TypeError, 13551 "%%%c format: a number is required, " 13552 "not %.200s", (char)c, Py_TYPE(v)->tp_name); 13553 goto onError; 13554 } 13555 if (flags & F_ZERO) { 13556 fill = '0'; 13557 fillobj = zero; 13558 } 13559 break; 13560 13561 case 'e': 13562 case 'E': 13563 case 'f': 13564 case 'F': 13565 case 'g': 13566 case 'G': 13567 temp = formatfloat(v, flags, prec, c); 13568 if (!temp) 13569 goto onError; 13570 if (PyUnicode_READY(temp) == -1) { 13571 Py_CLEAR(temp); 13572 goto onError; 13573 } 13574 pbuf = PyUnicode_DATA(temp); 13575 kind = PyUnicode_KIND(temp); 13576 len = PyUnicode_GET_LENGTH(temp); 13577 sign = 1; 13578 if (flags & F_ZERO) { 13579 fill = '0'; 13580 fillobj = zero; 13581 } 13582 break; 13583 13584 case 'c': 13585 { 13586 Py_UCS4 ch = formatchar(v); 13587 if (ch == (Py_UCS4) -1) 13588 goto onError; 13589 temp = _PyUnicode_FromUCS4(&ch, 1); 13590 if (temp == NULL) 13591 goto onError; 13592 pbuf = PyUnicode_DATA(temp); 13593 kind = PyUnicode_KIND(temp); 13594 len = PyUnicode_GET_LENGTH(temp); 13595 break; 13596 } 13597 13598 default: 13599 PyErr_Format(PyExc_ValueError, 13600 "unsupported format character '%c' (0x%x) " 13601 "at index %zd", 13602 (31<=c && c<=126) ? (char)c : '?', 13603 (int)c, 13604 fmtpos - 1); 13605 goto onError; 13606 } 13607 /* pbuf is initialized here. */ 13608 pindex = 0; 13609 if (sign) { 13610 if (PyUnicode_READ(kind, pbuf, pindex) == '-') { 13611 signobj = minus; 13612 len--; 13613 pindex++; 13614 } 13615 else if (PyUnicode_READ(kind, pbuf, pindex) == '+') { 13616 signobj = plus; 13617 len--; 13618 pindex++; 13619 } 13620 else if (flags & F_SIGN) 13621 signobj = plus; 13622 else if (flags & F_BLANK) 13623 signobj = blank; 13624 else 13625 sign = 0; 13626 } 13627 if (width < len) 13628 width = len; 13629 if (sign) { 13630 if (fill != ' ') { 13631 assert(signobj != NULL); 13632 if (_PyAccu_Accumulate(&acc, signobj)) 13633 goto onError; 13634 } 13635 if (width > len) 13636 width--; 13637 } 13638 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) { 13639 assert(PyUnicode_READ(kind, pbuf, pindex) == '0'); 13640 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == c); 13641 if (fill != ' ') { 13642 second = get_latin1_char( 13643 PyUnicode_READ(kind, pbuf, pindex + 1)); 13644 pindex += 2; 13645 if (second == NULL || 13646 _PyAccu_Accumulate(&acc, zero) || 13647 _PyAccu_Accumulate(&acc, second)) 13648 goto onError; 13649 Py_CLEAR(second); 13650 } 13651 width -= 2; 13652 if (width < 0) 13653 width = 0; 13654 len -= 2; 13655 } 13656 if (width > len && !(flags & F_LJUST)) { 13657 assert(fillobj != NULL); 13658 if (repeat_accumulate(&acc, fillobj, width - len)) 13659 goto onError; 13660 width = len; 13661 } 13662 if (fill == ' ') { 13663 if (sign) { 13664 assert(signobj != NULL); 13665 if (_PyAccu_Accumulate(&acc, signobj)) 13666 goto onError; 13667 } 13668 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) { 13669 assert(PyUnicode_READ(kind, pbuf, pindex) == '0'); 13670 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c); 13671 second = get_latin1_char( 13672 PyUnicode_READ(kind, pbuf, pindex + 1)); 13673 pindex += 2; 13674 if (second == NULL || 13675 _PyAccu_Accumulate(&acc, zero) || 13676 _PyAccu_Accumulate(&acc, second)) 13677 goto onError; 13678 Py_CLEAR(second); 13679 } 13680 } 13681 /* Copy all characters, preserving len */ 13682 if (temp != NULL) { 13683 assert(pbuf == PyUnicode_DATA(temp)); 13684 v = PyUnicode_Substring(temp, pindex, pindex + len); 13685 } 13686 else { 13687 const char *p = (const char *) pbuf; 13688 assert(pbuf != NULL); 13689 p += kind * pindex; 13690 v = PyUnicode_FromKindAndData(kind, p, len); 13691 } 13692 if (v == NULL) 13693 goto onError; 13694 r = _PyAccu_Accumulate(&acc, v); 13695 Py_DECREF(v); 13696 if (r) 13697 goto onError; 13698 if (width > len && repeat_accumulate(&acc, blank, width - len)) 13699 goto onError; 13700 if (dict && (argidx < arglen) && c != '%') { 13701 PyErr_SetString(PyExc_TypeError, 13702 "not all arguments converted during string formatting"); 13703 goto onError; 13704 } 13705 Py_CLEAR(temp); 13706 } /* '%' */ 13707 } /* until end */ 13708 if (argidx < arglen && !dict) { 13709 PyErr_SetString(PyExc_TypeError, 13710 "not all arguments converted during string formatting"); 13711 goto onError; 13712 } 13713 13714 result = _PyAccu_Finish(&acc); 13715 if (args_owned) { 13716 Py_DECREF(args); 13717 } 13718 Py_DECREF(uformat); 13719 Py_XDECREF(temp); 13720 Py_XDECREF(second); 13721 return result; 13722 13723 onError: 13724 Py_DECREF(uformat); 13725 Py_XDECREF(temp); 13726 Py_XDECREF(second); 13727 _PyAccu_Destroy(&acc); 13728 if (args_owned) { 13729 Py_DECREF(args); 13730 } 13731 return NULL; 13732} 13733 13734static PyObject * 13735unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds); 13736 13737static PyObject * 13738unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds) 13739{ 13740 PyObject *x = NULL; 13741 static char *kwlist[] = {"object", "encoding", "errors", 0}; 13742 char *encoding = NULL; 13743 char *errors = NULL; 13744 13745 if (type != &PyUnicode_Type) 13746 return unicode_subtype_new(type, args, kwds); 13747 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str", 13748 kwlist, &x, &encoding, &errors)) 13749 return NULL; 13750 if (x == NULL) 13751 return PyUnicode_New(0, 0); 13752 if (encoding == NULL && errors == NULL) 13753 return PyObject_Str(x); 13754 else 13755 return PyUnicode_FromEncodedObject(x, encoding, errors); 13756} 13757 13758static PyObject * 13759unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds) 13760{ 13761 PyObject *unicode, *self; 13762 Py_ssize_t length, char_size; 13763 int share_wstr, share_utf8; 13764 unsigned int kind; 13765 void *data; 13766 13767 assert(PyType_IsSubtype(type, &PyUnicode_Type)); 13768 13769 unicode = unicode_new(&PyUnicode_Type, args, kwds); 13770 if (unicode == NULL) 13771 return NULL; 13772 assert(_PyUnicode_CHECK(unicode)); 13773 if (PyUnicode_READY(unicode)) 13774 return NULL; 13775 13776 self = type->tp_alloc(type, 0); 13777 if (self == NULL) { 13778 Py_DECREF(unicode); 13779 return NULL; 13780 } 13781 kind = PyUnicode_KIND(unicode); 13782 length = PyUnicode_GET_LENGTH(unicode); 13783 13784 _PyUnicode_LENGTH(self) = length; 13785#ifdef Py_DEBUG 13786 _PyUnicode_HASH(self) = -1; 13787#else 13788 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode); 13789#endif 13790 _PyUnicode_STATE(self).interned = 0; 13791 _PyUnicode_STATE(self).kind = kind; 13792 _PyUnicode_STATE(self).compact = 0; 13793 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii; 13794 _PyUnicode_STATE(self).ready = 1; 13795 _PyUnicode_WSTR(self) = NULL; 13796 _PyUnicode_UTF8_LENGTH(self) = 0; 13797 _PyUnicode_UTF8(self) = NULL; 13798 _PyUnicode_WSTR_LENGTH(self) = 0; 13799 _PyUnicode_DATA_ANY(self) = NULL; 13800 13801 share_utf8 = 0; 13802 share_wstr = 0; 13803 if (kind == PyUnicode_1BYTE_KIND) { 13804 char_size = 1; 13805 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128) 13806 share_utf8 = 1; 13807 } 13808 else if (kind == PyUnicode_2BYTE_KIND) { 13809 char_size = 2; 13810 if (sizeof(wchar_t) == 2) 13811 share_wstr = 1; 13812 } 13813 else { 13814 assert(kind == PyUnicode_4BYTE_KIND); 13815 char_size = 4; 13816 if (sizeof(wchar_t) == 4) 13817 share_wstr = 1; 13818 } 13819 13820 /* Ensure we won't overflow the length. */ 13821 if (length > (PY_SSIZE_T_MAX / char_size - 1)) { 13822 PyErr_NoMemory(); 13823 goto onError; 13824 } 13825 data = PyObject_MALLOC((length + 1) * char_size); 13826 if (data == NULL) { 13827 PyErr_NoMemory(); 13828 goto onError; 13829 } 13830 13831 _PyUnicode_DATA_ANY(self) = data; 13832 if (share_utf8) { 13833 _PyUnicode_UTF8_LENGTH(self) = length; 13834 _PyUnicode_UTF8(self) = data; 13835 } 13836 if (share_wstr) { 13837 _PyUnicode_WSTR_LENGTH(self) = length; 13838 _PyUnicode_WSTR(self) = (wchar_t *)data; 13839 } 13840 13841 Py_MEMCPY(data, PyUnicode_DATA(unicode), 13842 kind * (length + 1)); 13843 assert(_PyUnicode_CheckConsistency(self, 1)); 13844#ifdef Py_DEBUG 13845 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode); 13846#endif 13847 Py_DECREF(unicode); 13848 return self; 13849 13850onError: 13851 Py_DECREF(unicode); 13852 Py_DECREF(self); 13853 return NULL; 13854} 13855 13856PyDoc_STRVAR(unicode_doc, 13857 "str(string[, encoding[, errors]]) -> str\n\ 13858\n\ 13859Create a new string object from the given encoded string.\n\ 13860encoding defaults to the current default string encoding.\n\ 13861errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'."); 13862 13863static PyObject *unicode_iter(PyObject *seq); 13864 13865PyTypeObject PyUnicode_Type = { 13866 PyVarObject_HEAD_INIT(&PyType_Type, 0) 13867 "str", /* tp_name */ 13868 sizeof(PyUnicodeObject), /* tp_size */ 13869 0, /* tp_itemsize */ 13870 /* Slots */ 13871 (destructor)unicode_dealloc, /* tp_dealloc */ 13872 0, /* tp_print */ 13873 0, /* tp_getattr */ 13874 0, /* tp_setattr */ 13875 0, /* tp_reserved */ 13876 unicode_repr, /* tp_repr */ 13877 &unicode_as_number, /* tp_as_number */ 13878 &unicode_as_sequence, /* tp_as_sequence */ 13879 &unicode_as_mapping, /* tp_as_mapping */ 13880 (hashfunc) unicode_hash, /* tp_hash*/ 13881 0, /* tp_call*/ 13882 (reprfunc) unicode_str, /* tp_str */ 13883 PyObject_GenericGetAttr, /* tp_getattro */ 13884 0, /* tp_setattro */ 13885 0, /* tp_as_buffer */ 13886 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE | 13887 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */ 13888 unicode_doc, /* tp_doc */ 13889 0, /* tp_traverse */ 13890 0, /* tp_clear */ 13891 PyUnicode_RichCompare, /* tp_richcompare */ 13892 0, /* tp_weaklistoffset */ 13893 unicode_iter, /* tp_iter */ 13894 0, /* tp_iternext */ 13895 unicode_methods, /* tp_methods */ 13896 0, /* tp_members */ 13897 0, /* tp_getset */ 13898 &PyBaseObject_Type, /* tp_base */ 13899 0, /* tp_dict */ 13900 0, /* tp_descr_get */ 13901 0, /* tp_descr_set */ 13902 0, /* tp_dictoffset */ 13903 0, /* tp_init */ 13904 0, /* tp_alloc */ 13905 unicode_new, /* tp_new */ 13906 PyObject_Del, /* tp_free */ 13907}; 13908 13909/* Initialize the Unicode implementation */ 13910 13911int _PyUnicode_Init(void) 13912{ 13913 int i; 13914 13915 /* XXX - move this array to unicodectype.c ? */ 13916 Py_UCS2 linebreak[] = { 13917 0x000A, /* LINE FEED */ 13918 0x000D, /* CARRIAGE RETURN */ 13919 0x001C, /* FILE SEPARATOR */ 13920 0x001D, /* GROUP SEPARATOR */ 13921 0x001E, /* RECORD SEPARATOR */ 13922 0x0085, /* NEXT LINE */ 13923 0x2028, /* LINE SEPARATOR */ 13924 0x2029, /* PARAGRAPH SEPARATOR */ 13925 }; 13926 13927 /* Init the implementation */ 13928 unicode_empty = PyUnicode_New(0, 0); 13929 assert(_PyUnicode_CheckConsistency(unicode_empty, 1)); 13930 if (!unicode_empty) 13931 Py_FatalError("Can't create empty string"); 13932 13933 for (i = 0; i < 256; i++) 13934 unicode_latin1[i] = NULL; 13935 if (PyType_Ready(&PyUnicode_Type) < 0) 13936 Py_FatalError("Can't initialize 'unicode'"); 13937 13938 /* initialize the linebreak bloom filter */ 13939 bloom_linebreak = make_bloom_mask( 13940 PyUnicode_2BYTE_KIND, linebreak, 13941 Py_ARRAY_LENGTH(linebreak)); 13942 13943 PyType_Ready(&EncodingMapType); 13944 13945#ifdef HAVE_MBCS 13946 winver.dwOSVersionInfoSize = sizeof(winver); 13947 if (!GetVersionEx((OSVERSIONINFO*)&winver)) { 13948 PyErr_SetFromWindowsErr(0); 13949 return -1; 13950 } 13951#endif 13952 return 0; 13953} 13954 13955/* Finalize the Unicode implementation */ 13956 13957int 13958PyUnicode_ClearFreeList(void) 13959{ 13960 return 0; 13961} 13962 13963void 13964_PyUnicode_Fini(void) 13965{ 13966 int i; 13967 13968 Py_XDECREF(unicode_empty); 13969 unicode_empty = NULL; 13970 13971 for (i = 0; i < 256; i++) { 13972 if (unicode_latin1[i]) { 13973 Py_DECREF(unicode_latin1[i]); 13974 unicode_latin1[i] = NULL; 13975 } 13976 } 13977 _PyUnicode_ClearStaticStrings(); 13978 (void)PyUnicode_ClearFreeList(); 13979} 13980 13981void 13982PyUnicode_InternInPlace(PyObject **p) 13983{ 13984 register PyObject *s = *p; 13985 PyObject *t; 13986#ifdef Py_DEBUG 13987 assert(s != NULL); 13988 assert(_PyUnicode_CHECK(s)); 13989#else 13990 if (s == NULL || !PyUnicode_Check(s)) 13991 return; 13992#endif 13993 /* If it's a subclass, we don't really know what putting 13994 it in the interned dict might do. */ 13995 if (!PyUnicode_CheckExact(s)) 13996 return; 13997 if (PyUnicode_CHECK_INTERNED(s)) 13998 return; 13999 if (_PyUnicode_READY_REPLACE(p)) { 14000 assert(0 && "_PyUnicode_READY_REPLACE fail in PyUnicode_InternInPlace"); 14001 return; 14002 } 14003 s = *p; 14004 if (interned == NULL) { 14005 interned = PyDict_New(); 14006 if (interned == NULL) { 14007 PyErr_Clear(); /* Don't leave an exception */ 14008 return; 14009 } 14010 } 14011 /* It might be that the GetItem call fails even 14012 though the key is present in the dictionary, 14013 namely when this happens during a stack overflow. */ 14014 Py_ALLOW_RECURSION 14015 t = PyDict_GetItem(interned, s); 14016 Py_END_ALLOW_RECURSION 14017 14018 if (t) { 14019 Py_INCREF(t); 14020 Py_DECREF(*p); 14021 *p = t; 14022 return; 14023 } 14024 14025 PyThreadState_GET()->recursion_critical = 1; 14026 if (PyDict_SetItem(interned, s, s) < 0) { 14027 PyErr_Clear(); 14028 PyThreadState_GET()->recursion_critical = 0; 14029 return; 14030 } 14031 PyThreadState_GET()->recursion_critical = 0; 14032 /* The two references in interned are not counted by refcnt. 14033 The deallocator will take care of this */ 14034 Py_REFCNT(s) -= 2; 14035 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL; 14036} 14037 14038void 14039PyUnicode_InternImmortal(PyObject **p) 14040{ 14041 PyUnicode_InternInPlace(p); 14042 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) { 14043 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL; 14044 Py_INCREF(*p); 14045 } 14046} 14047 14048PyObject * 14049PyUnicode_InternFromString(const char *cp) 14050{ 14051 PyObject *s = PyUnicode_FromString(cp); 14052 if (s == NULL) 14053 return NULL; 14054 PyUnicode_InternInPlace(&s); 14055 return s; 14056} 14057 14058void 14059_Py_ReleaseInternedUnicodeStrings(void) 14060{ 14061 PyObject *keys; 14062 PyObject *s; 14063 Py_ssize_t i, n; 14064 Py_ssize_t immortal_size = 0, mortal_size = 0; 14065 14066 if (interned == NULL || !PyDict_Check(interned)) 14067 return; 14068 keys = PyDict_Keys(interned); 14069 if (keys == NULL || !PyList_Check(keys)) { 14070 PyErr_Clear(); 14071 return; 14072 } 14073 14074 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak 14075 detector, interned unicode strings are not forcibly deallocated; 14076 rather, we give them their stolen references back, and then clear 14077 and DECREF the interned dict. */ 14078 14079 n = PyList_GET_SIZE(keys); 14080 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n", 14081 n); 14082 for (i = 0; i < n; i++) { 14083 s = PyList_GET_ITEM(keys, i); 14084 if (PyUnicode_READY(s) == -1) { 14085 assert(0 && "could not ready string"); 14086 fprintf(stderr, "could not ready string\n"); 14087 } 14088 switch (PyUnicode_CHECK_INTERNED(s)) { 14089 case SSTATE_NOT_INTERNED: 14090 /* XXX Shouldn't happen */ 14091 break; 14092 case SSTATE_INTERNED_IMMORTAL: 14093 Py_REFCNT(s) += 1; 14094 immortal_size += PyUnicode_GET_LENGTH(s); 14095 break; 14096 case SSTATE_INTERNED_MORTAL: 14097 Py_REFCNT(s) += 2; 14098 mortal_size += PyUnicode_GET_LENGTH(s); 14099 break; 14100 default: 14101 Py_FatalError("Inconsistent interned string state."); 14102 } 14103 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED; 14104 } 14105 fprintf(stderr, "total size of all interned strings: " 14106 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d " 14107 "mortal/immortal\n", mortal_size, immortal_size); 14108 Py_DECREF(keys); 14109 PyDict_Clear(interned); 14110 Py_DECREF(interned); 14111 interned = NULL; 14112} 14113 14114 14115/********************* Unicode Iterator **************************/ 14116 14117typedef struct { 14118 PyObject_HEAD 14119 Py_ssize_t it_index; 14120 PyObject *it_seq; /* Set to NULL when iterator is exhausted */ 14121} unicodeiterobject; 14122 14123static void 14124unicodeiter_dealloc(unicodeiterobject *it) 14125{ 14126 _PyObject_GC_UNTRACK(it); 14127 Py_XDECREF(it->it_seq); 14128 PyObject_GC_Del(it); 14129} 14130 14131static int 14132unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg) 14133{ 14134 Py_VISIT(it->it_seq); 14135 return 0; 14136} 14137 14138static PyObject * 14139unicodeiter_next(unicodeiterobject *it) 14140{ 14141 PyObject *seq, *item; 14142 14143 assert(it != NULL); 14144 seq = it->it_seq; 14145 if (seq == NULL) 14146 return NULL; 14147 assert(_PyUnicode_CHECK(seq)); 14148 14149 if (it->it_index < PyUnicode_GET_LENGTH(seq)) { 14150 int kind = PyUnicode_KIND(seq); 14151 void *data = PyUnicode_DATA(seq); 14152 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index); 14153 item = PyUnicode_FromOrdinal(chr); 14154 if (item != NULL) 14155 ++it->it_index; 14156 return item; 14157 } 14158 14159 Py_DECREF(seq); 14160 it->it_seq = NULL; 14161 return NULL; 14162} 14163 14164static PyObject * 14165unicodeiter_len(unicodeiterobject *it) 14166{ 14167 Py_ssize_t len = 0; 14168 if (it->it_seq) 14169 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index; 14170 return PyLong_FromSsize_t(len); 14171} 14172 14173PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it))."); 14174 14175static PyMethodDef unicodeiter_methods[] = { 14176 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS, 14177 length_hint_doc}, 14178 {NULL, NULL} /* sentinel */ 14179}; 14180 14181PyTypeObject PyUnicodeIter_Type = { 14182 PyVarObject_HEAD_INIT(&PyType_Type, 0) 14183 "str_iterator", /* tp_name */ 14184 sizeof(unicodeiterobject), /* tp_basicsize */ 14185 0, /* tp_itemsize */ 14186 /* methods */ 14187 (destructor)unicodeiter_dealloc, /* tp_dealloc */ 14188 0, /* tp_print */ 14189 0, /* tp_getattr */ 14190 0, /* tp_setattr */ 14191 0, /* tp_reserved */ 14192 0, /* tp_repr */ 14193 0, /* tp_as_number */ 14194 0, /* tp_as_sequence */ 14195 0, /* tp_as_mapping */ 14196 0, /* tp_hash */ 14197 0, /* tp_call */ 14198 0, /* tp_str */ 14199 PyObject_GenericGetAttr, /* tp_getattro */ 14200 0, /* tp_setattro */ 14201 0, /* tp_as_buffer */ 14202 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */ 14203 0, /* tp_doc */ 14204 (traverseproc)unicodeiter_traverse, /* tp_traverse */ 14205 0, /* tp_clear */ 14206 0, /* tp_richcompare */ 14207 0, /* tp_weaklistoffset */ 14208 PyObject_SelfIter, /* tp_iter */ 14209 (iternextfunc)unicodeiter_next, /* tp_iternext */ 14210 unicodeiter_methods, /* tp_methods */ 14211 0, 14212}; 14213 14214static PyObject * 14215unicode_iter(PyObject *seq) 14216{ 14217 unicodeiterobject *it; 14218 14219 if (!PyUnicode_Check(seq)) { 14220 PyErr_BadInternalCall(); 14221 return NULL; 14222 } 14223 if (PyUnicode_READY(seq) == -1) 14224 return NULL; 14225 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type); 14226 if (it == NULL) 14227 return NULL; 14228 it->it_index = 0; 14229 Py_INCREF(seq); 14230 it->it_seq = seq; 14231 _PyObject_GC_TRACK(it); 14232 return (PyObject *)it; 14233} 14234 14235 14236size_t 14237Py_UNICODE_strlen(const Py_UNICODE *u) 14238{ 14239 int res = 0; 14240 while(*u++) 14241 res++; 14242 return res; 14243} 14244 14245Py_UNICODE* 14246Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2) 14247{ 14248 Py_UNICODE *u = s1; 14249 while ((*u++ = *s2++)); 14250 return s1; 14251} 14252 14253Py_UNICODE* 14254Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n) 14255{ 14256 Py_UNICODE *u = s1; 14257 while ((*u++ = *s2++)) 14258 if (n-- == 0) 14259 break; 14260 return s1; 14261} 14262 14263Py_UNICODE* 14264Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2) 14265{ 14266 Py_UNICODE *u1 = s1; 14267 u1 += Py_UNICODE_strlen(u1); 14268 Py_UNICODE_strcpy(u1, s2); 14269 return s1; 14270} 14271 14272int 14273Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2) 14274{ 14275 while (*s1 && *s2 && *s1 == *s2) 14276 s1++, s2++; 14277 if (*s1 && *s2) 14278 return (*s1 < *s2) ? -1 : +1; 14279 if (*s1) 14280 return 1; 14281 if (*s2) 14282 return -1; 14283 return 0; 14284} 14285 14286int 14287Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n) 14288{ 14289 register Py_UNICODE u1, u2; 14290 for (; n != 0; n--) { 14291 u1 = *s1; 14292 u2 = *s2; 14293 if (u1 != u2) 14294 return (u1 < u2) ? -1 : +1; 14295 if (u1 == '\0') 14296 return 0; 14297 s1++; 14298 s2++; 14299 } 14300 return 0; 14301} 14302 14303Py_UNICODE* 14304Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c) 14305{ 14306 const Py_UNICODE *p; 14307 for (p = s; *p; p++) 14308 if (*p == c) 14309 return (Py_UNICODE*)p; 14310 return NULL; 14311} 14312 14313Py_UNICODE* 14314Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c) 14315{ 14316 const Py_UNICODE *p; 14317 p = s + Py_UNICODE_strlen(s); 14318 while (p != s) { 14319 p--; 14320 if (*p == c) 14321 return (Py_UNICODE*)p; 14322 } 14323 return NULL; 14324} 14325 14326Py_UNICODE* 14327PyUnicode_AsUnicodeCopy(PyObject *unicode) 14328{ 14329 Py_UNICODE *u, *copy; 14330 Py_ssize_t len, size; 14331 14332 if (!PyUnicode_Check(unicode)) { 14333 PyErr_BadArgument(); 14334 return NULL; 14335 } 14336 u = PyUnicode_AsUnicodeAndSize(unicode, &len); 14337 if (u == NULL) 14338 return NULL; 14339 /* Ensure we won't overflow the size. */ 14340 if (len > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) { 14341 PyErr_NoMemory(); 14342 return NULL; 14343 } 14344 size = len + 1; /* copy the null character */ 14345 size *= sizeof(Py_UNICODE); 14346 copy = PyMem_Malloc(size); 14347 if (copy == NULL) { 14348 PyErr_NoMemory(); 14349 return NULL; 14350 } 14351 memcpy(copy, u, size); 14352 return copy; 14353} 14354 14355/* A _string module, to export formatter_parser and formatter_field_name_split 14356 to the string.Formatter class implemented in Python. */ 14357 14358static PyMethodDef _string_methods[] = { 14359 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split, 14360 METH_O, PyDoc_STR("split the argument as a field name")}, 14361 {"formatter_parser", (PyCFunction) formatter_parser, 14362 METH_O, PyDoc_STR("parse the argument as a format string")}, 14363 {NULL, NULL} 14364}; 14365 14366static struct PyModuleDef _string_module = { 14367 PyModuleDef_HEAD_INIT, 14368 "_string", 14369 PyDoc_STR("string helper module"), 14370 0, 14371 _string_methods, 14372 NULL, 14373 NULL, 14374 NULL, 14375 NULL 14376}; 14377 14378PyMODINIT_FUNC 14379PyInit__string(void) 14380{ 14381 return PyModule_Create(&_string_module); 14382} 14383 14384 14385#ifdef __cplusplus 14386} 14387#endif 14388