unicodeobject.c revision 84def3774d2079ea2a812e0220507ff0e27247e7
1/* 2 3Unicode implementation based on original code by Fredrik Lundh, 4modified by Marc-Andre Lemburg <mal@lemburg.com>. 5 6Major speed upgrades to the method implementations at the Reykjavik 7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke. 8 9Copyright (c) Corporation for National Research Initiatives. 10 11-------------------------------------------------------------------- 12The original string type implementation is: 13 14 Copyright (c) 1999 by Secret Labs AB 15 Copyright (c) 1999 by Fredrik Lundh 16 17By obtaining, using, and/or copying this software and/or its 18associated documentation, you agree that you have read, understood, 19and will comply with the following terms and conditions: 20 21Permission to use, copy, modify, and distribute this software and its 22associated documentation for any purpose and without fee is hereby 23granted, provided that the above copyright notice appears in all 24copies, and that both that copyright notice and this permission notice 25appear in supporting documentation, and that the name of Secret Labs 26AB or the author not be used in advertising or publicity pertaining to 27distribution of the software without specific, written prior 28permission. 29 30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO 31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND 32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR 33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT 36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
37-------------------------------------------------------------------- 38 39*/ 40 41#define PY_SSIZE_T_CLEAN 42#include "Python.h" 43#include "ucnhash.h" 44 45#ifdef MS_WINDOWS 46#include <windows.h> 47#endif 48 49/* Endianness switches; defaults to little endian */ 50 51#ifdef WORDS_BIGENDIAN 52# define BYTEORDER_IS_BIG_ENDIAN 53#else 54# define BYTEORDER_IS_LITTLE_ENDIAN 55#endif 56 57/* --- Globals ------------------------------------------------------------ 58 59 The globals are initialized by the _PyUnicode_Init() API and should 60 not be used before calling that API. 61 62*/ 63 64 65#ifdef __cplusplus 66extern "C" { 67#endif 68 69/* Maximum code point of Unicode 6.0: 0x10ffff (1,114,111) */ 70#define MAX_UNICODE 0x10ffff 71 72#ifdef Py_DEBUG 73# define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0) 74#else 75# define _PyUnicode_CHECK(op) PyUnicode_Check(op) 76#endif 77 78#define _PyUnicode_UTF8(op) \ 79 (((PyCompactUnicodeObject*)(op))->utf8) 80#define PyUnicode_UTF8(op) \ 81 (assert(_PyUnicode_CHECK(op)), \ 82 assert(PyUnicode_IS_READY(op)), \ 83 PyUnicode_IS_COMPACT_ASCII(op) ? \ 84 ((char*)((PyASCIIObject*)(op) + 1)) : \ 85 _PyUnicode_UTF8(op)) 86#define _PyUnicode_UTF8_LENGTH(op) \ 87 (((PyCompactUnicodeObject*)(op))->utf8_length) 88#define PyUnicode_UTF8_LENGTH(op) \ 89 (assert(_PyUnicode_CHECK(op)), \ 90 assert(PyUnicode_IS_READY(op)), \ 91 PyUnicode_IS_COMPACT_ASCII(op) ? 
\ 92 ((PyASCIIObject*)(op))->length : \ 93 _PyUnicode_UTF8_LENGTH(op)) 94#define _PyUnicode_WSTR(op) \ 95 (((PyASCIIObject*)(op))->wstr) 96#define _PyUnicode_WSTR_LENGTH(op) \ 97 (((PyCompactUnicodeObject*)(op))->wstr_length) 98#define _PyUnicode_LENGTH(op) \ 99 (((PyASCIIObject *)(op))->length) 100#define _PyUnicode_STATE(op) \ 101 (((PyASCIIObject *)(op))->state) 102#define _PyUnicode_HASH(op) \ 103 (((PyASCIIObject *)(op))->hash) 104#define _PyUnicode_KIND(op) \ 105 (assert(_PyUnicode_CHECK(op)), \ 106 ((PyASCIIObject *)(op))->state.kind) 107#define _PyUnicode_GET_LENGTH(op) \ 108 (assert(_PyUnicode_CHECK(op)), \ 109 ((PyASCIIObject *)(op))->length) 110#define _PyUnicode_DATA_ANY(op) \ 111 (((PyUnicodeObject*)(op))->data.any) 112 113#undef PyUnicode_READY 114#define PyUnicode_READY(op) \ 115 (assert(_PyUnicode_CHECK(op)), \ 116 (PyUnicode_IS_READY(op) ? \ 117 0 : \ 118 _PyUnicode_Ready(op))) 119 120#define _PyUnicode_SHARE_UTF8(op) \ 121 (assert(_PyUnicode_CHECK(op)), \ 122 assert(!PyUnicode_IS_COMPACT_ASCII(op)), \ 123 (_PyUnicode_UTF8(op) == PyUnicode_DATA(op))) 124#define _PyUnicode_SHARE_WSTR(op) \ 125 (assert(_PyUnicode_CHECK(op)), \ 126 (_PyUnicode_WSTR(unicode) == PyUnicode_DATA(op))) 127 128/* true if the Unicode object has an allocated UTF-8 memory block 129 (not shared with other data) */ 130#define _PyUnicode_HAS_UTF8_MEMORY(op) \ 131 (assert(_PyUnicode_CHECK(op)), \ 132 (!PyUnicode_IS_COMPACT_ASCII(op) \ 133 && _PyUnicode_UTF8(op) \ 134 && _PyUnicode_UTF8(op) != PyUnicode_DATA(op))) 135 136/* true if the Unicode object has an allocated wstr memory block 137 (not shared with other data) */ 138#define _PyUnicode_HAS_WSTR_MEMORY(op) \ 139 (assert(_PyUnicode_CHECK(op)), \ 140 (_PyUnicode_WSTR(op) && \ 141 (!PyUnicode_IS_READY(op) || \ 142 _PyUnicode_WSTR(op) != PyUnicode_DATA(op)))) 143 144/* Generic helper macro to convert characters of different types. 
145 from_type and to_type have to be valid type names, begin and end 146 are pointers to the source characters which should be of type 147 "from_type *". to is a pointer of type "to_type *" and points to the 148 buffer where the result characters are written to. */ 149#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \ 150 do { \ 151 to_type *_to = (to_type *) to; \ 152 const from_type *_iter = (begin); \ 153 const from_type *_end = (end); \ 154 Py_ssize_t n = (_end) - (_iter); \ 155 const from_type *_unrolled_end = \ 156 _iter + (n & ~ (Py_ssize_t) 3); \ 157 while (_iter < (_unrolled_end)) { \ 158 _to[0] = (to_type) _iter[0]; \ 159 _to[1] = (to_type) _iter[1]; \ 160 _to[2] = (to_type) _iter[2]; \ 161 _to[3] = (to_type) _iter[3]; \ 162 _iter += 4; _to += 4; \ 163 } \ 164 while (_iter < (_end)) \ 165 *_to++ = (to_type) *_iter++; \ 166 } while (0) 167 168/* The Unicode string has been modified: reset the hash */ 169#define _PyUnicode_DIRTY(op) do { _PyUnicode_HASH(op) = -1; } while (0) 170 171/* This dictionary holds all interned unicode strings. Note that references 172 to strings in this dictionary are *not* counted in the string's ob_refcnt. 173 When the interned string reaches a refcnt of 0 the string deallocation 174 function will delete the reference from this dictionary. 175 176 Another way to look at this is that to say that the actual reference 177 count of a string is: s->ob_refcnt + (s->state ? 2 : 0) 178*/ 179static PyObject *interned; 180 181/* The empty Unicode object is shared to improve performance. */ 182static PyObject *unicode_empty; 183 184/* List of static strings. */ 185static _Py_Identifier *static_strings; 186 187/* Single character Unicode strings in the Latin-1 range are being 188 shared as well. 
*/ 189static PyObject *unicode_latin1[256]; 190 191/* Fast detection of the most frequent whitespace characters */ 192const unsigned char _Py_ascii_whitespace[] = { 193 0, 0, 0, 0, 0, 0, 0, 0, 194/* case 0x0009: * CHARACTER TABULATION */ 195/* case 0x000A: * LINE FEED */ 196/* case 0x000B: * LINE TABULATION */ 197/* case 0x000C: * FORM FEED */ 198/* case 0x000D: * CARRIAGE RETURN */ 199 0, 1, 1, 1, 1, 1, 0, 0, 200 0, 0, 0, 0, 0, 0, 0, 0, 201/* case 0x001C: * FILE SEPARATOR */ 202/* case 0x001D: * GROUP SEPARATOR */ 203/* case 0x001E: * RECORD SEPARATOR */ 204/* case 0x001F: * UNIT SEPARATOR */ 205 0, 0, 0, 0, 1, 1, 1, 1, 206/* case 0x0020: * SPACE */ 207 1, 0, 0, 0, 0, 0, 0, 0, 208 0, 0, 0, 0, 0, 0, 0, 0, 209 0, 0, 0, 0, 0, 0, 0, 0, 210 0, 0, 0, 0, 0, 0, 0, 0, 211 212 0, 0, 0, 0, 0, 0, 0, 0, 213 0, 0, 0, 0, 0, 0, 0, 0, 214 0, 0, 0, 0, 0, 0, 0, 0, 215 0, 0, 0, 0, 0, 0, 0, 0, 216 0, 0, 0, 0, 0, 0, 0, 0, 217 0, 0, 0, 0, 0, 0, 0, 0, 218 0, 0, 0, 0, 0, 0, 0, 0, 219 0, 0, 0, 0, 0, 0, 0, 0 220}; 221 222/* forward */ 223static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length); 224static PyObject* get_latin1_char(unsigned char ch); 225static void copy_characters( 226 PyObject *to, Py_ssize_t to_start, 227 PyObject *from, Py_ssize_t from_start, 228 Py_ssize_t how_many); 229 230static PyObject * 231unicode_fromascii(const unsigned char *s, Py_ssize_t size); 232static PyObject * 233_PyUnicode_FromUCS1(const unsigned char *s, Py_ssize_t size); 234static PyObject * 235_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size); 236static PyObject * 237_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size); 238 239static PyObject * 240unicode_encode_call_errorhandler(const char *errors, 241 PyObject **errorHandler,const char *encoding, const char *reason, 242 PyObject *unicode, PyObject **exceptionObject, 243 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos); 244 245static void 246raise_encode_exception(PyObject **exceptionObject, 247 const char *encoding, 248 PyObject 
*unicode, 249 Py_ssize_t startpos, Py_ssize_t endpos, 250 const char *reason); 251 252/* Same for linebreaks */ 253static unsigned char ascii_linebreak[] = { 254 0, 0, 0, 0, 0, 0, 0, 0, 255/* 0x000A, * LINE FEED */ 256/* 0x000B, * LINE TABULATION */ 257/* 0x000C, * FORM FEED */ 258/* 0x000D, * CARRIAGE RETURN */ 259 0, 0, 1, 1, 1, 1, 0, 0, 260 0, 0, 0, 0, 0, 0, 0, 0, 261/* 0x001C, * FILE SEPARATOR */ 262/* 0x001D, * GROUP SEPARATOR */ 263/* 0x001E, * RECORD SEPARATOR */ 264 0, 0, 0, 0, 1, 1, 1, 0, 265 0, 0, 0, 0, 0, 0, 0, 0, 266 0, 0, 0, 0, 0, 0, 0, 0, 267 0, 0, 0, 0, 0, 0, 0, 0, 268 0, 0, 0, 0, 0, 0, 0, 0, 269 270 0, 0, 0, 0, 0, 0, 0, 0, 271 0, 0, 0, 0, 0, 0, 0, 0, 272 0, 0, 0, 0, 0, 0, 0, 0, 273 0, 0, 0, 0, 0, 0, 0, 0, 274 0, 0, 0, 0, 0, 0, 0, 0, 275 0, 0, 0, 0, 0, 0, 0, 0, 276 0, 0, 0, 0, 0, 0, 0, 0, 277 0, 0, 0, 0, 0, 0, 0, 0 278}; 279 280/* The max unicode value is always 0x10FFFF while using the PEP-393 API. 281 This function is kept for backward compatibility with the old API. */ 282Py_UNICODE 283PyUnicode_GetMax(void) 284{ 285#ifdef Py_UNICODE_WIDE 286 return 0x10FFFF; 287#else 288 /* This is actually an illegal character, so it should 289 not be passed to unichr. 
*/ 290 return 0xFFFF; 291#endif 292} 293 294#ifdef Py_DEBUG 295int 296_PyUnicode_CheckConsistency(PyObject *op, int check_content) 297{ 298 PyASCIIObject *ascii; 299 unsigned int kind; 300 301 assert(PyUnicode_Check(op)); 302 303 ascii = (PyASCIIObject *)op; 304 kind = ascii->state.kind; 305 306 if (ascii->state.ascii == 1 && ascii->state.compact == 1) { 307 assert(kind == PyUnicode_1BYTE_KIND); 308 assert(ascii->state.ready == 1); 309 } 310 else { 311 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op; 312 void *data; 313 314 if (ascii->state.compact == 1) { 315 data = compact + 1; 316 assert(kind == PyUnicode_1BYTE_KIND 317 || kind == PyUnicode_2BYTE_KIND 318 || kind == PyUnicode_4BYTE_KIND); 319 assert(ascii->state.ascii == 0); 320 assert(ascii->state.ready == 1); 321 assert (compact->utf8 != data); 322 } 323 else { 324 PyUnicodeObject *unicode = (PyUnicodeObject *)op; 325 326 data = unicode->data.any; 327 if (kind == PyUnicode_WCHAR_KIND) { 328 assert(ascii->length == 0); 329 assert(ascii->hash == -1); 330 assert(ascii->state.compact == 0); 331 assert(ascii->state.ascii == 0); 332 assert(ascii->state.ready == 0); 333 assert(ascii->state.interned == SSTATE_NOT_INTERNED); 334 assert(ascii->wstr != NULL); 335 assert(data == NULL); 336 assert(compact->utf8 == NULL); 337 } 338 else { 339 assert(kind == PyUnicode_1BYTE_KIND 340 || kind == PyUnicode_2BYTE_KIND 341 || kind == PyUnicode_4BYTE_KIND); 342 assert(ascii->state.compact == 0); 343 assert(ascii->state.ready == 1); 344 assert(data != NULL); 345 if (ascii->state.ascii) { 346 assert (compact->utf8 == data); 347 assert (compact->utf8_length == ascii->length); 348 } 349 else 350 assert (compact->utf8 != data); 351 } 352 } 353 if (kind != PyUnicode_WCHAR_KIND) { 354 if ( 355#if SIZEOF_WCHAR_T == 2 356 kind == PyUnicode_2BYTE_KIND 357#else 358 kind == PyUnicode_4BYTE_KIND 359#endif 360 ) 361 { 362 assert(ascii->wstr == data); 363 assert(compact->wstr_length == ascii->length); 364 } else 365 
assert(ascii->wstr != data); 366 } 367 368 if (compact->utf8 == NULL) 369 assert(compact->utf8_length == 0); 370 if (ascii->wstr == NULL) 371 assert(compact->wstr_length == 0); 372 } 373 /* check that the best kind is used */ 374 if (check_content && kind != PyUnicode_WCHAR_KIND) 375 { 376 Py_ssize_t i; 377 Py_UCS4 maxchar = 0; 378 void *data = PyUnicode_DATA(ascii); 379 for (i=0; i < ascii->length; i++) 380 { 381 Py_UCS4 ch = PyUnicode_READ(kind, data, i); 382 if (ch > maxchar) 383 maxchar = ch; 384 } 385 if (kind == PyUnicode_1BYTE_KIND) { 386 if (ascii->state.ascii == 0) { 387 assert(maxchar >= 128); 388 assert(maxchar <= 255); 389 } 390 else 391 assert(maxchar < 128); 392 } 393 else if (kind == PyUnicode_2BYTE_KIND) { 394 assert(maxchar >= 0x100); 395 assert(maxchar <= 0xFFFF); 396 } 397 else { 398 assert(maxchar >= 0x10000); 399 assert(maxchar <= MAX_UNICODE); 400 } 401 } 402 return 1; 403} 404#endif 405 406static PyObject* 407unicode_result_wchar(PyObject *unicode) 408{ 409#ifndef Py_DEBUG 410 Py_ssize_t len; 411 412 assert(Py_REFCNT(unicode) == 1); 413 414 len = _PyUnicode_WSTR_LENGTH(unicode); 415 if (len == 0) { 416 Py_INCREF(unicode_empty); 417 Py_DECREF(unicode); 418 return unicode_empty; 419 } 420 421 if (len == 1) { 422 wchar_t ch = _PyUnicode_WSTR(unicode)[0]; 423 if (ch < 256) { 424 PyObject *latin1_char = get_latin1_char((unsigned char)ch); 425 Py_DECREF(unicode); 426 return latin1_char; 427 } 428 } 429 430 if (_PyUnicode_Ready(unicode) < 0) { 431 Py_XDECREF(unicode); 432 return NULL; 433 } 434#else 435 /* don't make the result ready in debug mode to ensure that the caller 436 makes the string ready before using it */ 437 assert(_PyUnicode_CheckConsistency(unicode, 1)); 438#endif 439 return unicode; 440} 441 442static PyObject* 443unicode_result_ready(PyObject *unicode) 444{ 445 Py_ssize_t length; 446 447 length = PyUnicode_GET_LENGTH(unicode); 448 if (length == 0) { 449 if (unicode != unicode_empty) { 450 Py_INCREF(unicode_empty); 451 
Py_DECREF(unicode); 452 } 453 return unicode_empty; 454 } 455 456 if (length == 1) { 457 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0); 458 if (ch < 256) { 459 PyObject *latin1_char = unicode_latin1[ch]; 460 if (latin1_char != NULL) { 461 if (unicode != latin1_char) { 462 Py_INCREF(latin1_char); 463 Py_DECREF(unicode); 464 } 465 return latin1_char; 466 } 467 else { 468 assert(_PyUnicode_CheckConsistency(unicode, 1)); 469 Py_INCREF(unicode); 470 unicode_latin1[ch] = unicode; 471 return unicode; 472 } 473 } 474 } 475 476 assert(_PyUnicode_CheckConsistency(unicode, 1)); 477 return unicode; 478} 479 480static PyObject* 481unicode_result(PyObject *unicode) 482{ 483 assert(_PyUnicode_CHECK(unicode)); 484 if (PyUnicode_IS_READY(unicode)) 485 return unicode_result_ready(unicode); 486 else 487 return unicode_result_wchar(unicode); 488} 489 490#ifdef HAVE_MBCS 491static OSVERSIONINFOEX winver; 492#endif 493 494/* --- Bloom Filters ----------------------------------------------------- */ 495 496/* stuff to implement simple "bloom filters" for Unicode characters. 497 to keep things simple, we use a single bitmask, using the least 5 498 bits from each unicode characters as the bit index. */ 499 500/* the linebreak mask is set up by Unicode_Init below */ 501 502#if LONG_BIT >= 128 503#define BLOOM_WIDTH 128 504#elif LONG_BIT >= 64 505#define BLOOM_WIDTH 64 506#elif LONG_BIT >= 32 507#define BLOOM_WIDTH 32 508#else 509#error "LONG_BIT is smaller than 32" 510#endif 511 512#define BLOOM_MASK unsigned long 513 514static BLOOM_MASK bloom_linebreak; 515 516#define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1))))) 517#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1))))) 518 519#define BLOOM_LINEBREAK(ch) \ 520 ((ch) < 128U ? 
ascii_linebreak[(ch)] : \ 521 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch))) 522 523Py_LOCAL_INLINE(BLOOM_MASK) 524make_bloom_mask(int kind, void* ptr, Py_ssize_t len) 525{ 526 /* calculate simple bloom-style bitmask for a given unicode string */ 527 528 BLOOM_MASK mask; 529 Py_ssize_t i; 530 531 mask = 0; 532 for (i = 0; i < len; i++) 533 BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i)); 534 535 return mask; 536} 537 538#define BLOOM_MEMBER(mask, chr, str) \ 539 (BLOOM(mask, chr) \ 540 && (PyUnicode_FindChar(str, chr, 0, PyUnicode_GET_LENGTH(str), 1) >= 0)) 541 542/* Compilation of templated routines */ 543 544#include "stringlib/asciilib.h" 545#include "stringlib/fastsearch.h" 546#include "stringlib/partition.h" 547#include "stringlib/split.h" 548#include "stringlib/count.h" 549#include "stringlib/find.h" 550#include "stringlib/find_max_char.h" 551#include "stringlib/localeutil.h" 552#include "stringlib/undef.h" 553 554#include "stringlib/ucs1lib.h" 555#include "stringlib/fastsearch.h" 556#include "stringlib/partition.h" 557#include "stringlib/split.h" 558#include "stringlib/count.h" 559#include "stringlib/find.h" 560#include "stringlib/find_max_char.h" 561#include "stringlib/localeutil.h" 562#include "stringlib/undef.h" 563 564#include "stringlib/ucs2lib.h" 565#include "stringlib/fastsearch.h" 566#include "stringlib/partition.h" 567#include "stringlib/split.h" 568#include "stringlib/count.h" 569#include "stringlib/find.h" 570#include "stringlib/find_max_char.h" 571#include "stringlib/localeutil.h" 572#include "stringlib/undef.h" 573 574#include "stringlib/ucs4lib.h" 575#include "stringlib/fastsearch.h" 576#include "stringlib/partition.h" 577#include "stringlib/split.h" 578#include "stringlib/count.h" 579#include "stringlib/find.h" 580#include "stringlib/find_max_char.h" 581#include "stringlib/localeutil.h" 582#include "stringlib/undef.h" 583 584#include "stringlib/unicodedefs.h" 585#include "stringlib/fastsearch.h" 586#include "stringlib/count.h" 
587#include "stringlib/find.h" 588#include "stringlib/undef.h" 589 590/* --- Unicode Object ----------------------------------------------------- */ 591 592static PyObject * 593fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s)); 594 595Py_LOCAL_INLINE(Py_ssize_t) findchar(void *s, int kind, 596 Py_ssize_t size, Py_UCS4 ch, 597 int direction) 598{ 599 int mode = (direction == 1) ? FAST_SEARCH : FAST_RSEARCH; 600 601 switch (kind) { 602 case PyUnicode_1BYTE_KIND: 603 { 604 Py_UCS1 ch1 = (Py_UCS1) ch; 605 if (ch1 == ch) 606 return ucs1lib_fastsearch((Py_UCS1 *) s, size, &ch1, 1, 0, mode); 607 else 608 return -1; 609 } 610 case PyUnicode_2BYTE_KIND: 611 { 612 Py_UCS2 ch2 = (Py_UCS2) ch; 613 if (ch2 == ch) 614 return ucs2lib_fastsearch((Py_UCS2 *) s, size, &ch2, 1, 0, mode); 615 else 616 return -1; 617 } 618 case PyUnicode_4BYTE_KIND: 619 return ucs4lib_fastsearch((Py_UCS4 *) s, size, &ch, 1, 0, mode); 620 default: 621 assert(0); 622 return -1; 623 } 624} 625 626static PyObject* 627resize_compact(PyObject *unicode, Py_ssize_t length) 628{ 629 Py_ssize_t char_size; 630 Py_ssize_t struct_size; 631 Py_ssize_t new_size; 632 int share_wstr; 633 PyObject *new_unicode; 634 635 assert(PyUnicode_IS_READY(unicode)); 636 char_size = PyUnicode_KIND(unicode); 637 if (PyUnicode_IS_COMPACT_ASCII(unicode)) 638 struct_size = sizeof(PyASCIIObject); 639 else 640 struct_size = sizeof(PyCompactUnicodeObject); 641 share_wstr = _PyUnicode_SHARE_WSTR(unicode); 642 643 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) { 644 Py_DECREF(unicode); 645 PyErr_NoMemory(); 646 return NULL; 647 } 648 new_size = (struct_size + (length + 1) * char_size); 649 650 _Py_DEC_REFTOTAL; 651 _Py_ForgetReference(unicode); 652 653 new_unicode = (PyObject *)PyObject_REALLOC((char *)unicode, new_size); 654 if (new_unicode == NULL) { 655 PyObject_Del(unicode); 656 PyErr_NoMemory(); 657 return NULL; 658 } 659 unicode = new_unicode; 660 _Py_NewReference(unicode); 661 662 _PyUnicode_LENGTH(unicode) = 
length; 663 if (share_wstr) { 664 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode); 665 if (!PyUnicode_IS_COMPACT_ASCII(unicode)) 666 _PyUnicode_WSTR_LENGTH(unicode) = length; 667 } 668 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode), 669 length, 0); 670 return unicode; 671} 672 673static int 674resize_inplace(PyObject *unicode, Py_ssize_t length) 675{ 676 wchar_t *wstr; 677 assert(!PyUnicode_IS_COMPACT(unicode)); 678 assert(Py_REFCNT(unicode) == 1); 679 680 _PyUnicode_DIRTY(unicode); 681 682 if (PyUnicode_IS_READY(unicode)) { 683 Py_ssize_t char_size; 684 Py_ssize_t new_size; 685 int share_wstr, share_utf8; 686 void *data; 687 688 data = _PyUnicode_DATA_ANY(unicode); 689 assert(data != NULL); 690 char_size = PyUnicode_KIND(unicode); 691 share_wstr = _PyUnicode_SHARE_WSTR(unicode); 692 share_utf8 = _PyUnicode_SHARE_UTF8(unicode); 693 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode)) 694 { 695 PyObject_DEL(_PyUnicode_UTF8(unicode)); 696 _PyUnicode_UTF8(unicode) = NULL; 697 _PyUnicode_UTF8_LENGTH(unicode) = 0; 698 } 699 700 if (length > (PY_SSIZE_T_MAX / char_size - 1)) { 701 PyErr_NoMemory(); 702 return -1; 703 } 704 new_size = (length + 1) * char_size; 705 706 data = (PyObject *)PyObject_REALLOC(data, new_size); 707 if (data == NULL) { 708 PyErr_NoMemory(); 709 return -1; 710 } 711 _PyUnicode_DATA_ANY(unicode) = data; 712 if (share_wstr) { 713 _PyUnicode_WSTR(unicode) = data; 714 _PyUnicode_WSTR_LENGTH(unicode) = length; 715 } 716 if (share_utf8) { 717 _PyUnicode_UTF8(unicode) = data; 718 _PyUnicode_UTF8_LENGTH(unicode) = length; 719 } 720 _PyUnicode_LENGTH(unicode) = length; 721 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0); 722 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) { 723 assert(_PyUnicode_CheckConsistency(unicode, 0)); 724 return 0; 725 } 726 } 727 assert(_PyUnicode_WSTR(unicode) != NULL); 728 729 /* check for integer overflow */ 730 if (length > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) { 731 PyErr_NoMemory(); 732 
return -1; 733 } 734 wstr = _PyUnicode_WSTR(unicode); 735 wstr = PyObject_REALLOC(wstr, sizeof(wchar_t) * (length + 1)); 736 if (!wstr) { 737 PyErr_NoMemory(); 738 return -1; 739 } 740 _PyUnicode_WSTR(unicode) = wstr; 741 _PyUnicode_WSTR(unicode)[length] = 0; 742 _PyUnicode_WSTR_LENGTH(unicode) = length; 743 assert(_PyUnicode_CheckConsistency(unicode, 0)); 744 return 0; 745} 746 747static PyObject* 748resize_copy(PyObject *unicode, Py_ssize_t length) 749{ 750 Py_ssize_t copy_length; 751 if (PyUnicode_IS_COMPACT(unicode)) { 752 PyObject *copy; 753 assert(PyUnicode_IS_READY(unicode)); 754 755 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode)); 756 if (copy == NULL) 757 return NULL; 758 759 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode)); 760 copy_characters(copy, 0, unicode, 0, copy_length); 761 return copy; 762 } 763 else { 764 PyObject *w; 765 assert(_PyUnicode_WSTR(unicode) != NULL); 766 assert(_PyUnicode_DATA_ANY(unicode) == NULL); 767 w = (PyObject*)_PyUnicode_New(length); 768 if (w == NULL) 769 return NULL; 770 copy_length = _PyUnicode_WSTR_LENGTH(unicode); 771 copy_length = Py_MIN(copy_length, length); 772 Py_UNICODE_COPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode), 773 copy_length); 774 return w; 775 } 776} 777 778/* We allocate one more byte to make sure the string is 779 Ux0000 terminated; some code (e.g. new_identifier) 780 relies on that. 781 782 XXX This allocator could further be enhanced by assuring that the 783 free list never reduces its size below 1. 784 785*/ 786 787#ifdef Py_DEBUG 788static int unicode_old_new_calls = 0; 789#endif 790 791static PyUnicodeObject * 792_PyUnicode_New(Py_ssize_t length) 793{ 794 register PyUnicodeObject *unicode; 795 size_t new_size; 796 797 /* Optimization for empty strings */ 798 if (length == 0 && unicode_empty != NULL) { 799 Py_INCREF(unicode_empty); 800 return (PyUnicodeObject*)unicode_empty; 801 } 802 803 /* Ensure we won't overflow the size. 
*/ 804 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) { 805 return (PyUnicodeObject *)PyErr_NoMemory(); 806 } 807 if (length < 0) { 808 PyErr_SetString(PyExc_SystemError, 809 "Negative size passed to _PyUnicode_New"); 810 return NULL; 811 } 812 813#ifdef Py_DEBUG 814 ++unicode_old_new_calls; 815#endif 816 817 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type); 818 if (unicode == NULL) 819 return NULL; 820 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1); 821 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size); 822 if (!_PyUnicode_WSTR(unicode)) { 823 PyErr_NoMemory(); 824 goto onError; 825 } 826 827 /* Initialize the first element to guard against cases where 828 * the caller fails before initializing str -- unicode_resize() 829 * reads str[0], and the Keep-Alive optimization can keep memory 830 * allocated for str alive across a call to unicode_dealloc(unicode). 831 * We don't want unicode_resize to read uninitialized memory in 832 * that case. 833 */ 834 _PyUnicode_WSTR(unicode)[0] = 0; 835 _PyUnicode_WSTR(unicode)[length] = 0; 836 _PyUnicode_WSTR_LENGTH(unicode) = length; 837 _PyUnicode_HASH(unicode) = -1; 838 _PyUnicode_STATE(unicode).interned = 0; 839 _PyUnicode_STATE(unicode).kind = 0; 840 _PyUnicode_STATE(unicode).compact = 0; 841 _PyUnicode_STATE(unicode).ready = 0; 842 _PyUnicode_STATE(unicode).ascii = 0; 843 _PyUnicode_DATA_ANY(unicode) = NULL; 844 _PyUnicode_LENGTH(unicode) = 0; 845 _PyUnicode_UTF8(unicode) = NULL; 846 _PyUnicode_UTF8_LENGTH(unicode) = 0; 847 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0)); 848 return unicode; 849 850 onError: 851 /* XXX UNREF/NEWREF interface should be more symmetrical */ 852 _Py_DEC_REFTOTAL; 853 _Py_ForgetReference((PyObject *)unicode); 854 PyObject_Del(unicode); 855 return NULL; 856} 857 858static const char* 859unicode_kind_name(PyObject *unicode) 860{ 861 /* don't check consistency: unicode_kind_name() is called from 862 _PyUnicode_Dump() */ 863 if 
(!PyUnicode_IS_COMPACT(unicode)) 864 { 865 if (!PyUnicode_IS_READY(unicode)) 866 return "wstr"; 867 switch(PyUnicode_KIND(unicode)) 868 { 869 case PyUnicode_1BYTE_KIND: 870 if (PyUnicode_IS_ASCII(unicode)) 871 return "legacy ascii"; 872 else 873 return "legacy latin1"; 874 case PyUnicode_2BYTE_KIND: 875 return "legacy UCS2"; 876 case PyUnicode_4BYTE_KIND: 877 return "legacy UCS4"; 878 default: 879 return "<legacy invalid kind>"; 880 } 881 } 882 assert(PyUnicode_IS_READY(unicode)); 883 switch(PyUnicode_KIND(unicode)) 884 { 885 case PyUnicode_1BYTE_KIND: 886 if (PyUnicode_IS_ASCII(unicode)) 887 return "ascii"; 888 else 889 return "latin1"; 890 case PyUnicode_2BYTE_KIND: 891 return "UCS2"; 892 case PyUnicode_4BYTE_KIND: 893 return "UCS4"; 894 default: 895 return "<invalid compact kind>"; 896 } 897} 898 899#ifdef Py_DEBUG 900static int unicode_new_new_calls = 0; 901 902/* Functions wrapping macros for use in debugger */ 903char *_PyUnicode_utf8(void *unicode){ 904 return PyUnicode_UTF8(unicode); 905} 906 907void *_PyUnicode_compact_data(void *unicode) { 908 return _PyUnicode_COMPACT_DATA(unicode); 909} 910void *_PyUnicode_data(void *unicode){ 911 printf("obj %p\n", unicode); 912 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode)); 913 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode)); 914 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1))); 915 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1))); 916 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode)); 917 return PyUnicode_DATA(unicode); 918} 919 920void 921_PyUnicode_Dump(PyObject *op) 922{ 923 PyASCIIObject *ascii = (PyASCIIObject *)op; 924 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op; 925 PyUnicodeObject *unicode = (PyUnicodeObject *)op; 926 void *data; 927 928 if (ascii->state.compact) 929 { 930 if (ascii->state.ascii) 931 data = (ascii + 1); 932 else 933 data = (compact + 1); 934 } 935 else 936 data = unicode->data.any; 937 
printf("%s: len=%zu, ",unicode_kind_name(op), ascii->length); 938 939 if (ascii->wstr == data) 940 printf("shared "); 941 printf("wstr=%p", ascii->wstr); 942 943 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) { 944 printf(" (%zu), ", compact->wstr_length); 945 if (!ascii->state.compact && compact->utf8 == unicode->data.any) 946 printf("shared "); 947 printf("utf8=%p (%zu)", compact->utf8, compact->utf8_length); 948 } 949 printf(", data=%p\n", data); 950} 951#endif 952 953PyObject * 954PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar) 955{ 956 PyObject *obj; 957 PyCompactUnicodeObject *unicode; 958 void *data; 959 int kind_state; 960 int is_sharing, is_ascii; 961 Py_ssize_t char_size; 962 Py_ssize_t struct_size; 963 964 /* Optimization for empty strings */ 965 if (size == 0 && unicode_empty != NULL) { 966 Py_INCREF(unicode_empty); 967 return unicode_empty; 968 } 969 970#ifdef Py_DEBUG 971 ++unicode_new_new_calls; 972#endif 973 974 is_ascii = 0; 975 is_sharing = 0; 976 struct_size = sizeof(PyCompactUnicodeObject); 977 if (maxchar < 128) { 978 kind_state = PyUnicode_1BYTE_KIND; 979 char_size = 1; 980 is_ascii = 1; 981 struct_size = sizeof(PyASCIIObject); 982 } 983 else if (maxchar < 256) { 984 kind_state = PyUnicode_1BYTE_KIND; 985 char_size = 1; 986 } 987 else if (maxchar < 65536) { 988 kind_state = PyUnicode_2BYTE_KIND; 989 char_size = 2; 990 if (sizeof(wchar_t) == 2) 991 is_sharing = 1; 992 } 993 else { 994 kind_state = PyUnicode_4BYTE_KIND; 995 char_size = 4; 996 if (sizeof(wchar_t) == 4) 997 is_sharing = 1; 998 } 999 1000 /* Ensure we won't overflow the size. 
*/ 1001 if (size < 0) { 1002 PyErr_SetString(PyExc_SystemError, 1003 "Negative size passed to PyUnicode_New"); 1004 return NULL; 1005 } 1006 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) 1007 return PyErr_NoMemory(); 1008 1009 /* Duplicated allocation code from _PyObject_New() instead of a call to 1010 * PyObject_New() so we are able to allocate space for the object and 1011 * it's data buffer. 1012 */ 1013 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size); 1014 if (obj == NULL) 1015 return PyErr_NoMemory(); 1016 obj = PyObject_INIT(obj, &PyUnicode_Type); 1017 if (obj == NULL) 1018 return NULL; 1019 1020 unicode = (PyCompactUnicodeObject *)obj; 1021 if (is_ascii) 1022 data = ((PyASCIIObject*)obj) + 1; 1023 else 1024 data = unicode + 1; 1025 _PyUnicode_LENGTH(unicode) = size; 1026 _PyUnicode_HASH(unicode) = -1; 1027 _PyUnicode_STATE(unicode).interned = 0; 1028 _PyUnicode_STATE(unicode).kind = kind_state; 1029 _PyUnicode_STATE(unicode).compact = 1; 1030 _PyUnicode_STATE(unicode).ready = 1; 1031 _PyUnicode_STATE(unicode).ascii = is_ascii; 1032 if (is_ascii) { 1033 ((char*)data)[size] = 0; 1034 _PyUnicode_WSTR(unicode) = NULL; 1035 } 1036 else if (kind_state == PyUnicode_1BYTE_KIND) { 1037 ((char*)data)[size] = 0; 1038 _PyUnicode_WSTR(unicode) = NULL; 1039 _PyUnicode_WSTR_LENGTH(unicode) = 0; 1040 unicode->utf8 = NULL; 1041 unicode->utf8_length = 0; 1042 } 1043 else { 1044 unicode->utf8 = NULL; 1045 unicode->utf8_length = 0; 1046 if (kind_state == PyUnicode_2BYTE_KIND) 1047 ((Py_UCS2*)data)[size] = 0; 1048 else /* kind_state == PyUnicode_4BYTE_KIND */ 1049 ((Py_UCS4*)data)[size] = 0; 1050 if (is_sharing) { 1051 _PyUnicode_WSTR_LENGTH(unicode) = size; 1052 _PyUnicode_WSTR(unicode) = (wchar_t *)data; 1053 } 1054 else { 1055 _PyUnicode_WSTR_LENGTH(unicode) = 0; 1056 _PyUnicode_WSTR(unicode) = NULL; 1057 } 1058 } 1059 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0)); 1060 return obj; 1061} 1062 1063#if SIZEOF_WCHAR_T == 2 
1064/* Helper function to convert a 16-bits wchar_t representation to UCS4, this 1065 will decode surrogate pairs, the other conversions are implemented as macros 1066 for efficiency. 1067 1068 This function assumes that unicode can hold one more code point than wstr 1069 characters for a terminating null character. */ 1070static void 1071unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end, 1072 PyObject *unicode) 1073{ 1074 const wchar_t *iter; 1075 Py_UCS4 *ucs4_out; 1076 1077 assert(unicode != NULL); 1078 assert(_PyUnicode_CHECK(unicode)); 1079 assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND); 1080 ucs4_out = PyUnicode_4BYTE_DATA(unicode); 1081 1082 for (iter = begin; iter < end; ) { 1083 assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) + 1084 _PyUnicode_GET_LENGTH(unicode))); 1085 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0]) 1086 && (iter+1) < end 1087 && Py_UNICODE_IS_LOW_SURROGATE(iter[1])) 1088 { 1089 *ucs4_out++ = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]); 1090 iter += 2; 1091 } 1092 else { 1093 *ucs4_out++ = *iter; 1094 iter++; 1095 } 1096 } 1097 assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) + 1098 _PyUnicode_GET_LENGTH(unicode))); 1099 1100} 1101#endif 1102 1103static int 1104_PyUnicode_Dirty(PyObject *unicode) 1105{ 1106 assert(_PyUnicode_CHECK(unicode)); 1107 if (Py_REFCNT(unicode) != 1) { 1108 PyErr_SetString(PyExc_SystemError, 1109 "Cannot modify a string having more than 1 reference"); 1110 return -1; 1111 } 1112 _PyUnicode_DIRTY(unicode); 1113 return 0; 1114} 1115 1116static int 1117_copy_characters(PyObject *to, Py_ssize_t to_start, 1118 PyObject *from, Py_ssize_t from_start, 1119 Py_ssize_t how_many, int check_maxchar) 1120{ 1121 unsigned int from_kind, to_kind; 1122 void *from_data, *to_data; 1123 int fast; 1124 1125 assert(PyUnicode_Check(from)); 1126 assert(PyUnicode_Check(to)); 1127 assert(PyUnicode_IS_READY(from)); 1128 assert(PyUnicode_IS_READY(to)); 1129 1130 assert(PyUnicode_GET_LENGTH(from) >= how_many); 
1131 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to)); 1132 assert(0 <= how_many); 1133 1134 if (how_many == 0) 1135 return 0; 1136 1137 from_kind = PyUnicode_KIND(from); 1138 from_data = PyUnicode_DATA(from); 1139 to_kind = PyUnicode_KIND(to); 1140 to_data = PyUnicode_DATA(to); 1141 1142#ifdef Py_DEBUG 1143 if (!check_maxchar 1144 && (from_kind > to_kind 1145 || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))) 1146 { 1147 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to); 1148 Py_UCS4 ch; 1149 Py_ssize_t i; 1150 for (i=0; i < how_many; i++) { 1151 ch = PyUnicode_READ(from_kind, from_data, from_start + i); 1152 assert(ch <= to_maxchar); 1153 } 1154 } 1155#endif 1156 fast = (from_kind == to_kind); 1157 if (check_maxchar 1158 && (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))) 1159 { 1160 /* deny latin1 => ascii */ 1161 fast = 0; 1162 } 1163 1164 if (fast) { 1165 Py_MEMCPY((char*)to_data + to_kind * to_start, 1166 (char*)from_data + from_kind * from_start, 1167 to_kind * how_many); 1168 } 1169 else if (from_kind == PyUnicode_1BYTE_KIND 1170 && to_kind == PyUnicode_2BYTE_KIND) 1171 { 1172 _PyUnicode_CONVERT_BYTES( 1173 Py_UCS1, Py_UCS2, 1174 PyUnicode_1BYTE_DATA(from) + from_start, 1175 PyUnicode_1BYTE_DATA(from) + from_start + how_many, 1176 PyUnicode_2BYTE_DATA(to) + to_start 1177 ); 1178 } 1179 else if (from_kind == PyUnicode_1BYTE_KIND 1180 && to_kind == PyUnicode_4BYTE_KIND) 1181 { 1182 _PyUnicode_CONVERT_BYTES( 1183 Py_UCS1, Py_UCS4, 1184 PyUnicode_1BYTE_DATA(from) + from_start, 1185 PyUnicode_1BYTE_DATA(from) + from_start + how_many, 1186 PyUnicode_4BYTE_DATA(to) + to_start 1187 ); 1188 } 1189 else if (from_kind == PyUnicode_2BYTE_KIND 1190 && to_kind == PyUnicode_4BYTE_KIND) 1191 { 1192 _PyUnicode_CONVERT_BYTES( 1193 Py_UCS2, Py_UCS4, 1194 PyUnicode_2BYTE_DATA(from) + from_start, 1195 PyUnicode_2BYTE_DATA(from) + from_start + how_many, 1196 PyUnicode_4BYTE_DATA(to) + to_start 1197 ); 1198 } 1199 else { 1200 /* check if max_char(from 
substring) <= max_char(to) */ 1201 if (from_kind > to_kind 1202 /* latin1 => ascii */ 1203 || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))) 1204 { 1205 /* slow path to check for character overflow */ 1206 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to); 1207 Py_UCS4 ch; 1208 Py_ssize_t i; 1209 1210#ifdef Py_DEBUG 1211 for (i=0; i < how_many; i++) { 1212 ch = PyUnicode_READ(from_kind, from_data, from_start + i); 1213 assert(ch <= to_maxchar); 1214 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch); 1215 } 1216#else 1217 if (!check_maxchar) { 1218 for (i=0; i < how_many; i++) { 1219 ch = PyUnicode_READ(from_kind, from_data, from_start + i); 1220 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch); 1221 } 1222 } 1223 else { 1224 for (i=0; i < how_many; i++) { 1225 ch = PyUnicode_READ(from_kind, from_data, from_start + i); 1226 if (ch > to_maxchar) 1227 return 1; 1228 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch); 1229 } 1230 } 1231#endif 1232 } 1233 else { 1234 assert(0 && "inconsistent state"); 1235 return 1; 1236 } 1237 } 1238 return 0; 1239} 1240 1241static void 1242copy_characters(PyObject *to, Py_ssize_t to_start, 1243 PyObject *from, Py_ssize_t from_start, 1244 Py_ssize_t how_many) 1245{ 1246 (void)_copy_characters(to, to_start, from, from_start, how_many, 0); 1247} 1248 1249Py_ssize_t 1250PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start, 1251 PyObject *from, Py_ssize_t from_start, 1252 Py_ssize_t how_many) 1253{ 1254 int err; 1255 1256 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) { 1257 PyErr_BadInternalCall(); 1258 return -1; 1259 } 1260 1261 if (PyUnicode_READY(from)) 1262 return -1; 1263 if (PyUnicode_READY(to)) 1264 return -1; 1265 1266 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many); 1267 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) { 1268 PyErr_Format(PyExc_SystemError, 1269 "Cannot write %zi characters at %zi " 1270 "in a string of %zi characters", 1271 how_many, to_start, PyUnicode_GET_LENGTH(to)); 
1272 return -1; 1273 } 1274 1275 if (how_many == 0) 1276 return 0; 1277 1278 if (_PyUnicode_Dirty(to)) 1279 return -1; 1280 1281 err = _copy_characters(to, to_start, from, from_start, how_many, 1); 1282 if (err) { 1283 PyErr_Format(PyExc_SystemError, 1284 "Cannot copy %s characters " 1285 "into a string of %s characters", 1286 unicode_kind_name(from), 1287 unicode_kind_name(to)); 1288 return -1; 1289 } 1290 return how_many; 1291} 1292 1293/* Find the maximum code point and count the number of surrogate pairs so a 1294 correct string length can be computed before converting a string to UCS4. 1295 This function counts single surrogates as a character and not as a pair. 1296 1297 Return 0 on success, or -1 on error. */ 1298static int 1299find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end, 1300 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates) 1301{ 1302 const wchar_t *iter; 1303 Py_UCS4 ch; 1304 1305 assert(num_surrogates != NULL && maxchar != NULL); 1306 *num_surrogates = 0; 1307 *maxchar = 0; 1308 1309 for (iter = begin; iter < end; ) { 1310#if SIZEOF_WCHAR_T == 2 1311 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0]) 1312 && (iter+1) < end 1313 && Py_UNICODE_IS_LOW_SURROGATE(iter[1])) 1314 { 1315 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]); 1316 ++(*num_surrogates); 1317 iter += 2; 1318 } 1319 else 1320#endif 1321 { 1322 ch = *iter; 1323 iter++; 1324 } 1325 if (ch > *maxchar) { 1326 *maxchar = ch; 1327 if (*maxchar > MAX_UNICODE) { 1328 PyErr_Format(PyExc_ValueError, 1329 "character U+%x is not in range [U+0000; U+10ffff]", 1330 ch); 1331 return -1; 1332 } 1333 } 1334 } 1335 return 0; 1336} 1337 1338#ifdef Py_DEBUG 1339static int unicode_ready_calls = 0; 1340#endif 1341 1342int 1343_PyUnicode_Ready(PyObject *unicode) 1344{ 1345 wchar_t *end; 1346 Py_UCS4 maxchar = 0; 1347 Py_ssize_t num_surrogates; 1348#if SIZEOF_WCHAR_T == 2 1349 Py_ssize_t length_wo_surrogates; 1350#endif 1351 1352 /* _PyUnicode_Ready() is only intended for old-style API usage where 1353 
strings were created using _PyObject_New() and where no canonical 1354 representation (the str field) has been set yet aka strings 1355 which are not yet ready. */ 1356 assert(_PyUnicode_CHECK(unicode)); 1357 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND); 1358 assert(_PyUnicode_WSTR(unicode) != NULL); 1359 assert(_PyUnicode_DATA_ANY(unicode) == NULL); 1360 assert(_PyUnicode_UTF8(unicode) == NULL); 1361 /* Actually, it should neither be interned nor be anything else: */ 1362 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED); 1363 1364#ifdef Py_DEBUG 1365 ++unicode_ready_calls; 1366#endif 1367 1368 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode); 1369 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end, 1370 &maxchar, &num_surrogates) == -1) 1371 return -1; 1372 1373 if (maxchar < 256) { 1374 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1); 1375 if (!_PyUnicode_DATA_ANY(unicode)) { 1376 PyErr_NoMemory(); 1377 return -1; 1378 } 1379 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char, 1380 _PyUnicode_WSTR(unicode), end, 1381 PyUnicode_1BYTE_DATA(unicode)); 1382 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0'; 1383 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode); 1384 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND; 1385 if (maxchar < 128) { 1386 _PyUnicode_STATE(unicode).ascii = 1; 1387 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode); 1388 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode); 1389 } 1390 else { 1391 _PyUnicode_STATE(unicode).ascii = 0; 1392 _PyUnicode_UTF8(unicode) = NULL; 1393 _PyUnicode_UTF8_LENGTH(unicode) = 0; 1394 } 1395 PyObject_FREE(_PyUnicode_WSTR(unicode)); 1396 _PyUnicode_WSTR(unicode) = NULL; 1397 _PyUnicode_WSTR_LENGTH(unicode) = 0; 1398 } 1399 /* In this case we might have to convert down from 4-byte native 1400 wchar_t to 2-byte unicode. 
*/ 1401 else if (maxchar < 65536) { 1402 assert(num_surrogates == 0 && 1403 "FindMaxCharAndNumSurrogatePairs() messed up"); 1404 1405#if SIZEOF_WCHAR_T == 2 1406 /* We can share representations and are done. */ 1407 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode); 1408 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0'; 1409 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode); 1410 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND; 1411 _PyUnicode_UTF8(unicode) = NULL; 1412 _PyUnicode_UTF8_LENGTH(unicode) = 0; 1413#else 1414 /* sizeof(wchar_t) == 4 */ 1415 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC( 1416 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1)); 1417 if (!_PyUnicode_DATA_ANY(unicode)) { 1418 PyErr_NoMemory(); 1419 return -1; 1420 } 1421 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2, 1422 _PyUnicode_WSTR(unicode), end, 1423 PyUnicode_2BYTE_DATA(unicode)); 1424 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0'; 1425 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode); 1426 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND; 1427 _PyUnicode_UTF8(unicode) = NULL; 1428 _PyUnicode_UTF8_LENGTH(unicode) = 0; 1429 PyObject_FREE(_PyUnicode_WSTR(unicode)); 1430 _PyUnicode_WSTR(unicode) = NULL; 1431 _PyUnicode_WSTR_LENGTH(unicode) = 0; 1432#endif 1433 } 1434 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */ 1435 else { 1436#if SIZEOF_WCHAR_T == 2 1437 /* in case the native representation is 2-bytes, we need to allocate a 1438 new normalized 4-byte version. 
*/ 1439 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates; 1440 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1)); 1441 if (!_PyUnicode_DATA_ANY(unicode)) { 1442 PyErr_NoMemory(); 1443 return -1; 1444 } 1445 _PyUnicode_LENGTH(unicode) = length_wo_surrogates; 1446 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND; 1447 _PyUnicode_UTF8(unicode) = NULL; 1448 _PyUnicode_UTF8_LENGTH(unicode) = 0; 1449 /* unicode_convert_wchar_to_ucs4() requires a ready string */ 1450 _PyUnicode_STATE(unicode).ready = 1; 1451 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode); 1452 PyObject_FREE(_PyUnicode_WSTR(unicode)); 1453 _PyUnicode_WSTR(unicode) = NULL; 1454 _PyUnicode_WSTR_LENGTH(unicode) = 0; 1455#else 1456 assert(num_surrogates == 0); 1457 1458 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode); 1459 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode); 1460 _PyUnicode_UTF8(unicode) = NULL; 1461 _PyUnicode_UTF8_LENGTH(unicode) = 0; 1462 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND; 1463#endif 1464 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0'; 1465 } 1466 _PyUnicode_STATE(unicode).ready = 1; 1467 assert(_PyUnicode_CheckConsistency(unicode, 1)); 1468 return 0; 1469} 1470 1471static void 1472unicode_dealloc(register PyObject *unicode) 1473{ 1474 switch (PyUnicode_CHECK_INTERNED(unicode)) { 1475 case SSTATE_NOT_INTERNED: 1476 break; 1477 1478 case SSTATE_INTERNED_MORTAL: 1479 /* revive dead object temporarily for DelItem */ 1480 Py_REFCNT(unicode) = 3; 1481 if (PyDict_DelItem(interned, unicode) != 0) 1482 Py_FatalError( 1483 "deletion of interned string failed"); 1484 break; 1485 1486 case SSTATE_INTERNED_IMMORTAL: 1487 Py_FatalError("Immortal interned string died."); 1488 1489 default: 1490 Py_FatalError("Inconsistent interned string state."); 1491 } 1492 1493 if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) 1494 PyObject_DEL(_PyUnicode_WSTR(unicode)); 1495 if 
(_PyUnicode_HAS_UTF8_MEMORY(unicode)) 1496 PyObject_DEL(_PyUnicode_UTF8(unicode)); 1497 1498 if (PyUnicode_IS_COMPACT(unicode)) { 1499 Py_TYPE(unicode)->tp_free(unicode); 1500 } 1501 else { 1502 if (_PyUnicode_DATA_ANY(unicode)) 1503 PyObject_DEL(_PyUnicode_DATA_ANY(unicode)); 1504 Py_TYPE(unicode)->tp_free(unicode); 1505 } 1506} 1507 1508#ifdef Py_DEBUG 1509static int 1510unicode_is_singleton(PyObject *unicode) 1511{ 1512 PyASCIIObject *ascii = (PyASCIIObject *)unicode; 1513 if (unicode == unicode_empty) 1514 return 1; 1515 if (ascii->state.kind != PyUnicode_WCHAR_KIND && ascii->length == 1) 1516 { 1517 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0); 1518 if (ch < 256 && unicode_latin1[ch] == unicode) 1519 return 1; 1520 } 1521 return 0; 1522} 1523#endif 1524 1525static int 1526unicode_resizable(PyObject *unicode) 1527{ 1528 if (Py_REFCNT(unicode) != 1) 1529 return 0; 1530 if (PyUnicode_CHECK_INTERNED(unicode)) 1531 return 0; 1532#ifdef Py_DEBUG 1533 /* singleton refcount is greater than 1 */ 1534 assert(!unicode_is_singleton(unicode)); 1535#endif 1536 return 1; 1537} 1538 1539static int 1540unicode_resize(PyObject **p_unicode, Py_ssize_t length) 1541{ 1542 PyObject *unicode; 1543 Py_ssize_t old_length; 1544 1545 assert(p_unicode != NULL); 1546 unicode = *p_unicode; 1547 1548 assert(unicode != NULL); 1549 assert(PyUnicode_Check(unicode)); 1550 assert(0 <= length); 1551 1552 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND) 1553 old_length = PyUnicode_WSTR_LENGTH(unicode); 1554 else 1555 old_length = PyUnicode_GET_LENGTH(unicode); 1556 if (old_length == length) 1557 return 0; 1558 1559 if (length == 0) { 1560 Py_DECREF(*p_unicode); 1561 *p_unicode = unicode_empty; 1562 Py_INCREF(*p_unicode); 1563 return 0; 1564 } 1565 1566 if (!unicode_resizable(unicode)) { 1567 PyObject *copy = resize_copy(unicode, length); 1568 if (copy == NULL) 1569 return -1; 1570 Py_DECREF(*p_unicode); 1571 *p_unicode = copy; 1572 return 0; 1573 } 1574 1575 if (PyUnicode_IS_COMPACT(unicode)) 
{ 1576 *p_unicode = resize_compact(unicode, length); 1577 if (*p_unicode == NULL) 1578 return -1; 1579 assert(_PyUnicode_CheckConsistency(*p_unicode, 0)); 1580 return 0; 1581 } 1582 return resize_inplace(unicode, length); 1583} 1584 1585int 1586PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length) 1587{ 1588 PyObject *unicode; 1589 if (p_unicode == NULL) { 1590 PyErr_BadInternalCall(); 1591 return -1; 1592 } 1593 unicode = *p_unicode; 1594 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0) 1595 { 1596 PyErr_BadInternalCall(); 1597 return -1; 1598 } 1599 return unicode_resize(p_unicode, length); 1600} 1601 1602static int 1603unicode_widen(PyObject **p_unicode, unsigned int maxchar) 1604{ 1605 PyObject *result; 1606 assert(PyUnicode_IS_READY(*p_unicode)); 1607 if (maxchar <= PyUnicode_MAX_CHAR_VALUE(*p_unicode)) 1608 return 0; 1609 result = PyUnicode_New(PyUnicode_GET_LENGTH(*p_unicode), 1610 maxchar); 1611 if (result == NULL) 1612 return -1; 1613 PyUnicode_CopyCharacters(result, 0, *p_unicode, 0, 1614 PyUnicode_GET_LENGTH(*p_unicode)); 1615 Py_DECREF(*p_unicode); 1616 *p_unicode = result; 1617 return 0; 1618} 1619 1620static int 1621unicode_putchar(PyObject **p_unicode, Py_ssize_t *pos, 1622 Py_UCS4 ch) 1623{ 1624 if (unicode_widen(p_unicode, ch) < 0) 1625 return -1; 1626 PyUnicode_WRITE(PyUnicode_KIND(*p_unicode), 1627 PyUnicode_DATA(*p_unicode), 1628 (*pos)++, ch); 1629 return 0; 1630} 1631 1632static PyObject* 1633get_latin1_char(unsigned char ch) 1634{ 1635 PyObject *unicode = unicode_latin1[ch]; 1636 if (!unicode) { 1637 unicode = PyUnicode_New(1, ch); 1638 if (!unicode) 1639 return NULL; 1640 PyUnicode_1BYTE_DATA(unicode)[0] = ch; 1641 assert(_PyUnicode_CheckConsistency(unicode, 1)); 1642 unicode_latin1[ch] = unicode; 1643 } 1644 Py_INCREF(unicode); 1645 return unicode; 1646} 1647 1648PyObject * 1649PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size) 1650{ 1651 PyObject *unicode; 1652 Py_UCS4 maxchar = 0; 1653 Py_ssize_t 
num_surrogates; 1654 1655 if (u == NULL) 1656 return (PyObject*)_PyUnicode_New(size); 1657 1658 /* If the Unicode data is known at construction time, we can apply 1659 some optimizations which share commonly used objects. */ 1660 1661 /* Optimization for empty strings */ 1662 if (size == 0 && unicode_empty != NULL) { 1663 Py_INCREF(unicode_empty); 1664 return unicode_empty; 1665 } 1666 1667 /* Single character Unicode objects in the Latin-1 range are 1668 shared when using this constructor */ 1669 if (size == 1 && *u < 256) 1670 return get_latin1_char((unsigned char)*u); 1671 1672 /* If not empty and not single character, copy the Unicode data 1673 into the new object */ 1674 if (find_maxchar_surrogates(u, u + size, 1675 &maxchar, &num_surrogates) == -1) 1676 return NULL; 1677 1678 unicode = PyUnicode_New(size - num_surrogates, maxchar); 1679 if (!unicode) 1680 return NULL; 1681 1682 switch (PyUnicode_KIND(unicode)) { 1683 case PyUnicode_1BYTE_KIND: 1684 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char, 1685 u, u + size, PyUnicode_1BYTE_DATA(unicode)); 1686 break; 1687 case PyUnicode_2BYTE_KIND: 1688#if Py_UNICODE_SIZE == 2 1689 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2); 1690#else 1691 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2, 1692 u, u + size, PyUnicode_2BYTE_DATA(unicode)); 1693#endif 1694 break; 1695 case PyUnicode_4BYTE_KIND: 1696#if SIZEOF_WCHAR_T == 2 1697 /* This is the only case which has to process surrogates, thus 1698 a simple copy loop is not enough and we need a function. 
*/ 1699 unicode_convert_wchar_to_ucs4(u, u + size, unicode); 1700#else 1701 assert(num_surrogates == 0); 1702 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4); 1703#endif 1704 break; 1705 default: 1706 assert(0 && "Impossible state"); 1707 } 1708 1709 return unicode_result(unicode); 1710} 1711 1712PyObject * 1713PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size) 1714{ 1715 if (size < 0) { 1716 PyErr_SetString(PyExc_SystemError, 1717 "Negative size passed to PyUnicode_FromStringAndSize"); 1718 return NULL; 1719 } 1720 1721 /* If the Unicode data is known at construction time, we can apply 1722 some optimizations which share commonly used objects. 1723 Also, this means the input must be UTF-8, so fall back to the 1724 UTF-8 decoder at the end. */ 1725 if (u != NULL) { 1726 1727 /* Optimization for empty strings */ 1728 if (size == 0 && unicode_empty != NULL) { 1729 Py_INCREF(unicode_empty); 1730 return unicode_empty; 1731 } 1732 1733 /* Single characters are shared when using this constructor. 1734 Restrict to ASCII, since the input must be UTF-8. 
*/ 1735 if (size == 1 && (unsigned char)*u < 128) 1736 return get_latin1_char((unsigned char)*u); 1737 1738 return PyUnicode_DecodeUTF8(u, size, NULL); 1739 } 1740 1741 return (PyObject *)_PyUnicode_New(size); 1742} 1743 1744PyObject * 1745PyUnicode_FromString(const char *u) 1746{ 1747 size_t size = strlen(u); 1748 if (size > PY_SSIZE_T_MAX) { 1749 PyErr_SetString(PyExc_OverflowError, "input too long"); 1750 return NULL; 1751 } 1752 1753 return PyUnicode_FromStringAndSize(u, size); 1754} 1755 1756PyObject * 1757_PyUnicode_FromId(_Py_Identifier *id) 1758{ 1759 if (!id->object) { 1760 id->object = PyUnicode_FromString(id->string); 1761 if (!id->object) 1762 return NULL; 1763 PyUnicode_InternInPlace(&id->object); 1764 assert(!id->next); 1765 id->next = static_strings; 1766 static_strings = id; 1767 } 1768 return id->object; 1769} 1770 1771void 1772_PyUnicode_ClearStaticStrings() 1773{ 1774 _Py_Identifier *i; 1775 for (i = static_strings; i; i = i->next) { 1776 Py_DECREF(i->object); 1777 i->object = NULL; 1778 i->next = NULL; 1779 } 1780} 1781 1782/* Internal function, don't check maximum character */ 1783 1784static PyObject* 1785unicode_fromascii(const unsigned char* s, Py_ssize_t size) 1786{ 1787 PyObject *res; 1788#ifdef Py_DEBUG 1789 const unsigned char *p; 1790 const unsigned char *end = s + size; 1791 for (p=s; p < end; p++) { 1792 assert(*p < 128); 1793 } 1794#endif 1795 if (size == 1) 1796 return get_latin1_char(s[0]); 1797 res = PyUnicode_New(size, 127); 1798 if (!res) 1799 return NULL; 1800 memcpy(PyUnicode_1BYTE_DATA(res), s, size); 1801 return res; 1802} 1803 1804static Py_UCS4 1805kind_maxchar_limit(unsigned int kind) 1806{ 1807 switch(kind) { 1808 case PyUnicode_1BYTE_KIND: 1809 return 0x80; 1810 case PyUnicode_2BYTE_KIND: 1811 return 0x100; 1812 case PyUnicode_4BYTE_KIND: 1813 return 0x10000; 1814 default: 1815 assert(0 && "invalid kind"); 1816 return MAX_UNICODE; 1817 } 1818} 1819 1820static PyObject* 1821_PyUnicode_FromUCS1(const unsigned char* u, 
Py_ssize_t size) 1822{ 1823 PyObject *res; 1824 unsigned char max_char; 1825 1826 if (size == 0) { 1827 Py_INCREF(unicode_empty); 1828 return unicode_empty; 1829 } 1830 assert(size > 0); 1831 if (size == 1) 1832 return get_latin1_char(u[0]); 1833 1834 max_char = ucs1lib_find_max_char(u, u + size); 1835 res = PyUnicode_New(size, max_char); 1836 if (!res) 1837 return NULL; 1838 memcpy(PyUnicode_1BYTE_DATA(res), u, size); 1839 assert(_PyUnicode_CheckConsistency(res, 1)); 1840 return res; 1841} 1842 1843static PyObject* 1844_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size) 1845{ 1846 PyObject *res; 1847 Py_UCS2 max_char; 1848 1849 if (size == 0) { 1850 Py_INCREF(unicode_empty); 1851 return unicode_empty; 1852 } 1853 assert(size > 0); 1854 if (size == 1 && u[0] < 256) 1855 return get_latin1_char((unsigned char)u[0]); 1856 1857 max_char = ucs2lib_find_max_char(u, u + size); 1858 res = PyUnicode_New(size, max_char); 1859 if (!res) 1860 return NULL; 1861 if (max_char >= 256) 1862 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size); 1863 else { 1864 _PyUnicode_CONVERT_BYTES( 1865 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res)); 1866 } 1867 assert(_PyUnicode_CheckConsistency(res, 1)); 1868 return res; 1869} 1870 1871static PyObject* 1872_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size) 1873{ 1874 PyObject *res; 1875 Py_UCS4 max_char; 1876 1877 if (size == 0) { 1878 Py_INCREF(unicode_empty); 1879 return unicode_empty; 1880 } 1881 assert(size > 0); 1882 if (size == 1 && u[0] < 256) 1883 return get_latin1_char((unsigned char)u[0]); 1884 1885 max_char = ucs4lib_find_max_char(u, u + size); 1886 res = PyUnicode_New(size, max_char); 1887 if (!res) 1888 return NULL; 1889 if (max_char < 256) 1890 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size, 1891 PyUnicode_1BYTE_DATA(res)); 1892 else if (max_char < 0x10000) 1893 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size, 1894 PyUnicode_2BYTE_DATA(res)); 1895 else 1896 memcpy(PyUnicode_4BYTE_DATA(res), 
u, sizeof(Py_UCS4)*size); 1897 assert(_PyUnicode_CheckConsistency(res, 1)); 1898 return res; 1899} 1900 1901PyObject* 1902PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size) 1903{ 1904 if (size < 0) { 1905 PyErr_SetString(PyExc_ValueError, "size must be positive"); 1906 return NULL; 1907 } 1908 switch(kind) { 1909 case PyUnicode_1BYTE_KIND: 1910 return _PyUnicode_FromUCS1(buffer, size); 1911 case PyUnicode_2BYTE_KIND: 1912 return _PyUnicode_FromUCS2(buffer, size); 1913 case PyUnicode_4BYTE_KIND: 1914 return _PyUnicode_FromUCS4(buffer, size); 1915 default: 1916 PyErr_SetString(PyExc_SystemError, "invalid kind"); 1917 return NULL; 1918 } 1919} 1920 1921/* Ensure that a string uses the most efficient storage, if it is not the 1922 case: create a new string with of the right kind. Write NULL into *p_unicode 1923 on error. */ 1924static void 1925unicode_adjust_maxchar(PyObject **p_unicode) 1926{ 1927 PyObject *unicode, *copy; 1928 Py_UCS4 max_char; 1929 Py_ssize_t len; 1930 unsigned int kind; 1931 1932 assert(p_unicode != NULL); 1933 unicode = *p_unicode; 1934 assert(PyUnicode_IS_READY(unicode)); 1935 if (PyUnicode_IS_ASCII(unicode)) 1936 return; 1937 1938 len = PyUnicode_GET_LENGTH(unicode); 1939 kind = PyUnicode_KIND(unicode); 1940 if (kind == PyUnicode_1BYTE_KIND) { 1941 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode); 1942 max_char = ucs1lib_find_max_char(u, u + len); 1943 if (max_char >= 128) 1944 return; 1945 } 1946 else if (kind == PyUnicode_2BYTE_KIND) { 1947 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode); 1948 max_char = ucs2lib_find_max_char(u, u + len); 1949 if (max_char >= 256) 1950 return; 1951 } 1952 else { 1953 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode); 1954 assert(kind == PyUnicode_4BYTE_KIND); 1955 max_char = ucs4lib_find_max_char(u, u + len); 1956 if (max_char >= 0x10000) 1957 return; 1958 } 1959 copy = PyUnicode_New(len, max_char); 1960 copy_characters(copy, 0, unicode, 0, len); 1961 Py_DECREF(unicode); 1962 *p_unicode = 
copy; 1963} 1964 1965PyObject* 1966PyUnicode_Copy(PyObject *unicode) 1967{ 1968 Py_ssize_t length; 1969 PyObject *copy; 1970 1971 if (!PyUnicode_Check(unicode)) { 1972 PyErr_BadInternalCall(); 1973 return NULL; 1974 } 1975 if (PyUnicode_READY(unicode)) 1976 return NULL; 1977 1978 length = PyUnicode_GET_LENGTH(unicode); 1979 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode)); 1980 if (!copy) 1981 return NULL; 1982 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode)); 1983 1984 Py_MEMCPY(PyUnicode_DATA(copy), PyUnicode_DATA(unicode), 1985 length * PyUnicode_KIND(unicode)); 1986 assert(_PyUnicode_CheckConsistency(copy, 1)); 1987 return copy; 1988} 1989 1990 1991/* Widen Unicode objects to larger buffers. Don't write terminating null 1992 character. Return NULL on error. */ 1993 1994void* 1995_PyUnicode_AsKind(PyObject *s, unsigned int kind) 1996{ 1997 Py_ssize_t len; 1998 void *result; 1999 unsigned int skind; 2000 2001 if (PyUnicode_READY(s)) 2002 return NULL; 2003 2004 len = PyUnicode_GET_LENGTH(s); 2005 skind = PyUnicode_KIND(s); 2006 if (skind >= kind) { 2007 PyErr_SetString(PyExc_SystemError, "invalid widening attempt"); 2008 return NULL; 2009 } 2010 switch(kind) { 2011 case PyUnicode_2BYTE_KIND: 2012 result = PyMem_Malloc(len * sizeof(Py_UCS2)); 2013 if (!result) 2014 return PyErr_NoMemory(); 2015 assert(skind == PyUnicode_1BYTE_KIND); 2016 _PyUnicode_CONVERT_BYTES( 2017 Py_UCS1, Py_UCS2, 2018 PyUnicode_1BYTE_DATA(s), 2019 PyUnicode_1BYTE_DATA(s) + len, 2020 result); 2021 return result; 2022 case PyUnicode_4BYTE_KIND: 2023 result = PyMem_Malloc(len * sizeof(Py_UCS4)); 2024 if (!result) 2025 return PyErr_NoMemory(); 2026 if (skind == PyUnicode_2BYTE_KIND) { 2027 _PyUnicode_CONVERT_BYTES( 2028 Py_UCS2, Py_UCS4, 2029 PyUnicode_2BYTE_DATA(s), 2030 PyUnicode_2BYTE_DATA(s) + len, 2031 result); 2032 } 2033 else { 2034 assert(skind == PyUnicode_1BYTE_KIND); 2035 _PyUnicode_CONVERT_BYTES( 2036 Py_UCS1, Py_UCS4, 2037 PyUnicode_1BYTE_DATA(s), 2038 
PyUnicode_1BYTE_DATA(s) + len, 2039 result); 2040 } 2041 return result; 2042 default: 2043 break; 2044 } 2045 PyErr_SetString(PyExc_SystemError, "invalid kind"); 2046 return NULL; 2047} 2048 2049static Py_UCS4* 2050as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize, 2051 int copy_null) 2052{ 2053 int kind; 2054 void *data; 2055 Py_ssize_t len, targetlen; 2056 if (PyUnicode_READY(string) == -1) 2057 return NULL; 2058 kind = PyUnicode_KIND(string); 2059 data = PyUnicode_DATA(string); 2060 len = PyUnicode_GET_LENGTH(string); 2061 targetlen = len; 2062 if (copy_null) 2063 targetlen++; 2064 if (!target) { 2065 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) { 2066 PyErr_NoMemory(); 2067 return NULL; 2068 } 2069 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4)); 2070 if (!target) { 2071 PyErr_NoMemory(); 2072 return NULL; 2073 } 2074 } 2075 else { 2076 if (targetsize < targetlen) { 2077 PyErr_Format(PyExc_SystemError, 2078 "string is longer than the buffer"); 2079 if (copy_null && 0 < targetsize) 2080 target[0] = 0; 2081 return NULL; 2082 } 2083 } 2084 if (kind == PyUnicode_1BYTE_KIND) { 2085 Py_UCS1 *start = (Py_UCS1 *) data; 2086 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target); 2087 } 2088 else if (kind == PyUnicode_2BYTE_KIND) { 2089 Py_UCS2 *start = (Py_UCS2 *) data; 2090 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target); 2091 } 2092 else { 2093 assert(kind == PyUnicode_4BYTE_KIND); 2094 Py_MEMCPY(target, data, len * sizeof(Py_UCS4)); 2095 } 2096 if (copy_null) 2097 target[len] = 0; 2098 return target; 2099} 2100 2101Py_UCS4* 2102PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize, 2103 int copy_null) 2104{ 2105 if (target == NULL || targetsize < 0) { 2106 PyErr_BadInternalCall(); 2107 return NULL; 2108 } 2109 return as_ucs4(string, target, targetsize, copy_null); 2110} 2111 2112Py_UCS4* 2113PyUnicode_AsUCS4Copy(PyObject *string) 2114{ 2115 return as_ucs4(string, NULL, 0, 1); 2116} 
2117 2118#ifdef HAVE_WCHAR_H 2119 2120PyObject * 2121PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size) 2122{ 2123 if (w == NULL) { 2124 if (size == 0) 2125 return PyUnicode_New(0, 0); 2126 PyErr_BadInternalCall(); 2127 return NULL; 2128 } 2129 2130 if (size == -1) { 2131 size = wcslen(w); 2132 } 2133 2134 return PyUnicode_FromUnicode(w, size); 2135} 2136 2137#endif /* HAVE_WCHAR_H */ 2138 2139static void 2140makefmt(char *fmt, int longflag, int longlongflag, int size_tflag, 2141 int zeropad, int width, int precision, char c) 2142{ 2143 *fmt++ = '%'; 2144 if (width) { 2145 if (zeropad) 2146 *fmt++ = '0'; 2147 fmt += sprintf(fmt, "%d", width); 2148 } 2149 if (precision) 2150 fmt += sprintf(fmt, ".%d", precision); 2151 if (longflag) 2152 *fmt++ = 'l'; 2153 else if (longlongflag) { 2154 /* longlongflag should only ever be nonzero on machines with 2155 HAVE_LONG_LONG defined */ 2156#ifdef HAVE_LONG_LONG 2157 char *f = PY_FORMAT_LONG_LONG; 2158 while (*f) 2159 *fmt++ = *f++; 2160#else 2161 /* we shouldn't ever get here */ 2162 assert(0); 2163 *fmt++ = 'l'; 2164#endif 2165 } 2166 else if (size_tflag) { 2167 char *f = PY_FORMAT_SIZE_T; 2168 while (*f) 2169 *fmt++ = *f++; 2170 } 2171 *fmt++ = c; 2172 *fmt = '\0'; 2173} 2174 2175/* helper for PyUnicode_FromFormatV() */ 2176 2177static const char* 2178parse_format_flags(const char *f, 2179 int *p_width, int *p_precision, 2180 int *p_longflag, int *p_longlongflag, int *p_size_tflag) 2181{ 2182 int width, precision, longflag, longlongflag, size_tflag; 2183 2184 /* parse the width.precision part, e.g. 
"%2.5s" => width=2, precision=5 */ 2185 f++; 2186 width = 0; 2187 while (Py_ISDIGIT((unsigned)*f)) 2188 width = (width*10) + *f++ - '0'; 2189 precision = 0; 2190 if (*f == '.') { 2191 f++; 2192 while (Py_ISDIGIT((unsigned)*f)) 2193 precision = (precision*10) + *f++ - '0'; 2194 if (*f == '%') { 2195 /* "%.3%s" => f points to "3" */ 2196 f--; 2197 } 2198 } 2199 if (*f == '\0') { 2200 /* bogus format "%.1" => go backward, f points to "1" */ 2201 f--; 2202 } 2203 if (p_width != NULL) 2204 *p_width = width; 2205 if (p_precision != NULL) 2206 *p_precision = precision; 2207 2208 /* Handle %ld, %lu, %lld and %llu. */ 2209 longflag = 0; 2210 longlongflag = 0; 2211 size_tflag = 0; 2212 2213 if (*f == 'l') { 2214 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') { 2215 longflag = 1; 2216 ++f; 2217 } 2218#ifdef HAVE_LONG_LONG 2219 else if (f[1] == 'l' && 2220 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) { 2221 longlongflag = 1; 2222 f += 2; 2223 } 2224#endif 2225 } 2226 /* handle the size_t flag. */ 2227 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) { 2228 size_tflag = 1; 2229 ++f; 2230 } 2231 if (p_longflag != NULL) 2232 *p_longflag = longflag; 2233 if (p_longlongflag != NULL) 2234 *p_longlongflag = longlongflag; 2235 if (p_size_tflag != NULL) 2236 *p_size_tflag = size_tflag; 2237 return f; 2238} 2239 2240/* maximum number of characters required for output of %ld. 21 characters 2241 allows for 64-bit integers (in decimal) and an optional sign. */ 2242#define MAX_LONG_CHARS 21 2243/* maximum number of characters required for output of %lld. 2244 We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits, 2245 plus 1 for the sign. 53/22 is an upper bound for log10(256). 
*/ 2246#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22) 2247 2248PyObject * 2249PyUnicode_FromFormatV(const char *format, va_list vargs) 2250{ 2251 va_list count; 2252 Py_ssize_t callcount = 0; 2253 PyObject **callresults = NULL; 2254 PyObject **callresult = NULL; 2255 Py_ssize_t n = 0; 2256 int width = 0; 2257 int precision = 0; 2258 int zeropad; 2259 const char* f; 2260 PyObject *string; 2261 /* used by sprintf */ 2262 char fmt[61]; /* should be enough for %0width.precisionlld */ 2263 Py_UCS4 maxchar = 127; /* result is ASCII by default */ 2264 Py_UCS4 argmaxchar; 2265 Py_ssize_t numbersize = 0; 2266 char *numberresults = NULL; 2267 char *numberresult = NULL; 2268 Py_ssize_t i; 2269 int kind; 2270 void *data; 2271 2272 Py_VA_COPY(count, vargs); 2273 /* step 1: count the number of %S/%R/%A/%s format specifications 2274 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/ 2275 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the 2276 * result in an array) 2277 * also estimate a upper bound for all the number formats in the string, 2278 * numbers will be formatted in step 3 and be kept in a '\0'-separated 2279 * buffer before putting everything together. */ 2280 for (f = format; *f; f++) { 2281 if (*f == '%') { 2282 int longlongflag; 2283 /* skip width or width.precision (eg. "1.2" of "%1.2f") */ 2284 f = parse_format_flags(f, &width, NULL, NULL, &longlongflag, NULL); 2285 if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V') 2286 ++callcount; 2287 2288 else if (*f == 'd' || *f=='u' || *f=='i' || *f=='x' || *f=='p') { 2289#ifdef HAVE_LONG_LONG 2290 if (longlongflag) { 2291 if (width < MAX_LONG_LONG_CHARS) 2292 width = MAX_LONG_LONG_CHARS; 2293 } 2294 else 2295#endif 2296 /* MAX_LONG_CHARS is enough to hold a 64-bit integer, 2297 including sign. Decimal takes the most space. This 2298 isn't enough for octal. If a width is specified we 2299 need more (which we allocate later). 
*/ 2300 if (width < MAX_LONG_CHARS) 2301 width = MAX_LONG_CHARS; 2302 2303 /* account for the size + '\0' to separate numbers 2304 inside of the numberresults buffer */ 2305 numbersize += (width + 1); 2306 } 2307 } 2308 else if ((unsigned char)*f > 127) { 2309 PyErr_Format(PyExc_ValueError, 2310 "PyUnicode_FromFormatV() expects an ASCII-encoded format " 2311 "string, got a non-ASCII byte: 0x%02x", 2312 (unsigned char)*f); 2313 return NULL; 2314 } 2315 } 2316 /* step 2: allocate memory for the results of 2317 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */ 2318 if (callcount) { 2319 callresults = PyObject_Malloc(sizeof(PyObject *) * callcount); 2320 if (!callresults) { 2321 PyErr_NoMemory(); 2322 return NULL; 2323 } 2324 callresult = callresults; 2325 } 2326 /* step 2.5: allocate memory for the results of formating numbers */ 2327 if (numbersize) { 2328 numberresults = PyObject_Malloc(numbersize); 2329 if (!numberresults) { 2330 PyErr_NoMemory(); 2331 goto fail; 2332 } 2333 numberresult = numberresults; 2334 } 2335 2336 /* step 3: format numbers and figure out how large a buffer we need */ 2337 for (f = format; *f; f++) { 2338 if (*f == '%') { 2339 const char* p; 2340 int longflag; 2341 int longlongflag; 2342 int size_tflag; 2343 int numprinted; 2344 2345 p = f; 2346 zeropad = (f[1] == '0'); 2347 f = parse_format_flags(f, &width, &precision, 2348 &longflag, &longlongflag, &size_tflag); 2349 switch (*f) { 2350 case 'c': 2351 { 2352 Py_UCS4 ordinal = va_arg(count, int); 2353 maxchar = Py_MAX(maxchar, ordinal); 2354 n++; 2355 break; 2356 } 2357 case '%': 2358 n++; 2359 break; 2360 case 'i': 2361 case 'd': 2362 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad, 2363 width, precision, *f); 2364 if (longflag) 2365 numprinted = sprintf(numberresult, fmt, 2366 va_arg(count, long)); 2367#ifdef HAVE_LONG_LONG 2368 else if (longlongflag) 2369 numprinted = sprintf(numberresult, fmt, 2370 va_arg(count, PY_LONG_LONG)); 2371#endif 2372 else if (size_tflag) 
2373 numprinted = sprintf(numberresult, fmt, 2374 va_arg(count, Py_ssize_t)); 2375 else 2376 numprinted = sprintf(numberresult, fmt, 2377 va_arg(count, int)); 2378 n += numprinted; 2379 /* advance by +1 to skip over the '\0' */ 2380 numberresult += (numprinted + 1); 2381 assert(*(numberresult - 1) == '\0'); 2382 assert(*(numberresult - 2) != '\0'); 2383 assert(numprinted >= 0); 2384 assert(numberresult <= numberresults + numbersize); 2385 break; 2386 case 'u': 2387 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad, 2388 width, precision, 'u'); 2389 if (longflag) 2390 numprinted = sprintf(numberresult, fmt, 2391 va_arg(count, unsigned long)); 2392#ifdef HAVE_LONG_LONG 2393 else if (longlongflag) 2394 numprinted = sprintf(numberresult, fmt, 2395 va_arg(count, unsigned PY_LONG_LONG)); 2396#endif 2397 else if (size_tflag) 2398 numprinted = sprintf(numberresult, fmt, 2399 va_arg(count, size_t)); 2400 else 2401 numprinted = sprintf(numberresult, fmt, 2402 va_arg(count, unsigned int)); 2403 n += numprinted; 2404 numberresult += (numprinted + 1); 2405 assert(*(numberresult - 1) == '\0'); 2406 assert(*(numberresult - 2) != '\0'); 2407 assert(numprinted >= 0); 2408 assert(numberresult <= numberresults + numbersize); 2409 break; 2410 case 'x': 2411 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x'); 2412 numprinted = sprintf(numberresult, fmt, va_arg(count, int)); 2413 n += numprinted; 2414 numberresult += (numprinted + 1); 2415 assert(*(numberresult - 1) == '\0'); 2416 assert(*(numberresult - 2) != '\0'); 2417 assert(numprinted >= 0); 2418 assert(numberresult <= numberresults + numbersize); 2419 break; 2420 case 'p': 2421 numprinted = sprintf(numberresult, "%p", va_arg(count, void*)); 2422 /* %p is ill-defined: ensure leading 0x. 
*/ 2423 if (numberresult[1] == 'X') 2424 numberresult[1] = 'x'; 2425 else if (numberresult[1] != 'x') { 2426 memmove(numberresult + 2, numberresult, 2427 strlen(numberresult) + 1); 2428 numberresult[0] = '0'; 2429 numberresult[1] = 'x'; 2430 numprinted += 2; 2431 } 2432 n += numprinted; 2433 numberresult += (numprinted + 1); 2434 assert(*(numberresult - 1) == '\0'); 2435 assert(*(numberresult - 2) != '\0'); 2436 assert(numprinted >= 0); 2437 assert(numberresult <= numberresults + numbersize); 2438 break; 2439 case 's': 2440 { 2441 /* UTF-8 */ 2442 const char *s = va_arg(count, const char*); 2443 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace"); 2444 if (!str) 2445 goto fail; 2446 /* since PyUnicode_DecodeUTF8 returns already flexible 2447 unicode objects, there is no need to call ready on them */ 2448 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str); 2449 maxchar = Py_MAX(maxchar, argmaxchar); 2450 n += PyUnicode_GET_LENGTH(str); 2451 /* Remember the str and switch to the next slot */ 2452 *callresult++ = str; 2453 break; 2454 } 2455 case 'U': 2456 { 2457 PyObject *obj = va_arg(count, PyObject *); 2458 assert(obj && _PyUnicode_CHECK(obj)); 2459 if (PyUnicode_READY(obj) == -1) 2460 goto fail; 2461 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj); 2462 maxchar = Py_MAX(maxchar, argmaxchar); 2463 n += PyUnicode_GET_LENGTH(obj); 2464 break; 2465 } 2466 case 'V': 2467 { 2468 PyObject *obj = va_arg(count, PyObject *); 2469 const char *str = va_arg(count, const char *); 2470 PyObject *str_obj; 2471 assert(obj || str); 2472 assert(!obj || _PyUnicode_CHECK(obj)); 2473 if (obj) { 2474 if (PyUnicode_READY(obj) == -1) 2475 goto fail; 2476 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj); 2477 maxchar = Py_MAX(maxchar, argmaxchar); 2478 n += PyUnicode_GET_LENGTH(obj); 2479 *callresult++ = NULL; 2480 } 2481 else { 2482 str_obj = PyUnicode_DecodeUTF8(str, strlen(str), "replace"); 2483 if (!str_obj) 2484 goto fail; 2485 if (PyUnicode_READY(str_obj)) { 2486 Py_DECREF(str_obj); 2487 goto 
fail; 2488 } 2489 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str_obj); 2490 maxchar = Py_MAX(maxchar, argmaxchar); 2491 n += PyUnicode_GET_LENGTH(str_obj); 2492 *callresult++ = str_obj; 2493 } 2494 break; 2495 } 2496 case 'S': 2497 { 2498 PyObject *obj = va_arg(count, PyObject *); 2499 PyObject *str; 2500 assert(obj); 2501 str = PyObject_Str(obj); 2502 if (!str || PyUnicode_READY(str) == -1) 2503 goto fail; 2504 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str); 2505 maxchar = Py_MAX(maxchar, argmaxchar); 2506 n += PyUnicode_GET_LENGTH(str); 2507 /* Remember the str and switch to the next slot */ 2508 *callresult++ = str; 2509 break; 2510 } 2511 case 'R': 2512 { 2513 PyObject *obj = va_arg(count, PyObject *); 2514 PyObject *repr; 2515 assert(obj); 2516 repr = PyObject_Repr(obj); 2517 if (!repr || PyUnicode_READY(repr) == -1) 2518 goto fail; 2519 argmaxchar = PyUnicode_MAX_CHAR_VALUE(repr); 2520 maxchar = Py_MAX(maxchar, argmaxchar); 2521 n += PyUnicode_GET_LENGTH(repr); 2522 /* Remember the repr and switch to the next slot */ 2523 *callresult++ = repr; 2524 break; 2525 } 2526 case 'A': 2527 { 2528 PyObject *obj = va_arg(count, PyObject *); 2529 PyObject *ascii; 2530 assert(obj); 2531 ascii = PyObject_ASCII(obj); 2532 if (!ascii || PyUnicode_READY(ascii) == -1) 2533 goto fail; 2534 argmaxchar = PyUnicode_MAX_CHAR_VALUE(ascii); 2535 maxchar = Py_MAX(maxchar, argmaxchar); 2536 n += PyUnicode_GET_LENGTH(ascii); 2537 /* Remember the repr and switch to the next slot */ 2538 *callresult++ = ascii; 2539 break; 2540 } 2541 default: 2542 /* if we stumble upon an unknown 2543 formatting code, copy the rest of 2544 the format string to the output 2545 string. (we cannot just skip the 2546 code, since there's no way to know 2547 what's in the argument list) */ 2548 n += strlen(p); 2549 goto expand; 2550 } 2551 } else 2552 n++; 2553 } 2554 expand: 2555 /* step 4: fill the buffer */ 2556 /* Since we've analyzed how much space we need, 2557 we don't have to resize the string. 
2558 There can be no errors beyond this point. */ 2559 string = PyUnicode_New(n, maxchar); 2560 if (!string) 2561 goto fail; 2562 kind = PyUnicode_KIND(string); 2563 data = PyUnicode_DATA(string); 2564 callresult = callresults; 2565 numberresult = numberresults; 2566 2567 for (i = 0, f = format; *f; f++) { 2568 if (*f == '%') { 2569 const char* p; 2570 2571 p = f; 2572 f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL); 2573 /* checking for == because the last argument could be a empty 2574 string, which causes i to point to end, the assert at the end of 2575 the loop */ 2576 assert(i <= PyUnicode_GET_LENGTH(string)); 2577 2578 switch (*f) { 2579 case 'c': 2580 { 2581 const int ordinal = va_arg(vargs, int); 2582 PyUnicode_WRITE(kind, data, i++, ordinal); 2583 break; 2584 } 2585 case 'i': 2586 case 'd': 2587 case 'u': 2588 case 'x': 2589 case 'p': 2590 /* unused, since we already have the result */ 2591 if (*f == 'p') 2592 (void) va_arg(vargs, void *); 2593 else 2594 (void) va_arg(vargs, int); 2595 /* extract the result from numberresults and append. 
*/ 2596 for (; *numberresult; ++i, ++numberresult) 2597 PyUnicode_WRITE(kind, data, i, *numberresult); 2598 /* skip over the separating '\0' */ 2599 assert(*numberresult == '\0'); 2600 numberresult++; 2601 assert(numberresult <= numberresults + numbersize); 2602 break; 2603 case 's': 2604 { 2605 /* unused, since we already have the result */ 2606 Py_ssize_t size; 2607 (void) va_arg(vargs, char *); 2608 size = PyUnicode_GET_LENGTH(*callresult); 2609 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string)); 2610 copy_characters(string, i, *callresult, 0, size); 2611 i += size; 2612 /* We're done with the unicode()/repr() => forget it */ 2613 Py_DECREF(*callresult); 2614 /* switch to next unicode()/repr() result */ 2615 ++callresult; 2616 break; 2617 } 2618 case 'U': 2619 { 2620 PyObject *obj = va_arg(vargs, PyObject *); 2621 Py_ssize_t size; 2622 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string)); 2623 size = PyUnicode_GET_LENGTH(obj); 2624 copy_characters(string, i, obj, 0, size); 2625 i += size; 2626 break; 2627 } 2628 case 'V': 2629 { 2630 Py_ssize_t size; 2631 PyObject *obj = va_arg(vargs, PyObject *); 2632 va_arg(vargs, const char *); 2633 if (obj) { 2634 size = PyUnicode_GET_LENGTH(obj); 2635 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string)); 2636 copy_characters(string, i, obj, 0, size); 2637 i += size; 2638 } else { 2639 size = PyUnicode_GET_LENGTH(*callresult); 2640 assert(PyUnicode_KIND(*callresult) <= 2641 PyUnicode_KIND(string)); 2642 copy_characters(string, i, *callresult, 0, size); 2643 i += size; 2644 Py_DECREF(*callresult); 2645 } 2646 ++callresult; 2647 break; 2648 } 2649 case 'S': 2650 case 'R': 2651 case 'A': 2652 { 2653 Py_ssize_t size = PyUnicode_GET_LENGTH(*callresult); 2654 /* unused, since we already have the result */ 2655 (void) va_arg(vargs, PyObject *); 2656 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string)); 2657 copy_characters(string, i, *callresult, 0, size); 2658 i += size; 2659 /* We're done with the 
unicode()/repr() => forget it */ 2660 Py_DECREF(*callresult); 2661 /* switch to next unicode()/repr() result */ 2662 ++callresult; 2663 break; 2664 } 2665 case '%': 2666 PyUnicode_WRITE(kind, data, i++, '%'); 2667 break; 2668 default: 2669 for (; *p; ++p, ++i) 2670 PyUnicode_WRITE(kind, data, i, *p); 2671 assert(i == PyUnicode_GET_LENGTH(string)); 2672 goto end; 2673 } 2674 } 2675 else { 2676 assert(i < PyUnicode_GET_LENGTH(string)); 2677 PyUnicode_WRITE(kind, data, i++, *f); 2678 } 2679 } 2680 assert(i == PyUnicode_GET_LENGTH(string)); 2681 2682 end: 2683 if (callresults) 2684 PyObject_Free(callresults); 2685 if (numberresults) 2686 PyObject_Free(numberresults); 2687 return unicode_result(string); 2688 fail: 2689 if (callresults) { 2690 PyObject **callresult2 = callresults; 2691 while (callresult2 < callresult) { 2692 Py_XDECREF(*callresult2); 2693 ++callresult2; 2694 } 2695 PyObject_Free(callresults); 2696 } 2697 if (numberresults) 2698 PyObject_Free(numberresults); 2699 return NULL; 2700} 2701 2702PyObject * 2703PyUnicode_FromFormat(const char *format, ...) 2704{ 2705 PyObject* ret; 2706 va_list vargs; 2707 2708#ifdef HAVE_STDARG_PROTOTYPES 2709 va_start(vargs, format); 2710#else 2711 va_start(vargs); 2712#endif 2713 ret = PyUnicode_FromFormatV(format, vargs); 2714 va_end(vargs); 2715 return ret; 2716} 2717 2718#ifdef HAVE_WCHAR_H 2719 2720/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString(): 2721 convert a Unicode object to a wide character string. 2722 2723 - If w is NULL: return the number of wide characters (including the null 2724 character) required to convert the unicode object. Ignore size argument. 2725 2726 - Otherwise: return the number of wide characters (excluding the null 2727 character) written into w. Write at most size wide characters (including 2728 the null character). 
*/ 2729static Py_ssize_t 2730unicode_aswidechar(PyObject *unicode, 2731 wchar_t *w, 2732 Py_ssize_t size) 2733{ 2734 Py_ssize_t res; 2735 const wchar_t *wstr; 2736 2737 wstr = PyUnicode_AsUnicodeAndSize(unicode, &res); 2738 if (wstr == NULL) 2739 return -1; 2740 2741 if (w != NULL) { 2742 if (size > res) 2743 size = res + 1; 2744 else 2745 res = size; 2746 Py_MEMCPY(w, wstr, size * sizeof(wchar_t)); 2747 return res; 2748 } 2749 else 2750 return res + 1; 2751} 2752 2753Py_ssize_t 2754PyUnicode_AsWideChar(PyObject *unicode, 2755 wchar_t *w, 2756 Py_ssize_t size) 2757{ 2758 if (unicode == NULL) { 2759 PyErr_BadInternalCall(); 2760 return -1; 2761 } 2762 return unicode_aswidechar(unicode, w, size); 2763} 2764 2765wchar_t* 2766PyUnicode_AsWideCharString(PyObject *unicode, 2767 Py_ssize_t *size) 2768{ 2769 wchar_t* buffer; 2770 Py_ssize_t buflen; 2771 2772 if (unicode == NULL) { 2773 PyErr_BadInternalCall(); 2774 return NULL; 2775 } 2776 2777 buflen = unicode_aswidechar(unicode, NULL, 0); 2778 if (buflen == -1) 2779 return NULL; 2780 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) { 2781 PyErr_NoMemory(); 2782 return NULL; 2783 } 2784 2785 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t)); 2786 if (buffer == NULL) { 2787 PyErr_NoMemory(); 2788 return NULL; 2789 } 2790 buflen = unicode_aswidechar(unicode, buffer, buflen); 2791 if (buflen == -1) 2792 return NULL; 2793 if (size != NULL) 2794 *size = buflen; 2795 return buffer; 2796} 2797 2798#endif /* HAVE_WCHAR_H */ 2799 2800PyObject * 2801PyUnicode_FromOrdinal(int ordinal) 2802{ 2803 PyObject *v; 2804 if (ordinal < 0 || ordinal > MAX_UNICODE) { 2805 PyErr_SetString(PyExc_ValueError, 2806 "chr() arg not in range(0x110000)"); 2807 return NULL; 2808 } 2809 2810 if (ordinal < 256) 2811 return get_latin1_char(ordinal); 2812 2813 v = PyUnicode_New(1, ordinal); 2814 if (v == NULL) 2815 return NULL; 2816 PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal); 2817 assert(_PyUnicode_CheckConsistency(v, 1)); 2818 return v; 
2819} 2820 2821PyObject * 2822PyUnicode_FromObject(register PyObject *obj) 2823{ 2824 /* XXX Perhaps we should make this API an alias of 2825 PyObject_Str() instead ?! */ 2826 if (PyUnicode_CheckExact(obj)) { 2827 if (PyUnicode_READY(obj)) 2828 return NULL; 2829 Py_INCREF(obj); 2830 return obj; 2831 } 2832 if (PyUnicode_Check(obj)) { 2833 /* For a Unicode subtype that's not a Unicode object, 2834 return a true Unicode object with the same data. */ 2835 return PyUnicode_Copy(obj); 2836 } 2837 PyErr_Format(PyExc_TypeError, 2838 "Can't convert '%.100s' object to str implicitly", 2839 Py_TYPE(obj)->tp_name); 2840 return NULL; 2841} 2842 2843PyObject * 2844PyUnicode_FromEncodedObject(register PyObject *obj, 2845 const char *encoding, 2846 const char *errors) 2847{ 2848 Py_buffer buffer; 2849 PyObject *v; 2850 2851 if (obj == NULL) { 2852 PyErr_BadInternalCall(); 2853 return NULL; 2854 } 2855 2856 /* Decoding bytes objects is the most common case and should be fast */ 2857 if (PyBytes_Check(obj)) { 2858 if (PyBytes_GET_SIZE(obj) == 0) { 2859 Py_INCREF(unicode_empty); 2860 v = unicode_empty; 2861 } 2862 else { 2863 v = PyUnicode_Decode( 2864 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj), 2865 encoding, errors); 2866 } 2867 return v; 2868 } 2869 2870 if (PyUnicode_Check(obj)) { 2871 PyErr_SetString(PyExc_TypeError, 2872 "decoding str is not supported"); 2873 return NULL; 2874 } 2875 2876 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */ 2877 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) { 2878 PyErr_Format(PyExc_TypeError, 2879 "coercing to str: need bytes, bytearray " 2880 "or buffer-like object, %.80s found", 2881 Py_TYPE(obj)->tp_name); 2882 return NULL; 2883 } 2884 2885 if (buffer.len == 0) { 2886 Py_INCREF(unicode_empty); 2887 v = unicode_empty; 2888 } 2889 else 2890 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors); 2891 2892 PyBuffer_Release(&buffer); 2893 return v; 2894} 2895 2896/* Convert encoding to lower 
case and replace '_' with '-' in order to 2897 catch e.g. UTF_8. Return 0 on error (encoding is longer than lower_len-1), 2898 1 on success. */ 2899static int 2900normalize_encoding(const char *encoding, 2901 char *lower, 2902 size_t lower_len) 2903{ 2904 const char *e; 2905 char *l; 2906 char *l_end; 2907 2908 if (encoding == NULL) { 2909 strcpy(lower, "utf-8"); 2910 return 1; 2911 } 2912 e = encoding; 2913 l = lower; 2914 l_end = &lower[lower_len - 1]; 2915 while (*e) { 2916 if (l == l_end) 2917 return 0; 2918 if (Py_ISUPPER(*e)) { 2919 *l++ = Py_TOLOWER(*e++); 2920 } 2921 else if (*e == '_') { 2922 *l++ = '-'; 2923 e++; 2924 } 2925 else { 2926 *l++ = *e++; 2927 } 2928 } 2929 *l = '\0'; 2930 return 1; 2931} 2932 2933PyObject * 2934PyUnicode_Decode(const char *s, 2935 Py_ssize_t size, 2936 const char *encoding, 2937 const char *errors) 2938{ 2939 PyObject *buffer = NULL, *unicode; 2940 Py_buffer info; 2941 char lower[11]; /* Enough for any encoding shortcut */ 2942 2943 /* Shortcuts for common default encodings */ 2944 if (normalize_encoding(encoding, lower, sizeof(lower))) { 2945 if ((strcmp(lower, "utf-8") == 0) || 2946 (strcmp(lower, "utf8") == 0)) 2947 return PyUnicode_DecodeUTF8(s, size, errors); 2948 else if ((strcmp(lower, "latin-1") == 0) || 2949 (strcmp(lower, "latin1") == 0) || 2950 (strcmp(lower, "iso-8859-1") == 0)) 2951 return PyUnicode_DecodeLatin1(s, size, errors); 2952#ifdef HAVE_MBCS 2953 else if (strcmp(lower, "mbcs") == 0) 2954 return PyUnicode_DecodeMBCS(s, size, errors); 2955#endif 2956 else if (strcmp(lower, "ascii") == 0) 2957 return PyUnicode_DecodeASCII(s, size, errors); 2958 else if (strcmp(lower, "utf-16") == 0) 2959 return PyUnicode_DecodeUTF16(s, size, errors, 0); 2960 else if (strcmp(lower, "utf-32") == 0) 2961 return PyUnicode_DecodeUTF32(s, size, errors, 0); 2962 } 2963 2964 /* Decode via the codec registry */ 2965 buffer = NULL; 2966 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0) 2967 goto onError; 2968 
buffer = PyMemoryView_FromBuffer(&info); 2969 if (buffer == NULL) 2970 goto onError; 2971 unicode = PyCodec_Decode(buffer, encoding, errors); 2972 if (unicode == NULL) 2973 goto onError; 2974 if (!PyUnicode_Check(unicode)) { 2975 PyErr_Format(PyExc_TypeError, 2976 "decoder did not return a str object (type=%.400s)", 2977 Py_TYPE(unicode)->tp_name); 2978 Py_DECREF(unicode); 2979 goto onError; 2980 } 2981 Py_DECREF(buffer); 2982 return unicode_result(unicode); 2983 2984 onError: 2985 Py_XDECREF(buffer); 2986 return NULL; 2987} 2988 2989PyObject * 2990PyUnicode_AsDecodedObject(PyObject *unicode, 2991 const char *encoding, 2992 const char *errors) 2993{ 2994 PyObject *v; 2995 2996 if (!PyUnicode_Check(unicode)) { 2997 PyErr_BadArgument(); 2998 goto onError; 2999 } 3000 3001 if (encoding == NULL) 3002 encoding = PyUnicode_GetDefaultEncoding(); 3003 3004 /* Decode via the codec registry */ 3005 v = PyCodec_Decode(unicode, encoding, errors); 3006 if (v == NULL) 3007 goto onError; 3008 return unicode_result(v); 3009 3010 onError: 3011 return NULL; 3012} 3013 3014PyObject * 3015PyUnicode_AsDecodedUnicode(PyObject *unicode, 3016 const char *encoding, 3017 const char *errors) 3018{ 3019 PyObject *v; 3020 3021 if (!PyUnicode_Check(unicode)) { 3022 PyErr_BadArgument(); 3023 goto onError; 3024 } 3025 3026 if (encoding == NULL) 3027 encoding = PyUnicode_GetDefaultEncoding(); 3028 3029 /* Decode via the codec registry */ 3030 v = PyCodec_Decode(unicode, encoding, errors); 3031 if (v == NULL) 3032 goto onError; 3033 if (!PyUnicode_Check(v)) { 3034 PyErr_Format(PyExc_TypeError, 3035 "decoder did not return a str object (type=%.400s)", 3036 Py_TYPE(v)->tp_name); 3037 Py_DECREF(v); 3038 goto onError; 3039 } 3040 return unicode_result(v); 3041 3042 onError: 3043 return NULL; 3044} 3045 3046PyObject * 3047PyUnicode_Encode(const Py_UNICODE *s, 3048 Py_ssize_t size, 3049 const char *encoding, 3050 const char *errors) 3051{ 3052 PyObject *v, *unicode; 3053 3054 unicode = 
PyUnicode_FromUnicode(s, size); 3055 if (unicode == NULL) 3056 return NULL; 3057 v = PyUnicode_AsEncodedString(unicode, encoding, errors); 3058 Py_DECREF(unicode); 3059 return v; 3060} 3061 3062PyObject * 3063PyUnicode_AsEncodedObject(PyObject *unicode, 3064 const char *encoding, 3065 const char *errors) 3066{ 3067 PyObject *v; 3068 3069 if (!PyUnicode_Check(unicode)) { 3070 PyErr_BadArgument(); 3071 goto onError; 3072 } 3073 3074 if (encoding == NULL) 3075 encoding = PyUnicode_GetDefaultEncoding(); 3076 3077 /* Encode via the codec registry */ 3078 v = PyCodec_Encode(unicode, encoding, errors); 3079 if (v == NULL) 3080 goto onError; 3081 return v; 3082 3083 onError: 3084 return NULL; 3085} 3086 3087PyObject * 3088PyUnicode_EncodeFSDefault(PyObject *unicode) 3089{ 3090#ifdef HAVE_MBCS 3091 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL); 3092#elif defined(__APPLE__) 3093 return _PyUnicode_AsUTF8String(unicode, "surrogateescape"); 3094#else 3095 PyInterpreterState *interp = PyThreadState_GET()->interp; 3096 /* Bootstrap check: if the filesystem codec is implemented in Python, we 3097 cannot use it to encode and decode filenames before it is loaded. Load 3098 the Python codec requires to encode at least its own filename. Use the C 3099 version of the locale codec until the codec registry is initialized and 3100 the Python codec is loaded. 3101 3102 Py_FileSystemDefaultEncoding is shared between all interpreters, we 3103 cannot only rely on it: check also interp->fscodec_initialized for 3104 subinterpreters. 
*/ 3105 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) { 3106 return PyUnicode_AsEncodedString(unicode, 3107 Py_FileSystemDefaultEncoding, 3108 "surrogateescape"); 3109 } 3110 else { 3111 /* locale encoding with surrogateescape */ 3112 wchar_t *wchar; 3113 char *bytes; 3114 PyObject *bytes_obj; 3115 size_t error_pos; 3116 3117 wchar = PyUnicode_AsWideCharString(unicode, NULL); 3118 if (wchar == NULL) 3119 return NULL; 3120 bytes = _Py_wchar2char(wchar, &error_pos); 3121 if (bytes == NULL) { 3122 if (error_pos != (size_t)-1) { 3123 char *errmsg = strerror(errno); 3124 PyObject *exc = NULL; 3125 if (errmsg == NULL) 3126 errmsg = "Py_wchar2char() failed"; 3127 raise_encode_exception(&exc, 3128 "filesystemencoding", unicode, 3129 error_pos, error_pos+1, 3130 errmsg); 3131 Py_XDECREF(exc); 3132 } 3133 else 3134 PyErr_NoMemory(); 3135 PyMem_Free(wchar); 3136 return NULL; 3137 } 3138 PyMem_Free(wchar); 3139 3140 bytes_obj = PyBytes_FromString(bytes); 3141 PyMem_Free(bytes); 3142 return bytes_obj; 3143 } 3144#endif 3145} 3146 3147PyObject * 3148PyUnicode_AsEncodedString(PyObject *unicode, 3149 const char *encoding, 3150 const char *errors) 3151{ 3152 PyObject *v; 3153 char lower[11]; /* Enough for any encoding shortcut */ 3154 3155 if (!PyUnicode_Check(unicode)) { 3156 PyErr_BadArgument(); 3157 return NULL; 3158 } 3159 3160 /* Shortcuts for common default encodings */ 3161 if (normalize_encoding(encoding, lower, sizeof(lower))) { 3162 if ((strcmp(lower, "utf-8") == 0) || 3163 (strcmp(lower, "utf8") == 0)) 3164 { 3165 if (errors == NULL || strcmp(errors, "strict") == 0) 3166 return _PyUnicode_AsUTF8String(unicode, NULL); 3167 else 3168 return _PyUnicode_AsUTF8String(unicode, errors); 3169 } 3170 else if ((strcmp(lower, "latin-1") == 0) || 3171 (strcmp(lower, "latin1") == 0) || 3172 (strcmp(lower, "iso-8859-1") == 0)) 3173 return _PyUnicode_AsLatin1String(unicode, errors); 3174#ifdef HAVE_MBCS 3175 else if (strcmp(lower, "mbcs") == 0) 3176 return 
PyUnicode_EncodeCodePage(CP_ACP, unicode, errors); 3177#endif 3178 else if (strcmp(lower, "ascii") == 0) 3179 return _PyUnicode_AsASCIIString(unicode, errors); 3180 } 3181 3182 /* Encode via the codec registry */ 3183 v = PyCodec_Encode(unicode, encoding, errors); 3184 if (v == NULL) 3185 return NULL; 3186 3187 /* The normal path */ 3188 if (PyBytes_Check(v)) 3189 return v; 3190 3191 /* If the codec returns a buffer, raise a warning and convert to bytes */ 3192 if (PyByteArray_Check(v)) { 3193 int error; 3194 PyObject *b; 3195 3196 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1, 3197 "encoder %s returned bytearray instead of bytes", 3198 encoding); 3199 if (error) { 3200 Py_DECREF(v); 3201 return NULL; 3202 } 3203 3204 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v)); 3205 Py_DECREF(v); 3206 return b; 3207 } 3208 3209 PyErr_Format(PyExc_TypeError, 3210 "encoder did not return a bytes object (type=%.400s)", 3211 Py_TYPE(v)->tp_name); 3212 Py_DECREF(v); 3213 return NULL; 3214} 3215 3216PyObject * 3217PyUnicode_AsEncodedUnicode(PyObject *unicode, 3218 const char *encoding, 3219 const char *errors) 3220{ 3221 PyObject *v; 3222 3223 if (!PyUnicode_Check(unicode)) { 3224 PyErr_BadArgument(); 3225 goto onError; 3226 } 3227 3228 if (encoding == NULL) 3229 encoding = PyUnicode_GetDefaultEncoding(); 3230 3231 /* Encode via the codec registry */ 3232 v = PyCodec_Encode(unicode, encoding, errors); 3233 if (v == NULL) 3234 goto onError; 3235 if (!PyUnicode_Check(v)) { 3236 PyErr_Format(PyExc_TypeError, 3237 "encoder did not return an str object (type=%.400s)", 3238 Py_TYPE(v)->tp_name); 3239 Py_DECREF(v); 3240 goto onError; 3241 } 3242 return v; 3243 3244 onError: 3245 return NULL; 3246} 3247 3248PyObject* 3249PyUnicode_DecodeFSDefault(const char *s) { 3250 Py_ssize_t size = (Py_ssize_t)strlen(s); 3251 return PyUnicode_DecodeFSDefaultAndSize(s, size); 3252} 3253 3254PyObject* 3255PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size) 3256{ 
3257#ifdef HAVE_MBCS 3258 return PyUnicode_DecodeMBCS(s, size, NULL); 3259#elif defined(__APPLE__) 3260 return PyUnicode_DecodeUTF8(s, size, "surrogateescape"); 3261#else 3262 PyInterpreterState *interp = PyThreadState_GET()->interp; 3263 /* Bootstrap check: if the filesystem codec is implemented in Python, we 3264 cannot use it to encode and decode filenames before it is loaded. Load 3265 the Python codec requires to encode at least its own filename. Use the C 3266 version of the locale codec until the codec registry is initialized and 3267 the Python codec is loaded. 3268 3269 Py_FileSystemDefaultEncoding is shared between all interpreters, we 3270 cannot only rely on it: check also interp->fscodec_initialized for 3271 subinterpreters. */ 3272 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) { 3273 return PyUnicode_Decode(s, size, 3274 Py_FileSystemDefaultEncoding, 3275 "surrogateescape"); 3276 } 3277 else { 3278 /* locale encoding with surrogateescape */ 3279 wchar_t *wchar; 3280 PyObject *unicode; 3281 size_t len; 3282 3283 if (s[size] != '\0' || size != strlen(s)) { 3284 PyErr_SetString(PyExc_TypeError, "embedded NUL character"); 3285 return NULL; 3286 } 3287 3288 wchar = _Py_char2wchar(s, &len); 3289 if (wchar == NULL) 3290 return PyErr_NoMemory(); 3291 3292 unicode = PyUnicode_FromWideChar(wchar, len); 3293 PyMem_Free(wchar); 3294 return unicode; 3295 } 3296#endif 3297} 3298 3299 3300int 3301PyUnicode_FSConverter(PyObject* arg, void* addr) 3302{ 3303 PyObject *output = NULL; 3304 Py_ssize_t size; 3305 void *data; 3306 if (arg == NULL) { 3307 Py_DECREF(*(PyObject**)addr); 3308 return 1; 3309 } 3310 if (PyBytes_Check(arg)) { 3311 output = arg; 3312 Py_INCREF(output); 3313 } 3314 else { 3315 arg = PyUnicode_FromObject(arg); 3316 if (!arg) 3317 return 0; 3318 output = PyUnicode_EncodeFSDefault(arg); 3319 Py_DECREF(arg); 3320 if (!output) 3321 return 0; 3322 if (!PyBytes_Check(output)) { 3323 Py_DECREF(output); 3324 
PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes"); 3325 return 0; 3326 } 3327 } 3328 size = PyBytes_GET_SIZE(output); 3329 data = PyBytes_AS_STRING(output); 3330 if (size != strlen(data)) { 3331 PyErr_SetString(PyExc_TypeError, "embedded NUL character"); 3332 Py_DECREF(output); 3333 return 0; 3334 } 3335 *(PyObject**)addr = output; 3336 return Py_CLEANUP_SUPPORTED; 3337} 3338 3339 3340int 3341PyUnicode_FSDecoder(PyObject* arg, void* addr) 3342{ 3343 PyObject *output = NULL; 3344 if (arg == NULL) { 3345 Py_DECREF(*(PyObject**)addr); 3346 return 1; 3347 } 3348 if (PyUnicode_Check(arg)) { 3349 if (PyUnicode_READY(arg)) 3350 return 0; 3351 output = arg; 3352 Py_INCREF(output); 3353 } 3354 else { 3355 arg = PyBytes_FromObject(arg); 3356 if (!arg) 3357 return 0; 3358 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg), 3359 PyBytes_GET_SIZE(arg)); 3360 Py_DECREF(arg); 3361 if (!output) 3362 return 0; 3363 if (!PyUnicode_Check(output)) { 3364 Py_DECREF(output); 3365 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode"); 3366 return 0; 3367 } 3368 } 3369 if (PyUnicode_READY(output) < 0) { 3370 Py_DECREF(output); 3371 return 0; 3372 } 3373 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output), 3374 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) { 3375 PyErr_SetString(PyExc_TypeError, "embedded NUL character"); 3376 Py_DECREF(output); 3377 return 0; 3378 } 3379 *(PyObject**)addr = output; 3380 return Py_CLEANUP_SUPPORTED; 3381} 3382 3383 3384char* 3385PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize) 3386{ 3387 PyObject *bytes; 3388 3389 if (!PyUnicode_Check(unicode)) { 3390 PyErr_BadArgument(); 3391 return NULL; 3392 } 3393 if (PyUnicode_READY(unicode) == -1) 3394 return NULL; 3395 3396 if (PyUnicode_UTF8(unicode) == NULL) { 3397 assert(!PyUnicode_IS_COMPACT_ASCII(unicode)); 3398 bytes = _PyUnicode_AsUTF8String(unicode, "strict"); 3399 if (bytes == NULL) 3400 return NULL; 3401 _PyUnicode_UTF8(unicode) = 
PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1); 3402 if (_PyUnicode_UTF8(unicode) == NULL) { 3403 Py_DECREF(bytes); 3404 return NULL; 3405 } 3406 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes); 3407 Py_MEMCPY(_PyUnicode_UTF8(unicode), 3408 PyBytes_AS_STRING(bytes), 3409 _PyUnicode_UTF8_LENGTH(unicode) + 1); 3410 Py_DECREF(bytes); 3411 } 3412 3413 if (psize) 3414 *psize = PyUnicode_UTF8_LENGTH(unicode); 3415 return PyUnicode_UTF8(unicode); 3416} 3417 3418char* 3419PyUnicode_AsUTF8(PyObject *unicode) 3420{ 3421 return PyUnicode_AsUTF8AndSize(unicode, NULL); 3422} 3423 3424#ifdef Py_DEBUG 3425static int unicode_as_unicode_calls = 0; 3426#endif 3427 3428 3429Py_UNICODE * 3430PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size) 3431{ 3432 const unsigned char *one_byte; 3433#if SIZEOF_WCHAR_T == 4 3434 const Py_UCS2 *two_bytes; 3435#else 3436 const Py_UCS4 *four_bytes; 3437 const Py_UCS4 *ucs4_end; 3438 Py_ssize_t num_surrogates; 3439#endif 3440 wchar_t *w; 3441 wchar_t *wchar_end; 3442 3443 if (!PyUnicode_Check(unicode)) { 3444 PyErr_BadArgument(); 3445 return NULL; 3446 } 3447 if (_PyUnicode_WSTR(unicode) == NULL) { 3448 /* Non-ASCII compact unicode object */ 3449 assert(_PyUnicode_KIND(unicode) != 0); 3450 assert(PyUnicode_IS_READY(unicode)); 3451 3452#ifdef Py_DEBUG 3453 ++unicode_as_unicode_calls; 3454#endif 3455 3456 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) { 3457#if SIZEOF_WCHAR_T == 2 3458 four_bytes = PyUnicode_4BYTE_DATA(unicode); 3459 ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode); 3460 num_surrogates = 0; 3461 3462 for (; four_bytes < ucs4_end; ++four_bytes) { 3463 if (*four_bytes > 0xFFFF) 3464 ++num_surrogates; 3465 } 3466 3467 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC( 3468 sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates)); 3469 if (!_PyUnicode_WSTR(unicode)) { 3470 PyErr_NoMemory(); 3471 return NULL; 3472 } 3473 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + 
num_surrogates; 3474 3475 w = _PyUnicode_WSTR(unicode); 3476 wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode); 3477 four_bytes = PyUnicode_4BYTE_DATA(unicode); 3478 for (; four_bytes < ucs4_end; ++four_bytes, ++w) { 3479 if (*four_bytes > 0xFFFF) { 3480 assert(*four_bytes <= MAX_UNICODE); 3481 /* encode surrogate pair in this case */ 3482 *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes); 3483 *w = Py_UNICODE_LOW_SURROGATE(*four_bytes); 3484 } 3485 else 3486 *w = *four_bytes; 3487 3488 if (w > wchar_end) { 3489 assert(0 && "Miscalculated string end"); 3490 } 3491 } 3492 *w = 0; 3493#else 3494 /* sizeof(wchar_t) == 4 */ 3495 Py_FatalError("Impossible unicode object state, wstr and str " 3496 "should share memory already."); 3497 return NULL; 3498#endif 3499 } 3500 else { 3501 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) * 3502 (_PyUnicode_LENGTH(unicode) + 1)); 3503 if (!_PyUnicode_WSTR(unicode)) { 3504 PyErr_NoMemory(); 3505 return NULL; 3506 } 3507 if (!PyUnicode_IS_COMPACT_ASCII(unicode)) 3508 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode); 3509 w = _PyUnicode_WSTR(unicode); 3510 wchar_end = w + _PyUnicode_LENGTH(unicode); 3511 3512 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) { 3513 one_byte = PyUnicode_1BYTE_DATA(unicode); 3514 for (; w < wchar_end; ++one_byte, ++w) 3515 *w = *one_byte; 3516 /* null-terminate the wstr */ 3517 *w = 0; 3518 } 3519 else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) { 3520#if SIZEOF_WCHAR_T == 4 3521 two_bytes = PyUnicode_2BYTE_DATA(unicode); 3522 for (; w < wchar_end; ++two_bytes, ++w) 3523 *w = *two_bytes; 3524 /* null-terminate the wstr */ 3525 *w = 0; 3526#else 3527 /* sizeof(wchar_t) == 2 */ 3528 PyObject_FREE(_PyUnicode_WSTR(unicode)); 3529 _PyUnicode_WSTR(unicode) = NULL; 3530 Py_FatalError("Impossible unicode object state, wstr " 3531 "and str should share memory already."); 3532 return NULL; 3533#endif 3534 } 3535 else { 3536 assert(0 && "This should never happen."); 3537 } 
3538 } 3539 } 3540 if (size != NULL) 3541 *size = PyUnicode_WSTR_LENGTH(unicode); 3542 return _PyUnicode_WSTR(unicode); 3543} 3544 3545Py_UNICODE * 3546PyUnicode_AsUnicode(PyObject *unicode) 3547{ 3548 return PyUnicode_AsUnicodeAndSize(unicode, NULL); 3549} 3550 3551 3552Py_ssize_t 3553PyUnicode_GetSize(PyObject *unicode) 3554{ 3555 if (!PyUnicode_Check(unicode)) { 3556 PyErr_BadArgument(); 3557 goto onError; 3558 } 3559 return PyUnicode_GET_SIZE(unicode); 3560 3561 onError: 3562 return -1; 3563} 3564 3565Py_ssize_t 3566PyUnicode_GetLength(PyObject *unicode) 3567{ 3568 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) { 3569 PyErr_BadArgument(); 3570 return -1; 3571 } 3572 3573 return PyUnicode_GET_LENGTH(unicode); 3574} 3575 3576Py_UCS4 3577PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index) 3578{ 3579 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) { 3580 PyErr_BadArgument(); 3581 return (Py_UCS4)-1; 3582 } 3583 if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) { 3584 PyErr_SetString(PyExc_IndexError, "string index out of range"); 3585 return (Py_UCS4)-1; 3586 } 3587 return PyUnicode_READ_CHAR(unicode, index); 3588} 3589 3590int 3591PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch) 3592{ 3593 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) { 3594 PyErr_BadArgument(); 3595 return -1; 3596 } 3597 if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) { 3598 PyErr_SetString(PyExc_IndexError, "string index out of range"); 3599 return -1; 3600 } 3601 if (_PyUnicode_Dirty(unicode)) 3602 return -1; 3603 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode), 3604 index, ch); 3605 return 0; 3606} 3607 3608const char * 3609PyUnicode_GetDefaultEncoding(void) 3610{ 3611 return "utf-8"; 3612} 3613 3614/* create or adjust a UnicodeDecodeError */ 3615static void 3616make_decode_exception(PyObject **exceptionObject, 3617 const char *encoding, 3618 const char *input, Py_ssize_t length, 3619 
Py_ssize_t startpos, Py_ssize_t endpos, 3620 const char *reason) 3621{ 3622 if (*exceptionObject == NULL) { 3623 *exceptionObject = PyUnicodeDecodeError_Create( 3624 encoding, input, length, startpos, endpos, reason); 3625 } 3626 else { 3627 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos)) 3628 goto onError; 3629 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos)) 3630 goto onError; 3631 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason)) 3632 goto onError; 3633 } 3634 return; 3635 3636onError: 3637 Py_DECREF(*exceptionObject); 3638 *exceptionObject = NULL; 3639} 3640 3641/* error handling callback helper: 3642 build arguments, call the callback and check the arguments, 3643 if no exception occurred, copy the replacement to the output 3644 and adjust various state variables. 3645 return 0 on success, -1 on error 3646*/ 3647 3648static int 3649unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler, 3650 const char *encoding, const char *reason, 3651 const char **input, const char **inend, Py_ssize_t *startinpos, 3652 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr, 3653 PyObject **output, Py_ssize_t *outpos) 3654{ 3655 static char *argparse = "O!n;decoding error handler must return (str, int) tuple"; 3656 3657 PyObject *restuple = NULL; 3658 PyObject *repunicode = NULL; 3659 Py_ssize_t outsize; 3660 Py_ssize_t insize; 3661 Py_ssize_t requiredsize; 3662 Py_ssize_t newpos; 3663 PyObject *inputobj = NULL; 3664 int res = -1; 3665 3666 if (_PyUnicode_KIND(*output) != PyUnicode_WCHAR_KIND) 3667 outsize = PyUnicode_GET_LENGTH(*output); 3668 else 3669 outsize = _PyUnicode_WSTR_LENGTH(*output); 3670 3671 if (*errorHandler == NULL) { 3672 *errorHandler = PyCodec_LookupError(errors); 3673 if (*errorHandler == NULL) 3674 goto onError; 3675 } 3676 3677 make_decode_exception(exceptionObject, 3678 encoding, 3679 *input, *inend - *input, 3680 *startinpos, *endinpos, 3681 reason); 3682 if (*exceptionObject == 
NULL) 3683 goto onError; 3684 3685 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL); 3686 if (restuple == NULL) 3687 goto onError; 3688 if (!PyTuple_Check(restuple)) { 3689 PyErr_SetString(PyExc_TypeError, &argparse[4]); 3690 goto onError; 3691 } 3692 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos)) 3693 goto onError; 3694 if (PyUnicode_READY(repunicode) < 0) 3695 goto onError; 3696 3697 /* Copy back the bytes variables, which might have been modified by the 3698 callback */ 3699 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject); 3700 if (!inputobj) 3701 goto onError; 3702 if (!PyBytes_Check(inputobj)) { 3703 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes"); 3704 } 3705 *input = PyBytes_AS_STRING(inputobj); 3706 insize = PyBytes_GET_SIZE(inputobj); 3707 *inend = *input + insize; 3708 /* we can DECREF safely, as the exception has another reference, 3709 so the object won't go away. */ 3710 Py_DECREF(inputobj); 3711 3712 if (newpos<0) 3713 newpos = insize+newpos; 3714 if (newpos<0 || newpos>insize) { 3715 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos); 3716 goto onError; 3717 } 3718 3719 if (_PyUnicode_KIND(*output) != PyUnicode_WCHAR_KIND) { 3720 /* need more space? 
(at least enough for what we 3721 have+the replacement+the rest of the string (starting 3722 at the new input position), so we won't have to check space 3723 when there are no errors in the rest of the string) */ 3724 Py_ssize_t replen = PyUnicode_GET_LENGTH(repunicode); 3725 requiredsize = *outpos + replen + insize-newpos; 3726 if (requiredsize > outsize) { 3727 if (requiredsize<2*outsize) 3728 requiredsize = 2*outsize; 3729 if (unicode_resize(output, requiredsize) < 0) 3730 goto onError; 3731 } 3732 if (unicode_widen(output, PyUnicode_MAX_CHAR_VALUE(repunicode)) < 0) 3733 goto onError; 3734 copy_characters(*output, *outpos, repunicode, 0, replen); 3735 *outpos += replen; 3736 } 3737 else { 3738 wchar_t *repwstr; 3739 Py_ssize_t repwlen; 3740 repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen); 3741 if (repwstr == NULL) 3742 goto onError; 3743 /* need more space? (at least enough for what we 3744 have+the replacement+the rest of the string (starting 3745 at the new input position), so we won't have to check space 3746 when there are no errors in the rest of the string) */ 3747 requiredsize = *outpos + repwlen + insize-newpos; 3748 if (requiredsize > outsize) { 3749 if (requiredsize < 2*outsize) 3750 requiredsize = 2*outsize; 3751 if (unicode_resize(output, requiredsize) < 0) 3752 goto onError; 3753 } 3754 wcsncpy(_PyUnicode_WSTR(*output) + *outpos, repwstr, repwlen); 3755 *outpos += repwlen; 3756 } 3757 *endinpos = newpos; 3758 *inptr = *input + newpos; 3759 3760 /* we made it! */ 3761 res = 0; 3762 3763 onError: 3764 Py_XDECREF(restuple); 3765 return res; 3766} 3767 3768/* --- UTF-7 Codec -------------------------------------------------------- */ 3769 3770/* See RFC2152 for details. We encode conservatively and decode liberally. */ 3771 3772/* Three simple macros defining base-64. */ 3773 3774/* Is c a base-64 character? 
*/

/* NOTE: the macros in this section evaluate their argument several times;
   only pass expressions without side effects. */

#define IS_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') ||     \
     ((c) >= 'a' && (c) <= 'z') ||     \
     ((c) >= '0' && (c) <= '9') ||     \
     (c) == '+' || (c) == '/')

/* given that c is a base-64 character, what is its base-64 value? */

#define FROM_BASE64(c)                                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :                           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :                      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :                      \
     (c) == '+' ? 62 : 63)

/* What is the base-64 character of the bottom 6 bits of n? */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string. */

#define DECODE_DIRECT(c)                                \
    ((c) <= 127 && (c) != '+')

/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is indexed by the ASCII code (0-127); each row below covers
 * 16 consecutive codes, named in the comment above the row.
 */

static
char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * The range test comes first so that utf7_category is only indexed with
 * values 1-127; NUL and everything >= 128 are never encoded directly. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      (directWS && (utf7_category[(c)] == 2)) ||        \
      (directO && (utf7_category[(c)] == 1))))

/* Stateless UTF-7 decoding: forwards to the stateful decoder with
   consumed == NULL. */
PyObject *
PyUnicode_DecodeUTF7(const char *s,
                     Py_ssize_t size,
                     const char *errors)
{
    return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
}

/* The decoder.  The only state we preserve is our read position,
 * i.e. how many characters we have consumed.  So if we end in the
 * middle of a shift sequence we have to back off the read position
 * and the output to the beginning of the sequence, otherwise we lose
 * all the shift state (seen bits, number of bits seen, high
 * surrogate).
*/ 3863 3864PyObject * 3865PyUnicode_DecodeUTF7Stateful(const char *s, 3866 Py_ssize_t size, 3867 const char *errors, 3868 Py_ssize_t *consumed) 3869{ 3870 const char *starts = s; 3871 Py_ssize_t startinpos; 3872 Py_ssize_t endinpos; 3873 Py_ssize_t outpos; 3874 const char *e; 3875 PyObject *unicode; 3876 const char *errmsg = ""; 3877 int inShift = 0; 3878 Py_ssize_t shiftOutStart; 3879 unsigned int base64bits = 0; 3880 unsigned long base64buffer = 0; 3881 Py_UCS4 surrogate = 0; 3882 PyObject *errorHandler = NULL; 3883 PyObject *exc = NULL; 3884 3885 /* Start off assuming it's all ASCII. Widen later as necessary. */ 3886 unicode = PyUnicode_New(size, 127); 3887 if (!unicode) 3888 return NULL; 3889 if (size == 0) { 3890 if (consumed) 3891 *consumed = 0; 3892 return unicode; 3893 } 3894 3895 shiftOutStart = outpos = 0; 3896 e = s + size; 3897 3898 while (s < e) { 3899 Py_UCS4 ch; 3900 restart: 3901 ch = (unsigned char) *s; 3902 3903 if (inShift) { /* in a base-64 section */ 3904 if (IS_BASE64(ch)) { /* consume a base-64 character */ 3905 base64buffer = (base64buffer << 6) | FROM_BASE64(ch); 3906 base64bits += 6; 3907 s++; 3908 if (base64bits >= 16) { 3909 /* we have enough bits for a UTF-16 value */ 3910 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16)); 3911 base64bits -= 16; 3912 base64buffer &= (1 << base64bits) - 1; /* clear high bits */ 3913 if (surrogate) { 3914 /* expecting a second surrogate */ 3915 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) { 3916 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh); 3917 if (unicode_putchar(&unicode, &outpos, ch2) < 0) 3918 goto onError; 3919 surrogate = 0; 3920 continue; 3921 } 3922 else { 3923 if (unicode_putchar(&unicode, &outpos, surrogate) < 0) 3924 goto onError; 3925 surrogate = 0; 3926 } 3927 } 3928 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) { 3929 /* first surrogate */ 3930 surrogate = outCh; 3931 } 3932 else { 3933 if (unicode_putchar(&unicode, &outpos, outCh) < 0) 3934 goto onError; 3935 } 3936 } 3937 } 
3938 else { /* now leaving a base-64 section */ 3939 inShift = 0; 3940 s++; 3941 if (surrogate) { 3942 if (unicode_putchar(&unicode, &outpos, surrogate) < 0) 3943 goto onError; 3944 surrogate = 0; 3945 } 3946 if (base64bits > 0) { /* left-over bits */ 3947 if (base64bits >= 6) { 3948 /* We've seen at least one base-64 character */ 3949 errmsg = "partial character in shift sequence"; 3950 goto utf7Error; 3951 } 3952 else { 3953 /* Some bits remain; they should be zero */ 3954 if (base64buffer != 0) { 3955 errmsg = "non-zero padding bits in shift sequence"; 3956 goto utf7Error; 3957 } 3958 } 3959 } 3960 if (ch != '-') { 3961 /* '-' is absorbed; other terminating 3962 characters are preserved */ 3963 if (unicode_putchar(&unicode, &outpos, ch) < 0) 3964 goto onError; 3965 } 3966 } 3967 } 3968 else if ( ch == '+' ) { 3969 startinpos = s-starts; 3970 s++; /* consume '+' */ 3971 if (s < e && *s == '-') { /* '+-' encodes '+' */ 3972 s++; 3973 if (unicode_putchar(&unicode, &outpos, '+') < 0) 3974 goto onError; 3975 } 3976 else { /* begin base64-encoded section */ 3977 inShift = 1; 3978 shiftOutStart = outpos; 3979 base64bits = 0; 3980 } 3981 } 3982 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */ 3983 if (unicode_putchar(&unicode, &outpos, ch) < 0) 3984 goto onError; 3985 s++; 3986 } 3987 else { 3988 startinpos = s-starts; 3989 s++; 3990 errmsg = "unexpected special character"; 3991 goto utf7Error; 3992 } 3993 continue; 3994utf7Error: 3995 endinpos = s-starts; 3996 if (unicode_decode_call_errorhandler( 3997 errors, &errorHandler, 3998 "utf7", errmsg, 3999 &starts, &e, &startinpos, &endinpos, &exc, &s, 4000 &unicode, &outpos)) 4001 goto onError; 4002 } 4003 4004 /* end of string */ 4005 4006 if (inShift && !consumed) { /* in shift sequence, no more to follow */ 4007 /* if we're in an inconsistent state, that's an error */ 4008 if (surrogate || 4009 (base64bits >= 6) || 4010 (base64bits > 0 && base64buffer != 0)) { 4011 endinpos = size; 4012 if 
(unicode_decode_call_errorhandler( 4013 errors, &errorHandler, 4014 "utf7", "unterminated shift sequence", 4015 &starts, &e, &startinpos, &endinpos, &exc, &s, 4016 &unicode, &outpos)) 4017 goto onError; 4018 if (s < e) 4019 goto restart; 4020 } 4021 } 4022 4023 /* return state */ 4024 if (consumed) { 4025 if (inShift) { 4026 outpos = shiftOutStart; /* back off output */ 4027 *consumed = startinpos; 4028 } 4029 else { 4030 *consumed = s-starts; 4031 } 4032 } 4033 4034 if (unicode_resize(&unicode, outpos) < 0) 4035 goto onError; 4036 4037 Py_XDECREF(errorHandler); 4038 Py_XDECREF(exc); 4039 return unicode_result(unicode); 4040 4041 onError: 4042 Py_XDECREF(errorHandler); 4043 Py_XDECREF(exc); 4044 Py_DECREF(unicode); 4045 return NULL; 4046} 4047 4048 4049PyObject * 4050_PyUnicode_EncodeUTF7(PyObject *str, 4051 int base64SetO, 4052 int base64WhiteSpace, 4053 const char *errors) 4054{ 4055 int kind; 4056 void *data; 4057 Py_ssize_t len; 4058 PyObject *v; 4059 Py_ssize_t allocated; 4060 int inShift = 0; 4061 Py_ssize_t i; 4062 unsigned int base64bits = 0; 4063 unsigned long base64buffer = 0; 4064 char * out; 4065 char * start; 4066 4067 if (PyUnicode_READY(str) < 0) 4068 return NULL; 4069 kind = PyUnicode_KIND(str); 4070 data = PyUnicode_DATA(str); 4071 len = PyUnicode_GET_LENGTH(str); 4072 4073 if (len == 0) 4074 return PyBytes_FromStringAndSize(NULL, 0); 4075 4076 /* It might be possible to tighten this worst case */ 4077 allocated = 8 * len; 4078 if (allocated / 8 != len) 4079 return PyErr_NoMemory(); 4080 4081 v = PyBytes_FromStringAndSize(NULL, allocated); 4082 if (v == NULL) 4083 return NULL; 4084 4085 start = out = PyBytes_AS_STRING(v); 4086 for (i = 0; i < len; ++i) { 4087 Py_UCS4 ch = PyUnicode_READ(kind, data, i); 4088 4089 if (inShift) { 4090 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) { 4091 /* shifting out */ 4092 if (base64bits) { /* output remaining bits */ 4093 *out++ = TO_BASE64(base64buffer << (6-base64bits)); 4094 base64buffer = 0; 4095 
base64bits = 0; 4096 } 4097 inShift = 0; 4098 /* Characters not in the BASE64 set implicitly unshift the sequence 4099 so no '-' is required, except if the character is itself a '-' */ 4100 if (IS_BASE64(ch) || ch == '-') { 4101 *out++ = '-'; 4102 } 4103 *out++ = (char) ch; 4104 } 4105 else { 4106 goto encode_char; 4107 } 4108 } 4109 else { /* not in a shift sequence */ 4110 if (ch == '+') { 4111 *out++ = '+'; 4112 *out++ = '-'; 4113 } 4114 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) { 4115 *out++ = (char) ch; 4116 } 4117 else { 4118 *out++ = '+'; 4119 inShift = 1; 4120 goto encode_char; 4121 } 4122 } 4123 continue; 4124encode_char: 4125 if (ch >= 0x10000) { 4126 assert(ch <= MAX_UNICODE); 4127 4128 /* code first surrogate */ 4129 base64bits += 16; 4130 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10); 4131 while (base64bits >= 6) { 4132 *out++ = TO_BASE64(base64buffer >> (base64bits-6)); 4133 base64bits -= 6; 4134 } 4135 /* prepare second surrogate */ 4136 ch = Py_UNICODE_LOW_SURROGATE(ch); 4137 } 4138 base64bits += 16; 4139 base64buffer = (base64buffer << 16) | ch; 4140 while (base64bits >= 6) { 4141 *out++ = TO_BASE64(base64buffer >> (base64bits-6)); 4142 base64bits -= 6; 4143 } 4144 } 4145 if (base64bits) 4146 *out++= TO_BASE64(base64buffer << (6-base64bits) ); 4147 if (inShift) 4148 *out++ = '-'; 4149 if (_PyBytes_Resize(&v, out - start) < 0) 4150 return NULL; 4151 return v; 4152} 4153PyObject * 4154PyUnicode_EncodeUTF7(const Py_UNICODE *s, 4155 Py_ssize_t size, 4156 int base64SetO, 4157 int base64WhiteSpace, 4158 const char *errors) 4159{ 4160 PyObject *result; 4161 PyObject *tmp = PyUnicode_FromUnicode(s, size); 4162 if (tmp == NULL) 4163 return NULL; 4164 result = _PyUnicode_EncodeUTF7(tmp, base64SetO, 4165 base64WhiteSpace, errors); 4166 Py_DECREF(tmp); 4167 return result; 4168} 4169 4170#undef IS_BASE64 4171#undef FROM_BASE64 4172#undef TO_BASE64 4173#undef DECODE_DIRECT 4174#undef ENCODE_DIRECT 4175 4176/* --- UTF-8 
Codec -------------------------------------------------------- */

static
char utf8_code_length[256] = {
    /* Map UTF-8 encoded prefix byte to sequence length.  Zero means
       illegal prefix.  See RFC 3629 for details */
    /* Indexed by the first byte of a sequence; 16 entries per row. */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-F4 + F5-FF */
};

/* Stateless UTF-8 decoding: forwards to the stateful decoder with
   consumed == NULL. */
PyObject *
PyUnicode_DecodeUTF8(const char *s,
                     Py_ssize_t size,
                     const char *errors)
{
    return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
}

/* stringlib/codecs.h is included once per character width; each pass is
   bracketed by the matching *lib.h header and stringlib/undef.h.  The
   ucs1lib_/ucs2lib_/ucs4lib_utf8_try_decode() functions called by
   PyUnicode_DecodeUTF8Stateful() presumably come from these includes
   (not visible here -- verify in stringlib/codecs.h). */
#include "stringlib/ucs1lib.h"
#include "stringlib/codecs.h"
#include "stringlib/undef.h"

#include "stringlib/ucs2lib.h"
#include "stringlib/codecs.h"
#include "stringlib/undef.h"

#include "stringlib/ucs4lib.h"
#include "stringlib/codecs.h"
#include "stringlib/undef.h"

/* Mask to check or force alignment of a pointer to C 'long' boundaries */
#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)

/* Mask to quickly check whether a C 'long' contains a
   non-ASCII, UTF8-encoded char.
*/ 4225#if (SIZEOF_LONG == 8) 4226# define ASCII_CHAR_MASK 0x8080808080808080L 4227#elif (SIZEOF_LONG == 4) 4228# define ASCII_CHAR_MASK 0x80808080L 4229#else 4230# error C 'long' size should be either 4 or 8! 4231#endif 4232 4233/* Scans a UTF-8 string and returns the maximum character to be expected 4234 and the size of the decoded unicode string. 4235 4236 This function doesn't check for errors, these checks are performed in 4237 PyUnicode_DecodeUTF8Stateful. 4238 */ 4239static Py_UCS4 4240utf8_max_char_size_and_char_count(const char *s, Py_ssize_t string_size, 4241 Py_ssize_t *unicode_size) 4242{ 4243 Py_ssize_t char_count = 0; 4244 const unsigned char *p = (const unsigned char *)s; 4245 const unsigned char *end = p + string_size; 4246 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK); 4247 4248 assert(unicode_size != NULL); 4249 4250 /* By having a cascade of independent loops which fallback onto each 4251 other, we minimize the amount of work done in the average loop 4252 iteration, and we also maximize the CPU's ability to predict 4253 branches correctly (because a given condition will have always the 4254 same boolean outcome except perhaps in the last iteration of the 4255 corresponding loop). 4256 In the general case this brings us rather close to decoding 4257 performance pre-PEP 393, despite the two-pass decoding. 4258 4259 Note that the pure ASCII loop is not duplicated once a non-ASCII 4260 character has been encountered. It is actually a pessimization (by 4261 a significant factor) to use this loop on text with many non-ASCII 4262 characters, and it is important to avoid bad performance on valid 4263 utf-8 data (invalid utf-8 being a different can of worms). 4264 */ 4265 4266 /* ASCII */ 4267 for (; p < end; ++p) { 4268 /* Only check value if it's not a ASCII char... */ 4269 if (*p < 0x80) { 4270 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for 4271 an explanation. 
*/ 4272 if (!((size_t) p & LONG_PTR_MASK)) { 4273 /* Help register allocation */ 4274 register const unsigned char *_p = p; 4275 while (_p < aligned_end) { 4276 unsigned long value = *(unsigned long *) _p; 4277 if (value & ASCII_CHAR_MASK) 4278 break; 4279 _p += SIZEOF_LONG; 4280 char_count += SIZEOF_LONG; 4281 } 4282 p = _p; 4283 if (p == end) 4284 break; 4285 } 4286 } 4287 if (*p < 0x80) 4288 ++char_count; 4289 else 4290 goto _ucs1loop; 4291 } 4292 *unicode_size = char_count; 4293 return 127; 4294 4295_ucs1loop: 4296 for (; p < end; ++p) { 4297 if (*p < 0xc4) 4298 char_count += ((*p & 0xc0) != 0x80); 4299 else 4300 goto _ucs2loop; 4301 } 4302 *unicode_size = char_count; 4303 return 255; 4304 4305_ucs2loop: 4306 for (; p < end; ++p) { 4307 if (*p < 0xf0) 4308 char_count += ((*p & 0xc0) != 0x80); 4309 else 4310 goto _ucs4loop; 4311 } 4312 *unicode_size = char_count; 4313 return 65535; 4314 4315_ucs4loop: 4316 for (; p < end; ++p) { 4317 char_count += ((*p & 0xc0) != 0x80); 4318 } 4319 *unicode_size = char_count; 4320 return 65537; 4321} 4322 4323/* Called when we encountered some error that wasn't detected in the original 4324 scan, e.g. an encoded surrogate character. The original maxchar computation 4325 may have been incorrect, so redo it. */ 4326static int 4327refit_partial_string(PyObject **unicode, int kind, void *data, Py_ssize_t n) 4328{ 4329 PyObject *tmp; 4330 Py_ssize_t k; 4331 Py_UCS4 maxchar; 4332 for (k = 0, maxchar = 0; k < n; k++) 4333 maxchar = Py_MAX(maxchar, PyUnicode_READ(kind, data, k)); 4334 tmp = PyUnicode_New(PyUnicode_GET_LENGTH(*unicode), maxchar); 4335 if (tmp == NULL) 4336 return -1; 4337 PyUnicode_CopyCharacters(tmp, 0, *unicode, 0, n); 4338 Py_DECREF(*unicode); 4339 *unicode = tmp; 4340 return 0; 4341} 4342 4343/* Similar to PyUnicode_WRITE but may attempt to widen and resize the string 4344 in case of errors. Implicit parameters: unicode, kind, data, has_errors, 4345 onError. 
Potential resizing overallocates, so the result needs to shrink
   at the end.
*/
/* Without errors this is a plain PyUnicode_WRITE() into kind/data.  Once
   has_errors is set, the write goes through unicode_putchar() on a local
   copy of the index.  Note that `index` is expanded more than once. */
#define WRITE_MAYBE_FAIL(index, value)                                  \
    do {                                                                \
        if (has_errors) {                                               \
            Py_ssize_t pos = index;                                     \
            if (pos > PyUnicode_GET_LENGTH(unicode) &&                  \
                unicode_resize(&unicode, pos + pos/8) < 0)              \
                goto onError;                                           \
            if (unicode_putchar(&unicode, &pos, value) < 0)             \
                goto onError;                                           \
        }                                                               \
        else                                                            \
            PyUnicode_WRITE(kind, data, index, value);                  \
    } while (0)

/* Decode size bytes of UTF-8 from s into a new str object.

   errors     name of the error handler, passed on to
              unicode_decode_call_errorhandler()
   consumed   NULL for one-shot decoding: a truncated sequence at the end
              is reported as "unexpected end of data".  Non-NULL for
              incremental decoding: decoding stops in front of a truncated
              trailing sequence and *consumed receives the number of bytes
              used.

   Outline: a pre-scan computes the expected width and length; pure ASCII
   input is copied directly; otherwise a width-specific fast decoder runs
   and, if it reports errors, the generic loop below takes over from where
   it stopped.

   Returns a new reference, or NULL with an exception set. */
PyObject *
PyUnicode_DecodeUTF8Stateful(const char *s,
                             Py_ssize_t size,
                             const char *errors,
                             Py_ssize_t *consumed)
{
    const char *starts = s;
    int n;
    int k;
    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    const char *e, *aligned_end;
    PyObject *unicode;
    const char *errmsg = "";
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    Py_UCS4 maxchar = 0;
    Py_ssize_t unicode_size;
    Py_ssize_t i;
    int kind;
    void *data;
    int has_errors = 0;

    if (size == 0) {
        if (consumed)
            *consumed = 0;
        return (PyObject *)PyUnicode_New(0, 0);
    }
    maxchar = utf8_max_char_size_and_char_count(s, size, &unicode_size);
    /* When the string is ASCII only, just use memcpy and return.
       unicode_size may be != size if there is an incomplete UTF-8
       sequence at the end of the ASCII block.  */
    if (maxchar < 128 && size == unicode_size) {
        if (consumed)
            *consumed = size;

        if (size == 1)
            return get_latin1_char((unsigned char)s[0]);

        unicode = PyUnicode_New(unicode_size, maxchar);
        if (!unicode)
            return NULL;
        Py_MEMCPY(PyUnicode_1BYTE_DATA(unicode), s, unicode_size);
        assert(_PyUnicode_CheckConsistency(unicode, 1));
        return unicode;
    }

    /* In case of errors, maxchar and size computation might be incorrect;
       code below refits and resizes as necessary. */
    unicode = PyUnicode_New(unicode_size, maxchar);
    if (!unicode)
        return NULL;
    kind = PyUnicode_KIND(unicode);
    data = PyUnicode_DATA(unicode);

    /* Unpack UTF-8 encoded data */
    /* The try_decode helpers advance s and i through their last two
       arguments and return non-zero when they had to stop early. */
    i = 0;
    e = s + size;
    switch (kind) {
    case PyUnicode_1BYTE_KIND:
        has_errors = ucs1lib_utf8_try_decode(s, e, (Py_UCS1 *) data, &s, &i);
        break;
    case PyUnicode_2BYTE_KIND:
        has_errors = ucs2lib_utf8_try_decode(s, e, (Py_UCS2 *) data, &s, &i);
        break;
    case PyUnicode_4BYTE_KIND:
        has_errors = ucs4lib_utf8_try_decode(s, e, (Py_UCS4 *) data, &s, &i);
        break;
    }
    if (!has_errors) {
        /* Ensure the unicode size calculation was correct */
        assert(i == unicode_size);
        assert(s == e);
        if (consumed)
            *consumed = s-starts;
        return unicode;
    }
    /* Fall through to the generic decoding loop for the rest of
       the string */
    if (refit_partial_string(&unicode, kind, data, i) < 0)
        goto onError;

    aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);

    while (s < e) {
        Py_UCS4 ch = (unsigned char)*s;

        if (ch < 0x80) {
            /* Fast path for runs of ASCII characters. Given that common UTF-8
               input will consist of an overwhelming majority of ASCII
               characters, we try to optimize for this case by checking
               as many characters as a C 'long' can contain.
               First, check if we can do an aligned read, as most CPUs have
               a penalty for unaligned reads.
            */
            if (!((size_t) s & LONG_PTR_MASK)) {
                /* Help register allocation */
                register const char *_s = s;
                register Py_ssize_t _i = i;
                while (_s < aligned_end) {
                    /* Read a whole long at a time (either 4 or 8 bytes),
                       and do a fast unrolled copy if it only contains ASCII
                       characters. */
                    unsigned long value = *(unsigned long *) _s;
                    if (value & ASCII_CHAR_MASK)
                        break;
                    WRITE_MAYBE_FAIL(_i+0, _s[0]);
                    WRITE_MAYBE_FAIL(_i+1, _s[1]);
                    WRITE_MAYBE_FAIL(_i+2, _s[2]);
                    WRITE_MAYBE_FAIL(_i+3, _s[3]);
#if (SIZEOF_LONG == 8)
                    WRITE_MAYBE_FAIL(_i+4, _s[4]);
                    WRITE_MAYBE_FAIL(_i+5, _s[5]);
                    WRITE_MAYBE_FAIL(_i+6, _s[6]);
                    WRITE_MAYBE_FAIL(_i+7, _s[7]);
#endif
                    _s += SIZEOF_LONG;
                    _i += SIZEOF_LONG;
                }
                s = _s;
                i = _i;
                if (s == e)
                    break;
                /* re-read: s may have advanced past the byte loaded above */
                ch = (unsigned char)*s;
            }
        }

        if (ch < 0x80) {
            WRITE_MAYBE_FAIL(i++, ch);
            s++;
            continue;
        }

        /* expected sequence length from the lead byte (0 = illegal) */
        n = utf8_code_length[ch];

        if (s + n > e) {
            /* sequence is truncated by the end of the input */
            if (consumed)
                break;
            else {
                errmsg = "unexpected end of data";
                startinpos = s-starts;
                endinpos = startinpos+1;
                /* extend the error range over the continuation bytes that
                   are present */
                for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
                    endinpos++;
                goto utf8Error;
            }
        }

        switch (n) {

        case 0:
            errmsg = "invalid start byte";
            startinpos = s-starts;
            endinpos = startinpos+1;
            goto utf8Error;

        case 1:
            /* length 1 means ASCII, which was handled above */
            errmsg = "internal error";
            startinpos = s-starts;
            endinpos = startinpos+1;
            goto utf8Error;

        case 2:
            if ((s[1] & 0xc0) != 0x80) {
                errmsg = "invalid continuation byte";
                startinpos = s-starts;
                endinpos = startinpos + 1;
                goto utf8Error;
            }
            ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
            assert ((ch > 0x007F) && (ch <= 0x07FF));
            WRITE_MAYBE_FAIL(i++, ch);
            break;

        case 3:
            /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
               will result in surrogates in range d800-dfff. Surrogates are
               not valid UTF-8 so they are rejected.
               See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
               (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
            if ((s[1] & 0xc0) != 0x80 ||
                (s[2] & 0xc0) != 0x80 ||
                ((unsigned char)s[0] == 0xE0 &&
                 (unsigned char)s[1] < 0xA0) ||
                ((unsigned char)s[0] == 0xED &&
                 (unsigned char)s[1] > 0x9F)) {
                errmsg = "invalid continuation byte";
                startinpos = s-starts;
                endinpos = startinpos + 1;

                /* if s[1] first two bits are 1 and 0, then the invalid
                   continuation byte is s[2], so increment endinpos by 1,
                   if not, s[1] is invalid and endinpos doesn't need to
                   be incremented. */
                if ((s[1] & 0xC0) == 0x80)
                    endinpos++;
                goto utf8Error;
            }
            ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
            assert ((ch > 0x07FF) && (ch <= 0xFFFF));
            WRITE_MAYBE_FAIL(i++, ch);
            break;

        case 4:
            /* besides the continuation bytes, reject F0 followed by a byte
               below 0x90 and F4 followed by a byte above 0x8F */
            if ((s[1] & 0xc0) != 0x80 ||
                (s[2] & 0xc0) != 0x80 ||
                (s[3] & 0xc0) != 0x80 ||
                ((unsigned char)s[0] == 0xF0 &&
                 (unsigned char)s[1] < 0x90) ||
                ((unsigned char)s[0] == 0xF4 &&
                 (unsigned char)s[1] > 0x8F)) {
                errmsg = "invalid continuation byte";
                startinpos = s-starts;
                endinpos = startinpos + 1;
                if ((s[1] & 0xC0) == 0x80) {
                    endinpos++;
                    if ((s[2] & 0xC0) == 0x80)
                        endinpos++;
                }
                goto utf8Error;
            }
            ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
                 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
            assert ((ch > 0xFFFF) && (ch <= MAX_UNICODE));

            WRITE_MAYBE_FAIL(i++, ch);
            break;
        }
        s += n;
        continue;

      utf8Error:
        if (!has_errors) {
            if (refit_partial_string(&unicode, kind, data, i) < 0)
                goto onError;
            has_errors = 1;
        }
        if (unicode_decode_call_errorhandler(
                errors, &errorHandler,
                "utf8", errmsg,
                &starts, &e, &startinpos, &endinpos, &exc, &s,
                &unicode, &i))
            goto onError;
        /* Update data because unicode_decode_call_errorhandler might have
           re-created or resized the unicode object. */
        data = PyUnicode_DATA(unicode);
        kind = PyUnicode_KIND(unicode);
        /* e may have been changed by the error handler as well */
        aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
    }
    /* Ensure the unicode_size calculation above was correct: */
    assert(has_errors || i == unicode_size);

    if (consumed)
        *consumed = s-starts;

    /* Adjust length and ready string when it contained errors and
       is of the old resizable kind. */
    if (has_errors) {
        if (PyUnicode_Resize(&unicode, i) < 0)
            goto onError;
    }

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    assert(_PyUnicode_CheckConsistency(unicode, 1));
    return unicode;

  onError:
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    Py_DECREF(unicode);
    return NULL;
}

#undef WRITE_MAYBE_FAIL

#ifdef __APPLE__

/* Simplified UTF-8 decoder using surrogateescape error handler,
   used to decode the command line arguments on Mac OS X.
*/ 4642 4643wchar_t* 4644_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size) 4645{ 4646 int n; 4647 const char *e; 4648 wchar_t *unicode, *p; 4649 4650 /* Note: size will always be longer than the resulting Unicode 4651 character count */ 4652 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) { 4653 PyErr_NoMemory(); 4654 return NULL; 4655 } 4656 unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t)); 4657 if (!unicode) 4658 return NULL; 4659 4660 /* Unpack UTF-8 encoded data */ 4661 p = unicode; 4662 e = s + size; 4663 while (s < e) { 4664 Py_UCS4 ch = (unsigned char)*s; 4665 4666 if (ch < 0x80) { 4667 *p++ = (wchar_t)ch; 4668 s++; 4669 continue; 4670 } 4671 4672 n = utf8_code_length[ch]; 4673 if (s + n > e) { 4674 goto surrogateescape; 4675 } 4676 4677 switch (n) { 4678 case 0: 4679 case 1: 4680 goto surrogateescape; 4681 4682 case 2: 4683 if ((s[1] & 0xc0) != 0x80) 4684 goto surrogateescape; 4685 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f); 4686 assert ((ch > 0x007F) && (ch <= 0x07FF)); 4687 *p++ = (wchar_t)ch; 4688 break; 4689 4690 case 3: 4691 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf 4692 will result in surrogates in range d800-dfff. Surrogates are 4693 not valid UTF-8 so they are rejected. 
4694 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf 4695 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */ 4696 if ((s[1] & 0xc0) != 0x80 || 4697 (s[2] & 0xc0) != 0x80 || 4698 ((unsigned char)s[0] == 0xE0 && 4699 (unsigned char)s[1] < 0xA0) || 4700 ((unsigned char)s[0] == 0xED && 4701 (unsigned char)s[1] > 0x9F)) { 4702 4703 goto surrogateescape; 4704 } 4705 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f); 4706 assert ((ch > 0x07FF) && (ch <= 0xFFFF)); 4707 *p++ = (wchar_t)ch; 4708 break; 4709 4710 case 4: 4711 if ((s[1] & 0xc0) != 0x80 || 4712 (s[2] & 0xc0) != 0x80 || 4713 (s[3] & 0xc0) != 0x80 || 4714 ((unsigned char)s[0] == 0xF0 && 4715 (unsigned char)s[1] < 0x90) || 4716 ((unsigned char)s[0] == 0xF4 && 4717 (unsigned char)s[1] > 0x8F)) { 4718 goto surrogateescape; 4719 } 4720 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) + 4721 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f); 4722 assert ((ch > 0xFFFF) && (ch <= MAX_UNICODE)); 4723 4724#if SIZEOF_WCHAR_T == 4 4725 *p++ = (wchar_t)ch; 4726#else 4727 /* compute and append the two surrogates: */ 4728 *p++ = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch); 4729 *p++ = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch); 4730#endif 4731 break; 4732 } 4733 s += n; 4734 continue; 4735 4736 surrogateescape: 4737 *p++ = 0xDC00 + ch; 4738 s++; 4739 } 4740 *p = L'\0'; 4741 return unicode; 4742} 4743 4744#endif /* __APPLE__ */ 4745 4746/* Primary internal function which creates utf8 encoded bytes objects. 4747 4748 Allocation strategy: if the string is short, convert into a stack buffer 4749 and allocate exactly as much space needed at the end. Else allocate the 4750 maximum possible needed (4 result bytes per Unicode character), and return 4751 the excess memory at the end. 
4752*/ 4753PyObject * 4754_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors) 4755{ 4756#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */ 4757 4758 Py_ssize_t i; /* index into s of next input byte */ 4759 PyObject *result; /* result string object */ 4760 char *p; /* next free byte in output buffer */ 4761 Py_ssize_t nallocated; /* number of result bytes allocated */ 4762 Py_ssize_t nneeded; /* number of result bytes needed */ 4763 char stackbuf[MAX_SHORT_UNICHARS * 4]; 4764 PyObject *errorHandler = NULL; 4765 PyObject *exc = NULL; 4766 int kind; 4767 void *data; 4768 Py_ssize_t size; 4769 PyObject *rep = NULL; 4770 4771 if (!PyUnicode_Check(unicode)) { 4772 PyErr_BadArgument(); 4773 return NULL; 4774 } 4775 4776 if (PyUnicode_READY(unicode) == -1) 4777 return NULL; 4778 4779 if (PyUnicode_UTF8(unicode)) 4780 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode), 4781 PyUnicode_UTF8_LENGTH(unicode)); 4782 4783 kind = PyUnicode_KIND(unicode); 4784 data = PyUnicode_DATA(unicode); 4785 size = PyUnicode_GET_LENGTH(unicode); 4786 4787 assert(size >= 0); 4788 4789 if (size <= MAX_SHORT_UNICHARS) { 4790 /* Write into the stack buffer; nallocated can't overflow. 4791 * At the end, we'll allocate exactly as much heap space as it 4792 * turns out we need. 4793 */ 4794 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int); 4795 result = NULL; /* will allocate after we're done */ 4796 p = stackbuf; 4797 } 4798 else { 4799 /* Overallocate on the heap, and give the excess back at the end. */ 4800 nallocated = size * 4; 4801 if (nallocated / 4 != size) /* overflow! 
*/ 4802 return PyErr_NoMemory(); 4803 result = PyBytes_FromStringAndSize(NULL, nallocated); 4804 if (result == NULL) 4805 return NULL; 4806 p = PyBytes_AS_STRING(result); 4807 } 4808 4809 for (i = 0; i < size;) { 4810 Py_UCS4 ch = PyUnicode_READ(kind, data, i++); 4811 4812 if (ch < 0x80) 4813 /* Encode ASCII */ 4814 *p++ = (char) ch; 4815 4816 else if (ch < 0x0800) { 4817 /* Encode Latin-1 */ 4818 *p++ = (char)(0xc0 | (ch >> 6)); 4819 *p++ = (char)(0x80 | (ch & 0x3f)); 4820 } else if (Py_UNICODE_IS_SURROGATE(ch)) { 4821 Py_ssize_t newpos; 4822 Py_ssize_t repsize, k, startpos; 4823 startpos = i-1; 4824 rep = unicode_encode_call_errorhandler( 4825 errors, &errorHandler, "utf-8", "surrogates not allowed", 4826 unicode, &exc, startpos, startpos+1, &newpos); 4827 if (!rep) 4828 goto error; 4829 4830 if (PyBytes_Check(rep)) 4831 repsize = PyBytes_GET_SIZE(rep); 4832 else 4833 repsize = PyUnicode_GET_LENGTH(rep); 4834 4835 if (repsize > 4) { 4836 Py_ssize_t offset; 4837 4838 if (result == NULL) 4839 offset = p - stackbuf; 4840 else 4841 offset = p - PyBytes_AS_STRING(result); 4842 4843 if (nallocated > PY_SSIZE_T_MAX - repsize + 4) { 4844 /* integer overflow */ 4845 PyErr_NoMemory(); 4846 goto error; 4847 } 4848 nallocated += repsize - 4; 4849 if (result != NULL) { 4850 if (_PyBytes_Resize(&result, nallocated) < 0) 4851 goto error; 4852 } else { 4853 result = PyBytes_FromStringAndSize(NULL, nallocated); 4854 if (result == NULL) 4855 goto error; 4856 Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset); 4857 } 4858 p = PyBytes_AS_STRING(result) + offset; 4859 } 4860 4861 if (PyBytes_Check(rep)) { 4862 char *prep = PyBytes_AS_STRING(rep); 4863 for(k = repsize; k > 0; k--) 4864 *p++ = *prep++; 4865 } else /* rep is unicode */ { 4866 enum PyUnicode_Kind repkind; 4867 void *repdata; 4868 4869 if (PyUnicode_READY(rep) < 0) 4870 goto error; 4871 repkind = PyUnicode_KIND(rep); 4872 repdata = PyUnicode_DATA(rep); 4873 4874 for(k=0; k<repsize; k++) { 4875 Py_UCS4 c = 
PyUnicode_READ(repkind, repdata, k); 4876 if (0x80 <= c) { 4877 raise_encode_exception(&exc, "utf-8", 4878 unicode, 4879 i-1, i, 4880 "surrogates not allowed"); 4881 goto error; 4882 } 4883 *p++ = (char)c; 4884 } 4885 } 4886 Py_CLEAR(rep); 4887 } else if (ch < 0x10000) { 4888 *p++ = (char)(0xe0 | (ch >> 12)); 4889 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); 4890 *p++ = (char)(0x80 | (ch & 0x3f)); 4891 } else /* ch >= 0x10000 */ { 4892 assert(ch <= MAX_UNICODE); 4893 /* Encode UCS4 Unicode ordinals */ 4894 *p++ = (char)(0xf0 | (ch >> 18)); 4895 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f)); 4896 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); 4897 *p++ = (char)(0x80 | (ch & 0x3f)); 4898 } 4899 } 4900 4901 if (result == NULL) { 4902 /* This was stack allocated. */ 4903 nneeded = p - stackbuf; 4904 assert(nneeded <= nallocated); 4905 result = PyBytes_FromStringAndSize(stackbuf, nneeded); 4906 } 4907 else { 4908 /* Cut back to size actually needed. */ 4909 nneeded = p - PyBytes_AS_STRING(result); 4910 assert(nneeded <= nallocated); 4911 _PyBytes_Resize(&result, nneeded); 4912 } 4913 4914 Py_XDECREF(errorHandler); 4915 Py_XDECREF(exc); 4916 return result; 4917 error: 4918 Py_XDECREF(rep); 4919 Py_XDECREF(errorHandler); 4920 Py_XDECREF(exc); 4921 Py_XDECREF(result); 4922 return NULL; 4923 4924#undef MAX_SHORT_UNICHARS 4925} 4926 4927PyObject * 4928PyUnicode_EncodeUTF8(const Py_UNICODE *s, 4929 Py_ssize_t size, 4930 const char *errors) 4931{ 4932 PyObject *v, *unicode; 4933 4934 unicode = PyUnicode_FromUnicode(s, size); 4935 if (unicode == NULL) 4936 return NULL; 4937 v = _PyUnicode_AsUTF8String(unicode, errors); 4938 Py_DECREF(unicode); 4939 return v; 4940} 4941 4942PyObject * 4943PyUnicode_AsUTF8String(PyObject *unicode) 4944{ 4945 return _PyUnicode_AsUTF8String(unicode, NULL); 4946} 4947 4948/* --- UTF-32 Codec ------------------------------------------------------- */ 4949 4950PyObject * 4951PyUnicode_DecodeUTF32(const char *s, 4952 Py_ssize_t size, 4953 const char *errors, 4954 
int *byteorder) 4955{ 4956 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL); 4957} 4958 4959PyObject * 4960PyUnicode_DecodeUTF32Stateful(const char *s, 4961 Py_ssize_t size, 4962 const char *errors, 4963 int *byteorder, 4964 Py_ssize_t *consumed) 4965{ 4966 const char *starts = s; 4967 Py_ssize_t startinpos; 4968 Py_ssize_t endinpos; 4969 Py_ssize_t outpos; 4970 PyObject *unicode; 4971 const unsigned char *q, *e; 4972 int bo = 0; /* assume native ordering by default */ 4973 const char *errmsg = ""; 4974 /* Offsets from q for retrieving bytes in the right order. */ 4975#ifdef BYTEORDER_IS_LITTLE_ENDIAN 4976 int iorder[] = {0, 1, 2, 3}; 4977#else 4978 int iorder[] = {3, 2, 1, 0}; 4979#endif 4980 PyObject *errorHandler = NULL; 4981 PyObject *exc = NULL; 4982 4983 q = (unsigned char *)s; 4984 e = q + size; 4985 4986 if (byteorder) 4987 bo = *byteorder; 4988 4989 /* Check for BOM marks (U+FEFF) in the input and adjust current 4990 byte order setting accordingly. In native mode, the leading BOM 4991 mark is skipped, in all other modes, it is copied to the output 4992 stream as-is (giving a ZWNBSP character). 
*/ 4993 if (bo == 0) { 4994 if (size >= 4) { 4995 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) | 4996 (q[iorder[1]] << 8) | q[iorder[0]]; 4997#ifdef BYTEORDER_IS_LITTLE_ENDIAN 4998 if (bom == 0x0000FEFF) { 4999 q += 4; 5000 bo = -1; 5001 } 5002 else if (bom == 0xFFFE0000) { 5003 q += 4; 5004 bo = 1; 5005 } 5006#else 5007 if (bom == 0x0000FEFF) { 5008 q += 4; 5009 bo = 1; 5010 } 5011 else if (bom == 0xFFFE0000) { 5012 q += 4; 5013 bo = -1; 5014 } 5015#endif 5016 } 5017 } 5018 5019 if (bo == -1) { 5020 /* force LE */ 5021 iorder[0] = 0; 5022 iorder[1] = 1; 5023 iorder[2] = 2; 5024 iorder[3] = 3; 5025 } 5026 else if (bo == 1) { 5027 /* force BE */ 5028 iorder[0] = 3; 5029 iorder[1] = 2; 5030 iorder[2] = 1; 5031 iorder[3] = 0; 5032 } 5033 5034 /* This might be one to much, because of a BOM */ 5035 unicode = PyUnicode_New((size+3)/4, 127); 5036 if (!unicode) 5037 return NULL; 5038 if (size == 0) 5039 return unicode; 5040 outpos = 0; 5041 5042 while (q < e) { 5043 Py_UCS4 ch; 5044 /* remaining bytes at the end? 
(size should be divisible by 4) */ 5045 if (e-q<4) { 5046 if (consumed) 5047 break; 5048 errmsg = "truncated data"; 5049 startinpos = ((const char *)q)-starts; 5050 endinpos = ((const char *)e)-starts; 5051 goto utf32Error; 5052 /* The remaining input chars are ignored if the callback 5053 chooses to skip the input */ 5054 } 5055 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) | 5056 (q[iorder[1]] << 8) | q[iorder[0]]; 5057 5058 if (ch >= 0x110000) 5059 { 5060 errmsg = "codepoint not in range(0x110000)"; 5061 startinpos = ((const char *)q)-starts; 5062 endinpos = startinpos+4; 5063 goto utf32Error; 5064 } 5065 if (unicode_putchar(&unicode, &outpos, ch) < 0) 5066 goto onError; 5067 q += 4; 5068 continue; 5069 utf32Error: 5070 if (unicode_decode_call_errorhandler( 5071 errors, &errorHandler, 5072 "utf32", errmsg, 5073 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q, 5074 &unicode, &outpos)) 5075 goto onError; 5076 } 5077 5078 if (byteorder) 5079 *byteorder = bo; 5080 5081 if (consumed) 5082 *consumed = (const char *)q-starts; 5083 5084 /* Adjust length */ 5085 if (PyUnicode_Resize(&unicode, outpos) < 0) 5086 goto onError; 5087 5088 Py_XDECREF(errorHandler); 5089 Py_XDECREF(exc); 5090 return unicode_result(unicode); 5091 5092 onError: 5093 Py_DECREF(unicode); 5094 Py_XDECREF(errorHandler); 5095 Py_XDECREF(exc); 5096 return NULL; 5097} 5098 5099PyObject * 5100_PyUnicode_EncodeUTF32(PyObject *str, 5101 const char *errors, 5102 int byteorder) 5103{ 5104 int kind; 5105 void *data; 5106 Py_ssize_t len; 5107 PyObject *v; 5108 unsigned char *p; 5109 Py_ssize_t nsize, bytesize, i; 5110 /* Offsets from p for storing byte pairs in the right order. 
*/ 5111#ifdef BYTEORDER_IS_LITTLE_ENDIAN 5112 int iorder[] = {0, 1, 2, 3}; 5113#else 5114 int iorder[] = {3, 2, 1, 0}; 5115#endif 5116 5117#define STORECHAR(CH) \ 5118 do { \ 5119 p[iorder[3]] = ((CH) >> 24) & 0xff; \ 5120 p[iorder[2]] = ((CH) >> 16) & 0xff; \ 5121 p[iorder[1]] = ((CH) >> 8) & 0xff; \ 5122 p[iorder[0]] = (CH) & 0xff; \ 5123 p += 4; \ 5124 } while(0) 5125 5126 if (!PyUnicode_Check(str)) { 5127 PyErr_BadArgument(); 5128 return NULL; 5129 } 5130 if (PyUnicode_READY(str) < 0) 5131 return NULL; 5132 kind = PyUnicode_KIND(str); 5133 data = PyUnicode_DATA(str); 5134 len = PyUnicode_GET_LENGTH(str); 5135 5136 nsize = len + (byteorder == 0); 5137 bytesize = nsize * 4; 5138 if (bytesize / 4 != nsize) 5139 return PyErr_NoMemory(); 5140 v = PyBytes_FromStringAndSize(NULL, bytesize); 5141 if (v == NULL) 5142 return NULL; 5143 5144 p = (unsigned char *)PyBytes_AS_STRING(v); 5145 if (byteorder == 0) 5146 STORECHAR(0xFEFF); 5147 if (len == 0) 5148 goto done; 5149 5150 if (byteorder == -1) { 5151 /* force LE */ 5152 iorder[0] = 0; 5153 iorder[1] = 1; 5154 iorder[2] = 2; 5155 iorder[3] = 3; 5156 } 5157 else if (byteorder == 1) { 5158 /* force BE */ 5159 iorder[0] = 3; 5160 iorder[1] = 2; 5161 iorder[2] = 1; 5162 iorder[3] = 0; 5163 } 5164 5165 for (i = 0; i < len; i++) 5166 STORECHAR(PyUnicode_READ(kind, data, i)); 5167 5168 done: 5169 return v; 5170#undef STORECHAR 5171} 5172 5173PyObject * 5174PyUnicode_EncodeUTF32(const Py_UNICODE *s, 5175 Py_ssize_t size, 5176 const char *errors, 5177 int byteorder) 5178{ 5179 PyObject *result; 5180 PyObject *tmp = PyUnicode_FromUnicode(s, size); 5181 if (tmp == NULL) 5182 return NULL; 5183 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder); 5184 Py_DECREF(tmp); 5185 return result; 5186} 5187 5188PyObject * 5189PyUnicode_AsUTF32String(PyObject *unicode) 5190{ 5191 return _PyUnicode_EncodeUTF32(unicode, NULL, 0); 5192} 5193 5194/* --- UTF-16 Codec ------------------------------------------------------- */ 5195 5196PyObject * 
5197PyUnicode_DecodeUTF16(const char *s, 5198 Py_ssize_t size, 5199 const char *errors, 5200 int *byteorder) 5201{ 5202 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL); 5203} 5204 5205/* Two masks for fast checking of whether a C 'long' may contain 5206 UTF16-encoded surrogate characters. This is an efficient heuristic, 5207 assuming that non-surrogate characters with a code point >= 0x8000 are 5208 rare in most input. 5209 FAST_CHAR_MASK is used when the input is in native byte ordering, 5210 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering. 5211*/ 5212#if (SIZEOF_LONG == 8) 5213# define FAST_CHAR_MASK 0x8000800080008000L 5214# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L 5215#elif (SIZEOF_LONG == 4) 5216# define FAST_CHAR_MASK 0x80008000L 5217# define SWAPPED_FAST_CHAR_MASK 0x00800080L 5218#else 5219# error C 'long' size should be either 4 or 8! 5220#endif 5221 5222PyObject * 5223PyUnicode_DecodeUTF16Stateful(const char *s, 5224 Py_ssize_t size, 5225 const char *errors, 5226 int *byteorder, 5227 Py_ssize_t *consumed) 5228{ 5229 const char *starts = s; 5230 Py_ssize_t startinpos; 5231 Py_ssize_t endinpos; 5232 Py_ssize_t outpos; 5233 PyObject *unicode; 5234 const unsigned char *q, *e, *aligned_end; 5235 int bo = 0; /* assume native ordering by default */ 5236 int native_ordering = 0; 5237 const char *errmsg = ""; 5238 /* Offsets from q for retrieving byte pairs in the right order. 
*/ 5239#ifdef BYTEORDER_IS_LITTLE_ENDIAN 5240 int ihi = 1, ilo = 0; 5241#else 5242 int ihi = 0, ilo = 1; 5243#endif 5244 PyObject *errorHandler = NULL; 5245 PyObject *exc = NULL; 5246 5247 /* Note: size will always be longer than the resulting Unicode 5248 character count */ 5249 unicode = PyUnicode_New(size, 127); 5250 if (!unicode) 5251 return NULL; 5252 if (size == 0) 5253 return unicode; 5254 outpos = 0; 5255 5256 q = (unsigned char *)s; 5257 e = q + size - 1; 5258 5259 if (byteorder) 5260 bo = *byteorder; 5261 5262 /* Check for BOM marks (U+FEFF) in the input and adjust current 5263 byte order setting accordingly. In native mode, the leading BOM 5264 mark is skipped, in all other modes, it is copied to the output 5265 stream as-is (giving a ZWNBSP character). */ 5266 if (bo == 0) { 5267 if (size >= 2) { 5268 const Py_UCS4 bom = (q[ihi] << 8) | q[ilo]; 5269#ifdef BYTEORDER_IS_LITTLE_ENDIAN 5270 if (bom == 0xFEFF) { 5271 q += 2; 5272 bo = -1; 5273 } 5274 else if (bom == 0xFFFE) { 5275 q += 2; 5276 bo = 1; 5277 } 5278#else 5279 if (bom == 0xFEFF) { 5280 q += 2; 5281 bo = 1; 5282 } 5283 else if (bom == 0xFFFE) { 5284 q += 2; 5285 bo = -1; 5286 } 5287#endif 5288 } 5289 } 5290 5291 if (bo == -1) { 5292 /* force LE */ 5293 ihi = 1; 5294 ilo = 0; 5295 } 5296 else if (bo == 1) { 5297 /* force BE */ 5298 ihi = 0; 5299 ilo = 1; 5300 } 5301#ifdef BYTEORDER_IS_LITTLE_ENDIAN 5302 native_ordering = ilo < ihi; 5303#else 5304 native_ordering = ilo > ihi; 5305#endif 5306 5307 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK); 5308 while (q < e) { 5309 Py_UCS4 ch; 5310 /* First check for possible aligned read of a C 'long'. Unaligned 5311 reads are more expensive, better to defer to another iteration. */ 5312 if (!((size_t) q & LONG_PTR_MASK)) { 5313 /* Fast path for runs of non-surrogate chars. 
*/ 5314 register const unsigned char *_q = q; 5315 int kind = PyUnicode_KIND(unicode); 5316 void *data = PyUnicode_DATA(unicode); 5317 while (_q < aligned_end) { 5318 unsigned long block = * (unsigned long *) _q; 5319 unsigned short *pblock = (unsigned short*)█ 5320 Py_UCS4 maxch; 5321 if (native_ordering) { 5322 /* Can use buffer directly */ 5323 if (block & FAST_CHAR_MASK) 5324 break; 5325 } 5326 else { 5327 /* Need to byte-swap */ 5328 unsigned char *_p = (unsigned char*)pblock; 5329 if (block & SWAPPED_FAST_CHAR_MASK) 5330 break; 5331 _p[0] = _q[1]; 5332 _p[1] = _q[0]; 5333 _p[2] = _q[3]; 5334 _p[3] = _q[2]; 5335#if (SIZEOF_LONG == 8) 5336 _p[4] = _q[5]; 5337 _p[5] = _q[4]; 5338 _p[6] = _q[7]; 5339 _p[7] = _q[6]; 5340#endif 5341 } 5342 maxch = Py_MAX(pblock[0], pblock[1]); 5343#if SIZEOF_LONG == 8 5344 maxch = Py_MAX(maxch, Py_MAX(pblock[2], pblock[3])); 5345#endif 5346 if (maxch > PyUnicode_MAX_CHAR_VALUE(unicode)) { 5347 if (unicode_widen(&unicode, maxch) < 0) 5348 goto onError; 5349 kind = PyUnicode_KIND(unicode); 5350 data = PyUnicode_DATA(unicode); 5351 } 5352 PyUnicode_WRITE(kind, data, outpos++, pblock[0]); 5353 PyUnicode_WRITE(kind, data, outpos++, pblock[1]); 5354#if SIZEOF_LONG == 8 5355 PyUnicode_WRITE(kind, data, outpos++, pblock[2]); 5356 PyUnicode_WRITE(kind, data, outpos++, pblock[3]); 5357#endif 5358 _q += SIZEOF_LONG; 5359 } 5360 q = _q; 5361 if (q >= e) 5362 break; 5363 } 5364 ch = (q[ihi] << 8) | q[ilo]; 5365 5366 q += 2; 5367 5368 if (!Py_UNICODE_IS_SURROGATE(ch)) { 5369 if (unicode_putchar(&unicode, &outpos, ch) < 0) 5370 goto onError; 5371 continue; 5372 } 5373 5374 /* UTF-16 code pair: */ 5375 if (q > e) { 5376 errmsg = "unexpected end of data"; 5377 startinpos = (((const char *)q) - 2) - starts; 5378 endinpos = ((const char *)e) + 1 - starts; 5379 goto utf16Error; 5380 } 5381 if (Py_UNICODE_IS_HIGH_SURROGATE(ch)) { 5382 Py_UCS4 ch2 = (q[ihi] << 8) | q[ilo]; 5383 q += 2; 5384 if (Py_UNICODE_IS_LOW_SURROGATE(ch2)) { 5385 if 
(unicode_putchar(&unicode, &outpos, 5386 Py_UNICODE_JOIN_SURROGATES(ch, ch2)) < 0) 5387 goto onError; 5388 continue; 5389 } 5390 else { 5391 errmsg = "illegal UTF-16 surrogate"; 5392 startinpos = (((const char *)q)-4)-starts; 5393 endinpos = startinpos+2; 5394 goto utf16Error; 5395 } 5396 5397 } 5398 errmsg = "illegal encoding"; 5399 startinpos = (((const char *)q)-2)-starts; 5400 endinpos = startinpos+2; 5401 /* Fall through to report the error */ 5402 5403 utf16Error: 5404 if (unicode_decode_call_errorhandler( 5405 errors, 5406 &errorHandler, 5407 "utf16", errmsg, 5408 &starts, 5409 (const char **)&e, 5410 &startinpos, 5411 &endinpos, 5412 &exc, 5413 (const char **)&q, 5414 &unicode, 5415 &outpos)) 5416 goto onError; 5417 } 5418 /* remaining byte at the end? (size should be even) */ 5419 if (e == q) { 5420 if (!consumed) { 5421 errmsg = "truncated data"; 5422 startinpos = ((const char *)q) - starts; 5423 endinpos = ((const char *)e) + 1 - starts; 5424 if (unicode_decode_call_errorhandler( 5425 errors, 5426 &errorHandler, 5427 "utf16", errmsg, 5428 &starts, 5429 (const char **)&e, 5430 &startinpos, 5431 &endinpos, 5432 &exc, 5433 (const char **)&q, 5434 &unicode, 5435 &outpos)) 5436 goto onError; 5437 /* The remaining input chars are ignored if the callback 5438 chooses to skip the input */ 5439 } 5440 } 5441 5442 if (byteorder) 5443 *byteorder = bo; 5444 5445 if (consumed) 5446 *consumed = (const char *)q-starts; 5447 5448 /* Adjust length */ 5449 if (PyUnicode_Resize(&unicode, outpos) < 0) 5450 goto onError; 5451 5452 Py_XDECREF(errorHandler); 5453 Py_XDECREF(exc); 5454 return unicode_result(unicode); 5455 5456 onError: 5457 Py_DECREF(unicode); 5458 Py_XDECREF(errorHandler); 5459 Py_XDECREF(exc); 5460 return NULL; 5461} 5462 5463#undef FAST_CHAR_MASK 5464#undef SWAPPED_FAST_CHAR_MASK 5465 5466PyObject * 5467_PyUnicode_EncodeUTF16(PyObject *str, 5468 const char *errors, 5469 int byteorder) 5470{ 5471 int kind; 5472 void *data; 5473 Py_ssize_t len; 5474 PyObject 
*v; 5475 unsigned char *p; 5476 Py_ssize_t nsize, bytesize; 5477 Py_ssize_t i, pairs; 5478 /* Offsets from p for storing byte pairs in the right order. */ 5479#ifdef BYTEORDER_IS_LITTLE_ENDIAN 5480 int ihi = 1, ilo = 0; 5481#else 5482 int ihi = 0, ilo = 1; 5483#endif 5484 5485#define STORECHAR(CH) \ 5486 do { \ 5487 p[ihi] = ((CH) >> 8) & 0xff; \ 5488 p[ilo] = (CH) & 0xff; \ 5489 p += 2; \ 5490 } while(0) 5491 5492 if (!PyUnicode_Check(str)) { 5493 PyErr_BadArgument(); 5494 return NULL; 5495 } 5496 if (PyUnicode_READY(str) < 0) 5497 return NULL; 5498 kind = PyUnicode_KIND(str); 5499 data = PyUnicode_DATA(str); 5500 len = PyUnicode_GET_LENGTH(str); 5501 5502 pairs = 0; 5503 if (kind == PyUnicode_4BYTE_KIND) 5504 for (i = 0; i < len; i++) 5505 if (PyUnicode_READ(kind, data, i) >= 0x10000) 5506 pairs++; 5507 /* 2 * (len + pairs + (byteorder == 0)) */ 5508 if (len > PY_SSIZE_T_MAX - pairs - (byteorder == 0)) 5509 return PyErr_NoMemory(); 5510 nsize = len + pairs + (byteorder == 0); 5511 bytesize = nsize * 2; 5512 if (bytesize / 2 != nsize) 5513 return PyErr_NoMemory(); 5514 v = PyBytes_FromStringAndSize(NULL, bytesize); 5515 if (v == NULL) 5516 return NULL; 5517 5518 p = (unsigned char *)PyBytes_AS_STRING(v); 5519 if (byteorder == 0) 5520 STORECHAR(0xFEFF); 5521 if (len == 0) 5522 goto done; 5523 5524 if (byteorder == -1) { 5525 /* force LE */ 5526 ihi = 1; 5527 ilo = 0; 5528 } 5529 else if (byteorder == 1) { 5530 /* force BE */ 5531 ihi = 0; 5532 ilo = 1; 5533 } 5534 5535 for (i = 0; i < len; i++) { 5536 Py_UCS4 ch = PyUnicode_READ(kind, data, i); 5537 Py_UCS4 ch2 = 0; 5538 if (ch >= 0x10000) { 5539 ch2 = Py_UNICODE_LOW_SURROGATE(ch); 5540 ch = Py_UNICODE_HIGH_SURROGATE(ch); 5541 } 5542 STORECHAR(ch); 5543 if (ch2) 5544 STORECHAR(ch2); 5545 } 5546 5547 done: 5548 return v; 5549#undef STORECHAR 5550} 5551 5552PyObject * 5553PyUnicode_EncodeUTF16(const Py_UNICODE *s, 5554 Py_ssize_t size, 5555 const char *errors, 5556 int byteorder) 5557{ 5558 PyObject *result; 5559 
PyObject *tmp = PyUnicode_FromUnicode(s, size); 5560 if (tmp == NULL) 5561 return NULL; 5562 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder); 5563 Py_DECREF(tmp); 5564 return result; 5565} 5566 5567PyObject * 5568PyUnicode_AsUTF16String(PyObject *unicode) 5569{ 5570 return _PyUnicode_EncodeUTF16(unicode, NULL, 0); 5571} 5572 5573/* --- Unicode Escape Codec ----------------------------------------------- */ 5574 5575/* Helper function for PyUnicode_DecodeUnicodeEscape, determines 5576 if all the escapes in the string make it still a valid ASCII string. 5577 Returns -1 if any escapes were found which cause the string to 5578 pop out of ASCII range. Otherwise returns the length of the 5579 required buffer to hold the string. 5580 */ 5581static Py_ssize_t 5582length_of_escaped_ascii_string(const char *s, Py_ssize_t size) 5583{ 5584 const unsigned char *p = (const unsigned char *)s; 5585 const unsigned char *end = p + size; 5586 Py_ssize_t length = 0; 5587 5588 if (size < 0) 5589 return -1; 5590 5591 for (; p < end; ++p) { 5592 if (*p > 127) { 5593 /* Non-ASCII */ 5594 return -1; 5595 } 5596 else if (*p != '\\') { 5597 /* Normal character */ 5598 ++length; 5599 } 5600 else { 5601 /* Backslash-escape, check next char */ 5602 ++p; 5603 /* Escape sequence reaches till end of string or 5604 non-ASCII follow-up. 
*/ 5605 if (p >= end || *p > 127) 5606 return -1; 5607 switch (*p) { 5608 case '\n': 5609 /* backslash + \n result in zero characters */ 5610 break; 5611 case '\\': case '\'': case '\"': 5612 case 'b': case 'f': case 't': 5613 case 'n': case 'r': case 'v': case 'a': 5614 ++length; 5615 break; 5616 case '0': case '1': case '2': case '3': 5617 case '4': case '5': case '6': case '7': 5618 case 'x': case 'u': case 'U': case 'N': 5619 /* these do not guarantee ASCII characters */ 5620 return -1; 5621 default: 5622 /* count the backslash + the other character */ 5623 length += 2; 5624 } 5625 } 5626 } 5627 return length; 5628} 5629 5630static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL; 5631 5632PyObject * 5633PyUnicode_DecodeUnicodeEscape(const char *s, 5634 Py_ssize_t size, 5635 const char *errors) 5636{ 5637 const char *starts = s; 5638 Py_ssize_t startinpos; 5639 Py_ssize_t endinpos; 5640 int j; 5641 PyObject *v; 5642 const char *end; 5643 char* message; 5644 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */ 5645 PyObject *errorHandler = NULL; 5646 PyObject *exc = NULL; 5647 Py_ssize_t len; 5648 Py_ssize_t i; 5649 5650 len = length_of_escaped_ascii_string(s, size); 5651 5652 /* After length_of_escaped_ascii_string() there are two alternatives, 5653 either the string is pure ASCII with named escapes like \n, etc. 5654 and we determined it's exact size (common case) 5655 or it contains \x, \u, ... escape sequences. then we create a 5656 legacy wchar string and resize it at the end of this function. */ 5657 if (len >= 0) { 5658 v = PyUnicode_New(len, 127); 5659 if (!v) 5660 goto onError; 5661 assert(PyUnicode_KIND(v) == PyUnicode_1BYTE_KIND); 5662 } 5663 else { 5664 /* Escaped strings will always be longer than the resulting 5665 Unicode string, so we start with size here and then reduce the 5666 length after conversion to the true value. 
5667 (but if the error callback returns a long replacement string 5668 we'll have to allocate more space) */ 5669 v = PyUnicode_New(size, 127); 5670 if (!v) 5671 goto onError; 5672 len = size; 5673 } 5674 5675 if (size == 0) 5676 return v; 5677 i = 0; 5678 end = s + size; 5679 5680 while (s < end) { 5681 unsigned char c; 5682 Py_UCS4 x; 5683 int digits; 5684 5685 /* The only case in which i == ascii_length is a backslash 5686 followed by a newline. */ 5687 assert(i <= len); 5688 5689 /* Non-escape characters are interpreted as Unicode ordinals */ 5690 if (*s != '\\') { 5691 if (unicode_putchar(&v, &i, (unsigned char) *s++) < 0) 5692 goto onError; 5693 continue; 5694 } 5695 5696 startinpos = s-starts; 5697 /* \ - Escapes */ 5698 s++; 5699 c = *s++; 5700 if (s > end) 5701 c = '\0'; /* Invalid after \ */ 5702 5703 /* The only case in which i == ascii_length is a backslash 5704 followed by a newline. */ 5705 assert(i < len || (i == len && c == '\n')); 5706 5707 switch (c) { 5708 5709 /* \x escapes */ 5710#define WRITECHAR(ch) \ 5711 do { \ 5712 if (unicode_putchar(&v, &i, ch) < 0) \ 5713 goto onError; \ 5714 }while(0) 5715 5716 case '\n': break; 5717 case '\\': WRITECHAR('\\'); break; 5718 case '\'': WRITECHAR('\''); break; 5719 case '\"': WRITECHAR('\"'); break; 5720 case 'b': WRITECHAR('\b'); break; 5721 /* FF */ 5722 case 'f': WRITECHAR('\014'); break; 5723 case 't': WRITECHAR('\t'); break; 5724 case 'n': WRITECHAR('\n'); break; 5725 case 'r': WRITECHAR('\r'); break; 5726 /* VT */ 5727 case 'v': WRITECHAR('\013'); break; 5728 /* BEL, not classic C */ 5729 case 'a': WRITECHAR('\007'); break; 5730 5731 /* \OOO (octal) escapes */ 5732 case '0': case '1': case '2': case '3': 5733 case '4': case '5': case '6': case '7': 5734 x = s[-1] - '0'; 5735 if (s < end && '0' <= *s && *s <= '7') { 5736 x = (x<<3) + *s++ - '0'; 5737 if (s < end && '0' <= *s && *s <= '7') 5738 x = (x<<3) + *s++ - '0'; 5739 } 5740 WRITECHAR(x); 5741 break; 5742 5743 /* hex escapes */ 5744 /* \xXX */ 
5745 case 'x': 5746 digits = 2; 5747 message = "truncated \\xXX escape"; 5748 goto hexescape; 5749 5750 /* \uXXXX */ 5751 case 'u': 5752 digits = 4; 5753 message = "truncated \\uXXXX escape"; 5754 goto hexescape; 5755 5756 /* \UXXXXXXXX */ 5757 case 'U': 5758 digits = 8; 5759 message = "truncated \\UXXXXXXXX escape"; 5760 hexescape: 5761 chr = 0; 5762 if (s+digits>end) { 5763 endinpos = size; 5764 if (unicode_decode_call_errorhandler( 5765 errors, &errorHandler, 5766 "unicodeescape", "end of string in escape sequence", 5767 &starts, &end, &startinpos, &endinpos, &exc, &s, 5768 &v, &i)) 5769 goto onError; 5770 goto nextByte; 5771 } 5772 for (j = 0; j < digits; ++j) { 5773 c = (unsigned char) s[j]; 5774 if (!Py_ISXDIGIT(c)) { 5775 endinpos = (s+j+1)-starts; 5776 if (unicode_decode_call_errorhandler( 5777 errors, &errorHandler, 5778 "unicodeescape", message, 5779 &starts, &end, &startinpos, &endinpos, &exc, &s, 5780 &v, &i)) 5781 goto onError; 5782 len = PyUnicode_GET_LENGTH(v); 5783 goto nextByte; 5784 } 5785 chr = (chr<<4) & ~0xF; 5786 if (c >= '0' && c <= '9') 5787 chr += c - '0'; 5788 else if (c >= 'a' && c <= 'f') 5789 chr += 10 + c - 'a'; 5790 else 5791 chr += 10 + c - 'A'; 5792 } 5793 s += j; 5794 if (chr == 0xffffffff && PyErr_Occurred()) 5795 /* _decoding_error will have already written into the 5796 target buffer. 
*/ 5797 break; 5798 store: 5799 /* when we get here, chr is a 32-bit unicode character */ 5800 if (chr <= MAX_UNICODE) { 5801 WRITECHAR(chr); 5802 } else { 5803 endinpos = s-starts; 5804 if (unicode_decode_call_errorhandler( 5805 errors, &errorHandler, 5806 "unicodeescape", "illegal Unicode character", 5807 &starts, &end, &startinpos, &endinpos, &exc, &s, 5808 &v, &i)) 5809 goto onError; 5810 } 5811 break; 5812 5813 /* \N{name} */ 5814 case 'N': 5815 message = "malformed \\N character escape"; 5816 if (ucnhash_CAPI == NULL) { 5817 /* load the unicode data module */ 5818 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import( 5819 PyUnicodeData_CAPSULE_NAME, 1); 5820 if (ucnhash_CAPI == NULL) 5821 goto ucnhashError; 5822 } 5823 if (*s == '{') { 5824 const char *start = s+1; 5825 /* look for the closing brace */ 5826 while (*s != '}' && s < end) 5827 s++; 5828 if (s > start && s < end && *s == '}') { 5829 /* found a name. look it up in the unicode database */ 5830 message = "unknown Unicode character name"; 5831 s++; 5832 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), 5833 &chr, 0)) 5834 goto store; 5835 } 5836 } 5837 endinpos = s-starts; 5838 if (unicode_decode_call_errorhandler( 5839 errors, &errorHandler, 5840 "unicodeescape", message, 5841 &starts, &end, &startinpos, &endinpos, &exc, &s, 5842 &v, &i)) 5843 goto onError; 5844 break; 5845 5846 default: 5847 if (s > end) { 5848 message = "\\ at end of string"; 5849 s--; 5850 endinpos = s-starts; 5851 if (unicode_decode_call_errorhandler( 5852 errors, &errorHandler, 5853 "unicodeescape", message, 5854 &starts, &end, &startinpos, &endinpos, &exc, &s, 5855 &v, &i)) 5856 goto onError; 5857 } 5858 else { 5859 WRITECHAR('\\'); 5860 WRITECHAR(s[-1]); 5861 } 5862 break; 5863 } 5864 nextByte: 5865 ; 5866 } 5867#undef WRITECHAR 5868 5869 if (PyUnicode_Resize(&v, i) < 0) 5870 goto onError; 5871 Py_XDECREF(errorHandler); 5872 Py_XDECREF(exc); 5873 return unicode_result(v); 5874 5875 ucnhashError: 5876 
PyErr_SetString( 5877 PyExc_UnicodeError, 5878 "\\N escapes not supported (can't load unicodedata module)" 5879 ); 5880 Py_XDECREF(v); 5881 Py_XDECREF(errorHandler); 5882 Py_XDECREF(exc); 5883 return NULL; 5884 5885 onError: 5886 Py_XDECREF(v); 5887 Py_XDECREF(errorHandler); 5888 Py_XDECREF(exc); 5889 return NULL; 5890} 5891 5892/* Return a Unicode-Escape string version of the Unicode object. 5893 5894 If quotes is true, the string is enclosed in u"" or u'' quotes as 5895 appropriate. 5896 5897*/ 5898 5899PyObject * 5900PyUnicode_AsUnicodeEscapeString(PyObject *unicode) 5901{ 5902 Py_ssize_t i, len; 5903 PyObject *repr; 5904 char *p; 5905 int kind; 5906 void *data; 5907 Py_ssize_t expandsize = 0; 5908 5909 /* Initial allocation is based on the longest-possible unichr 5910 escape. 5911 5912 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source 5913 unichr, so in this case it's the longest unichr escape. In 5914 narrow (UTF-16) builds this is five chars per source unichr 5915 since there are two unichrs in the surrogate pair, so in narrow 5916 (UTF-16) builds it's not the longest unichr escape. 5917 5918 In wide or narrow builds '\uxxxx' is 6 chars per source unichr, 5919 so in the narrow (UTF-16) build case it's the longest unichr 5920 escape. 
5921 */ 5922 5923 if (!PyUnicode_Check(unicode)) { 5924 PyErr_BadArgument(); 5925 return NULL; 5926 } 5927 if (PyUnicode_READY(unicode) < 0) 5928 return NULL; 5929 len = PyUnicode_GET_LENGTH(unicode); 5930 kind = PyUnicode_KIND(unicode); 5931 data = PyUnicode_DATA(unicode); 5932 switch(kind) { 5933 case PyUnicode_1BYTE_KIND: expandsize = 4; break; 5934 case PyUnicode_2BYTE_KIND: expandsize = 6; break; 5935 case PyUnicode_4BYTE_KIND: expandsize = 10; break; 5936 } 5937 5938 if (len == 0) 5939 return PyBytes_FromStringAndSize(NULL, 0); 5940 5941 if (len > (PY_SSIZE_T_MAX - 2 - 1) / expandsize) 5942 return PyErr_NoMemory(); 5943 5944 repr = PyBytes_FromStringAndSize(NULL, 5945 2 5946 + expandsize*len 5947 + 1); 5948 if (repr == NULL) 5949 return NULL; 5950 5951 p = PyBytes_AS_STRING(repr); 5952 5953 for (i = 0; i < len; i++) { 5954 Py_UCS4 ch = PyUnicode_READ(kind, data, i); 5955 5956 /* Escape backslashes */ 5957 if (ch == '\\') { 5958 *p++ = '\\'; 5959 *p++ = (char) ch; 5960 continue; 5961 } 5962 5963 /* Map 21-bit characters to '\U00xxxxxx' */ 5964 else if (ch >= 0x10000) { 5965 assert(ch <= MAX_UNICODE); 5966 *p++ = '\\'; 5967 *p++ = 'U'; 5968 *p++ = Py_hexdigits[(ch >> 28) & 0x0000000F]; 5969 *p++ = Py_hexdigits[(ch >> 24) & 0x0000000F]; 5970 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F]; 5971 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F]; 5972 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F]; 5973 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F]; 5974 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F]; 5975 *p++ = Py_hexdigits[ch & 0x0000000F]; 5976 continue; 5977 } 5978 5979 /* Map 16-bit characters to '\uxxxx' */ 5980 if (ch >= 256) { 5981 *p++ = '\\'; 5982 *p++ = 'u'; 5983 *p++ = Py_hexdigits[(ch >> 12) & 0x000F]; 5984 *p++ = Py_hexdigits[(ch >> 8) & 0x000F]; 5985 *p++ = Py_hexdigits[(ch >> 4) & 0x000F]; 5986 *p++ = Py_hexdigits[ch & 0x000F]; 5987 } 5988 5989 /* Map special whitespace to '\t', \n', '\r' */ 5990 else if (ch == '\t') { 5991 *p++ = '\\'; 5992 *p++ = 't'; 5993 } 
5994 else if (ch == '\n') { 5995 *p++ = '\\'; 5996 *p++ = 'n'; 5997 } 5998 else if (ch == '\r') { 5999 *p++ = '\\'; 6000 *p++ = 'r'; 6001 } 6002 6003 /* Map non-printable US ASCII to '\xhh' */ 6004 else if (ch < ' ' || ch >= 0x7F) { 6005 *p++ = '\\'; 6006 *p++ = 'x'; 6007 *p++ = Py_hexdigits[(ch >> 4) & 0x000F]; 6008 *p++ = Py_hexdigits[ch & 0x000F]; 6009 } 6010 6011 /* Copy everything else as-is */ 6012 else 6013 *p++ = (char) ch; 6014 } 6015 6016 assert(p - PyBytes_AS_STRING(repr) > 0); 6017 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) 6018 return NULL; 6019 return repr; 6020} 6021 6022PyObject * 6023PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s, 6024 Py_ssize_t size) 6025{ 6026 PyObject *result; 6027 PyObject *tmp = PyUnicode_FromUnicode(s, size); 6028 if (tmp == NULL) 6029 return NULL; 6030 result = PyUnicode_AsUnicodeEscapeString(tmp); 6031 Py_DECREF(tmp); 6032 return result; 6033} 6034 6035/* --- Raw Unicode Escape Codec ------------------------------------------- */ 6036 6037PyObject * 6038PyUnicode_DecodeRawUnicodeEscape(const char *s, 6039 Py_ssize_t size, 6040 const char *errors) 6041{ 6042 const char *starts = s; 6043 Py_ssize_t startinpos; 6044 Py_ssize_t endinpos; 6045 Py_ssize_t outpos; 6046 PyObject *v; 6047 const char *end; 6048 const char *bs; 6049 PyObject *errorHandler = NULL; 6050 PyObject *exc = NULL; 6051 6052 /* Escaped strings will always be longer than the resulting 6053 Unicode string, so we start with size here and then reduce the 6054 length after conversion to the true value. 
(But decoding error 6055 handler might have to resize the string) */ 6056 v = PyUnicode_New(size, 127); 6057 if (v == NULL) 6058 goto onError; 6059 if (size == 0) 6060 return v; 6061 outpos = 0; 6062 end = s + size; 6063 while (s < end) { 6064 unsigned char c; 6065 Py_UCS4 x; 6066 int i; 6067 int count; 6068 6069 /* Non-escape characters are interpreted as Unicode ordinals */ 6070 if (*s != '\\') { 6071 if (unicode_putchar(&v, &outpos, (unsigned char)*s++) < 0) 6072 goto onError; 6073 continue; 6074 } 6075 startinpos = s-starts; 6076 6077 /* \u-escapes are only interpreted iff the number of leading 6078 backslashes if odd */ 6079 bs = s; 6080 for (;s < end;) { 6081 if (*s != '\\') 6082 break; 6083 if (unicode_putchar(&v, &outpos, (unsigned char)*s++) < 0) 6084 goto onError; 6085 } 6086 if (((s - bs) & 1) == 0 || 6087 s >= end || 6088 (*s != 'u' && *s != 'U')) { 6089 continue; 6090 } 6091 outpos--; 6092 count = *s=='u' ? 4 : 8; 6093 s++; 6094 6095 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */ 6096 for (x = 0, i = 0; i < count; ++i, ++s) { 6097 c = (unsigned char)*s; 6098 if (!Py_ISXDIGIT(c)) { 6099 endinpos = s-starts; 6100 if (unicode_decode_call_errorhandler( 6101 errors, &errorHandler, 6102 "rawunicodeescape", "truncated \\uXXXX", 6103 &starts, &end, &startinpos, &endinpos, &exc, &s, 6104 &v, &outpos)) 6105 goto onError; 6106 goto nextByte; 6107 } 6108 x = (x<<4) & ~0xF; 6109 if (c >= '0' && c <= '9') 6110 x += c - '0'; 6111 else if (c >= 'a' && c <= 'f') 6112 x += 10 + c - 'a'; 6113 else 6114 x += 10 + c - 'A'; 6115 } 6116 if (x <= MAX_UNICODE) { 6117 if (unicode_putchar(&v, &outpos, x) < 0) 6118 goto onError; 6119 } else { 6120 endinpos = s-starts; 6121 if (unicode_decode_call_errorhandler( 6122 errors, &errorHandler, 6123 "rawunicodeescape", "\\Uxxxxxxxx out of range", 6124 &starts, &end, &startinpos, &endinpos, &exc, &s, 6125 &v, &outpos)) 6126 goto onError; 6127 } 6128 nextByte: 6129 ; 6130 } 6131 if (PyUnicode_Resize(&v, outpos) < 0) 6132 goto onError; 
6133 Py_XDECREF(errorHandler); 6134 Py_XDECREF(exc); 6135 return unicode_result(v); 6136 6137 onError: 6138 Py_XDECREF(v); 6139 Py_XDECREF(errorHandler); 6140 Py_XDECREF(exc); 6141 return NULL; 6142} 6143 6144 6145PyObject * 6146PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode) 6147{ 6148 PyObject *repr; 6149 char *p; 6150 char *q; 6151 Py_ssize_t expandsize, pos; 6152 int kind; 6153 void *data; 6154 Py_ssize_t len; 6155 6156 if (!PyUnicode_Check(unicode)) { 6157 PyErr_BadArgument(); 6158 return NULL; 6159 } 6160 if (PyUnicode_READY(unicode) < 0) 6161 return NULL; 6162 kind = PyUnicode_KIND(unicode); 6163 data = PyUnicode_DATA(unicode); 6164 len = PyUnicode_GET_LENGTH(unicode); 6165 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6 6166 bytes, and 1 byte characters 4. */ 6167 expandsize = kind * 2 + 2; 6168 6169 if (len > PY_SSIZE_T_MAX / expandsize) 6170 return PyErr_NoMemory(); 6171 6172 repr = PyBytes_FromStringAndSize(NULL, expandsize * len); 6173 if (repr == NULL) 6174 return NULL; 6175 if (len == 0) 6176 return repr; 6177 6178 p = q = PyBytes_AS_STRING(repr); 6179 for (pos = 0; pos < len; pos++) { 6180 Py_UCS4 ch = PyUnicode_READ(kind, data, pos); 6181 /* Map 32-bit characters to '\Uxxxxxxxx' */ 6182 if (ch >= 0x10000) { 6183 assert(ch <= MAX_UNICODE); 6184 *p++ = '\\'; 6185 *p++ = 'U'; 6186 *p++ = Py_hexdigits[(ch >> 28) & 0xf]; 6187 *p++ = Py_hexdigits[(ch >> 24) & 0xf]; 6188 *p++ = Py_hexdigits[(ch >> 20) & 0xf]; 6189 *p++ = Py_hexdigits[(ch >> 16) & 0xf]; 6190 *p++ = Py_hexdigits[(ch >> 12) & 0xf]; 6191 *p++ = Py_hexdigits[(ch >> 8) & 0xf]; 6192 *p++ = Py_hexdigits[(ch >> 4) & 0xf]; 6193 *p++ = Py_hexdigits[ch & 15]; 6194 } 6195 /* Map 16-bit characters to '\uxxxx' */ 6196 else if (ch >= 256) { 6197 *p++ = '\\'; 6198 *p++ = 'u'; 6199 *p++ = Py_hexdigits[(ch >> 12) & 0xf]; 6200 *p++ = Py_hexdigits[(ch >> 8) & 0xf]; 6201 *p++ = Py_hexdigits[(ch >> 4) & 0xf]; 6202 *p++ = Py_hexdigits[ch & 15]; 6203 } 6204 /* Copy everything else 
as-is */ 6205 else 6206 *p++ = (char) ch; 6207 } 6208 6209 assert(p > q); 6210 if (_PyBytes_Resize(&repr, p - q) < 0) 6211 return NULL; 6212 return repr; 6213} 6214 6215PyObject * 6216PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s, 6217 Py_ssize_t size) 6218{ 6219 PyObject *result; 6220 PyObject *tmp = PyUnicode_FromUnicode(s, size); 6221 if (tmp == NULL) 6222 return NULL; 6223 result = PyUnicode_AsRawUnicodeEscapeString(tmp); 6224 Py_DECREF(tmp); 6225 return result; 6226} 6227 6228/* --- Unicode Internal Codec ------------------------------------------- */ 6229 6230PyObject * 6231_PyUnicode_DecodeUnicodeInternal(const char *s, 6232 Py_ssize_t size, 6233 const char *errors) 6234{ 6235 const char *starts = s; 6236 Py_ssize_t startinpos; 6237 Py_ssize_t endinpos; 6238 Py_ssize_t outpos; 6239 PyObject *v; 6240 const char *end; 6241 const char *reason; 6242 PyObject *errorHandler = NULL; 6243 PyObject *exc = NULL; 6244 6245 if (PyErr_WarnEx(PyExc_DeprecationWarning, 6246 "unicode_internal codec has been deprecated", 6247 1)) 6248 return NULL; 6249 6250 /* XXX overflow detection missing */ 6251 v = PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE, 127); 6252 if (v == NULL) 6253 goto onError; 6254 if (PyUnicode_GET_LENGTH(v) == 0) 6255 return v; 6256 outpos = 0; 6257 end = s + size; 6258 6259 while (s < end) { 6260 Py_UNICODE uch; 6261 Py_UCS4 ch; 6262 /* We copy the raw representation one byte at a time because the 6263 pointer may be unaligned (see test_codeccallbacks). */ 6264 ((char *) &uch)[0] = s[0]; 6265 ((char *) &uch)[1] = s[1]; 6266#ifdef Py_UNICODE_WIDE 6267 ((char *) &uch)[2] = s[2]; 6268 ((char *) &uch)[3] = s[3]; 6269#endif 6270 ch = uch; 6271 6272 /* We have to sanity check the raw data, otherwise doom looms for 6273 some malformed UCS-4 data. 
*/ 6274 if ( 6275#ifdef Py_UNICODE_WIDE 6276 ch > 0x10ffff || 6277#endif 6278 end-s < Py_UNICODE_SIZE 6279 ) 6280 { 6281 startinpos = s - starts; 6282 if (end-s < Py_UNICODE_SIZE) { 6283 endinpos = end-starts; 6284 reason = "truncated input"; 6285 } 6286 else { 6287 endinpos = s - starts + Py_UNICODE_SIZE; 6288 reason = "illegal code point (> 0x10FFFF)"; 6289 } 6290 if (unicode_decode_call_errorhandler( 6291 errors, &errorHandler, 6292 "unicode_internal", reason, 6293 &starts, &end, &startinpos, &endinpos, &exc, &s, 6294 &v, &outpos)) 6295 goto onError; 6296 continue; 6297 } 6298 6299 s += Py_UNICODE_SIZE; 6300#ifndef Py_UNICODE_WIDE 6301 if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && s < end) 6302 { 6303 Py_UNICODE uch2; 6304 ((char *) &uch2)[0] = s[0]; 6305 ((char *) &uch2)[1] = s[1]; 6306 if (Py_UNICODE_IS_LOW_SURROGATE(uch2)) 6307 { 6308 ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2); 6309 s += Py_UNICODE_SIZE; 6310 } 6311 } 6312#endif 6313 6314 if (unicode_putchar(&v, &outpos, ch) < 0) 6315 goto onError; 6316 } 6317 6318 if (PyUnicode_Resize(&v, outpos) < 0) 6319 goto onError; 6320 Py_XDECREF(errorHandler); 6321 Py_XDECREF(exc); 6322 return unicode_result(v); 6323 6324 onError: 6325 Py_XDECREF(v); 6326 Py_XDECREF(errorHandler); 6327 Py_XDECREF(exc); 6328 return NULL; 6329} 6330 6331/* --- Latin-1 Codec ------------------------------------------------------ */ 6332 6333PyObject * 6334PyUnicode_DecodeLatin1(const char *s, 6335 Py_ssize_t size, 6336 const char *errors) 6337{ 6338 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. 
*/ 6339 return _PyUnicode_FromUCS1((unsigned char*)s, size); 6340} 6341 6342/* create or adjust a UnicodeEncodeError */ 6343static void 6344make_encode_exception(PyObject **exceptionObject, 6345 const char *encoding, 6346 PyObject *unicode, 6347 Py_ssize_t startpos, Py_ssize_t endpos, 6348 const char *reason) 6349{ 6350 if (*exceptionObject == NULL) { 6351 *exceptionObject = PyObject_CallFunction( 6352 PyExc_UnicodeEncodeError, "sOnns", 6353 encoding, unicode, startpos, endpos, reason); 6354 } 6355 else { 6356 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos)) 6357 goto onError; 6358 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos)) 6359 goto onError; 6360 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason)) 6361 goto onError; 6362 return; 6363 onError: 6364 Py_DECREF(*exceptionObject); 6365 *exceptionObject = NULL; 6366 } 6367} 6368 6369/* raises a UnicodeEncodeError */ 6370static void 6371raise_encode_exception(PyObject **exceptionObject, 6372 const char *encoding, 6373 PyObject *unicode, 6374 Py_ssize_t startpos, Py_ssize_t endpos, 6375 const char *reason) 6376{ 6377 make_encode_exception(exceptionObject, 6378 encoding, unicode, startpos, endpos, reason); 6379 if (*exceptionObject != NULL) 6380 PyCodec_StrictErrors(*exceptionObject); 6381} 6382 6383/* error handling callback helper: 6384 build arguments, call the callback and check the arguments, 6385 put the result into newpos and return the replacement string, which 6386 has to be freed by the caller */ 6387static PyObject * 6388unicode_encode_call_errorhandler(const char *errors, 6389 PyObject **errorHandler, 6390 const char *encoding, const char *reason, 6391 PyObject *unicode, PyObject **exceptionObject, 6392 Py_ssize_t startpos, Py_ssize_t endpos, 6393 Py_ssize_t *newpos) 6394{ 6395 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple"; 6396 Py_ssize_t len; 6397 PyObject *restuple; 6398 PyObject *resunicode; 6399 6400 if (*errorHandler == NULL) 
{ 6401 *errorHandler = PyCodec_LookupError(errors); 6402 if (*errorHandler == NULL) 6403 return NULL; 6404 } 6405 6406 if (PyUnicode_READY(unicode) < 0) 6407 return NULL; 6408 len = PyUnicode_GET_LENGTH(unicode); 6409 6410 make_encode_exception(exceptionObject, 6411 encoding, unicode, startpos, endpos, reason); 6412 if (*exceptionObject == NULL) 6413 return NULL; 6414 6415 restuple = PyObject_CallFunctionObjArgs( 6416 *errorHandler, *exceptionObject, NULL); 6417 if (restuple == NULL) 6418 return NULL; 6419 if (!PyTuple_Check(restuple)) { 6420 PyErr_SetString(PyExc_TypeError, &argparse[3]); 6421 Py_DECREF(restuple); 6422 return NULL; 6423 } 6424 if (!PyArg_ParseTuple(restuple, argparse, 6425 &resunicode, newpos)) { 6426 Py_DECREF(restuple); 6427 return NULL; 6428 } 6429 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) { 6430 PyErr_SetString(PyExc_TypeError, &argparse[3]); 6431 Py_DECREF(restuple); 6432 return NULL; 6433 } 6434 if (*newpos<0) 6435 *newpos = len + *newpos; 6436 if (*newpos<0 || *newpos>len) { 6437 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos); 6438 Py_DECREF(restuple); 6439 return NULL; 6440 } 6441 Py_INCREF(resunicode); 6442 Py_DECREF(restuple); 6443 return resunicode; 6444} 6445 6446static PyObject * 6447unicode_encode_ucs1(PyObject *unicode, 6448 const char *errors, 6449 unsigned int limit) 6450{ 6451 /* input state */ 6452 Py_ssize_t pos=0, size; 6453 int kind; 6454 void *data; 6455 /* output object */ 6456 PyObject *res; 6457 /* pointer into the output */ 6458 char *str; 6459 /* current output position */ 6460 Py_ssize_t ressize; 6461 const char *encoding = (limit == 256) ? "latin-1" : "ascii"; 6462 const char *reason = (limit == 256) ? 
"ordinal not in range(256)" : "ordinal not in range(128)"; 6463 PyObject *errorHandler = NULL; 6464 PyObject *exc = NULL; 6465 /* the following variable is used for caching string comparisons 6466 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */ 6467 int known_errorHandler = -1; 6468 6469 if (PyUnicode_READY(unicode) < 0) 6470 return NULL; 6471 size = PyUnicode_GET_LENGTH(unicode); 6472 kind = PyUnicode_KIND(unicode); 6473 data = PyUnicode_DATA(unicode); 6474 /* allocate enough for a simple encoding without 6475 replacements, if we need more, we'll resize */ 6476 if (size == 0) 6477 return PyBytes_FromStringAndSize(NULL, 0); 6478 res = PyBytes_FromStringAndSize(NULL, size); 6479 if (res == NULL) 6480 return NULL; 6481 str = PyBytes_AS_STRING(res); 6482 ressize = size; 6483 6484 while (pos < size) { 6485 Py_UCS4 c = PyUnicode_READ(kind, data, pos); 6486 6487 /* can we encode this? */ 6488 if (c<limit) { 6489 /* no overflow check, because we know that the space is enough */ 6490 *str++ = (char)c; 6491 ++pos; 6492 } 6493 else { 6494 Py_ssize_t requiredsize; 6495 PyObject *repunicode; 6496 Py_ssize_t repsize, newpos, respos, i; 6497 /* startpos for collecting unencodable chars */ 6498 Py_ssize_t collstart = pos; 6499 Py_ssize_t collend = pos; 6500 /* find all unecodable characters */ 6501 while ((collend < size) && (PyUnicode_READ(kind, data, collend)>=limit)) 6502 ++collend; 6503 /* cache callback name lookup (if not done yet, i.e. 
it's the first error) */ 6504 if (known_errorHandler==-1) { 6505 if ((errors==NULL) || (!strcmp(errors, "strict"))) 6506 known_errorHandler = 1; 6507 else if (!strcmp(errors, "replace")) 6508 known_errorHandler = 2; 6509 else if (!strcmp(errors, "ignore")) 6510 known_errorHandler = 3; 6511 else if (!strcmp(errors, "xmlcharrefreplace")) 6512 known_errorHandler = 4; 6513 else 6514 known_errorHandler = 0; 6515 } 6516 switch (known_errorHandler) { 6517 case 1: /* strict */ 6518 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason); 6519 goto onError; 6520 case 2: /* replace */ 6521 while (collstart++<collend) 6522 *str++ = '?'; /* fall through */ 6523 case 3: /* ignore */ 6524 pos = collend; 6525 break; 6526 case 4: /* xmlcharrefreplace */ 6527 respos = str - PyBytes_AS_STRING(res); 6528 /* determine replacement size */ 6529 for (i = collstart, repsize = 0; i < collend; ++i) { 6530 Py_UCS4 ch = PyUnicode_READ(kind, data, i); 6531 if (ch < 10) 6532 repsize += 2+1+1; 6533 else if (ch < 100) 6534 repsize += 2+2+1; 6535 else if (ch < 1000) 6536 repsize += 2+3+1; 6537 else if (ch < 10000) 6538 repsize += 2+4+1; 6539 else if (ch < 100000) 6540 repsize += 2+5+1; 6541 else if (ch < 1000000) 6542 repsize += 2+6+1; 6543 else { 6544 assert(ch <= MAX_UNICODE); 6545 repsize += 2+7+1; 6546 } 6547 } 6548 requiredsize = respos+repsize+(size-collend); 6549 if (requiredsize > ressize) { 6550 if (requiredsize<2*ressize) 6551 requiredsize = 2*ressize; 6552 if (_PyBytes_Resize(&res, requiredsize)) 6553 goto onError; 6554 str = PyBytes_AS_STRING(res) + respos; 6555 ressize = requiredsize; 6556 } 6557 /* generate replacement */ 6558 for (i = collstart; i < collend; ++i) { 6559 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i)); 6560 } 6561 pos = collend; 6562 break; 6563 default: 6564 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler, 6565 encoding, reason, unicode, &exc, 6566 collstart, collend, &newpos); 6567 if (repunicode == NULL || 
(PyUnicode_Check(repunicode) && 6568 PyUnicode_READY(repunicode) < 0)) 6569 goto onError; 6570 if (PyBytes_Check(repunicode)) { 6571 /* Directly copy bytes result to output. */ 6572 repsize = PyBytes_Size(repunicode); 6573 if (repsize > 1) { 6574 /* Make room for all additional bytes. */ 6575 respos = str - PyBytes_AS_STRING(res); 6576 if (_PyBytes_Resize(&res, ressize+repsize-1)) { 6577 Py_DECREF(repunicode); 6578 goto onError; 6579 } 6580 str = PyBytes_AS_STRING(res) + respos; 6581 ressize += repsize-1; 6582 } 6583 memcpy(str, PyBytes_AsString(repunicode), repsize); 6584 str += repsize; 6585 pos = newpos; 6586 Py_DECREF(repunicode); 6587 break; 6588 } 6589 /* need more space? (at least enough for what we 6590 have+the replacement+the rest of the string, so 6591 we won't have to check space for encodable characters) */ 6592 respos = str - PyBytes_AS_STRING(res); 6593 repsize = PyUnicode_GET_LENGTH(repunicode); 6594 requiredsize = respos+repsize+(size-collend); 6595 if (requiredsize > ressize) { 6596 if (requiredsize<2*ressize) 6597 requiredsize = 2*ressize; 6598 if (_PyBytes_Resize(&res, requiredsize)) { 6599 Py_DECREF(repunicode); 6600 goto onError; 6601 } 6602 str = PyBytes_AS_STRING(res) + respos; 6603 ressize = requiredsize; 6604 } 6605 /* check if there is anything unencodable in the replacement 6606 and copy it to the output */ 6607 for (i = 0; repsize-->0; ++i, ++str) { 6608 c = PyUnicode_READ_CHAR(repunicode, i); 6609 if (c >= limit) { 6610 raise_encode_exception(&exc, encoding, unicode, 6611 pos, pos+1, reason); 6612 Py_DECREF(repunicode); 6613 goto onError; 6614 } 6615 *str = (char)c; 6616 } 6617 pos = newpos; 6618 Py_DECREF(repunicode); 6619 } 6620 } 6621 } 6622 /* Resize if we allocated to much */ 6623 size = str - PyBytes_AS_STRING(res); 6624 if (size < ressize) { /* If this falls res will be NULL */ 6625 assert(size >= 0); 6626 if (_PyBytes_Resize(&res, size) < 0) 6627 goto onError; 6628 } 6629 6630 Py_XDECREF(errorHandler); 6631 Py_XDECREF(exc); 
6632 return res; 6633 6634 onError: 6635 Py_XDECREF(res); 6636 Py_XDECREF(errorHandler); 6637 Py_XDECREF(exc); 6638 return NULL; 6639} 6640 6641/* Deprecated */ 6642PyObject * 6643PyUnicode_EncodeLatin1(const Py_UNICODE *p, 6644 Py_ssize_t size, 6645 const char *errors) 6646{ 6647 PyObject *result; 6648 PyObject *unicode = PyUnicode_FromUnicode(p, size); 6649 if (unicode == NULL) 6650 return NULL; 6651 result = unicode_encode_ucs1(unicode, errors, 256); 6652 Py_DECREF(unicode); 6653 return result; 6654} 6655 6656PyObject * 6657_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors) 6658{ 6659 if (!PyUnicode_Check(unicode)) { 6660 PyErr_BadArgument(); 6661 return NULL; 6662 } 6663 if (PyUnicode_READY(unicode) == -1) 6664 return NULL; 6665 /* Fast path: if it is a one-byte string, construct 6666 bytes object directly. */ 6667 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) 6668 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode), 6669 PyUnicode_GET_LENGTH(unicode)); 6670 /* Non-Latin-1 characters present. Defer to above function to 6671 raise the exception. 
*/ 6672 return unicode_encode_ucs1(unicode, errors, 256); 6673} 6674 6675PyObject* 6676PyUnicode_AsLatin1String(PyObject *unicode) 6677{ 6678 return _PyUnicode_AsLatin1String(unicode, NULL); 6679} 6680 6681/* --- 7-bit ASCII Codec -------------------------------------------------- */ 6682 6683PyObject * 6684PyUnicode_DecodeASCII(const char *s, 6685 Py_ssize_t size, 6686 const char *errors) 6687{ 6688 const char *starts = s; 6689 PyObject *v; 6690 int kind; 6691 void *data; 6692 Py_ssize_t startinpos; 6693 Py_ssize_t endinpos; 6694 Py_ssize_t outpos; 6695 const char *e; 6696 int has_error; 6697 const unsigned char *p = (const unsigned char *)s; 6698 const unsigned char *end = p + size; 6699 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK); 6700 PyObject *errorHandler = NULL; 6701 PyObject *exc = NULL; 6702 6703 if (size == 0) { 6704 Py_INCREF(unicode_empty); 6705 return unicode_empty; 6706 } 6707 6708 /* ASCII is equivalent to the first 128 ordinals in Unicode. */ 6709 if (size == 1 && (unsigned char)s[0] < 128) 6710 return get_latin1_char((unsigned char)s[0]); 6711 6712 has_error = 0; 6713 while (p < end && !has_error) { 6714 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for 6715 an explanation. 
*/ 6716 if (!((size_t) p & LONG_PTR_MASK)) { 6717 /* Help register allocation */ 6718 register const unsigned char *_p = p; 6719 while (_p < aligned_end) { 6720 unsigned long value = *(unsigned long *) _p; 6721 if (value & ASCII_CHAR_MASK) { 6722 has_error = 1; 6723 break; 6724 } 6725 _p += SIZEOF_LONG; 6726 } 6727 if (_p == end) 6728 break; 6729 if (has_error) 6730 break; 6731 p = _p; 6732 } 6733 if (*p & 0x80) { 6734 has_error = 1; 6735 break; 6736 } 6737 else { 6738 ++p; 6739 } 6740 } 6741 if (!has_error) 6742 return unicode_fromascii((const unsigned char *)s, size); 6743 6744 v = PyUnicode_New(size, 127); 6745 if (v == NULL) 6746 goto onError; 6747 if (size == 0) 6748 return v; 6749 kind = PyUnicode_KIND(v); 6750 data = PyUnicode_DATA(v); 6751 outpos = 0; 6752 e = s + size; 6753 while (s < e) { 6754 register unsigned char c = (unsigned char)*s; 6755 if (c < 128) { 6756 PyUnicode_WRITE(kind, data, outpos++, c); 6757 ++s; 6758 } 6759 else { 6760 startinpos = s-starts; 6761 endinpos = startinpos + 1; 6762 if (unicode_decode_call_errorhandler( 6763 errors, &errorHandler, 6764 "ascii", "ordinal not in range(128)", 6765 &starts, &e, &startinpos, &endinpos, &exc, &s, 6766 &v, &outpos)) 6767 goto onError; 6768 kind = PyUnicode_KIND(v); 6769 data = PyUnicode_DATA(v); 6770 } 6771 } 6772 if (PyUnicode_Resize(&v, outpos) < 0) 6773 goto onError; 6774 Py_XDECREF(errorHandler); 6775 Py_XDECREF(exc); 6776 assert(_PyUnicode_CheckConsistency(v, 1)); 6777 return v; 6778 6779 onError: 6780 Py_XDECREF(v); 6781 Py_XDECREF(errorHandler); 6782 Py_XDECREF(exc); 6783 return NULL; 6784} 6785 6786/* Deprecated */ 6787PyObject * 6788PyUnicode_EncodeASCII(const Py_UNICODE *p, 6789 Py_ssize_t size, 6790 const char *errors) 6791{ 6792 PyObject *result; 6793 PyObject *unicode = PyUnicode_FromUnicode(p, size); 6794 if (unicode == NULL) 6795 return NULL; 6796 result = unicode_encode_ucs1(unicode, errors, 128); 6797 Py_DECREF(unicode); 6798 return result; 6799} 6800 6801PyObject * 
6802_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors) 6803{ 6804 if (!PyUnicode_Check(unicode)) { 6805 PyErr_BadArgument(); 6806 return NULL; 6807 } 6808 if (PyUnicode_READY(unicode) == -1) 6809 return NULL; 6810 /* Fast path: if it is an ASCII-only string, construct bytes object 6811 directly. Else defer to above function to raise the exception. */ 6812 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128) 6813 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode), 6814 PyUnicode_GET_LENGTH(unicode)); 6815 return unicode_encode_ucs1(unicode, errors, 128); 6816} 6817 6818PyObject * 6819PyUnicode_AsASCIIString(PyObject *unicode) 6820{ 6821 return _PyUnicode_AsASCIIString(unicode, NULL); 6822} 6823 6824#ifdef HAVE_MBCS 6825 6826/* --- MBCS codecs for Windows -------------------------------------------- */ 6827 6828#if SIZEOF_INT < SIZEOF_SIZE_T 6829#define NEED_RETRY 6830#endif 6831 6832#ifndef WC_ERR_INVALID_CHARS 6833# define WC_ERR_INVALID_CHARS 0x0080 6834#endif 6835 6836static char* 6837code_page_name(UINT code_page, PyObject **obj) 6838{ 6839 *obj = NULL; 6840 if (code_page == CP_ACP) 6841 return "mbcs"; 6842 if (code_page == CP_UTF7) 6843 return "CP_UTF7"; 6844 if (code_page == CP_UTF8) 6845 return "CP_UTF8"; 6846 6847 *obj = PyBytes_FromFormat("cp%u", code_page); 6848 if (*obj == NULL) 6849 return NULL; 6850 return PyBytes_AS_STRING(*obj); 6851} 6852 6853static int 6854is_dbcs_lead_byte(UINT code_page, const char *s, int offset) 6855{ 6856 const char *curr = s + offset; 6857 const char *prev; 6858 6859 if (!IsDBCSLeadByteEx(code_page, *curr)) 6860 return 0; 6861 6862 prev = CharPrevExA(code_page, s, curr, 0); 6863 if (prev == curr) 6864 return 1; 6865 /* FIXME: This code is limited to "true" double-byte encodings, 6866 as it assumes an incomplete character consists of a single 6867 byte. 
*/ 6868 if (curr - prev == 2) 6869 return 1; 6870 if (!IsDBCSLeadByteEx(code_page, *prev)) 6871 return 1; 6872 return 0; 6873} 6874 6875static DWORD 6876decode_code_page_flags(UINT code_page) 6877{ 6878 if (code_page == CP_UTF7) { 6879 /* The CP_UTF7 decoder only supports flags=0 */ 6880 return 0; 6881 } 6882 else 6883 return MB_ERR_INVALID_CHARS; 6884} 6885 6886/* 6887 * Decode a byte string from a Windows code page into unicode object in strict 6888 * mode. 6889 * 6890 * Returns consumed size if succeed, returns -2 on decode error, or raise a 6891 * WindowsError and returns -1 on other error. 6892 */ 6893static int 6894decode_code_page_strict(UINT code_page, 6895 PyObject **v, 6896 const char *in, 6897 int insize) 6898{ 6899 const DWORD flags = decode_code_page_flags(code_page); 6900 wchar_t *out; 6901 DWORD outsize; 6902 6903 /* First get the size of the result */ 6904 assert(insize > 0); 6905 outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0); 6906 if (outsize <= 0) 6907 goto error; 6908 6909 if (*v == NULL) { 6910 /* Create unicode object */ 6911 *v = (PyObject*)_PyUnicode_New(outsize); 6912 if (*v == NULL) 6913 return -1; 6914 out = PyUnicode_AS_UNICODE(*v); 6915 } 6916 else { 6917 /* Extend unicode object */ 6918 Py_ssize_t n = PyUnicode_GET_SIZE(*v); 6919 if (PyUnicode_Resize(v, n + outsize) < 0) 6920 return -1; 6921 out = PyUnicode_AS_UNICODE(*v) + n; 6922 } 6923 6924 /* Do the conversion */ 6925 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize); 6926 if (outsize <= 0) 6927 goto error; 6928 return insize; 6929 6930error: 6931 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION) 6932 return -2; 6933 PyErr_SetFromWindowsErr(0); 6934 return -1; 6935} 6936 6937/* 6938 * Decode a byte string from a code page into unicode object with an error 6939 * handler. 6940 * 6941 * Returns consumed size if succeed, or raise a WindowsError or 6942 * UnicodeDecodeError exception and returns -1 on error. 
6943 */ 6944static int 6945decode_code_page_errors(UINT code_page, 6946 PyObject **v, 6947 const char *in, const int size, 6948 const char *errors) 6949{ 6950 const char *startin = in; 6951 const char *endin = in + size; 6952 const DWORD flags = decode_code_page_flags(code_page); 6953 /* Ideally, we should get reason from FormatMessage. This is the Windows 6954 2000 English version of the message. */ 6955 const char *reason = "No mapping for the Unicode character exists " 6956 "in the target code page."; 6957 /* each step cannot decode more than 1 character, but a character can be 6958 represented as a surrogate pair */ 6959 wchar_t buffer[2], *startout, *out; 6960 int insize, outsize; 6961 PyObject *errorHandler = NULL; 6962 PyObject *exc = NULL; 6963 PyObject *encoding_obj = NULL; 6964 char *encoding; 6965 DWORD err; 6966 int ret = -1; 6967 6968 assert(size > 0); 6969 6970 encoding = code_page_name(code_page, &encoding_obj); 6971 if (encoding == NULL) 6972 return -1; 6973 6974 if (errors == NULL || strcmp(errors, "strict") == 0) { 6975 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a 6976 UnicodeDecodeError. 
*/ 6977 make_decode_exception(&exc, encoding, in, size, 0, 0, reason); 6978 if (exc != NULL) { 6979 PyCodec_StrictErrors(exc); 6980 Py_CLEAR(exc); 6981 } 6982 goto error; 6983 } 6984 6985 if (*v == NULL) { 6986 /* Create unicode object */ 6987 if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) { 6988 PyErr_NoMemory(); 6989 goto error; 6990 } 6991 *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer)); 6992 if (*v == NULL) 6993 goto error; 6994 startout = PyUnicode_AS_UNICODE(*v); 6995 } 6996 else { 6997 /* Extend unicode object */ 6998 Py_ssize_t n = PyUnicode_GET_SIZE(*v); 6999 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) { 7000 PyErr_NoMemory(); 7001 goto error; 7002 } 7003 if (PyUnicode_Resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0) 7004 goto error; 7005 startout = PyUnicode_AS_UNICODE(*v) + n; 7006 } 7007 7008 /* Decode the byte string character per character */ 7009 out = startout; 7010 while (in < endin) 7011 { 7012 /* Decode a character */ 7013 insize = 1; 7014 do 7015 { 7016 outsize = MultiByteToWideChar(code_page, flags, 7017 in, insize, 7018 buffer, Py_ARRAY_LENGTH(buffer)); 7019 if (outsize > 0) 7020 break; 7021 err = GetLastError(); 7022 if (err != ERROR_NO_UNICODE_TRANSLATION 7023 && err != ERROR_INSUFFICIENT_BUFFER) 7024 { 7025 PyErr_SetFromWindowsErr(0); 7026 goto error; 7027 } 7028 insize++; 7029 } 7030 /* 4=maximum length of a UTF-8 sequence */ 7031 while (insize <= 4 && (in + insize) <= endin); 7032 7033 if (outsize <= 0) { 7034 Py_ssize_t startinpos, endinpos, outpos; 7035 7036 startinpos = in - startin; 7037 endinpos = startinpos + 1; 7038 outpos = out - PyUnicode_AS_UNICODE(*v); 7039 if (unicode_decode_call_errorhandler( 7040 errors, &errorHandler, 7041 encoding, reason, 7042 &startin, &endin, &startinpos, &endinpos, &exc, &in, 7043 v, &outpos)) 7044 { 7045 goto error; 7046 } 7047 out = PyUnicode_AS_UNICODE(*v) + outpos; 7048 } 7049 else { 7050 in += insize; 7051 memcpy(out, buffer, outsize * 
sizeof(wchar_t)); 7052 out += outsize; 7053 } 7054 } 7055 7056 /* write a NUL character at the end */ 7057 *out = 0; 7058 7059 /* Extend unicode object */ 7060 outsize = out - startout; 7061 assert(outsize <= PyUnicode_WSTR_LENGTH(*v)); 7062 if (PyUnicode_Resize(v, outsize) < 0) 7063 goto error; 7064 ret = size; 7065 7066error: 7067 Py_XDECREF(encoding_obj); 7068 Py_XDECREF(errorHandler); 7069 Py_XDECREF(exc); 7070 return ret; 7071} 7072 7073static PyObject * 7074decode_code_page_stateful(int code_page, 7075 const char *s, Py_ssize_t size, 7076 const char *errors, Py_ssize_t *consumed) 7077{ 7078 PyObject *v = NULL; 7079 int chunk_size, final, converted, done; 7080 7081 if (code_page < 0) { 7082 PyErr_SetString(PyExc_ValueError, "invalid code page number"); 7083 return NULL; 7084 } 7085 7086 if (consumed) 7087 *consumed = 0; 7088 7089 do 7090 { 7091#ifdef NEED_RETRY 7092 if (size > INT_MAX) { 7093 chunk_size = INT_MAX; 7094 final = 0; 7095 done = 0; 7096 } 7097 else 7098#endif 7099 { 7100 chunk_size = (int)size; 7101 final = (consumed == NULL); 7102 done = 1; 7103 } 7104 7105 /* Skip trailing lead-byte unless 'final' is set */ 7106 if (!final && is_dbcs_lead_byte(code_page, s, chunk_size - 1)) 7107 --chunk_size; 7108 7109 if (chunk_size == 0 && done) { 7110 if (v != NULL) 7111 break; 7112 Py_INCREF(unicode_empty); 7113 return unicode_empty; 7114 } 7115 7116 7117 converted = decode_code_page_strict(code_page, &v, 7118 s, chunk_size); 7119 if (converted == -2) 7120 converted = decode_code_page_errors(code_page, &v, 7121 s, chunk_size, 7122 errors); 7123 assert(converted != 0); 7124 7125 if (converted < 0) { 7126 Py_XDECREF(v); 7127 return NULL; 7128 } 7129 7130 if (consumed) 7131 *consumed += converted; 7132 7133 s += converted; 7134 size -= converted; 7135 } while (!done); 7136 7137 return unicode_result(v); 7138} 7139 7140PyObject * 7141PyUnicode_DecodeCodePageStateful(int code_page, 7142 const char *s, 7143 Py_ssize_t size, 7144 const char *errors, 7145 Py_ssize_t 
*consumed) 7146{ 7147 return decode_code_page_stateful(code_page, s, size, errors, consumed); 7148} 7149 7150PyObject * 7151PyUnicode_DecodeMBCSStateful(const char *s, 7152 Py_ssize_t size, 7153 const char *errors, 7154 Py_ssize_t *consumed) 7155{ 7156 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed); 7157} 7158 7159PyObject * 7160PyUnicode_DecodeMBCS(const char *s, 7161 Py_ssize_t size, 7162 const char *errors) 7163{ 7164 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL); 7165} 7166 7167static DWORD 7168encode_code_page_flags(UINT code_page, const char *errors) 7169{ 7170 if (code_page == CP_UTF8) { 7171 if (winver.dwMajorVersion >= 6) 7172 /* CP_UTF8 supports WC_ERR_INVALID_CHARS on Windows Vista 7173 and later */ 7174 return WC_ERR_INVALID_CHARS; 7175 else 7176 /* CP_UTF8 only supports flags=0 on Windows older than Vista */ 7177 return 0; 7178 } 7179 else if (code_page == CP_UTF7) { 7180 /* CP_UTF7 only supports flags=0 */ 7181 return 0; 7182 } 7183 else { 7184 if (errors != NULL && strcmp(errors, "replace") == 0) 7185 return 0; 7186 else 7187 return WC_NO_BEST_FIT_CHARS; 7188 } 7189} 7190 7191/* 7192 * Encode a Unicode string to a Windows code page into a byte string in strict 7193 * mode. 7194 * 7195 * Returns consumed characters if succeed, returns -2 on encode error, or raise 7196 * a WindowsError and returns -1 on other error. 7197 */ 7198static int 7199encode_code_page_strict(UINT code_page, PyObject **outbytes, 7200 PyObject *unicode, Py_ssize_t offset, int len, 7201 const char* errors) 7202{ 7203 BOOL usedDefaultChar = FALSE; 7204 BOOL *pusedDefaultChar = &usedDefaultChar; 7205 int outsize; 7206 PyObject *exc = NULL; 7207 wchar_t *p; 7208 Py_ssize_t size; 7209 const DWORD flags = encode_code_page_flags(code_page, NULL); 7210 char *out; 7211 /* Create a substring so that we can get the UTF-16 representation 7212 of just the slice under consideration. 
*/ 7213 PyObject *substring; 7214 7215 assert(len > 0); 7216 7217 if (code_page != CP_UTF8 && code_page != CP_UTF7) 7218 pusedDefaultChar = &usedDefaultChar; 7219 else 7220 pusedDefaultChar = NULL; 7221 7222 substring = PyUnicode_Substring(unicode, offset, offset+len); 7223 if (substring == NULL) 7224 return -1; 7225 p = PyUnicode_AsUnicodeAndSize(substring, &size); 7226 if (p == NULL) { 7227 Py_DECREF(substring); 7228 return -1; 7229 } 7230 7231 /* First get the size of the result */ 7232 outsize = WideCharToMultiByte(code_page, flags, 7233 p, size, 7234 NULL, 0, 7235 NULL, pusedDefaultChar); 7236 if (outsize <= 0) 7237 goto error; 7238 /* If we used a default char, then we failed! */ 7239 if (pusedDefaultChar && *pusedDefaultChar) { 7240 Py_DECREF(substring); 7241 return -2; 7242 } 7243 7244 if (*outbytes == NULL) { 7245 /* Create string object */ 7246 *outbytes = PyBytes_FromStringAndSize(NULL, outsize); 7247 if (*outbytes == NULL) { 7248 Py_DECREF(substring); 7249 return -1; 7250 } 7251 out = PyBytes_AS_STRING(*outbytes); 7252 } 7253 else { 7254 /* Extend string object */ 7255 const Py_ssize_t n = PyBytes_Size(*outbytes); 7256 if (outsize > PY_SSIZE_T_MAX - n) { 7257 PyErr_NoMemory(); 7258 Py_DECREF(substring); 7259 return -1; 7260 } 7261 if (_PyBytes_Resize(outbytes, n + outsize) < 0) { 7262 Py_DECREF(substring); 7263 return -1; 7264 } 7265 out = PyBytes_AS_STRING(*outbytes) + n; 7266 } 7267 7268 /* Do the conversion */ 7269 outsize = WideCharToMultiByte(code_page, flags, 7270 p, size, 7271 out, outsize, 7272 NULL, pusedDefaultChar); 7273 Py_CLEAR(substring); 7274 if (outsize <= 0) 7275 goto error; 7276 if (pusedDefaultChar && *pusedDefaultChar) 7277 return -2; 7278 return 0; 7279 7280error: 7281 Py_XDECREF(substring); 7282 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION) 7283 return -2; 7284 PyErr_SetFromWindowsErr(0); 7285 return -1; 7286} 7287 7288/* 7289 * Encode a Unicode string to a Windows code page into a byte string using a 7290 * error handler. 
7291 * 7292 * Returns consumed characters if succeed, or raise a WindowsError and returns 7293 * -1 on other error. 7294 */ 7295static int 7296encode_code_page_errors(UINT code_page, PyObject **outbytes, 7297 PyObject *unicode, Py_ssize_t unicode_offset, 7298 Py_ssize_t insize, const char* errors) 7299{ 7300 const DWORD flags = encode_code_page_flags(code_page, errors); 7301 Py_ssize_t pos = unicode_offset; 7302 Py_ssize_t endin = unicode_offset + insize; 7303 /* Ideally, we should get reason from FormatMessage. This is the Windows 7304 2000 English version of the message. */ 7305 const char *reason = "invalid character"; 7306 /* 4=maximum length of a UTF-8 sequence */ 7307 char buffer[4]; 7308 BOOL usedDefaultChar = FALSE, *pusedDefaultChar; 7309 Py_ssize_t outsize; 7310 char *out; 7311 PyObject *errorHandler = NULL; 7312 PyObject *exc = NULL; 7313 PyObject *encoding_obj = NULL; 7314 char *encoding; 7315 Py_ssize_t newpos, newoutsize; 7316 PyObject *rep; 7317 int ret = -1; 7318 7319 assert(insize > 0); 7320 7321 encoding = code_page_name(code_page, &encoding_obj); 7322 if (encoding == NULL) 7323 return -1; 7324 7325 if (errors == NULL || strcmp(errors, "strict") == 0) { 7326 /* The last error was ERROR_NO_UNICODE_TRANSLATION, 7327 then we raise a UnicodeEncodeError. 
*/ 7328 make_encode_exception(&exc, encoding, unicode, 0, 0, reason); 7329 if (exc != NULL) { 7330 PyCodec_StrictErrors(exc); 7331 Py_DECREF(exc); 7332 } 7333 Py_XDECREF(encoding_obj); 7334 return -1; 7335 } 7336 7337 if (code_page != CP_UTF8 && code_page != CP_UTF7) 7338 pusedDefaultChar = &usedDefaultChar; 7339 else 7340 pusedDefaultChar = NULL; 7341 7342 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) { 7343 PyErr_NoMemory(); 7344 goto error; 7345 } 7346 outsize = insize * Py_ARRAY_LENGTH(buffer); 7347 7348 if (*outbytes == NULL) { 7349 /* Create string object */ 7350 *outbytes = PyBytes_FromStringAndSize(NULL, outsize); 7351 if (*outbytes == NULL) 7352 goto error; 7353 out = PyBytes_AS_STRING(*outbytes); 7354 } 7355 else { 7356 /* Extend string object */ 7357 Py_ssize_t n = PyBytes_Size(*outbytes); 7358 if (n > PY_SSIZE_T_MAX - outsize) { 7359 PyErr_NoMemory(); 7360 goto error; 7361 } 7362 if (_PyBytes_Resize(outbytes, n + outsize) < 0) 7363 goto error; 7364 out = PyBytes_AS_STRING(*outbytes) + n; 7365 } 7366 7367 /* Encode the string character per character */ 7368 while (pos < endin) 7369 { 7370 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos); 7371 wchar_t chars[2]; 7372 int charsize; 7373 if (ch < 0x10000) { 7374 chars[0] = (wchar_t)ch; 7375 charsize = 1; 7376 } 7377 else { 7378 ch -= 0x10000; 7379 chars[0] = 0xd800 + (ch >> 10); 7380 chars[1] = 0xdc00 + (ch & 0x3ff); 7381 charsize = 2; 7382 } 7383 7384 outsize = WideCharToMultiByte(code_page, flags, 7385 chars, charsize, 7386 buffer, Py_ARRAY_LENGTH(buffer), 7387 NULL, pusedDefaultChar); 7388 if (outsize > 0) { 7389 if (pusedDefaultChar == NULL || !(*pusedDefaultChar)) 7390 { 7391 pos++; 7392 memcpy(out, buffer, outsize); 7393 out += outsize; 7394 continue; 7395 } 7396 } 7397 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) { 7398 PyErr_SetFromWindowsErr(0); 7399 goto error; 7400 } 7401 7402 rep = unicode_encode_call_errorhandler( 7403 errors, &errorHandler, encoding, reason, 7404 unicode, 
&exc, 7405 pos, pos + 1, &newpos); 7406 if (rep == NULL) 7407 goto error; 7408 pos = newpos; 7409 7410 if (PyBytes_Check(rep)) { 7411 outsize = PyBytes_GET_SIZE(rep); 7412 if (outsize != 1) { 7413 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes); 7414 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1); 7415 if (_PyBytes_Resize(outbytes, newoutsize) < 0) { 7416 Py_DECREF(rep); 7417 goto error; 7418 } 7419 out = PyBytes_AS_STRING(*outbytes) + offset; 7420 } 7421 memcpy(out, PyBytes_AS_STRING(rep), outsize); 7422 out += outsize; 7423 } 7424 else { 7425 Py_ssize_t i; 7426 enum PyUnicode_Kind kind; 7427 void *data; 7428 7429 if (PyUnicode_READY(rep) < 0) { 7430 Py_DECREF(rep); 7431 goto error; 7432 } 7433 7434 outsize = PyUnicode_GET_LENGTH(rep); 7435 if (outsize != 1) { 7436 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes); 7437 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1); 7438 if (_PyBytes_Resize(outbytes, newoutsize) < 0) { 7439 Py_DECREF(rep); 7440 goto error; 7441 } 7442 out = PyBytes_AS_STRING(*outbytes) + offset; 7443 } 7444 kind = PyUnicode_KIND(rep); 7445 data = PyUnicode_DATA(rep); 7446 for (i=0; i < outsize; i++) { 7447 Py_UCS4 ch = PyUnicode_READ(kind, data, i); 7448 if (ch > 127) { 7449 raise_encode_exception(&exc, 7450 encoding, unicode, 7451 pos, pos + 1, 7452 "unable to encode error handler result to ASCII"); 7453 Py_DECREF(rep); 7454 goto error; 7455 } 7456 *out = (unsigned char)ch; 7457 out++; 7458 } 7459 } 7460 Py_DECREF(rep); 7461 } 7462 /* write a NUL byte */ 7463 *out = 0; 7464 outsize = out - PyBytes_AS_STRING(*outbytes); 7465 assert(outsize <= PyBytes_GET_SIZE(*outbytes)); 7466 if (_PyBytes_Resize(outbytes, outsize) < 0) 7467 goto error; 7468 ret = 0; 7469 7470error: 7471 Py_XDECREF(encoding_obj); 7472 Py_XDECREF(errorHandler); 7473 Py_XDECREF(exc); 7474 return ret; 7475} 7476 7477static PyObject * 7478encode_code_page(int code_page, 7479 PyObject *unicode, 7480 const char *errors) 7481{ 7482 Py_ssize_t len; 7483 
PyObject *outbytes = NULL; 7484 Py_ssize_t offset; 7485 int chunk_len, ret, done; 7486 7487 if (PyUnicode_READY(unicode) < 0) 7488 return NULL; 7489 len = PyUnicode_GET_LENGTH(unicode); 7490 7491 if (code_page < 0) { 7492 PyErr_SetString(PyExc_ValueError, "invalid code page number"); 7493 return NULL; 7494 } 7495 7496 if (len == 0) 7497 return PyBytes_FromStringAndSize(NULL, 0); 7498 7499 offset = 0; 7500 do 7501 { 7502#ifdef NEED_RETRY 7503 /* UTF-16 encoding may double the size, so use only INT_MAX/2 7504 chunks. */ 7505 if (len > INT_MAX/2) { 7506 chunk_len = INT_MAX/2; 7507 done = 0; 7508 } 7509 else 7510#endif 7511 { 7512 chunk_len = (int)len; 7513 done = 1; 7514 } 7515 7516 ret = encode_code_page_strict(code_page, &outbytes, 7517 unicode, offset, chunk_len, 7518 errors); 7519 if (ret == -2) 7520 ret = encode_code_page_errors(code_page, &outbytes, 7521 unicode, offset, 7522 chunk_len, errors); 7523 if (ret < 0) { 7524 Py_XDECREF(outbytes); 7525 return NULL; 7526 } 7527 7528 offset += chunk_len; 7529 len -= chunk_len; 7530 } while (!done); 7531 7532 return outbytes; 7533} 7534 7535PyObject * 7536PyUnicode_EncodeMBCS(const Py_UNICODE *p, 7537 Py_ssize_t size, 7538 const char *errors) 7539{ 7540 PyObject *unicode, *res; 7541 unicode = PyUnicode_FromUnicode(p, size); 7542 if (unicode == NULL) 7543 return NULL; 7544 res = encode_code_page(CP_ACP, unicode, errors); 7545 Py_DECREF(unicode); 7546 return res; 7547} 7548 7549PyObject * 7550PyUnicode_EncodeCodePage(int code_page, 7551 PyObject *unicode, 7552 const char *errors) 7553{ 7554 return encode_code_page(code_page, unicode, errors); 7555} 7556 7557PyObject * 7558PyUnicode_AsMBCSString(PyObject *unicode) 7559{ 7560 if (!PyUnicode_Check(unicode)) { 7561 PyErr_BadArgument(); 7562 return NULL; 7563 } 7564 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL); 7565} 7566 7567#undef NEED_RETRY 7568 7569#endif /* HAVE_MBCS */ 7570 7571/* --- Character Mapping Codec -------------------------------------------- */ 7572 
7573PyObject * 7574PyUnicode_DecodeCharmap(const char *s, 7575 Py_ssize_t size, 7576 PyObject *mapping, 7577 const char *errors) 7578{ 7579 const char *starts = s; 7580 Py_ssize_t startinpos; 7581 Py_ssize_t endinpos; 7582 Py_ssize_t outpos; 7583 const char *e; 7584 PyObject *v; 7585 Py_ssize_t extrachars = 0; 7586 PyObject *errorHandler = NULL; 7587 PyObject *exc = NULL; 7588 7589 /* Default to Latin-1 */ 7590 if (mapping == NULL) 7591 return PyUnicode_DecodeLatin1(s, size, errors); 7592 7593 v = PyUnicode_New(size, 127); 7594 if (v == NULL) 7595 goto onError; 7596 if (size == 0) 7597 return v; 7598 outpos = 0; 7599 e = s + size; 7600 if (PyUnicode_CheckExact(mapping)) { 7601 Py_ssize_t maplen; 7602 enum PyUnicode_Kind kind; 7603 void *data; 7604 Py_UCS4 x; 7605 7606 if (PyUnicode_READY(mapping) < 0) 7607 return NULL; 7608 7609 maplen = PyUnicode_GET_LENGTH(mapping); 7610 data = PyUnicode_DATA(mapping); 7611 kind = PyUnicode_KIND(mapping); 7612 while (s < e) { 7613 unsigned char ch = *s; 7614 7615 if (ch < maplen) 7616 x = PyUnicode_READ(kind, data, ch); 7617 else 7618 x = 0xfffe; /* invalid value */ 7619 7620 if (x == 0xfffe) 7621 { 7622 /* undefined mapping */ 7623 startinpos = s-starts; 7624 endinpos = startinpos+1; 7625 if (unicode_decode_call_errorhandler( 7626 errors, &errorHandler, 7627 "charmap", "character maps to <undefined>", 7628 &starts, &e, &startinpos, &endinpos, &exc, &s, 7629 &v, &outpos)) { 7630 goto onError; 7631 } 7632 continue; 7633 } 7634 7635 if (unicode_putchar(&v, &outpos, x) < 0) 7636 goto onError; 7637 ++s; 7638 } 7639 } 7640 else { 7641 while (s < e) { 7642 unsigned char ch = *s; 7643 PyObject *w, *x; 7644 7645 /* Get mapping (char ordinal -> integer, Unicode char or None) */ 7646 w = PyLong_FromLong((long)ch); 7647 if (w == NULL) 7648 goto onError; 7649 x = PyObject_GetItem(mapping, w); 7650 Py_DECREF(w); 7651 if (x == NULL) { 7652 if (PyErr_ExceptionMatches(PyExc_LookupError)) { 7653 /* No mapping found means: mapping is undefined. 
*/ 7654 PyErr_Clear(); 7655 x = Py_None; 7656 Py_INCREF(x); 7657 } else 7658 goto onError; 7659 } 7660 7661 /* Apply mapping */ 7662 if (PyLong_Check(x)) { 7663 long value = PyLong_AS_LONG(x); 7664 if (value < 0 || value > 65535) { 7665 PyErr_SetString(PyExc_TypeError, 7666 "character mapping must be in range(65536)"); 7667 Py_DECREF(x); 7668 goto onError; 7669 } 7670 if (unicode_putchar(&v, &outpos, value) < 0) 7671 goto onError; 7672 } 7673 else if (x == Py_None) { 7674 /* undefined mapping */ 7675 startinpos = s-starts; 7676 endinpos = startinpos+1; 7677 if (unicode_decode_call_errorhandler( 7678 errors, &errorHandler, 7679 "charmap", "character maps to <undefined>", 7680 &starts, &e, &startinpos, &endinpos, &exc, &s, 7681 &v, &outpos)) { 7682 Py_DECREF(x); 7683 goto onError; 7684 } 7685 Py_DECREF(x); 7686 continue; 7687 } 7688 else if (PyUnicode_Check(x)) { 7689 Py_ssize_t targetsize; 7690 7691 if (PyUnicode_READY(x) < 0) 7692 goto onError; 7693 targetsize = PyUnicode_GET_LENGTH(x); 7694 7695 if (targetsize == 1) { 7696 /* 1-1 mapping */ 7697 if (unicode_putchar(&v, &outpos, 7698 PyUnicode_READ_CHAR(x, 0)) < 0) 7699 goto onError; 7700 } 7701 else if (targetsize > 1) { 7702 /* 1-n mapping */ 7703 if (targetsize > extrachars) { 7704 /* resize first */ 7705 Py_ssize_t needed = (targetsize - extrachars) + \ 7706 (targetsize << 2); 7707 extrachars += needed; 7708 /* XXX overflow detection missing */ 7709 if (PyUnicode_Resize(&v, 7710 PyUnicode_GET_LENGTH(v) + needed) < 0) { 7711 Py_DECREF(x); 7712 goto onError; 7713 } 7714 } 7715 if (unicode_widen(&v, PyUnicode_MAX_CHAR_VALUE(x)) < 0) 7716 goto onError; 7717 PyUnicode_CopyCharacters(v, outpos, x, 0, targetsize); 7718 outpos += targetsize; 7719 extrachars -= targetsize; 7720 } 7721 /* 1-0 mapping: skip the character */ 7722 } 7723 else { 7724 /* wrong return value */ 7725 PyErr_SetString(PyExc_TypeError, 7726 "character mapping must return integer, None or str"); 7727 Py_DECREF(x); 7728 goto onError; 7729 } 7730 
Py_DECREF(x); 7731 ++s; 7732 } 7733 } 7734 if (PyUnicode_Resize(&v, outpos) < 0) 7735 goto onError; 7736 Py_XDECREF(errorHandler); 7737 Py_XDECREF(exc); 7738 return unicode_result(v); 7739 7740 onError: 7741 Py_XDECREF(errorHandler); 7742 Py_XDECREF(exc); 7743 Py_XDECREF(v); 7744 return NULL; 7745} 7746 7747/* Charmap encoding: the lookup table */ 7748 7749struct encoding_map { 7750 PyObject_HEAD 7751 unsigned char level1[32]; 7752 int count2, count3; 7753 unsigned char level23[1]; 7754}; 7755 7756static PyObject* 7757encoding_map_size(PyObject *obj, PyObject* args) 7758{ 7759 struct encoding_map *map = (struct encoding_map*)obj; 7760 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 + 7761 128*map->count3); 7762} 7763 7764static PyMethodDef encoding_map_methods[] = { 7765 {"size", encoding_map_size, METH_NOARGS, 7766 PyDoc_STR("Return the size (in bytes) of this object") }, 7767 { 0 } 7768}; 7769 7770static void 7771encoding_map_dealloc(PyObject* o) 7772{ 7773 PyObject_FREE(o); 7774} 7775 7776static PyTypeObject EncodingMapType = { 7777 PyVarObject_HEAD_INIT(NULL, 0) 7778 "EncodingMap", /*tp_name*/ 7779 sizeof(struct encoding_map), /*tp_basicsize*/ 7780 0, /*tp_itemsize*/ 7781 /* methods */ 7782 encoding_map_dealloc, /*tp_dealloc*/ 7783 0, /*tp_print*/ 7784 0, /*tp_getattr*/ 7785 0, /*tp_setattr*/ 7786 0, /*tp_reserved*/ 7787 0, /*tp_repr*/ 7788 0, /*tp_as_number*/ 7789 0, /*tp_as_sequence*/ 7790 0, /*tp_as_mapping*/ 7791 0, /*tp_hash*/ 7792 0, /*tp_call*/ 7793 0, /*tp_str*/ 7794 0, /*tp_getattro*/ 7795 0, /*tp_setattro*/ 7796 0, /*tp_as_buffer*/ 7797 Py_TPFLAGS_DEFAULT, /*tp_flags*/ 7798 0, /*tp_doc*/ 7799 0, /*tp_traverse*/ 7800 0, /*tp_clear*/ 7801 0, /*tp_richcompare*/ 7802 0, /*tp_weaklistoffset*/ 7803 0, /*tp_iter*/ 7804 0, /*tp_iternext*/ 7805 encoding_map_methods, /*tp_methods*/ 7806 0, /*tp_members*/ 7807 0, /*tp_getset*/ 7808 0, /*tp_base*/ 7809 0, /*tp_dict*/ 7810 0, /*tp_descr_get*/ 7811 0, /*tp_descr_set*/ 7812 0, /*tp_dictoffset*/ 7813 0, 
/*tp_init*/ 7814 0, /*tp_alloc*/ 7815 0, /*tp_new*/ 7816 0, /*tp_free*/ 7817 0, /*tp_is_gc*/ 7818}; 7819 7820PyObject* 7821PyUnicode_BuildEncodingMap(PyObject* string) 7822{ 7823 PyObject *result; 7824 struct encoding_map *mresult; 7825 int i; 7826 int need_dict = 0; 7827 unsigned char level1[32]; 7828 unsigned char level2[512]; 7829 unsigned char *mlevel1, *mlevel2, *mlevel3; 7830 int count2 = 0, count3 = 0; 7831 int kind; 7832 void *data; 7833 Py_UCS4 ch; 7834 7835 if (!PyUnicode_Check(string) || PyUnicode_GET_LENGTH(string) != 256) { 7836 PyErr_BadArgument(); 7837 return NULL; 7838 } 7839 kind = PyUnicode_KIND(string); 7840 data = PyUnicode_DATA(string); 7841 memset(level1, 0xFF, sizeof level1); 7842 memset(level2, 0xFF, sizeof level2); 7843 7844 /* If there isn't a one-to-one mapping of NULL to \0, 7845 or if there are non-BMP characters, we need to use 7846 a mapping dictionary. */ 7847 if (PyUnicode_READ(kind, data, 0) != 0) 7848 need_dict = 1; 7849 for (i = 1; i < 256; i++) { 7850 int l1, l2; 7851 ch = PyUnicode_READ(kind, data, i); 7852 if (ch == 0 || ch > 0xFFFF) { 7853 need_dict = 1; 7854 break; 7855 } 7856 if (ch == 0xFFFE) 7857 /* unmapped character */ 7858 continue; 7859 l1 = ch >> 11; 7860 l2 = ch >> 7; 7861 if (level1[l1] == 0xFF) 7862 level1[l1] = count2++; 7863 if (level2[l2] == 0xFF) 7864 level2[l2] = count3++; 7865 } 7866 7867 if (count2 >= 0xFF || count3 >= 0xFF) 7868 need_dict = 1; 7869 7870 if (need_dict) { 7871 PyObject *result = PyDict_New(); 7872 PyObject *key, *value; 7873 if (!result) 7874 return NULL; 7875 for (i = 0; i < 256; i++) { 7876 key = PyLong_FromLong(PyUnicode_READ(kind, data, i)); 7877 value = PyLong_FromLong(i); 7878 if (!key || !value) 7879 goto failed1; 7880 if (PyDict_SetItem(result, key, value) == -1) 7881 goto failed1; 7882 Py_DECREF(key); 7883 Py_DECREF(value); 7884 } 7885 return result; 7886 failed1: 7887 Py_XDECREF(key); 7888 Py_XDECREF(value); 7889 Py_DECREF(result); 7890 return NULL; 7891 } 7892 7893 /* Create a 
three-level trie */ 7894 result = PyObject_MALLOC(sizeof(struct encoding_map) + 7895 16*count2 + 128*count3 - 1); 7896 if (!result) 7897 return PyErr_NoMemory(); 7898 PyObject_Init(result, &EncodingMapType); 7899 mresult = (struct encoding_map*)result; 7900 mresult->count2 = count2; 7901 mresult->count3 = count3; 7902 mlevel1 = mresult->level1; 7903 mlevel2 = mresult->level23; 7904 mlevel3 = mresult->level23 + 16*count2; 7905 memcpy(mlevel1, level1, 32); 7906 memset(mlevel2, 0xFF, 16*count2); 7907 memset(mlevel3, 0, 128*count3); 7908 count3 = 0; 7909 for (i = 1; i < 256; i++) { 7910 int o1, o2, o3, i2, i3; 7911 if (PyUnicode_READ(kind, data, i) == 0xFFFE) 7912 /* unmapped character */ 7913 continue; 7914 o1 = PyUnicode_READ(kind, data, i)>>11; 7915 o2 = (PyUnicode_READ(kind, data, i)>>7) & 0xF; 7916 i2 = 16*mlevel1[o1] + o2; 7917 if (mlevel2[i2] == 0xFF) 7918 mlevel2[i2] = count3++; 7919 o3 = PyUnicode_READ(kind, data, i) & 0x7F; 7920 i3 = 128*mlevel2[i2] + o3; 7921 mlevel3[i3] = i; 7922 } 7923 return result; 7924} 7925 7926static int 7927encoding_map_lookup(Py_UCS4 c, PyObject *mapping) 7928{ 7929 struct encoding_map *map = (struct encoding_map*)mapping; 7930 int l1 = c>>11; 7931 int l2 = (c>>7) & 0xF; 7932 int l3 = c & 0x7F; 7933 int i; 7934 7935 if (c > 0xFFFF) 7936 return -1; 7937 if (c == 0) 7938 return 0; 7939 /* level 1*/ 7940 i = map->level1[l1]; 7941 if (i == 0xFF) { 7942 return -1; 7943 } 7944 /* level 2*/ 7945 i = map->level23[16*i+l2]; 7946 if (i == 0xFF) { 7947 return -1; 7948 } 7949 /* level 3 */ 7950 i = map->level23[16*map->count2 + 128*i + l3]; 7951 if (i == 0) { 7952 return -1; 7953 } 7954 return i; 7955} 7956 7957/* Lookup the character ch in the mapping. If the character 7958 can't be found, Py_None is returned (or NULL, if another 7959 error occurred). 
*/ 7960static PyObject * 7961charmapencode_lookup(Py_UCS4 c, PyObject *mapping) 7962{ 7963 PyObject *w = PyLong_FromLong((long)c); 7964 PyObject *x; 7965 7966 if (w == NULL) 7967 return NULL; 7968 x = PyObject_GetItem(mapping, w); 7969 Py_DECREF(w); 7970 if (x == NULL) { 7971 if (PyErr_ExceptionMatches(PyExc_LookupError)) { 7972 /* No mapping found means: mapping is undefined. */ 7973 PyErr_Clear(); 7974 x = Py_None; 7975 Py_INCREF(x); 7976 return x; 7977 } else 7978 return NULL; 7979 } 7980 else if (x == Py_None) 7981 return x; 7982 else if (PyLong_Check(x)) { 7983 long value = PyLong_AS_LONG(x); 7984 if (value < 0 || value > 255) { 7985 PyErr_SetString(PyExc_TypeError, 7986 "character mapping must be in range(256)"); 7987 Py_DECREF(x); 7988 return NULL; 7989 } 7990 return x; 7991 } 7992 else if (PyBytes_Check(x)) 7993 return x; 7994 else { 7995 /* wrong return value */ 7996 PyErr_Format(PyExc_TypeError, 7997 "character mapping must return integer, bytes or None, not %.400s", 7998 x->ob_type->tp_name); 7999 Py_DECREF(x); 8000 return NULL; 8001 } 8002} 8003 8004static int 8005charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize) 8006{ 8007 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj); 8008 /* exponentially overallocate to minimize reallocations */ 8009 if (requiredsize < 2*outsize) 8010 requiredsize = 2*outsize; 8011 if (_PyBytes_Resize(outobj, requiredsize)) 8012 return -1; 8013 return 0; 8014} 8015 8016typedef enum charmapencode_result { 8017 enc_SUCCESS, enc_FAILED, enc_EXCEPTION 8018} charmapencode_result; 8019/* lookup the character, put the result in the output string and adjust 8020 various state variables. Resize the output bytes object if not enough 8021 space is available. Return a new reference to the object that 8022 was put in the output buffer, or Py_None, if the mapping was undefined 8023 (in which case no character was written) or NULL, if a 8024 reallocation error occurred. 
The caller must decref the result */ 8025static charmapencode_result 8026charmapencode_output(Py_UCS4 c, PyObject *mapping, 8027 PyObject **outobj, Py_ssize_t *outpos) 8028{ 8029 PyObject *rep; 8030 char *outstart; 8031 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj); 8032 8033 if (Py_TYPE(mapping) == &EncodingMapType) { 8034 int res = encoding_map_lookup(c, mapping); 8035 Py_ssize_t requiredsize = *outpos+1; 8036 if (res == -1) 8037 return enc_FAILED; 8038 if (outsize<requiredsize) 8039 if (charmapencode_resize(outobj, outpos, requiredsize)) 8040 return enc_EXCEPTION; 8041 outstart = PyBytes_AS_STRING(*outobj); 8042 outstart[(*outpos)++] = (char)res; 8043 return enc_SUCCESS; 8044 } 8045 8046 rep = charmapencode_lookup(c, mapping); 8047 if (rep==NULL) 8048 return enc_EXCEPTION; 8049 else if (rep==Py_None) { 8050 Py_DECREF(rep); 8051 return enc_FAILED; 8052 } else { 8053 if (PyLong_Check(rep)) { 8054 Py_ssize_t requiredsize = *outpos+1; 8055 if (outsize<requiredsize) 8056 if (charmapencode_resize(outobj, outpos, requiredsize)) { 8057 Py_DECREF(rep); 8058 return enc_EXCEPTION; 8059 } 8060 outstart = PyBytes_AS_STRING(*outobj); 8061 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep); 8062 } 8063 else { 8064 const char *repchars = PyBytes_AS_STRING(rep); 8065 Py_ssize_t repsize = PyBytes_GET_SIZE(rep); 8066 Py_ssize_t requiredsize = *outpos+repsize; 8067 if (outsize<requiredsize) 8068 if (charmapencode_resize(outobj, outpos, requiredsize)) { 8069 Py_DECREF(rep); 8070 return enc_EXCEPTION; 8071 } 8072 outstart = PyBytes_AS_STRING(*outobj); 8073 memcpy(outstart + *outpos, repchars, repsize); 8074 *outpos += repsize; 8075 } 8076 } 8077 Py_DECREF(rep); 8078 return enc_SUCCESS; 8079} 8080 8081/* handle an error in PyUnicode_EncodeCharmap 8082 Return 0 on success, -1 on error */ 8083static int 8084charmap_encoding_error( 8085 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping, 8086 PyObject **exceptionObject, 8087 int *known_errorHandler, PyObject **errorHandler, const 
char *errors, 8088 PyObject **res, Py_ssize_t *respos) 8089{ 8090 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */ 8091 Py_ssize_t size, repsize; 8092 Py_ssize_t newpos; 8093 enum PyUnicode_Kind kind; 8094 void *data; 8095 Py_ssize_t index; 8096 /* startpos for collecting unencodable chars */ 8097 Py_ssize_t collstartpos = *inpos; 8098 Py_ssize_t collendpos = *inpos+1; 8099 Py_ssize_t collpos; 8100 char *encoding = "charmap"; 8101 char *reason = "character maps to <undefined>"; 8102 charmapencode_result x; 8103 Py_UCS4 ch; 8104 int val; 8105 8106 if (PyUnicode_READY(unicode) < 0) 8107 return -1; 8108 size = PyUnicode_GET_LENGTH(unicode); 8109 /* find all unencodable characters */ 8110 while (collendpos < size) { 8111 PyObject *rep; 8112 if (Py_TYPE(mapping) == &EncodingMapType) { 8113 ch = PyUnicode_READ_CHAR(unicode, collendpos); 8114 val = encoding_map_lookup(ch, mapping); 8115 if (val != -1) 8116 break; 8117 ++collendpos; 8118 continue; 8119 } 8120 8121 ch = PyUnicode_READ_CHAR(unicode, collendpos); 8122 rep = charmapencode_lookup(ch, mapping); 8123 if (rep==NULL) 8124 return -1; 8125 else if (rep!=Py_None) { 8126 Py_DECREF(rep); 8127 break; 8128 } 8129 Py_DECREF(rep); 8130 ++collendpos; 8131 } 8132 /* cache callback name lookup 8133 * (if not done yet, i.e. 
it's the first error) */ 8134 if (*known_errorHandler==-1) { 8135 if ((errors==NULL) || (!strcmp(errors, "strict"))) 8136 *known_errorHandler = 1; 8137 else if (!strcmp(errors, "replace")) 8138 *known_errorHandler = 2; 8139 else if (!strcmp(errors, "ignore")) 8140 *known_errorHandler = 3; 8141 else if (!strcmp(errors, "xmlcharrefreplace")) 8142 *known_errorHandler = 4; 8143 else 8144 *known_errorHandler = 0; 8145 } 8146 switch (*known_errorHandler) { 8147 case 1: /* strict */ 8148 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason); 8149 return -1; 8150 case 2: /* replace */ 8151 for (collpos = collstartpos; collpos<collendpos; ++collpos) { 8152 x = charmapencode_output('?', mapping, res, respos); 8153 if (x==enc_EXCEPTION) { 8154 return -1; 8155 } 8156 else if (x==enc_FAILED) { 8157 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason); 8158 return -1; 8159 } 8160 } 8161 /* fall through */ 8162 case 3: /* ignore */ 8163 *inpos = collendpos; 8164 break; 8165 case 4: /* xmlcharrefreplace */ 8166 /* generate replacement (temporarily (mis)uses p) */ 8167 for (collpos = collstartpos; collpos < collendpos; ++collpos) { 8168 char buffer[2+29+1+1]; 8169 char *cp; 8170 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos)); 8171 for (cp = buffer; *cp; ++cp) { 8172 x = charmapencode_output(*cp, mapping, res, respos); 8173 if (x==enc_EXCEPTION) 8174 return -1; 8175 else if (x==enc_FAILED) { 8176 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason); 8177 return -1; 8178 } 8179 } 8180 } 8181 *inpos = collendpos; 8182 break; 8183 default: 8184 repunicode = unicode_encode_call_errorhandler(errors, errorHandler, 8185 encoding, reason, unicode, exceptionObject, 8186 collstartpos, collendpos, &newpos); 8187 if (repunicode == NULL) 8188 return -1; 8189 if (PyBytes_Check(repunicode)) { 8190 /* Directly copy bytes result to output. 
*/ 8191 Py_ssize_t outsize = PyBytes_Size(*res); 8192 Py_ssize_t requiredsize; 8193 repsize = PyBytes_Size(repunicode); 8194 requiredsize = *respos + repsize; 8195 if (requiredsize > outsize) 8196 /* Make room for all additional bytes. */ 8197 if (charmapencode_resize(res, respos, requiredsize)) { 8198 Py_DECREF(repunicode); 8199 return -1; 8200 } 8201 memcpy(PyBytes_AsString(*res) + *respos, 8202 PyBytes_AsString(repunicode), repsize); 8203 *respos += repsize; 8204 *inpos = newpos; 8205 Py_DECREF(repunicode); 8206 break; 8207 } 8208 /* generate replacement */ 8209 if (PyUnicode_READY(repunicode) < 0) { 8210 Py_DECREF(repunicode); 8211 return -1; 8212 } 8213 repsize = PyUnicode_GET_LENGTH(repunicode); 8214 data = PyUnicode_DATA(repunicode); 8215 kind = PyUnicode_KIND(repunicode); 8216 for (index = 0; index < repsize; index++) { 8217 Py_UCS4 repch = PyUnicode_READ(kind, data, index); 8218 x = charmapencode_output(repch, mapping, res, respos); 8219 if (x==enc_EXCEPTION) { 8220 Py_DECREF(repunicode); 8221 return -1; 8222 } 8223 else if (x==enc_FAILED) { 8224 Py_DECREF(repunicode); 8225 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason); 8226 return -1; 8227 } 8228 } 8229 *inpos = newpos; 8230 Py_DECREF(repunicode); 8231 } 8232 return 0; 8233} 8234 8235PyObject * 8236_PyUnicode_EncodeCharmap(PyObject *unicode, 8237 PyObject *mapping, 8238 const char *errors) 8239{ 8240 /* output object */ 8241 PyObject *res = NULL; 8242 /* current input position */ 8243 Py_ssize_t inpos = 0; 8244 Py_ssize_t size; 8245 /* current output position */ 8246 Py_ssize_t respos = 0; 8247 PyObject *errorHandler = NULL; 8248 PyObject *exc = NULL; 8249 /* the following variable is used for caching string comparisons 8250 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 8251 * 3=ignore, 4=xmlcharrefreplace */ 8252 int known_errorHandler = -1; 8253 8254 if (PyUnicode_READY(unicode) < 0) 8255 return NULL; 8256 size = PyUnicode_GET_LENGTH(unicode); 8257 
8258 /* Default to Latin-1 */ 8259 if (mapping == NULL) 8260 return unicode_encode_ucs1(unicode, errors, 256); 8261 8262 /* allocate enough for a simple encoding without 8263 replacements, if we need more, we'll resize */ 8264 res = PyBytes_FromStringAndSize(NULL, size); 8265 if (res == NULL) 8266 goto onError; 8267 if (size == 0) 8268 return res; 8269 8270 while (inpos<size) { 8271 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, inpos); 8272 /* try to encode it */ 8273 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos); 8274 if (x==enc_EXCEPTION) /* error */ 8275 goto onError; 8276 if (x==enc_FAILED) { /* unencodable character */ 8277 if (charmap_encoding_error(unicode, &inpos, mapping, 8278 &exc, 8279 &known_errorHandler, &errorHandler, errors, 8280 &res, &respos)) { 8281 goto onError; 8282 } 8283 } 8284 else 8285 /* done with this character => adjust input position */ 8286 ++inpos; 8287 } 8288 8289 /* Resize if we allocated to much */ 8290 if (respos<PyBytes_GET_SIZE(res)) 8291 if (_PyBytes_Resize(&res, respos) < 0) 8292 goto onError; 8293 8294 Py_XDECREF(exc); 8295 Py_XDECREF(errorHandler); 8296 return res; 8297 8298 onError: 8299 Py_XDECREF(res); 8300 Py_XDECREF(exc); 8301 Py_XDECREF(errorHandler); 8302 return NULL; 8303} 8304 8305/* Deprecated */ 8306PyObject * 8307PyUnicode_EncodeCharmap(const Py_UNICODE *p, 8308 Py_ssize_t size, 8309 PyObject *mapping, 8310 const char *errors) 8311{ 8312 PyObject *result; 8313 PyObject *unicode = PyUnicode_FromUnicode(p, size); 8314 if (unicode == NULL) 8315 return NULL; 8316 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors); 8317 Py_DECREF(unicode); 8318 return result; 8319} 8320 8321PyObject * 8322PyUnicode_AsCharmapString(PyObject *unicode, 8323 PyObject *mapping) 8324{ 8325 if (!PyUnicode_Check(unicode) || mapping == NULL) { 8326 PyErr_BadArgument(); 8327 return NULL; 8328 } 8329 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL); 8330} 8331 8332/* create or adjust a UnicodeTranslateError 
*/ 8333static void 8334make_translate_exception(PyObject **exceptionObject, 8335 PyObject *unicode, 8336 Py_ssize_t startpos, Py_ssize_t endpos, 8337 const char *reason) 8338{ 8339 if (*exceptionObject == NULL) { 8340 *exceptionObject = _PyUnicodeTranslateError_Create( 8341 unicode, startpos, endpos, reason); 8342 } 8343 else { 8344 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos)) 8345 goto onError; 8346 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos)) 8347 goto onError; 8348 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason)) 8349 goto onError; 8350 return; 8351 onError: 8352 Py_DECREF(*exceptionObject); 8353 *exceptionObject = NULL; 8354 } 8355} 8356 8357/* raises a UnicodeTranslateError */ 8358static void 8359raise_translate_exception(PyObject **exceptionObject, 8360 PyObject *unicode, 8361 Py_ssize_t startpos, Py_ssize_t endpos, 8362 const char *reason) 8363{ 8364 make_translate_exception(exceptionObject, 8365 unicode, startpos, endpos, reason); 8366 if (*exceptionObject != NULL) 8367 PyCodec_StrictErrors(*exceptionObject); 8368} 8369 8370/* error handling callback helper: 8371 build arguments, call the callback and check the arguments, 8372 put the result into newpos and return the replacement string, which 8373 has to be freed by the caller */ 8374static PyObject * 8375unicode_translate_call_errorhandler(const char *errors, 8376 PyObject **errorHandler, 8377 const char *reason, 8378 PyObject *unicode, PyObject **exceptionObject, 8379 Py_ssize_t startpos, Py_ssize_t endpos, 8380 Py_ssize_t *newpos) 8381{ 8382 static char *argparse = "O!n;translating error handler must return (str, int) tuple"; 8383 8384 Py_ssize_t i_newpos; 8385 PyObject *restuple; 8386 PyObject *resunicode; 8387 8388 if (*errorHandler == NULL) { 8389 *errorHandler = PyCodec_LookupError(errors); 8390 if (*errorHandler == NULL) 8391 return NULL; 8392 } 8393 8394 make_translate_exception(exceptionObject, 8395 unicode, startpos, endpos, reason); 8396 if 
(*exceptionObject == NULL) 8397 return NULL; 8398 8399 restuple = PyObject_CallFunctionObjArgs( 8400 *errorHandler, *exceptionObject, NULL); 8401 if (restuple == NULL) 8402 return NULL; 8403 if (!PyTuple_Check(restuple)) { 8404 PyErr_SetString(PyExc_TypeError, &argparse[4]); 8405 Py_DECREF(restuple); 8406 return NULL; 8407 } 8408 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, 8409 &resunicode, &i_newpos)) { 8410 Py_DECREF(restuple); 8411 return NULL; 8412 } 8413 if (i_newpos<0) 8414 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos; 8415 else 8416 *newpos = i_newpos; 8417 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) { 8418 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos); 8419 Py_DECREF(restuple); 8420 return NULL; 8421 } 8422 Py_INCREF(resunicode); 8423 Py_DECREF(restuple); 8424 return resunicode; 8425} 8426 8427/* Lookup the character ch in the mapping and put the result in result, 8428 which must be decrefed by the caller. 8429 Return 0 on success, -1 on error */ 8430static int 8431charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result) 8432{ 8433 PyObject *w = PyLong_FromLong((long)c); 8434 PyObject *x; 8435 8436 if (w == NULL) 8437 return -1; 8438 x = PyObject_GetItem(mapping, w); 8439 Py_DECREF(w); 8440 if (x == NULL) { 8441 if (PyErr_ExceptionMatches(PyExc_LookupError)) { 8442 /* No mapping found means: use 1:1 mapping. 
*/ 8443 PyErr_Clear(); 8444 *result = NULL; 8445 return 0; 8446 } else 8447 return -1; 8448 } 8449 else if (x == Py_None) { 8450 *result = x; 8451 return 0; 8452 } 8453 else if (PyLong_Check(x)) { 8454 long value = PyLong_AS_LONG(x); 8455 long max = PyUnicode_GetMax(); 8456 if (value < 0 || value > max) { 8457 PyErr_Format(PyExc_TypeError, 8458 "character mapping must be in range(0x%x)", max+1); 8459 Py_DECREF(x); 8460 return -1; 8461 } 8462 *result = x; 8463 return 0; 8464 } 8465 else if (PyUnicode_Check(x)) { 8466 *result = x; 8467 return 0; 8468 } 8469 else { 8470 /* wrong return value */ 8471 PyErr_SetString(PyExc_TypeError, 8472 "character mapping must return integer, None or str"); 8473 Py_DECREF(x); 8474 return -1; 8475 } 8476} 8477/* ensure that *outobj is at least requiredsize characters long, 8478 if not reallocate and adjust various state variables. 8479 Return 0 on success, -1 on error */ 8480static int 8481charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize, 8482 Py_ssize_t requiredsize) 8483{ 8484 Py_ssize_t oldsize = *psize; 8485 if (requiredsize > oldsize) { 8486 /* exponentially overallocate to minimize reallocations */ 8487 if (requiredsize < 2 * oldsize) 8488 requiredsize = 2 * oldsize; 8489 *outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4)); 8490 if (*outobj == 0) 8491 return -1; 8492 *psize = requiredsize; 8493 } 8494 return 0; 8495} 8496/* lookup the character, put the result in the output string and adjust 8497 various state variables. Return a new reference to the object that 8498 was put in the output buffer in *result, or Py_None, if the mapping was 8499 undefined (in which case no character was written). 8500 The called must decref result. 8501 Return 0 on success, -1 on error. 
*/ 8502static int 8503charmaptranslate_output(PyObject *input, Py_ssize_t ipos, 8504 PyObject *mapping, Py_UCS4 **output, 8505 Py_ssize_t *osize, Py_ssize_t *opos, 8506 PyObject **res) 8507{ 8508 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos); 8509 if (charmaptranslate_lookup(curinp, mapping, res)) 8510 return -1; 8511 if (*res==NULL) { 8512 /* not found => default to 1:1 mapping */ 8513 (*output)[(*opos)++] = curinp; 8514 } 8515 else if (*res==Py_None) 8516 ; 8517 else if (PyLong_Check(*res)) { 8518 /* no overflow check, because we know that the space is enough */ 8519 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res); 8520 } 8521 else if (PyUnicode_Check(*res)) { 8522 Py_ssize_t repsize; 8523 if (PyUnicode_READY(*res) == -1) 8524 return -1; 8525 repsize = PyUnicode_GET_LENGTH(*res); 8526 if (repsize==1) { 8527 /* no overflow check, because we know that the space is enough */ 8528 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0); 8529 } 8530 else if (repsize!=0) { 8531 /* more than one character */ 8532 Py_ssize_t requiredsize = *opos + 8533 (PyUnicode_GET_LENGTH(input) - ipos) + 8534 repsize - 1; 8535 Py_ssize_t i; 8536 if (charmaptranslate_makespace(output, osize, requiredsize)) 8537 return -1; 8538 for(i = 0; i < repsize; i++) 8539 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i); 8540 } 8541 } 8542 else 8543 return -1; 8544 return 0; 8545} 8546 8547PyObject * 8548_PyUnicode_TranslateCharmap(PyObject *input, 8549 PyObject *mapping, 8550 const char *errors) 8551{ 8552 /* input object */ 8553 char *idata; 8554 Py_ssize_t size, i; 8555 int kind; 8556 /* output buffer */ 8557 Py_UCS4 *output = NULL; 8558 Py_ssize_t osize; 8559 PyObject *res; 8560 /* current output position */ 8561 Py_ssize_t opos; 8562 char *reason = "character maps to <undefined>"; 8563 PyObject *errorHandler = NULL; 8564 PyObject *exc = NULL; 8565 /* the following variable is used for caching string comparisons 8566 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 8567 * 
3=ignore, 4=xmlcharrefreplace */ 8568 int known_errorHandler = -1; 8569 8570 if (mapping == NULL) { 8571 PyErr_BadArgument(); 8572 return NULL; 8573 } 8574 8575 if (PyUnicode_READY(input) == -1) 8576 return NULL; 8577 idata = (char*)PyUnicode_DATA(input); 8578 kind = PyUnicode_KIND(input); 8579 size = PyUnicode_GET_LENGTH(input); 8580 i = 0; 8581 8582 if (size == 0) { 8583 Py_INCREF(input); 8584 return input; 8585 } 8586 8587 /* allocate enough for a simple 1:1 translation without 8588 replacements, if we need more, we'll resize */ 8589 osize = size; 8590 output = PyMem_Malloc(osize * sizeof(Py_UCS4)); 8591 opos = 0; 8592 if (output == NULL) { 8593 PyErr_NoMemory(); 8594 goto onError; 8595 } 8596 8597 while (i<size) { 8598 /* try to encode it */ 8599 PyObject *x = NULL; 8600 if (charmaptranslate_output(input, i, mapping, 8601 &output, &osize, &opos, &x)) { 8602 Py_XDECREF(x); 8603 goto onError; 8604 } 8605 Py_XDECREF(x); 8606 if (x!=Py_None) /* it worked => adjust input pointer */ 8607 ++i; 8608 else { /* untranslatable character */ 8609 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */ 8610 Py_ssize_t repsize; 8611 Py_ssize_t newpos; 8612 Py_ssize_t uni2; 8613 /* startpos for collecting untranslatable chars */ 8614 Py_ssize_t collstart = i; 8615 Py_ssize_t collend = i+1; 8616 Py_ssize_t coll; 8617 8618 /* find all untranslatable characters */ 8619 while (collend < size) { 8620 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x)) 8621 goto onError; 8622 Py_XDECREF(x); 8623 if (x!=Py_None) 8624 break; 8625 ++collend; 8626 } 8627 /* cache callback name lookup 8628 * (if not done yet, i.e. 
it's the first error) */ 8629 if (known_errorHandler==-1) { 8630 if ((errors==NULL) || (!strcmp(errors, "strict"))) 8631 known_errorHandler = 1; 8632 else if (!strcmp(errors, "replace")) 8633 known_errorHandler = 2; 8634 else if (!strcmp(errors, "ignore")) 8635 known_errorHandler = 3; 8636 else if (!strcmp(errors, "xmlcharrefreplace")) 8637 known_errorHandler = 4; 8638 else 8639 known_errorHandler = 0; 8640 } 8641 switch (known_errorHandler) { 8642 case 1: /* strict */ 8643 raise_translate_exception(&exc, input, collstart, 8644 collend, reason); 8645 goto onError; 8646 case 2: /* replace */ 8647 /* No need to check for space, this is a 1:1 replacement */ 8648 for (coll = collstart; coll<collend; coll++) 8649 output[opos++] = '?'; 8650 /* fall through */ 8651 case 3: /* ignore */ 8652 i = collend; 8653 break; 8654 case 4: /* xmlcharrefreplace */ 8655 /* generate replacement (temporarily (mis)uses i) */ 8656 for (i = collstart; i < collend; ++i) { 8657 char buffer[2+29+1+1]; 8658 char *cp; 8659 sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i)); 8660 if (charmaptranslate_makespace(&output, &osize, 8661 opos+strlen(buffer)+(size-collend))) 8662 goto onError; 8663 for (cp = buffer; *cp; ++cp) 8664 output[opos++] = *cp; 8665 } 8666 i = collend; 8667 break; 8668 default: 8669 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler, 8670 reason, input, &exc, 8671 collstart, collend, &newpos); 8672 if (repunicode == NULL) 8673 goto onError; 8674 if (PyUnicode_READY(repunicode) < 0) { 8675 Py_DECREF(repunicode); 8676 goto onError; 8677 } 8678 /* generate replacement */ 8679 repsize = PyUnicode_GET_LENGTH(repunicode); 8680 if (charmaptranslate_makespace(&output, &osize, 8681 opos+repsize+(size-collend))) { 8682 Py_DECREF(repunicode); 8683 goto onError; 8684 } 8685 for (uni2 = 0; repsize-->0; ++uni2) 8686 output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2); 8687 i = newpos; 8688 Py_DECREF(repunicode); 8689 } 8690 } 8691 } 8692 res = 
PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos); 8693 if (!res) 8694 goto onError; 8695 PyMem_Free(output); 8696 Py_XDECREF(exc); 8697 Py_XDECREF(errorHandler); 8698 return res; 8699 8700 onError: 8701 PyMem_Free(output); 8702 Py_XDECREF(exc); 8703 Py_XDECREF(errorHandler); 8704 return NULL; 8705} 8706 8707/* Deprecated. Use PyUnicode_Translate instead. */ 8708PyObject * 8709PyUnicode_TranslateCharmap(const Py_UNICODE *p, 8710 Py_ssize_t size, 8711 PyObject *mapping, 8712 const char *errors) 8713{ 8714 PyObject *unicode = PyUnicode_FromUnicode(p, size); 8715 if (!unicode) 8716 return NULL; 8717 return _PyUnicode_TranslateCharmap(unicode, mapping, errors); 8718} 8719 8720PyObject * 8721PyUnicode_Translate(PyObject *str, 8722 PyObject *mapping, 8723 const char *errors) 8724{ 8725 PyObject *result; 8726 8727 str = PyUnicode_FromObject(str); 8728 if (str == NULL) 8729 goto onError; 8730 result = _PyUnicode_TranslateCharmap(str, mapping, errors); 8731 Py_DECREF(str); 8732 return result; 8733 8734 onError: 8735 Py_XDECREF(str); 8736 return NULL; 8737} 8738 8739static Py_UCS4 8740fix_decimal_and_space_to_ascii(PyObject *self) 8741{ 8742 /* No need to call PyUnicode_READY(self) because this function is only 8743 called as a callback from fixup() which does it already. 
*/ 8744 const Py_ssize_t len = PyUnicode_GET_LENGTH(self); 8745 const int kind = PyUnicode_KIND(self); 8746 void *data = PyUnicode_DATA(self); 8747 Py_UCS4 maxchar = 0, ch, fixed; 8748 Py_ssize_t i; 8749 8750 for (i = 0; i < len; ++i) { 8751 ch = PyUnicode_READ(kind, data, i); 8752 fixed = 0; 8753 if (ch > 127) { 8754 if (Py_UNICODE_ISSPACE(ch)) 8755 fixed = ' '; 8756 else { 8757 const int decimal = Py_UNICODE_TODECIMAL(ch); 8758 if (decimal >= 0) 8759 fixed = '0' + decimal; 8760 } 8761 if (fixed != 0) { 8762 if (fixed > maxchar) 8763 maxchar = fixed; 8764 PyUnicode_WRITE(kind, data, i, fixed); 8765 } 8766 else if (ch > maxchar) 8767 maxchar = ch; 8768 } 8769 else if (ch > maxchar) 8770 maxchar = ch; 8771 } 8772 8773 return maxchar; 8774} 8775 8776PyObject * 8777_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode) 8778{ 8779 if (!PyUnicode_Check(unicode)) { 8780 PyErr_BadInternalCall(); 8781 return NULL; 8782 } 8783 if (PyUnicode_READY(unicode) == -1) 8784 return NULL; 8785 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) { 8786 /* If the string is already ASCII, just return the same string */ 8787 Py_INCREF(unicode); 8788 return unicode; 8789 } 8790 return fixup(unicode, fix_decimal_and_space_to_ascii); 8791} 8792 8793PyObject * 8794PyUnicode_TransformDecimalToASCII(Py_UNICODE *s, 8795 Py_ssize_t length) 8796{ 8797 PyObject *decimal; 8798 Py_ssize_t i; 8799 Py_UCS4 maxchar; 8800 enum PyUnicode_Kind kind; 8801 void *data; 8802 8803 maxchar = 0; 8804 for (i = 0; i < length; i++) { 8805 Py_UNICODE ch = s[i]; 8806 if (ch > 127) { 8807 int decimal = Py_UNICODE_TODECIMAL(ch); 8808 if (decimal >= 0) 8809 ch = '0' + decimal; 8810 } 8811 maxchar = Py_MAX(maxchar, ch); 8812 } 8813 8814 /* Copy to a new string */ 8815 decimal = PyUnicode_New(length, maxchar); 8816 if (decimal == NULL) 8817 return decimal; 8818 kind = PyUnicode_KIND(decimal); 8819 data = PyUnicode_DATA(decimal); 8820 /* Iterate over code points */ 8821 for (i = 0; i < length; i++) { 8822 Py_UNICODE ch = 
s[i]; 8823 if (ch > 127) { 8824 int decimal = Py_UNICODE_TODECIMAL(ch); 8825 if (decimal >= 0) 8826 ch = '0' + decimal; 8827 } 8828 PyUnicode_WRITE(kind, data, i, ch); 8829 } 8830 return unicode_result(decimal); 8831} 8832/* --- Decimal Encoder ---------------------------------------------------- */ 8833 8834int 8835PyUnicode_EncodeDecimal(Py_UNICODE *s, 8836 Py_ssize_t length, 8837 char *output, 8838 const char *errors) 8839{ 8840 PyObject *unicode; 8841 Py_ssize_t i; 8842 enum PyUnicode_Kind kind; 8843 void *data; 8844 8845 if (output == NULL) { 8846 PyErr_BadArgument(); 8847 return -1; 8848 } 8849 8850 unicode = PyUnicode_FromUnicode(s, length); 8851 if (unicode == NULL) 8852 return -1; 8853 8854 if (PyUnicode_READY(unicode) < 0) { 8855 Py_DECREF(unicode); 8856 return -1; 8857 } 8858 kind = PyUnicode_KIND(unicode); 8859 data = PyUnicode_DATA(unicode); 8860 8861 for (i=0; i < length; ) { 8862 PyObject *exc; 8863 Py_UCS4 ch; 8864 int decimal; 8865 Py_ssize_t startpos; 8866 8867 ch = PyUnicode_READ(kind, data, i); 8868 8869 if (Py_UNICODE_ISSPACE(ch)) { 8870 *output++ = ' '; 8871 i++; 8872 continue; 8873 } 8874 decimal = Py_UNICODE_TODECIMAL(ch); 8875 if (decimal >= 0) { 8876 *output++ = '0' + decimal; 8877 i++; 8878 continue; 8879 } 8880 if (0 < ch && ch < 256) { 8881 *output++ = (char)ch; 8882 i++; 8883 continue; 8884 } 8885 8886 startpos = i; 8887 exc = NULL; 8888 raise_encode_exception(&exc, "decimal", unicode, 8889 startpos, startpos+1, 8890 "invalid decimal Unicode string"); 8891 Py_XDECREF(exc); 8892 Py_DECREF(unicode); 8893 return -1; 8894 } 8895 /* 0-terminate the output string */ 8896 *output++ = '\0'; 8897 Py_DECREF(unicode); 8898 return 0; 8899} 8900 8901/* --- Helpers ------------------------------------------------------------ */ 8902 8903static Py_ssize_t 8904any_find_slice(int direction, PyObject* s1, PyObject* s2, 8905 Py_ssize_t start, 8906 Py_ssize_t end) 8907{ 8908 int kind1, kind2, kind; 8909 void *buf1, *buf2; 8910 Py_ssize_t len1, len2, 
result; 8911 8912 kind1 = PyUnicode_KIND(s1); 8913 kind2 = PyUnicode_KIND(s2); 8914 kind = kind1 > kind2 ? kind1 : kind2; 8915 buf1 = PyUnicode_DATA(s1); 8916 buf2 = PyUnicode_DATA(s2); 8917 if (kind1 != kind) 8918 buf1 = _PyUnicode_AsKind(s1, kind); 8919 if (!buf1) 8920 return -2; 8921 if (kind2 != kind) 8922 buf2 = _PyUnicode_AsKind(s2, kind); 8923 if (!buf2) { 8924 if (kind1 != kind) PyMem_Free(buf1); 8925 return -2; 8926 } 8927 len1 = PyUnicode_GET_LENGTH(s1); 8928 len2 = PyUnicode_GET_LENGTH(s2); 8929 8930 if (direction > 0) { 8931 switch(kind) { 8932 case PyUnicode_1BYTE_KIND: 8933 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2)) 8934 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end); 8935 else 8936 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end); 8937 break; 8938 case PyUnicode_2BYTE_KIND: 8939 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end); 8940 break; 8941 case PyUnicode_4BYTE_KIND: 8942 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end); 8943 break; 8944 default: 8945 assert(0); result = -2; 8946 } 8947 } 8948 else { 8949 switch(kind) { 8950 case PyUnicode_1BYTE_KIND: 8951 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2)) 8952 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end); 8953 else 8954 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end); 8955 break; 8956 case PyUnicode_2BYTE_KIND: 8957 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end); 8958 break; 8959 case PyUnicode_4BYTE_KIND: 8960 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end); 8961 break; 8962 default: 8963 assert(0); result = -2; 8964 } 8965 } 8966 8967 if (kind1 != kind) 8968 PyMem_Free(buf1); 8969 if (kind2 != kind) 8970 PyMem_Free(buf2); 8971 8972 return result; 8973} 8974 8975Py_ssize_t 8976_PyUnicode_InsertThousandsGrouping(PyObject *unicode, int kind, void *data, 8977 Py_ssize_t n_buffer, 8978 void *digits, Py_ssize_t n_digits, 8979 Py_ssize_t min_width, 8980 
const char *grouping, 8981 const char *thousands_sep) 8982{ 8983 switch(kind) { 8984 case PyUnicode_1BYTE_KIND: 8985 if (unicode != NULL && PyUnicode_IS_ASCII(unicode)) 8986 return _PyUnicode_ascii_InsertThousandsGrouping( 8987 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits, 8988 min_width, grouping, thousands_sep); 8989 else 8990 return _PyUnicode_ucs1_InsertThousandsGrouping( 8991 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits, 8992 min_width, grouping, thousands_sep); 8993 case PyUnicode_2BYTE_KIND: 8994 return _PyUnicode_ucs2_InsertThousandsGrouping( 8995 (Py_UCS2*)data, n_buffer, (Py_UCS2*)digits, n_digits, 8996 min_width, grouping, thousands_sep); 8997 case PyUnicode_4BYTE_KIND: 8998 return _PyUnicode_ucs4_InsertThousandsGrouping( 8999 (Py_UCS4*)data, n_buffer, (Py_UCS4*)digits, n_digits, 9000 min_width, grouping, thousands_sep); 9001 } 9002 assert(0); 9003 return -1; 9004} 9005 9006 9007/* helper macro to fixup start/end slice values */ 9008#define ADJUST_INDICES(start, end, len) \ 9009 if (end > len) \ 9010 end = len; \ 9011 else if (end < 0) { \ 9012 end += len; \ 9013 if (end < 0) \ 9014 end = 0; \ 9015 } \ 9016 if (start < 0) { \ 9017 start += len; \ 9018 if (start < 0) \ 9019 start = 0; \ 9020 } 9021 9022Py_ssize_t 9023PyUnicode_Count(PyObject *str, 9024 PyObject *substr, 9025 Py_ssize_t start, 9026 Py_ssize_t end) 9027{ 9028 Py_ssize_t result; 9029 PyObject* str_obj; 9030 PyObject* sub_obj; 9031 int kind1, kind2, kind; 9032 void *buf1 = NULL, *buf2 = NULL; 9033 Py_ssize_t len1, len2; 9034 9035 str_obj = PyUnicode_FromObject(str); 9036 if (!str_obj || PyUnicode_READY(str_obj) == -1) 9037 return -1; 9038 sub_obj = PyUnicode_FromObject(substr); 9039 if (!sub_obj || PyUnicode_READY(sub_obj) == -1) { 9040 Py_DECREF(str_obj); 9041 return -1; 9042 } 9043 9044 kind1 = PyUnicode_KIND(str_obj); 9045 kind2 = PyUnicode_KIND(sub_obj); 9046 kind = kind1 > kind2 ? 
kind1 : kind2; 9047 buf1 = PyUnicode_DATA(str_obj); 9048 if (kind1 != kind) 9049 buf1 = _PyUnicode_AsKind(str_obj, kind); 9050 if (!buf1) 9051 goto onError; 9052 buf2 = PyUnicode_DATA(sub_obj); 9053 if (kind2 != kind) 9054 buf2 = _PyUnicode_AsKind(sub_obj, kind); 9055 if (!buf2) 9056 goto onError; 9057 len1 = PyUnicode_GET_LENGTH(str_obj); 9058 len2 = PyUnicode_GET_LENGTH(sub_obj); 9059 9060 ADJUST_INDICES(start, end, len1); 9061 switch(kind) { 9062 case PyUnicode_1BYTE_KIND: 9063 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sub_obj)) 9064 result = asciilib_count( 9065 ((Py_UCS1*)buf1) + start, end - start, 9066 buf2, len2, PY_SSIZE_T_MAX 9067 ); 9068 else 9069 result = ucs1lib_count( 9070 ((Py_UCS1*)buf1) + start, end - start, 9071 buf2, len2, PY_SSIZE_T_MAX 9072 ); 9073 break; 9074 case PyUnicode_2BYTE_KIND: 9075 result = ucs2lib_count( 9076 ((Py_UCS2*)buf1) + start, end - start, 9077 buf2, len2, PY_SSIZE_T_MAX 9078 ); 9079 break; 9080 case PyUnicode_4BYTE_KIND: 9081 result = ucs4lib_count( 9082 ((Py_UCS4*)buf1) + start, end - start, 9083 buf2, len2, PY_SSIZE_T_MAX 9084 ); 9085 break; 9086 default: 9087 assert(0); result = 0; 9088 } 9089 9090 Py_DECREF(sub_obj); 9091 Py_DECREF(str_obj); 9092 9093 if (kind1 != kind) 9094 PyMem_Free(buf1); 9095 if (kind2 != kind) 9096 PyMem_Free(buf2); 9097 9098 return result; 9099 onError: 9100 Py_DECREF(sub_obj); 9101 Py_DECREF(str_obj); 9102 if (kind1 != kind && buf1) 9103 PyMem_Free(buf1); 9104 if (kind2 != kind && buf2) 9105 PyMem_Free(buf2); 9106 return -1; 9107} 9108 9109Py_ssize_t 9110PyUnicode_Find(PyObject *str, 9111 PyObject *sub, 9112 Py_ssize_t start, 9113 Py_ssize_t end, 9114 int direction) 9115{ 9116 Py_ssize_t result; 9117 9118 str = PyUnicode_FromObject(str); 9119 if (!str || PyUnicode_READY(str) == -1) 9120 return -2; 9121 sub = PyUnicode_FromObject(sub); 9122 if (!sub || PyUnicode_READY(sub) == -1) { 9123 Py_DECREF(str); 9124 return -2; 9125 } 9126 9127 result = any_find_slice(direction, 9128 str, sub, 
start, end 9129 ); 9130 9131 Py_DECREF(str); 9132 Py_DECREF(sub); 9133 9134 return result; 9135} 9136 9137Py_ssize_t 9138PyUnicode_FindChar(PyObject *str, Py_UCS4 ch, 9139 Py_ssize_t start, Py_ssize_t end, 9140 int direction) 9141{ 9142 int kind; 9143 Py_ssize_t result; 9144 if (PyUnicode_READY(str) == -1) 9145 return -2; 9146 if (start < 0 || end < 0) { 9147 PyErr_SetString(PyExc_IndexError, "string index out of range"); 9148 return -2; 9149 } 9150 if (end > PyUnicode_GET_LENGTH(str)) 9151 end = PyUnicode_GET_LENGTH(str); 9152 kind = PyUnicode_KIND(str); 9153 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start, 9154 kind, end-start, ch, direction); 9155 if (result == -1) 9156 return -1; 9157 else 9158 return start + result; 9159} 9160 9161static int 9162tailmatch(PyObject *self, 9163 PyObject *substring, 9164 Py_ssize_t start, 9165 Py_ssize_t end, 9166 int direction) 9167{ 9168 int kind_self; 9169 int kind_sub; 9170 void *data_self; 9171 void *data_sub; 9172 Py_ssize_t offset; 9173 Py_ssize_t i; 9174 Py_ssize_t end_sub; 9175 9176 if (PyUnicode_READY(self) == -1 || 9177 PyUnicode_READY(substring) == -1) 9178 return 0; 9179 9180 if (PyUnicode_GET_LENGTH(substring) == 0) 9181 return 1; 9182 9183 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self)); 9184 end -= PyUnicode_GET_LENGTH(substring); 9185 if (end < start) 9186 return 0; 9187 9188 kind_self = PyUnicode_KIND(self); 9189 data_self = PyUnicode_DATA(self); 9190 kind_sub = PyUnicode_KIND(substring); 9191 data_sub = PyUnicode_DATA(substring); 9192 end_sub = PyUnicode_GET_LENGTH(substring) - 1; 9193 9194 if (direction > 0) 9195 offset = end; 9196 else 9197 offset = start; 9198 9199 if (PyUnicode_READ(kind_self, data_self, offset) == 9200 PyUnicode_READ(kind_sub, data_sub, 0) && 9201 PyUnicode_READ(kind_self, data_self, offset + end_sub) == 9202 PyUnicode_READ(kind_sub, data_sub, end_sub)) { 9203 /* If both are of the same kind, memcmp is sufficient */ 9204 if (kind_self == kind_sub) { 9205 return ! 
memcmp((char *)data_self + 9206 (offset * PyUnicode_KIND(substring)), 9207 data_sub, 9208 PyUnicode_GET_LENGTH(substring) * 9209 PyUnicode_KIND(substring)); 9210 } 9211 /* otherwise we have to compare each character by first accesing it */ 9212 else { 9213 /* We do not need to compare 0 and len(substring)-1 because 9214 the if statement above ensured already that they are equal 9215 when we end up here. */ 9216 // TODO: honor direction and do a forward or backwards search 9217 for (i = 1; i < end_sub; ++i) { 9218 if (PyUnicode_READ(kind_self, data_self, offset + i) != 9219 PyUnicode_READ(kind_sub, data_sub, i)) 9220 return 0; 9221 } 9222 return 1; 9223 } 9224 } 9225 9226 return 0; 9227} 9228 9229Py_ssize_t 9230PyUnicode_Tailmatch(PyObject *str, 9231 PyObject *substr, 9232 Py_ssize_t start, 9233 Py_ssize_t end, 9234 int direction) 9235{ 9236 Py_ssize_t result; 9237 9238 str = PyUnicode_FromObject(str); 9239 if (str == NULL) 9240 return -1; 9241 substr = PyUnicode_FromObject(substr); 9242 if (substr == NULL) { 9243 Py_DECREF(str); 9244 return -1; 9245 } 9246 9247 result = tailmatch(str, substr, 9248 start, end, direction); 9249 Py_DECREF(str); 9250 Py_DECREF(substr); 9251 return result; 9252} 9253 9254/* Apply fixfct filter to the Unicode object self and return a 9255 reference to the modified object */ 9256 9257static PyObject * 9258fixup(PyObject *self, 9259 Py_UCS4 (*fixfct)(PyObject *s)) 9260{ 9261 PyObject *u; 9262 Py_UCS4 maxchar_old, maxchar_new = 0; 9263 9264 u = PyUnicode_Copy(self); 9265 if (u == NULL) 9266 return NULL; 9267 maxchar_old = PyUnicode_MAX_CHAR_VALUE(u); 9268 9269 /* fix functions return the new maximum character in a string, 9270 if the kind of the resulting unicode object does not change, 9271 everything is fine. Otherwise we need to change the string kind 9272 and re-run the fix function. */ 9273 maxchar_new = fixfct(u); 9274 if (maxchar_new == 0) 9275 /* do nothing, keep maxchar_new at 0 which means no changes. 
*/; 9276 else if (maxchar_new <= 127) 9277 maxchar_new = 127; 9278 else if (maxchar_new <= 255) 9279 maxchar_new = 255; 9280 else if (maxchar_new <= 65535) 9281 maxchar_new = 65535; 9282 else 9283 maxchar_new = MAX_UNICODE; 9284 9285 if (!maxchar_new && PyUnicode_CheckExact(self)) { 9286 /* fixfct should return TRUE if it modified the buffer. If 9287 FALSE, return a reference to the original buffer instead 9288 (to save space, not time) */ 9289 Py_INCREF(self); 9290 Py_DECREF(u); 9291 return self; 9292 } 9293 else if (maxchar_new == maxchar_old) { 9294 return u; 9295 } 9296 else { 9297 /* In case the maximum character changed, we need to 9298 convert the string to the new category. */ 9299 PyObject *v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new); 9300 if (v == NULL) { 9301 Py_DECREF(u); 9302 return NULL; 9303 } 9304 if (maxchar_new > maxchar_old) { 9305 /* If the maxchar increased so that the kind changed, not all 9306 characters are representable anymore and we need to fix the 9307 string again. This only happens in very few cases. */ 9308 copy_characters(v, 0, self, 0, PyUnicode_GET_LENGTH(self)); 9309 maxchar_old = fixfct(v); 9310 assert(maxchar_old > 0 && maxchar_old <= maxchar_new); 9311 } 9312 else { 9313 copy_characters(v, 0, u, 0, PyUnicode_GET_LENGTH(self)); 9314 } 9315 9316 Py_DECREF(u); 9317 assert(_PyUnicode_CheckConsistency(v, 1)); 9318 return v; 9319 } 9320} 9321 9322static Py_UCS4 9323fixupper(PyObject *self) 9324{ 9325 /* No need to call PyUnicode_READY(self) because this function is only 9326 called as a callback from fixup() which does it already. 
*/ 9327 const Py_ssize_t len = PyUnicode_GET_LENGTH(self); 9328 const int kind = PyUnicode_KIND(self); 9329 void *data = PyUnicode_DATA(self); 9330 int touched = 0; 9331 Py_UCS4 maxchar = 0; 9332 Py_ssize_t i; 9333 9334 for (i = 0; i < len; ++i) { 9335 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); 9336 const Py_UCS4 up = Py_UNICODE_TOUPPER(ch); 9337 if (up != ch) { 9338 if (up > maxchar) 9339 maxchar = up; 9340 PyUnicode_WRITE(kind, data, i, up); 9341 touched = 1; 9342 } 9343 else if (ch > maxchar) 9344 maxchar = ch; 9345 } 9346 9347 if (touched) 9348 return maxchar; 9349 else 9350 return 0; 9351} 9352 9353static Py_UCS4 9354fixlower(PyObject *self) 9355{ 9356 /* No need to call PyUnicode_READY(self) because fixup() which does it. */ 9357 const Py_ssize_t len = PyUnicode_GET_LENGTH(self); 9358 const int kind = PyUnicode_KIND(self); 9359 void *data = PyUnicode_DATA(self); 9360 int touched = 0; 9361 Py_UCS4 maxchar = 0; 9362 Py_ssize_t i; 9363 9364 for(i = 0; i < len; ++i) { 9365 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); 9366 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch); 9367 if (lo != ch) { 9368 if (lo > maxchar) 9369 maxchar = lo; 9370 PyUnicode_WRITE(kind, data, i, lo); 9371 touched = 1; 9372 } 9373 else if (ch > maxchar) 9374 maxchar = ch; 9375 } 9376 9377 if (touched) 9378 return maxchar; 9379 else 9380 return 0; 9381} 9382 9383static Py_UCS4 9384fixswapcase(PyObject *self) 9385{ 9386 /* No need to call PyUnicode_READY(self) because fixup() which does it. 
*/ 9387 const Py_ssize_t len = PyUnicode_GET_LENGTH(self); 9388 const int kind = PyUnicode_KIND(self); 9389 void *data = PyUnicode_DATA(self); 9390 int touched = 0; 9391 Py_UCS4 maxchar = 0; 9392 Py_ssize_t i; 9393 9394 for(i = 0; i < len; ++i) { 9395 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); 9396 Py_UCS4 nu = 0; 9397 9398 if (Py_UNICODE_ISUPPER(ch)) 9399 nu = Py_UNICODE_TOLOWER(ch); 9400 else if (Py_UNICODE_ISLOWER(ch)) 9401 nu = Py_UNICODE_TOUPPER(ch); 9402 9403 if (nu != 0) { 9404 if (nu > maxchar) 9405 maxchar = nu; 9406 PyUnicode_WRITE(kind, data, i, nu); 9407 touched = 1; 9408 } 9409 else if (ch > maxchar) 9410 maxchar = ch; 9411 } 9412 9413 if (touched) 9414 return maxchar; 9415 else 9416 return 0; 9417} 9418 9419static Py_UCS4 9420fixcapitalize(PyObject *self) 9421{ 9422 /* No need to call PyUnicode_READY(self) because fixup() which does it. */ 9423 const Py_ssize_t len = PyUnicode_GET_LENGTH(self); 9424 const int kind = PyUnicode_KIND(self); 9425 void *data = PyUnicode_DATA(self); 9426 int touched = 0; 9427 Py_UCS4 maxchar = 0; 9428 Py_ssize_t i = 0; 9429 Py_UCS4 ch; 9430 9431 if (len == 0) 9432 return 0; 9433 9434 ch = PyUnicode_READ(kind, data, i); 9435 if (!Py_UNICODE_ISUPPER(ch)) { 9436 maxchar = Py_UNICODE_TOUPPER(ch); 9437 PyUnicode_WRITE(kind, data, i, maxchar); 9438 touched = 1; 9439 } 9440 ++i; 9441 for(; i < len; ++i) { 9442 ch = PyUnicode_READ(kind, data, i); 9443 if (!Py_UNICODE_ISLOWER(ch)) { 9444 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch); 9445 if (lo > maxchar) 9446 maxchar = lo; 9447 PyUnicode_WRITE(kind, data, i, lo); 9448 touched = 1; 9449 } 9450 else if (ch > maxchar) 9451 maxchar = ch; 9452 } 9453 9454 if (touched) 9455 return maxchar; 9456 else 9457 return 0; 9458} 9459 9460static Py_UCS4 9461fixtitle(PyObject *self) 9462{ 9463 /* No need to call PyUnicode_READY(self) because fixup() which does it. 
*/ 9464 const Py_ssize_t len = PyUnicode_GET_LENGTH(self); 9465 const int kind = PyUnicode_KIND(self); 9466 void *data = PyUnicode_DATA(self); 9467 Py_UCS4 maxchar = 0; 9468 Py_ssize_t i = 0; 9469 int previous_is_cased; 9470 9471 /* Shortcut for single character strings */ 9472 if (len == 1) { 9473 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); 9474 const Py_UCS4 ti = Py_UNICODE_TOTITLE(ch); 9475 if (ti != ch) { 9476 PyUnicode_WRITE(kind, data, i, ti); 9477 return ti; 9478 } 9479 else 9480 return 0; 9481 } 9482 previous_is_cased = 0; 9483 for(; i < len; ++i) { 9484 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); 9485 Py_UCS4 nu; 9486 9487 if (previous_is_cased) 9488 nu = Py_UNICODE_TOLOWER(ch); 9489 else 9490 nu = Py_UNICODE_TOTITLE(ch); 9491 9492 if (nu > maxchar) 9493 maxchar = nu; 9494 PyUnicode_WRITE(kind, data, i, nu); 9495 9496 if (Py_UNICODE_ISLOWER(ch) || 9497 Py_UNICODE_ISUPPER(ch) || 9498 Py_UNICODE_ISTITLE(ch)) 9499 previous_is_cased = 1; 9500 else 9501 previous_is_cased = 0; 9502 } 9503 return maxchar; 9504} 9505 9506PyObject * 9507PyUnicode_Join(PyObject *separator, PyObject *seq) 9508{ 9509 PyObject *sep = NULL; 9510 Py_ssize_t seplen; 9511 PyObject *res = NULL; /* the result */ 9512 PyObject *fseq; /* PySequence_Fast(seq) */ 9513 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */ 9514 PyObject **items; 9515 PyObject *item; 9516 Py_ssize_t sz, i, res_offset; 9517 Py_UCS4 maxchar; 9518 Py_UCS4 item_maxchar; 9519 int use_memcpy; 9520 unsigned char *res_data = NULL, *sep_data = NULL; 9521 PyObject *last_obj; 9522 unsigned int kind = 0; 9523 9524 fseq = PySequence_Fast(seq, ""); 9525 if (fseq == NULL) { 9526 return NULL; 9527 } 9528 9529 /* NOTE: the following code can't call back into Python code, 9530 * so we are sure that fseq won't be mutated. 9531 */ 9532 9533 seqlen = PySequence_Fast_GET_SIZE(fseq); 9534 /* If empty sequence, return u"". 
*/ 9535 if (seqlen == 0) { 9536 Py_DECREF(fseq); 9537 Py_INCREF(unicode_empty); 9538 res = unicode_empty; 9539 return res; 9540 } 9541 9542 /* If singleton sequence with an exact Unicode, return that. */ 9543 last_obj = NULL; 9544 items = PySequence_Fast_ITEMS(fseq); 9545 if (seqlen == 1) { 9546 if (PyUnicode_CheckExact(items[0])) { 9547 res = items[0]; 9548 Py_INCREF(res); 9549 Py_DECREF(fseq); 9550 return res; 9551 } 9552 seplen = 0; 9553 maxchar = 0; 9554 } 9555 else { 9556 /* Set up sep and seplen */ 9557 if (separator == NULL) { 9558 /* fall back to a blank space separator */ 9559 sep = PyUnicode_FromOrdinal(' '); 9560 if (!sep) 9561 goto onError; 9562 seplen = 1; 9563 maxchar = 32; 9564 } 9565 else { 9566 if (!PyUnicode_Check(separator)) { 9567 PyErr_Format(PyExc_TypeError, 9568 "separator: expected str instance," 9569 " %.80s found", 9570 Py_TYPE(separator)->tp_name); 9571 goto onError; 9572 } 9573 if (PyUnicode_READY(separator)) 9574 goto onError; 9575 sep = separator; 9576 seplen = PyUnicode_GET_LENGTH(separator); 9577 maxchar = PyUnicode_MAX_CHAR_VALUE(separator); 9578 /* inc refcount to keep this code path symmetric with the 9579 above case of a blank separator */ 9580 Py_INCREF(sep); 9581 } 9582 last_obj = sep; 9583 } 9584 9585 /* There are at least two things to join, or else we have a subclass 9586 * of str in the sequence. 9587 * Do a pre-pass to figure out the total amount of space we'll 9588 * need (sz), and see whether all argument are strings. 
9589 */ 9590 sz = 0; 9591#ifdef Py_DEBUG 9592 use_memcpy = 0; 9593#else 9594 use_memcpy = 1; 9595#endif 9596 for (i = 0; i < seqlen; i++) { 9597 const Py_ssize_t old_sz = sz; 9598 item = items[i]; 9599 if (!PyUnicode_Check(item)) { 9600 PyErr_Format(PyExc_TypeError, 9601 "sequence item %zd: expected str instance," 9602 " %.80s found", 9603 i, Py_TYPE(item)->tp_name); 9604 goto onError; 9605 } 9606 if (PyUnicode_READY(item) == -1) 9607 goto onError; 9608 sz += PyUnicode_GET_LENGTH(item); 9609 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item); 9610 maxchar = Py_MAX(maxchar, item_maxchar); 9611 if (i != 0) 9612 sz += seplen; 9613 if (sz < old_sz || sz > PY_SSIZE_T_MAX) { 9614 PyErr_SetString(PyExc_OverflowError, 9615 "join() result is too long for a Python string"); 9616 goto onError; 9617 } 9618 if (use_memcpy && last_obj != NULL) { 9619 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item)) 9620 use_memcpy = 0; 9621 } 9622 last_obj = item; 9623 } 9624 9625 res = PyUnicode_New(sz, maxchar); 9626 if (res == NULL) 9627 goto onError; 9628 9629 /* Catenate everything. */ 9630#ifdef Py_DEBUG 9631 use_memcpy = 0; 9632#else 9633 if (use_memcpy) { 9634 res_data = PyUnicode_1BYTE_DATA(res); 9635 kind = PyUnicode_KIND(res); 9636 if (seplen != 0) 9637 sep_data = PyUnicode_1BYTE_DATA(sep); 9638 } 9639#endif 9640 for (i = 0, res_offset = 0; i < seqlen; ++i) { 9641 Py_ssize_t itemlen; 9642 item = items[i]; 9643 /* Copy item, and maybe the separator. 
*/ 9644 if (i && seplen != 0) { 9645 if (use_memcpy) { 9646 Py_MEMCPY(res_data, 9647 sep_data, 9648 kind * seplen); 9649 res_data += kind * seplen; 9650 } 9651 else { 9652 copy_characters(res, res_offset, sep, 0, seplen); 9653 res_offset += seplen; 9654 } 9655 } 9656 itemlen = PyUnicode_GET_LENGTH(item); 9657 if (itemlen != 0) { 9658 if (use_memcpy) { 9659 Py_MEMCPY(res_data, 9660 PyUnicode_DATA(item), 9661 kind * itemlen); 9662 res_data += kind * itemlen; 9663 } 9664 else { 9665 copy_characters(res, res_offset, item, 0, itemlen); 9666 res_offset += itemlen; 9667 } 9668 } 9669 } 9670 if (use_memcpy) 9671 assert(res_data == PyUnicode_1BYTE_DATA(res) 9672 + kind * PyUnicode_GET_LENGTH(res)); 9673 else 9674 assert(res_offset == PyUnicode_GET_LENGTH(res)); 9675 9676 Py_DECREF(fseq); 9677 Py_XDECREF(sep); 9678 assert(_PyUnicode_CheckConsistency(res, 1)); 9679 return res; 9680 9681 onError: 9682 Py_DECREF(fseq); 9683 Py_XDECREF(sep); 9684 Py_XDECREF(res); 9685 return NULL; 9686} 9687 9688#define FILL(kind, data, value, start, length) \ 9689 do { \ 9690 Py_ssize_t i_ = 0; \ 9691 assert(kind != PyUnicode_WCHAR_KIND); \ 9692 switch ((kind)) { \ 9693 case PyUnicode_1BYTE_KIND: { \ 9694 unsigned char * to_ = (unsigned char *)((data)) + (start); \ 9695 memset(to_, (unsigned char)value, length); \ 9696 break; \ 9697 } \ 9698 case PyUnicode_2BYTE_KIND: { \ 9699 Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \ 9700 for (; i_ < (length); ++i_, ++to_) *to_ = (value); \ 9701 break; \ 9702 } \ 9703 default: { \ 9704 Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \ 9705 for (; i_ < (length); ++i_, ++to_) *to_ = (value); \ 9706 break; \ 9707 } \ 9708 } \ 9709 } while (0) 9710 9711static PyObject * 9712pad(PyObject *self, 9713 Py_ssize_t left, 9714 Py_ssize_t right, 9715 Py_UCS4 fill) 9716{ 9717 PyObject *u; 9718 Py_UCS4 maxchar; 9719 int kind; 9720 void *data; 9721 9722 if (left < 0) 9723 left = 0; 9724 if (right < 0) 9725 right = 0; 9726 9727 if (left == 0 && right == 0 && 
PyUnicode_CheckExact(self)) { 9728 Py_INCREF(self); 9729 return self; 9730 } 9731 9732 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) || 9733 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) { 9734 PyErr_SetString(PyExc_OverflowError, "padded string is too long"); 9735 return NULL; 9736 } 9737 maxchar = PyUnicode_MAX_CHAR_VALUE(self); 9738 if (fill > maxchar) 9739 maxchar = fill; 9740 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar); 9741 if (!u) 9742 return NULL; 9743 9744 kind = PyUnicode_KIND(u); 9745 data = PyUnicode_DATA(u); 9746 if (left) 9747 FILL(kind, data, fill, 0, left); 9748 if (right) 9749 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right); 9750 copy_characters(u, left, self, 0, _PyUnicode_LENGTH(self)); 9751 assert(_PyUnicode_CheckConsistency(u, 1)); 9752 return u; 9753} 9754#undef FILL 9755 9756PyObject * 9757PyUnicode_Splitlines(PyObject *string, int keepends) 9758{ 9759 PyObject *list; 9760 9761 string = PyUnicode_FromObject(string); 9762 if (string == NULL || PyUnicode_READY(string) == -1) 9763 return NULL; 9764 9765 switch(PyUnicode_KIND(string)) { 9766 case PyUnicode_1BYTE_KIND: 9767 if (PyUnicode_IS_ASCII(string)) 9768 list = asciilib_splitlines( 9769 string, PyUnicode_1BYTE_DATA(string), 9770 PyUnicode_GET_LENGTH(string), keepends); 9771 else 9772 list = ucs1lib_splitlines( 9773 string, PyUnicode_1BYTE_DATA(string), 9774 PyUnicode_GET_LENGTH(string), keepends); 9775 break; 9776 case PyUnicode_2BYTE_KIND: 9777 list = ucs2lib_splitlines( 9778 string, PyUnicode_2BYTE_DATA(string), 9779 PyUnicode_GET_LENGTH(string), keepends); 9780 break; 9781 case PyUnicode_4BYTE_KIND: 9782 list = ucs4lib_splitlines( 9783 string, PyUnicode_4BYTE_DATA(string), 9784 PyUnicode_GET_LENGTH(string), keepends); 9785 break; 9786 default: 9787 assert(0); 9788 list = 0; 9789 } 9790 Py_DECREF(string); 9791 return list; 9792} 9793 9794static PyObject * 9795split(PyObject *self, 9796 PyObject *substring, 9797 Py_ssize_t maxcount) 
9798{ 9799 int kind1, kind2, kind; 9800 void *buf1, *buf2; 9801 Py_ssize_t len1, len2; 9802 PyObject* out; 9803 9804 if (maxcount < 0) 9805 maxcount = PY_SSIZE_T_MAX; 9806 9807 if (PyUnicode_READY(self) == -1) 9808 return NULL; 9809 9810 if (substring == NULL) 9811 switch(PyUnicode_KIND(self)) { 9812 case PyUnicode_1BYTE_KIND: 9813 if (PyUnicode_IS_ASCII(self)) 9814 return asciilib_split_whitespace( 9815 self, PyUnicode_1BYTE_DATA(self), 9816 PyUnicode_GET_LENGTH(self), maxcount 9817 ); 9818 else 9819 return ucs1lib_split_whitespace( 9820 self, PyUnicode_1BYTE_DATA(self), 9821 PyUnicode_GET_LENGTH(self), maxcount 9822 ); 9823 case PyUnicode_2BYTE_KIND: 9824 return ucs2lib_split_whitespace( 9825 self, PyUnicode_2BYTE_DATA(self), 9826 PyUnicode_GET_LENGTH(self), maxcount 9827 ); 9828 case PyUnicode_4BYTE_KIND: 9829 return ucs4lib_split_whitespace( 9830 self, PyUnicode_4BYTE_DATA(self), 9831 PyUnicode_GET_LENGTH(self), maxcount 9832 ); 9833 default: 9834 assert(0); 9835 return NULL; 9836 } 9837 9838 if (PyUnicode_READY(substring) == -1) 9839 return NULL; 9840 9841 kind1 = PyUnicode_KIND(self); 9842 kind2 = PyUnicode_KIND(substring); 9843 kind = kind1 > kind2 ? 
kind1 : kind2; 9844 buf1 = PyUnicode_DATA(self); 9845 buf2 = PyUnicode_DATA(substring); 9846 if (kind1 != kind) 9847 buf1 = _PyUnicode_AsKind(self, kind); 9848 if (!buf1) 9849 return NULL; 9850 if (kind2 != kind) 9851 buf2 = _PyUnicode_AsKind(substring, kind); 9852 if (!buf2) { 9853 if (kind1 != kind) PyMem_Free(buf1); 9854 return NULL; 9855 } 9856 len1 = PyUnicode_GET_LENGTH(self); 9857 len2 = PyUnicode_GET_LENGTH(substring); 9858 9859 switch(kind) { 9860 case PyUnicode_1BYTE_KIND: 9861 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring)) 9862 out = asciilib_split( 9863 self, buf1, len1, buf2, len2, maxcount); 9864 else 9865 out = ucs1lib_split( 9866 self, buf1, len1, buf2, len2, maxcount); 9867 break; 9868 case PyUnicode_2BYTE_KIND: 9869 out = ucs2lib_split( 9870 self, buf1, len1, buf2, len2, maxcount); 9871 break; 9872 case PyUnicode_4BYTE_KIND: 9873 out = ucs4lib_split( 9874 self, buf1, len1, buf2, len2, maxcount); 9875 break; 9876 default: 9877 out = NULL; 9878 } 9879 if (kind1 != kind) 9880 PyMem_Free(buf1); 9881 if (kind2 != kind) 9882 PyMem_Free(buf2); 9883 return out; 9884} 9885 9886static PyObject * 9887rsplit(PyObject *self, 9888 PyObject *substring, 9889 Py_ssize_t maxcount) 9890{ 9891 int kind1, kind2, kind; 9892 void *buf1, *buf2; 9893 Py_ssize_t len1, len2; 9894 PyObject* out; 9895 9896 if (maxcount < 0) 9897 maxcount = PY_SSIZE_T_MAX; 9898 9899 if (PyUnicode_READY(self) == -1) 9900 return NULL; 9901 9902 if (substring == NULL) 9903 switch(PyUnicode_KIND(self)) { 9904 case PyUnicode_1BYTE_KIND: 9905 if (PyUnicode_IS_ASCII(self)) 9906 return asciilib_rsplit_whitespace( 9907 self, PyUnicode_1BYTE_DATA(self), 9908 PyUnicode_GET_LENGTH(self), maxcount 9909 ); 9910 else 9911 return ucs1lib_rsplit_whitespace( 9912 self, PyUnicode_1BYTE_DATA(self), 9913 PyUnicode_GET_LENGTH(self), maxcount 9914 ); 9915 case PyUnicode_2BYTE_KIND: 9916 return ucs2lib_rsplit_whitespace( 9917 self, PyUnicode_2BYTE_DATA(self), 9918 PyUnicode_GET_LENGTH(self), maxcount 
9919 ); 9920 case PyUnicode_4BYTE_KIND: 9921 return ucs4lib_rsplit_whitespace( 9922 self, PyUnicode_4BYTE_DATA(self), 9923 PyUnicode_GET_LENGTH(self), maxcount 9924 ); 9925 default: 9926 assert(0); 9927 return NULL; 9928 } 9929 9930 if (PyUnicode_READY(substring) == -1) 9931 return NULL; 9932 9933 kind1 = PyUnicode_KIND(self); 9934 kind2 = PyUnicode_KIND(substring); 9935 kind = kind1 > kind2 ? kind1 : kind2; 9936 buf1 = PyUnicode_DATA(self); 9937 buf2 = PyUnicode_DATA(substring); 9938 if (kind1 != kind) 9939 buf1 = _PyUnicode_AsKind(self, kind); 9940 if (!buf1) 9941 return NULL; 9942 if (kind2 != kind) 9943 buf2 = _PyUnicode_AsKind(substring, kind); 9944 if (!buf2) { 9945 if (kind1 != kind) PyMem_Free(buf1); 9946 return NULL; 9947 } 9948 len1 = PyUnicode_GET_LENGTH(self); 9949 len2 = PyUnicode_GET_LENGTH(substring); 9950 9951 switch(kind) { 9952 case PyUnicode_1BYTE_KIND: 9953 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring)) 9954 out = asciilib_rsplit( 9955 self, buf1, len1, buf2, len2, maxcount); 9956 else 9957 out = ucs1lib_rsplit( 9958 self, buf1, len1, buf2, len2, maxcount); 9959 break; 9960 case PyUnicode_2BYTE_KIND: 9961 out = ucs2lib_rsplit( 9962 self, buf1, len1, buf2, len2, maxcount); 9963 break; 9964 case PyUnicode_4BYTE_KIND: 9965 out = ucs4lib_rsplit( 9966 self, buf1, len1, buf2, len2, maxcount); 9967 break; 9968 default: 9969 out = NULL; 9970 } 9971 if (kind1 != kind) 9972 PyMem_Free(buf1); 9973 if (kind2 != kind) 9974 PyMem_Free(buf2); 9975 return out; 9976} 9977 9978static Py_ssize_t 9979anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1, 9980 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset) 9981{ 9982 switch(kind) { 9983 case PyUnicode_1BYTE_KIND: 9984 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2)) 9985 return asciilib_find(buf1, len1, buf2, len2, offset); 9986 else 9987 return ucs1lib_find(buf1, len1, buf2, len2, offset); 9988 case PyUnicode_2BYTE_KIND: 9989 return ucs2lib_find(buf1, len1, 
buf2, len2, offset); 9990 case PyUnicode_4BYTE_KIND: 9991 return ucs4lib_find(buf1, len1, buf2, len2, offset); 9992 } 9993 assert(0); 9994 return -1; 9995} 9996 9997static Py_ssize_t 9998anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen, 9999 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount) 10000{ 10001 switch(kind) { 10002 case PyUnicode_1BYTE_KIND: 10003 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1)) 10004 return asciilib_count(sbuf, slen, buf1, len1, maxcount); 10005 else 10006 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount); 10007 case PyUnicode_2BYTE_KIND: 10008 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount); 10009 case PyUnicode_4BYTE_KIND: 10010 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount); 10011 } 10012 assert(0); 10013 return 0; 10014} 10015 10016static PyObject * 10017replace(PyObject *self, PyObject *str1, 10018 PyObject *str2, Py_ssize_t maxcount) 10019{ 10020 PyObject *u; 10021 char *sbuf = PyUnicode_DATA(self); 10022 char *buf1 = PyUnicode_DATA(str1); 10023 char *buf2 = PyUnicode_DATA(str2); 10024 int srelease = 0, release1 = 0, release2 = 0; 10025 int skind = PyUnicode_KIND(self); 10026 int kind1 = PyUnicode_KIND(str1); 10027 int kind2 = PyUnicode_KIND(str2); 10028 Py_ssize_t slen = PyUnicode_GET_LENGTH(self); 10029 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1); 10030 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2); 10031 int mayshrink; 10032 Py_UCS4 maxchar, maxchar_str2; 10033 10034 if (maxcount < 0) 10035 maxcount = PY_SSIZE_T_MAX; 10036 else if (maxcount == 0 || slen == 0) 10037 goto nothing; 10038 10039 if (str1 == str2) 10040 goto nothing; 10041 if (skind < kind1) 10042 /* substring too wide to be present */ 10043 goto nothing; 10044 10045 maxchar = PyUnicode_MAX_CHAR_VALUE(self); 10046 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2); 10047 /* Replacing str1 with str2 may cause a maxchar reduction in the 10048 result string. 
*/ 10049 mayshrink = (maxchar_str2 < maxchar); 10050 maxchar = Py_MAX(maxchar, maxchar_str2); 10051 10052 if (len1 == len2) { 10053 Py_ssize_t i; 10054 /* same length */ 10055 if (len1 == 0) 10056 goto nothing; 10057 if (len1 == 1) { 10058 /* replace characters */ 10059 Py_UCS4 u1, u2; 10060 int rkind; 10061 u1 = PyUnicode_READ_CHAR(str1, 0); 10062 if (findchar(sbuf, PyUnicode_KIND(self), 10063 slen, u1, 1) < 0) 10064 goto nothing; 10065 u2 = PyUnicode_READ_CHAR(str2, 0); 10066 u = PyUnicode_New(slen, maxchar); 10067 if (!u) 10068 goto error; 10069 copy_characters(u, 0, self, 0, slen); 10070 rkind = PyUnicode_KIND(u); 10071 for (i = 0; i < PyUnicode_GET_LENGTH(u); i++) 10072 if (PyUnicode_READ(rkind, PyUnicode_DATA(u), i) == u1) { 10073 if (--maxcount < 0) 10074 break; 10075 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), i, u2); 10076 } 10077 } 10078 else { 10079 int rkind = skind; 10080 char *res; 10081 10082 if (kind1 < rkind) { 10083 /* widen substring */ 10084 buf1 = _PyUnicode_AsKind(str1, rkind); 10085 if (!buf1) goto error; 10086 release1 = 1; 10087 } 10088 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0); 10089 if (i < 0) 10090 goto nothing; 10091 if (rkind > kind2) { 10092 /* widen replacement */ 10093 buf2 = _PyUnicode_AsKind(str2, rkind); 10094 if (!buf2) goto error; 10095 release2 = 1; 10096 } 10097 else if (rkind < kind2) { 10098 /* widen self and buf1 */ 10099 rkind = kind2; 10100 if (release1) PyMem_Free(buf1); 10101 sbuf = _PyUnicode_AsKind(self, rkind); 10102 if (!sbuf) goto error; 10103 srelease = 1; 10104 buf1 = _PyUnicode_AsKind(str1, rkind); 10105 if (!buf1) goto error; 10106 release1 = 1; 10107 } 10108 u = PyUnicode_New(slen, maxchar); 10109 if (!u) 10110 goto error; 10111 assert(PyUnicode_KIND(u) == rkind); 10112 res = PyUnicode_DATA(u); 10113 10114 memcpy(res, sbuf, rkind * slen); 10115 /* change everything in-place, starting with this one */ 10116 memcpy(res + rkind * i, 10117 buf2, 10118 rkind * len2); 10119 i += len1; 10120 10121 
while ( --maxcount > 0) { 10122 i = anylib_find(rkind, self, 10123 sbuf+rkind*i, slen-i, 10124 str1, buf1, len1, i); 10125 if (i == -1) 10126 break; 10127 memcpy(res + rkind * i, 10128 buf2, 10129 rkind * len2); 10130 i += len1; 10131 } 10132 } 10133 } 10134 else { 10135 Py_ssize_t n, i, j, ires; 10136 Py_ssize_t product, new_size; 10137 int rkind = skind; 10138 char *res; 10139 10140 if (kind1 < rkind) { 10141 /* widen substring */ 10142 buf1 = _PyUnicode_AsKind(str1, rkind); 10143 if (!buf1) goto error; 10144 release1 = 1; 10145 } 10146 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount); 10147 if (n == 0) 10148 goto nothing; 10149 if (kind2 < rkind) { 10150 /* widen replacement */ 10151 buf2 = _PyUnicode_AsKind(str2, rkind); 10152 if (!buf2) goto error; 10153 release2 = 1; 10154 } 10155 else if (kind2 > rkind) { 10156 /* widen self and buf1 */ 10157 rkind = kind2; 10158 sbuf = _PyUnicode_AsKind(self, rkind); 10159 if (!sbuf) goto error; 10160 srelease = 1; 10161 if (release1) PyMem_Free(buf1); 10162 buf1 = _PyUnicode_AsKind(str1, rkind); 10163 if (!buf1) goto error; 10164 release1 = 1; 10165 } 10166 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) - 10167 PyUnicode_GET_LENGTH(str1))); */ 10168 product = n * (len2-len1); 10169 if ((product / (len2-len1)) != n) { 10170 PyErr_SetString(PyExc_OverflowError, 10171 "replace string is too long"); 10172 goto error; 10173 } 10174 new_size = slen + product; 10175 if (new_size == 0) { 10176 Py_INCREF(unicode_empty); 10177 u = unicode_empty; 10178 goto done; 10179 } 10180 if (new_size < 0 || new_size > (PY_SSIZE_T_MAX >> (rkind-1))) { 10181 PyErr_SetString(PyExc_OverflowError, 10182 "replace string is too long"); 10183 goto error; 10184 } 10185 u = PyUnicode_New(new_size, maxchar); 10186 if (!u) 10187 goto error; 10188 assert(PyUnicode_KIND(u) == rkind); 10189 res = PyUnicode_DATA(u); 10190 ires = i = 0; 10191 if (len1 > 0) { 10192 while (n-- > 0) { 10193 /* look for next match */ 
10194 j = anylib_find(rkind, self, 10195 sbuf + rkind * i, slen-i, 10196 str1, buf1, len1, i); 10197 if (j == -1) 10198 break; 10199 else if (j > i) { 10200 /* copy unchanged part [i:j] */ 10201 memcpy(res + rkind * ires, 10202 sbuf + rkind * i, 10203 rkind * (j-i)); 10204 ires += j - i; 10205 } 10206 /* copy substitution string */ 10207 if (len2 > 0) { 10208 memcpy(res + rkind * ires, 10209 buf2, 10210 rkind * len2); 10211 ires += len2; 10212 } 10213 i = j + len1; 10214 } 10215 if (i < slen) 10216 /* copy tail [i:] */ 10217 memcpy(res + rkind * ires, 10218 sbuf + rkind * i, 10219 rkind * (slen-i)); 10220 } 10221 else { 10222 /* interleave */ 10223 while (n > 0) { 10224 memcpy(res + rkind * ires, 10225 buf2, 10226 rkind * len2); 10227 ires += len2; 10228 if (--n <= 0) 10229 break; 10230 memcpy(res + rkind * ires, 10231 sbuf + rkind * i, 10232 rkind); 10233 ires++; 10234 i++; 10235 } 10236 memcpy(res + rkind * ires, 10237 sbuf + rkind * i, 10238 rkind * (slen-i)); 10239 } 10240 } 10241 10242 if (mayshrink) { 10243 unicode_adjust_maxchar(&u); 10244 if (u == NULL) 10245 goto error; 10246 } 10247 10248 done: 10249 if (srelease) 10250 PyMem_FREE(sbuf); 10251 if (release1) 10252 PyMem_FREE(buf1); 10253 if (release2) 10254 PyMem_FREE(buf2); 10255 assert(_PyUnicode_CheckConsistency(u, 1)); 10256 return u; 10257 10258 nothing: 10259 /* nothing to replace; return original string (when possible) */ 10260 if (srelease) 10261 PyMem_FREE(sbuf); 10262 if (release1) 10263 PyMem_FREE(buf1); 10264 if (release2) 10265 PyMem_FREE(buf2); 10266 if (PyUnicode_CheckExact(self)) { 10267 Py_INCREF(self); 10268 return self; 10269 } 10270 return PyUnicode_Copy(self); 10271 error: 10272 if (srelease && sbuf) 10273 PyMem_FREE(sbuf); 10274 if (release1 && buf1) 10275 PyMem_FREE(buf1); 10276 if (release2 && buf2) 10277 PyMem_FREE(buf2); 10278 return NULL; 10279} 10280 10281/* --- Unicode Object Methods --------------------------------------------- */ 10282 10283PyDoc_STRVAR(title__doc__, 10284 
"S.title() -> str\n\ 10285\n\ 10286Return a titlecased version of S, i.e. words start with title case\n\ 10287characters, all remaining cased characters have lower case."); 10288 10289static PyObject* 10290unicode_title(PyObject *self) 10291{ 10292 return fixup(self, fixtitle); 10293} 10294 10295PyDoc_STRVAR(capitalize__doc__, 10296 "S.capitalize() -> str\n\ 10297\n\ 10298Return a capitalized version of S, i.e. make the first character\n\ 10299have upper case and the rest lower case."); 10300 10301static PyObject* 10302unicode_capitalize(PyObject *self) 10303{ 10304 return fixup(self, fixcapitalize); 10305} 10306 10307#if 0 10308PyDoc_STRVAR(capwords__doc__, 10309 "S.capwords() -> str\n\ 10310\n\ 10311Apply .capitalize() to all words in S and return the result with\n\ 10312normalized whitespace (all whitespace strings are replaced by ' ')."); 10313 10314static PyObject* 10315unicode_capwords(PyObject *self) 10316{ 10317 PyObject *list; 10318 PyObject *item; 10319 Py_ssize_t i; 10320 10321 /* Split into words */ 10322 list = split(self, NULL, -1); 10323 if (!list) 10324 return NULL; 10325 10326 /* Capitalize each word */ 10327 for (i = 0; i < PyList_GET_SIZE(list); i++) { 10328 item = fixup(PyList_GET_ITEM(list, i), 10329 fixcapitalize); 10330 if (item == NULL) 10331 goto onError; 10332 Py_DECREF(PyList_GET_ITEM(list, i)); 10333 PyList_SET_ITEM(list, i, item); 10334 } 10335 10336 /* Join the words to form a new string */ 10337 item = PyUnicode_Join(NULL, list); 10338 10339 onError: 10340 Py_DECREF(list); 10341 return item; 10342} 10343#endif 10344 10345/* Argument converter. 
Coerces to a single unicode character */ 10346 10347static int 10348convert_uc(PyObject *obj, void *addr) 10349{ 10350 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr; 10351 PyObject *uniobj; 10352 10353 uniobj = PyUnicode_FromObject(obj); 10354 if (uniobj == NULL) { 10355 PyErr_SetString(PyExc_TypeError, 10356 "The fill character cannot be converted to Unicode"); 10357 return 0; 10358 } 10359 if (PyUnicode_GET_LENGTH(uniobj) != 1) { 10360 PyErr_SetString(PyExc_TypeError, 10361 "The fill character must be exactly one character long"); 10362 Py_DECREF(uniobj); 10363 return 0; 10364 } 10365 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0); 10366 Py_DECREF(uniobj); 10367 return 1; 10368} 10369 10370PyDoc_STRVAR(center__doc__, 10371 "S.center(width[, fillchar]) -> str\n\ 10372\n\ 10373Return S centered in a string of length width. Padding is\n\ 10374done using the specified fill character (default is a space)"); 10375 10376static PyObject * 10377unicode_center(PyObject *self, PyObject *args) 10378{ 10379 Py_ssize_t marg, left; 10380 Py_ssize_t width; 10381 Py_UCS4 fillchar = ' '; 10382 10383 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar)) 10384 return NULL; 10385 10386 if (PyUnicode_READY(self) == -1) 10387 return NULL; 10388 10389 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) { 10390 Py_INCREF(self); 10391 return self; 10392 } 10393 10394 marg = width - _PyUnicode_LENGTH(self); 10395 left = marg / 2 + (marg & width & 1); 10396 10397 return pad(self, left, marg - left, fillchar); 10398} 10399 10400/* This function assumes that str1 and str2 are readied by the caller. 
*/ 10401 10402static int 10403unicode_compare(PyObject *str1, PyObject *str2) 10404{ 10405 int kind1, kind2; 10406 void *data1, *data2; 10407 Py_ssize_t len1, len2, i; 10408 10409 kind1 = PyUnicode_KIND(str1); 10410 kind2 = PyUnicode_KIND(str2); 10411 data1 = PyUnicode_DATA(str1); 10412 data2 = PyUnicode_DATA(str2); 10413 len1 = PyUnicode_GET_LENGTH(str1); 10414 len2 = PyUnicode_GET_LENGTH(str2); 10415 10416 for (i = 0; i < len1 && i < len2; ++i) { 10417 Py_UCS4 c1, c2; 10418 c1 = PyUnicode_READ(kind1, data1, i); 10419 c2 = PyUnicode_READ(kind2, data2, i); 10420 10421 if (c1 != c2) 10422 return (c1 < c2) ? -1 : 1; 10423 } 10424 10425 return (len1 < len2) ? -1 : (len1 != len2); 10426} 10427 10428int 10429PyUnicode_Compare(PyObject *left, PyObject *right) 10430{ 10431 if (PyUnicode_Check(left) && PyUnicode_Check(right)) { 10432 if (PyUnicode_READY(left) == -1 || 10433 PyUnicode_READY(right) == -1) 10434 return -1; 10435 return unicode_compare(left, right); 10436 } 10437 PyErr_Format(PyExc_TypeError, 10438 "Can't compare %.100s and %.100s", 10439 left->ob_type->tp_name, 10440 right->ob_type->tp_name); 10441 return -1; 10442} 10443 10444int 10445PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str) 10446{ 10447 Py_ssize_t i; 10448 int kind; 10449 void *data; 10450 Py_UCS4 chr; 10451 10452 assert(_PyUnicode_CHECK(uni)); 10453 if (PyUnicode_READY(uni) == -1) 10454 return -1; 10455 kind = PyUnicode_KIND(uni); 10456 data = PyUnicode_DATA(uni); 10457 /* Compare Unicode string and source character set string */ 10458 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++) 10459 if (chr != str[i]) 10460 return (chr < (unsigned char)(str[i])) ? -1 : 1; 10461 /* This check keeps Python strings that end in '\0' from comparing equal 10462 to C strings identical up to that point. 
*/ 10463 if (PyUnicode_GET_LENGTH(uni) != i || chr) 10464 return 1; /* uni is longer */ 10465 if (str[i]) 10466 return -1; /* str is longer */ 10467 return 0; 10468} 10469 10470 10471#define TEST_COND(cond) \ 10472 ((cond) ? Py_True : Py_False) 10473 10474PyObject * 10475PyUnicode_RichCompare(PyObject *left, PyObject *right, int op) 10476{ 10477 int result; 10478 10479 if (PyUnicode_Check(left) && PyUnicode_Check(right)) { 10480 PyObject *v; 10481 if (PyUnicode_READY(left) == -1 || 10482 PyUnicode_READY(right) == -1) 10483 return NULL; 10484 if (PyUnicode_GET_LENGTH(left) != PyUnicode_GET_LENGTH(right) || 10485 PyUnicode_KIND(left) != PyUnicode_KIND(right)) { 10486 if (op == Py_EQ) { 10487 Py_INCREF(Py_False); 10488 return Py_False; 10489 } 10490 if (op == Py_NE) { 10491 Py_INCREF(Py_True); 10492 return Py_True; 10493 } 10494 } 10495 if (left == right) 10496 result = 0; 10497 else 10498 result = unicode_compare(left, right); 10499 10500 /* Convert the return value to a Boolean */ 10501 switch (op) { 10502 case Py_EQ: 10503 v = TEST_COND(result == 0); 10504 break; 10505 case Py_NE: 10506 v = TEST_COND(result != 0); 10507 break; 10508 case Py_LE: 10509 v = TEST_COND(result <= 0); 10510 break; 10511 case Py_GE: 10512 v = TEST_COND(result >= 0); 10513 break; 10514 case Py_LT: 10515 v = TEST_COND(result == -1); 10516 break; 10517 case Py_GT: 10518 v = TEST_COND(result == 1); 10519 break; 10520 default: 10521 PyErr_BadArgument(); 10522 return NULL; 10523 } 10524 Py_INCREF(v); 10525 return v; 10526 } 10527 10528 Py_RETURN_NOTIMPLEMENTED; 10529} 10530 10531int 10532PyUnicode_Contains(PyObject *container, PyObject *element) 10533{ 10534 PyObject *str, *sub; 10535 int kind1, kind2, kind; 10536 void *buf1, *buf2; 10537 Py_ssize_t len1, len2; 10538 int result; 10539 10540 /* Coerce the two arguments */ 10541 sub = PyUnicode_FromObject(element); 10542 if (!sub) { 10543 PyErr_Format(PyExc_TypeError, 10544 "'in <string>' requires string as left operand, not %s", 10545 
element->ob_type->tp_name); 10546 return -1; 10547 } 10548 if (PyUnicode_READY(sub) == -1) 10549 return -1; 10550 10551 str = PyUnicode_FromObject(container); 10552 if (!str || PyUnicode_READY(str) == -1) { 10553 Py_DECREF(sub); 10554 return -1; 10555 } 10556 10557 kind1 = PyUnicode_KIND(str); 10558 kind2 = PyUnicode_KIND(sub); 10559 kind = kind1 > kind2 ? kind1 : kind2; 10560 buf1 = PyUnicode_DATA(str); 10561 buf2 = PyUnicode_DATA(sub); 10562 if (kind1 != kind) 10563 buf1 = _PyUnicode_AsKind(str, kind); 10564 if (!buf1) { 10565 Py_DECREF(sub); 10566 return -1; 10567 } 10568 if (kind2 != kind) 10569 buf2 = _PyUnicode_AsKind(sub, kind); 10570 if (!buf2) { 10571 Py_DECREF(sub); 10572 if (kind1 != kind) PyMem_Free(buf1); 10573 return -1; 10574 } 10575 len1 = PyUnicode_GET_LENGTH(str); 10576 len2 = PyUnicode_GET_LENGTH(sub); 10577 10578 switch(kind) { 10579 case PyUnicode_1BYTE_KIND: 10580 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1; 10581 break; 10582 case PyUnicode_2BYTE_KIND: 10583 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1; 10584 break; 10585 case PyUnicode_4BYTE_KIND: 10586 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1; 10587 break; 10588 default: 10589 result = -1; 10590 assert(0); 10591 } 10592 10593 Py_DECREF(str); 10594 Py_DECREF(sub); 10595 10596 if (kind1 != kind) 10597 PyMem_Free(buf1); 10598 if (kind2 != kind) 10599 PyMem_Free(buf2); 10600 10601 return result; 10602} 10603 10604/* Concat to string or Unicode object giving a new Unicode object. 
*/ 10605 10606PyObject * 10607PyUnicode_Concat(PyObject *left, PyObject *right) 10608{ 10609 PyObject *u = NULL, *v = NULL, *w; 10610 Py_UCS4 maxchar, maxchar2; 10611 10612 /* Coerce the two arguments */ 10613 u = PyUnicode_FromObject(left); 10614 if (u == NULL) 10615 goto onError; 10616 v = PyUnicode_FromObject(right); 10617 if (v == NULL) 10618 goto onError; 10619 10620 /* Shortcuts */ 10621 if (v == unicode_empty) { 10622 Py_DECREF(v); 10623 return u; 10624 } 10625 if (u == unicode_empty) { 10626 Py_DECREF(u); 10627 return v; 10628 } 10629 10630 maxchar = PyUnicode_MAX_CHAR_VALUE(u); 10631 maxchar2 = PyUnicode_MAX_CHAR_VALUE(v); 10632 maxchar = Py_MAX(maxchar, maxchar2); 10633 10634 /* Concat the two Unicode strings */ 10635 w = PyUnicode_New( 10636 PyUnicode_GET_LENGTH(u) + PyUnicode_GET_LENGTH(v), 10637 maxchar); 10638 if (w == NULL) 10639 goto onError; 10640 copy_characters(w, 0, u, 0, PyUnicode_GET_LENGTH(u)); 10641 copy_characters(w, PyUnicode_GET_LENGTH(u), v, 0, PyUnicode_GET_LENGTH(v)); 10642 Py_DECREF(u); 10643 Py_DECREF(v); 10644 assert(_PyUnicode_CheckConsistency(w, 1)); 10645 return w; 10646 10647 onError: 10648 Py_XDECREF(u); 10649 Py_XDECREF(v); 10650 return NULL; 10651} 10652 10653static void 10654unicode_append_inplace(PyObject **p_left, PyObject *right) 10655{ 10656 Py_ssize_t left_len, right_len, new_len; 10657 10658 assert(PyUnicode_IS_READY(*p_left)); 10659 assert(PyUnicode_IS_READY(right)); 10660 10661 left_len = PyUnicode_GET_LENGTH(*p_left); 10662 right_len = PyUnicode_GET_LENGTH(right); 10663 if (left_len > PY_SSIZE_T_MAX - right_len) { 10664 PyErr_SetString(PyExc_OverflowError, 10665 "strings are too large to concat"); 10666 goto error; 10667 } 10668 new_len = left_len + right_len; 10669 10670 /* Now we own the last reference to 'left', so we can resize it 10671 * in-place. 
10672 */ 10673 if (unicode_resize(p_left, new_len) != 0) { 10674 /* XXX if _PyUnicode_Resize() fails, 'left' has been 10675 * deallocated so it cannot be put back into 10676 * 'variable'. The MemoryError is raised when there 10677 * is no value in 'variable', which might (very 10678 * remotely) be a cause of incompatibilities. 10679 */ 10680 goto error; 10681 } 10682 /* copy 'right' into the newly allocated area of 'left' */ 10683 copy_characters(*p_left, left_len, right, 0, right_len); 10684 _PyUnicode_DIRTY(*p_left); 10685 return; 10686 10687error: 10688 Py_DECREF(*p_left); 10689 *p_left = NULL; 10690} 10691 10692void 10693PyUnicode_Append(PyObject **p_left, PyObject *right) 10694{ 10695 PyObject *left, *res; 10696 10697 if (p_left == NULL) { 10698 if (!PyErr_Occurred()) 10699 PyErr_BadInternalCall(); 10700 return; 10701 } 10702 left = *p_left; 10703 if (right == NULL || !PyUnicode_Check(left)) { 10704 if (!PyErr_Occurred()) 10705 PyErr_BadInternalCall(); 10706 goto error; 10707 } 10708 10709 if (PyUnicode_READY(left)) 10710 goto error; 10711 if (PyUnicode_READY(right)) 10712 goto error; 10713 10714 if (PyUnicode_CheckExact(left) && left != unicode_empty 10715 && PyUnicode_CheckExact(right) && right != unicode_empty 10716 && unicode_resizable(left) 10717 && (_PyUnicode_KIND(right) <= _PyUnicode_KIND(left) 10718 || _PyUnicode_WSTR(left) != NULL)) 10719 { 10720 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires 10721 to change the structure size, but characters are stored just after 10722 the structure, and so it requires to move all characters which is 10723 not so different than duplicating the string. 
*/ 10724 if (!(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right))) 10725 { 10726 unicode_append_inplace(p_left, right); 10727 assert(p_left == NULL || _PyUnicode_CheckConsistency(*p_left, 1)); 10728 return; 10729 } 10730 } 10731 10732 res = PyUnicode_Concat(left, right); 10733 if (res == NULL) 10734 goto error; 10735 Py_DECREF(left); 10736 *p_left = res; 10737 return; 10738 10739error: 10740 Py_DECREF(*p_left); 10741 *p_left = NULL; 10742} 10743 10744void 10745PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right) 10746{ 10747 PyUnicode_Append(pleft, right); 10748 Py_XDECREF(right); 10749} 10750 10751PyDoc_STRVAR(count__doc__, 10752 "S.count(sub[, start[, end]]) -> int\n\ 10753\n\ 10754Return the number of non-overlapping occurrences of substring sub in\n\ 10755string S[start:end]. Optional arguments start and end are\n\ 10756interpreted as in slice notation."); 10757 10758static PyObject * 10759unicode_count(PyObject *self, PyObject *args) 10760{ 10761 PyObject *substring; 10762 Py_ssize_t start = 0; 10763 Py_ssize_t end = PY_SSIZE_T_MAX; 10764 PyObject *result; 10765 int kind1, kind2, kind; 10766 void *buf1, *buf2; 10767 Py_ssize_t len1, len2, iresult; 10768 10769 if (!stringlib_parse_args_finds_unicode("count", args, &substring, 10770 &start, &end)) 10771 return NULL; 10772 10773 kind1 = PyUnicode_KIND(self); 10774 kind2 = PyUnicode_KIND(substring); 10775 kind = kind1 > kind2 ? 
kind1 : kind2; 10776 buf1 = PyUnicode_DATA(self); 10777 buf2 = PyUnicode_DATA(substring); 10778 if (kind1 != kind) 10779 buf1 = _PyUnicode_AsKind(self, kind); 10780 if (!buf1) { 10781 Py_DECREF(substring); 10782 return NULL; 10783 } 10784 if (kind2 != kind) 10785 buf2 = _PyUnicode_AsKind(substring, kind); 10786 if (!buf2) { 10787 Py_DECREF(substring); 10788 if (kind1 != kind) PyMem_Free(buf1); 10789 return NULL; 10790 } 10791 len1 = PyUnicode_GET_LENGTH(self); 10792 len2 = PyUnicode_GET_LENGTH(substring); 10793 10794 ADJUST_INDICES(start, end, len1); 10795 switch(kind) { 10796 case PyUnicode_1BYTE_KIND: 10797 iresult = ucs1lib_count( 10798 ((Py_UCS1*)buf1) + start, end - start, 10799 buf2, len2, PY_SSIZE_T_MAX 10800 ); 10801 break; 10802 case PyUnicode_2BYTE_KIND: 10803 iresult = ucs2lib_count( 10804 ((Py_UCS2*)buf1) + start, end - start, 10805 buf2, len2, PY_SSIZE_T_MAX 10806 ); 10807 break; 10808 case PyUnicode_4BYTE_KIND: 10809 iresult = ucs4lib_count( 10810 ((Py_UCS4*)buf1) + start, end - start, 10811 buf2, len2, PY_SSIZE_T_MAX 10812 ); 10813 break; 10814 default: 10815 assert(0); iresult = 0; 10816 } 10817 10818 result = PyLong_FromSsize_t(iresult); 10819 10820 if (kind1 != kind) 10821 PyMem_Free(buf1); 10822 if (kind2 != kind) 10823 PyMem_Free(buf2); 10824 10825 Py_DECREF(substring); 10826 10827 return result; 10828} 10829 10830PyDoc_STRVAR(encode__doc__, 10831 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\ 10832\n\ 10833Encode S using the codec registered for encoding. Default encoding\n\ 10834is 'utf-8'. errors may be given to set a different error\n\ 10835handling scheme. Default is 'strict' meaning that encoding errors raise\n\ 10836a UnicodeEncodeError. 
Other possible values are 'ignore', 'replace' and\n\ 10837'xmlcharrefreplace' as well as any other name registered with\n\ 10838codecs.register_error that can handle UnicodeEncodeErrors."); 10839 10840static PyObject * 10841unicode_encode(PyObject *self, PyObject *args, PyObject *kwargs) 10842{ 10843 static char *kwlist[] = {"encoding", "errors", 0}; 10844 char *encoding = NULL; 10845 char *errors = NULL; 10846 10847 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode", 10848 kwlist, &encoding, &errors)) 10849 return NULL; 10850 return PyUnicode_AsEncodedString(self, encoding, errors); 10851} 10852 10853PyDoc_STRVAR(expandtabs__doc__, 10854 "S.expandtabs([tabsize]) -> str\n\ 10855\n\ 10856Return a copy of S where all tab characters are expanded using spaces.\n\ 10857If tabsize is not given, a tab size of 8 characters is assumed."); 10858 10859static PyObject* 10860unicode_expandtabs(PyObject *self, PyObject *args) 10861{ 10862 Py_ssize_t i, j, line_pos, src_len, incr; 10863 Py_UCS4 ch; 10864 PyObject *u; 10865 void *src_data, *dest_data; 10866 int tabsize = 8; 10867 int kind; 10868 int found; 10869 10870 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize)) 10871 return NULL; 10872 10873 if (PyUnicode_READY(self) == -1) 10874 return NULL; 10875 10876 /* First pass: determine size of output string */ 10877 src_len = PyUnicode_GET_LENGTH(self); 10878 i = j = line_pos = 0; 10879 kind = PyUnicode_KIND(self); 10880 src_data = PyUnicode_DATA(self); 10881 found = 0; 10882 for (; i < src_len; i++) { 10883 ch = PyUnicode_READ(kind, src_data, i); 10884 if (ch == '\t') { 10885 found = 1; 10886 if (tabsize > 0) { 10887 incr = tabsize - (line_pos % tabsize); /* cannot overflow */ 10888 if (j > PY_SSIZE_T_MAX - incr) 10889 goto overflow; 10890 line_pos += incr; 10891 j += incr; 10892 } 10893 } 10894 else { 10895 if (j > PY_SSIZE_T_MAX - 1) 10896 goto overflow; 10897 line_pos++; 10898 j++; 10899 if (ch == '\n' || ch == '\r') 10900 line_pos = 0; 10901 } 10902 } 10903 if 
(!found && PyUnicode_CheckExact(self)) { 10904 Py_INCREF(self); 10905 return self; 10906 } 10907 10908 /* Second pass: create output string and fill it */ 10909 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self)); 10910 if (!u) 10911 return NULL; 10912 dest_data = PyUnicode_DATA(u); 10913 10914 i = j = line_pos = 0; 10915 10916 for (; i < src_len; i++) { 10917 ch = PyUnicode_READ(kind, src_data, i); 10918 if (ch == '\t') { 10919 if (tabsize > 0) { 10920 incr = tabsize - (line_pos % tabsize); 10921 line_pos += incr; 10922 while (incr--) { 10923 PyUnicode_WRITE(kind, dest_data, j, ' '); 10924 j++; 10925 } 10926 } 10927 } 10928 else { 10929 line_pos++; 10930 PyUnicode_WRITE(kind, dest_data, j, ch); 10931 j++; 10932 if (ch == '\n' || ch == '\r') 10933 line_pos = 0; 10934 } 10935 } 10936 assert (j == PyUnicode_GET_LENGTH(u)); 10937 return unicode_result(u); 10938 10939 overflow: 10940 PyErr_SetString(PyExc_OverflowError, "new string is too long"); 10941 return NULL; 10942} 10943 10944PyDoc_STRVAR(find__doc__, 10945 "S.find(sub[, start[, end]]) -> int\n\ 10946\n\ 10947Return the lowest index in S where substring sub is found,\n\ 10948such that sub is contained within S[start:end]. 
Optional\n\ 10949arguments start and end are interpreted as in slice notation.\n\ 10950\n\ 10951Return -1 on failure."); 10952 10953static PyObject * 10954unicode_find(PyObject *self, PyObject *args) 10955{ 10956 PyObject *substring; 10957 Py_ssize_t start; 10958 Py_ssize_t end; 10959 Py_ssize_t result; 10960 10961 if (!stringlib_parse_args_finds_unicode("find", args, &substring, 10962 &start, &end)) 10963 return NULL; 10964 10965 if (PyUnicode_READY(self) == -1) 10966 return NULL; 10967 if (PyUnicode_READY(substring) == -1) 10968 return NULL; 10969 10970 result = any_find_slice(1, self, substring, start, end); 10971 10972 Py_DECREF(substring); 10973 10974 if (result == -2) 10975 return NULL; 10976 10977 return PyLong_FromSsize_t(result); 10978} 10979 10980static PyObject * 10981unicode_getitem(PyObject *self, Py_ssize_t index) 10982{ 10983 Py_UCS4 ch = PyUnicode_ReadChar(self, index); 10984 if (ch == (Py_UCS4)-1) 10985 return NULL; 10986 return PyUnicode_FromOrdinal(ch); 10987} 10988 10989/* Believe it or not, this produces the same value for ASCII strings 10990 as bytes_hash(). */ 10991static Py_hash_t 10992unicode_hash(PyObject *self) 10993{ 10994 Py_ssize_t len; 10995 Py_uhash_t x; 10996 10997 if (_PyUnicode_HASH(self) != -1) 10998 return _PyUnicode_HASH(self); 10999 if (PyUnicode_READY(self) == -1) 11000 return -1; 11001 len = PyUnicode_GET_LENGTH(self); 11002 11003 /* The hash function as a macro, gets expanded three times below. 
*/ 11004#define HASH(P) \ 11005 x = (Py_uhash_t)*P << 7; \ 11006 while (--len >= 0) \ 11007 x = (1000003*x) ^ (Py_uhash_t)*P++; 11008 11009 switch (PyUnicode_KIND(self)) { 11010 case PyUnicode_1BYTE_KIND: { 11011 const unsigned char *c = PyUnicode_1BYTE_DATA(self); 11012 HASH(c); 11013 break; 11014 } 11015 case PyUnicode_2BYTE_KIND: { 11016 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self); 11017 HASH(s); 11018 break; 11019 } 11020 default: { 11021 Py_UCS4 *l; 11022 assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND && 11023 "Impossible switch case in unicode_hash"); 11024 l = PyUnicode_4BYTE_DATA(self); 11025 HASH(l); 11026 break; 11027 } 11028 } 11029 x ^= (Py_uhash_t)PyUnicode_GET_LENGTH(self); 11030 11031 if (x == -1) 11032 x = -2; 11033 _PyUnicode_HASH(self) = x; 11034 return x; 11035} 11036#undef HASH 11037 11038PyDoc_STRVAR(index__doc__, 11039 "S.index(sub[, start[, end]]) -> int\n\ 11040\n\ 11041Like S.find() but raise ValueError when the substring is not found."); 11042 11043static PyObject * 11044unicode_index(PyObject *self, PyObject *args) 11045{ 11046 Py_ssize_t result; 11047 PyObject *substring; 11048 Py_ssize_t start; 11049 Py_ssize_t end; 11050 11051 if (!stringlib_parse_args_finds_unicode("index", args, &substring, 11052 &start, &end)) 11053 return NULL; 11054 11055 if (PyUnicode_READY(self) == -1) 11056 return NULL; 11057 if (PyUnicode_READY(substring) == -1) 11058 return NULL; 11059 11060 result = any_find_slice(1, self, substring, start, end); 11061 11062 Py_DECREF(substring); 11063 11064 if (result == -2) 11065 return NULL; 11066 11067 if (result < 0) { 11068 PyErr_SetString(PyExc_ValueError, "substring not found"); 11069 return NULL; 11070 } 11071 11072 return PyLong_FromSsize_t(result); 11073} 11074 11075PyDoc_STRVAR(islower__doc__, 11076 "S.islower() -> bool\n\ 11077\n\ 11078Return True if all cased characters in S are lowercase and there is\n\ 11079at least one cased character in S, False otherwise."); 11080 11081static PyObject* 
11082unicode_islower(PyObject *self) 11083{ 11084 Py_ssize_t i, length; 11085 int kind; 11086 void *data; 11087 int cased; 11088 11089 if (PyUnicode_READY(self) == -1) 11090 return NULL; 11091 length = PyUnicode_GET_LENGTH(self); 11092 kind = PyUnicode_KIND(self); 11093 data = PyUnicode_DATA(self); 11094 11095 /* Shortcut for single character strings */ 11096 if (length == 1) 11097 return PyBool_FromLong( 11098 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0))); 11099 11100 /* Special case for empty strings */ 11101 if (length == 0) 11102 return PyBool_FromLong(0); 11103 11104 cased = 0; 11105 for (i = 0; i < length; i++) { 11106 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); 11107 11108 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) 11109 return PyBool_FromLong(0); 11110 else if (!cased && Py_UNICODE_ISLOWER(ch)) 11111 cased = 1; 11112 } 11113 return PyBool_FromLong(cased); 11114} 11115 11116PyDoc_STRVAR(isupper__doc__, 11117 "S.isupper() -> bool\n\ 11118\n\ 11119Return True if all cased characters in S are uppercase and there is\n\ 11120at least one cased character in S, False otherwise."); 11121 11122static PyObject* 11123unicode_isupper(PyObject *self) 11124{ 11125 Py_ssize_t i, length; 11126 int kind; 11127 void *data; 11128 int cased; 11129 11130 if (PyUnicode_READY(self) == -1) 11131 return NULL; 11132 length = PyUnicode_GET_LENGTH(self); 11133 kind = PyUnicode_KIND(self); 11134 data = PyUnicode_DATA(self); 11135 11136 /* Shortcut for single character strings */ 11137 if (length == 1) 11138 return PyBool_FromLong( 11139 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0); 11140 11141 /* Special case for empty strings */ 11142 if (length == 0) 11143 return PyBool_FromLong(0); 11144 11145 cased = 0; 11146 for (i = 0; i < length; i++) { 11147 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); 11148 11149 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch)) 11150 return PyBool_FromLong(0); 11151 else if (!cased && Py_UNICODE_ISUPPER(ch)) 11152 
cased = 1; 11153 } 11154 return PyBool_FromLong(cased); 11155} 11156 11157PyDoc_STRVAR(istitle__doc__, 11158 "S.istitle() -> bool\n\ 11159\n\ 11160Return True if S is a titlecased string and there is at least one\n\ 11161character in S, i.e. upper- and titlecase characters may only\n\ 11162follow uncased characters and lowercase characters only cased ones.\n\ 11163Return False otherwise."); 11164 11165static PyObject* 11166unicode_istitle(PyObject *self) 11167{ 11168 Py_ssize_t i, length; 11169 int kind; 11170 void *data; 11171 int cased, previous_is_cased; 11172 11173 if (PyUnicode_READY(self) == -1) 11174 return NULL; 11175 length = PyUnicode_GET_LENGTH(self); 11176 kind = PyUnicode_KIND(self); 11177 data = PyUnicode_DATA(self); 11178 11179 /* Shortcut for single character strings */ 11180 if (length == 1) { 11181 Py_UCS4 ch = PyUnicode_READ(kind, data, 0); 11182 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) || 11183 (Py_UNICODE_ISUPPER(ch) != 0)); 11184 } 11185 11186 /* Special case for empty strings */ 11187 if (length == 0) 11188 return PyBool_FromLong(0); 11189 11190 cased = 0; 11191 previous_is_cased = 0; 11192 for (i = 0; i < length; i++) { 11193 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); 11194 11195 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) { 11196 if (previous_is_cased) 11197 return PyBool_FromLong(0); 11198 previous_is_cased = 1; 11199 cased = 1; 11200 } 11201 else if (Py_UNICODE_ISLOWER(ch)) { 11202 if (!previous_is_cased) 11203 return PyBool_FromLong(0); 11204 previous_is_cased = 1; 11205 cased = 1; 11206 } 11207 else 11208 previous_is_cased = 0; 11209 } 11210 return PyBool_FromLong(cased); 11211} 11212 11213PyDoc_STRVAR(isspace__doc__, 11214 "S.isspace() -> bool\n\ 11215\n\ 11216Return True if all characters in S are whitespace\n\ 11217and there is at least one character in S, False otherwise."); 11218 11219static PyObject* 11220unicode_isspace(PyObject *self) 11221{ 11222 Py_ssize_t i, length; 11223 int kind; 11224 void 
*data; 11225 11226 if (PyUnicode_READY(self) == -1) 11227 return NULL; 11228 length = PyUnicode_GET_LENGTH(self); 11229 kind = PyUnicode_KIND(self); 11230 data = PyUnicode_DATA(self); 11231 11232 /* Shortcut for single character strings */ 11233 if (length == 1) 11234 return PyBool_FromLong( 11235 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0))); 11236 11237 /* Special case for empty strings */ 11238 if (length == 0) 11239 return PyBool_FromLong(0); 11240 11241 for (i = 0; i < length; i++) { 11242 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); 11243 if (!Py_UNICODE_ISSPACE(ch)) 11244 return PyBool_FromLong(0); 11245 } 11246 return PyBool_FromLong(1); 11247} 11248 11249PyDoc_STRVAR(isalpha__doc__, 11250 "S.isalpha() -> bool\n\ 11251\n\ 11252Return True if all characters in S are alphabetic\n\ 11253and there is at least one character in S, False otherwise."); 11254 11255static PyObject* 11256unicode_isalpha(PyObject *self) 11257{ 11258 Py_ssize_t i, length; 11259 int kind; 11260 void *data; 11261 11262 if (PyUnicode_READY(self) == -1) 11263 return NULL; 11264 length = PyUnicode_GET_LENGTH(self); 11265 kind = PyUnicode_KIND(self); 11266 data = PyUnicode_DATA(self); 11267 11268 /* Shortcut for single character strings */ 11269 if (length == 1) 11270 return PyBool_FromLong( 11271 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0))); 11272 11273 /* Special case for empty strings */ 11274 if (length == 0) 11275 return PyBool_FromLong(0); 11276 11277 for (i = 0; i < length; i++) { 11278 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i))) 11279 return PyBool_FromLong(0); 11280 } 11281 return PyBool_FromLong(1); 11282} 11283 11284PyDoc_STRVAR(isalnum__doc__, 11285 "S.isalnum() -> bool\n\ 11286\n\ 11287Return True if all characters in S are alphanumeric\n\ 11288and there is at least one character in S, False otherwise."); 11289 11290static PyObject* 11291unicode_isalnum(PyObject *self) 11292{ 11293 int kind; 11294 void *data; 11295 Py_ssize_t len, i; 11296 11297 if 
(PyUnicode_READY(self) == -1) 11298 return NULL; 11299 11300 kind = PyUnicode_KIND(self); 11301 data = PyUnicode_DATA(self); 11302 len = PyUnicode_GET_LENGTH(self); 11303 11304 /* Shortcut for single character strings */ 11305 if (len == 1) { 11306 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0); 11307 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch)); 11308 } 11309 11310 /* Special case for empty strings */ 11311 if (len == 0) 11312 return PyBool_FromLong(0); 11313 11314 for (i = 0; i < len; i++) { 11315 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); 11316 if (!Py_UNICODE_ISALNUM(ch)) 11317 return PyBool_FromLong(0); 11318 } 11319 return PyBool_FromLong(1); 11320} 11321 11322PyDoc_STRVAR(isdecimal__doc__, 11323 "S.isdecimal() -> bool\n\ 11324\n\ 11325Return True if there are only decimal characters in S,\n\ 11326False otherwise."); 11327 11328static PyObject* 11329unicode_isdecimal(PyObject *self) 11330{ 11331 Py_ssize_t i, length; 11332 int kind; 11333 void *data; 11334 11335 if (PyUnicode_READY(self) == -1) 11336 return NULL; 11337 length = PyUnicode_GET_LENGTH(self); 11338 kind = PyUnicode_KIND(self); 11339 data = PyUnicode_DATA(self); 11340 11341 /* Shortcut for single character strings */ 11342 if (length == 1) 11343 return PyBool_FromLong( 11344 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0))); 11345 11346 /* Special case for empty strings */ 11347 if (length == 0) 11348 return PyBool_FromLong(0); 11349 11350 for (i = 0; i < length; i++) { 11351 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i))) 11352 return PyBool_FromLong(0); 11353 } 11354 return PyBool_FromLong(1); 11355} 11356 11357PyDoc_STRVAR(isdigit__doc__, 11358 "S.isdigit() -> bool\n\ 11359\n\ 11360Return True if all characters in S are digits\n\ 11361and there is at least one character in S, False otherwise."); 11362 11363static PyObject* 11364unicode_isdigit(PyObject *self) 11365{ 11366 Py_ssize_t i, length; 11367 int kind; 11368 void *data; 11369 11370 if (PyUnicode_READY(self) == -1) 
11371 return NULL; 11372 length = PyUnicode_GET_LENGTH(self); 11373 kind = PyUnicode_KIND(self); 11374 data = PyUnicode_DATA(self); 11375 11376 /* Shortcut for single character strings */ 11377 if (length == 1) { 11378 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0); 11379 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch)); 11380 } 11381 11382 /* Special case for empty strings */ 11383 if (length == 0) 11384 return PyBool_FromLong(0); 11385 11386 for (i = 0; i < length; i++) { 11387 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i))) 11388 return PyBool_FromLong(0); 11389 } 11390 return PyBool_FromLong(1); 11391} 11392 11393PyDoc_STRVAR(isnumeric__doc__, 11394 "S.isnumeric() -> bool\n\ 11395\n\ 11396Return True if there are only numeric characters in S,\n\ 11397False otherwise."); 11398 11399static PyObject* 11400unicode_isnumeric(PyObject *self) 11401{ 11402 Py_ssize_t i, length; 11403 int kind; 11404 void *data; 11405 11406 if (PyUnicode_READY(self) == -1) 11407 return NULL; 11408 length = PyUnicode_GET_LENGTH(self); 11409 kind = PyUnicode_KIND(self); 11410 data = PyUnicode_DATA(self); 11411 11412 /* Shortcut for single character strings */ 11413 if (length == 1) 11414 return PyBool_FromLong( 11415 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0))); 11416 11417 /* Special case for empty strings */ 11418 if (length == 0) 11419 return PyBool_FromLong(0); 11420 11421 for (i = 0; i < length; i++) { 11422 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i))) 11423 return PyBool_FromLong(0); 11424 } 11425 return PyBool_FromLong(1); 11426} 11427 11428int 11429PyUnicode_IsIdentifier(PyObject *self) 11430{ 11431 int kind; 11432 void *data; 11433 Py_ssize_t i; 11434 Py_UCS4 first; 11435 11436 if (PyUnicode_READY(self) == -1) { 11437 Py_FatalError("identifier not ready"); 11438 return 0; 11439 } 11440 11441 /* Special case for empty strings */ 11442 if (PyUnicode_GET_LENGTH(self) == 0) 11443 return 0; 11444 kind = PyUnicode_KIND(self); 11445 data = 
PyUnicode_DATA(self); 11446 11447 /* PEP 3131 says that the first character must be in 11448 XID_Start and subsequent characters in XID_Continue, 11449 and for the ASCII range, the 2.x rules apply (i.e 11450 start with letters and underscore, continue with 11451 letters, digits, underscore). However, given the current 11452 definition of XID_Start and XID_Continue, it is sufficient 11453 to check just for these, except that _ must be allowed 11454 as starting an identifier. */ 11455 first = PyUnicode_READ(kind, data, 0); 11456 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */) 11457 return 0; 11458 11459 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++) 11460 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i))) 11461 return 0; 11462 return 1; 11463} 11464 11465PyDoc_STRVAR(isidentifier__doc__, 11466 "S.isidentifier() -> bool\n\ 11467\n\ 11468Return True if S is a valid identifier according\n\ 11469to the language definition."); 11470 11471static PyObject* 11472unicode_isidentifier(PyObject *self) 11473{ 11474 return PyBool_FromLong(PyUnicode_IsIdentifier(self)); 11475} 11476 11477PyDoc_STRVAR(isprintable__doc__, 11478 "S.isprintable() -> bool\n\ 11479\n\ 11480Return True if all characters in S are considered\n\ 11481printable in repr() or S is empty, False otherwise."); 11482 11483static PyObject* 11484unicode_isprintable(PyObject *self) 11485{ 11486 Py_ssize_t i, length; 11487 int kind; 11488 void *data; 11489 11490 if (PyUnicode_READY(self) == -1) 11491 return NULL; 11492 length = PyUnicode_GET_LENGTH(self); 11493 kind = PyUnicode_KIND(self); 11494 data = PyUnicode_DATA(self); 11495 11496 /* Shortcut for single character strings */ 11497 if (length == 1) 11498 return PyBool_FromLong( 11499 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0))); 11500 11501 for (i = 0; i < length; i++) { 11502 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) { 11503 Py_RETURN_FALSE; 11504 } 11505 } 11506 Py_RETURN_TRUE; 11507} 11508 
11509PyDoc_STRVAR(join__doc__, 11510 "S.join(iterable) -> str\n\ 11511\n\ 11512Return a string which is the concatenation of the strings in the\n\ 11513iterable. The separator between elements is S."); 11514 11515static PyObject* 11516unicode_join(PyObject *self, PyObject *data) 11517{ 11518 return PyUnicode_Join(self, data); 11519} 11520 11521static Py_ssize_t 11522unicode_length(PyObject *self) 11523{ 11524 if (PyUnicode_READY(self) == -1) 11525 return -1; 11526 return PyUnicode_GET_LENGTH(self); 11527} 11528 11529PyDoc_STRVAR(ljust__doc__, 11530 "S.ljust(width[, fillchar]) -> str\n\ 11531\n\ 11532Return S left-justified in a Unicode string of length width. Padding is\n\ 11533done using the specified fill character (default is a space)."); 11534 11535static PyObject * 11536unicode_ljust(PyObject *self, PyObject *args) 11537{ 11538 Py_ssize_t width; 11539 Py_UCS4 fillchar = ' '; 11540 11541 if (PyUnicode_READY(self) == -1) 11542 return NULL; 11543 11544 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar)) 11545 return NULL; 11546 11547 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) { 11548 Py_INCREF(self); 11549 return self; 11550 } 11551 11552 return pad(self, 0, width - _PyUnicode_LENGTH(self), fillchar); 11553} 11554 11555PyDoc_STRVAR(lower__doc__, 11556 "S.lower() -> str\n\ 11557\n\ 11558Return a copy of the string S converted to lowercase."); 11559 11560static PyObject* 11561unicode_lower(PyObject *self) 11562{ 11563 return fixup(self, fixlower); 11564} 11565 11566#define LEFTSTRIP 0 11567#define RIGHTSTRIP 1 11568#define BOTHSTRIP 2 11569 11570/* Arrays indexed by above */ 11571static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"}; 11572 11573#define STRIPNAME(i) (stripformat[i]+3) 11574 11575/* externally visible for str.strip(unicode) */ 11576PyObject * 11577_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj) 11578{ 11579 void *data; 11580 int kind; 11581 Py_ssize_t i, j, len; 
11582 BLOOM_MASK sepmask; 11583 11584 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1) 11585 return NULL; 11586 11587 kind = PyUnicode_KIND(self); 11588 data = PyUnicode_DATA(self); 11589 len = PyUnicode_GET_LENGTH(self); 11590 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj), 11591 PyUnicode_DATA(sepobj), 11592 PyUnicode_GET_LENGTH(sepobj)); 11593 11594 i = 0; 11595 if (striptype != RIGHTSTRIP) { 11596 while (i < len && 11597 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) { 11598 i++; 11599 } 11600 } 11601 11602 j = len; 11603 if (striptype != LEFTSTRIP) { 11604 do { 11605 j--; 11606 } while (j >= i && 11607 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj)); 11608 j++; 11609 } 11610 11611 return PyUnicode_Substring(self, i, j); 11612} 11613 11614PyObject* 11615PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end) 11616{ 11617 unsigned char *data; 11618 int kind; 11619 Py_ssize_t length; 11620 11621 if (PyUnicode_READY(self) == -1) 11622 return NULL; 11623 11624 end = Py_MIN(end, PyUnicode_GET_LENGTH(self)); 11625 11626 if (start == 0 && end == PyUnicode_GET_LENGTH(self)) 11627 { 11628 if (PyUnicode_CheckExact(self)) { 11629 Py_INCREF(self); 11630 return self; 11631 } 11632 else 11633 return PyUnicode_Copy(self); 11634 } 11635 11636 length = end - start; 11637 if (length == 1) 11638 return unicode_getitem(self, start); 11639 11640 if (start < 0 || end < 0) { 11641 PyErr_SetString(PyExc_IndexError, "string index out of range"); 11642 return NULL; 11643 } 11644 11645 if (PyUnicode_IS_ASCII(self)) { 11646 kind = PyUnicode_KIND(self); 11647 data = PyUnicode_1BYTE_DATA(self); 11648 return unicode_fromascii(data + start, length); 11649 } 11650 else { 11651 kind = PyUnicode_KIND(self); 11652 data = PyUnicode_1BYTE_DATA(self); 11653 return PyUnicode_FromKindAndData(kind, 11654 data + kind * start, 11655 length); 11656 } 11657} 11658 11659static PyObject * 11660do_strip(PyObject *self, int striptype) 11661{ 11662 
int kind; 11663 void *data; 11664 Py_ssize_t len, i, j; 11665 11666 if (PyUnicode_READY(self) == -1) 11667 return NULL; 11668 11669 kind = PyUnicode_KIND(self); 11670 data = PyUnicode_DATA(self); 11671 len = PyUnicode_GET_LENGTH(self); 11672 11673 i = 0; 11674 if (striptype != RIGHTSTRIP) { 11675 while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) { 11676 i++; 11677 } 11678 } 11679 11680 j = len; 11681 if (striptype != LEFTSTRIP) { 11682 do { 11683 j--; 11684 } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j))); 11685 j++; 11686 } 11687 11688 return PyUnicode_Substring(self, i, j); 11689} 11690 11691 11692static PyObject * 11693do_argstrip(PyObject *self, int striptype, PyObject *args) 11694{ 11695 PyObject *sep = NULL; 11696 11697 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep)) 11698 return NULL; 11699 11700 if (sep != NULL && sep != Py_None) { 11701 if (PyUnicode_Check(sep)) 11702 return _PyUnicode_XStrip(self, striptype, sep); 11703 else { 11704 PyErr_Format(PyExc_TypeError, 11705 "%s arg must be None or str", 11706 STRIPNAME(striptype)); 11707 return NULL; 11708 } 11709 } 11710 11711 return do_strip(self, striptype); 11712} 11713 11714 11715PyDoc_STRVAR(strip__doc__, 11716 "S.strip([chars]) -> str\n\ 11717\n\ 11718Return a copy of the string S with leading and trailing\n\ 11719whitespace removed.\n\ 11720If chars is given and not None, remove characters in chars instead."); 11721 11722static PyObject * 11723unicode_strip(PyObject *self, PyObject *args) 11724{ 11725 if (PyTuple_GET_SIZE(args) == 0) 11726 return do_strip(self, BOTHSTRIP); /* Common case */ 11727 else 11728 return do_argstrip(self, BOTHSTRIP, args); 11729} 11730 11731 11732PyDoc_STRVAR(lstrip__doc__, 11733 "S.lstrip([chars]) -> str\n\ 11734\n\ 11735Return a copy of the string S with leading whitespace removed.\n\ 11736If chars is given and not None, remove characters in chars instead."); 11737 11738static PyObject * 11739unicode_lstrip(PyObject 
*self, PyObject *args) 11740{ 11741 if (PyTuple_GET_SIZE(args) == 0) 11742 return do_strip(self, LEFTSTRIP); /* Common case */ 11743 else 11744 return do_argstrip(self, LEFTSTRIP, args); 11745} 11746 11747 11748PyDoc_STRVAR(rstrip__doc__, 11749 "S.rstrip([chars]) -> str\n\ 11750\n\ 11751Return a copy of the string S with trailing whitespace removed.\n\ 11752If chars is given and not None, remove characters in chars instead."); 11753 11754static PyObject * 11755unicode_rstrip(PyObject *self, PyObject *args) 11756{ 11757 if (PyTuple_GET_SIZE(args) == 0) 11758 return do_strip(self, RIGHTSTRIP); /* Common case */ 11759 else 11760 return do_argstrip(self, RIGHTSTRIP, args); 11761} 11762 11763 11764static PyObject* 11765unicode_repeat(PyObject *str, Py_ssize_t len) 11766{ 11767 PyObject *u; 11768 Py_ssize_t nchars, n; 11769 11770 if (len < 1) { 11771 Py_INCREF(unicode_empty); 11772 return unicode_empty; 11773 } 11774 11775 if (len == 1 && PyUnicode_CheckExact(str)) { 11776 /* no repeat, return original string */ 11777 Py_INCREF(str); 11778 return str; 11779 } 11780 11781 if (PyUnicode_READY(str) == -1) 11782 return NULL; 11783 11784 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) { 11785 PyErr_SetString(PyExc_OverflowError, 11786 "repeated string is too long"); 11787 return NULL; 11788 } 11789 nchars = len * PyUnicode_GET_LENGTH(str); 11790 11791 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str)); 11792 if (!u) 11793 return NULL; 11794 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str)); 11795 11796 if (PyUnicode_GET_LENGTH(str) == 1) { 11797 const int kind = PyUnicode_KIND(str); 11798 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0); 11799 void *to = PyUnicode_DATA(u); 11800 if (kind == PyUnicode_1BYTE_KIND) 11801 memset(to, (unsigned char)fill_char, len); 11802 else { 11803 for (n = 0; n < len; ++n) 11804 PyUnicode_WRITE(kind, to, n, fill_char); 11805 } 11806 } 11807 else { 11808 /* number of characters copied this far */ 11809 Py_ssize_t 
done = PyUnicode_GET_LENGTH(str); 11810 const Py_ssize_t char_size = PyUnicode_KIND(str); 11811 char *to = (char *) PyUnicode_DATA(u); 11812 Py_MEMCPY(to, PyUnicode_DATA(str), 11813 PyUnicode_GET_LENGTH(str) * char_size); 11814 while (done < nchars) { 11815 n = (done <= nchars-done) ? done : nchars-done; 11816 Py_MEMCPY(to + (done * char_size), to, n * char_size); 11817 done += n; 11818 } 11819 } 11820 11821 assert(_PyUnicode_CheckConsistency(u, 1)); 11822 return u; 11823} 11824 11825PyObject * 11826PyUnicode_Replace(PyObject *obj, 11827 PyObject *subobj, 11828 PyObject *replobj, 11829 Py_ssize_t maxcount) 11830{ 11831 PyObject *self; 11832 PyObject *str1; 11833 PyObject *str2; 11834 PyObject *result; 11835 11836 self = PyUnicode_FromObject(obj); 11837 if (self == NULL || PyUnicode_READY(self) == -1) 11838 return NULL; 11839 str1 = PyUnicode_FromObject(subobj); 11840 if (str1 == NULL || PyUnicode_READY(str1) == -1) { 11841 Py_DECREF(self); 11842 return NULL; 11843 } 11844 str2 = PyUnicode_FromObject(replobj); 11845 if (str2 == NULL || PyUnicode_READY(str2)) { 11846 Py_DECREF(self); 11847 Py_DECREF(str1); 11848 return NULL; 11849 } 11850 result = replace(self, str1, str2, maxcount); 11851 Py_DECREF(self); 11852 Py_DECREF(str1); 11853 Py_DECREF(str2); 11854 return result; 11855} 11856 11857PyDoc_STRVAR(replace__doc__, 11858 "S.replace(old, new[, count]) -> str\n\ 11859\n\ 11860Return a copy of S with all occurrences of substring\n\ 11861old replaced by new. 
If the optional argument count is\n\ 11862given, only the first count occurrences are replaced."); 11863 11864static PyObject* 11865unicode_replace(PyObject *self, PyObject *args) 11866{ 11867 PyObject *str1; 11868 PyObject *str2; 11869 Py_ssize_t maxcount = -1; 11870 PyObject *result; 11871 11872 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount)) 11873 return NULL; 11874 if (!PyUnicode_READY(self) == -1) 11875 return NULL; 11876 str1 = PyUnicode_FromObject(str1); 11877 if (str1 == NULL || PyUnicode_READY(str1) == -1) 11878 return NULL; 11879 str2 = PyUnicode_FromObject(str2); 11880 if (str2 == NULL || PyUnicode_READY(str2) == -1) { 11881 Py_DECREF(str1); 11882 return NULL; 11883 } 11884 11885 result = replace(self, str1, str2, maxcount); 11886 11887 Py_DECREF(str1); 11888 Py_DECREF(str2); 11889 return result; 11890} 11891 11892static PyObject * 11893unicode_repr(PyObject *unicode) 11894{ 11895 PyObject *repr; 11896 Py_ssize_t isize; 11897 Py_ssize_t osize, squote, dquote, i, o; 11898 Py_UCS4 max, quote; 11899 int ikind, okind; 11900 void *idata, *odata; 11901 11902 if (PyUnicode_READY(unicode) == -1) 11903 return NULL; 11904 11905 isize = PyUnicode_GET_LENGTH(unicode); 11906 idata = PyUnicode_DATA(unicode); 11907 11908 /* Compute length of output, quote characters, and 11909 maximum character */ 11910 osize = 2; /* quotes */ 11911 max = 127; 11912 squote = dquote = 0; 11913 ikind = PyUnicode_KIND(unicode); 11914 for (i = 0; i < isize; i++) { 11915 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i); 11916 switch (ch) { 11917 case '\'': squote++; osize++; break; 11918 case '"': dquote++; osize++; break; 11919 case '\\': case '\t': case '\r': case '\n': 11920 osize += 2; break; 11921 default: 11922 /* Fast-path ASCII */ 11923 if (ch < ' ' || ch == 0x7f) 11924 osize += 4; /* \xHH */ 11925 else if (ch < 0x7f) 11926 osize++; 11927 else if (Py_UNICODE_ISPRINTABLE(ch)) { 11928 osize++; 11929 max = ch > max ? 
ch : max; 11930 } 11931 else if (ch < 0x100) 11932 osize += 4; /* \xHH */ 11933 else if (ch < 0x10000) 11934 osize += 6; /* \uHHHH */ 11935 else 11936 osize += 10; /* \uHHHHHHHH */ 11937 } 11938 } 11939 11940 quote = '\''; 11941 if (squote) { 11942 if (dquote) 11943 /* Both squote and dquote present. Use squote, 11944 and escape them */ 11945 osize += squote; 11946 else 11947 quote = '"'; 11948 } 11949 11950 repr = PyUnicode_New(osize, max); 11951 if (repr == NULL) 11952 return NULL; 11953 okind = PyUnicode_KIND(repr); 11954 odata = PyUnicode_DATA(repr); 11955 11956 PyUnicode_WRITE(okind, odata, 0, quote); 11957 PyUnicode_WRITE(okind, odata, osize-1, quote); 11958 11959 for (i = 0, o = 1; i < isize; i++) { 11960 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i); 11961 11962 /* Escape quotes and backslashes */ 11963 if ((ch == quote) || (ch == '\\')) { 11964 PyUnicode_WRITE(okind, odata, o++, '\\'); 11965 PyUnicode_WRITE(okind, odata, o++, ch); 11966 continue; 11967 } 11968 11969 /* Map special whitespace to '\t', \n', '\r' */ 11970 if (ch == '\t') { 11971 PyUnicode_WRITE(okind, odata, o++, '\\'); 11972 PyUnicode_WRITE(okind, odata, o++, 't'); 11973 } 11974 else if (ch == '\n') { 11975 PyUnicode_WRITE(okind, odata, o++, '\\'); 11976 PyUnicode_WRITE(okind, odata, o++, 'n'); 11977 } 11978 else if (ch == '\r') { 11979 PyUnicode_WRITE(okind, odata, o++, '\\'); 11980 PyUnicode_WRITE(okind, odata, o++, 'r'); 11981 } 11982 11983 /* Map non-printable US ASCII to '\xhh' */ 11984 else if (ch < ' ' || ch == 0x7F) { 11985 PyUnicode_WRITE(okind, odata, o++, '\\'); 11986 PyUnicode_WRITE(okind, odata, o++, 'x'); 11987 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]); 11988 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]); 11989 } 11990 11991 /* Copy ASCII characters as-is */ 11992 else if (ch < 0x7F) { 11993 PyUnicode_WRITE(okind, odata, o++, ch); 11994 } 11995 11996 /* Non-ASCII characters */ 11997 else { 11998 /* Map Unicode whitespace and control 
characters 11999 (categories Z* and C* except ASCII space) 12000 */ 12001 if (!Py_UNICODE_ISPRINTABLE(ch)) { 12002 /* Map 8-bit characters to '\xhh' */ 12003 if (ch <= 0xff) { 12004 PyUnicode_WRITE(okind, odata, o++, '\\'); 12005 PyUnicode_WRITE(okind, odata, o++, 'x'); 12006 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]); 12007 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]); 12008 } 12009 /* Map 21-bit characters to '\U00xxxxxx' */ 12010 else if (ch >= 0x10000) { 12011 PyUnicode_WRITE(okind, odata, o++, '\\'); 12012 PyUnicode_WRITE(okind, odata, o++, 'U'); 12013 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]); 12014 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]); 12015 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]); 12016 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]); 12017 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]); 12018 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]); 12019 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]); 12020 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]); 12021 } 12022 /* Map 16-bit characters to '\uxxxx' */ 12023 else { 12024 PyUnicode_WRITE(okind, odata, o++, '\\'); 12025 PyUnicode_WRITE(okind, odata, o++, 'u'); 12026 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]); 12027 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]); 12028 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]); 12029 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]); 12030 } 12031 } 12032 /* Copy characters as-is */ 12033 else { 12034 PyUnicode_WRITE(okind, odata, o++, ch); 12035 } 12036 } 12037 } 12038 /* Closing quote already added at the beginning */ 12039 assert(_PyUnicode_CheckConsistency(repr, 1)); 12040 return repr; 12041} 12042 12043PyDoc_STRVAR(rfind__doc__, 12044 "S.rfind(sub[, start[, end]]) -> int\n\ 12045\n\ 
12046Return the highest index in S where substring sub is found,\n\ 12047such that sub is contained within S[start:end]. Optional\n\ 12048arguments start and end are interpreted as in slice notation.\n\ 12049\n\ 12050Return -1 on failure."); 12051 12052static PyObject * 12053unicode_rfind(PyObject *self, PyObject *args) 12054{ 12055 PyObject *substring; 12056 Py_ssize_t start; 12057 Py_ssize_t end; 12058 Py_ssize_t result; 12059 12060 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring, 12061 &start, &end)) 12062 return NULL; 12063 12064 if (PyUnicode_READY(self) == -1) 12065 return NULL; 12066 if (PyUnicode_READY(substring) == -1) 12067 return NULL; 12068 12069 result = any_find_slice(-1, self, substring, start, end); 12070 12071 Py_DECREF(substring); 12072 12073 if (result == -2) 12074 return NULL; 12075 12076 return PyLong_FromSsize_t(result); 12077} 12078 12079PyDoc_STRVAR(rindex__doc__, 12080 "S.rindex(sub[, start[, end]]) -> int\n\ 12081\n\ 12082Like S.rfind() but raise ValueError when the substring is not found."); 12083 12084static PyObject * 12085unicode_rindex(PyObject *self, PyObject *args) 12086{ 12087 PyObject *substring; 12088 Py_ssize_t start; 12089 Py_ssize_t end; 12090 Py_ssize_t result; 12091 12092 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring, 12093 &start, &end)) 12094 return NULL; 12095 12096 if (PyUnicode_READY(self) == -1) 12097 return NULL; 12098 if (PyUnicode_READY(substring) == -1) 12099 return NULL; 12100 12101 result = any_find_slice(-1, self, substring, start, end); 12102 12103 Py_DECREF(substring); 12104 12105 if (result == -2) 12106 return NULL; 12107 12108 if (result < 0) { 12109 PyErr_SetString(PyExc_ValueError, "substring not found"); 12110 return NULL; 12111 } 12112 12113 return PyLong_FromSsize_t(result); 12114} 12115 12116PyDoc_STRVAR(rjust__doc__, 12117 "S.rjust(width[, fillchar]) -> str\n\ 12118\n\ 12119Return S right-justified in a string of length width. 
Padding is\n\ 12120done using the specified fill character (default is a space)."); 12121 12122static PyObject * 12123unicode_rjust(PyObject *self, PyObject *args) 12124{ 12125 Py_ssize_t width; 12126 Py_UCS4 fillchar = ' '; 12127 12128 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar)) 12129 return NULL; 12130 12131 if (PyUnicode_READY(self) == -1) 12132 return NULL; 12133 12134 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) { 12135 Py_INCREF(self); 12136 return self; 12137 } 12138 12139 return pad(self, width - _PyUnicode_LENGTH(self), 0, fillchar); 12140} 12141 12142PyObject * 12143PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit) 12144{ 12145 PyObject *result; 12146 12147 s = PyUnicode_FromObject(s); 12148 if (s == NULL) 12149 return NULL; 12150 if (sep != NULL) { 12151 sep = PyUnicode_FromObject(sep); 12152 if (sep == NULL) { 12153 Py_DECREF(s); 12154 return NULL; 12155 } 12156 } 12157 12158 result = split(s, sep, maxsplit); 12159 12160 Py_DECREF(s); 12161 Py_XDECREF(sep); 12162 return result; 12163} 12164 12165PyDoc_STRVAR(split__doc__, 12166 "S.split([sep[, maxsplit]]) -> list of strings\n\ 12167\n\ 12168Return a list of the words in S, using sep as the\n\ 12169delimiter string. If maxsplit is given, at most maxsplit\n\ 12170splits are done. 
If sep is not specified or is None, any\n\ 12171whitespace string is a separator and empty strings are\n\ 12172removed from the result."); 12173 12174static PyObject* 12175unicode_split(PyObject *self, PyObject *args) 12176{ 12177 PyObject *substring = Py_None; 12178 Py_ssize_t maxcount = -1; 12179 12180 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount)) 12181 return NULL; 12182 12183 if (substring == Py_None) 12184 return split(self, NULL, maxcount); 12185 else if (PyUnicode_Check(substring)) 12186 return split(self, substring, maxcount); 12187 else 12188 return PyUnicode_Split(self, substring, maxcount); 12189} 12190 12191PyObject * 12192PyUnicode_Partition(PyObject *str_in, PyObject *sep_in) 12193{ 12194 PyObject* str_obj; 12195 PyObject* sep_obj; 12196 PyObject* out; 12197 int kind1, kind2, kind; 12198 void *buf1 = NULL, *buf2 = NULL; 12199 Py_ssize_t len1, len2; 12200 12201 str_obj = PyUnicode_FromObject(str_in); 12202 if (!str_obj || PyUnicode_READY(str_obj) == -1) 12203 return NULL; 12204 sep_obj = PyUnicode_FromObject(sep_in); 12205 if (!sep_obj || PyUnicode_READY(sep_obj) == -1) { 12206 Py_DECREF(str_obj); 12207 return NULL; 12208 } 12209 12210 kind1 = PyUnicode_KIND(str_obj); 12211 kind2 = PyUnicode_KIND(sep_obj); 12212 kind = Py_MAX(kind1, kind2); 12213 buf1 = PyUnicode_DATA(str_obj); 12214 if (kind1 != kind) 12215 buf1 = _PyUnicode_AsKind(str_obj, kind); 12216 if (!buf1) 12217 goto onError; 12218 buf2 = PyUnicode_DATA(sep_obj); 12219 if (kind2 != kind) 12220 buf2 = _PyUnicode_AsKind(sep_obj, kind); 12221 if (!buf2) 12222 goto onError; 12223 len1 = PyUnicode_GET_LENGTH(str_obj); 12224 len2 = PyUnicode_GET_LENGTH(sep_obj); 12225 12226 switch(PyUnicode_KIND(str_obj)) { 12227 case PyUnicode_1BYTE_KIND: 12228 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj)) 12229 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2); 12230 else 12231 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2); 12232 break; 
12233 case PyUnicode_2BYTE_KIND: 12234 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2); 12235 break; 12236 case PyUnicode_4BYTE_KIND: 12237 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2); 12238 break; 12239 default: 12240 assert(0); 12241 out = 0; 12242 } 12243 12244 Py_DECREF(sep_obj); 12245 Py_DECREF(str_obj); 12246 if (kind1 != kind) 12247 PyMem_Free(buf1); 12248 if (kind2 != kind) 12249 PyMem_Free(buf2); 12250 12251 return out; 12252 onError: 12253 Py_DECREF(sep_obj); 12254 Py_DECREF(str_obj); 12255 if (kind1 != kind && buf1) 12256 PyMem_Free(buf1); 12257 if (kind2 != kind && buf2) 12258 PyMem_Free(buf2); 12259 return NULL; 12260} 12261 12262 12263PyObject * 12264PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in) 12265{ 12266 PyObject* str_obj; 12267 PyObject* sep_obj; 12268 PyObject* out; 12269 int kind1, kind2, kind; 12270 void *buf1 = NULL, *buf2 = NULL; 12271 Py_ssize_t len1, len2; 12272 12273 str_obj = PyUnicode_FromObject(str_in); 12274 if (!str_obj) 12275 return NULL; 12276 sep_obj = PyUnicode_FromObject(sep_in); 12277 if (!sep_obj) { 12278 Py_DECREF(str_obj); 12279 return NULL; 12280 } 12281 12282 kind1 = PyUnicode_KIND(str_in); 12283 kind2 = PyUnicode_KIND(sep_obj); 12284 kind = Py_MAX(kind1, kind2); 12285 buf1 = PyUnicode_DATA(str_in); 12286 if (kind1 != kind) 12287 buf1 = _PyUnicode_AsKind(str_in, kind); 12288 if (!buf1) 12289 goto onError; 12290 buf2 = PyUnicode_DATA(sep_obj); 12291 if (kind2 != kind) 12292 buf2 = _PyUnicode_AsKind(sep_obj, kind); 12293 if (!buf2) 12294 goto onError; 12295 len1 = PyUnicode_GET_LENGTH(str_obj); 12296 len2 = PyUnicode_GET_LENGTH(sep_obj); 12297 12298 switch(PyUnicode_KIND(str_in)) { 12299 case PyUnicode_1BYTE_KIND: 12300 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj)) 12301 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2); 12302 else 12303 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2); 12304 break; 12305 case 
PyUnicode_2BYTE_KIND: 12306 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2); 12307 break; 12308 case PyUnicode_4BYTE_KIND: 12309 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2); 12310 break; 12311 default: 12312 assert(0); 12313 out = 0; 12314 } 12315 12316 Py_DECREF(sep_obj); 12317 Py_DECREF(str_obj); 12318 if (kind1 != kind) 12319 PyMem_Free(buf1); 12320 if (kind2 != kind) 12321 PyMem_Free(buf2); 12322 12323 return out; 12324 onError: 12325 Py_DECREF(sep_obj); 12326 Py_DECREF(str_obj); 12327 if (kind1 != kind && buf1) 12328 PyMem_Free(buf1); 12329 if (kind2 != kind && buf2) 12330 PyMem_Free(buf2); 12331 return NULL; 12332} 12333 12334PyDoc_STRVAR(partition__doc__, 12335 "S.partition(sep) -> (head, sep, tail)\n\ 12336\n\ 12337Search for the separator sep in S, and return the part before it,\n\ 12338the separator itself, and the part after it. If the separator is not\n\ 12339found, return S and two empty strings."); 12340 12341static PyObject* 12342unicode_partition(PyObject *self, PyObject *separator) 12343{ 12344 return PyUnicode_Partition(self, separator); 12345} 12346 12347PyDoc_STRVAR(rpartition__doc__, 12348 "S.rpartition(sep) -> (head, sep, tail)\n\ 12349\n\ 12350Search for the separator sep in S, starting at the end of S, and return\n\ 12351the part before it, the separator itself, and the part after it. 
If the\n\ 12352separator is not found, return two empty strings and S."); 12353 12354static PyObject* 12355unicode_rpartition(PyObject *self, PyObject *separator) 12356{ 12357 return PyUnicode_RPartition(self, separator); 12358} 12359 12360PyObject * 12361PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit) 12362{ 12363 PyObject *result; 12364 12365 s = PyUnicode_FromObject(s); 12366 if (s == NULL) 12367 return NULL; 12368 if (sep != NULL) { 12369 sep = PyUnicode_FromObject(sep); 12370 if (sep == NULL) { 12371 Py_DECREF(s); 12372 return NULL; 12373 } 12374 } 12375 12376 result = rsplit(s, sep, maxsplit); 12377 12378 Py_DECREF(s); 12379 Py_XDECREF(sep); 12380 return result; 12381} 12382 12383PyDoc_STRVAR(rsplit__doc__, 12384 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\ 12385\n\ 12386Return a list of the words in S, using sep as the\n\ 12387delimiter string, starting at the end of the string and\n\ 12388working to the front. If maxsplit is given, at most maxsplit\n\ 12389splits are done. 
If sep is not specified, any whitespace string\n\ 12390is a separator."); 12391 12392static PyObject* 12393unicode_rsplit(PyObject *self, PyObject *args) 12394{ 12395 PyObject *substring = Py_None; 12396 Py_ssize_t maxcount = -1; 12397 12398 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount)) 12399 return NULL; 12400 12401 if (substring == Py_None) 12402 return rsplit(self, NULL, maxcount); 12403 else if (PyUnicode_Check(substring)) 12404 return rsplit(self, substring, maxcount); 12405 else 12406 return PyUnicode_RSplit(self, substring, maxcount); 12407} 12408 12409PyDoc_STRVAR(splitlines__doc__, 12410 "S.splitlines([keepends]) -> list of strings\n\ 12411\n\ 12412Return a list of the lines in S, breaking at line boundaries.\n\ 12413Line breaks are not included in the resulting list unless keepends\n\ 12414is given and true."); 12415 12416static PyObject* 12417unicode_splitlines(PyObject *self, PyObject *args, PyObject *kwds) 12418{ 12419 static char *kwlist[] = {"keepends", 0}; 12420 int keepends = 0; 12421 12422 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines", 12423 kwlist, &keepends)) 12424 return NULL; 12425 12426 return PyUnicode_Splitlines(self, keepends); 12427} 12428 12429static 12430PyObject *unicode_str(PyObject *self) 12431{ 12432 if (PyUnicode_CheckExact(self)) { 12433 Py_INCREF(self); 12434 return self; 12435 } else 12436 /* Subtype -- return genuine unicode string with the same value. 
*/ 12437 return PyUnicode_Copy(self); 12438} 12439 12440PyDoc_STRVAR(swapcase__doc__, 12441 "S.swapcase() -> str\n\ 12442\n\ 12443Return a copy of S with uppercase characters converted to lowercase\n\ 12444and vice versa."); 12445 12446static PyObject* 12447unicode_swapcase(PyObject *self) 12448{ 12449 return fixup(self, fixswapcase); 12450} 12451 12452PyDoc_STRVAR(maketrans__doc__, 12453 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\ 12454\n\ 12455Return a translation table usable for str.translate().\n\ 12456If there is only one argument, it must be a dictionary mapping Unicode\n\ 12457ordinals (integers) or characters to Unicode ordinals, strings or None.\n\ 12458Character keys will be then converted to ordinals.\n\ 12459If there are two arguments, they must be strings of equal length, and\n\ 12460in the resulting dictionary, each character in x will be mapped to the\n\ 12461character at the same position in y. If there is a third argument, it\n\ 12462must be a string, whose characters will be mapped to None in the result."); 12463 12464static PyObject* 12465unicode_maketrans(PyObject *null, PyObject *args) 12466{ 12467 PyObject *x, *y = NULL, *z = NULL; 12468 PyObject *new = NULL, *key, *value; 12469 Py_ssize_t i = 0; 12470 int res; 12471 12472 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z)) 12473 return NULL; 12474 new = PyDict_New(); 12475 if (!new) 12476 return NULL; 12477 if (y != NULL) { 12478 int x_kind, y_kind, z_kind; 12479 void *x_data, *y_data, *z_data; 12480 12481 /* x must be a string too, of equal length */ 12482 if (!PyUnicode_Check(x)) { 12483 PyErr_SetString(PyExc_TypeError, "first maketrans argument must " 12484 "be a string if there is a second argument"); 12485 goto err; 12486 } 12487 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) { 12488 PyErr_SetString(PyExc_ValueError, "the first two maketrans " 12489 "arguments must have equal length"); 12490 goto err; 12491 } 12492 /* create entries for translating chars in x 
to those in y */ 12493 x_kind = PyUnicode_KIND(x); 12494 y_kind = PyUnicode_KIND(y); 12495 x_data = PyUnicode_DATA(x); 12496 y_data = PyUnicode_DATA(y); 12497 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) { 12498 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i)); 12499 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i)); 12500 if (!key || !value) 12501 goto err; 12502 res = PyDict_SetItem(new, key, value); 12503 Py_DECREF(key); 12504 Py_DECREF(value); 12505 if (res < 0) 12506 goto err; 12507 } 12508 /* create entries for deleting chars in z */ 12509 if (z != NULL) { 12510 z_kind = PyUnicode_KIND(z); 12511 z_data = PyUnicode_DATA(z); 12512 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) { 12513 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i)); 12514 if (!key) 12515 goto err; 12516 res = PyDict_SetItem(new, key, Py_None); 12517 Py_DECREF(key); 12518 if (res < 0) 12519 goto err; 12520 } 12521 } 12522 } else { 12523 int kind; 12524 void *data; 12525 12526 /* x must be a dict */ 12527 if (!PyDict_CheckExact(x)) { 12528 PyErr_SetString(PyExc_TypeError, "if you give only one argument " 12529 "to maketrans it must be a dict"); 12530 goto err; 12531 } 12532 /* copy entries into the new dict, converting string keys to int keys */ 12533 while (PyDict_Next(x, &i, &key, &value)) { 12534 if (PyUnicode_Check(key)) { 12535 /* convert string keys to integer keys */ 12536 PyObject *newkey; 12537 if (PyUnicode_GET_LENGTH(key) != 1) { 12538 PyErr_SetString(PyExc_ValueError, "string keys in translate " 12539 "table must be of length 1"); 12540 goto err; 12541 } 12542 kind = PyUnicode_KIND(key); 12543 data = PyUnicode_DATA(key); 12544 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0)); 12545 if (!newkey) 12546 goto err; 12547 res = PyDict_SetItem(new, newkey, value); 12548 Py_DECREF(newkey); 12549 if (res < 0) 12550 goto err; 12551 } else if (PyLong_Check(key)) { 12552 /* just keep integer keys */ 12553 if (PyDict_SetItem(new, key, value) < 0) 12554 goto err; 
12555 } else { 12556 PyErr_SetString(PyExc_TypeError, "keys in translate table must " 12557 "be strings or integers"); 12558 goto err; 12559 } 12560 } 12561 } 12562 return new; 12563 err: 12564 Py_DECREF(new); 12565 return NULL; 12566} 12567 12568PyDoc_STRVAR(translate__doc__, 12569 "S.translate(table) -> str\n\ 12570\n\ 12571Return a copy of the string S, where all characters have been mapped\n\ 12572through the given translation table, which must be a mapping of\n\ 12573Unicode ordinals to Unicode ordinals, strings, or None.\n\ 12574Unmapped characters are left untouched. Characters mapped to None\n\ 12575are deleted."); 12576 12577static PyObject* 12578unicode_translate(PyObject *self, PyObject *table) 12579{ 12580 return _PyUnicode_TranslateCharmap(self, table, "ignore"); 12581} 12582 12583PyDoc_STRVAR(upper__doc__, 12584 "S.upper() -> str\n\ 12585\n\ 12586Return a copy of S converted to uppercase."); 12587 12588static PyObject* 12589unicode_upper(PyObject *self) 12590{ 12591 return fixup(self, fixupper); 12592} 12593 12594PyDoc_STRVAR(zfill__doc__, 12595 "S.zfill(width) -> str\n\ 12596\n\ 12597Pad a numeric string S with zeros on the left, to fill a field\n\ 12598of the specified width. 
The string S is never truncated."); 12599 12600static PyObject * 12601unicode_zfill(PyObject *self, PyObject *args) 12602{ 12603 Py_ssize_t fill; 12604 PyObject *u; 12605 Py_ssize_t width; 12606 int kind; 12607 void *data; 12608 Py_UCS4 chr; 12609 12610 if (PyUnicode_READY(self) == -1) 12611 return NULL; 12612 12613 if (!PyArg_ParseTuple(args, "n:zfill", &width)) 12614 return NULL; 12615 12616 if (PyUnicode_GET_LENGTH(self) >= width) { 12617 if (PyUnicode_CheckExact(self)) { 12618 Py_INCREF(self); 12619 return self; 12620 } 12621 else 12622 return PyUnicode_Copy(self); 12623 } 12624 12625 fill = width - _PyUnicode_LENGTH(self); 12626 12627 u = pad(self, fill, 0, '0'); 12628 12629 if (u == NULL) 12630 return NULL; 12631 12632 kind = PyUnicode_KIND(u); 12633 data = PyUnicode_DATA(u); 12634 chr = PyUnicode_READ(kind, data, fill); 12635 12636 if (chr == '+' || chr == '-') { 12637 /* move sign to beginning of string */ 12638 PyUnicode_WRITE(kind, data, 0, chr); 12639 PyUnicode_WRITE(kind, data, fill, '0'); 12640 } 12641 12642 assert(_PyUnicode_CheckConsistency(u, 1)); 12643 return u; 12644} 12645 12646#if 0 12647static PyObject * 12648unicode__decimal2ascii(PyObject *self) 12649{ 12650 return PyUnicode_TransformDecimalAndSpaceToASCII(self); 12651} 12652#endif 12653 12654PyDoc_STRVAR(startswith__doc__, 12655 "S.startswith(prefix[, start[, end]]) -> bool\n\ 12656\n\ 12657Return True if S starts with the specified prefix, False otherwise.\n\ 12658With optional start, test S beginning at that position.\n\ 12659With optional end, stop comparing S at that position.\n\ 12660prefix can also be a tuple of strings to try."); 12661 12662static PyObject * 12663unicode_startswith(PyObject *self, 12664 PyObject *args) 12665{ 12666 PyObject *subobj; 12667 PyObject *substring; 12668 Py_ssize_t start = 0; 12669 Py_ssize_t end = PY_SSIZE_T_MAX; 12670 int result; 12671 12672 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end)) 12673 return NULL; 12674 if 
(PyTuple_Check(subobj)) { 12675 Py_ssize_t i; 12676 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) { 12677 substring = PyUnicode_FromObject(PyTuple_GET_ITEM(subobj, i)); 12678 if (substring == NULL) 12679 return NULL; 12680 result = tailmatch(self, substring, start, end, -1); 12681 Py_DECREF(substring); 12682 if (result) { 12683 Py_RETURN_TRUE; 12684 } 12685 } 12686 /* nothing matched */ 12687 Py_RETURN_FALSE; 12688 } 12689 substring = PyUnicode_FromObject(subobj); 12690 if (substring == NULL) { 12691 if (PyErr_ExceptionMatches(PyExc_TypeError)) 12692 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or " 12693 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name); 12694 return NULL; 12695 } 12696 result = tailmatch(self, substring, start, end, -1); 12697 Py_DECREF(substring); 12698 return PyBool_FromLong(result); 12699} 12700 12701 12702PyDoc_STRVAR(endswith__doc__, 12703 "S.endswith(suffix[, start[, end]]) -> bool\n\ 12704\n\ 12705Return True if S ends with the specified suffix, False otherwise.\n\ 12706With optional start, test S beginning at that position.\n\ 12707With optional end, stop comparing S at that position.\n\ 12708suffix can also be a tuple of strings to try."); 12709 12710static PyObject * 12711unicode_endswith(PyObject *self, 12712 PyObject *args) 12713{ 12714 PyObject *subobj; 12715 PyObject *substring; 12716 Py_ssize_t start = 0; 12717 Py_ssize_t end = PY_SSIZE_T_MAX; 12718 int result; 12719 12720 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end)) 12721 return NULL; 12722 if (PyTuple_Check(subobj)) { 12723 Py_ssize_t i; 12724 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) { 12725 substring = PyUnicode_FromObject( 12726 PyTuple_GET_ITEM(subobj, i)); 12727 if (substring == NULL) 12728 return NULL; 12729 result = tailmatch(self, substring, start, end, +1); 12730 Py_DECREF(substring); 12731 if (result) { 12732 Py_RETURN_TRUE; 12733 } 12734 } 12735 Py_RETURN_FALSE; 12736 } 12737 substring = 
PyUnicode_FromObject(subobj); 12738 if (substring == NULL) { 12739 if (PyErr_ExceptionMatches(PyExc_TypeError)) 12740 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or " 12741 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name); 12742 return NULL; 12743 } 12744 result = tailmatch(self, substring, start, end, +1); 12745 Py_DECREF(substring); 12746 return PyBool_FromLong(result); 12747} 12748 12749#include "stringlib/unicode_format.h" 12750 12751PyDoc_STRVAR(format__doc__, 12752 "S.format(*args, **kwargs) -> str\n\ 12753\n\ 12754Return a formatted version of S, using substitutions from args and kwargs.\n\ 12755The substitutions are identified by braces ('{' and '}')."); 12756 12757PyDoc_STRVAR(format_map__doc__, 12758 "S.format_map(mapping) -> str\n\ 12759\n\ 12760Return a formatted version of S, using substitutions from mapping.\n\ 12761The substitutions are identified by braces ('{' and '}')."); 12762 12763static PyObject * 12764unicode__format__(PyObject* self, PyObject* args) 12765{ 12766 PyObject *format_spec, *out; 12767 12768 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec)) 12769 return NULL; 12770 12771 out = _PyUnicode_FormatAdvanced(self, format_spec, 0, 12772 PyUnicode_GET_LENGTH(format_spec)); 12773 return out; 12774} 12775 12776PyDoc_STRVAR(p_format__doc__, 12777 "S.__format__(format_spec) -> str\n\ 12778\n\ 12779Return a formatted version of S as described by format_spec."); 12780 12781static PyObject * 12782unicode__sizeof__(PyObject *v) 12783{ 12784 Py_ssize_t size; 12785 12786 /* If it's a compact object, account for base structure + 12787 character data. */ 12788 if (PyUnicode_IS_COMPACT_ASCII(v)) 12789 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1; 12790 else if (PyUnicode_IS_COMPACT(v)) 12791 size = sizeof(PyCompactUnicodeObject) + 12792 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_KIND(v); 12793 else { 12794 /* If it is a two-block object, account for base object, and 12795 for character block if present. 
*/ 12796 size = sizeof(PyUnicodeObject); 12797 if (_PyUnicode_DATA_ANY(v)) 12798 size += (PyUnicode_GET_LENGTH(v) + 1) * 12799 PyUnicode_KIND(v); 12800 } 12801 /* If the wstr pointer is present, account for it unless it is shared 12802 with the data pointer. Check if the data is not shared. */ 12803 if (_PyUnicode_HAS_WSTR_MEMORY(v)) 12804 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t); 12805 if (_PyUnicode_HAS_UTF8_MEMORY(v)) 12806 size += PyUnicode_UTF8_LENGTH(v) + 1; 12807 12808 return PyLong_FromSsize_t(size); 12809} 12810 12811PyDoc_STRVAR(sizeof__doc__, 12812 "S.__sizeof__() -> size of S in memory, in bytes"); 12813 12814static PyObject * 12815unicode_getnewargs(PyObject *v) 12816{ 12817 PyObject *copy = PyUnicode_Copy(v); 12818 if (!copy) 12819 return NULL; 12820 return Py_BuildValue("(N)", copy); 12821} 12822 12823static PyMethodDef unicode_methods[] = { 12824 12825 /* Order is according to common usage: often used methods should 12826 appear first, since lookup is done sequentially. 
*/

    {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
    {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
    {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
    {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
    {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
    {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
    {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
    {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
    {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
    {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
    {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
    {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
    {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
    {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
    {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
    {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
    {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
    {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
    {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
    {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
    {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
    {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
    {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
    {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
    {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
    {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
    {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
    {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
    {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
    {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
    {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
    {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
    {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
    {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
    {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
    {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
    {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
    {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
    {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
    {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
    {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
    {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
    {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
    {"maketrans", (PyCFunction) unicode_maketrans,
     METH_VARARGS | METH_STATIC, maketrans__doc__},
    {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
#if 0
    {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
#endif

#if 0
    /* These methods are just used for debugging the implementation. */
    {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
#endif

    {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
    {NULL, NULL}  /* sentinel */
};

/* nb_remainder slot, i.e. the "%" operator.  Returns NotImplemented when the
   left operand is not a str; otherwise delegates to PyUnicode_Format(). */
static PyObject *
unicode_mod(PyObject *v, PyObject *w)
{
    if (!PyUnicode_Check(v))
        Py_RETURN_NOTIMPLEMENTED;
    return PyUnicode_Format(v, w);
}

/* Only the remainder slot is filled in; the initializer stops there, so all
   later slots are zero-initialized. */
static PyNumberMethods unicode_as_number = {
    0,                              /*nb_add*/
    0,                              /*nb_subtract*/
    0,                              /*nb_multiply*/
    unicode_mod,                    /*nb_remainder*/
};

static PySequenceMethods unicode_as_sequence = {
    (lenfunc) unicode_length,       /* sq_length */
    PyUnicode_Concat,               /* sq_concat */
    (ssizeargfunc) unicode_repeat,  /* sq_repeat */
    (ssizeargfunc) unicode_getitem, /* sq_item */
    0,                              /* sq_slice */
    0,                              /* sq_ass_item */
    0,                              /* sq_ass_slice */
    PyUnicode_Contains,             /* sq_contains */
};

/* mp_subscript slot: self[item].

   - item supports the index protocol: negative values are offset by the
     length once, then unicode_getitem() does the lookup.
   - item is a slice: empty slices give a new empty string, a full step-1
     slice of an exact str returns self, other step-1 slices go through
     PyUnicode_Substring(), and stepped slices are copied character by
     character into a string sized for the largest character selected.
   - anything else: TypeError.

   Returns a new reference, or NULL with an exception set. */
static PyObject*
unicode_subscript(PyObject* self, PyObject* item)
{
    if (PyUnicode_READY(self) == -1)
        return NULL;

    if (PyIndex_Check(item)) {
        Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
        if (i == -1 && PyErr_Occurred())
            return NULL;
        if (i < 0)
            i += PyUnicode_GET_LENGTH(self);
        return unicode_getitem(self, i);
    } else if (PySlice_Check(item)) {
        Py_ssize_t start, stop, step, slicelength, cur, i;
        PyObject *result;
        void *src_data, *dest_data;
        int src_kind, dest_kind;
        Py_UCS4 ch, max_char, kind_limit;

        if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
                                 &start, &stop, &step, &slicelength) < 0) {
            return NULL;
        }

        if (slicelength <= 0) {
            return PyUnicode_New(0, 0);
        } else if (start == 0 && step == 1 &&
                   slicelength == PyUnicode_GET_LENGTH(self) &&
                   PyUnicode_CheckExact(self)) {
            Py_INCREF(self);
            return self;
        } else if (step == 1) {
            return PyUnicode_Substring(self,
                                       start, start + slicelength);
        }
        /* General case */
        src_kind = PyUnicode_KIND(self);
        src_data = PyUnicode_DATA(self);
        if (!PyUnicode_IS_ASCII(self)) {
            /* Find the largest selected character; stop scanning as soon as
               it reaches the limit returned for the source kind. */
            kind_limit = kind_maxchar_limit(src_kind);
            max_char = 0;
            for (cur = start, i = 0; i < slicelength; cur += step, i++) {
                ch = PyUnicode_READ(src_kind, src_data, cur);
                if (ch > max_char) {
                    max_char = ch;
                    if (max_char >= kind_limit)
                        break;
                }
            }
        }
        else
            max_char = 127;
        result = PyUnicode_New(slicelength, max_char);
        if (result == NULL)
            return NULL;
        dest_kind = PyUnicode_KIND(result);
        dest_data = PyUnicode_DATA(result);

        /* Reuses the outer ch; the original re-declared it here and shadowed
           the variable above. */
        for (cur = start, i = 0; i < slicelength; cur += step, i++) {
            ch = PyUnicode_READ(src_kind, src_data, cur);
            PyUnicode_WRITE(dest_kind, dest_data, i, ch);
        }
        assert(_PyUnicode_CheckConsistency(result, 1));
        return result;
    } else {
        PyErr_SetString(PyExc_TypeError, "string indices must be integers");
        return NULL;
    }
}

static PyMappingMethods unicode_as_mapping = {
    (lenfunc)unicode_length,        /* mp_length */
    (binaryfunc)unicode_subscript,  /* mp_subscript */
    (objobjargproc)0,               /* mp_ass_subscript */
};


/* Helpers for PyUnicode_Format() */

/* Return the next argument to format and advance *p_argidx.

   When arglen is negative, args itself is the (single) argument and is
   returned as-is; otherwise item *p_argidx of the tuple args is returned.
   Neither path takes a new reference.  Once *p_argidx has reached arglen,
   sets TypeError and returns NULL. */
static PyObject *
getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
{
    Py_ssize_t argidx = *p_argidx;
    if (argidx < arglen) {
        (*p_argidx)++;
        if (arglen < 0)
            return args;
        else
            return PyTuple_GetItem(args, argidx);
    }
    PyErr_SetString(PyExc_TypeError,
                    "not enough arguments for format string");
    return NULL;
}

/* Returns a new reference to a PyUnicode object, or NULL on failure.
*/ 13010 13011static PyObject * 13012formatfloat(PyObject *v, int flags, int prec, int type) 13013{ 13014 char *p; 13015 PyObject *result; 13016 double x; 13017 13018 x = PyFloat_AsDouble(v); 13019 if (x == -1.0 && PyErr_Occurred()) 13020 return NULL; 13021 13022 if (prec < 0) 13023 prec = 6; 13024 13025 p = PyOS_double_to_string(x, type, prec, 13026 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL); 13027 if (p == NULL) 13028 return NULL; 13029 result = PyUnicode_DecodeASCII(p, strlen(p), NULL); 13030 PyMem_Free(p); 13031 return result; 13032} 13033 13034static PyObject* 13035formatlong(PyObject *val, int flags, int prec, int type) 13036{ 13037 char *buf; 13038 int len; 13039 PyObject *str; /* temporary string object. */ 13040 PyObject *result; 13041 13042 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len); 13043 if (!str) 13044 return NULL; 13045 result = PyUnicode_DecodeASCII(buf, len, NULL); 13046 Py_DECREF(str); 13047 return result; 13048} 13049 13050static Py_UCS4 13051formatchar(PyObject *v) 13052{ 13053 /* presume that the buffer is at least 3 characters long */ 13054 if (PyUnicode_Check(v)) { 13055 if (PyUnicode_GET_LENGTH(v) == 1) { 13056 return PyUnicode_READ_CHAR(v, 0); 13057 } 13058 goto onError; 13059 } 13060 else { 13061 /* Integer input truncated to a character */ 13062 long x; 13063 x = PyLong_AsLong(v); 13064 if (x == -1 && PyErr_Occurred()) 13065 goto onError; 13066 13067 if (x < 0 || x > MAX_UNICODE) { 13068 PyErr_SetString(PyExc_OverflowError, 13069 "%c arg not in range(0x110000)"); 13070 return (Py_UCS4) -1; 13071 } 13072 13073 return (Py_UCS4) x; 13074 } 13075 13076 onError: 13077 PyErr_SetString(PyExc_TypeError, 13078 "%c requires int or char"); 13079 return (Py_UCS4) -1; 13080} 13081 13082static int 13083repeat_accumulate(_PyAccu *acc, PyObject *obj, Py_ssize_t count) 13084{ 13085 int r; 13086 assert(count > 0); 13087 assert(PyUnicode_Check(obj)); 13088 if (count > 5) { 13089 PyObject *repeated = unicode_repeat(obj, count); 13090 if 
(repeated == NULL) 13091 return -1; 13092 r = _PyAccu_Accumulate(acc, repeated); 13093 Py_DECREF(repeated); 13094 return r; 13095 } 13096 else { 13097 do { 13098 if (_PyAccu_Accumulate(acc, obj)) 13099 return -1; 13100 } while (--count); 13101 return 0; 13102 } 13103} 13104 13105PyObject * 13106PyUnicode_Format(PyObject *format, PyObject *args) 13107{ 13108 void *fmt; 13109 int fmtkind; 13110 PyObject *result; 13111 int kind; 13112 int r; 13113 Py_ssize_t fmtcnt, fmtpos, arglen, argidx; 13114 int args_owned = 0; 13115 PyObject *dict = NULL; 13116 PyObject *temp = NULL; 13117 PyObject *second = NULL; 13118 PyObject *uformat; 13119 _PyAccu acc; 13120 static PyObject *plus, *minus, *blank, *zero, *percent; 13121 13122 if (!plus && !(plus = get_latin1_char('+'))) 13123 return NULL; 13124 if (!minus && !(minus = get_latin1_char('-'))) 13125 return NULL; 13126 if (!blank && !(blank = get_latin1_char(' '))) 13127 return NULL; 13128 if (!zero && !(zero = get_latin1_char('0'))) 13129 return NULL; 13130 if (!percent && !(percent = get_latin1_char('%'))) 13131 return NULL; 13132 13133 if (format == NULL || args == NULL) { 13134 PyErr_BadInternalCall(); 13135 return NULL; 13136 } 13137 uformat = PyUnicode_FromObject(format); 13138 if (uformat == NULL || PyUnicode_READY(uformat) == -1) 13139 return NULL; 13140 if (_PyAccu_Init(&acc)) 13141 goto onError; 13142 fmt = PyUnicode_DATA(uformat); 13143 fmtkind = PyUnicode_KIND(uformat); 13144 fmtcnt = PyUnicode_GET_LENGTH(uformat); 13145 fmtpos = 0; 13146 13147 if (PyTuple_Check(args)) { 13148 arglen = PyTuple_Size(args); 13149 argidx = 0; 13150 } 13151 else { 13152 arglen = -1; 13153 argidx = -2; 13154 } 13155 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) && 13156 !PyUnicode_Check(args)) 13157 dict = args; 13158 13159 while (--fmtcnt >= 0) { 13160 if (PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') { 13161 PyObject *nonfmt; 13162 Py_ssize_t nonfmtpos; 13163 nonfmtpos = fmtpos++; 13164 while (fmtcnt >= 0 && 13165 
PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') { 13166 fmtpos++; 13167 fmtcnt--; 13168 } 13169 nonfmt = PyUnicode_Substring(uformat, nonfmtpos, fmtpos); 13170 if (nonfmt == NULL) 13171 goto onError; 13172 r = _PyAccu_Accumulate(&acc, nonfmt); 13173 Py_DECREF(nonfmt); 13174 if (r) 13175 goto onError; 13176 } 13177 else { 13178 /* Got a format specifier */ 13179 int flags = 0; 13180 Py_ssize_t width = -1; 13181 int prec = -1; 13182 Py_UCS4 c = '\0'; 13183 Py_UCS4 fill, sign; 13184 int isnumok; 13185 PyObject *v = NULL; 13186 void *pbuf = NULL; 13187 Py_ssize_t pindex, len; 13188 PyObject *signobj = NULL, *fillobj = NULL; 13189 13190 fmtpos++; 13191 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(') { 13192 Py_ssize_t keystart; 13193 Py_ssize_t keylen; 13194 PyObject *key; 13195 int pcount = 1; 13196 13197 if (dict == NULL) { 13198 PyErr_SetString(PyExc_TypeError, 13199 "format requires a mapping"); 13200 goto onError; 13201 } 13202 ++fmtpos; 13203 --fmtcnt; 13204 keystart = fmtpos; 13205 /* Skip over balanced parentheses */ 13206 while (pcount > 0 && --fmtcnt >= 0) { 13207 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == ')') 13208 --pcount; 13209 else if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(') 13210 ++pcount; 13211 fmtpos++; 13212 } 13213 keylen = fmtpos - keystart - 1; 13214 if (fmtcnt < 0 || pcount > 0) { 13215 PyErr_SetString(PyExc_ValueError, 13216 "incomplete format key"); 13217 goto onError; 13218 } 13219 key = PyUnicode_Substring(uformat, 13220 keystart, keystart + keylen); 13221 if (key == NULL) 13222 goto onError; 13223 if (args_owned) { 13224 Py_DECREF(args); 13225 args_owned = 0; 13226 } 13227 args = PyObject_GetItem(dict, key); 13228 Py_DECREF(key); 13229 if (args == NULL) { 13230 goto onError; 13231 } 13232 args_owned = 1; 13233 arglen = -1; 13234 argidx = -2; 13235 } 13236 while (--fmtcnt >= 0) { 13237 switch (c = PyUnicode_READ(fmtkind, fmt, fmtpos++)) { 13238 case '-': flags |= F_LJUST; continue; 13239 case '+': flags |= F_SIGN; continue; 13240 case ' 
': flags |= F_BLANK; continue; 13241 case '#': flags |= F_ALT; continue; 13242 case '0': flags |= F_ZERO; continue; 13243 } 13244 break; 13245 } 13246 if (c == '*') { 13247 v = getnextarg(args, arglen, &argidx); 13248 if (v == NULL) 13249 goto onError; 13250 if (!PyLong_Check(v)) { 13251 PyErr_SetString(PyExc_TypeError, 13252 "* wants int"); 13253 goto onError; 13254 } 13255 width = PyLong_AsLong(v); 13256 if (width == -1 && PyErr_Occurred()) 13257 goto onError; 13258 if (width < 0) { 13259 flags |= F_LJUST; 13260 width = -width; 13261 } 13262 if (--fmtcnt >= 0) 13263 c = PyUnicode_READ(fmtkind, fmt, fmtpos++); 13264 } 13265 else if (c >= '0' && c <= '9') { 13266 width = c - '0'; 13267 while (--fmtcnt >= 0) { 13268 c = PyUnicode_READ(fmtkind, fmt, fmtpos++); 13269 if (c < '0' || c > '9') 13270 break; 13271 if ((width*10) / 10 != width) { 13272 PyErr_SetString(PyExc_ValueError, 13273 "width too big"); 13274 goto onError; 13275 } 13276 width = width*10 + (c - '0'); 13277 } 13278 } 13279 if (c == '.') { 13280 prec = 0; 13281 if (--fmtcnt >= 0) 13282 c = PyUnicode_READ(fmtkind, fmt, fmtpos++); 13283 if (c == '*') { 13284 v = getnextarg(args, arglen, &argidx); 13285 if (v == NULL) 13286 goto onError; 13287 if (!PyLong_Check(v)) { 13288 PyErr_SetString(PyExc_TypeError, 13289 "* wants int"); 13290 goto onError; 13291 } 13292 prec = PyLong_AsLong(v); 13293 if (prec == -1 && PyErr_Occurred()) 13294 goto onError; 13295 if (prec < 0) 13296 prec = 0; 13297 if (--fmtcnt >= 0) 13298 c = PyUnicode_READ(fmtkind, fmt, fmtpos++); 13299 } 13300 else if (c >= '0' && c <= '9') { 13301 prec = c - '0'; 13302 while (--fmtcnt >= 0) { 13303 c = PyUnicode_READ(fmtkind, fmt, fmtpos++); 13304 if (c < '0' || c > '9') 13305 break; 13306 if ((prec*10) / 10 != prec) { 13307 PyErr_SetString(PyExc_ValueError, 13308 "prec too big"); 13309 goto onError; 13310 } 13311 prec = prec*10 + (c - '0'); 13312 } 13313 } 13314 } /* prec */ 13315 if (fmtcnt >= 0) { 13316 if (c == 'h' || c == 'l' || c == 'L') { 
13317 if (--fmtcnt >= 0) 13318 c = PyUnicode_READ(fmtkind, fmt, fmtpos++); 13319 } 13320 } 13321 if (fmtcnt < 0) { 13322 PyErr_SetString(PyExc_ValueError, 13323 "incomplete format"); 13324 goto onError; 13325 } 13326 if (c != '%') { 13327 v = getnextarg(args, arglen, &argidx); 13328 if (v == NULL) 13329 goto onError; 13330 } 13331 sign = 0; 13332 fill = ' '; 13333 fillobj = blank; 13334 switch (c) { 13335 13336 case '%': 13337 _PyAccu_Accumulate(&acc, percent); 13338 continue; 13339 13340 case 's': 13341 case 'r': 13342 case 'a': 13343 if (PyUnicode_CheckExact(v) && c == 's') { 13344 temp = v; 13345 Py_INCREF(temp); 13346 } 13347 else { 13348 if (c == 's') 13349 temp = PyObject_Str(v); 13350 else if (c == 'r') 13351 temp = PyObject_Repr(v); 13352 else 13353 temp = PyObject_ASCII(v); 13354 if (temp == NULL) 13355 goto onError; 13356 if (PyUnicode_Check(temp)) 13357 /* nothing to do */; 13358 else { 13359 Py_DECREF(temp); 13360 PyErr_SetString(PyExc_TypeError, 13361 "%s argument has non-string str()"); 13362 goto onError; 13363 } 13364 } 13365 if (PyUnicode_READY(temp) == -1) { 13366 Py_CLEAR(temp); 13367 goto onError; 13368 } 13369 pbuf = PyUnicode_DATA(temp); 13370 kind = PyUnicode_KIND(temp); 13371 len = PyUnicode_GET_LENGTH(temp); 13372 if (prec >= 0 && len > prec) 13373 len = prec; 13374 break; 13375 13376 case 'i': 13377 case 'd': 13378 case 'u': 13379 case 'o': 13380 case 'x': 13381 case 'X': 13382 isnumok = 0; 13383 if (PyNumber_Check(v)) { 13384 PyObject *iobj=NULL; 13385 13386 if (PyLong_Check(v)) { 13387 iobj = v; 13388 Py_INCREF(iobj); 13389 } 13390 else { 13391 iobj = PyNumber_Long(v); 13392 } 13393 if (iobj!=NULL) { 13394 if (PyLong_Check(iobj)) { 13395 isnumok = 1; 13396 temp = formatlong(iobj, flags, prec, (c == 'i'? 
'd': c)); 13397 Py_DECREF(iobj); 13398 if (!temp) 13399 goto onError; 13400 if (PyUnicode_READY(temp) == -1) { 13401 Py_CLEAR(temp); 13402 goto onError; 13403 } 13404 pbuf = PyUnicode_DATA(temp); 13405 kind = PyUnicode_KIND(temp); 13406 len = PyUnicode_GET_LENGTH(temp); 13407 sign = 1; 13408 } 13409 else { 13410 Py_DECREF(iobj); 13411 } 13412 } 13413 } 13414 if (!isnumok) { 13415 PyErr_Format(PyExc_TypeError, 13416 "%%%c format: a number is required, " 13417 "not %.200s", (char)c, Py_TYPE(v)->tp_name); 13418 goto onError; 13419 } 13420 if (flags & F_ZERO) { 13421 fill = '0'; 13422 fillobj = zero; 13423 } 13424 break; 13425 13426 case 'e': 13427 case 'E': 13428 case 'f': 13429 case 'F': 13430 case 'g': 13431 case 'G': 13432 temp = formatfloat(v, flags, prec, c); 13433 if (!temp) 13434 goto onError; 13435 if (PyUnicode_READY(temp) == -1) { 13436 Py_CLEAR(temp); 13437 goto onError; 13438 } 13439 pbuf = PyUnicode_DATA(temp); 13440 kind = PyUnicode_KIND(temp); 13441 len = PyUnicode_GET_LENGTH(temp); 13442 sign = 1; 13443 if (flags & F_ZERO) { 13444 fill = '0'; 13445 fillobj = zero; 13446 } 13447 break; 13448 13449 case 'c': 13450 { 13451 Py_UCS4 ch = formatchar(v); 13452 if (ch == (Py_UCS4) -1) 13453 goto onError; 13454 temp = _PyUnicode_FromUCS4(&ch, 1); 13455 if (temp == NULL) 13456 goto onError; 13457 pbuf = PyUnicode_DATA(temp); 13458 kind = PyUnicode_KIND(temp); 13459 len = PyUnicode_GET_LENGTH(temp); 13460 break; 13461 } 13462 13463 default: 13464 PyErr_Format(PyExc_ValueError, 13465 "unsupported format character '%c' (0x%x) " 13466 "at index %zd", 13467 (31<=c && c<=126) ? (char)c : '?', 13468 (int)c, 13469 fmtpos - 1); 13470 goto onError; 13471 } 13472 /* pbuf is initialized here. 
*/ 13473 pindex = 0; 13474 if (sign) { 13475 if (PyUnicode_READ(kind, pbuf, pindex) == '-') { 13476 signobj = minus; 13477 len--; 13478 pindex++; 13479 } 13480 else if (PyUnicode_READ(kind, pbuf, pindex) == '+') { 13481 signobj = plus; 13482 len--; 13483 pindex++; 13484 } 13485 else if (flags & F_SIGN) 13486 signobj = plus; 13487 else if (flags & F_BLANK) 13488 signobj = blank; 13489 else 13490 sign = 0; 13491 } 13492 if (width < len) 13493 width = len; 13494 if (sign) { 13495 if (fill != ' ') { 13496 assert(signobj != NULL); 13497 if (_PyAccu_Accumulate(&acc, signobj)) 13498 goto onError; 13499 } 13500 if (width > len) 13501 width--; 13502 } 13503 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) { 13504 assert(PyUnicode_READ(kind, pbuf, pindex) == '0'); 13505 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == c); 13506 if (fill != ' ') { 13507 second = get_latin1_char( 13508 PyUnicode_READ(kind, pbuf, pindex + 1)); 13509 pindex += 2; 13510 if (second == NULL || 13511 _PyAccu_Accumulate(&acc, zero) || 13512 _PyAccu_Accumulate(&acc, second)) 13513 goto onError; 13514 Py_CLEAR(second); 13515 } 13516 width -= 2; 13517 if (width < 0) 13518 width = 0; 13519 len -= 2; 13520 } 13521 if (width > len && !(flags & F_LJUST)) { 13522 assert(fillobj != NULL); 13523 if (repeat_accumulate(&acc, fillobj, width - len)) 13524 goto onError; 13525 width = len; 13526 } 13527 if (fill == ' ') { 13528 if (sign) { 13529 assert(signobj != NULL); 13530 if (_PyAccu_Accumulate(&acc, signobj)) 13531 goto onError; 13532 } 13533 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) { 13534 assert(PyUnicode_READ(kind, pbuf, pindex) == '0'); 13535 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c); 13536 second = get_latin1_char( 13537 PyUnicode_READ(kind, pbuf, pindex + 1)); 13538 pindex += 2; 13539 if (second == NULL || 13540 _PyAccu_Accumulate(&acc, zero) || 13541 _PyAccu_Accumulate(&acc, second)) 13542 goto onError; 13543 Py_CLEAR(second); 13544 } 13545 } 13546 /* Copy all 
characters, preserving len */ 13547 if (temp != NULL) { 13548 assert(pbuf == PyUnicode_DATA(temp)); 13549 v = PyUnicode_Substring(temp, pindex, pindex + len); 13550 } 13551 else { 13552 const char *p = (const char *) pbuf; 13553 assert(pbuf != NULL); 13554 p += kind * pindex; 13555 v = PyUnicode_FromKindAndData(kind, p, len); 13556 } 13557 if (v == NULL) 13558 goto onError; 13559 r = _PyAccu_Accumulate(&acc, v); 13560 Py_DECREF(v); 13561 if (r) 13562 goto onError; 13563 if (width > len && repeat_accumulate(&acc, blank, width - len)) 13564 goto onError; 13565 if (dict && (argidx < arglen) && c != '%') { 13566 PyErr_SetString(PyExc_TypeError, 13567 "not all arguments converted during string formatting"); 13568 goto onError; 13569 } 13570 Py_CLEAR(temp); 13571 } /* '%' */ 13572 } /* until end */ 13573 if (argidx < arglen && !dict) { 13574 PyErr_SetString(PyExc_TypeError, 13575 "not all arguments converted during string formatting"); 13576 goto onError; 13577 } 13578 13579 result = _PyAccu_Finish(&acc); 13580 if (args_owned) { 13581 Py_DECREF(args); 13582 } 13583 Py_DECREF(uformat); 13584 Py_XDECREF(temp); 13585 Py_XDECREF(second); 13586 return result; 13587 13588 onError: 13589 Py_DECREF(uformat); 13590 Py_XDECREF(temp); 13591 Py_XDECREF(second); 13592 _PyAccu_Destroy(&acc); 13593 if (args_owned) { 13594 Py_DECREF(args); 13595 } 13596 return NULL; 13597} 13598 13599static PyObject * 13600unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds); 13601 13602static PyObject * 13603unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds) 13604{ 13605 PyObject *x = NULL; 13606 static char *kwlist[] = {"object", "encoding", "errors", 0}; 13607 char *encoding = NULL; 13608 char *errors = NULL; 13609 13610 if (type != &PyUnicode_Type) 13611 return unicode_subtype_new(type, args, kwds); 13612 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str", 13613 kwlist, &x, &encoding, &errors)) 13614 return NULL; 13615 if (x == NULL) 13616 return PyUnicode_New(0, 
0); 13617 if (encoding == NULL && errors == NULL) 13618 return PyObject_Str(x); 13619 else 13620 return PyUnicode_FromEncodedObject(x, encoding, errors); 13621} 13622 13623static PyObject * 13624unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds) 13625{ 13626 PyObject *unicode, *self; 13627 Py_ssize_t length, char_size; 13628 int share_wstr, share_utf8; 13629 unsigned int kind; 13630 void *data; 13631 13632 assert(PyType_IsSubtype(type, &PyUnicode_Type)); 13633 13634 unicode = unicode_new(&PyUnicode_Type, args, kwds); 13635 if (unicode == NULL) 13636 return NULL; 13637 assert(_PyUnicode_CHECK(unicode)); 13638 if (PyUnicode_READY(unicode)) 13639 return NULL; 13640 13641 self = type->tp_alloc(type, 0); 13642 if (self == NULL) { 13643 Py_DECREF(unicode); 13644 return NULL; 13645 } 13646 kind = PyUnicode_KIND(unicode); 13647 length = PyUnicode_GET_LENGTH(unicode); 13648 13649 _PyUnicode_LENGTH(self) = length; 13650#ifdef Py_DEBUG 13651 _PyUnicode_HASH(self) = -1; 13652#else 13653 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode); 13654#endif 13655 _PyUnicode_STATE(self).interned = 0; 13656 _PyUnicode_STATE(self).kind = kind; 13657 _PyUnicode_STATE(self).compact = 0; 13658 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii; 13659 _PyUnicode_STATE(self).ready = 1; 13660 _PyUnicode_WSTR(self) = NULL; 13661 _PyUnicode_UTF8_LENGTH(self) = 0; 13662 _PyUnicode_UTF8(self) = NULL; 13663 _PyUnicode_WSTR_LENGTH(self) = 0; 13664 _PyUnicode_DATA_ANY(self) = NULL; 13665 13666 share_utf8 = 0; 13667 share_wstr = 0; 13668 if (kind == PyUnicode_1BYTE_KIND) { 13669 char_size = 1; 13670 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128) 13671 share_utf8 = 1; 13672 } 13673 else if (kind == PyUnicode_2BYTE_KIND) { 13674 char_size = 2; 13675 if (sizeof(wchar_t) == 2) 13676 share_wstr = 1; 13677 } 13678 else { 13679 assert(kind == PyUnicode_4BYTE_KIND); 13680 char_size = 4; 13681 if (sizeof(wchar_t) == 4) 13682 share_wstr = 1; 13683 } 13684 13685 /* Ensure we won't 
overflow the length. */ 13686 if (length > (PY_SSIZE_T_MAX / char_size - 1)) { 13687 PyErr_NoMemory(); 13688 goto onError; 13689 } 13690 data = PyObject_MALLOC((length + 1) * char_size); 13691 if (data == NULL) { 13692 PyErr_NoMemory(); 13693 goto onError; 13694 } 13695 13696 _PyUnicode_DATA_ANY(self) = data; 13697 if (share_utf8) { 13698 _PyUnicode_UTF8_LENGTH(self) = length; 13699 _PyUnicode_UTF8(self) = data; 13700 } 13701 if (share_wstr) { 13702 _PyUnicode_WSTR_LENGTH(self) = length; 13703 _PyUnicode_WSTR(self) = (wchar_t *)data; 13704 } 13705 13706 Py_MEMCPY(data, PyUnicode_DATA(unicode), 13707 kind * (length + 1)); 13708 assert(_PyUnicode_CheckConsistency(self, 1)); 13709#ifdef Py_DEBUG 13710 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode); 13711#endif 13712 Py_DECREF(unicode); 13713 return self; 13714 13715onError: 13716 Py_DECREF(unicode); 13717 Py_DECREF(self); 13718 return NULL; 13719} 13720 13721PyDoc_STRVAR(unicode_doc, 13722 "str(string[, encoding[, errors]]) -> str\n\ 13723\n\ 13724Create a new string object from the given encoded string.\n\ 13725encoding defaults to the current default string encoding.\n\ 13726errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'."); 13727 13728static PyObject *unicode_iter(PyObject *seq); 13729 13730PyTypeObject PyUnicode_Type = { 13731 PyVarObject_HEAD_INIT(&PyType_Type, 0) 13732 "str", /* tp_name */ 13733 sizeof(PyUnicodeObject), /* tp_size */ 13734 0, /* tp_itemsize */ 13735 /* Slots */ 13736 (destructor)unicode_dealloc, /* tp_dealloc */ 13737 0, /* tp_print */ 13738 0, /* tp_getattr */ 13739 0, /* tp_setattr */ 13740 0, /* tp_reserved */ 13741 unicode_repr, /* tp_repr */ 13742 &unicode_as_number, /* tp_as_number */ 13743 &unicode_as_sequence, /* tp_as_sequence */ 13744 &unicode_as_mapping, /* tp_as_mapping */ 13745 (hashfunc) unicode_hash, /* tp_hash*/ 13746 0, /* tp_call*/ 13747 (reprfunc) unicode_str, /* tp_str */ 13748 PyObject_GenericGetAttr, /* tp_getattro */ 13749 0, /* tp_setattro */ 
13750 0, /* tp_as_buffer */ 13751 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE | 13752 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */ 13753 unicode_doc, /* tp_doc */ 13754 0, /* tp_traverse */ 13755 0, /* tp_clear */ 13756 PyUnicode_RichCompare, /* tp_richcompare */ 13757 0, /* tp_weaklistoffset */ 13758 unicode_iter, /* tp_iter */ 13759 0, /* tp_iternext */ 13760 unicode_methods, /* tp_methods */ 13761 0, /* tp_members */ 13762 0, /* tp_getset */ 13763 &PyBaseObject_Type, /* tp_base */ 13764 0, /* tp_dict */ 13765 0, /* tp_descr_get */ 13766 0, /* tp_descr_set */ 13767 0, /* tp_dictoffset */ 13768 0, /* tp_init */ 13769 0, /* tp_alloc */ 13770 unicode_new, /* tp_new */ 13771 PyObject_Del, /* tp_free */ 13772}; 13773 13774/* Initialize the Unicode implementation */ 13775 13776int _PyUnicode_Init(void) 13777{ 13778 int i; 13779 13780 /* XXX - move this array to unicodectype.c ? */ 13781 Py_UCS2 linebreak[] = { 13782 0x000A, /* LINE FEED */ 13783 0x000D, /* CARRIAGE RETURN */ 13784 0x001C, /* FILE SEPARATOR */ 13785 0x001D, /* GROUP SEPARATOR */ 13786 0x001E, /* RECORD SEPARATOR */ 13787 0x0085, /* NEXT LINE */ 13788 0x2028, /* LINE SEPARATOR */ 13789 0x2029, /* PARAGRAPH SEPARATOR */ 13790 }; 13791 13792 /* Init the implementation */ 13793 unicode_empty = PyUnicode_New(0, 0); 13794 if (!unicode_empty) 13795 Py_FatalError("Can't create empty string"); 13796 assert(_PyUnicode_CheckConsistency(unicode_empty, 1)); 13797 13798 for (i = 0; i < 256; i++) 13799 unicode_latin1[i] = NULL; 13800 if (PyType_Ready(&PyUnicode_Type) < 0) 13801 Py_FatalError("Can't initialize 'unicode'"); 13802 13803 /* initialize the linebreak bloom filter */ 13804 bloom_linebreak = make_bloom_mask( 13805 PyUnicode_2BYTE_KIND, linebreak, 13806 Py_ARRAY_LENGTH(linebreak)); 13807 13808 PyType_Ready(&EncodingMapType); 13809 13810#ifdef HAVE_MBCS 13811 winver.dwOSVersionInfoSize = sizeof(winver); 13812 if (!GetVersionEx((OSVERSIONINFO*)&winver)) { 13813 PyErr_SetFromWindowsErr(0); 13814 return -1; 13815 } 
13816#endif 13817 return 0; 13818} 13819 13820/* Finalize the Unicode implementation */ 13821 13822int 13823PyUnicode_ClearFreeList(void) 13824{ 13825 return 0; 13826} 13827 13828void 13829_PyUnicode_Fini(void) 13830{ 13831 int i; 13832 13833 Py_XDECREF(unicode_empty); 13834 unicode_empty = NULL; 13835 13836 for (i = 0; i < 256; i++) { 13837 if (unicode_latin1[i]) { 13838 Py_DECREF(unicode_latin1[i]); 13839 unicode_latin1[i] = NULL; 13840 } 13841 } 13842 _PyUnicode_ClearStaticStrings(); 13843 (void)PyUnicode_ClearFreeList(); 13844} 13845 13846void 13847PyUnicode_InternInPlace(PyObject **p) 13848{ 13849 register PyObject *s = *p; 13850 PyObject *t; 13851#ifdef Py_DEBUG 13852 assert(s != NULL); 13853 assert(_PyUnicode_CHECK(s)); 13854#else 13855 if (s == NULL || !PyUnicode_Check(s)) 13856 return; 13857#endif 13858 /* If it's a subclass, we don't really know what putting 13859 it in the interned dict might do. */ 13860 if (!PyUnicode_CheckExact(s)) 13861 return; 13862 if (PyUnicode_CHECK_INTERNED(s)) 13863 return; 13864 if (interned == NULL) { 13865 interned = PyDict_New(); 13866 if (interned == NULL) { 13867 PyErr_Clear(); /* Don't leave an exception */ 13868 return; 13869 } 13870 } 13871 /* It might be that the GetItem call fails even 13872 though the key is present in the dictionary, 13873 namely when this happens during a stack overflow. */ 13874 Py_ALLOW_RECURSION 13875 t = PyDict_GetItem(interned, s); 13876 Py_END_ALLOW_RECURSION 13877 13878 if (t) { 13879 Py_INCREF(t); 13880 Py_DECREF(*p); 13881 *p = t; 13882 return; 13883 } 13884 13885 PyThreadState_GET()->recursion_critical = 1; 13886 if (PyDict_SetItem(interned, s, s) < 0) { 13887 PyErr_Clear(); 13888 PyThreadState_GET()->recursion_critical = 0; 13889 return; 13890 } 13891 PyThreadState_GET()->recursion_critical = 0; 13892 /* The two references in interned are not counted by refcnt. 
13893 The deallocator will take care of this */ 13894 Py_REFCNT(s) -= 2; 13895 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL; 13896} 13897 13898void 13899PyUnicode_InternImmortal(PyObject **p) 13900{ 13901 PyUnicode_InternInPlace(p); 13902 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) { 13903 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL; 13904 Py_INCREF(*p); 13905 } 13906} 13907 13908PyObject * 13909PyUnicode_InternFromString(const char *cp) 13910{ 13911 PyObject *s = PyUnicode_FromString(cp); 13912 if (s == NULL) 13913 return NULL; 13914 PyUnicode_InternInPlace(&s); 13915 return s; 13916} 13917 13918void 13919_Py_ReleaseInternedUnicodeStrings(void) 13920{ 13921 PyObject *keys; 13922 PyObject *s; 13923 Py_ssize_t i, n; 13924 Py_ssize_t immortal_size = 0, mortal_size = 0; 13925 13926 if (interned == NULL || !PyDict_Check(interned)) 13927 return; 13928 keys = PyDict_Keys(interned); 13929 if (keys == NULL || !PyList_Check(keys)) { 13930 PyErr_Clear(); 13931 return; 13932 } 13933 13934 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak 13935 detector, interned unicode strings are not forcibly deallocated; 13936 rather, we give them their stolen references back, and then clear 13937 and DECREF the interned dict. 
*/ 13938 13939 n = PyList_GET_SIZE(keys); 13940 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n", 13941 n); 13942 for (i = 0; i < n; i++) { 13943 s = PyList_GET_ITEM(keys, i); 13944 if (PyUnicode_READY(s) == -1) { 13945 assert(0 && "could not ready string"); 13946 fprintf(stderr, "could not ready string\n"); 13947 } 13948 switch (PyUnicode_CHECK_INTERNED(s)) { 13949 case SSTATE_NOT_INTERNED: 13950 /* XXX Shouldn't happen */ 13951 break; 13952 case SSTATE_INTERNED_IMMORTAL: 13953 Py_REFCNT(s) += 1; 13954 immortal_size += PyUnicode_GET_LENGTH(s); 13955 break; 13956 case SSTATE_INTERNED_MORTAL: 13957 Py_REFCNT(s) += 2; 13958 mortal_size += PyUnicode_GET_LENGTH(s); 13959 break; 13960 default: 13961 Py_FatalError("Inconsistent interned string state."); 13962 } 13963 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED; 13964 } 13965 fprintf(stderr, "total size of all interned strings: " 13966 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d " 13967 "mortal/immortal\n", mortal_size, immortal_size); 13968 Py_DECREF(keys); 13969 PyDict_Clear(interned); 13970 Py_DECREF(interned); 13971 interned = NULL; 13972} 13973 13974 13975/********************* Unicode Iterator **************************/ 13976 13977typedef struct { 13978 PyObject_HEAD 13979 Py_ssize_t it_index; 13980 PyObject *it_seq; /* Set to NULL when iterator is exhausted */ 13981} unicodeiterobject; 13982 13983static void 13984unicodeiter_dealloc(unicodeiterobject *it) 13985{ 13986 _PyObject_GC_UNTRACK(it); 13987 Py_XDECREF(it->it_seq); 13988 PyObject_GC_Del(it); 13989} 13990 13991static int 13992unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg) 13993{ 13994 Py_VISIT(it->it_seq); 13995 return 0; 13996} 13997 13998static PyObject * 13999unicodeiter_next(unicodeiterobject *it) 14000{ 14001 PyObject *seq, *item; 14002 14003 assert(it != NULL); 14004 seq = it->it_seq; 14005 if (seq == NULL) 14006 return NULL; 14007 assert(_PyUnicode_CHECK(seq)); 14008 14009 if (it->it_index < 
PyUnicode_GET_LENGTH(seq)) { 14010 int kind = PyUnicode_KIND(seq); 14011 void *data = PyUnicode_DATA(seq); 14012 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index); 14013 item = PyUnicode_FromOrdinal(chr); 14014 if (item != NULL) 14015 ++it->it_index; 14016 return item; 14017 } 14018 14019 Py_DECREF(seq); 14020 it->it_seq = NULL; 14021 return NULL; 14022} 14023 14024static PyObject * 14025unicodeiter_len(unicodeiterobject *it) 14026{ 14027 Py_ssize_t len = 0; 14028 if (it->it_seq) 14029 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index; 14030 return PyLong_FromSsize_t(len); 14031} 14032 14033PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it))."); 14034 14035static PyMethodDef unicodeiter_methods[] = { 14036 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS, 14037 length_hint_doc}, 14038 {NULL, NULL} /* sentinel */ 14039}; 14040 14041PyTypeObject PyUnicodeIter_Type = { 14042 PyVarObject_HEAD_INIT(&PyType_Type, 0) 14043 "str_iterator", /* tp_name */ 14044 sizeof(unicodeiterobject), /* tp_basicsize */ 14045 0, /* tp_itemsize */ 14046 /* methods */ 14047 (destructor)unicodeiter_dealloc, /* tp_dealloc */ 14048 0, /* tp_print */ 14049 0, /* tp_getattr */ 14050 0, /* tp_setattr */ 14051 0, /* tp_reserved */ 14052 0, /* tp_repr */ 14053 0, /* tp_as_number */ 14054 0, /* tp_as_sequence */ 14055 0, /* tp_as_mapping */ 14056 0, /* tp_hash */ 14057 0, /* tp_call */ 14058 0, /* tp_str */ 14059 PyObject_GenericGetAttr, /* tp_getattro */ 14060 0, /* tp_setattro */ 14061 0, /* tp_as_buffer */ 14062 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */ 14063 0, /* tp_doc */ 14064 (traverseproc)unicodeiter_traverse, /* tp_traverse */ 14065 0, /* tp_clear */ 14066 0, /* tp_richcompare */ 14067 0, /* tp_weaklistoffset */ 14068 PyObject_SelfIter, /* tp_iter */ 14069 (iternextfunc)unicodeiter_next, /* tp_iternext */ 14070 unicodeiter_methods, /* tp_methods */ 14071 0, 14072}; 14073 14074static PyObject * 14075unicode_iter(PyObject 
*seq) 14076{ 14077 unicodeiterobject *it; 14078 14079 if (!PyUnicode_Check(seq)) { 14080 PyErr_BadInternalCall(); 14081 return NULL; 14082 } 14083 if (PyUnicode_READY(seq) == -1) 14084 return NULL; 14085 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type); 14086 if (it == NULL) 14087 return NULL; 14088 it->it_index = 0; 14089 Py_INCREF(seq); 14090 it->it_seq = seq; 14091 _PyObject_GC_TRACK(it); 14092 return (PyObject *)it; 14093} 14094 14095 14096size_t 14097Py_UNICODE_strlen(const Py_UNICODE *u) 14098{ 14099 int res = 0; 14100 while(*u++) 14101 res++; 14102 return res; 14103} 14104 14105Py_UNICODE* 14106Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2) 14107{ 14108 Py_UNICODE *u = s1; 14109 while ((*u++ = *s2++)); 14110 return s1; 14111} 14112 14113Py_UNICODE* 14114Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n) 14115{ 14116 Py_UNICODE *u = s1; 14117 while ((*u++ = *s2++)) 14118 if (n-- == 0) 14119 break; 14120 return s1; 14121} 14122 14123Py_UNICODE* 14124Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2) 14125{ 14126 Py_UNICODE *u1 = s1; 14127 u1 += Py_UNICODE_strlen(u1); 14128 Py_UNICODE_strcpy(u1, s2); 14129 return s1; 14130} 14131 14132int 14133Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2) 14134{ 14135 while (*s1 && *s2 && *s1 == *s2) 14136 s1++, s2++; 14137 if (*s1 && *s2) 14138 return (*s1 < *s2) ? -1 : +1; 14139 if (*s1) 14140 return 1; 14141 if (*s2) 14142 return -1; 14143 return 0; 14144} 14145 14146int 14147Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n) 14148{ 14149 register Py_UNICODE u1, u2; 14150 for (; n != 0; n--) { 14151 u1 = *s1; 14152 u2 = *s2; 14153 if (u1 != u2) 14154 return (u1 < u2) ? 
-1 : +1; 14155 if (u1 == '\0') 14156 return 0; 14157 s1++; 14158 s2++; 14159 } 14160 return 0; 14161} 14162 14163Py_UNICODE* 14164Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c) 14165{ 14166 const Py_UNICODE *p; 14167 for (p = s; *p; p++) 14168 if (*p == c) 14169 return (Py_UNICODE*)p; 14170 return NULL; 14171} 14172 14173Py_UNICODE* 14174Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c) 14175{ 14176 const Py_UNICODE *p; 14177 p = s + Py_UNICODE_strlen(s); 14178 while (p != s) { 14179 p--; 14180 if (*p == c) 14181 return (Py_UNICODE*)p; 14182 } 14183 return NULL; 14184} 14185 14186Py_UNICODE* 14187PyUnicode_AsUnicodeCopy(PyObject *unicode) 14188{ 14189 Py_UNICODE *u, *copy; 14190 Py_ssize_t len, size; 14191 14192 if (!PyUnicode_Check(unicode)) { 14193 PyErr_BadArgument(); 14194 return NULL; 14195 } 14196 u = PyUnicode_AsUnicodeAndSize(unicode, &len); 14197 if (u == NULL) 14198 return NULL; 14199 /* Ensure we won't overflow the size. */ 14200 if (len > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) { 14201 PyErr_NoMemory(); 14202 return NULL; 14203 } 14204 size = len + 1; /* copy the null character */ 14205 size *= sizeof(Py_UNICODE); 14206 copy = PyMem_Malloc(size); 14207 if (copy == NULL) { 14208 PyErr_NoMemory(); 14209 return NULL; 14210 } 14211 memcpy(copy, u, size); 14212 return copy; 14213} 14214 14215/* A _string module, to export formatter_parser and formatter_field_name_split 14216 to the string.Formatter class implemented in Python. 
*/ 14217 14218static PyMethodDef _string_methods[] = { 14219 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split, 14220 METH_O, PyDoc_STR("split the argument as a field name")}, 14221 {"formatter_parser", (PyCFunction) formatter_parser, 14222 METH_O, PyDoc_STR("parse the argument as a format string")}, 14223 {NULL, NULL} 14224}; 14225 14226static struct PyModuleDef _string_module = { 14227 PyModuleDef_HEAD_INIT, 14228 "_string", 14229 PyDoc_STR("string helper module"), 14230 0, 14231 _string_methods, 14232 NULL, 14233 NULL, 14234 NULL, 14235 NULL 14236}; 14237 14238PyMODINIT_FUNC 14239PyInit__string(void) 14240{ 14241 return PyModule_Create(&_string_module); 14242} 14243 14244 14245#ifdef __cplusplus 14246} 14247#endif 14248