unicodeobject.c revision 1929407406966f9f2093a9e6b421cad39361dbb4
1/* 2 3Unicode implementation based on original code by Fredrik Lundh, 4modified by Marc-Andre Lemburg <mal@lemburg.com>. 5 6Major speed upgrades to the method implementations at the Reykjavik 7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke. 8 9Copyright (c) Corporation for National Research Initiatives. 10 11-------------------------------------------------------------------- 12The original string type implementation is: 13 14 Copyright (c) 1999 by Secret Labs AB 15 Copyright (c) 1999 by Fredrik Lundh 16 17By obtaining, using, and/or copying this software and/or its 18associated documentation, you agree that you have read, understood, 19and will comply with the following terms and conditions: 20 21Permission to use, copy, modify, and distribute this software and its 22associated documentation for any purpose and without fee is hereby 23granted, provided that the above copyright notice appears in all 24copies, and that both that copyright notice and this permission notice 25appear in supporting documentation, and that the name of Secret Labs 26AB or the author not be used in advertising or publicity pertaining to 27distribution of the software without specific, written prior 28permission. 29 30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO 31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND 32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR 33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT 36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
37-------------------------------------------------------------------- 38 39*/ 40 41#define PY_SSIZE_T_CLEAN 42#include "Python.h" 43#include "ucnhash.h" 44#include "bytes_methods.h" 45 46#ifdef MS_WINDOWS 47#include <windows.h> 48#endif 49 50/* Endianness switches; defaults to little endian */ 51 52#ifdef WORDS_BIGENDIAN 53# define BYTEORDER_IS_BIG_ENDIAN 54#else 55# define BYTEORDER_IS_LITTLE_ENDIAN 56#endif 57 58/* --- Globals ------------------------------------------------------------ 59 60 The globals are initialized by the _PyUnicode_Init() API and should 61 not be used before calling that API. 62 63*/ 64 65 66#ifdef __cplusplus 67extern "C" { 68#endif 69 70/* Maximum code point of Unicode 6.0: 0x10ffff (1,114,111) */ 71#define MAX_UNICODE 0x10ffff 72 73#ifdef Py_DEBUG 74# define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0) 75#else 76# define _PyUnicode_CHECK(op) PyUnicode_Check(op) 77#endif 78 79#define _PyUnicode_UTF8(op) \ 80 (((PyCompactUnicodeObject*)(op))->utf8) 81#define PyUnicode_UTF8(op) \ 82 (assert(_PyUnicode_CHECK(op)), \ 83 assert(PyUnicode_IS_READY(op)), \ 84 PyUnicode_IS_COMPACT_ASCII(op) ? \ 85 ((char*)((PyASCIIObject*)(op) + 1)) : \ 86 _PyUnicode_UTF8(op)) 87#define _PyUnicode_UTF8_LENGTH(op) \ 88 (((PyCompactUnicodeObject*)(op))->utf8_length) 89#define PyUnicode_UTF8_LENGTH(op) \ 90 (assert(_PyUnicode_CHECK(op)), \ 91 assert(PyUnicode_IS_READY(op)), \ 92 PyUnicode_IS_COMPACT_ASCII(op) ? 
\ 93 ((PyASCIIObject*)(op))->length : \ 94 _PyUnicode_UTF8_LENGTH(op)) 95#define _PyUnicode_WSTR(op) \ 96 (((PyASCIIObject*)(op))->wstr) 97#define _PyUnicode_WSTR_LENGTH(op) \ 98 (((PyCompactUnicodeObject*)(op))->wstr_length) 99#define _PyUnicode_LENGTH(op) \ 100 (((PyASCIIObject *)(op))->length) 101#define _PyUnicode_STATE(op) \ 102 (((PyASCIIObject *)(op))->state) 103#define _PyUnicode_HASH(op) \ 104 (((PyASCIIObject *)(op))->hash) 105#define _PyUnicode_KIND(op) \ 106 (assert(_PyUnicode_CHECK(op)), \ 107 ((PyASCIIObject *)(op))->state.kind) 108#define _PyUnicode_GET_LENGTH(op) \ 109 (assert(_PyUnicode_CHECK(op)), \ 110 ((PyASCIIObject *)(op))->length) 111#define _PyUnicode_DATA_ANY(op) \ 112 (((PyUnicodeObject*)(op))->data.any) 113 114/* Optimized version of Py_MAX() to compute the maximum character: 115 use it when your are computing the second argument of PyUnicode_New() */ 116#define MAX_MAXCHAR(maxchar1, maxchar2) \ 117 ((maxchar1) | (maxchar2)) 118 119#undef PyUnicode_READY 120#define PyUnicode_READY(op) \ 121 (assert(_PyUnicode_CHECK(op)), \ 122 (PyUnicode_IS_READY(op) ? 
\ 123 0 : \ 124 _PyUnicode_Ready(op))) 125 126#define _PyUnicode_SHARE_UTF8(op) \ 127 (assert(_PyUnicode_CHECK(op)), \ 128 assert(!PyUnicode_IS_COMPACT_ASCII(op)), \ 129 (_PyUnicode_UTF8(op) == PyUnicode_DATA(op))) 130#define _PyUnicode_SHARE_WSTR(op) \ 131 (assert(_PyUnicode_CHECK(op)), \ 132 (_PyUnicode_WSTR(unicode) == PyUnicode_DATA(op))) 133 134/* true if the Unicode object has an allocated UTF-8 memory block 135 (not shared with other data) */ 136#define _PyUnicode_HAS_UTF8_MEMORY(op) \ 137 (assert(_PyUnicode_CHECK(op)), \ 138 (!PyUnicode_IS_COMPACT_ASCII(op) \ 139 && _PyUnicode_UTF8(op) \ 140 && _PyUnicode_UTF8(op) != PyUnicode_DATA(op))) 141 142/* true if the Unicode object has an allocated wstr memory block 143 (not shared with other data) */ 144#define _PyUnicode_HAS_WSTR_MEMORY(op) \ 145 (assert(_PyUnicode_CHECK(op)), \ 146 (_PyUnicode_WSTR(op) && \ 147 (!PyUnicode_IS_READY(op) || \ 148 _PyUnicode_WSTR(op) != PyUnicode_DATA(op)))) 149 150/* Generic helper macro to convert characters of different types. 151 from_type and to_type have to be valid type names, begin and end 152 are pointers to the source characters which should be of type 153 "from_type *". to is a pointer of type "to_type *" and points to the 154 buffer where the result characters are written to. */ 155#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \ 156 do { \ 157 to_type *_to = (to_type *) to; \ 158 const from_type *_iter = (begin); \ 159 const from_type *_end = (end); \ 160 Py_ssize_t n = (_end) - (_iter); \ 161 const from_type *_unrolled_end = \ 162 _iter + _Py_SIZE_ROUND_DOWN(n, 4); \ 163 while (_iter < (_unrolled_end)) { \ 164 _to[0] = (to_type) _iter[0]; \ 165 _to[1] = (to_type) _iter[1]; \ 166 _to[2] = (to_type) _iter[2]; \ 167 _to[3] = (to_type) _iter[3]; \ 168 _iter += 4; _to += 4; \ 169 } \ 170 while (_iter < (_end)) \ 171 *_to++ = (to_type) *_iter++; \ 172 } while (0) 173 174/* This dictionary holds all interned unicode strings. 
Note that references 175 to strings in this dictionary are *not* counted in the string's ob_refcnt. 176 When the interned string reaches a refcnt of 0 the string deallocation 177 function will delete the reference from this dictionary. 178 179 Another way to look at this is that to say that the actual reference 180 count of a string is: s->ob_refcnt + (s->state ? 2 : 0) 181*/ 182static PyObject *interned; 183 184/* The empty Unicode object is shared to improve performance. */ 185static PyObject *unicode_empty; 186 187/* List of static strings. */ 188static _Py_Identifier *static_strings; 189 190/* Single character Unicode strings in the Latin-1 range are being 191 shared as well. */ 192static PyObject *unicode_latin1[256]; 193 194/* Fast detection of the most frequent whitespace characters */ 195const unsigned char _Py_ascii_whitespace[] = { 196 0, 0, 0, 0, 0, 0, 0, 0, 197/* case 0x0009: * CHARACTER TABULATION */ 198/* case 0x000A: * LINE FEED */ 199/* case 0x000B: * LINE TABULATION */ 200/* case 0x000C: * FORM FEED */ 201/* case 0x000D: * CARRIAGE RETURN */ 202 0, 1, 1, 1, 1, 1, 0, 0, 203 0, 0, 0, 0, 0, 0, 0, 0, 204/* case 0x001C: * FILE SEPARATOR */ 205/* case 0x001D: * GROUP SEPARATOR */ 206/* case 0x001E: * RECORD SEPARATOR */ 207/* case 0x001F: * UNIT SEPARATOR */ 208 0, 0, 0, 0, 1, 1, 1, 1, 209/* case 0x0020: * SPACE */ 210 1, 0, 0, 0, 0, 0, 0, 0, 211 0, 0, 0, 0, 0, 0, 0, 0, 212 0, 0, 0, 0, 0, 0, 0, 0, 213 0, 0, 0, 0, 0, 0, 0, 0, 214 215 0, 0, 0, 0, 0, 0, 0, 0, 216 0, 0, 0, 0, 0, 0, 0, 0, 217 0, 0, 0, 0, 0, 0, 0, 0, 218 0, 0, 0, 0, 0, 0, 0, 0, 219 0, 0, 0, 0, 0, 0, 0, 0, 220 0, 0, 0, 0, 0, 0, 0, 0, 221 0, 0, 0, 0, 0, 0, 0, 0, 222 0, 0, 0, 0, 0, 0, 0, 0 223}; 224 225/* forward */ 226static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length); 227static PyObject* get_latin1_char(unsigned char ch); 228static int unicode_modifiable(PyObject *unicode); 229 230 231static PyObject * 232_PyUnicode_FromUCS1(const unsigned char *s, Py_ssize_t size); 233static PyObject * 
234_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size); 235static PyObject * 236_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size); 237 238static PyObject * 239unicode_encode_call_errorhandler(const char *errors, 240 PyObject **errorHandler,const char *encoding, const char *reason, 241 PyObject *unicode, PyObject **exceptionObject, 242 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos); 243 244static void 245raise_encode_exception(PyObject **exceptionObject, 246 const char *encoding, 247 PyObject *unicode, 248 Py_ssize_t startpos, Py_ssize_t endpos, 249 const char *reason); 250 251/* Same for linebreaks */ 252static unsigned char ascii_linebreak[] = { 253 0, 0, 0, 0, 0, 0, 0, 0, 254/* 0x000A, * LINE FEED */ 255/* 0x000B, * LINE TABULATION */ 256/* 0x000C, * FORM FEED */ 257/* 0x000D, * CARRIAGE RETURN */ 258 0, 0, 1, 1, 1, 1, 0, 0, 259 0, 0, 0, 0, 0, 0, 0, 0, 260/* 0x001C, * FILE SEPARATOR */ 261/* 0x001D, * GROUP SEPARATOR */ 262/* 0x001E, * RECORD SEPARATOR */ 263 0, 0, 0, 0, 1, 1, 1, 0, 264 0, 0, 0, 0, 0, 0, 0, 0, 265 0, 0, 0, 0, 0, 0, 0, 0, 266 0, 0, 0, 0, 0, 0, 0, 0, 267 0, 0, 0, 0, 0, 0, 0, 0, 268 269 0, 0, 0, 0, 0, 0, 0, 0, 270 0, 0, 0, 0, 0, 0, 0, 0, 271 0, 0, 0, 0, 0, 0, 0, 0, 272 0, 0, 0, 0, 0, 0, 0, 0, 273 0, 0, 0, 0, 0, 0, 0, 0, 274 0, 0, 0, 0, 0, 0, 0, 0, 275 0, 0, 0, 0, 0, 0, 0, 0, 276 0, 0, 0, 0, 0, 0, 0, 0 277}; 278 279/* The max unicode value is always 0x10FFFF while using the PEP-393 API. 280 This function is kept for backward compatibility with the old API. */ 281Py_UNICODE 282PyUnicode_GetMax(void) 283{ 284#ifdef Py_UNICODE_WIDE 285 return 0x10FFFF; 286#else 287 /* This is actually an illegal character, so it should 288 not be passed to unichr. 
*/ 289 return 0xFFFF; 290#endif 291} 292 293#ifdef Py_DEBUG 294int 295_PyUnicode_CheckConsistency(PyObject *op, int check_content) 296{ 297 PyASCIIObject *ascii; 298 unsigned int kind; 299 300 assert(PyUnicode_Check(op)); 301 302 ascii = (PyASCIIObject *)op; 303 kind = ascii->state.kind; 304 305 if (ascii->state.ascii == 1 && ascii->state.compact == 1) { 306 assert(kind == PyUnicode_1BYTE_KIND); 307 assert(ascii->state.ready == 1); 308 } 309 else { 310 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op; 311 void *data; 312 313 if (ascii->state.compact == 1) { 314 data = compact + 1; 315 assert(kind == PyUnicode_1BYTE_KIND 316 || kind == PyUnicode_2BYTE_KIND 317 || kind == PyUnicode_4BYTE_KIND); 318 assert(ascii->state.ascii == 0); 319 assert(ascii->state.ready == 1); 320 assert (compact->utf8 != data); 321 } 322 else { 323 PyUnicodeObject *unicode = (PyUnicodeObject *)op; 324 325 data = unicode->data.any; 326 if (kind == PyUnicode_WCHAR_KIND) { 327 assert(ascii->length == 0); 328 assert(ascii->hash == -1); 329 assert(ascii->state.compact == 0); 330 assert(ascii->state.ascii == 0); 331 assert(ascii->state.ready == 0); 332 assert(ascii->state.interned == SSTATE_NOT_INTERNED); 333 assert(ascii->wstr != NULL); 334 assert(data == NULL); 335 assert(compact->utf8 == NULL); 336 } 337 else { 338 assert(kind == PyUnicode_1BYTE_KIND 339 || kind == PyUnicode_2BYTE_KIND 340 || kind == PyUnicode_4BYTE_KIND); 341 assert(ascii->state.compact == 0); 342 assert(ascii->state.ready == 1); 343 assert(data != NULL); 344 if (ascii->state.ascii) { 345 assert (compact->utf8 == data); 346 assert (compact->utf8_length == ascii->length); 347 } 348 else 349 assert (compact->utf8 != data); 350 } 351 } 352 if (kind != PyUnicode_WCHAR_KIND) { 353 if ( 354#if SIZEOF_WCHAR_T == 2 355 kind == PyUnicode_2BYTE_KIND 356#else 357 kind == PyUnicode_4BYTE_KIND 358#endif 359 ) 360 { 361 assert(ascii->wstr == data); 362 assert(compact->wstr_length == ascii->length); 363 } else 364 
assert(ascii->wstr != data); 365 } 366 367 if (compact->utf8 == NULL) 368 assert(compact->utf8_length == 0); 369 if (ascii->wstr == NULL) 370 assert(compact->wstr_length == 0); 371 } 372 /* check that the best kind is used */ 373 if (check_content && kind != PyUnicode_WCHAR_KIND) 374 { 375 Py_ssize_t i; 376 Py_UCS4 maxchar = 0; 377 void *data; 378 Py_UCS4 ch; 379 380 data = PyUnicode_DATA(ascii); 381 for (i=0; i < ascii->length; i++) 382 { 383 ch = PyUnicode_READ(kind, data, i); 384 if (ch > maxchar) 385 maxchar = ch; 386 } 387 if (kind == PyUnicode_1BYTE_KIND) { 388 if (ascii->state.ascii == 0) { 389 assert(maxchar >= 128); 390 assert(maxchar <= 255); 391 } 392 else 393 assert(maxchar < 128); 394 } 395 else if (kind == PyUnicode_2BYTE_KIND) { 396 assert(maxchar >= 0x100); 397 assert(maxchar <= 0xFFFF); 398 } 399 else { 400 assert(maxchar >= 0x10000); 401 assert(maxchar <= MAX_UNICODE); 402 } 403 assert(PyUnicode_READ(kind, data, ascii->length) == 0); 404 } 405 return 1; 406} 407#endif 408 409static PyObject* 410unicode_result_wchar(PyObject *unicode) 411{ 412#ifndef Py_DEBUG 413 Py_ssize_t len; 414 415 assert(Py_REFCNT(unicode) == 1); 416 417 len = _PyUnicode_WSTR_LENGTH(unicode); 418 if (len == 0) { 419 Py_INCREF(unicode_empty); 420 Py_DECREF(unicode); 421 return unicode_empty; 422 } 423 424 if (len == 1) { 425 wchar_t ch = _PyUnicode_WSTR(unicode)[0]; 426 if (ch < 256) { 427 PyObject *latin1_char = get_latin1_char((unsigned char)ch); 428 Py_DECREF(unicode); 429 return latin1_char; 430 } 431 } 432 433 if (_PyUnicode_Ready(unicode) < 0) { 434 Py_XDECREF(unicode); 435 return NULL; 436 } 437#else 438 /* don't make the result ready in debug mode to ensure that the caller 439 makes the string ready before using it */ 440 assert(_PyUnicode_CheckConsistency(unicode, 1)); 441#endif 442 return unicode; 443} 444 445static PyObject* 446unicode_result_ready(PyObject *unicode) 447{ 448 Py_ssize_t length; 449 450 length = PyUnicode_GET_LENGTH(unicode); 451 if (length == 0) { 
452 if (unicode != unicode_empty) { 453 Py_INCREF(unicode_empty); 454 Py_DECREF(unicode); 455 } 456 return unicode_empty; 457 } 458 459 if (length == 1) { 460 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0); 461 if (ch < 256) { 462 PyObject *latin1_char = unicode_latin1[ch]; 463 if (latin1_char != NULL) { 464 if (unicode != latin1_char) { 465 Py_INCREF(latin1_char); 466 Py_DECREF(unicode); 467 } 468 return latin1_char; 469 } 470 else { 471 assert(_PyUnicode_CheckConsistency(unicode, 1)); 472 Py_INCREF(unicode); 473 unicode_latin1[ch] = unicode; 474 return unicode; 475 } 476 } 477 } 478 479 assert(_PyUnicode_CheckConsistency(unicode, 1)); 480 return unicode; 481} 482 483static PyObject* 484unicode_result(PyObject *unicode) 485{ 486 assert(_PyUnicode_CHECK(unicode)); 487 if (PyUnicode_IS_READY(unicode)) 488 return unicode_result_ready(unicode); 489 else 490 return unicode_result_wchar(unicode); 491} 492 493static PyObject* 494unicode_result_unchanged(PyObject *unicode) 495{ 496 if (PyUnicode_CheckExact(unicode)) { 497 if (PyUnicode_READY(unicode) == -1) 498 return NULL; 499 Py_INCREF(unicode); 500 return unicode; 501 } 502 else 503 /* Subtype -- return genuine unicode string with the same value. */ 504 return _PyUnicode_Copy(unicode); 505} 506 507#ifdef HAVE_MBCS 508static OSVERSIONINFOEX winver; 509#endif 510 511/* --- Bloom Filters ----------------------------------------------------- */ 512 513/* stuff to implement simple "bloom filters" for Unicode characters. 514 to keep things simple, we use a single bitmask, using the least 5 515 bits from each unicode characters as the bit index. 
*/ 516 517/* the linebreak mask is set up by Unicode_Init below */ 518 519#if LONG_BIT >= 128 520#define BLOOM_WIDTH 128 521#elif LONG_BIT >= 64 522#define BLOOM_WIDTH 64 523#elif LONG_BIT >= 32 524#define BLOOM_WIDTH 32 525#else 526#error "LONG_BIT is smaller than 32" 527#endif 528 529#define BLOOM_MASK unsigned long 530 531static BLOOM_MASK bloom_linebreak; 532 533#define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1))))) 534#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1))))) 535 536#define BLOOM_LINEBREAK(ch) \ 537 ((ch) < 128U ? ascii_linebreak[(ch)] : \ 538 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch))) 539 540Py_LOCAL_INLINE(BLOOM_MASK) 541make_bloom_mask(int kind, void* ptr, Py_ssize_t len) 542{ 543 /* calculate simple bloom-style bitmask for a given unicode string */ 544 545 BLOOM_MASK mask; 546 Py_ssize_t i; 547 548 mask = 0; 549 for (i = 0; i < len; i++) 550 BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i)); 551 552 return mask; 553} 554 555#define BLOOM_MEMBER(mask, chr, str) \ 556 (BLOOM(mask, chr) \ 557 && (PyUnicode_FindChar(str, chr, 0, PyUnicode_GET_LENGTH(str), 1) >= 0)) 558 559/* Compilation of templated routines */ 560 561#include "stringlib/asciilib.h" 562#include "stringlib/fastsearch.h" 563#include "stringlib/partition.h" 564#include "stringlib/split.h" 565#include "stringlib/count.h" 566#include "stringlib/find.h" 567#include "stringlib/find_max_char.h" 568#include "stringlib/localeutil.h" 569#include "stringlib/undef.h" 570 571#include "stringlib/ucs1lib.h" 572#include "stringlib/fastsearch.h" 573#include "stringlib/partition.h" 574#include "stringlib/split.h" 575#include "stringlib/count.h" 576#include "stringlib/find.h" 577#include "stringlib/find_max_char.h" 578#include "stringlib/localeutil.h" 579#include "stringlib/undef.h" 580 581#include "stringlib/ucs2lib.h" 582#include "stringlib/fastsearch.h" 583#include "stringlib/partition.h" 584#include "stringlib/split.h" 585#include 
"stringlib/count.h" 586#include "stringlib/find.h" 587#include "stringlib/find_max_char.h" 588#include "stringlib/localeutil.h" 589#include "stringlib/undef.h" 590 591#include "stringlib/ucs4lib.h" 592#include "stringlib/fastsearch.h" 593#include "stringlib/partition.h" 594#include "stringlib/split.h" 595#include "stringlib/count.h" 596#include "stringlib/find.h" 597#include "stringlib/find_max_char.h" 598#include "stringlib/localeutil.h" 599#include "stringlib/undef.h" 600 601#include "stringlib/unicodedefs.h" 602#include "stringlib/fastsearch.h" 603#include "stringlib/count.h" 604#include "stringlib/find.h" 605#include "stringlib/undef.h" 606 607/* --- Unicode Object ----------------------------------------------------- */ 608 609static PyObject * 610fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s)); 611 612Py_LOCAL_INLINE(Py_ssize_t) findchar(void *s, int kind, 613 Py_ssize_t size, Py_UCS4 ch, 614 int direction) 615{ 616 int mode = (direction == 1) ? FAST_SEARCH : FAST_RSEARCH; 617 618 switch (kind) { 619 case PyUnicode_1BYTE_KIND: 620 { 621 Py_UCS1 ch1 = (Py_UCS1) ch; 622 if (ch1 == ch) 623 return ucs1lib_fastsearch((Py_UCS1 *) s, size, &ch1, 1, 0, mode); 624 else 625 return -1; 626 } 627 case PyUnicode_2BYTE_KIND: 628 { 629 Py_UCS2 ch2 = (Py_UCS2) ch; 630 if (ch2 == ch) 631 return ucs2lib_fastsearch((Py_UCS2 *) s, size, &ch2, 1, 0, mode); 632 else 633 return -1; 634 } 635 case PyUnicode_4BYTE_KIND: 636 return ucs4lib_fastsearch((Py_UCS4 *) s, size, &ch, 1, 0, mode); 637 default: 638 assert(0); 639 return -1; 640 } 641} 642 643static PyObject* 644resize_compact(PyObject *unicode, Py_ssize_t length) 645{ 646 Py_ssize_t char_size; 647 Py_ssize_t struct_size; 648 Py_ssize_t new_size; 649 int share_wstr; 650 PyObject *new_unicode; 651 assert(unicode_modifiable(unicode)); 652 assert(PyUnicode_IS_READY(unicode)); 653 assert(PyUnicode_IS_COMPACT(unicode)); 654 655 char_size = PyUnicode_KIND(unicode); 656 if (PyUnicode_IS_ASCII(unicode)) 657 struct_size = 
sizeof(PyASCIIObject); 658 else 659 struct_size = sizeof(PyCompactUnicodeObject); 660 share_wstr = _PyUnicode_SHARE_WSTR(unicode); 661 662 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) { 663 PyErr_NoMemory(); 664 return NULL; 665 } 666 new_size = (struct_size + (length + 1) * char_size); 667 668 _Py_DEC_REFTOTAL; 669 _Py_ForgetReference(unicode); 670 671 new_unicode = (PyObject *)PyObject_REALLOC((char *)unicode, new_size); 672 if (new_unicode == NULL) { 673 _Py_NewReference(unicode); 674 PyErr_NoMemory(); 675 return NULL; 676 } 677 unicode = new_unicode; 678 _Py_NewReference(unicode); 679 680 _PyUnicode_LENGTH(unicode) = length; 681 if (share_wstr) { 682 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode); 683 if (!PyUnicode_IS_ASCII(unicode)) 684 _PyUnicode_WSTR_LENGTH(unicode) = length; 685 } 686 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode), 687 length, 0); 688 assert(_PyUnicode_CheckConsistency(unicode, 0)); 689 return unicode; 690} 691 692static int 693resize_inplace(PyObject *unicode, Py_ssize_t length) 694{ 695 wchar_t *wstr; 696 Py_ssize_t new_size; 697 assert(!PyUnicode_IS_COMPACT(unicode)); 698 assert(Py_REFCNT(unicode) == 1); 699 700 if (PyUnicode_IS_READY(unicode)) { 701 Py_ssize_t char_size; 702 int share_wstr, share_utf8; 703 void *data; 704 705 data = _PyUnicode_DATA_ANY(unicode); 706 char_size = PyUnicode_KIND(unicode); 707 share_wstr = _PyUnicode_SHARE_WSTR(unicode); 708 share_utf8 = _PyUnicode_SHARE_UTF8(unicode); 709 710 if (length > (PY_SSIZE_T_MAX / char_size - 1)) { 711 PyErr_NoMemory(); 712 return -1; 713 } 714 new_size = (length + 1) * char_size; 715 716 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode)) 717 { 718 PyObject_DEL(_PyUnicode_UTF8(unicode)); 719 _PyUnicode_UTF8(unicode) = NULL; 720 _PyUnicode_UTF8_LENGTH(unicode) = 0; 721 } 722 723 data = (PyObject *)PyObject_REALLOC(data, new_size); 724 if (data == NULL) { 725 PyErr_NoMemory(); 726 return -1; 727 } 728 _PyUnicode_DATA_ANY(unicode) = data; 
729 if (share_wstr) { 730 _PyUnicode_WSTR(unicode) = data; 731 _PyUnicode_WSTR_LENGTH(unicode) = length; 732 } 733 if (share_utf8) { 734 _PyUnicode_UTF8(unicode) = data; 735 _PyUnicode_UTF8_LENGTH(unicode) = length; 736 } 737 _PyUnicode_LENGTH(unicode) = length; 738 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0); 739 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) { 740 assert(_PyUnicode_CheckConsistency(unicode, 0)); 741 return 0; 742 } 743 } 744 assert(_PyUnicode_WSTR(unicode) != NULL); 745 746 /* check for integer overflow */ 747 if (length > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) { 748 PyErr_NoMemory(); 749 return -1; 750 } 751 new_size = sizeof(wchar_t) * (length + 1); 752 wstr = _PyUnicode_WSTR(unicode); 753 wstr = PyObject_REALLOC(wstr, new_size); 754 if (!wstr) { 755 PyErr_NoMemory(); 756 return -1; 757 } 758 _PyUnicode_WSTR(unicode) = wstr; 759 _PyUnicode_WSTR(unicode)[length] = 0; 760 _PyUnicode_WSTR_LENGTH(unicode) = length; 761 assert(_PyUnicode_CheckConsistency(unicode, 0)); 762 return 0; 763} 764 765static PyObject* 766resize_copy(PyObject *unicode, Py_ssize_t length) 767{ 768 Py_ssize_t copy_length; 769 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) { 770 PyObject *copy; 771 772 if (PyUnicode_READY(unicode) == -1) 773 return NULL; 774 775 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode)); 776 if (copy == NULL) 777 return NULL; 778 779 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode)); 780 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length); 781 return copy; 782 } 783 else { 784 PyObject *w; 785 786 w = (PyObject*)_PyUnicode_New(length); 787 if (w == NULL) 788 return NULL; 789 copy_length = _PyUnicode_WSTR_LENGTH(unicode); 790 copy_length = Py_MIN(copy_length, length); 791 Py_UNICODE_COPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode), 792 copy_length); 793 return w; 794 } 795} 796 797/* We allocate one more byte to make sure the string is 798 Ux0000 terminated; some code (e.g. 
new_identifier) 799 relies on that. 800 801 XXX This allocator could further be enhanced by assuring that the 802 free list never reduces its size below 1. 803 804*/ 805 806static PyUnicodeObject * 807_PyUnicode_New(Py_ssize_t length) 808{ 809 register PyUnicodeObject *unicode; 810 size_t new_size; 811 812 /* Optimization for empty strings */ 813 if (length == 0 && unicode_empty != NULL) { 814 Py_INCREF(unicode_empty); 815 return (PyUnicodeObject*)unicode_empty; 816 } 817 818 /* Ensure we won't overflow the size. */ 819 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) { 820 return (PyUnicodeObject *)PyErr_NoMemory(); 821 } 822 if (length < 0) { 823 PyErr_SetString(PyExc_SystemError, 824 "Negative size passed to _PyUnicode_New"); 825 return NULL; 826 } 827 828 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type); 829 if (unicode == NULL) 830 return NULL; 831 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1); 832 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size); 833 if (!_PyUnicode_WSTR(unicode)) { 834 Py_DECREF(unicode); 835 PyErr_NoMemory(); 836 return NULL; 837 } 838 839 /* Initialize the first element to guard against cases where 840 * the caller fails before initializing str -- unicode_resize() 841 * reads str[0], and the Keep-Alive optimization can keep memory 842 * allocated for str alive across a call to unicode_dealloc(unicode). 843 * We don't want unicode_resize to read uninitialized memory in 844 * that case. 
845 */ 846 _PyUnicode_WSTR(unicode)[0] = 0; 847 _PyUnicode_WSTR(unicode)[length] = 0; 848 _PyUnicode_WSTR_LENGTH(unicode) = length; 849 _PyUnicode_HASH(unicode) = -1; 850 _PyUnicode_STATE(unicode).interned = 0; 851 _PyUnicode_STATE(unicode).kind = 0; 852 _PyUnicode_STATE(unicode).compact = 0; 853 _PyUnicode_STATE(unicode).ready = 0; 854 _PyUnicode_STATE(unicode).ascii = 0; 855 _PyUnicode_DATA_ANY(unicode) = NULL; 856 _PyUnicode_LENGTH(unicode) = 0; 857 _PyUnicode_UTF8(unicode) = NULL; 858 _PyUnicode_UTF8_LENGTH(unicode) = 0; 859 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0)); 860 return unicode; 861} 862 863static const char* 864unicode_kind_name(PyObject *unicode) 865{ 866 /* don't check consistency: unicode_kind_name() is called from 867 _PyUnicode_Dump() */ 868 if (!PyUnicode_IS_COMPACT(unicode)) 869 { 870 if (!PyUnicode_IS_READY(unicode)) 871 return "wstr"; 872 switch (PyUnicode_KIND(unicode)) 873 { 874 case PyUnicode_1BYTE_KIND: 875 if (PyUnicode_IS_ASCII(unicode)) 876 return "legacy ascii"; 877 else 878 return "legacy latin1"; 879 case PyUnicode_2BYTE_KIND: 880 return "legacy UCS2"; 881 case PyUnicode_4BYTE_KIND: 882 return "legacy UCS4"; 883 default: 884 return "<legacy invalid kind>"; 885 } 886 } 887 assert(PyUnicode_IS_READY(unicode)); 888 switch (PyUnicode_KIND(unicode)) { 889 case PyUnicode_1BYTE_KIND: 890 if (PyUnicode_IS_ASCII(unicode)) 891 return "ascii"; 892 else 893 return "latin1"; 894 case PyUnicode_2BYTE_KIND: 895 return "UCS2"; 896 case PyUnicode_4BYTE_KIND: 897 return "UCS4"; 898 default: 899 return "<invalid compact kind>"; 900 } 901} 902 903#ifdef Py_DEBUG 904/* Functions wrapping macros for use in debugger */ 905char *_PyUnicode_utf8(void *unicode){ 906 return PyUnicode_UTF8(unicode); 907} 908 909void *_PyUnicode_compact_data(void *unicode) { 910 return _PyUnicode_COMPACT_DATA(unicode); 911} 912void *_PyUnicode_data(void *unicode){ 913 printf("obj %p\n", unicode); 914 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode)); 915 
printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode)); 916 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1))); 917 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1))); 918 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode)); 919 return PyUnicode_DATA(unicode); 920} 921 922void 923_PyUnicode_Dump(PyObject *op) 924{ 925 PyASCIIObject *ascii = (PyASCIIObject *)op; 926 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op; 927 PyUnicodeObject *unicode = (PyUnicodeObject *)op; 928 void *data; 929 930 if (ascii->state.compact) 931 { 932 if (ascii->state.ascii) 933 data = (ascii + 1); 934 else 935 data = (compact + 1); 936 } 937 else 938 data = unicode->data.any; 939 printf("%s: len=%zu, ",unicode_kind_name(op), ascii->length); 940 941 if (ascii->wstr == data) 942 printf("shared "); 943 printf("wstr=%p", ascii->wstr); 944 945 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) { 946 printf(" (%zu), ", compact->wstr_length); 947 if (!ascii->state.compact && compact->utf8 == unicode->data.any) 948 printf("shared "); 949 printf("utf8=%p (%zu)", compact->utf8, compact->utf8_length); 950 } 951 printf(", data=%p\n", data); 952} 953#endif 954 955PyObject * 956PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar) 957{ 958 PyObject *obj; 959 PyCompactUnicodeObject *unicode; 960 void *data; 961 enum PyUnicode_Kind kind; 962 int is_sharing, is_ascii; 963 Py_ssize_t char_size; 964 Py_ssize_t struct_size; 965 966 /* Optimization for empty strings */ 967 if (size == 0 && unicode_empty != NULL) { 968 Py_INCREF(unicode_empty); 969 return unicode_empty; 970 } 971 972 is_ascii = 0; 973 is_sharing = 0; 974 struct_size = sizeof(PyCompactUnicodeObject); 975 if (maxchar < 128) { 976 kind = PyUnicode_1BYTE_KIND; 977 char_size = 1; 978 is_ascii = 1; 979 struct_size = sizeof(PyASCIIObject); 980 } 981 else if (maxchar < 256) { 982 kind = PyUnicode_1BYTE_KIND; 983 char_size = 1; 984 } 985 else if (maxchar < 65536) { 986 kind 
= PyUnicode_2BYTE_KIND; 987 char_size = 2; 988 if (sizeof(wchar_t) == 2) 989 is_sharing = 1; 990 } 991 else { 992 if (maxchar > MAX_UNICODE) { 993 PyErr_SetString(PyExc_SystemError, 994 "invalid maximum character passed to PyUnicode_New"); 995 return NULL; 996 } 997 kind = PyUnicode_4BYTE_KIND; 998 char_size = 4; 999 if (sizeof(wchar_t) == 4) 1000 is_sharing = 1; 1001 } 1002 1003 /* Ensure we won't overflow the size. */ 1004 if (size < 0) { 1005 PyErr_SetString(PyExc_SystemError, 1006 "Negative size passed to PyUnicode_New"); 1007 return NULL; 1008 } 1009 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) 1010 return PyErr_NoMemory(); 1011 1012 /* Duplicated allocation code from _PyObject_New() instead of a call to 1013 * PyObject_New() so we are able to allocate space for the object and 1014 * it's data buffer. 1015 */ 1016 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size); 1017 if (obj == NULL) 1018 return PyErr_NoMemory(); 1019 obj = PyObject_INIT(obj, &PyUnicode_Type); 1020 if (obj == NULL) 1021 return NULL; 1022 1023 unicode = (PyCompactUnicodeObject *)obj; 1024 if (is_ascii) 1025 data = ((PyASCIIObject*)obj) + 1; 1026 else 1027 data = unicode + 1; 1028 _PyUnicode_LENGTH(unicode) = size; 1029 _PyUnicode_HASH(unicode) = -1; 1030 _PyUnicode_STATE(unicode).interned = 0; 1031 _PyUnicode_STATE(unicode).kind = kind; 1032 _PyUnicode_STATE(unicode).compact = 1; 1033 _PyUnicode_STATE(unicode).ready = 1; 1034 _PyUnicode_STATE(unicode).ascii = is_ascii; 1035 if (is_ascii) { 1036 ((char*)data)[size] = 0; 1037 _PyUnicode_WSTR(unicode) = NULL; 1038 } 1039 else if (kind == PyUnicode_1BYTE_KIND) { 1040 ((char*)data)[size] = 0; 1041 _PyUnicode_WSTR(unicode) = NULL; 1042 _PyUnicode_WSTR_LENGTH(unicode) = 0; 1043 unicode->utf8 = NULL; 1044 unicode->utf8_length = 0; 1045 } 1046 else { 1047 unicode->utf8 = NULL; 1048 unicode->utf8_length = 0; 1049 if (kind == PyUnicode_2BYTE_KIND) 1050 ((Py_UCS2*)data)[size] = 0; 1051 else /* kind == 
PyUnicode_4BYTE_KIND */ 1052 ((Py_UCS4*)data)[size] = 0; 1053 if (is_sharing) { 1054 _PyUnicode_WSTR_LENGTH(unicode) = size; 1055 _PyUnicode_WSTR(unicode) = (wchar_t *)data; 1056 } 1057 else { 1058 _PyUnicode_WSTR_LENGTH(unicode) = 0; 1059 _PyUnicode_WSTR(unicode) = NULL; 1060 } 1061 } 1062#ifdef Py_DEBUG 1063 /* Fill the data with invalid characters to detect bugs earlier. 1064 _PyUnicode_CheckConsistency(str, 1) detects invalid characters, 1065 at least for ASCII and UCS-4 strings. U+00FF is invalid in ASCII 1066 and U+FFFFFFFF is an invalid character in Unicode 6.0. */ 1067 memset(data, 0xff, size * kind); 1068#endif 1069 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0)); 1070 return obj; 1071} 1072 1073#if SIZEOF_WCHAR_T == 2 1074/* Helper function to convert a 16-bits wchar_t representation to UCS4, this 1075 will decode surrogate pairs, the other conversions are implemented as macros 1076 for efficiency. 1077 1078 This function assumes that unicode can hold one more code point than wstr 1079 characters for a terminating null character. 
*/ 1080static void 1081unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end, 1082 PyObject *unicode) 1083{ 1084 const wchar_t *iter; 1085 Py_UCS4 *ucs4_out; 1086 1087 assert(unicode != NULL); 1088 assert(_PyUnicode_CHECK(unicode)); 1089 assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND); 1090 ucs4_out = PyUnicode_4BYTE_DATA(unicode); 1091 1092 for (iter = begin; iter < end; ) { 1093 assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) + 1094 _PyUnicode_GET_LENGTH(unicode))); 1095 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0]) 1096 && (iter+1) < end 1097 && Py_UNICODE_IS_LOW_SURROGATE(iter[1])) 1098 { 1099 *ucs4_out++ = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]); 1100 iter += 2; 1101 } 1102 else { 1103 *ucs4_out++ = *iter; 1104 iter++; 1105 } 1106 } 1107 assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) + 1108 _PyUnicode_GET_LENGTH(unicode))); 1109 1110} 1111#endif 1112 1113static int 1114unicode_check_modifiable(PyObject *unicode) 1115{ 1116 if (!unicode_modifiable(unicode)) { 1117 PyErr_SetString(PyExc_SystemError, 1118 "Cannot modify a string currently used"); 1119 return -1; 1120 } 1121 return 0; 1122} 1123 1124static int 1125_copy_characters(PyObject *to, Py_ssize_t to_start, 1126 PyObject *from, Py_ssize_t from_start, 1127 Py_ssize_t how_many, int check_maxchar) 1128{ 1129 unsigned int from_kind, to_kind; 1130 void *from_data, *to_data; 1131 1132 assert(0 <= how_many); 1133 assert(0 <= from_start); 1134 assert(0 <= to_start); 1135 assert(PyUnicode_Check(from)); 1136 assert(PyUnicode_IS_READY(from)); 1137 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from)); 1138 1139 assert(PyUnicode_Check(to)); 1140 assert(PyUnicode_IS_READY(to)); 1141 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to)); 1142 1143 if (how_many == 0) 1144 return 0; 1145 1146 from_kind = PyUnicode_KIND(from); 1147 from_data = PyUnicode_DATA(from); 1148 to_kind = PyUnicode_KIND(to); 1149 to_data = PyUnicode_DATA(to); 1150 1151#ifdef Py_DEBUG 1152 if (!check_maxchar 
1153 && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to)) 1154 { 1155 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to); 1156 Py_UCS4 ch; 1157 Py_ssize_t i; 1158 for (i=0; i < how_many; i++) { 1159 ch = PyUnicode_READ(from_kind, from_data, from_start + i); 1160 assert(ch <= to_maxchar); 1161 } 1162 } 1163#endif 1164 1165 if (from_kind == to_kind) { 1166 if (check_maxchar 1167 && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)) 1168 { 1169 /* Writing Latin-1 characters into an ASCII string requires to 1170 check that all written characters are pure ASCII */ 1171 Py_UCS4 max_char; 1172 max_char = ucs1lib_find_max_char(from_data, 1173 (Py_UCS1*)from_data + how_many); 1174 if (max_char >= 128) 1175 return -1; 1176 } 1177 Py_MEMCPY((char*)to_data + to_kind * to_start, 1178 (char*)from_data + from_kind * from_start, 1179 to_kind * how_many); 1180 } 1181 else if (from_kind == PyUnicode_1BYTE_KIND 1182 && to_kind == PyUnicode_2BYTE_KIND) 1183 { 1184 _PyUnicode_CONVERT_BYTES( 1185 Py_UCS1, Py_UCS2, 1186 PyUnicode_1BYTE_DATA(from) + from_start, 1187 PyUnicode_1BYTE_DATA(from) + from_start + how_many, 1188 PyUnicode_2BYTE_DATA(to) + to_start 1189 ); 1190 } 1191 else if (from_kind == PyUnicode_1BYTE_KIND 1192 && to_kind == PyUnicode_4BYTE_KIND) 1193 { 1194 _PyUnicode_CONVERT_BYTES( 1195 Py_UCS1, Py_UCS4, 1196 PyUnicode_1BYTE_DATA(from) + from_start, 1197 PyUnicode_1BYTE_DATA(from) + from_start + how_many, 1198 PyUnicode_4BYTE_DATA(to) + to_start 1199 ); 1200 } 1201 else if (from_kind == PyUnicode_2BYTE_KIND 1202 && to_kind == PyUnicode_4BYTE_KIND) 1203 { 1204 _PyUnicode_CONVERT_BYTES( 1205 Py_UCS2, Py_UCS4, 1206 PyUnicode_2BYTE_DATA(from) + from_start, 1207 PyUnicode_2BYTE_DATA(from) + from_start + how_many, 1208 PyUnicode_4BYTE_DATA(to) + to_start 1209 ); 1210 } 1211 else { 1212 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to)); 1213 1214 if (!check_maxchar) { 1215 if (from_kind == PyUnicode_2BYTE_KIND 1216 && to_kind == 
PyUnicode_1BYTE_KIND) 1217 { 1218 _PyUnicode_CONVERT_BYTES( 1219 Py_UCS2, Py_UCS1, 1220 PyUnicode_2BYTE_DATA(from) + from_start, 1221 PyUnicode_2BYTE_DATA(from) + from_start + how_many, 1222 PyUnicode_1BYTE_DATA(to) + to_start 1223 ); 1224 } 1225 else if (from_kind == PyUnicode_4BYTE_KIND 1226 && to_kind == PyUnicode_1BYTE_KIND) 1227 { 1228 _PyUnicode_CONVERT_BYTES( 1229 Py_UCS4, Py_UCS1, 1230 PyUnicode_4BYTE_DATA(from) + from_start, 1231 PyUnicode_4BYTE_DATA(from) + from_start + how_many, 1232 PyUnicode_1BYTE_DATA(to) + to_start 1233 ); 1234 } 1235 else if (from_kind == PyUnicode_4BYTE_KIND 1236 && to_kind == PyUnicode_2BYTE_KIND) 1237 { 1238 _PyUnicode_CONVERT_BYTES( 1239 Py_UCS4, Py_UCS2, 1240 PyUnicode_4BYTE_DATA(from) + from_start, 1241 PyUnicode_4BYTE_DATA(from) + from_start + how_many, 1242 PyUnicode_2BYTE_DATA(to) + to_start 1243 ); 1244 } 1245 else { 1246 assert(0); 1247 return -1; 1248 } 1249 } 1250 else { 1251 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to); 1252 Py_UCS4 ch; 1253 Py_ssize_t i; 1254 1255 for (i=0; i < how_many; i++) { 1256 ch = PyUnicode_READ(from_kind, from_data, from_start + i); 1257 if (ch > to_maxchar) 1258 return -1; 1259 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch); 1260 } 1261 } 1262 } 1263 return 0; 1264} 1265 1266void 1267_PyUnicode_FastCopyCharacters( 1268 PyObject *to, Py_ssize_t to_start, 1269 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many) 1270{ 1271 (void)_copy_characters(to, to_start, from, from_start, how_many, 0); 1272} 1273 1274Py_ssize_t 1275PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start, 1276 PyObject *from, Py_ssize_t from_start, 1277 Py_ssize_t how_many) 1278{ 1279 int err; 1280 1281 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) { 1282 PyErr_BadInternalCall(); 1283 return -1; 1284 } 1285 1286 if (PyUnicode_READY(from) == -1) 1287 return -1; 1288 if (PyUnicode_READY(to) == -1) 1289 return -1; 1290 1291 if (from_start < 0) { 1292 PyErr_SetString(PyExc_IndexError, "string 
index out of range"); 1293 return -1; 1294 } 1295 if (to_start < 0) { 1296 PyErr_SetString(PyExc_IndexError, "string index out of range"); 1297 return -1; 1298 } 1299 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many); 1300 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) { 1301 PyErr_Format(PyExc_SystemError, 1302 "Cannot write %zi characters at %zi " 1303 "in a string of %zi characters", 1304 how_many, to_start, PyUnicode_GET_LENGTH(to)); 1305 return -1; 1306 } 1307 1308 if (how_many == 0) 1309 return 0; 1310 1311 if (unicode_check_modifiable(to)) 1312 return -1; 1313 1314 err = _copy_characters(to, to_start, from, from_start, how_many, 1); 1315 if (err) { 1316 PyErr_Format(PyExc_SystemError, 1317 "Cannot copy %s characters " 1318 "into a string of %s characters", 1319 unicode_kind_name(from), 1320 unicode_kind_name(to)); 1321 return -1; 1322 } 1323 return how_many; 1324} 1325 1326/* Find the maximum code point and count the number of surrogate pairs so a 1327 correct string length can be computed before converting a string to UCS4. 1328 This function counts single surrogates as a character and not as a pair. 1329 1330 Return 0 on success, or -1 on error. 
*/ 1331static int 1332find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end, 1333 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates) 1334{ 1335 const wchar_t *iter; 1336 Py_UCS4 ch; 1337 1338 assert(num_surrogates != NULL && maxchar != NULL); 1339 *num_surrogates = 0; 1340 *maxchar = 0; 1341 1342 for (iter = begin; iter < end; ) { 1343#if SIZEOF_WCHAR_T == 2 1344 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0]) 1345 && (iter+1) < end 1346 && Py_UNICODE_IS_LOW_SURROGATE(iter[1])) 1347 { 1348 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]); 1349 ++(*num_surrogates); 1350 iter += 2; 1351 } 1352 else 1353#endif 1354 { 1355 ch = *iter; 1356 iter++; 1357 } 1358 if (ch > *maxchar) { 1359 *maxchar = ch; 1360 if (*maxchar > MAX_UNICODE) { 1361 PyErr_Format(PyExc_ValueError, 1362 "character U+%x is not in range [U+0000; U+10ffff]", 1363 ch); 1364 return -1; 1365 } 1366 } 1367 } 1368 return 0; 1369} 1370 1371int 1372_PyUnicode_Ready(PyObject *unicode) 1373{ 1374 wchar_t *end; 1375 Py_UCS4 maxchar = 0; 1376 Py_ssize_t num_surrogates; 1377#if SIZEOF_WCHAR_T == 2 1378 Py_ssize_t length_wo_surrogates; 1379#endif 1380 1381 /* _PyUnicode_Ready() is only intended for old-style API usage where 1382 strings were created using _PyObject_New() and where no canonical 1383 representation (the str field) has been set yet aka strings 1384 which are not yet ready. 
*/ 1385 assert(_PyUnicode_CHECK(unicode)); 1386 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND); 1387 assert(_PyUnicode_WSTR(unicode) != NULL); 1388 assert(_PyUnicode_DATA_ANY(unicode) == NULL); 1389 assert(_PyUnicode_UTF8(unicode) == NULL); 1390 /* Actually, it should neither be interned nor be anything else: */ 1391 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED); 1392 1393 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode); 1394 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end, 1395 &maxchar, &num_surrogates) == -1) 1396 return -1; 1397 1398 if (maxchar < 256) { 1399 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1); 1400 if (!_PyUnicode_DATA_ANY(unicode)) { 1401 PyErr_NoMemory(); 1402 return -1; 1403 } 1404 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char, 1405 _PyUnicode_WSTR(unicode), end, 1406 PyUnicode_1BYTE_DATA(unicode)); 1407 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0'; 1408 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode); 1409 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND; 1410 if (maxchar < 128) { 1411 _PyUnicode_STATE(unicode).ascii = 1; 1412 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode); 1413 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode); 1414 } 1415 else { 1416 _PyUnicode_STATE(unicode).ascii = 0; 1417 _PyUnicode_UTF8(unicode) = NULL; 1418 _PyUnicode_UTF8_LENGTH(unicode) = 0; 1419 } 1420 PyObject_FREE(_PyUnicode_WSTR(unicode)); 1421 _PyUnicode_WSTR(unicode) = NULL; 1422 _PyUnicode_WSTR_LENGTH(unicode) = 0; 1423 } 1424 /* In this case we might have to convert down from 4-byte native 1425 wchar_t to 2-byte unicode. */ 1426 else if (maxchar < 65536) { 1427 assert(num_surrogates == 0 && 1428 "FindMaxCharAndNumSurrogatePairs() messed up"); 1429 1430#if SIZEOF_WCHAR_T == 2 1431 /* We can share representations and are done. 
*/ 1432 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode); 1433 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0'; 1434 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode); 1435 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND; 1436 _PyUnicode_UTF8(unicode) = NULL; 1437 _PyUnicode_UTF8_LENGTH(unicode) = 0; 1438#else 1439 /* sizeof(wchar_t) == 4 */ 1440 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC( 1441 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1)); 1442 if (!_PyUnicode_DATA_ANY(unicode)) { 1443 PyErr_NoMemory(); 1444 return -1; 1445 } 1446 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2, 1447 _PyUnicode_WSTR(unicode), end, 1448 PyUnicode_2BYTE_DATA(unicode)); 1449 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0'; 1450 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode); 1451 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND; 1452 _PyUnicode_UTF8(unicode) = NULL; 1453 _PyUnicode_UTF8_LENGTH(unicode) = 0; 1454 PyObject_FREE(_PyUnicode_WSTR(unicode)); 1455 _PyUnicode_WSTR(unicode) = NULL; 1456 _PyUnicode_WSTR_LENGTH(unicode) = 0; 1457#endif 1458 } 1459 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */ 1460 else { 1461#if SIZEOF_WCHAR_T == 2 1462 /* in case the native representation is 2-bytes, we need to allocate a 1463 new normalized 4-byte version. 
*/ 1464 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates; 1465 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1)); 1466 if (!_PyUnicode_DATA_ANY(unicode)) { 1467 PyErr_NoMemory(); 1468 return -1; 1469 } 1470 _PyUnicode_LENGTH(unicode) = length_wo_surrogates; 1471 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND; 1472 _PyUnicode_UTF8(unicode) = NULL; 1473 _PyUnicode_UTF8_LENGTH(unicode) = 0; 1474 /* unicode_convert_wchar_to_ucs4() requires a ready string */ 1475 _PyUnicode_STATE(unicode).ready = 1; 1476 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode); 1477 PyObject_FREE(_PyUnicode_WSTR(unicode)); 1478 _PyUnicode_WSTR(unicode) = NULL; 1479 _PyUnicode_WSTR_LENGTH(unicode) = 0; 1480#else 1481 assert(num_surrogates == 0); 1482 1483 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode); 1484 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode); 1485 _PyUnicode_UTF8(unicode) = NULL; 1486 _PyUnicode_UTF8_LENGTH(unicode) = 0; 1487 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND; 1488#endif 1489 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0'; 1490 } 1491 _PyUnicode_STATE(unicode).ready = 1; 1492 assert(_PyUnicode_CheckConsistency(unicode, 1)); 1493 return 0; 1494} 1495 1496static void 1497unicode_dealloc(register PyObject *unicode) 1498{ 1499 switch (PyUnicode_CHECK_INTERNED(unicode)) { 1500 case SSTATE_NOT_INTERNED: 1501 break; 1502 1503 case SSTATE_INTERNED_MORTAL: 1504 /* revive dead object temporarily for DelItem */ 1505 Py_REFCNT(unicode) = 3; 1506 if (PyDict_DelItem(interned, unicode) != 0) 1507 Py_FatalError( 1508 "deletion of interned string failed"); 1509 break; 1510 1511 case SSTATE_INTERNED_IMMORTAL: 1512 Py_FatalError("Immortal interned string died."); 1513 1514 default: 1515 Py_FatalError("Inconsistent interned string state."); 1516 } 1517 1518 if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) 1519 PyObject_DEL(_PyUnicode_WSTR(unicode)); 1520 if 
(_PyUnicode_HAS_UTF8_MEMORY(unicode)) 1521 PyObject_DEL(_PyUnicode_UTF8(unicode)); 1522 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode)) 1523 PyObject_DEL(_PyUnicode_DATA_ANY(unicode)); 1524 1525 Py_TYPE(unicode)->tp_free(unicode); 1526} 1527 1528#ifdef Py_DEBUG 1529static int 1530unicode_is_singleton(PyObject *unicode) 1531{ 1532 PyASCIIObject *ascii = (PyASCIIObject *)unicode; 1533 if (unicode == unicode_empty) 1534 return 1; 1535 if (ascii->state.kind != PyUnicode_WCHAR_KIND && ascii->length == 1) 1536 { 1537 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0); 1538 if (ch < 256 && unicode_latin1[ch] == unicode) 1539 return 1; 1540 } 1541 return 0; 1542} 1543#endif 1544 1545static int 1546unicode_modifiable(PyObject *unicode) 1547{ 1548 assert(_PyUnicode_CHECK(unicode)); 1549 if (Py_REFCNT(unicode) != 1) 1550 return 0; 1551 if (_PyUnicode_HASH(unicode) != -1) 1552 return 0; 1553 if (PyUnicode_CHECK_INTERNED(unicode)) 1554 return 0; 1555 if (!PyUnicode_CheckExact(unicode)) 1556 return 0; 1557#ifdef Py_DEBUG 1558 /* singleton refcount is greater than 1 */ 1559 assert(!unicode_is_singleton(unicode)); 1560#endif 1561 return 1; 1562} 1563 1564static int 1565unicode_resize(PyObject **p_unicode, Py_ssize_t length) 1566{ 1567 PyObject *unicode; 1568 Py_ssize_t old_length; 1569 1570 assert(p_unicode != NULL); 1571 unicode = *p_unicode; 1572 1573 assert(unicode != NULL); 1574 assert(PyUnicode_Check(unicode)); 1575 assert(0 <= length); 1576 1577 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND) 1578 old_length = PyUnicode_WSTR_LENGTH(unicode); 1579 else 1580 old_length = PyUnicode_GET_LENGTH(unicode); 1581 if (old_length == length) 1582 return 0; 1583 1584 if (length == 0) { 1585 Py_DECREF(*p_unicode); 1586 *p_unicode = unicode_empty; 1587 Py_INCREF(*p_unicode); 1588 return 0; 1589 } 1590 1591 if (!unicode_modifiable(unicode)) { 1592 PyObject *copy = resize_copy(unicode, length); 1593 if (copy == NULL) 1594 return -1; 1595 Py_DECREF(*p_unicode); 1596 
*p_unicode = copy; 1597 return 0; 1598 } 1599 1600 if (PyUnicode_IS_COMPACT(unicode)) { 1601 PyObject *new_unicode = resize_compact(unicode, length); 1602 if (new_unicode == NULL) 1603 return -1; 1604 *p_unicode = new_unicode; 1605 return 0; 1606 } 1607 return resize_inplace(unicode, length); 1608} 1609 1610int 1611PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length) 1612{ 1613 PyObject *unicode; 1614 if (p_unicode == NULL) { 1615 PyErr_BadInternalCall(); 1616 return -1; 1617 } 1618 unicode = *p_unicode; 1619 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0) 1620 { 1621 PyErr_BadInternalCall(); 1622 return -1; 1623 } 1624 return unicode_resize(p_unicode, length); 1625} 1626 1627static int 1628unicode_widen(PyObject **p_unicode, Py_ssize_t length, 1629 unsigned int maxchar) 1630{ 1631 PyObject *result; 1632 assert(PyUnicode_IS_READY(*p_unicode)); 1633 assert(length <= PyUnicode_GET_LENGTH(*p_unicode)); 1634 if (maxchar <= PyUnicode_MAX_CHAR_VALUE(*p_unicode)) 1635 return 0; 1636 result = PyUnicode_New(PyUnicode_GET_LENGTH(*p_unicode), 1637 maxchar); 1638 if (result == NULL) 1639 return -1; 1640 _PyUnicode_FastCopyCharacters(result, 0, *p_unicode, 0, length); 1641 Py_DECREF(*p_unicode); 1642 *p_unicode = result; 1643 return 0; 1644} 1645 1646static int 1647unicode_putchar(PyObject **p_unicode, Py_ssize_t *pos, 1648 Py_UCS4 ch) 1649{ 1650 assert(ch <= MAX_UNICODE); 1651 if (unicode_widen(p_unicode, *pos, ch) < 0) 1652 return -1; 1653 PyUnicode_WRITE(PyUnicode_KIND(*p_unicode), 1654 PyUnicode_DATA(*p_unicode), 1655 (*pos)++, ch); 1656 return 0; 1657} 1658 1659/* Copy a ASCII or latin1 char* string into a Python Unicode string. 1660 1661 WARNING: The function doesn't copy the terminating null character and 1662 doesn't check the maximum character (may write a latin1 character in an 1663 ASCII string). 
*/ 1664static void 1665unicode_write_cstr(PyObject *unicode, Py_ssize_t index, 1666 const char *str, Py_ssize_t len) 1667{ 1668 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode); 1669 void *data = PyUnicode_DATA(unicode); 1670 const char *end = str + len; 1671 1672 switch (kind) { 1673 case PyUnicode_1BYTE_KIND: { 1674 assert(index + len <= PyUnicode_GET_LENGTH(unicode)); 1675 memcpy((char *) data + index, str, len); 1676 break; 1677 } 1678 case PyUnicode_2BYTE_KIND: { 1679 Py_UCS2 *start = (Py_UCS2 *)data + index; 1680 Py_UCS2 *ucs2 = start; 1681 assert(index <= PyUnicode_GET_LENGTH(unicode)); 1682 1683 for (; str < end; ++ucs2, ++str) 1684 *ucs2 = (Py_UCS2)*str; 1685 1686 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode)); 1687 break; 1688 } 1689 default: { 1690 Py_UCS4 *start = (Py_UCS4 *)data + index; 1691 Py_UCS4 *ucs4 = start; 1692 assert(kind == PyUnicode_4BYTE_KIND); 1693 assert(index <= PyUnicode_GET_LENGTH(unicode)); 1694 1695 for (; str < end; ++ucs4, ++str) 1696 *ucs4 = (Py_UCS4)*str; 1697 1698 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode)); 1699 } 1700 } 1701} 1702 1703 1704static PyObject* 1705get_latin1_char(unsigned char ch) 1706{ 1707 PyObject *unicode = unicode_latin1[ch]; 1708 if (!unicode) { 1709 unicode = PyUnicode_New(1, ch); 1710 if (!unicode) 1711 return NULL; 1712 PyUnicode_1BYTE_DATA(unicode)[0] = ch; 1713 assert(_PyUnicode_CheckConsistency(unicode, 1)); 1714 unicode_latin1[ch] = unicode; 1715 } 1716 Py_INCREF(unicode); 1717 return unicode; 1718} 1719 1720PyObject * 1721PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size) 1722{ 1723 PyObject *unicode; 1724 Py_UCS4 maxchar = 0; 1725 Py_ssize_t num_surrogates; 1726 1727 if (u == NULL) 1728 return (PyObject*)_PyUnicode_New(size); 1729 1730 /* If the Unicode data is known at construction time, we can apply 1731 some optimizations which share commonly used objects. 
*/ 1732 1733 /* Optimization for empty strings */ 1734 if (size == 0 && unicode_empty != NULL) { 1735 Py_INCREF(unicode_empty); 1736 return unicode_empty; 1737 } 1738 1739 /* Single character Unicode objects in the Latin-1 range are 1740 shared when using this constructor */ 1741 if (size == 1 && *u < 256) 1742 return get_latin1_char((unsigned char)*u); 1743 1744 /* If not empty and not single character, copy the Unicode data 1745 into the new object */ 1746 if (find_maxchar_surrogates(u, u + size, 1747 &maxchar, &num_surrogates) == -1) 1748 return NULL; 1749 1750 unicode = PyUnicode_New(size - num_surrogates, maxchar); 1751 if (!unicode) 1752 return NULL; 1753 1754 switch (PyUnicode_KIND(unicode)) { 1755 case PyUnicode_1BYTE_KIND: 1756 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char, 1757 u, u + size, PyUnicode_1BYTE_DATA(unicode)); 1758 break; 1759 case PyUnicode_2BYTE_KIND: 1760#if Py_UNICODE_SIZE == 2 1761 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2); 1762#else 1763 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2, 1764 u, u + size, PyUnicode_2BYTE_DATA(unicode)); 1765#endif 1766 break; 1767 case PyUnicode_4BYTE_KIND: 1768#if SIZEOF_WCHAR_T == 2 1769 /* This is the only case which has to process surrogates, thus 1770 a simple copy loop is not enough and we need a function. 
*/ 1771 unicode_convert_wchar_to_ucs4(u, u + size, unicode); 1772#else 1773 assert(num_surrogates == 0); 1774 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4); 1775#endif 1776 break; 1777 default: 1778 assert(0 && "Impossible state"); 1779 } 1780 1781 return unicode_result(unicode); 1782} 1783 1784PyObject * 1785PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size) 1786{ 1787 if (size < 0) { 1788 PyErr_SetString(PyExc_SystemError, 1789 "Negative size passed to PyUnicode_FromStringAndSize"); 1790 return NULL; 1791 } 1792 if (u != NULL) 1793 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL); 1794 else 1795 return (PyObject *)_PyUnicode_New(size); 1796} 1797 1798PyObject * 1799PyUnicode_FromString(const char *u) 1800{ 1801 size_t size = strlen(u); 1802 if (size > PY_SSIZE_T_MAX) { 1803 PyErr_SetString(PyExc_OverflowError, "input too long"); 1804 return NULL; 1805 } 1806 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL); 1807} 1808 1809PyObject * 1810_PyUnicode_FromId(_Py_Identifier *id) 1811{ 1812 if (!id->object) { 1813 id->object = PyUnicode_DecodeUTF8Stateful(id->string, 1814 strlen(id->string), 1815 NULL, NULL); 1816 if (!id->object) 1817 return NULL; 1818 PyUnicode_InternInPlace(&id->object); 1819 assert(!id->next); 1820 id->next = static_strings; 1821 static_strings = id; 1822 } 1823 return id->object; 1824} 1825 1826void 1827_PyUnicode_ClearStaticStrings() 1828{ 1829 _Py_Identifier *i; 1830 for (i = static_strings; i; i = i->next) { 1831 Py_DECREF(i->object); 1832 i->object = NULL; 1833 i->next = NULL; 1834 } 1835} 1836 1837/* Internal function, doesn't check maximum character */ 1838 1839PyObject* 1840_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size) 1841{ 1842 const unsigned char *s = (const unsigned char *)buffer; 1843 PyObject *unicode; 1844 if (size == 1) { 1845#ifdef Py_DEBUG 1846 assert(s[0] < 128); 1847#endif 1848 return get_latin1_char(s[0]); 1849 } 1850 unicode = PyUnicode_New(size, 127); 1851 if (!unicode) 
1852 return NULL; 1853 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size); 1854 assert(_PyUnicode_CheckConsistency(unicode, 1)); 1855 return unicode; 1856} 1857 1858static Py_UCS4 1859kind_maxchar_limit(unsigned int kind) 1860{ 1861 switch (kind) { 1862 case PyUnicode_1BYTE_KIND: 1863 return 0x80; 1864 case PyUnicode_2BYTE_KIND: 1865 return 0x100; 1866 case PyUnicode_4BYTE_KIND: 1867 return 0x10000; 1868 default: 1869 assert(0 && "invalid kind"); 1870 return MAX_UNICODE; 1871 } 1872} 1873 1874Py_LOCAL_INLINE(Py_UCS4) 1875align_maxchar(Py_UCS4 maxchar) 1876{ 1877 if (maxchar <= 127) 1878 return 127; 1879 else if (maxchar <= 255) 1880 return 255; 1881 else if (maxchar <= 65535) 1882 return 65535; 1883 else 1884 return MAX_UNICODE; 1885} 1886 1887static PyObject* 1888_PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size) 1889{ 1890 PyObject *res; 1891 unsigned char max_char; 1892 1893 if (size == 0) { 1894 Py_INCREF(unicode_empty); 1895 return unicode_empty; 1896 } 1897 assert(size > 0); 1898 if (size == 1) 1899 return get_latin1_char(u[0]); 1900 1901 max_char = ucs1lib_find_max_char(u, u + size); 1902 res = PyUnicode_New(size, max_char); 1903 if (!res) 1904 return NULL; 1905 memcpy(PyUnicode_1BYTE_DATA(res), u, size); 1906 assert(_PyUnicode_CheckConsistency(res, 1)); 1907 return res; 1908} 1909 1910static PyObject* 1911_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size) 1912{ 1913 PyObject *res; 1914 Py_UCS2 max_char; 1915 1916 if (size == 0) { 1917 Py_INCREF(unicode_empty); 1918 return unicode_empty; 1919 } 1920 assert(size > 0); 1921 if (size == 1) { 1922 Py_UCS4 ch = u[0]; 1923 if (ch < 256) 1924 return get_latin1_char((unsigned char)ch); 1925 1926 res = PyUnicode_New(1, ch); 1927 if (res == NULL) 1928 return NULL; 1929 PyUnicode_WRITE(PyUnicode_KIND(res), PyUnicode_DATA(res), 0, ch); 1930 assert(_PyUnicode_CheckConsistency(res, 1)); 1931 return res; 1932 } 1933 1934 max_char = ucs2lib_find_max_char(u, u + size); 1935 res = PyUnicode_New(size, max_char); 1936 
if (!res) 1937 return NULL; 1938 if (max_char >= 256) 1939 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size); 1940 else { 1941 _PyUnicode_CONVERT_BYTES( 1942 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res)); 1943 } 1944 assert(_PyUnicode_CheckConsistency(res, 1)); 1945 return res; 1946} 1947 1948static PyObject* 1949_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size) 1950{ 1951 PyObject *res; 1952 Py_UCS4 max_char; 1953 1954 if (size == 0) { 1955 Py_INCREF(unicode_empty); 1956 return unicode_empty; 1957 } 1958 assert(size > 0); 1959 if (size == 1) { 1960 Py_UCS4 ch = u[0]; 1961 if (ch < 256) 1962 return get_latin1_char((unsigned char)ch); 1963 1964 res = PyUnicode_New(1, ch); 1965 if (res == NULL) 1966 return NULL; 1967 PyUnicode_WRITE(PyUnicode_KIND(res), PyUnicode_DATA(res), 0, ch); 1968 assert(_PyUnicode_CheckConsistency(res, 1)); 1969 return res; 1970 } 1971 1972 max_char = ucs4lib_find_max_char(u, u + size); 1973 res = PyUnicode_New(size, max_char); 1974 if (!res) 1975 return NULL; 1976 if (max_char < 256) 1977 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size, 1978 PyUnicode_1BYTE_DATA(res)); 1979 else if (max_char < 0x10000) 1980 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size, 1981 PyUnicode_2BYTE_DATA(res)); 1982 else 1983 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size); 1984 assert(_PyUnicode_CheckConsistency(res, 1)); 1985 return res; 1986} 1987 1988PyObject* 1989PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size) 1990{ 1991 if (size < 0) { 1992 PyErr_SetString(PyExc_ValueError, "size must be positive"); 1993 return NULL; 1994 } 1995 switch (kind) { 1996 case PyUnicode_1BYTE_KIND: 1997 return _PyUnicode_FromUCS1(buffer, size); 1998 case PyUnicode_2BYTE_KIND: 1999 return _PyUnicode_FromUCS2(buffer, size); 2000 case PyUnicode_4BYTE_KIND: 2001 return _PyUnicode_FromUCS4(buffer, size); 2002 default: 2003 PyErr_SetString(PyExc_SystemError, "invalid kind"); 2004 return NULL; 2005 } 2006} 2007 
2008Py_UCS4 2009_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end) 2010{ 2011 enum PyUnicode_Kind kind; 2012 void *startptr, *endptr; 2013 2014 assert(PyUnicode_IS_READY(unicode)); 2015 assert(0 <= start); 2016 assert(end <= PyUnicode_GET_LENGTH(unicode)); 2017 assert(start <= end); 2018 2019 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode)) 2020 return PyUnicode_MAX_CHAR_VALUE(unicode); 2021 2022 if (start == end) 2023 return 127; 2024 2025 if (PyUnicode_IS_ASCII(unicode)) 2026 return 127; 2027 2028 kind = PyUnicode_KIND(unicode); 2029 startptr = PyUnicode_DATA(unicode); 2030 endptr = (char *)startptr + end * kind; 2031 startptr = (char *)startptr + start * kind; 2032 switch(kind) { 2033 case PyUnicode_1BYTE_KIND: 2034 return ucs1lib_find_max_char(startptr, endptr); 2035 case PyUnicode_2BYTE_KIND: 2036 return ucs2lib_find_max_char(startptr, endptr); 2037 case PyUnicode_4BYTE_KIND: 2038 return ucs4lib_find_max_char(startptr, endptr); 2039 default: 2040 assert(0); 2041 return 0; 2042 } 2043} 2044 2045/* Ensure that a string uses the most efficient storage, if it is not the 2046 case: create a new string with of the right kind. Write NULL into *p_unicode 2047 on error. 
*/ 2048static void 2049unicode_adjust_maxchar(PyObject **p_unicode) 2050{ 2051 PyObject *unicode, *copy; 2052 Py_UCS4 max_char; 2053 Py_ssize_t len; 2054 unsigned int kind; 2055 2056 assert(p_unicode != NULL); 2057 unicode = *p_unicode; 2058 assert(PyUnicode_IS_READY(unicode)); 2059 if (PyUnicode_IS_ASCII(unicode)) 2060 return; 2061 2062 len = PyUnicode_GET_LENGTH(unicode); 2063 kind = PyUnicode_KIND(unicode); 2064 if (kind == PyUnicode_1BYTE_KIND) { 2065 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode); 2066 max_char = ucs1lib_find_max_char(u, u + len); 2067 if (max_char >= 128) 2068 return; 2069 } 2070 else if (kind == PyUnicode_2BYTE_KIND) { 2071 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode); 2072 max_char = ucs2lib_find_max_char(u, u + len); 2073 if (max_char >= 256) 2074 return; 2075 } 2076 else { 2077 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode); 2078 assert(kind == PyUnicode_4BYTE_KIND); 2079 max_char = ucs4lib_find_max_char(u, u + len); 2080 if (max_char >= 0x10000) 2081 return; 2082 } 2083 copy = PyUnicode_New(len, max_char); 2084 if (copy != NULL) 2085 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len); 2086 Py_DECREF(unicode); 2087 *p_unicode = copy; 2088} 2089 2090PyObject* 2091_PyUnicode_Copy(PyObject *unicode) 2092{ 2093 Py_ssize_t length; 2094 PyObject *copy; 2095 2096 if (!PyUnicode_Check(unicode)) { 2097 PyErr_BadInternalCall(); 2098 return NULL; 2099 } 2100 if (PyUnicode_READY(unicode) == -1) 2101 return NULL; 2102 2103 length = PyUnicode_GET_LENGTH(unicode); 2104 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode)); 2105 if (!copy) 2106 return NULL; 2107 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode)); 2108 2109 Py_MEMCPY(PyUnicode_DATA(copy), PyUnicode_DATA(unicode), 2110 length * PyUnicode_KIND(unicode)); 2111 assert(_PyUnicode_CheckConsistency(copy, 1)); 2112 return copy; 2113} 2114 2115 2116/* Widen Unicode objects to larger buffers. Don't write terminating null 2117 character. Return NULL on error. 
*/ 2118 2119void* 2120_PyUnicode_AsKind(PyObject *s, unsigned int kind) 2121{ 2122 Py_ssize_t len; 2123 void *result; 2124 unsigned int skind; 2125 2126 if (PyUnicode_READY(s) == -1) 2127 return NULL; 2128 2129 len = PyUnicode_GET_LENGTH(s); 2130 skind = PyUnicode_KIND(s); 2131 if (skind >= kind) { 2132 PyErr_SetString(PyExc_SystemError, "invalid widening attempt"); 2133 return NULL; 2134 } 2135 switch (kind) { 2136 case PyUnicode_2BYTE_KIND: 2137 result = PyMem_Malloc(len * sizeof(Py_UCS2)); 2138 if (!result) 2139 return PyErr_NoMemory(); 2140 assert(skind == PyUnicode_1BYTE_KIND); 2141 _PyUnicode_CONVERT_BYTES( 2142 Py_UCS1, Py_UCS2, 2143 PyUnicode_1BYTE_DATA(s), 2144 PyUnicode_1BYTE_DATA(s) + len, 2145 result); 2146 return result; 2147 case PyUnicode_4BYTE_KIND: 2148 result = PyMem_Malloc(len * sizeof(Py_UCS4)); 2149 if (!result) 2150 return PyErr_NoMemory(); 2151 if (skind == PyUnicode_2BYTE_KIND) { 2152 _PyUnicode_CONVERT_BYTES( 2153 Py_UCS2, Py_UCS4, 2154 PyUnicode_2BYTE_DATA(s), 2155 PyUnicode_2BYTE_DATA(s) + len, 2156 result); 2157 } 2158 else { 2159 assert(skind == PyUnicode_1BYTE_KIND); 2160 _PyUnicode_CONVERT_BYTES( 2161 Py_UCS1, Py_UCS4, 2162 PyUnicode_1BYTE_DATA(s), 2163 PyUnicode_1BYTE_DATA(s) + len, 2164 result); 2165 } 2166 return result; 2167 default: 2168 break; 2169 } 2170 PyErr_SetString(PyExc_SystemError, "invalid kind"); 2171 return NULL; 2172} 2173 2174static Py_UCS4* 2175as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize, 2176 int copy_null) 2177{ 2178 int kind; 2179 void *data; 2180 Py_ssize_t len, targetlen; 2181 if (PyUnicode_READY(string) == -1) 2182 return NULL; 2183 kind = PyUnicode_KIND(string); 2184 data = PyUnicode_DATA(string); 2185 len = PyUnicode_GET_LENGTH(string); 2186 targetlen = len; 2187 if (copy_null) 2188 targetlen++; 2189 if (!target) { 2190 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) { 2191 PyErr_NoMemory(); 2192 return NULL; 2193 } 2194 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4)); 2195 if 
(!target) { 2196 PyErr_NoMemory(); 2197 return NULL; 2198 } 2199 } 2200 else { 2201 if (targetsize < targetlen) { 2202 PyErr_Format(PyExc_SystemError, 2203 "string is longer than the buffer"); 2204 if (copy_null && 0 < targetsize) 2205 target[0] = 0; 2206 return NULL; 2207 } 2208 } 2209 if (kind == PyUnicode_1BYTE_KIND) { 2210 Py_UCS1 *start = (Py_UCS1 *) data; 2211 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target); 2212 } 2213 else if (kind == PyUnicode_2BYTE_KIND) { 2214 Py_UCS2 *start = (Py_UCS2 *) data; 2215 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target); 2216 } 2217 else { 2218 assert(kind == PyUnicode_4BYTE_KIND); 2219 Py_MEMCPY(target, data, len * sizeof(Py_UCS4)); 2220 } 2221 if (copy_null) 2222 target[len] = 0; 2223 return target; 2224} 2225 2226Py_UCS4* 2227PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize, 2228 int copy_null) 2229{ 2230 if (target == NULL || targetsize < 0) { 2231 PyErr_BadInternalCall(); 2232 return NULL; 2233 } 2234 return as_ucs4(string, target, targetsize, copy_null); 2235} 2236 2237Py_UCS4* 2238PyUnicode_AsUCS4Copy(PyObject *string) 2239{ 2240 return as_ucs4(string, NULL, 0, 1); 2241} 2242 2243#ifdef HAVE_WCHAR_H 2244 2245PyObject * 2246PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size) 2247{ 2248 if (w == NULL) { 2249 if (size == 0) { 2250 Py_INCREF(unicode_empty); 2251 return unicode_empty; 2252 } 2253 PyErr_BadInternalCall(); 2254 return NULL; 2255 } 2256 2257 if (size == -1) { 2258 size = wcslen(w); 2259 } 2260 2261 return PyUnicode_FromUnicode(w, size); 2262} 2263 2264#endif /* HAVE_WCHAR_H */ 2265 2266static void 2267makefmt(char *fmt, int longflag, int longlongflag, int size_tflag, 2268 int zeropad, int width, int precision, char c) 2269{ 2270 *fmt++ = '%'; 2271 if (width) { 2272 if (zeropad) 2273 *fmt++ = '0'; 2274 fmt += sprintf(fmt, "%d", width); 2275 } 2276 if (precision) 2277 fmt += sprintf(fmt, ".%d", precision); 2278 if (longflag) 2279 
*fmt++ = 'l'; 2280 else if (longlongflag) { 2281 /* longlongflag should only ever be nonzero on machines with 2282 HAVE_LONG_LONG defined */ 2283#ifdef HAVE_LONG_LONG 2284 char *f = PY_FORMAT_LONG_LONG; 2285 while (*f) 2286 *fmt++ = *f++; 2287#else 2288 /* we shouldn't ever get here */ 2289 assert(0); 2290 *fmt++ = 'l'; 2291#endif 2292 } 2293 else if (size_tflag) { 2294 char *f = PY_FORMAT_SIZE_T; 2295 while (*f) 2296 *fmt++ = *f++; 2297 } 2298 *fmt++ = c; 2299 *fmt = '\0'; 2300} 2301 2302/* helper for PyUnicode_FromFormatV() */ 2303 2304static const char* 2305parse_format_flags(const char *f, 2306 int *p_width, int *p_precision, 2307 int *p_longflag, int *p_longlongflag, int *p_size_tflag) 2308{ 2309 int width, precision, longflag, longlongflag, size_tflag; 2310 2311 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */ 2312 f++; 2313 width = 0; 2314 while (Py_ISDIGIT((unsigned)*f)) 2315 width = (width*10) + *f++ - '0'; 2316 precision = 0; 2317 if (*f == '.') { 2318 f++; 2319 while (Py_ISDIGIT((unsigned)*f)) 2320 precision = (precision*10) + *f++ - '0'; 2321 if (*f == '%') { 2322 /* "%.3%s" => f points to "3" */ 2323 f--; 2324 } 2325 } 2326 if (*f == '\0') { 2327 /* bogus format "%.1" => go backward, f points to "1" */ 2328 f--; 2329 } 2330 if (p_width != NULL) 2331 *p_width = width; 2332 if (p_precision != NULL) 2333 *p_precision = precision; 2334 2335 /* Handle %ld, %lu, %lld and %llu. */ 2336 longflag = 0; 2337 longlongflag = 0; 2338 size_tflag = 0; 2339 2340 if (*f == 'l') { 2341 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') { 2342 longflag = 1; 2343 ++f; 2344 } 2345#ifdef HAVE_LONG_LONG 2346 else if (f[1] == 'l' && 2347 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) { 2348 longlongflag = 1; 2349 f += 2; 2350 } 2351#endif 2352 } 2353 /* handle the size_t flag. 
*/ 2354 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) { 2355 size_tflag = 1; 2356 ++f; 2357 } 2358 if (p_longflag != NULL) 2359 *p_longflag = longflag; 2360 if (p_longlongflag != NULL) 2361 *p_longlongflag = longlongflag; 2362 if (p_size_tflag != NULL) 2363 *p_size_tflag = size_tflag; 2364 return f; 2365} 2366 2367/* maximum number of characters required for output of %ld. 21 characters 2368 allows for 64-bit integers (in decimal) and an optional sign. */ 2369#define MAX_LONG_CHARS 21 2370/* maximum number of characters required for output of %lld. 2371 We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits, 2372 plus 1 for the sign. 53/22 is an upper bound for log10(256). */ 2373#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22) 2374 2375PyObject * 2376PyUnicode_FromFormatV(const char *format, va_list vargs) 2377{ 2378 va_list count; 2379 Py_ssize_t callcount = 0; 2380 PyObject **callresults = NULL; 2381 PyObject **callresult = NULL; 2382 Py_ssize_t n = 0; 2383 int width = 0; 2384 int precision = 0; 2385 int zeropad; 2386 const char* f; 2387 PyObject *string; 2388 /* used by sprintf */ 2389 char fmt[61]; /* should be enough for %0width.precisionlld */ 2390 Py_UCS4 maxchar = 127; /* result is ASCII by default */ 2391 Py_UCS4 argmaxchar; 2392 Py_ssize_t numbersize = 0; 2393 char *numberresults = NULL; 2394 char *numberresult = NULL; 2395 Py_ssize_t i; 2396 int kind; 2397 void *data; 2398 2399 Py_VA_COPY(count, vargs); 2400 /* step 1: count the number of %S/%R/%A/%s format specifications 2401 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/ 2402 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the 2403 * result in an array) 2404 * also estimate a upper bound for all the number formats in the string, 2405 * numbers will be formatted in step 3 and be kept in a '\0'-separated 2406 * buffer before putting everything together. 
*/ 2407 for (f = format; *f; f++) { 2408 if (*f == '%') { 2409 int longlongflag; 2410 /* skip width or width.precision (eg. "1.2" of "%1.2f") */ 2411 f = parse_format_flags(f, &width, NULL, NULL, &longlongflag, NULL); 2412 if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V') 2413 ++callcount; 2414 2415 else if (*f == 'd' || *f=='u' || *f=='i' || *f=='x' || *f=='p') { 2416#ifdef HAVE_LONG_LONG 2417 if (longlongflag) { 2418 if (width < MAX_LONG_LONG_CHARS) 2419 width = MAX_LONG_LONG_CHARS; 2420 } 2421 else 2422#endif 2423 /* MAX_LONG_CHARS is enough to hold a 64-bit integer, 2424 including sign. Decimal takes the most space. This 2425 isn't enough for octal. If a width is specified we 2426 need more (which we allocate later). */ 2427 if (width < MAX_LONG_CHARS) 2428 width = MAX_LONG_CHARS; 2429 2430 /* account for the size + '\0' to separate numbers 2431 inside of the numberresults buffer */ 2432 numbersize += (width + 1); 2433 } 2434 } 2435 else if ((unsigned char)*f > 127) { 2436 PyErr_Format(PyExc_ValueError, 2437 "PyUnicode_FromFormatV() expects an ASCII-encoded format " 2438 "string, got a non-ASCII byte: 0x%02x", 2439 (unsigned char)*f); 2440 return NULL; 2441 } 2442 } 2443 /* step 2: allocate memory for the results of 2444 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */ 2445 if (callcount) { 2446 callresults = PyObject_Malloc(sizeof(PyObject *) * callcount); 2447 if (!callresults) { 2448 PyErr_NoMemory(); 2449 return NULL; 2450 } 2451 callresult = callresults; 2452 } 2453 /* step 2.5: allocate memory for the results of formating numbers */ 2454 if (numbersize) { 2455 numberresults = PyObject_Malloc(numbersize); 2456 if (!numberresults) { 2457 PyErr_NoMemory(); 2458 goto fail; 2459 } 2460 numberresult = numberresults; 2461 } 2462 2463 /* step 3: format numbers and figure out how large a buffer we need */ 2464 for (f = format; *f; f++) { 2465 if (*f == '%') { 2466 const char* p; 2467 int longflag; 2468 int longlongflag; 2469 int 
size_tflag; 2470 int numprinted; 2471 2472 p = f; 2473 zeropad = (f[1] == '0'); 2474 f = parse_format_flags(f, &width, &precision, 2475 &longflag, &longlongflag, &size_tflag); 2476 switch (*f) { 2477 case 'c': 2478 { 2479 Py_UCS4 ordinal = va_arg(count, int); 2480 maxchar = MAX_MAXCHAR(maxchar, ordinal); 2481 n++; 2482 break; 2483 } 2484 case '%': 2485 n++; 2486 break; 2487 case 'i': 2488 case 'd': 2489 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad, 2490 width, precision, *f); 2491 if (longflag) 2492 numprinted = sprintf(numberresult, fmt, 2493 va_arg(count, long)); 2494#ifdef HAVE_LONG_LONG 2495 else if (longlongflag) 2496 numprinted = sprintf(numberresult, fmt, 2497 va_arg(count, PY_LONG_LONG)); 2498#endif 2499 else if (size_tflag) 2500 numprinted = sprintf(numberresult, fmt, 2501 va_arg(count, Py_ssize_t)); 2502 else 2503 numprinted = sprintf(numberresult, fmt, 2504 va_arg(count, int)); 2505 n += numprinted; 2506 /* advance by +1 to skip over the '\0' */ 2507 numberresult += (numprinted + 1); 2508 assert(*(numberresult - 1) == '\0'); 2509 assert(*(numberresult - 2) != '\0'); 2510 assert(numprinted >= 0); 2511 assert(numberresult <= numberresults + numbersize); 2512 break; 2513 case 'u': 2514 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad, 2515 width, precision, 'u'); 2516 if (longflag) 2517 numprinted = sprintf(numberresult, fmt, 2518 va_arg(count, unsigned long)); 2519#ifdef HAVE_LONG_LONG 2520 else if (longlongflag) 2521 numprinted = sprintf(numberresult, fmt, 2522 va_arg(count, unsigned PY_LONG_LONG)); 2523#endif 2524 else if (size_tflag) 2525 numprinted = sprintf(numberresult, fmt, 2526 va_arg(count, size_t)); 2527 else 2528 numprinted = sprintf(numberresult, fmt, 2529 va_arg(count, unsigned int)); 2530 n += numprinted; 2531 numberresult += (numprinted + 1); 2532 assert(*(numberresult - 1) == '\0'); 2533 assert(*(numberresult - 2) != '\0'); 2534 assert(numprinted >= 0); 2535 assert(numberresult <= numberresults + numbersize); 2536 break; 
2537 case 'x': 2538 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x'); 2539 numprinted = sprintf(numberresult, fmt, va_arg(count, int)); 2540 n += numprinted; 2541 numberresult += (numprinted + 1); 2542 assert(*(numberresult - 1) == '\0'); 2543 assert(*(numberresult - 2) != '\0'); 2544 assert(numprinted >= 0); 2545 assert(numberresult <= numberresults + numbersize); 2546 break; 2547 case 'p': 2548 numprinted = sprintf(numberresult, "%p", va_arg(count, void*)); 2549 /* %p is ill-defined: ensure leading 0x. */ 2550 if (numberresult[1] == 'X') 2551 numberresult[1] = 'x'; 2552 else if (numberresult[1] != 'x') { 2553 memmove(numberresult + 2, numberresult, 2554 strlen(numberresult) + 1); 2555 numberresult[0] = '0'; 2556 numberresult[1] = 'x'; 2557 numprinted += 2; 2558 } 2559 n += numprinted; 2560 numberresult += (numprinted + 1); 2561 assert(*(numberresult - 1) == '\0'); 2562 assert(*(numberresult - 2) != '\0'); 2563 assert(numprinted >= 0); 2564 assert(numberresult <= numberresults + numbersize); 2565 break; 2566 case 's': 2567 { 2568 /* UTF-8 */ 2569 const char *s = va_arg(count, const char*); 2570 PyObject *str = PyUnicode_DecodeUTF8Stateful(s, strlen(s), "replace", NULL); 2571 if (!str) 2572 goto fail; 2573 /* since PyUnicode_DecodeUTF8 returns already flexible 2574 unicode objects, there is no need to call ready on them */ 2575 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str); 2576 maxchar = MAX_MAXCHAR(maxchar, argmaxchar); 2577 n += PyUnicode_GET_LENGTH(str); 2578 /* Remember the str and switch to the next slot */ 2579 *callresult++ = str; 2580 break; 2581 } 2582 case 'U': 2583 { 2584 PyObject *obj = va_arg(count, PyObject *); 2585 assert(obj && _PyUnicode_CHECK(obj)); 2586 if (PyUnicode_READY(obj) == -1) 2587 goto fail; 2588 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj); 2589 maxchar = MAX_MAXCHAR(maxchar, argmaxchar); 2590 n += PyUnicode_GET_LENGTH(obj); 2591 break; 2592 } 2593 case 'V': 2594 { 2595 PyObject *obj = va_arg(count, PyObject *); 2596 const char *str = 
va_arg(count, const char *); 2597 PyObject *str_obj; 2598 assert(obj || str); 2599 assert(!obj || _PyUnicode_CHECK(obj)); 2600 if (obj) { 2601 if (PyUnicode_READY(obj) == -1) 2602 goto fail; 2603 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj); 2604 maxchar = MAX_MAXCHAR(maxchar, argmaxchar); 2605 n += PyUnicode_GET_LENGTH(obj); 2606 *callresult++ = NULL; 2607 } 2608 else { 2609 str_obj = PyUnicode_DecodeUTF8Stateful(str, strlen(str), "replace", NULL); 2610 if (!str_obj) 2611 goto fail; 2612 if (PyUnicode_READY(str_obj) == -1) { 2613 Py_DECREF(str_obj); 2614 goto fail; 2615 } 2616 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str_obj); 2617 maxchar = MAX_MAXCHAR(maxchar, argmaxchar); 2618 n += PyUnicode_GET_LENGTH(str_obj); 2619 *callresult++ = str_obj; 2620 } 2621 break; 2622 } 2623 case 'S': 2624 { 2625 PyObject *obj = va_arg(count, PyObject *); 2626 PyObject *str; 2627 assert(obj); 2628 str = PyObject_Str(obj); 2629 if (!str) 2630 goto fail; 2631 if (PyUnicode_READY(str) == -1) { 2632 Py_DECREF(str); 2633 goto fail; 2634 } 2635 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str); 2636 maxchar = MAX_MAXCHAR(maxchar, argmaxchar); 2637 n += PyUnicode_GET_LENGTH(str); 2638 /* Remember the str and switch to the next slot */ 2639 *callresult++ = str; 2640 break; 2641 } 2642 case 'R': 2643 { 2644 PyObject *obj = va_arg(count, PyObject *); 2645 PyObject *repr; 2646 assert(obj); 2647 repr = PyObject_Repr(obj); 2648 if (!repr) 2649 goto fail; 2650 if (PyUnicode_READY(repr) == -1) { 2651 Py_DECREF(repr); 2652 goto fail; 2653 } 2654 argmaxchar = PyUnicode_MAX_CHAR_VALUE(repr); 2655 maxchar = MAX_MAXCHAR(maxchar, argmaxchar); 2656 n += PyUnicode_GET_LENGTH(repr); 2657 /* Remember the repr and switch to the next slot */ 2658 *callresult++ = repr; 2659 break; 2660 } 2661 case 'A': 2662 { 2663 PyObject *obj = va_arg(count, PyObject *); 2664 PyObject *ascii; 2665 assert(obj); 2666 ascii = PyObject_ASCII(obj); 2667 if (!ascii) 2668 goto fail; 2669 if (PyUnicode_READY(ascii) == -1) { 2670 
Py_DECREF(ascii); 2671 goto fail; 2672 } 2673 argmaxchar = PyUnicode_MAX_CHAR_VALUE(ascii); 2674 maxchar = MAX_MAXCHAR(maxchar, argmaxchar); 2675 n += PyUnicode_GET_LENGTH(ascii); 2676 /* Remember the repr and switch to the next slot */ 2677 *callresult++ = ascii; 2678 break; 2679 } 2680 default: 2681 /* if we stumble upon an unknown 2682 formatting code, copy the rest of 2683 the format string to the output 2684 string. (we cannot just skip the 2685 code, since there's no way to know 2686 what's in the argument list) */ 2687 n += strlen(p); 2688 goto expand; 2689 } 2690 } else 2691 n++; 2692 } 2693 expand: 2694 /* step 4: fill the buffer */ 2695 /* Since we've analyzed how much space we need, 2696 we don't have to resize the string. 2697 There can be no errors beyond this point. */ 2698 string = PyUnicode_New(n, maxchar); 2699 if (!string) 2700 goto fail; 2701 kind = PyUnicode_KIND(string); 2702 data = PyUnicode_DATA(string); 2703 callresult = callresults; 2704 numberresult = numberresults; 2705 2706 for (i = 0, f = format; *f; f++) { 2707 if (*f == '%') { 2708 const char* p; 2709 2710 p = f; 2711 f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL); 2712 /* checking for == because the last argument could be a empty 2713 string, which causes i to point to end, the assert at the end of 2714 the loop */ 2715 assert(i <= PyUnicode_GET_LENGTH(string)); 2716 2717 switch (*f) { 2718 case 'c': 2719 { 2720 const int ordinal = va_arg(vargs, int); 2721 PyUnicode_WRITE(kind, data, i++, ordinal); 2722 break; 2723 } 2724 case 'i': 2725 case 'd': 2726 case 'u': 2727 case 'x': 2728 case 'p': 2729 { 2730 Py_ssize_t len; 2731 /* unused, since we already have the result */ 2732 if (*f == 'p') 2733 (void) va_arg(vargs, void *); 2734 else 2735 (void) va_arg(vargs, int); 2736 /* extract the result from numberresults and append. 
*/ 2737 len = strlen(numberresult); 2738 unicode_write_cstr(string, i, numberresult, len); 2739 /* skip over the separating '\0' */ 2740 i += len; 2741 numberresult += len; 2742 assert(*numberresult == '\0'); 2743 numberresult++; 2744 assert(numberresult <= numberresults + numbersize); 2745 break; 2746 } 2747 case 's': 2748 { 2749 /* unused, since we already have the result */ 2750 Py_ssize_t size; 2751 (void) va_arg(vargs, char *); 2752 size = PyUnicode_GET_LENGTH(*callresult); 2753 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string)); 2754 _PyUnicode_FastCopyCharacters(string, i, *callresult, 0, size); 2755 i += size; 2756 /* We're done with the unicode()/repr() => forget it */ 2757 Py_DECREF(*callresult); 2758 /* switch to next unicode()/repr() result */ 2759 ++callresult; 2760 break; 2761 } 2762 case 'U': 2763 { 2764 PyObject *obj = va_arg(vargs, PyObject *); 2765 Py_ssize_t size; 2766 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string)); 2767 size = PyUnicode_GET_LENGTH(obj); 2768 _PyUnicode_FastCopyCharacters(string, i, obj, 0, size); 2769 i += size; 2770 break; 2771 } 2772 case 'V': 2773 { 2774 Py_ssize_t size; 2775 PyObject *obj = va_arg(vargs, PyObject *); 2776 va_arg(vargs, const char *); 2777 if (obj) { 2778 size = PyUnicode_GET_LENGTH(obj); 2779 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string)); 2780 _PyUnicode_FastCopyCharacters(string, i, obj, 0, size); 2781 i += size; 2782 } else { 2783 size = PyUnicode_GET_LENGTH(*callresult); 2784 assert(PyUnicode_KIND(*callresult) <= 2785 PyUnicode_KIND(string)); 2786 _PyUnicode_FastCopyCharacters(string, i, *callresult, 0, size); 2787 i += size; 2788 Py_DECREF(*callresult); 2789 } 2790 ++callresult; 2791 break; 2792 } 2793 case 'S': 2794 case 'R': 2795 case 'A': 2796 { 2797 Py_ssize_t size = PyUnicode_GET_LENGTH(*callresult); 2798 /* unused, since we already have the result */ 2799 (void) va_arg(vargs, PyObject *); 2800 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string)); 2801 
_PyUnicode_FastCopyCharacters(string, i, *callresult, 0, size); 2802 i += size; 2803 /* We're done with the unicode()/repr() => forget it */ 2804 Py_DECREF(*callresult); 2805 /* switch to next unicode()/repr() result */ 2806 ++callresult; 2807 break; 2808 } 2809 case '%': 2810 PyUnicode_WRITE(kind, data, i++, '%'); 2811 break; 2812 default: 2813 { 2814 Py_ssize_t len = strlen(p); 2815 unicode_write_cstr(string, i, p, len); 2816 i += len; 2817 assert(i == PyUnicode_GET_LENGTH(string)); 2818 goto end; 2819 } 2820 } 2821 } 2822 else { 2823 assert(i < PyUnicode_GET_LENGTH(string)); 2824 PyUnicode_WRITE(kind, data, i++, *f); 2825 } 2826 } 2827 assert(i == PyUnicode_GET_LENGTH(string)); 2828 2829 end: 2830 if (callresults) 2831 PyObject_Free(callresults); 2832 if (numberresults) 2833 PyObject_Free(numberresults); 2834 return unicode_result(string); 2835 fail: 2836 if (callresults) { 2837 PyObject **callresult2 = callresults; 2838 while (callresult2 < callresult) { 2839 Py_XDECREF(*callresult2); 2840 ++callresult2; 2841 } 2842 PyObject_Free(callresults); 2843 } 2844 if (numberresults) 2845 PyObject_Free(numberresults); 2846 return NULL; 2847} 2848 2849PyObject * 2850PyUnicode_FromFormat(const char *format, ...) 2851{ 2852 PyObject* ret; 2853 va_list vargs; 2854 2855#ifdef HAVE_STDARG_PROTOTYPES 2856 va_start(vargs, format); 2857#else 2858 va_start(vargs); 2859#endif 2860 ret = PyUnicode_FromFormatV(format, vargs); 2861 va_end(vargs); 2862 return ret; 2863} 2864 2865#ifdef HAVE_WCHAR_H 2866 2867/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString(): 2868 convert a Unicode object to a wide character string. 2869 2870 - If w is NULL: return the number of wide characters (including the null 2871 character) required to convert the unicode object. Ignore size argument. 2872 2873 - Otherwise: return the number of wide characters (excluding the null 2874 character) written into w. Write at most size wide characters (including 2875 the null character). 
*/ 2876static Py_ssize_t 2877unicode_aswidechar(PyObject *unicode, 2878 wchar_t *w, 2879 Py_ssize_t size) 2880{ 2881 Py_ssize_t res; 2882 const wchar_t *wstr; 2883 2884 wstr = PyUnicode_AsUnicodeAndSize(unicode, &res); 2885 if (wstr == NULL) 2886 return -1; 2887 2888 if (w != NULL) { 2889 if (size > res) 2890 size = res + 1; 2891 else 2892 res = size; 2893 Py_MEMCPY(w, wstr, size * sizeof(wchar_t)); 2894 return res; 2895 } 2896 else 2897 return res + 1; 2898} 2899 2900Py_ssize_t 2901PyUnicode_AsWideChar(PyObject *unicode, 2902 wchar_t *w, 2903 Py_ssize_t size) 2904{ 2905 if (unicode == NULL) { 2906 PyErr_BadInternalCall(); 2907 return -1; 2908 } 2909 return unicode_aswidechar(unicode, w, size); 2910} 2911 2912wchar_t* 2913PyUnicode_AsWideCharString(PyObject *unicode, 2914 Py_ssize_t *size) 2915{ 2916 wchar_t* buffer; 2917 Py_ssize_t buflen; 2918 2919 if (unicode == NULL) { 2920 PyErr_BadInternalCall(); 2921 return NULL; 2922 } 2923 2924 buflen = unicode_aswidechar(unicode, NULL, 0); 2925 if (buflen == -1) 2926 return NULL; 2927 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) { 2928 PyErr_NoMemory(); 2929 return NULL; 2930 } 2931 2932 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t)); 2933 if (buffer == NULL) { 2934 PyErr_NoMemory(); 2935 return NULL; 2936 } 2937 buflen = unicode_aswidechar(unicode, buffer, buflen); 2938 if (buflen == -1) { 2939 PyMem_FREE(buffer); 2940 return NULL; 2941 } 2942 if (size != NULL) 2943 *size = buflen; 2944 return buffer; 2945} 2946 2947#endif /* HAVE_WCHAR_H */ 2948 2949PyObject * 2950PyUnicode_FromOrdinal(int ordinal) 2951{ 2952 PyObject *v; 2953 if (ordinal < 0 || ordinal > MAX_UNICODE) { 2954 PyErr_SetString(PyExc_ValueError, 2955 "chr() arg not in range(0x110000)"); 2956 return NULL; 2957 } 2958 2959 if (ordinal < 256) 2960 return get_latin1_char(ordinal); 2961 2962 v = PyUnicode_New(1, ordinal); 2963 if (v == NULL) 2964 return NULL; 2965 PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal); 2966 
assert(_PyUnicode_CheckConsistency(v, 1)); 2967 return v; 2968} 2969 2970PyObject * 2971PyUnicode_FromObject(register PyObject *obj) 2972{ 2973 /* XXX Perhaps we should make this API an alias of 2974 PyObject_Str() instead ?! */ 2975 if (PyUnicode_CheckExact(obj)) { 2976 if (PyUnicode_READY(obj) == -1) 2977 return NULL; 2978 Py_INCREF(obj); 2979 return obj; 2980 } 2981 if (PyUnicode_Check(obj)) { 2982 /* For a Unicode subtype that's not a Unicode object, 2983 return a true Unicode object with the same data. */ 2984 return _PyUnicode_Copy(obj); 2985 } 2986 PyErr_Format(PyExc_TypeError, 2987 "Can't convert '%.100s' object to str implicitly", 2988 Py_TYPE(obj)->tp_name); 2989 return NULL; 2990} 2991 2992PyObject * 2993PyUnicode_FromEncodedObject(register PyObject *obj, 2994 const char *encoding, 2995 const char *errors) 2996{ 2997 Py_buffer buffer; 2998 PyObject *v; 2999 3000 if (obj == NULL) { 3001 PyErr_BadInternalCall(); 3002 return NULL; 3003 } 3004 3005 /* Decoding bytes objects is the most common case and should be fast */ 3006 if (PyBytes_Check(obj)) { 3007 if (PyBytes_GET_SIZE(obj) == 0) { 3008 Py_INCREF(unicode_empty); 3009 v = unicode_empty; 3010 } 3011 else { 3012 v = PyUnicode_Decode( 3013 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj), 3014 encoding, errors); 3015 } 3016 return v; 3017 } 3018 3019 if (PyUnicode_Check(obj)) { 3020 PyErr_SetString(PyExc_TypeError, 3021 "decoding str is not supported"); 3022 return NULL; 3023 } 3024 3025 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */ 3026 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) { 3027 PyErr_Format(PyExc_TypeError, 3028 "coercing to str: need bytes, bytearray " 3029 "or buffer-like object, %.80s found", 3030 Py_TYPE(obj)->tp_name); 3031 return NULL; 3032 } 3033 3034 if (buffer.len == 0) { 3035 Py_INCREF(unicode_empty); 3036 v = unicode_empty; 3037 } 3038 else 3039 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors); 3040 3041 
PyBuffer_Release(&buffer); 3042 return v; 3043} 3044 3045/* Convert encoding to lower case and replace '_' with '-' in order to 3046 catch e.g. UTF_8. Return 0 on error (encoding is longer than lower_len-1), 3047 1 on success. */ 3048static int 3049normalize_encoding(const char *encoding, 3050 char *lower, 3051 size_t lower_len) 3052{ 3053 const char *e; 3054 char *l; 3055 char *l_end; 3056 3057 if (encoding == NULL) { 3058 strcpy(lower, "utf-8"); 3059 return 1; 3060 } 3061 e = encoding; 3062 l = lower; 3063 l_end = &lower[lower_len - 1]; 3064 while (*e) { 3065 if (l == l_end) 3066 return 0; 3067 if (Py_ISUPPER(*e)) { 3068 *l++ = Py_TOLOWER(*e++); 3069 } 3070 else if (*e == '_') { 3071 *l++ = '-'; 3072 e++; 3073 } 3074 else { 3075 *l++ = *e++; 3076 } 3077 } 3078 *l = '\0'; 3079 return 1; 3080} 3081 3082PyObject * 3083PyUnicode_Decode(const char *s, 3084 Py_ssize_t size, 3085 const char *encoding, 3086 const char *errors) 3087{ 3088 PyObject *buffer = NULL, *unicode; 3089 Py_buffer info; 3090 char lower[11]; /* Enough for any encoding shortcut */ 3091 3092 /* Shortcuts for common default encodings */ 3093 if (normalize_encoding(encoding, lower, sizeof(lower))) { 3094 if ((strcmp(lower, "utf-8") == 0) || 3095 (strcmp(lower, "utf8") == 0)) 3096 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL); 3097 else if ((strcmp(lower, "latin-1") == 0) || 3098 (strcmp(lower, "latin1") == 0) || 3099 (strcmp(lower, "iso-8859-1") == 0)) 3100 return PyUnicode_DecodeLatin1(s, size, errors); 3101#ifdef HAVE_MBCS 3102 else if (strcmp(lower, "mbcs") == 0) 3103 return PyUnicode_DecodeMBCS(s, size, errors); 3104#endif 3105 else if (strcmp(lower, "ascii") == 0) 3106 return PyUnicode_DecodeASCII(s, size, errors); 3107 else if (strcmp(lower, "utf-16") == 0) 3108 return PyUnicode_DecodeUTF16(s, size, errors, 0); 3109 else if (strcmp(lower, "utf-32") == 0) 3110 return PyUnicode_DecodeUTF32(s, size, errors, 0); 3111 } 3112 3113 /* Decode via the codec registry */ 3114 buffer = NULL; 3115 
if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0) 3116 goto onError; 3117 buffer = PyMemoryView_FromBuffer(&info); 3118 if (buffer == NULL) 3119 goto onError; 3120 unicode = PyCodec_Decode(buffer, encoding, errors); 3121 if (unicode == NULL) 3122 goto onError; 3123 if (!PyUnicode_Check(unicode)) { 3124 PyErr_Format(PyExc_TypeError, 3125 "decoder did not return a str object (type=%.400s)", 3126 Py_TYPE(unicode)->tp_name); 3127 Py_DECREF(unicode); 3128 goto onError; 3129 } 3130 Py_DECREF(buffer); 3131 return unicode_result(unicode); 3132 3133 onError: 3134 Py_XDECREF(buffer); 3135 return NULL; 3136} 3137 3138PyObject * 3139PyUnicode_AsDecodedObject(PyObject *unicode, 3140 const char *encoding, 3141 const char *errors) 3142{ 3143 PyObject *v; 3144 3145 if (!PyUnicode_Check(unicode)) { 3146 PyErr_BadArgument(); 3147 goto onError; 3148 } 3149 3150 if (encoding == NULL) 3151 encoding = PyUnicode_GetDefaultEncoding(); 3152 3153 /* Decode via the codec registry */ 3154 v = PyCodec_Decode(unicode, encoding, errors); 3155 if (v == NULL) 3156 goto onError; 3157 return unicode_result(v); 3158 3159 onError: 3160 return NULL; 3161} 3162 3163PyObject * 3164PyUnicode_AsDecodedUnicode(PyObject *unicode, 3165 const char *encoding, 3166 const char *errors) 3167{ 3168 PyObject *v; 3169 3170 if (!PyUnicode_Check(unicode)) { 3171 PyErr_BadArgument(); 3172 goto onError; 3173 } 3174 3175 if (encoding == NULL) 3176 encoding = PyUnicode_GetDefaultEncoding(); 3177 3178 /* Decode via the codec registry */ 3179 v = PyCodec_Decode(unicode, encoding, errors); 3180 if (v == NULL) 3181 goto onError; 3182 if (!PyUnicode_Check(v)) { 3183 PyErr_Format(PyExc_TypeError, 3184 "decoder did not return a str object (type=%.400s)", 3185 Py_TYPE(v)->tp_name); 3186 Py_DECREF(v); 3187 goto onError; 3188 } 3189 return unicode_result(v); 3190 3191 onError: 3192 return NULL; 3193} 3194 3195PyObject * 3196PyUnicode_Encode(const Py_UNICODE *s, 3197 Py_ssize_t size, 3198 const char 
*encoding, 3199 const char *errors) 3200{ 3201 PyObject *v, *unicode; 3202 3203 unicode = PyUnicode_FromUnicode(s, size); 3204 if (unicode == NULL) 3205 return NULL; 3206 v = PyUnicode_AsEncodedString(unicode, encoding, errors); 3207 Py_DECREF(unicode); 3208 return v; 3209} 3210 3211PyObject * 3212PyUnicode_AsEncodedObject(PyObject *unicode, 3213 const char *encoding, 3214 const char *errors) 3215{ 3216 PyObject *v; 3217 3218 if (!PyUnicode_Check(unicode)) { 3219 PyErr_BadArgument(); 3220 goto onError; 3221 } 3222 3223 if (encoding == NULL) 3224 encoding = PyUnicode_GetDefaultEncoding(); 3225 3226 /* Encode via the codec registry */ 3227 v = PyCodec_Encode(unicode, encoding, errors); 3228 if (v == NULL) 3229 goto onError; 3230 return v; 3231 3232 onError: 3233 return NULL; 3234} 3235 3236static size_t 3237wcstombs_errorpos(const wchar_t *wstr) 3238{ 3239 size_t len; 3240#if SIZEOF_WCHAR_T == 2 3241 wchar_t buf[3]; 3242#else 3243 wchar_t buf[2]; 3244#endif 3245 char outbuf[MB_LEN_MAX]; 3246 const wchar_t *start, *previous; 3247 3248#if SIZEOF_WCHAR_T == 2 3249 buf[2] = 0; 3250#else 3251 buf[1] = 0; 3252#endif 3253 start = wstr; 3254 while (*wstr != L'\0') 3255 { 3256 previous = wstr; 3257#if SIZEOF_WCHAR_T == 2 3258 if (Py_UNICODE_IS_HIGH_SURROGATE(wstr[0]) 3259 && Py_UNICODE_IS_LOW_SURROGATE(wstr[1])) 3260 { 3261 buf[0] = wstr[0]; 3262 buf[1] = wstr[1]; 3263 wstr += 2; 3264 } 3265 else { 3266 buf[0] = *wstr; 3267 buf[1] = 0; 3268 wstr++; 3269 } 3270#else 3271 buf[0] = *wstr; 3272 wstr++; 3273#endif 3274 len = wcstombs(outbuf, buf, sizeof(outbuf)); 3275 if (len == (size_t)-1) 3276 return previous - start; 3277 } 3278 3279 /* failed to find the unencodable character */ 3280 return 0; 3281} 3282 3283static int 3284locale_error_handler(const char *errors, int *surrogateescape) 3285{ 3286 if (errors == NULL) { 3287 *surrogateescape = 0; 3288 return 0; 3289 } 3290 3291 if (strcmp(errors, "strict") == 0) { 3292 *surrogateescape = 0; 3293 return 0; 3294 } 3295 if 
(strcmp(errors, "surrogateescape") == 0) { 3296 *surrogateescape = 1; 3297 return 0; 3298 } 3299 PyErr_Format(PyExc_ValueError, 3300 "only 'strict' and 'surrogateescape' error handlers " 3301 "are supported, not '%s'", 3302 errors); 3303 return -1; 3304} 3305 3306PyObject * 3307PyUnicode_EncodeLocale(PyObject *unicode, const char *errors) 3308{ 3309 Py_ssize_t wlen, wlen2; 3310 wchar_t *wstr; 3311 PyObject *bytes = NULL; 3312 char *errmsg; 3313 PyObject *reason; 3314 PyObject *exc; 3315 size_t error_pos; 3316 int surrogateescape; 3317 3318 if (locale_error_handler(errors, &surrogateescape) < 0) 3319 return NULL; 3320 3321 wstr = PyUnicode_AsWideCharString(unicode, &wlen); 3322 if (wstr == NULL) 3323 return NULL; 3324 3325 wlen2 = wcslen(wstr); 3326 if (wlen2 != wlen) { 3327 PyMem_Free(wstr); 3328 PyErr_SetString(PyExc_TypeError, "embedded null character"); 3329 return NULL; 3330 } 3331 3332 if (surrogateescape) { 3333 /* locale encoding with surrogateescape */ 3334 char *str; 3335 3336 str = _Py_wchar2char(wstr, &error_pos); 3337 if (str == NULL) { 3338 if (error_pos == (size_t)-1) { 3339 PyErr_NoMemory(); 3340 PyMem_Free(wstr); 3341 return NULL; 3342 } 3343 else { 3344 goto encode_error; 3345 } 3346 } 3347 PyMem_Free(wstr); 3348 3349 bytes = PyBytes_FromString(str); 3350 PyMem_Free(str); 3351 } 3352 else { 3353 size_t len, len2; 3354 3355 len = wcstombs(NULL, wstr, 0); 3356 if (len == (size_t)-1) { 3357 error_pos = (size_t)-1; 3358 goto encode_error; 3359 } 3360 3361 bytes = PyBytes_FromStringAndSize(NULL, len); 3362 if (bytes == NULL) { 3363 PyMem_Free(wstr); 3364 return NULL; 3365 } 3366 3367 len2 = wcstombs(PyBytes_AS_STRING(bytes), wstr, len+1); 3368 if (len2 == (size_t)-1 || len2 > len) { 3369 error_pos = (size_t)-1; 3370 goto encode_error; 3371 } 3372 PyMem_Free(wstr); 3373 } 3374 return bytes; 3375 3376encode_error: 3377 errmsg = strerror(errno); 3378 assert(errmsg != NULL); 3379 3380 if (error_pos == (size_t)-1) 3381 error_pos = wcstombs_errorpos(wstr); 
3382 3383 PyMem_Free(wstr); 3384 Py_XDECREF(bytes); 3385 3386 if (errmsg != NULL) { 3387 size_t errlen; 3388 wstr = _Py_char2wchar(errmsg, &errlen); 3389 if (wstr != NULL) { 3390 reason = PyUnicode_FromWideChar(wstr, errlen); 3391 PyMem_Free(wstr); 3392 } else 3393 errmsg = NULL; 3394 } 3395 if (errmsg == NULL) 3396 reason = PyUnicode_FromString( 3397 "wcstombs() encountered an unencodable " 3398 "wide character"); 3399 if (reason == NULL) 3400 return NULL; 3401 3402 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnnO", 3403 "locale", unicode, 3404 (Py_ssize_t)error_pos, 3405 (Py_ssize_t)(error_pos+1), 3406 reason); 3407 Py_DECREF(reason); 3408 if (exc != NULL) { 3409 PyCodec_StrictErrors(exc); 3410 Py_XDECREF(exc); 3411 } 3412 return NULL; 3413} 3414 3415PyObject * 3416PyUnicode_EncodeFSDefault(PyObject *unicode) 3417{ 3418#ifdef HAVE_MBCS 3419 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL); 3420#elif defined(__APPLE__) 3421 return _PyUnicode_AsUTF8String(unicode, "surrogateescape"); 3422#else 3423 PyInterpreterState *interp = PyThreadState_GET()->interp; 3424 /* Bootstrap check: if the filesystem codec is implemented in Python, we 3425 cannot use it to encode and decode filenames before it is loaded. Load 3426 the Python codec requires to encode at least its own filename. Use the C 3427 version of the locale codec until the codec registry is initialized and 3428 the Python codec is loaded. 3429 3430 Py_FileSystemDefaultEncoding is shared between all interpreters, we 3431 cannot only rely on it: check also interp->fscodec_initialized for 3432 subinterpreters. 
*/ 3433 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) { 3434 return PyUnicode_AsEncodedString(unicode, 3435 Py_FileSystemDefaultEncoding, 3436 "surrogateescape"); 3437 } 3438 else { 3439 return PyUnicode_EncodeLocale(unicode, "surrogateescape"); 3440 } 3441#endif 3442} 3443 3444PyObject * 3445PyUnicode_AsEncodedString(PyObject *unicode, 3446 const char *encoding, 3447 const char *errors) 3448{ 3449 PyObject *v; 3450 char lower[11]; /* Enough for any encoding shortcut */ 3451 3452 if (!PyUnicode_Check(unicode)) { 3453 PyErr_BadArgument(); 3454 return NULL; 3455 } 3456 3457 /* Shortcuts for common default encodings */ 3458 if (normalize_encoding(encoding, lower, sizeof(lower))) { 3459 if ((strcmp(lower, "utf-8") == 0) || 3460 (strcmp(lower, "utf8") == 0)) 3461 { 3462 if (errors == NULL || strcmp(errors, "strict") == 0) 3463 return _PyUnicode_AsUTF8String(unicode, NULL); 3464 else 3465 return _PyUnicode_AsUTF8String(unicode, errors); 3466 } 3467 else if ((strcmp(lower, "latin-1") == 0) || 3468 (strcmp(lower, "latin1") == 0) || 3469 (strcmp(lower, "iso-8859-1") == 0)) 3470 return _PyUnicode_AsLatin1String(unicode, errors); 3471#ifdef HAVE_MBCS 3472 else if (strcmp(lower, "mbcs") == 0) 3473 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors); 3474#endif 3475 else if (strcmp(lower, "ascii") == 0) 3476 return _PyUnicode_AsASCIIString(unicode, errors); 3477 } 3478 3479 /* Encode via the codec registry */ 3480 v = PyCodec_Encode(unicode, encoding, errors); 3481 if (v == NULL) 3482 return NULL; 3483 3484 /* The normal path */ 3485 if (PyBytes_Check(v)) 3486 return v; 3487 3488 /* If the codec returns a buffer, raise a warning and convert to bytes */ 3489 if (PyByteArray_Check(v)) { 3490 int error; 3491 PyObject *b; 3492 3493 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1, 3494 "encoder %s returned bytearray instead of bytes", 3495 encoding); 3496 if (error) { 3497 Py_DECREF(v); 3498 return NULL; 3499 } 3500 3501 b = 
PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v)); 3502 Py_DECREF(v); 3503 return b; 3504 } 3505 3506 PyErr_Format(PyExc_TypeError, 3507 "encoder did not return a bytes object (type=%.400s)", 3508 Py_TYPE(v)->tp_name); 3509 Py_DECREF(v); 3510 return NULL; 3511} 3512 3513PyObject * 3514PyUnicode_AsEncodedUnicode(PyObject *unicode, 3515 const char *encoding, 3516 const char *errors) 3517{ 3518 PyObject *v; 3519 3520 if (!PyUnicode_Check(unicode)) { 3521 PyErr_BadArgument(); 3522 goto onError; 3523 } 3524 3525 if (encoding == NULL) 3526 encoding = PyUnicode_GetDefaultEncoding(); 3527 3528 /* Encode via the codec registry */ 3529 v = PyCodec_Encode(unicode, encoding, errors); 3530 if (v == NULL) 3531 goto onError; 3532 if (!PyUnicode_Check(v)) { 3533 PyErr_Format(PyExc_TypeError, 3534 "encoder did not return an str object (type=%.400s)", 3535 Py_TYPE(v)->tp_name); 3536 Py_DECREF(v); 3537 goto onError; 3538 } 3539 return v; 3540 3541 onError: 3542 return NULL; 3543} 3544 3545static size_t 3546mbstowcs_errorpos(const char *str, size_t len) 3547{ 3548#ifdef HAVE_MBRTOWC 3549 const char *start = str; 3550 mbstate_t mbs; 3551 size_t converted; 3552 wchar_t ch; 3553 3554 memset(&mbs, 0, sizeof mbs); 3555 while (len) 3556 { 3557 converted = mbrtowc(&ch, (char*)str, len, &mbs); 3558 if (converted == 0) 3559 /* Reached end of string */ 3560 break; 3561 if (converted == (size_t)-1 || converted == (size_t)-2) { 3562 /* Conversion error or incomplete character */ 3563 return str - start; 3564 } 3565 else { 3566 str += converted; 3567 len -= converted; 3568 } 3569 } 3570 /* failed to find the undecodable byte sequence */ 3571 return 0; 3572#endif 3573 return 0; 3574} 3575 3576PyObject* 3577PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len, 3578 const char *errors) 3579{ 3580 wchar_t smallbuf[256]; 3581 size_t smallbuf_len = Py_ARRAY_LENGTH(smallbuf); 3582 wchar_t *wstr; 3583 size_t wlen, wlen2; 3584 PyObject *unicode; 3585 int surrogateescape; 3586 size_t 
error_pos; 3587 char *errmsg; 3588 PyObject *reason, *exc; 3589 3590 if (locale_error_handler(errors, &surrogateescape) < 0) 3591 return NULL; 3592 3593 if (str[len] != '\0' || len != strlen(str)) { 3594 PyErr_SetString(PyExc_TypeError, "embedded null character"); 3595 return NULL; 3596 } 3597 3598 if (surrogateescape) 3599 { 3600 wstr = _Py_char2wchar(str, &wlen); 3601 if (wstr == NULL) { 3602 if (wlen == (size_t)-1) 3603 PyErr_NoMemory(); 3604 else 3605 PyErr_SetFromErrno(PyExc_OSError); 3606 return NULL; 3607 } 3608 3609 unicode = PyUnicode_FromWideChar(wstr, wlen); 3610 PyMem_Free(wstr); 3611 } 3612 else { 3613#ifndef HAVE_BROKEN_MBSTOWCS 3614 wlen = mbstowcs(NULL, str, 0); 3615#else 3616 wlen = len; 3617#endif 3618 if (wlen == (size_t)-1) 3619 goto decode_error; 3620 if (wlen+1 <= smallbuf_len) { 3621 wstr = smallbuf; 3622 } 3623 else { 3624 if (wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) 3625 return PyErr_NoMemory(); 3626 3627 wstr = PyMem_Malloc((wlen+1) * sizeof(wchar_t)); 3628 if (!wstr) 3629 return PyErr_NoMemory(); 3630 } 3631 3632 /* This shouldn't fail now */ 3633 wlen2 = mbstowcs(wstr, str, wlen+1); 3634 if (wlen2 == (size_t)-1) { 3635 if (wstr != smallbuf) 3636 PyMem_Free(wstr); 3637 goto decode_error; 3638 } 3639#ifdef HAVE_BROKEN_MBSTOWCS 3640 assert(wlen2 == wlen); 3641#endif 3642 unicode = PyUnicode_FromWideChar(wstr, wlen2); 3643 if (wstr != smallbuf) 3644 PyMem_Free(wstr); 3645 } 3646 return unicode; 3647 3648decode_error: 3649 errmsg = strerror(errno); 3650 assert(errmsg != NULL); 3651 3652 error_pos = mbstowcs_errorpos(str, len); 3653 if (errmsg != NULL) { 3654 size_t errlen; 3655 wstr = _Py_char2wchar(errmsg, &errlen); 3656 if (wstr != NULL) { 3657 reason = PyUnicode_FromWideChar(wstr, errlen); 3658 PyMem_Free(wstr); 3659 } else 3660 errmsg = NULL; 3661 } 3662 if (errmsg == NULL) 3663 reason = PyUnicode_FromString( 3664 "mbstowcs() encountered an invalid multibyte sequence"); 3665 if (reason == NULL) 3666 return NULL; 3667 3668 exc = 
PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nnO", 3669 "locale", str, len, 3670 (Py_ssize_t)error_pos, 3671 (Py_ssize_t)(error_pos+1), 3672 reason); 3673 Py_DECREF(reason); 3674 if (exc != NULL) { 3675 PyCodec_StrictErrors(exc); 3676 Py_XDECREF(exc); 3677 } 3678 return NULL; 3679} 3680 3681PyObject* 3682PyUnicode_DecodeLocale(const char *str, const char *errors) 3683{ 3684 Py_ssize_t size = (Py_ssize_t)strlen(str); 3685 return PyUnicode_DecodeLocaleAndSize(str, size, errors); 3686} 3687 3688 3689PyObject* 3690PyUnicode_DecodeFSDefault(const char *s) { 3691 Py_ssize_t size = (Py_ssize_t)strlen(s); 3692 return PyUnicode_DecodeFSDefaultAndSize(s, size); 3693} 3694 3695PyObject* 3696PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size) 3697{ 3698#ifdef HAVE_MBCS 3699 return PyUnicode_DecodeMBCS(s, size, NULL); 3700#elif defined(__APPLE__) 3701 return PyUnicode_DecodeUTF8Stateful(s, size, "surrogateescape", NULL); 3702#else 3703 PyInterpreterState *interp = PyThreadState_GET()->interp; 3704 /* Bootstrap check: if the filesystem codec is implemented in Python, we 3705 cannot use it to encode and decode filenames before it is loaded. Load 3706 the Python codec requires to encode at least its own filename. Use the C 3707 version of the locale codec until the codec registry is initialized and 3708 the Python codec is loaded. 3709 3710 Py_FileSystemDefaultEncoding is shared between all interpreters, we 3711 cannot only rely on it: check also interp->fscodec_initialized for 3712 subinterpreters. 
*/ 3713 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) { 3714 return PyUnicode_Decode(s, size, 3715 Py_FileSystemDefaultEncoding, 3716 "surrogateescape"); 3717 } 3718 else { 3719 return PyUnicode_DecodeLocaleAndSize(s, size, "surrogateescape"); 3720 } 3721#endif 3722} 3723 3724 3725int 3726_PyUnicode_HasNULChars(PyObject* s) 3727{ 3728 static PyObject *nul = NULL; 3729 3730 if (nul == NULL) 3731 nul = PyUnicode_FromStringAndSize("\0", 1); 3732 if (nul == NULL) 3733 return -1; 3734 return PyUnicode_Contains(s, nul); 3735} 3736 3737 3738int 3739PyUnicode_FSConverter(PyObject* arg, void* addr) 3740{ 3741 PyObject *output = NULL; 3742 Py_ssize_t size; 3743 void *data; 3744 if (arg == NULL) { 3745 Py_DECREF(*(PyObject**)addr); 3746 return 1; 3747 } 3748 if (PyBytes_Check(arg)) { 3749 output = arg; 3750 Py_INCREF(output); 3751 } 3752 else { 3753 arg = PyUnicode_FromObject(arg); 3754 if (!arg) 3755 return 0; 3756 output = PyUnicode_EncodeFSDefault(arg); 3757 Py_DECREF(arg); 3758 if (!output) 3759 return 0; 3760 if (!PyBytes_Check(output)) { 3761 Py_DECREF(output); 3762 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes"); 3763 return 0; 3764 } 3765 } 3766 size = PyBytes_GET_SIZE(output); 3767 data = PyBytes_AS_STRING(output); 3768 if (size != strlen(data)) { 3769 PyErr_SetString(PyExc_TypeError, "embedded NUL character"); 3770 Py_DECREF(output); 3771 return 0; 3772 } 3773 *(PyObject**)addr = output; 3774 return Py_CLEANUP_SUPPORTED; 3775} 3776 3777 3778int 3779PyUnicode_FSDecoder(PyObject* arg, void* addr) 3780{ 3781 PyObject *output = NULL; 3782 if (arg == NULL) { 3783 Py_DECREF(*(PyObject**)addr); 3784 return 1; 3785 } 3786 if (PyUnicode_Check(arg)) { 3787 if (PyUnicode_READY(arg) == -1) 3788 return 0; 3789 output = arg; 3790 Py_INCREF(output); 3791 } 3792 else { 3793 arg = PyBytes_FromObject(arg); 3794 if (!arg) 3795 return 0; 3796 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg), 3797 PyBytes_GET_SIZE(arg)); 3798 
Py_DECREF(arg); 3799 if (!output) 3800 return 0; 3801 if (!PyUnicode_Check(output)) { 3802 Py_DECREF(output); 3803 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode"); 3804 return 0; 3805 } 3806 } 3807 if (PyUnicode_READY(output) == -1) { 3808 Py_DECREF(output); 3809 return 0; 3810 } 3811 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output), 3812 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) { 3813 PyErr_SetString(PyExc_TypeError, "embedded NUL character"); 3814 Py_DECREF(output); 3815 return 0; 3816 } 3817 *(PyObject**)addr = output; 3818 return Py_CLEANUP_SUPPORTED; 3819} 3820 3821 3822char* 3823PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize) 3824{ 3825 PyObject *bytes; 3826 3827 if (!PyUnicode_Check(unicode)) { 3828 PyErr_BadArgument(); 3829 return NULL; 3830 } 3831 if (PyUnicode_READY(unicode) == -1) 3832 return NULL; 3833 3834 if (PyUnicode_UTF8(unicode) == NULL) { 3835 assert(!PyUnicode_IS_COMPACT_ASCII(unicode)); 3836 bytes = _PyUnicode_AsUTF8String(unicode, "strict"); 3837 if (bytes == NULL) 3838 return NULL; 3839 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1); 3840 if (_PyUnicode_UTF8(unicode) == NULL) { 3841 Py_DECREF(bytes); 3842 return NULL; 3843 } 3844 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes); 3845 Py_MEMCPY(_PyUnicode_UTF8(unicode), 3846 PyBytes_AS_STRING(bytes), 3847 _PyUnicode_UTF8_LENGTH(unicode) + 1); 3848 Py_DECREF(bytes); 3849 } 3850 3851 if (psize) 3852 *psize = PyUnicode_UTF8_LENGTH(unicode); 3853 return PyUnicode_UTF8(unicode); 3854} 3855 3856char* 3857PyUnicode_AsUTF8(PyObject *unicode) 3858{ 3859 return PyUnicode_AsUTF8AndSize(unicode, NULL); 3860} 3861 3862Py_UNICODE * 3863PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size) 3864{ 3865 const unsigned char *one_byte; 3866#if SIZEOF_WCHAR_T == 4 3867 const Py_UCS2 *two_bytes; 3868#else 3869 const Py_UCS4 *four_bytes; 3870 const Py_UCS4 *ucs4_end; 3871 Py_ssize_t num_surrogates; 3872#endif 3873 wchar_t *w; 
3874 wchar_t *wchar_end; 3875 3876 if (!PyUnicode_Check(unicode)) { 3877 PyErr_BadArgument(); 3878 return NULL; 3879 } 3880 if (_PyUnicode_WSTR(unicode) == NULL) { 3881 /* Non-ASCII compact unicode object */ 3882 assert(_PyUnicode_KIND(unicode) != 0); 3883 assert(PyUnicode_IS_READY(unicode)); 3884 3885 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) { 3886#if SIZEOF_WCHAR_T == 2 3887 four_bytes = PyUnicode_4BYTE_DATA(unicode); 3888 ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode); 3889 num_surrogates = 0; 3890 3891 for (; four_bytes < ucs4_end; ++four_bytes) { 3892 if (*four_bytes > 0xFFFF) 3893 ++num_surrogates; 3894 } 3895 3896 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC( 3897 sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates)); 3898 if (!_PyUnicode_WSTR(unicode)) { 3899 PyErr_NoMemory(); 3900 return NULL; 3901 } 3902 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates; 3903 3904 w = _PyUnicode_WSTR(unicode); 3905 wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode); 3906 four_bytes = PyUnicode_4BYTE_DATA(unicode); 3907 for (; four_bytes < ucs4_end; ++four_bytes, ++w) { 3908 if (*four_bytes > 0xFFFF) { 3909 assert(*four_bytes <= MAX_UNICODE); 3910 /* encode surrogate pair in this case */ 3911 *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes); 3912 *w = Py_UNICODE_LOW_SURROGATE(*four_bytes); 3913 } 3914 else 3915 *w = *four_bytes; 3916 3917 if (w > wchar_end) { 3918 assert(0 && "Miscalculated string end"); 3919 } 3920 } 3921 *w = 0; 3922#else 3923 /* sizeof(wchar_t) == 4 */ 3924 Py_FatalError("Impossible unicode object state, wstr and str " 3925 "should share memory already."); 3926 return NULL; 3927#endif 3928 } 3929 else { 3930 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) * 3931 (_PyUnicode_LENGTH(unicode) + 1)); 3932 if (!_PyUnicode_WSTR(unicode)) { 3933 PyErr_NoMemory(); 3934 return NULL; 3935 } 3936 if (!PyUnicode_IS_COMPACT_ASCII(unicode)) 3937 _PyUnicode_WSTR_LENGTH(unicode) = 
_PyUnicode_LENGTH(unicode); 3938 w = _PyUnicode_WSTR(unicode); 3939 wchar_end = w + _PyUnicode_LENGTH(unicode); 3940 3941 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) { 3942 one_byte = PyUnicode_1BYTE_DATA(unicode); 3943 for (; w < wchar_end; ++one_byte, ++w) 3944 *w = *one_byte; 3945 /* null-terminate the wstr */ 3946 *w = 0; 3947 } 3948 else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) { 3949#if SIZEOF_WCHAR_T == 4 3950 two_bytes = PyUnicode_2BYTE_DATA(unicode); 3951 for (; w < wchar_end; ++two_bytes, ++w) 3952 *w = *two_bytes; 3953 /* null-terminate the wstr */ 3954 *w = 0; 3955#else 3956 /* sizeof(wchar_t) == 2 */ 3957 PyObject_FREE(_PyUnicode_WSTR(unicode)); 3958 _PyUnicode_WSTR(unicode) = NULL; 3959 Py_FatalError("Impossible unicode object state, wstr " 3960 "and str should share memory already."); 3961 return NULL; 3962#endif 3963 } 3964 else { 3965 assert(0 && "This should never happen."); 3966 } 3967 } 3968 } 3969 if (size != NULL) 3970 *size = PyUnicode_WSTR_LENGTH(unicode); 3971 return _PyUnicode_WSTR(unicode); 3972} 3973 3974Py_UNICODE * 3975PyUnicode_AsUnicode(PyObject *unicode) 3976{ 3977 return PyUnicode_AsUnicodeAndSize(unicode, NULL); 3978} 3979 3980 3981Py_ssize_t 3982PyUnicode_GetSize(PyObject *unicode) 3983{ 3984 if (!PyUnicode_Check(unicode)) { 3985 PyErr_BadArgument(); 3986 goto onError; 3987 } 3988 return PyUnicode_GET_SIZE(unicode); 3989 3990 onError: 3991 return -1; 3992} 3993 3994Py_ssize_t 3995PyUnicode_GetLength(PyObject *unicode) 3996{ 3997 if (!PyUnicode_Check(unicode)) { 3998 PyErr_BadArgument(); 3999 return -1; 4000 } 4001 if (PyUnicode_READY(unicode) == -1) 4002 return -1; 4003 return PyUnicode_GET_LENGTH(unicode); 4004} 4005 4006Py_UCS4 4007PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index) 4008{ 4009 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) { 4010 PyErr_BadArgument(); 4011 return (Py_UCS4)-1; 4012 } 4013 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) { 4014 
PyErr_SetString(PyExc_IndexError, "string index out of range"); 4015 return (Py_UCS4)-1; 4016 } 4017 return PyUnicode_READ_CHAR(unicode, index); 4018} 4019 4020int 4021PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch) 4022{ 4023 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) { 4024 PyErr_BadArgument(); 4025 return -1; 4026 } 4027 assert(PyUnicode_IS_READY(unicode)); 4028 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) { 4029 PyErr_SetString(PyExc_IndexError, "string index out of range"); 4030 return -1; 4031 } 4032 if (unicode_check_modifiable(unicode)) 4033 return -1; 4034 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) { 4035 PyErr_SetString(PyExc_ValueError, "character out of range"); 4036 return -1; 4037 } 4038 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode), 4039 index, ch); 4040 return 0; 4041} 4042 4043const char * 4044PyUnicode_GetDefaultEncoding(void) 4045{ 4046 return "utf-8"; 4047} 4048 4049/* create or adjust a UnicodeDecodeError */ 4050static void 4051make_decode_exception(PyObject **exceptionObject, 4052 const char *encoding, 4053 const char *input, Py_ssize_t length, 4054 Py_ssize_t startpos, Py_ssize_t endpos, 4055 const char *reason) 4056{ 4057 if (*exceptionObject == NULL) { 4058 *exceptionObject = PyUnicodeDecodeError_Create( 4059 encoding, input, length, startpos, endpos, reason); 4060 } 4061 else { 4062 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos)) 4063 goto onError; 4064 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos)) 4065 goto onError; 4066 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason)) 4067 goto onError; 4068 } 4069 return; 4070 4071onError: 4072 Py_DECREF(*exceptionObject); 4073 *exceptionObject = NULL; 4074} 4075 4076/* error handling callback helper: 4077 build arguments, call the callback and check the arguments, 4078 if no exception occurred, copy the replacement to the output 4079 and adjust various state variables. 
4080 return 0 on success, -1 on error 4081*/ 4082 4083static int 4084unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler, 4085 const char *encoding, const char *reason, 4086 const char **input, const char **inend, Py_ssize_t *startinpos, 4087 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr, 4088 PyObject **output, Py_ssize_t *outpos) 4089{ 4090 static char *argparse = "O!n;decoding error handler must return (str, int) tuple"; 4091 4092 PyObject *restuple = NULL; 4093 PyObject *repunicode = NULL; 4094 Py_ssize_t outsize; 4095 Py_ssize_t insize; 4096 Py_ssize_t requiredsize; 4097 Py_ssize_t newpos; 4098 PyObject *inputobj = NULL; 4099 int res = -1; 4100 4101 if (_PyUnicode_KIND(*output) != PyUnicode_WCHAR_KIND) 4102 outsize = PyUnicode_GET_LENGTH(*output); 4103 else 4104 outsize = _PyUnicode_WSTR_LENGTH(*output); 4105 4106 if (*errorHandler == NULL) { 4107 *errorHandler = PyCodec_LookupError(errors); 4108 if (*errorHandler == NULL) 4109 goto onError; 4110 } 4111 4112 make_decode_exception(exceptionObject, 4113 encoding, 4114 *input, *inend - *input, 4115 *startinpos, *endinpos, 4116 reason); 4117 if (*exceptionObject == NULL) 4118 goto onError; 4119 4120 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL); 4121 if (restuple == NULL) 4122 goto onError; 4123 if (!PyTuple_Check(restuple)) { 4124 PyErr_SetString(PyExc_TypeError, &argparse[4]); 4125 goto onError; 4126 } 4127 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos)) 4128 goto onError; 4129 if (PyUnicode_READY(repunicode) == -1) 4130 goto onError; 4131 4132 /* Copy back the bytes variables, which might have been modified by the 4133 callback */ 4134 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject); 4135 if (!inputobj) 4136 goto onError; 4137 if (!PyBytes_Check(inputobj)) { 4138 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes"); 4139 } 4140 *input = PyBytes_AS_STRING(inputobj); 
4141 insize = PyBytes_GET_SIZE(inputobj); 4142 *inend = *input + insize; 4143 /* we can DECREF safely, as the exception has another reference, 4144 so the object won't go away. */ 4145 Py_DECREF(inputobj); 4146 4147 if (newpos<0) 4148 newpos = insize+newpos; 4149 if (newpos<0 || newpos>insize) { 4150 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos); 4151 goto onError; 4152 } 4153 4154 if (_PyUnicode_KIND(*output) != PyUnicode_WCHAR_KIND) { 4155 /* need more space? (at least enough for what we 4156 have+the replacement+the rest of the string (starting 4157 at the new input position), so we won't have to check space 4158 when there are no errors in the rest of the string) */ 4159 Py_ssize_t replen = PyUnicode_GET_LENGTH(repunicode); 4160 requiredsize = *outpos + replen + insize-newpos; 4161 if (requiredsize > outsize) { 4162 if (requiredsize<2*outsize) 4163 requiredsize = 2*outsize; 4164 if (unicode_resize(output, requiredsize) < 0) 4165 goto onError; 4166 } 4167 if (unicode_widen(output, *outpos, 4168 PyUnicode_MAX_CHAR_VALUE(repunicode)) < 0) 4169 goto onError; 4170 _PyUnicode_FastCopyCharacters(*output, *outpos, repunicode, 0, replen); 4171 *outpos += replen; 4172 } 4173 else { 4174 wchar_t *repwstr; 4175 Py_ssize_t repwlen; 4176 repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen); 4177 if (repwstr == NULL) 4178 goto onError; 4179 /* need more space? 
(at least enough for what we 4180 have+the replacement+the rest of the string (starting 4181 at the new input position), so we won't have to check space 4182 when there are no errors in the rest of the string) */ 4183 requiredsize = *outpos + repwlen + insize-newpos; 4184 if (requiredsize > outsize) { 4185 if (requiredsize < 2*outsize) 4186 requiredsize = 2*outsize; 4187 if (unicode_resize(output, requiredsize) < 0) 4188 goto onError; 4189 } 4190 wcsncpy(_PyUnicode_WSTR(*output) + *outpos, repwstr, repwlen); 4191 *outpos += repwlen; 4192 } 4193 *endinpos = newpos; 4194 *inptr = *input + newpos; 4195 4196 /* we made it! */ 4197 res = 0; 4198 4199 onError: 4200 Py_XDECREF(restuple); 4201 return res; 4202} 4203 4204/* --- UTF-7 Codec -------------------------------------------------------- */ 4205 4206/* See RFC2152 for details. We encode conservatively and decode liberally. */ 4207 4208/* Three simple macros defining base-64. */ 4209 4210/* Is c a base-64 character? */ 4211 4212#define IS_BASE64(c) \ 4213 (((c) >= 'A' && (c) <= 'Z') || \ 4214 ((c) >= 'a' && (c) <= 'z') || \ 4215 ((c) >= '0' && (c) <= '9') || \ 4216 (c) == '+' || (c) == '/') 4217 4218/* given that c is a base-64 character, what is its base-64 value? */ 4219 4220#define FROM_BASE64(c) \ 4221 (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' : \ 4222 ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 : \ 4223 ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 : \ 4224 (c) == '+' ? 62 : 63) 4225 4226/* What is the base-64 character of the bottom 6 bits of n? */ 4227 4228#define TO_BASE64(n) \ 4229 ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f]) 4230 4231/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be 4232 * decoded as itself. We are permissive on decoding; the only ASCII 4233 * byte not decoding to itself is the + which begins a base64 4234 * string. 
*/ 4235 4236#define DECODE_DIRECT(c) \ 4237 ((c) <= 127 && (c) != '+') 4238 4239/* The UTF-7 encoder treats ASCII characters differently according to 4240 * whether they are Set D, Set O, Whitespace, or special (i.e. none of 4241 * the above). See RFC2152. This array identifies these different 4242 * sets: 4243 * 0 : "Set D" 4244 * alphanumeric and '(),-./:? 4245 * 1 : "Set O" 4246 * !"#$%&*;<=>@[]^_`{|} 4247 * 2 : "whitespace" 4248 * ht nl cr sp 4249 * 3 : special (must be base64 encoded) 4250 * everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127) 4251 */ 4252 4253static 4254char utf7_category[128] = { 4255/* nul soh stx etx eot enq ack bel bs ht nl vt np cr so si */ 4256 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3, 4257/* dle dc1 dc2 dc3 dc4 nak syn etb can em sub esc fs gs rs us */ 4258 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4259/* sp ! " # $ % & ' ( ) * + , - . / */ 4260 2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 0, 4261/* 0 1 2 3 4 5 6 7 8 9 : ; < = > ? */ 4262 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 4263/* @ A B C D E F G H I J K L M N O */ 4264 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4265/* P Q R S T U V W X Y Z [ \ ] ^ _ */ 4266 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1, 4267/* ` a b c d e f g h i j k l m n o */ 4268 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4269/* p q r s t u v w x y z { | } ~ del */ 4270 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3, 4271}; 4272 4273/* ENCODE_DIRECT: this character should be encoded as itself. The 4274 * answer depends on whether we are encoding set O as itself, and also 4275 * on whether we are encoding whitespace as itself. RFC2152 makes it 4276 * clear that the answers to these questions vary between 4277 * applications, so this code needs to be flexible. 
*/ 4278 4279#define ENCODE_DIRECT(c, directO, directWS) \ 4280 ((c) < 128 && (c) > 0 && \ 4281 ((utf7_category[(c)] == 0) || \ 4282 (directWS && (utf7_category[(c)] == 2)) || \ 4283 (directO && (utf7_category[(c)] == 1)))) 4284 4285PyObject * 4286PyUnicode_DecodeUTF7(const char *s, 4287 Py_ssize_t size, 4288 const char *errors) 4289{ 4290 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL); 4291} 4292 4293/* The decoder. The only state we preserve is our read position, 4294 * i.e. how many characters we have consumed. So if we end in the 4295 * middle of a shift sequence we have to back off the read position 4296 * and the output to the beginning of the sequence, otherwise we lose 4297 * all the shift state (seen bits, number of bits seen, high 4298 * surrogate). */ 4299 4300PyObject * 4301PyUnicode_DecodeUTF7Stateful(const char *s, 4302 Py_ssize_t size, 4303 const char *errors, 4304 Py_ssize_t *consumed) 4305{ 4306 const char *starts = s; 4307 Py_ssize_t startinpos; 4308 Py_ssize_t endinpos; 4309 Py_ssize_t outpos; 4310 const char *e; 4311 PyObject *unicode; 4312 const char *errmsg = ""; 4313 int inShift = 0; 4314 Py_ssize_t shiftOutStart; 4315 unsigned int base64bits = 0; 4316 unsigned long base64buffer = 0; 4317 Py_UCS4 surrogate = 0; 4318 PyObject *errorHandler = NULL; 4319 PyObject *exc = NULL; 4320 4321 /* Start off assuming it's all ASCII. Widen later as necessary. 
*/ 4322 unicode = PyUnicode_New(size, 127); 4323 if (!unicode) 4324 return NULL; 4325 if (size == 0) { 4326 if (consumed) 4327 *consumed = 0; 4328 return unicode; 4329 } 4330 4331 shiftOutStart = outpos = 0; 4332 e = s + size; 4333 4334 while (s < e) { 4335 Py_UCS4 ch; 4336 restart: 4337 ch = (unsigned char) *s; 4338 4339 if (inShift) { /* in a base-64 section */ 4340 if (IS_BASE64(ch)) { /* consume a base-64 character */ 4341 base64buffer = (base64buffer << 6) | FROM_BASE64(ch); 4342 base64bits += 6; 4343 s++; 4344 if (base64bits >= 16) { 4345 /* we have enough bits for a UTF-16 value */ 4346 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16)); 4347 base64bits -= 16; 4348 base64buffer &= (1 << base64bits) - 1; /* clear high bits */ 4349 if (surrogate) { 4350 /* expecting a second surrogate */ 4351 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) { 4352 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh); 4353 if (unicode_putchar(&unicode, &outpos, ch2) < 0) 4354 goto onError; 4355 surrogate = 0; 4356 continue; 4357 } 4358 else { 4359 if (unicode_putchar(&unicode, &outpos, surrogate) < 0) 4360 goto onError; 4361 surrogate = 0; 4362 } 4363 } 4364 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) { 4365 /* first surrogate */ 4366 surrogate = outCh; 4367 } 4368 else { 4369 if (unicode_putchar(&unicode, &outpos, outCh) < 0) 4370 goto onError; 4371 } 4372 } 4373 } 4374 else { /* now leaving a base-64 section */ 4375 inShift = 0; 4376 s++; 4377 if (surrogate) { 4378 if (unicode_putchar(&unicode, &outpos, surrogate) < 0) 4379 goto onError; 4380 surrogate = 0; 4381 } 4382 if (base64bits > 0) { /* left-over bits */ 4383 if (base64bits >= 6) { 4384 /* We've seen at least one base-64 character */ 4385 errmsg = "partial character in shift sequence"; 4386 goto utf7Error; 4387 } 4388 else { 4389 /* Some bits remain; they should be zero */ 4390 if (base64buffer != 0) { 4391 errmsg = "non-zero padding bits in shift sequence"; 4392 goto utf7Error; 4393 } 4394 } 4395 } 4396 if (ch != 
'-') { 4397 /* '-' is absorbed; other terminating 4398 characters are preserved */ 4399 if (unicode_putchar(&unicode, &outpos, ch) < 0) 4400 goto onError; 4401 } 4402 } 4403 } 4404 else if ( ch == '+' ) { 4405 startinpos = s-starts; 4406 s++; /* consume '+' */ 4407 if (s < e && *s == '-') { /* '+-' encodes '+' */ 4408 s++; 4409 if (unicode_putchar(&unicode, &outpos, '+') < 0) 4410 goto onError; 4411 } 4412 else { /* begin base64-encoded section */ 4413 inShift = 1; 4414 shiftOutStart = outpos; 4415 base64bits = 0; 4416 } 4417 } 4418 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */ 4419 if (unicode_putchar(&unicode, &outpos, ch) < 0) 4420 goto onError; 4421 s++; 4422 } 4423 else { 4424 startinpos = s-starts; 4425 s++; 4426 errmsg = "unexpected special character"; 4427 goto utf7Error; 4428 } 4429 continue; 4430utf7Error: 4431 endinpos = s-starts; 4432 if (unicode_decode_call_errorhandler( 4433 errors, &errorHandler, 4434 "utf7", errmsg, 4435 &starts, &e, &startinpos, &endinpos, &exc, &s, 4436 &unicode, &outpos)) 4437 goto onError; 4438 } 4439 4440 /* end of string */ 4441 4442 if (inShift && !consumed) { /* in shift sequence, no more to follow */ 4443 /* if we're in an inconsistent state, that's an error */ 4444 if (surrogate || 4445 (base64bits >= 6) || 4446 (base64bits > 0 && base64buffer != 0)) { 4447 endinpos = size; 4448 if (unicode_decode_call_errorhandler( 4449 errors, &errorHandler, 4450 "utf7", "unterminated shift sequence", 4451 &starts, &e, &startinpos, &endinpos, &exc, &s, 4452 &unicode, &outpos)) 4453 goto onError; 4454 if (s < e) 4455 goto restart; 4456 } 4457 } 4458 4459 /* return state */ 4460 if (consumed) { 4461 if (inShift) { 4462 outpos = shiftOutStart; /* back off output */ 4463 *consumed = startinpos; 4464 } 4465 else { 4466 *consumed = s-starts; 4467 } 4468 } 4469 4470 if (unicode_resize(&unicode, outpos) < 0) 4471 goto onError; 4472 4473 Py_XDECREF(errorHandler); 4474 Py_XDECREF(exc); 4475 return unicode_result(unicode); 4476 
4477 onError: 4478 Py_XDECREF(errorHandler); 4479 Py_XDECREF(exc); 4480 Py_DECREF(unicode); 4481 return NULL; 4482} 4483 4484 4485PyObject * 4486_PyUnicode_EncodeUTF7(PyObject *str, 4487 int base64SetO, 4488 int base64WhiteSpace, 4489 const char *errors) 4490{ 4491 int kind; 4492 void *data; 4493 Py_ssize_t len; 4494 PyObject *v; 4495 Py_ssize_t allocated; 4496 int inShift = 0; 4497 Py_ssize_t i; 4498 unsigned int base64bits = 0; 4499 unsigned long base64buffer = 0; 4500 char * out; 4501 char * start; 4502 4503 if (PyUnicode_READY(str) == -1) 4504 return NULL; 4505 kind = PyUnicode_KIND(str); 4506 data = PyUnicode_DATA(str); 4507 len = PyUnicode_GET_LENGTH(str); 4508 4509 if (len == 0) 4510 return PyBytes_FromStringAndSize(NULL, 0); 4511 4512 /* It might be possible to tighten this worst case */ 4513 allocated = 8 * len; 4514 if (allocated / 8 != len) 4515 return PyErr_NoMemory(); 4516 4517 v = PyBytes_FromStringAndSize(NULL, allocated); 4518 if (v == NULL) 4519 return NULL; 4520 4521 start = out = PyBytes_AS_STRING(v); 4522 for (i = 0; i < len; ++i) { 4523 Py_UCS4 ch = PyUnicode_READ(kind, data, i); 4524 4525 if (inShift) { 4526 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) { 4527 /* shifting out */ 4528 if (base64bits) { /* output remaining bits */ 4529 *out++ = TO_BASE64(base64buffer << (6-base64bits)); 4530 base64buffer = 0; 4531 base64bits = 0; 4532 } 4533 inShift = 0; 4534 /* Characters not in the BASE64 set implicitly unshift the sequence 4535 so no '-' is required, except if the character is itself a '-' */ 4536 if (IS_BASE64(ch) || ch == '-') { 4537 *out++ = '-'; 4538 } 4539 *out++ = (char) ch; 4540 } 4541 else { 4542 goto encode_char; 4543 } 4544 } 4545 else { /* not in a shift sequence */ 4546 if (ch == '+') { 4547 *out++ = '+'; 4548 *out++ = '-'; 4549 } 4550 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) { 4551 *out++ = (char) ch; 4552 } 4553 else { 4554 *out++ = '+'; 4555 inShift = 1; 4556 goto encode_char; 4557 } 4558 } 4559 
continue; 4560encode_char: 4561 if (ch >= 0x10000) { 4562 assert(ch <= MAX_UNICODE); 4563 4564 /* code first surrogate */ 4565 base64bits += 16; 4566 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10); 4567 while (base64bits >= 6) { 4568 *out++ = TO_BASE64(base64buffer >> (base64bits-6)); 4569 base64bits -= 6; 4570 } 4571 /* prepare second surrogate */ 4572 ch = Py_UNICODE_LOW_SURROGATE(ch); 4573 } 4574 base64bits += 16; 4575 base64buffer = (base64buffer << 16) | ch; 4576 while (base64bits >= 6) { 4577 *out++ = TO_BASE64(base64buffer >> (base64bits-6)); 4578 base64bits -= 6; 4579 } 4580 } 4581 if (base64bits) 4582 *out++= TO_BASE64(base64buffer << (6-base64bits) ); 4583 if (inShift) 4584 *out++ = '-'; 4585 if (_PyBytes_Resize(&v, out - start) < 0) 4586 return NULL; 4587 return v; 4588} 4589PyObject * 4590PyUnicode_EncodeUTF7(const Py_UNICODE *s, 4591 Py_ssize_t size, 4592 int base64SetO, 4593 int base64WhiteSpace, 4594 const char *errors) 4595{ 4596 PyObject *result; 4597 PyObject *tmp = PyUnicode_FromUnicode(s, size); 4598 if (tmp == NULL) 4599 return NULL; 4600 result = _PyUnicode_EncodeUTF7(tmp, base64SetO, 4601 base64WhiteSpace, errors); 4602 Py_DECREF(tmp); 4603 return result; 4604} 4605 4606#undef IS_BASE64 4607#undef FROM_BASE64 4608#undef TO_BASE64 4609#undef DECODE_DIRECT 4610#undef ENCODE_DIRECT 4611 4612/* --- UTF-8 Codec -------------------------------------------------------- */ 4613 4614PyObject * 4615PyUnicode_DecodeUTF8(const char *s, 4616 Py_ssize_t size, 4617 const char *errors) 4618{ 4619 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL); 4620} 4621 4622#include "stringlib/asciilib.h" 4623#include "stringlib/codecs.h" 4624#include "stringlib/undef.h" 4625 4626#include "stringlib/ucs1lib.h" 4627#include "stringlib/codecs.h" 4628#include "stringlib/undef.h" 4629 4630#include "stringlib/ucs2lib.h" 4631#include "stringlib/codecs.h" 4632#include "stringlib/undef.h" 4633 4634#include "stringlib/ucs4lib.h" 4635#include 
"stringlib/codecs.h" 4636#include "stringlib/undef.h" 4637 4638/* Mask to quickly check whether a C 'long' contains a 4639 non-ASCII, UTF8-encoded char. */ 4640#if (SIZEOF_LONG == 8) 4641# define ASCII_CHAR_MASK 0x8080808080808080UL 4642#elif (SIZEOF_LONG == 4) 4643# define ASCII_CHAR_MASK 0x80808080UL 4644#else 4645# error C 'long' size should be either 4 or 8! 4646#endif 4647 4648static Py_ssize_t 4649ascii_decode(const char *start, const char *end, Py_UCS1 *dest) 4650{ 4651 const char *p = start; 4652 const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG); 4653 4654#if SIZEOF_LONG <= SIZEOF_VOID_P 4655 assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG)); 4656 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) { 4657 /* Fast path, see in STRINGLIB(utf8_decode) for 4658 an explanation. */ 4659 /* Help register allocation */ 4660 register const char *_p = p; 4661 register Py_UCS1 * q = dest; 4662 while (_p < aligned_end) { 4663 unsigned long value = *(const unsigned long *) _p; 4664 if (value & ASCII_CHAR_MASK) 4665 break; 4666 *((unsigned long *)q) = value; 4667 _p += SIZEOF_LONG; 4668 q += SIZEOF_LONG; 4669 } 4670 p = _p; 4671 while (p < end) { 4672 if ((unsigned char)*p & 0x80) 4673 break; 4674 *q++ = *p++; 4675 } 4676 return p - start; 4677 } 4678#endif 4679 while (p < end) { 4680 /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h 4681 for an explanation. 
*/ 4682 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) { 4683 /* Help register allocation */ 4684 register const char *_p = p; 4685 while (_p < aligned_end) { 4686 unsigned long value = *(unsigned long *) _p; 4687 if (value & ASCII_CHAR_MASK) 4688 break; 4689 _p += SIZEOF_LONG; 4690 } 4691 p = _p; 4692 if (_p == end) 4693 break; 4694 } 4695 if ((unsigned char)*p & 0x80) 4696 break; 4697 ++p; 4698 } 4699 memcpy(dest, start, p - start); 4700 return p - start; 4701} 4702 4703PyObject * 4704PyUnicode_DecodeUTF8Stateful(const char *s, 4705 Py_ssize_t size, 4706 const char *errors, 4707 Py_ssize_t *consumed) 4708{ 4709 PyObject *unicode; 4710 const char *starts = s; 4711 const char *end = s + size; 4712 Py_ssize_t outpos; 4713 4714 Py_ssize_t startinpos; 4715 Py_ssize_t endinpos; 4716 const char *errmsg = ""; 4717 PyObject *errorHandler = NULL; 4718 PyObject *exc = NULL; 4719 4720 if (size == 0) { 4721 if (consumed) 4722 *consumed = 0; 4723 Py_INCREF(unicode_empty); 4724 return unicode_empty; 4725 } 4726 4727 /* ASCII is equivalent to the first 128 ordinals in Unicode. 
*/ 4728 if (size == 1 && (unsigned char)s[0] < 128) { 4729 if (consumed) 4730 *consumed = 1; 4731 return get_latin1_char((unsigned char)s[0]); 4732 } 4733 4734 unicode = PyUnicode_New(size, 127); 4735 if (!unicode) 4736 return NULL; 4737 4738 outpos = ascii_decode(s, end, PyUnicode_1BYTE_DATA(unicode)); 4739 s += outpos; 4740 while (s < end) { 4741 Py_UCS4 ch; 4742 int kind = PyUnicode_KIND(unicode); 4743 if (kind == PyUnicode_1BYTE_KIND) { 4744 if (PyUnicode_IS_ASCII(unicode)) 4745 ch = asciilib_utf8_decode(&s, end, 4746 PyUnicode_1BYTE_DATA(unicode), &outpos); 4747 else 4748 ch = ucs1lib_utf8_decode(&s, end, 4749 PyUnicode_1BYTE_DATA(unicode), &outpos); 4750 } else if (kind == PyUnicode_2BYTE_KIND) { 4751 ch = ucs2lib_utf8_decode(&s, end, 4752 PyUnicode_2BYTE_DATA(unicode), &outpos); 4753 } else { 4754 assert(kind == PyUnicode_4BYTE_KIND); 4755 ch = ucs4lib_utf8_decode(&s, end, 4756 PyUnicode_4BYTE_DATA(unicode), &outpos); 4757 } 4758 4759 switch (ch) { 4760 case 0: 4761 if (s == end || consumed) 4762 goto End; 4763 errmsg = "unexpected end of data"; 4764 startinpos = s - starts; 4765 endinpos = startinpos + 1; 4766 while (endinpos < size && (starts[endinpos] & 0xC0) == 0x80) 4767 endinpos++; 4768 break; 4769 case 1: 4770 errmsg = "invalid start byte"; 4771 startinpos = s - starts; 4772 endinpos = startinpos + 1; 4773 break; 4774 case 2: 4775 errmsg = "invalid continuation byte"; 4776 startinpos = s - starts; 4777 endinpos = startinpos + 1; 4778 while (endinpos < size && (starts[endinpos] & 0xC0) == 0x80) 4779 endinpos++; 4780 break; 4781 default: 4782 if (unicode_putchar(&unicode, &outpos, ch) < 0) 4783 goto onError; 4784 continue; 4785 } 4786 4787 if (unicode_decode_call_errorhandler( 4788 errors, &errorHandler, 4789 "utf-8", errmsg, 4790 &starts, &end, &startinpos, &endinpos, &exc, &s, 4791 &unicode, &outpos)) 4792 goto onError; 4793 } 4794 4795End: 4796 if (unicode_resize(&unicode, outpos) < 0) 4797 goto onError; 4798 4799 if (consumed) 4800 *consumed = s - 
starts; 4801 4802 Py_XDECREF(errorHandler); 4803 Py_XDECREF(exc); 4804 assert(_PyUnicode_CheckConsistency(unicode, 1)); 4805 return unicode; 4806 4807onError: 4808 Py_XDECREF(errorHandler); 4809 Py_XDECREF(exc); 4810 Py_XDECREF(unicode); 4811 return NULL; 4812} 4813 4814#ifdef __APPLE__ 4815 4816/* Simplified UTF-8 decoder using surrogateescape error handler, 4817 used to decode the command line arguments on Mac OS X. */ 4818 4819wchar_t* 4820_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size) 4821{ 4822 const char *e; 4823 wchar_t *unicode; 4824 Py_ssize_t outpos; 4825 4826 /* Note: size will always be longer than the resulting Unicode 4827 character count */ 4828 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) { 4829 PyErr_NoMemory(); 4830 return NULL; 4831 } 4832 unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t)); 4833 if (!unicode) 4834 return NULL; 4835 4836 /* Unpack UTF-8 encoded data */ 4837 e = s + size; 4838 outpos = 0; 4839 while (s < e) { 4840 Py_UCS4 ch; 4841#if SIZEOF_WCHAR_T == 4 4842 ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos); 4843#else 4844 ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos); 4845#endif 4846 if (ch > 0xFF) { 4847#if SIZEOF_WCHAR_T == 4 4848 assert(0); 4849#else 4850 assert(Py_UNICODE_IS_SURROGATE(ch)); 4851 /* compute and append the two surrogates: */ 4852 unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch); 4853 unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch); 4854#endif 4855 } 4856 else { 4857 if (!ch && s == e) 4858 break; 4859 /* surrogateescape */ 4860 unicode[outpos++] = 0xDC00 + (unsigned char)*s++; 4861 } 4862 } 4863 unicode[outpos] = L'\0'; 4864 return unicode; 4865} 4866 4867#endif /* __APPLE__ */ 4868 4869/* Primary internal function which creates utf8 encoded bytes objects. 4870 4871 Allocation strategy: if the string is short, convert into a stack buffer 4872 and allocate exactly as much space needed at the end. 
Else allocate the 4873 maximum possible needed (4 result bytes per Unicode character), and return 4874 the excess memory at the end. 4875*/ 4876PyObject * 4877_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors) 4878{ 4879 enum PyUnicode_Kind kind; 4880 void *data; 4881 Py_ssize_t size; 4882 4883 if (!PyUnicode_Check(unicode)) { 4884 PyErr_BadArgument(); 4885 return NULL; 4886 } 4887 4888 if (PyUnicode_READY(unicode) == -1) 4889 return NULL; 4890 4891 if (PyUnicode_UTF8(unicode)) 4892 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode), 4893 PyUnicode_UTF8_LENGTH(unicode)); 4894 4895 kind = PyUnicode_KIND(unicode); 4896 data = PyUnicode_DATA(unicode); 4897 size = PyUnicode_GET_LENGTH(unicode); 4898 4899 switch (kind) { 4900 default: 4901 assert(0); 4902 case PyUnicode_1BYTE_KIND: 4903 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */ 4904 assert(!PyUnicode_IS_ASCII(unicode)); 4905 return ucs1lib_utf8_encoder(unicode, data, size, errors); 4906 case PyUnicode_2BYTE_KIND: 4907 return ucs2lib_utf8_encoder(unicode, data, size, errors); 4908 case PyUnicode_4BYTE_KIND: 4909 return ucs4lib_utf8_encoder(unicode, data, size, errors); 4910 } 4911} 4912 4913PyObject * 4914PyUnicode_EncodeUTF8(const Py_UNICODE *s, 4915 Py_ssize_t size, 4916 const char *errors) 4917{ 4918 PyObject *v, *unicode; 4919 4920 unicode = PyUnicode_FromUnicode(s, size); 4921 if (unicode == NULL) 4922 return NULL; 4923 v = _PyUnicode_AsUTF8String(unicode, errors); 4924 Py_DECREF(unicode); 4925 return v; 4926} 4927 4928PyObject * 4929PyUnicode_AsUTF8String(PyObject *unicode) 4930{ 4931 return _PyUnicode_AsUTF8String(unicode, NULL); 4932} 4933 4934/* --- UTF-32 Codec ------------------------------------------------------- */ 4935 4936PyObject * 4937PyUnicode_DecodeUTF32(const char *s, 4938 Py_ssize_t size, 4939 const char *errors, 4940 int *byteorder) 4941{ 4942 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL); 4943} 4944 4945PyObject * 
4946PyUnicode_DecodeUTF32Stateful(const char *s, 4947 Py_ssize_t size, 4948 const char *errors, 4949 int *byteorder, 4950 Py_ssize_t *consumed) 4951{ 4952 const char *starts = s; 4953 Py_ssize_t startinpos; 4954 Py_ssize_t endinpos; 4955 Py_ssize_t outpos; 4956 PyObject *unicode; 4957 const unsigned char *q, *e; 4958 int bo = 0; /* assume native ordering by default */ 4959 const char *errmsg = ""; 4960 /* Offsets from q for retrieving bytes in the right order. */ 4961#ifdef BYTEORDER_IS_LITTLE_ENDIAN 4962 int iorder[] = {0, 1, 2, 3}; 4963#else 4964 int iorder[] = {3, 2, 1, 0}; 4965#endif 4966 PyObject *errorHandler = NULL; 4967 PyObject *exc = NULL; 4968 4969 q = (unsigned char *)s; 4970 e = q + size; 4971 4972 if (byteorder) 4973 bo = *byteorder; 4974 4975 /* Check for BOM marks (U+FEFF) in the input and adjust current 4976 byte order setting accordingly. In native mode, the leading BOM 4977 mark is skipped, in all other modes, it is copied to the output 4978 stream as-is (giving a ZWNBSP character). 
*/ 4979 if (bo == 0) { 4980 if (size >= 4) { 4981 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) | 4982 (q[iorder[1]] << 8) | q[iorder[0]]; 4983#ifdef BYTEORDER_IS_LITTLE_ENDIAN 4984 if (bom == 0x0000FEFF) { 4985 q += 4; 4986 bo = -1; 4987 } 4988 else if (bom == 0xFFFE0000) { 4989 q += 4; 4990 bo = 1; 4991 } 4992#else 4993 if (bom == 0x0000FEFF) { 4994 q += 4; 4995 bo = 1; 4996 } 4997 else if (bom == 0xFFFE0000) { 4998 q += 4; 4999 bo = -1; 5000 } 5001#endif 5002 } 5003 } 5004 5005 if (bo == -1) { 5006 /* force LE */ 5007 iorder[0] = 0; 5008 iorder[1] = 1; 5009 iorder[2] = 2; 5010 iorder[3] = 3; 5011 } 5012 else if (bo == 1) { 5013 /* force BE */ 5014 iorder[0] = 3; 5015 iorder[1] = 2; 5016 iorder[2] = 1; 5017 iorder[3] = 0; 5018 } 5019 5020 /* This might be one to much, because of a BOM */ 5021 unicode = PyUnicode_New((size+3)/4, 127); 5022 if (!unicode) 5023 return NULL; 5024 if (size == 0) 5025 return unicode; 5026 outpos = 0; 5027 5028 while (q < e) { 5029 Py_UCS4 ch; 5030 /* remaining bytes at the end? 
(size should be divisible by 4) */ 5031 if (e-q<4) { 5032 if (consumed) 5033 break; 5034 errmsg = "truncated data"; 5035 startinpos = ((const char *)q)-starts; 5036 endinpos = ((const char *)e)-starts; 5037 goto utf32Error; 5038 /* The remaining input chars are ignored if the callback 5039 chooses to skip the input */ 5040 } 5041 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) | 5042 (q[iorder[1]] << 8) | q[iorder[0]]; 5043 5044 if (ch >= 0x110000) 5045 { 5046 errmsg = "codepoint not in range(0x110000)"; 5047 startinpos = ((const char *)q)-starts; 5048 endinpos = startinpos+4; 5049 goto utf32Error; 5050 } 5051 if (unicode_putchar(&unicode, &outpos, ch) < 0) 5052 goto onError; 5053 q += 4; 5054 continue; 5055 utf32Error: 5056 if (unicode_decode_call_errorhandler( 5057 errors, &errorHandler, 5058 "utf32", errmsg, 5059 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q, 5060 &unicode, &outpos)) 5061 goto onError; 5062 } 5063 5064 if (byteorder) 5065 *byteorder = bo; 5066 5067 if (consumed) 5068 *consumed = (const char *)q-starts; 5069 5070 /* Adjust length */ 5071 if (unicode_resize(&unicode, outpos) < 0) 5072 goto onError; 5073 5074 Py_XDECREF(errorHandler); 5075 Py_XDECREF(exc); 5076 return unicode_result(unicode); 5077 5078 onError: 5079 Py_DECREF(unicode); 5080 Py_XDECREF(errorHandler); 5081 Py_XDECREF(exc); 5082 return NULL; 5083} 5084 5085PyObject * 5086_PyUnicode_EncodeUTF32(PyObject *str, 5087 const char *errors, 5088 int byteorder) 5089{ 5090 int kind; 5091 void *data; 5092 Py_ssize_t len; 5093 PyObject *v; 5094 unsigned char *p; 5095 Py_ssize_t nsize, bytesize, i; 5096 /* Offsets from p for storing byte pairs in the right order. 
*/ 5097#ifdef BYTEORDER_IS_LITTLE_ENDIAN 5098 int iorder[] = {0, 1, 2, 3}; 5099#else 5100 int iorder[] = {3, 2, 1, 0}; 5101#endif 5102 5103#define STORECHAR(CH) \ 5104 do { \ 5105 p[iorder[3]] = ((CH) >> 24) & 0xff; \ 5106 p[iorder[2]] = ((CH) >> 16) & 0xff; \ 5107 p[iorder[1]] = ((CH) >> 8) & 0xff; \ 5108 p[iorder[0]] = (CH) & 0xff; \ 5109 p += 4; \ 5110 } while(0) 5111 5112 if (!PyUnicode_Check(str)) { 5113 PyErr_BadArgument(); 5114 return NULL; 5115 } 5116 if (PyUnicode_READY(str) == -1) 5117 return NULL; 5118 kind = PyUnicode_KIND(str); 5119 data = PyUnicode_DATA(str); 5120 len = PyUnicode_GET_LENGTH(str); 5121 5122 nsize = len + (byteorder == 0); 5123 bytesize = nsize * 4; 5124 if (bytesize / 4 != nsize) 5125 return PyErr_NoMemory(); 5126 v = PyBytes_FromStringAndSize(NULL, bytesize); 5127 if (v == NULL) 5128 return NULL; 5129 5130 p = (unsigned char *)PyBytes_AS_STRING(v); 5131 if (byteorder == 0) 5132 STORECHAR(0xFEFF); 5133 if (len == 0) 5134 goto done; 5135 5136 if (byteorder == -1) { 5137 /* force LE */ 5138 iorder[0] = 0; 5139 iorder[1] = 1; 5140 iorder[2] = 2; 5141 iorder[3] = 3; 5142 } 5143 else if (byteorder == 1) { 5144 /* force BE */ 5145 iorder[0] = 3; 5146 iorder[1] = 2; 5147 iorder[2] = 1; 5148 iorder[3] = 0; 5149 } 5150 5151 for (i = 0; i < len; i++) 5152 STORECHAR(PyUnicode_READ(kind, data, i)); 5153 5154 done: 5155 return v; 5156#undef STORECHAR 5157} 5158 5159PyObject * 5160PyUnicode_EncodeUTF32(const Py_UNICODE *s, 5161 Py_ssize_t size, 5162 const char *errors, 5163 int byteorder) 5164{ 5165 PyObject *result; 5166 PyObject *tmp = PyUnicode_FromUnicode(s, size); 5167 if (tmp == NULL) 5168 return NULL; 5169 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder); 5170 Py_DECREF(tmp); 5171 return result; 5172} 5173 5174PyObject * 5175PyUnicode_AsUTF32String(PyObject *unicode) 5176{ 5177 return _PyUnicode_EncodeUTF32(unicode, NULL, 0); 5178} 5179 5180/* --- UTF-16 Codec ------------------------------------------------------- */ 5181 5182PyObject 
* 5183PyUnicode_DecodeUTF16(const char *s, 5184 Py_ssize_t size, 5185 const char *errors, 5186 int *byteorder) 5187{ 5188 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL); 5189} 5190 5191PyObject * 5192PyUnicode_DecodeUTF16Stateful(const char *s, 5193 Py_ssize_t size, 5194 const char *errors, 5195 int *byteorder, 5196 Py_ssize_t *consumed) 5197{ 5198 const char *starts = s; 5199 Py_ssize_t startinpos; 5200 Py_ssize_t endinpos; 5201 Py_ssize_t outpos; 5202 PyObject *unicode; 5203 const unsigned char *q, *e; 5204 int bo = 0; /* assume native ordering by default */ 5205 int native_ordering; 5206 const char *errmsg = ""; 5207 PyObject *errorHandler = NULL; 5208 PyObject *exc = NULL; 5209 5210 q = (unsigned char *)s; 5211 e = q + size; 5212 5213 if (byteorder) 5214 bo = *byteorder; 5215 5216 /* Check for BOM marks (U+FEFF) in the input and adjust current 5217 byte order setting accordingly. In native mode, the leading BOM 5218 mark is skipped, in all other modes, it is copied to the output 5219 stream as-is (giving a ZWNBSP character). 
*/ 5220 if (bo == 0 && size >= 2) { 5221 const Py_UCS4 bom = (q[1] << 8) | q[0]; 5222 if (bom == 0xFEFF) { 5223 q += 2; 5224 bo = -1; 5225 } 5226 else if (bom == 0xFFFE) { 5227 q += 2; 5228 bo = 1; 5229 } 5230 if (byteorder) 5231 *byteorder = bo; 5232 } 5233 5234 if (q == e) { 5235 if (consumed) 5236 *consumed = size; 5237 Py_INCREF(unicode_empty); 5238 return unicode_empty; 5239 } 5240 5241#ifdef BYTEORDER_IS_LITTLE_ENDIAN 5242 native_ordering = bo <= 0; 5243#else 5244 native_ordering = bo >= 0; 5245#endif 5246 5247 /* Note: size will always be longer than the resulting Unicode 5248 character count */ 5249 unicode = PyUnicode_New((e - q + 1) / 2, 127); 5250 if (!unicode) 5251 return NULL; 5252 5253 outpos = 0; 5254 while (1) { 5255 Py_UCS4 ch = 0; 5256 if (e - q >= 2) { 5257 int kind = PyUnicode_KIND(unicode); 5258 if (kind == PyUnicode_1BYTE_KIND) { 5259 if (PyUnicode_IS_ASCII(unicode)) 5260 ch = asciilib_utf16_decode(&q, e, 5261 PyUnicode_1BYTE_DATA(unicode), &outpos, 5262 native_ordering); 5263 else 5264 ch = ucs1lib_utf16_decode(&q, e, 5265 PyUnicode_1BYTE_DATA(unicode), &outpos, 5266 native_ordering); 5267 } else if (kind == PyUnicode_2BYTE_KIND) { 5268 ch = ucs2lib_utf16_decode(&q, e, 5269 PyUnicode_2BYTE_DATA(unicode), &outpos, 5270 native_ordering); 5271 } else { 5272 assert(kind == PyUnicode_4BYTE_KIND); 5273 ch = ucs4lib_utf16_decode(&q, e, 5274 PyUnicode_4BYTE_DATA(unicode), &outpos, 5275 native_ordering); 5276 } 5277 } 5278 5279 switch (ch) 5280 { 5281 case 0: 5282 /* remaining byte at the end? 
(size should be even) */ 5283 if (q == e || consumed) 5284 goto End; 5285 errmsg = "truncated data"; 5286 startinpos = ((const char *)q) - starts; 5287 endinpos = ((const char *)e) - starts; 5288 break; 5289 /* The remaining input chars are ignored if the callback 5290 chooses to skip the input */ 5291 case 1: 5292 errmsg = "unexpected end of data"; 5293 startinpos = ((const char *)q) - 2 - starts; 5294 endinpos = ((const char *)e) - starts; 5295 break; 5296 case 2: 5297 errmsg = "illegal encoding"; 5298 startinpos = ((const char *)q) - 2 - starts; 5299 endinpos = startinpos + 2; 5300 break; 5301 case 3: 5302 errmsg = "illegal UTF-16 surrogate"; 5303 startinpos = ((const char *)q) - 4 - starts; 5304 endinpos = startinpos + 2; 5305 break; 5306 default: 5307 if (unicode_putchar(&unicode, &outpos, ch) < 0) 5308 goto onError; 5309 continue; 5310 } 5311 5312 if (unicode_decode_call_errorhandler( 5313 errors, 5314 &errorHandler, 5315 "utf16", errmsg, 5316 &starts, 5317 (const char **)&e, 5318 &startinpos, 5319 &endinpos, 5320 &exc, 5321 (const char **)&q, 5322 &unicode, 5323 &outpos)) 5324 goto onError; 5325 } 5326 5327End: 5328 if (consumed) 5329 *consumed = (const char *)q-starts; 5330 5331 /* Adjust length */ 5332 if (unicode_resize(&unicode, outpos) < 0) 5333 goto onError; 5334 5335 Py_XDECREF(errorHandler); 5336 Py_XDECREF(exc); 5337 return unicode_result(unicode); 5338 5339 onError: 5340 Py_DECREF(unicode); 5341 Py_XDECREF(errorHandler); 5342 Py_XDECREF(exc); 5343 return NULL; 5344} 5345 5346PyObject * 5347_PyUnicode_EncodeUTF16(PyObject *str, 5348 const char *errors, 5349 int byteorder) 5350{ 5351 enum PyUnicode_Kind kind; 5352 const void *data; 5353 Py_ssize_t len; 5354 PyObject *v; 5355 unsigned short *out; 5356 Py_ssize_t bytesize; 5357 Py_ssize_t pairs; 5358#ifdef WORDS_BIGENDIAN 5359 int native_ordering = byteorder >= 0; 5360#else 5361 int native_ordering = byteorder <= 0; 5362#endif 5363 5364 if (!PyUnicode_Check(str)) { 5365 PyErr_BadArgument(); 5366 return 
NULL; 5367 } 5368 if (PyUnicode_READY(str) == -1) 5369 return NULL; 5370 kind = PyUnicode_KIND(str); 5371 data = PyUnicode_DATA(str); 5372 len = PyUnicode_GET_LENGTH(str); 5373 5374 pairs = 0; 5375 if (kind == PyUnicode_4BYTE_KIND) { 5376 const Py_UCS4 *in = (const Py_UCS4 *)data; 5377 const Py_UCS4 *end = in + len; 5378 while (in < end) 5379 if (*in++ >= 0x10000) 5380 pairs++; 5381 } 5382 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0)) 5383 return PyErr_NoMemory(); 5384 bytesize = (len + pairs + (byteorder == 0)) * 2; 5385 v = PyBytes_FromStringAndSize(NULL, bytesize); 5386 if (v == NULL) 5387 return NULL; 5388 5389 /* output buffer is 2-bytes aligned */ 5390 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2)); 5391 out = (unsigned short *)PyBytes_AS_STRING(v); 5392 if (byteorder == 0) 5393 *out++ = 0xFEFF; 5394 if (len == 0) 5395 goto done; 5396 5397 switch (kind) { 5398 case PyUnicode_1BYTE_KIND: { 5399 ucs1lib_utf16_encode(out, (const Py_UCS1 *)data, len, native_ordering); 5400 break; 5401 } 5402 case PyUnicode_2BYTE_KIND: { 5403 ucs2lib_utf16_encode(out, (const Py_UCS2 *)data, len, native_ordering); 5404 break; 5405 } 5406 case PyUnicode_4BYTE_KIND: { 5407 ucs4lib_utf16_encode(out, (const Py_UCS4 *)data, len, native_ordering); 5408 break; 5409 } 5410 default: 5411 assert(0); 5412 } 5413 5414 done: 5415 return v; 5416} 5417 5418PyObject * 5419PyUnicode_EncodeUTF16(const Py_UNICODE *s, 5420 Py_ssize_t size, 5421 const char *errors, 5422 int byteorder) 5423{ 5424 PyObject *result; 5425 PyObject *tmp = PyUnicode_FromUnicode(s, size); 5426 if (tmp == NULL) 5427 return NULL; 5428 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder); 5429 Py_DECREF(tmp); 5430 return result; 5431} 5432 5433PyObject * 5434PyUnicode_AsUTF16String(PyObject *unicode) 5435{ 5436 return _PyUnicode_EncodeUTF16(unicode, NULL, 0); 5437} 5438 5439/* --- Unicode Escape Codec ----------------------------------------------- */ 5440 5441/* Helper function for PyUnicode_DecodeUnicodeEscape, 
determines 5442 if all the escapes in the string make it still a valid ASCII string. 5443 Returns -1 if any escapes were found which cause the string to 5444 pop out of ASCII range. Otherwise returns the length of the 5445 required buffer to hold the string. 5446 */ 5447static Py_ssize_t 5448length_of_escaped_ascii_string(const char *s, Py_ssize_t size) 5449{ 5450 const unsigned char *p = (const unsigned char *)s; 5451 const unsigned char *end = p + size; 5452 Py_ssize_t length = 0; 5453 5454 if (size < 0) 5455 return -1; 5456 5457 for (; p < end; ++p) { 5458 if (*p > 127) { 5459 /* Non-ASCII */ 5460 return -1; 5461 } 5462 else if (*p != '\\') { 5463 /* Normal character */ 5464 ++length; 5465 } 5466 else { 5467 /* Backslash-escape, check next char */ 5468 ++p; 5469 /* Escape sequence reaches till end of string or 5470 non-ASCII follow-up. */ 5471 if (p >= end || *p > 127) 5472 return -1; 5473 switch (*p) { 5474 case '\n': 5475 /* backslash + \n result in zero characters */ 5476 break; 5477 case '\\': case '\'': case '\"': 5478 case 'b': case 'f': case 't': 5479 case 'n': case 'r': case 'v': case 'a': 5480 ++length; 5481 break; 5482 case '0': case '1': case '2': case '3': 5483 case '4': case '5': case '6': case '7': 5484 case 'x': case 'u': case 'U': case 'N': 5485 /* these do not guarantee ASCII characters */ 5486 return -1; 5487 default: 5488 /* count the backslash + the other character */ 5489 length += 2; 5490 } 5491 } 5492 } 5493 return length; 5494} 5495 5496static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL; 5497 5498PyObject * 5499PyUnicode_DecodeUnicodeEscape(const char *s, 5500 Py_ssize_t size, 5501 const char *errors) 5502{ 5503 const char *starts = s; 5504 Py_ssize_t startinpos; 5505 Py_ssize_t endinpos; 5506 int j; 5507 PyObject *v; 5508 const char *end; 5509 char* message; 5510 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */ 5511 PyObject *errorHandler = NULL; 5512 PyObject *exc = NULL; 5513 Py_ssize_t len; 5514 Py_ssize_t i; 5515 5516 len = 
length_of_escaped_ascii_string(s, size); 5517 5518 /* After length_of_escaped_ascii_string() there are two alternatives, 5519 either the string is pure ASCII with named escapes like \n, etc. 5520 and we determined it's exact size (common case) 5521 or it contains \x, \u, ... escape sequences. then we create a 5522 legacy wchar string and resize it at the end of this function. */ 5523 if (len >= 0) { 5524 v = PyUnicode_New(len, 127); 5525 if (!v) 5526 goto onError; 5527 assert(PyUnicode_KIND(v) == PyUnicode_1BYTE_KIND); 5528 } 5529 else { 5530 /* Escaped strings will always be longer than the resulting 5531 Unicode string, so we start with size here and then reduce the 5532 length after conversion to the true value. 5533 (but if the error callback returns a long replacement string 5534 we'll have to allocate more space) */ 5535 v = PyUnicode_New(size, 127); 5536 if (!v) 5537 goto onError; 5538 len = size; 5539 } 5540 5541 if (size == 0) 5542 return v; 5543 i = 0; 5544 end = s + size; 5545 5546 while (s < end) { 5547 unsigned char c; 5548 Py_UCS4 x; 5549 int digits; 5550 5551 /* The only case in which i == ascii_length is a backslash 5552 followed by a newline. */ 5553 assert(i <= len); 5554 5555 /* Non-escape characters are interpreted as Unicode ordinals */ 5556 if (*s != '\\') { 5557 if (unicode_putchar(&v, &i, (unsigned char) *s++) < 0) 5558 goto onError; 5559 continue; 5560 } 5561 5562 startinpos = s-starts; 5563 /* \ - Escapes */ 5564 s++; 5565 c = *s++; 5566 if (s > end) 5567 c = '\0'; /* Invalid after \ */ 5568 5569 /* The only case in which i == ascii_length is a backslash 5570 followed by a newline. 
*/ 5571 assert(i < len || (i == len && c == '\n')); 5572 5573 switch (c) { 5574 5575 /* \x escapes */ 5576#define WRITECHAR(ch) \ 5577 do { \ 5578 if (unicode_putchar(&v, &i, ch) < 0) \ 5579 goto onError; \ 5580 }while(0) 5581 5582 case '\n': break; 5583 case '\\': WRITECHAR('\\'); break; 5584 case '\'': WRITECHAR('\''); break; 5585 case '\"': WRITECHAR('\"'); break; 5586 case 'b': WRITECHAR('\b'); break; 5587 /* FF */ 5588 case 'f': WRITECHAR('\014'); break; 5589 case 't': WRITECHAR('\t'); break; 5590 case 'n': WRITECHAR('\n'); break; 5591 case 'r': WRITECHAR('\r'); break; 5592 /* VT */ 5593 case 'v': WRITECHAR('\013'); break; 5594 /* BEL, not classic C */ 5595 case 'a': WRITECHAR('\007'); break; 5596 5597 /* \OOO (octal) escapes */ 5598 case '0': case '1': case '2': case '3': 5599 case '4': case '5': case '6': case '7': 5600 x = s[-1] - '0'; 5601 if (s < end && '0' <= *s && *s <= '7') { 5602 x = (x<<3) + *s++ - '0'; 5603 if (s < end && '0' <= *s && *s <= '7') 5604 x = (x<<3) + *s++ - '0'; 5605 } 5606 WRITECHAR(x); 5607 break; 5608 5609 /* hex escapes */ 5610 /* \xXX */ 5611 case 'x': 5612 digits = 2; 5613 message = "truncated \\xXX escape"; 5614 goto hexescape; 5615 5616 /* \uXXXX */ 5617 case 'u': 5618 digits = 4; 5619 message = "truncated \\uXXXX escape"; 5620 goto hexescape; 5621 5622 /* \UXXXXXXXX */ 5623 case 'U': 5624 digits = 8; 5625 message = "truncated \\UXXXXXXXX escape"; 5626 hexescape: 5627 chr = 0; 5628 if (s+digits>end) { 5629 endinpos = size; 5630 if (unicode_decode_call_errorhandler( 5631 errors, &errorHandler, 5632 "unicodeescape", "end of string in escape sequence", 5633 &starts, &end, &startinpos, &endinpos, &exc, &s, 5634 &v, &i)) 5635 goto onError; 5636 goto nextByte; 5637 } 5638 for (j = 0; j < digits; ++j) { 5639 c = (unsigned char) s[j]; 5640 if (!Py_ISXDIGIT(c)) { 5641 endinpos = (s+j+1)-starts; 5642 if (unicode_decode_call_errorhandler( 5643 errors, &errorHandler, 5644 "unicodeescape", message, 5645 &starts, &end, &startinpos, &endinpos, 
&exc, &s, 5646 &v, &i)) 5647 goto onError; 5648 len = PyUnicode_GET_LENGTH(v); 5649 goto nextByte; 5650 } 5651 chr = (chr<<4) & ~0xF; 5652 if (c >= '0' && c <= '9') 5653 chr += c - '0'; 5654 else if (c >= 'a' && c <= 'f') 5655 chr += 10 + c - 'a'; 5656 else 5657 chr += 10 + c - 'A'; 5658 } 5659 s += j; 5660 if (chr == 0xffffffff && PyErr_Occurred()) 5661 /* _decoding_error will have already written into the 5662 target buffer. */ 5663 break; 5664 store: 5665 /* when we get here, chr is a 32-bit unicode character */ 5666 if (chr <= MAX_UNICODE) { 5667 WRITECHAR(chr); 5668 } else { 5669 endinpos = s-starts; 5670 if (unicode_decode_call_errorhandler( 5671 errors, &errorHandler, 5672 "unicodeescape", "illegal Unicode character", 5673 &starts, &end, &startinpos, &endinpos, &exc, &s, 5674 &v, &i)) 5675 goto onError; 5676 } 5677 break; 5678 5679 /* \N{name} */ 5680 case 'N': 5681 message = "malformed \\N character escape"; 5682 if (ucnhash_CAPI == NULL) { 5683 /* load the unicode data module */ 5684 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import( 5685 PyUnicodeData_CAPSULE_NAME, 1); 5686 if (ucnhash_CAPI == NULL) 5687 goto ucnhashError; 5688 } 5689 if (*s == '{') { 5690 const char *start = s+1; 5691 /* look for the closing brace */ 5692 while (*s != '}' && s < end) 5693 s++; 5694 if (s > start && s < end && *s == '}') { 5695 /* found a name. 
look it up in the unicode database */ 5696 message = "unknown Unicode character name"; 5697 s++; 5698 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), 5699 &chr, 0)) 5700 goto store; 5701 } 5702 } 5703 endinpos = s-starts; 5704 if (unicode_decode_call_errorhandler( 5705 errors, &errorHandler, 5706 "unicodeescape", message, 5707 &starts, &end, &startinpos, &endinpos, &exc, &s, 5708 &v, &i)) 5709 goto onError; 5710 break; 5711 5712 default: 5713 if (s > end) { 5714 message = "\\ at end of string"; 5715 s--; 5716 endinpos = s-starts; 5717 if (unicode_decode_call_errorhandler( 5718 errors, &errorHandler, 5719 "unicodeescape", message, 5720 &starts, &end, &startinpos, &endinpos, &exc, &s, 5721 &v, &i)) 5722 goto onError; 5723 } 5724 else { 5725 WRITECHAR('\\'); 5726 WRITECHAR(s[-1]); 5727 } 5728 break; 5729 } 5730 nextByte: 5731 ; 5732 } 5733#undef WRITECHAR 5734 5735 if (unicode_resize(&v, i) < 0) 5736 goto onError; 5737 Py_XDECREF(errorHandler); 5738 Py_XDECREF(exc); 5739 return unicode_result(v); 5740 5741 ucnhashError: 5742 PyErr_SetString( 5743 PyExc_UnicodeError, 5744 "\\N escapes not supported (can't load unicodedata module)" 5745 ); 5746 Py_XDECREF(v); 5747 Py_XDECREF(errorHandler); 5748 Py_XDECREF(exc); 5749 return NULL; 5750 5751 onError: 5752 Py_XDECREF(v); 5753 Py_XDECREF(errorHandler); 5754 Py_XDECREF(exc); 5755 return NULL; 5756} 5757 5758/* Return a Unicode-Escape string version of the Unicode object. 5759 5760 If quotes is true, the string is enclosed in u"" or u'' quotes as 5761 appropriate. 5762 5763*/ 5764 5765PyObject * 5766PyUnicode_AsUnicodeEscapeString(PyObject *unicode) 5767{ 5768 Py_ssize_t i, len; 5769 PyObject *repr; 5770 char *p; 5771 int kind; 5772 void *data; 5773 Py_ssize_t expandsize = 0; 5774 5775 /* Initial allocation is based on the longest-possible unichr 5776 escape. 5777 5778 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source 5779 unichr, so in this case it's the longest unichr escape. 
In 5780 narrow (UTF-16) builds this is five chars per source unichr 5781 since there are two unichrs in the surrogate pair, so in narrow 5782 (UTF-16) builds it's not the longest unichr escape. 5783 5784 In wide or narrow builds '\uxxxx' is 6 chars per source unichr, 5785 so in the narrow (UTF-16) build case it's the longest unichr 5786 escape. 5787 */ 5788 5789 if (!PyUnicode_Check(unicode)) { 5790 PyErr_BadArgument(); 5791 return NULL; 5792 } 5793 if (PyUnicode_READY(unicode) == -1) 5794 return NULL; 5795 len = PyUnicode_GET_LENGTH(unicode); 5796 kind = PyUnicode_KIND(unicode); 5797 data = PyUnicode_DATA(unicode); 5798 switch (kind) { 5799 case PyUnicode_1BYTE_KIND: expandsize = 4; break; 5800 case PyUnicode_2BYTE_KIND: expandsize = 6; break; 5801 case PyUnicode_4BYTE_KIND: expandsize = 10; break; 5802 } 5803 5804 if (len == 0) 5805 return PyBytes_FromStringAndSize(NULL, 0); 5806 5807 if (len > (PY_SSIZE_T_MAX - 2 - 1) / expandsize) 5808 return PyErr_NoMemory(); 5809 5810 repr = PyBytes_FromStringAndSize(NULL, 5811 2 5812 + expandsize*len 5813 + 1); 5814 if (repr == NULL) 5815 return NULL; 5816 5817 p = PyBytes_AS_STRING(repr); 5818 5819 for (i = 0; i < len; i++) { 5820 Py_UCS4 ch = PyUnicode_READ(kind, data, i); 5821 5822 /* Escape backslashes */ 5823 if (ch == '\\') { 5824 *p++ = '\\'; 5825 *p++ = (char) ch; 5826 continue; 5827 } 5828 5829 /* Map 21-bit characters to '\U00xxxxxx' */ 5830 else if (ch >= 0x10000) { 5831 assert(ch <= MAX_UNICODE); 5832 *p++ = '\\'; 5833 *p++ = 'U'; 5834 *p++ = Py_hexdigits[(ch >> 28) & 0x0000000F]; 5835 *p++ = Py_hexdigits[(ch >> 24) & 0x0000000F]; 5836 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F]; 5837 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F]; 5838 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F]; 5839 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F]; 5840 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F]; 5841 *p++ = Py_hexdigits[ch & 0x0000000F]; 5842 continue; 5843 } 5844 5845 /* Map 16-bit characters to '\uxxxx' */ 5846 if (ch >= 256) { 
5847 *p++ = '\\'; 5848 *p++ = 'u'; 5849 *p++ = Py_hexdigits[(ch >> 12) & 0x000F]; 5850 *p++ = Py_hexdigits[(ch >> 8) & 0x000F]; 5851 *p++ = Py_hexdigits[(ch >> 4) & 0x000F]; 5852 *p++ = Py_hexdigits[ch & 0x000F]; 5853 } 5854 5855 /* Map special whitespace to '\t', \n', '\r' */ 5856 else if (ch == '\t') { 5857 *p++ = '\\'; 5858 *p++ = 't'; 5859 } 5860 else if (ch == '\n') { 5861 *p++ = '\\'; 5862 *p++ = 'n'; 5863 } 5864 else if (ch == '\r') { 5865 *p++ = '\\'; 5866 *p++ = 'r'; 5867 } 5868 5869 /* Map non-printable US ASCII to '\xhh' */ 5870 else if (ch < ' ' || ch >= 0x7F) { 5871 *p++ = '\\'; 5872 *p++ = 'x'; 5873 *p++ = Py_hexdigits[(ch >> 4) & 0x000F]; 5874 *p++ = Py_hexdigits[ch & 0x000F]; 5875 } 5876 5877 /* Copy everything else as-is */ 5878 else 5879 *p++ = (char) ch; 5880 } 5881 5882 assert(p - PyBytes_AS_STRING(repr) > 0); 5883 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) 5884 return NULL; 5885 return repr; 5886} 5887 5888PyObject * 5889PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s, 5890 Py_ssize_t size) 5891{ 5892 PyObject *result; 5893 PyObject *tmp = PyUnicode_FromUnicode(s, size); 5894 if (tmp == NULL) 5895 return NULL; 5896 result = PyUnicode_AsUnicodeEscapeString(tmp); 5897 Py_DECREF(tmp); 5898 return result; 5899} 5900 5901/* --- Raw Unicode Escape Codec ------------------------------------------- */ 5902 5903PyObject * 5904PyUnicode_DecodeRawUnicodeEscape(const char *s, 5905 Py_ssize_t size, 5906 const char *errors) 5907{ 5908 const char *starts = s; 5909 Py_ssize_t startinpos; 5910 Py_ssize_t endinpos; 5911 Py_ssize_t outpos; 5912 PyObject *v; 5913 const char *end; 5914 const char *bs; 5915 PyObject *errorHandler = NULL; 5916 PyObject *exc = NULL; 5917 5918 /* Escaped strings will always be longer than the resulting 5919 Unicode string, so we start with size here and then reduce the 5920 length after conversion to the true value. 
(But decoding error 5921 handler might have to resize the string) */ 5922 v = PyUnicode_New(size, 127); 5923 if (v == NULL) 5924 goto onError; 5925 if (size == 0) 5926 return v; 5927 outpos = 0; 5928 end = s + size; 5929 while (s < end) { 5930 unsigned char c; 5931 Py_UCS4 x; 5932 int i; 5933 int count; 5934 5935 /* Non-escape characters are interpreted as Unicode ordinals */ 5936 if (*s != '\\') { 5937 if (unicode_putchar(&v, &outpos, (unsigned char)*s++) < 0) 5938 goto onError; 5939 continue; 5940 } 5941 startinpos = s-starts; 5942 5943 /* \u-escapes are only interpreted iff the number of leading 5944 backslashes if odd */ 5945 bs = s; 5946 for (;s < end;) { 5947 if (*s != '\\') 5948 break; 5949 if (unicode_putchar(&v, &outpos, (unsigned char)*s++) < 0) 5950 goto onError; 5951 } 5952 if (((s - bs) & 1) == 0 || 5953 s >= end || 5954 (*s != 'u' && *s != 'U')) { 5955 continue; 5956 } 5957 outpos--; 5958 count = *s=='u' ? 4 : 8; 5959 s++; 5960 5961 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */ 5962 for (x = 0, i = 0; i < count; ++i, ++s) { 5963 c = (unsigned char)*s; 5964 if (!Py_ISXDIGIT(c)) { 5965 endinpos = s-starts; 5966 if (unicode_decode_call_errorhandler( 5967 errors, &errorHandler, 5968 "rawunicodeescape", "truncated \\uXXXX", 5969 &starts, &end, &startinpos, &endinpos, &exc, &s, 5970 &v, &outpos)) 5971 goto onError; 5972 goto nextByte; 5973 } 5974 x = (x<<4) & ~0xF; 5975 if (c >= '0' && c <= '9') 5976 x += c - '0'; 5977 else if (c >= 'a' && c <= 'f') 5978 x += 10 + c - 'a'; 5979 else 5980 x += 10 + c - 'A'; 5981 } 5982 if (x <= MAX_UNICODE) { 5983 if (unicode_putchar(&v, &outpos, x) < 0) 5984 goto onError; 5985 } else { 5986 endinpos = s-starts; 5987 if (unicode_decode_call_errorhandler( 5988 errors, &errorHandler, 5989 "rawunicodeescape", "\\Uxxxxxxxx out of range", 5990 &starts, &end, &startinpos, &endinpos, &exc, &s, 5991 &v, &outpos)) 5992 goto onError; 5993 } 5994 nextByte: 5995 ; 5996 } 5997 if (unicode_resize(&v, outpos) < 0) 5998 goto onError; 
5999 Py_XDECREF(errorHandler); 6000 Py_XDECREF(exc); 6001 return unicode_result(v); 6002 6003 onError: 6004 Py_XDECREF(v); 6005 Py_XDECREF(errorHandler); 6006 Py_XDECREF(exc); 6007 return NULL; 6008} 6009 6010 6011PyObject * 6012PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode) 6013{ 6014 PyObject *repr; 6015 char *p; 6016 char *q; 6017 Py_ssize_t expandsize, pos; 6018 int kind; 6019 void *data; 6020 Py_ssize_t len; 6021 6022 if (!PyUnicode_Check(unicode)) { 6023 PyErr_BadArgument(); 6024 return NULL; 6025 } 6026 if (PyUnicode_READY(unicode) == -1) 6027 return NULL; 6028 kind = PyUnicode_KIND(unicode); 6029 data = PyUnicode_DATA(unicode); 6030 len = PyUnicode_GET_LENGTH(unicode); 6031 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6 6032 bytes, and 1 byte characters 4. */ 6033 expandsize = kind * 2 + 2; 6034 6035 if (len > PY_SSIZE_T_MAX / expandsize) 6036 return PyErr_NoMemory(); 6037 6038 repr = PyBytes_FromStringAndSize(NULL, expandsize * len); 6039 if (repr == NULL) 6040 return NULL; 6041 if (len == 0) 6042 return repr; 6043 6044 p = q = PyBytes_AS_STRING(repr); 6045 for (pos = 0; pos < len; pos++) { 6046 Py_UCS4 ch = PyUnicode_READ(kind, data, pos); 6047 /* Map 32-bit characters to '\Uxxxxxxxx' */ 6048 if (ch >= 0x10000) { 6049 assert(ch <= MAX_UNICODE); 6050 *p++ = '\\'; 6051 *p++ = 'U'; 6052 *p++ = Py_hexdigits[(ch >> 28) & 0xf]; 6053 *p++ = Py_hexdigits[(ch >> 24) & 0xf]; 6054 *p++ = Py_hexdigits[(ch >> 20) & 0xf]; 6055 *p++ = Py_hexdigits[(ch >> 16) & 0xf]; 6056 *p++ = Py_hexdigits[(ch >> 12) & 0xf]; 6057 *p++ = Py_hexdigits[(ch >> 8) & 0xf]; 6058 *p++ = Py_hexdigits[(ch >> 4) & 0xf]; 6059 *p++ = Py_hexdigits[ch & 15]; 6060 } 6061 /* Map 16-bit characters to '\uxxxx' */ 6062 else if (ch >= 256) { 6063 *p++ = '\\'; 6064 *p++ = 'u'; 6065 *p++ = Py_hexdigits[(ch >> 12) & 0xf]; 6066 *p++ = Py_hexdigits[(ch >> 8) & 0xf]; 6067 *p++ = Py_hexdigits[(ch >> 4) & 0xf]; 6068 *p++ = Py_hexdigits[ch & 15]; 6069 } 6070 /* Copy everything 
else as-is */ 6071 else 6072 *p++ = (char) ch; 6073 } 6074 6075 assert(p > q); 6076 if (_PyBytes_Resize(&repr, p - q) < 0) 6077 return NULL; 6078 return repr; 6079} 6080 6081PyObject * 6082PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s, 6083 Py_ssize_t size) 6084{ 6085 PyObject *result; 6086 PyObject *tmp = PyUnicode_FromUnicode(s, size); 6087 if (tmp == NULL) 6088 return NULL; 6089 result = PyUnicode_AsRawUnicodeEscapeString(tmp); 6090 Py_DECREF(tmp); 6091 return result; 6092} 6093 6094/* --- Unicode Internal Codec ------------------------------------------- */ 6095 6096PyObject * 6097_PyUnicode_DecodeUnicodeInternal(const char *s, 6098 Py_ssize_t size, 6099 const char *errors) 6100{ 6101 const char *starts = s; 6102 Py_ssize_t startinpos; 6103 Py_ssize_t endinpos; 6104 Py_ssize_t outpos; 6105 PyObject *v; 6106 const char *end; 6107 const char *reason; 6108 PyObject *errorHandler = NULL; 6109 PyObject *exc = NULL; 6110 6111 if (PyErr_WarnEx(PyExc_DeprecationWarning, 6112 "unicode_internal codec has been deprecated", 6113 1)) 6114 return NULL; 6115 6116 /* XXX overflow detection missing */ 6117 v = PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE, 127); 6118 if (v == NULL) 6119 goto onError; 6120 if (PyUnicode_GET_LENGTH(v) == 0) 6121 return v; 6122 outpos = 0; 6123 end = s + size; 6124 6125 while (s < end) { 6126 Py_UNICODE uch; 6127 Py_UCS4 ch; 6128 /* We copy the raw representation one byte at a time because the 6129 pointer may be unaligned (see test_codeccallbacks). */ 6130 ((char *) &uch)[0] = s[0]; 6131 ((char *) &uch)[1] = s[1]; 6132#ifdef Py_UNICODE_WIDE 6133 ((char *) &uch)[2] = s[2]; 6134 ((char *) &uch)[3] = s[3]; 6135#endif 6136 ch = uch; 6137 6138 /* We have to sanity check the raw data, otherwise doom looms for 6139 some malformed UCS-4 data. 
*/ 6140 if ( 6141#ifdef Py_UNICODE_WIDE 6142 ch > 0x10ffff || 6143#endif 6144 end-s < Py_UNICODE_SIZE 6145 ) 6146 { 6147 startinpos = s - starts; 6148 if (end-s < Py_UNICODE_SIZE) { 6149 endinpos = end-starts; 6150 reason = "truncated input"; 6151 } 6152 else { 6153 endinpos = s - starts + Py_UNICODE_SIZE; 6154 reason = "illegal code point (> 0x10FFFF)"; 6155 } 6156 if (unicode_decode_call_errorhandler( 6157 errors, &errorHandler, 6158 "unicode_internal", reason, 6159 &starts, &end, &startinpos, &endinpos, &exc, &s, 6160 &v, &outpos)) 6161 goto onError; 6162 continue; 6163 } 6164 6165 s += Py_UNICODE_SIZE; 6166#ifndef Py_UNICODE_WIDE 6167 if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && s < end) 6168 { 6169 Py_UNICODE uch2; 6170 ((char *) &uch2)[0] = s[0]; 6171 ((char *) &uch2)[1] = s[1]; 6172 if (Py_UNICODE_IS_LOW_SURROGATE(uch2)) 6173 { 6174 ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2); 6175 s += Py_UNICODE_SIZE; 6176 } 6177 } 6178#endif 6179 6180 if (unicode_putchar(&v, &outpos, ch) < 0) 6181 goto onError; 6182 } 6183 6184 if (unicode_resize(&v, outpos) < 0) 6185 goto onError; 6186 Py_XDECREF(errorHandler); 6187 Py_XDECREF(exc); 6188 return unicode_result(v); 6189 6190 onError: 6191 Py_XDECREF(v); 6192 Py_XDECREF(errorHandler); 6193 Py_XDECREF(exc); 6194 return NULL; 6195} 6196 6197/* --- Latin-1 Codec ------------------------------------------------------ */ 6198 6199PyObject * 6200PyUnicode_DecodeLatin1(const char *s, 6201 Py_ssize_t size, 6202 const char *errors) 6203{ 6204 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. 
*/ 6205 return _PyUnicode_FromUCS1((unsigned char*)s, size); 6206} 6207 6208/* create or adjust a UnicodeEncodeError */ 6209static void 6210make_encode_exception(PyObject **exceptionObject, 6211 const char *encoding, 6212 PyObject *unicode, 6213 Py_ssize_t startpos, Py_ssize_t endpos, 6214 const char *reason) 6215{ 6216 if (*exceptionObject == NULL) { 6217 *exceptionObject = PyObject_CallFunction( 6218 PyExc_UnicodeEncodeError, "sOnns", 6219 encoding, unicode, startpos, endpos, reason); 6220 } 6221 else { 6222 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos)) 6223 goto onError; 6224 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos)) 6225 goto onError; 6226 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason)) 6227 goto onError; 6228 return; 6229 onError: 6230 Py_DECREF(*exceptionObject); 6231 *exceptionObject = NULL; 6232 } 6233} 6234 6235/* raises a UnicodeEncodeError */ 6236static void 6237raise_encode_exception(PyObject **exceptionObject, 6238 const char *encoding, 6239 PyObject *unicode, 6240 Py_ssize_t startpos, Py_ssize_t endpos, 6241 const char *reason) 6242{ 6243 make_encode_exception(exceptionObject, 6244 encoding, unicode, startpos, endpos, reason); 6245 if (*exceptionObject != NULL) 6246 PyCodec_StrictErrors(*exceptionObject); 6247} 6248 6249/* error handling callback helper: 6250 build arguments, call the callback and check the arguments, 6251 put the result into newpos and return the replacement string, which 6252 has to be freed by the caller */ 6253static PyObject * 6254unicode_encode_call_errorhandler(const char *errors, 6255 PyObject **errorHandler, 6256 const char *encoding, const char *reason, 6257 PyObject *unicode, PyObject **exceptionObject, 6258 Py_ssize_t startpos, Py_ssize_t endpos, 6259 Py_ssize_t *newpos) 6260{ 6261 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple"; 6262 Py_ssize_t len; 6263 PyObject *restuple; 6264 PyObject *resunicode; 6265 6266 if (*errorHandler == NULL) 
{ 6267 *errorHandler = PyCodec_LookupError(errors); 6268 if (*errorHandler == NULL) 6269 return NULL; 6270 } 6271 6272 if (PyUnicode_READY(unicode) == -1) 6273 return NULL; 6274 len = PyUnicode_GET_LENGTH(unicode); 6275 6276 make_encode_exception(exceptionObject, 6277 encoding, unicode, startpos, endpos, reason); 6278 if (*exceptionObject == NULL) 6279 return NULL; 6280 6281 restuple = PyObject_CallFunctionObjArgs( 6282 *errorHandler, *exceptionObject, NULL); 6283 if (restuple == NULL) 6284 return NULL; 6285 if (!PyTuple_Check(restuple)) { 6286 PyErr_SetString(PyExc_TypeError, &argparse[3]); 6287 Py_DECREF(restuple); 6288 return NULL; 6289 } 6290 if (!PyArg_ParseTuple(restuple, argparse, 6291 &resunicode, newpos)) { 6292 Py_DECREF(restuple); 6293 return NULL; 6294 } 6295 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) { 6296 PyErr_SetString(PyExc_TypeError, &argparse[3]); 6297 Py_DECREF(restuple); 6298 return NULL; 6299 } 6300 if (*newpos<0) 6301 *newpos = len + *newpos; 6302 if (*newpos<0 || *newpos>len) { 6303 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos); 6304 Py_DECREF(restuple); 6305 return NULL; 6306 } 6307 Py_INCREF(resunicode); 6308 Py_DECREF(restuple); 6309 return resunicode; 6310} 6311 6312static PyObject * 6313unicode_encode_ucs1(PyObject *unicode, 6314 const char *errors, 6315 unsigned int limit) 6316{ 6317 /* input state */ 6318 Py_ssize_t pos=0, size; 6319 int kind; 6320 void *data; 6321 /* output object */ 6322 PyObject *res; 6323 /* pointer into the output */ 6324 char *str; 6325 /* current output position */ 6326 Py_ssize_t ressize; 6327 const char *encoding = (limit == 256) ? "latin-1" : "ascii"; 6328 const char *reason = (limit == 256) ? 
"ordinal not in range(256)" : "ordinal not in range(128)"; 6329 PyObject *errorHandler = NULL; 6330 PyObject *exc = NULL; 6331 /* the following variable is used for caching string comparisons 6332 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */ 6333 int known_errorHandler = -1; 6334 6335 if (PyUnicode_READY(unicode) == -1) 6336 return NULL; 6337 size = PyUnicode_GET_LENGTH(unicode); 6338 kind = PyUnicode_KIND(unicode); 6339 data = PyUnicode_DATA(unicode); 6340 /* allocate enough for a simple encoding without 6341 replacements, if we need more, we'll resize */ 6342 if (size == 0) 6343 return PyBytes_FromStringAndSize(NULL, 0); 6344 res = PyBytes_FromStringAndSize(NULL, size); 6345 if (res == NULL) 6346 return NULL; 6347 str = PyBytes_AS_STRING(res); 6348 ressize = size; 6349 6350 while (pos < size) { 6351 Py_UCS4 c = PyUnicode_READ(kind, data, pos); 6352 6353 /* can we encode this? */ 6354 if (c<limit) { 6355 /* no overflow check, because we know that the space is enough */ 6356 *str++ = (char)c; 6357 ++pos; 6358 } 6359 else { 6360 Py_ssize_t requiredsize; 6361 PyObject *repunicode; 6362 Py_ssize_t repsize, newpos, respos, i; 6363 /* startpos for collecting unencodable chars */ 6364 Py_ssize_t collstart = pos; 6365 Py_ssize_t collend = pos; 6366 /* find all unecodable characters */ 6367 while ((collend < size) && (PyUnicode_READ(kind, data, collend)>=limit)) 6368 ++collend; 6369 /* cache callback name lookup (if not done yet, i.e. 
it's the first error) */ 6370 if (known_errorHandler==-1) { 6371 if ((errors==NULL) || (!strcmp(errors, "strict"))) 6372 known_errorHandler = 1; 6373 else if (!strcmp(errors, "replace")) 6374 known_errorHandler = 2; 6375 else if (!strcmp(errors, "ignore")) 6376 known_errorHandler = 3; 6377 else if (!strcmp(errors, "xmlcharrefreplace")) 6378 known_errorHandler = 4; 6379 else 6380 known_errorHandler = 0; 6381 } 6382 switch (known_errorHandler) { 6383 case 1: /* strict */ 6384 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason); 6385 goto onError; 6386 case 2: /* replace */ 6387 while (collstart++<collend) 6388 *str++ = '?'; /* fall through */ 6389 case 3: /* ignore */ 6390 pos = collend; 6391 break; 6392 case 4: /* xmlcharrefreplace */ 6393 respos = str - PyBytes_AS_STRING(res); 6394 /* determine replacement size */ 6395 for (i = collstart, repsize = 0; i < collend; ++i) { 6396 Py_UCS4 ch = PyUnicode_READ(kind, data, i); 6397 if (ch < 10) 6398 repsize += 2+1+1; 6399 else if (ch < 100) 6400 repsize += 2+2+1; 6401 else if (ch < 1000) 6402 repsize += 2+3+1; 6403 else if (ch < 10000) 6404 repsize += 2+4+1; 6405 else if (ch < 100000) 6406 repsize += 2+5+1; 6407 else if (ch < 1000000) 6408 repsize += 2+6+1; 6409 else { 6410 assert(ch <= MAX_UNICODE); 6411 repsize += 2+7+1; 6412 } 6413 } 6414 requiredsize = respos+repsize+(size-collend); 6415 if (requiredsize > ressize) { 6416 if (requiredsize<2*ressize) 6417 requiredsize = 2*ressize; 6418 if (_PyBytes_Resize(&res, requiredsize)) 6419 goto onError; 6420 str = PyBytes_AS_STRING(res) + respos; 6421 ressize = requiredsize; 6422 } 6423 /* generate replacement */ 6424 for (i = collstart; i < collend; ++i) { 6425 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i)); 6426 } 6427 pos = collend; 6428 break; 6429 default: 6430 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler, 6431 encoding, reason, unicode, &exc, 6432 collstart, collend, &newpos); 6433 if (repunicode == NULL || 
(PyUnicode_Check(repunicode) && 6434 PyUnicode_READY(repunicode) == -1)) 6435 goto onError; 6436 if (PyBytes_Check(repunicode)) { 6437 /* Directly copy bytes result to output. */ 6438 repsize = PyBytes_Size(repunicode); 6439 if (repsize > 1) { 6440 /* Make room for all additional bytes. */ 6441 respos = str - PyBytes_AS_STRING(res); 6442 if (_PyBytes_Resize(&res, ressize+repsize-1)) { 6443 Py_DECREF(repunicode); 6444 goto onError; 6445 } 6446 str = PyBytes_AS_STRING(res) + respos; 6447 ressize += repsize-1; 6448 } 6449 memcpy(str, PyBytes_AsString(repunicode), repsize); 6450 str += repsize; 6451 pos = newpos; 6452 Py_DECREF(repunicode); 6453 break; 6454 } 6455 /* need more space? (at least enough for what we 6456 have+the replacement+the rest of the string, so 6457 we won't have to check space for encodable characters) */ 6458 respos = str - PyBytes_AS_STRING(res); 6459 repsize = PyUnicode_GET_LENGTH(repunicode); 6460 requiredsize = respos+repsize+(size-collend); 6461 if (requiredsize > ressize) { 6462 if (requiredsize<2*ressize) 6463 requiredsize = 2*ressize; 6464 if (_PyBytes_Resize(&res, requiredsize)) { 6465 Py_DECREF(repunicode); 6466 goto onError; 6467 } 6468 str = PyBytes_AS_STRING(res) + respos; 6469 ressize = requiredsize; 6470 } 6471 /* check if there is anything unencodable in the replacement 6472 and copy it to the output */ 6473 for (i = 0; repsize-->0; ++i, ++str) { 6474 c = PyUnicode_READ_CHAR(repunicode, i); 6475 if (c >= limit) { 6476 raise_encode_exception(&exc, encoding, unicode, 6477 pos, pos+1, reason); 6478 Py_DECREF(repunicode); 6479 goto onError; 6480 } 6481 *str = (char)c; 6482 } 6483 pos = newpos; 6484 Py_DECREF(repunicode); 6485 } 6486 } 6487 } 6488 /* Resize if we allocated to much */ 6489 size = str - PyBytes_AS_STRING(res); 6490 if (size < ressize) { /* If this falls res will be NULL */ 6491 assert(size >= 0); 6492 if (_PyBytes_Resize(&res, size) < 0) 6493 goto onError; 6494 } 6495 6496 Py_XDECREF(errorHandler); 6497 Py_XDECREF(exc); 
6498 return res; 6499 6500 onError: 6501 Py_XDECREF(res); 6502 Py_XDECREF(errorHandler); 6503 Py_XDECREF(exc); 6504 return NULL; 6505} 6506 6507/* Deprecated */ 6508PyObject * 6509PyUnicode_EncodeLatin1(const Py_UNICODE *p, 6510 Py_ssize_t size, 6511 const char *errors) 6512{ 6513 PyObject *result; 6514 PyObject *unicode = PyUnicode_FromUnicode(p, size); 6515 if (unicode == NULL) 6516 return NULL; 6517 result = unicode_encode_ucs1(unicode, errors, 256); 6518 Py_DECREF(unicode); 6519 return result; 6520} 6521 6522PyObject * 6523_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors) 6524{ 6525 if (!PyUnicode_Check(unicode)) { 6526 PyErr_BadArgument(); 6527 return NULL; 6528 } 6529 if (PyUnicode_READY(unicode) == -1) 6530 return NULL; 6531 /* Fast path: if it is a one-byte string, construct 6532 bytes object directly. */ 6533 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) 6534 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode), 6535 PyUnicode_GET_LENGTH(unicode)); 6536 /* Non-Latin-1 characters present. Defer to above function to 6537 raise the exception. */ 6538 return unicode_encode_ucs1(unicode, errors, 256); 6539} 6540 6541PyObject* 6542PyUnicode_AsLatin1String(PyObject *unicode) 6543{ 6544 return _PyUnicode_AsLatin1String(unicode, NULL); 6545} 6546 6547/* --- 7-bit ASCII Codec -------------------------------------------------- */ 6548 6549PyObject * 6550PyUnicode_DecodeASCII(const char *s, 6551 Py_ssize_t size, 6552 const char *errors) 6553{ 6554 const char *starts = s; 6555 PyObject *unicode; 6556 int kind; 6557 void *data; 6558 Py_ssize_t startinpos; 6559 Py_ssize_t endinpos; 6560 Py_ssize_t outpos; 6561 const char *e; 6562 PyObject *errorHandler = NULL; 6563 PyObject *exc = NULL; 6564 6565 if (size == 0) { 6566 Py_INCREF(unicode_empty); 6567 return unicode_empty; 6568 } 6569 6570 /* ASCII is equivalent to the first 128 ordinals in Unicode. 
*/ 6571 if (size == 1 && (unsigned char)s[0] < 128) 6572 return get_latin1_char((unsigned char)s[0]); 6573 6574 unicode = PyUnicode_New(size, 127); 6575 if (unicode == NULL) 6576 goto onError; 6577 6578 e = s + size; 6579 data = PyUnicode_1BYTE_DATA(unicode); 6580 outpos = ascii_decode(s, e, (Py_UCS1 *)data); 6581 if (outpos == size) 6582 return unicode; 6583 6584 s += outpos; 6585 kind = PyUnicode_1BYTE_KIND; 6586 while (s < e) { 6587 register unsigned char c = (unsigned char)*s; 6588 if (c < 128) { 6589 PyUnicode_WRITE(kind, data, outpos++, c); 6590 ++s; 6591 } 6592 else { 6593 startinpos = s-starts; 6594 endinpos = startinpos + 1; 6595 if (unicode_decode_call_errorhandler( 6596 errors, &errorHandler, 6597 "ascii", "ordinal not in range(128)", 6598 &starts, &e, &startinpos, &endinpos, &exc, &s, 6599 &unicode, &outpos)) 6600 goto onError; 6601 kind = PyUnicode_KIND(unicode); 6602 data = PyUnicode_DATA(unicode); 6603 } 6604 } 6605 if (unicode_resize(&unicode, outpos) < 0) 6606 goto onError; 6607 Py_XDECREF(errorHandler); 6608 Py_XDECREF(exc); 6609 assert(_PyUnicode_CheckConsistency(unicode, 1)); 6610 return unicode; 6611 6612 onError: 6613 Py_XDECREF(unicode); 6614 Py_XDECREF(errorHandler); 6615 Py_XDECREF(exc); 6616 return NULL; 6617} 6618 6619/* Deprecated */ 6620PyObject * 6621PyUnicode_EncodeASCII(const Py_UNICODE *p, 6622 Py_ssize_t size, 6623 const char *errors) 6624{ 6625 PyObject *result; 6626 PyObject *unicode = PyUnicode_FromUnicode(p, size); 6627 if (unicode == NULL) 6628 return NULL; 6629 result = unicode_encode_ucs1(unicode, errors, 128); 6630 Py_DECREF(unicode); 6631 return result; 6632} 6633 6634PyObject * 6635_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors) 6636{ 6637 if (!PyUnicode_Check(unicode)) { 6638 PyErr_BadArgument(); 6639 return NULL; 6640 } 6641 if (PyUnicode_READY(unicode) == -1) 6642 return NULL; 6643 /* Fast path: if it is an ASCII-only string, construct bytes object 6644 directly. 
Else defer to above function to raise the exception. */ 6645 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128) 6646 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode), 6647 PyUnicode_GET_LENGTH(unicode)); 6648 return unicode_encode_ucs1(unicode, errors, 128); 6649} 6650 6651PyObject * 6652PyUnicode_AsASCIIString(PyObject *unicode) 6653{ 6654 return _PyUnicode_AsASCIIString(unicode, NULL); 6655} 6656 6657#ifdef HAVE_MBCS 6658 6659/* --- MBCS codecs for Windows -------------------------------------------- */ 6660 6661#if SIZEOF_INT < SIZEOF_SIZE_T 6662#define NEED_RETRY 6663#endif 6664 6665#ifndef WC_ERR_INVALID_CHARS 6666# define WC_ERR_INVALID_CHARS 0x0080 6667#endif 6668 6669static char* 6670code_page_name(UINT code_page, PyObject **obj) 6671{ 6672 *obj = NULL; 6673 if (code_page == CP_ACP) 6674 return "mbcs"; 6675 if (code_page == CP_UTF7) 6676 return "CP_UTF7"; 6677 if (code_page == CP_UTF8) 6678 return "CP_UTF8"; 6679 6680 *obj = PyBytes_FromFormat("cp%u", code_page); 6681 if (*obj == NULL) 6682 return NULL; 6683 return PyBytes_AS_STRING(*obj); 6684} 6685 6686static int 6687is_dbcs_lead_byte(UINT code_page, const char *s, int offset) 6688{ 6689 const char *curr = s + offset; 6690 const char *prev; 6691 6692 if (!IsDBCSLeadByteEx(code_page, *curr)) 6693 return 0; 6694 6695 prev = CharPrevExA(code_page, s, curr, 0); 6696 if (prev == curr) 6697 return 1; 6698 /* FIXME: This code is limited to "true" double-byte encodings, 6699 as it assumes an incomplete character consists of a single 6700 byte. */ 6701 if (curr - prev == 2) 6702 return 1; 6703 if (!IsDBCSLeadByteEx(code_page, *prev)) 6704 return 1; 6705 return 0; 6706} 6707 6708static DWORD 6709decode_code_page_flags(UINT code_page) 6710{ 6711 if (code_page == CP_UTF7) { 6712 /* The CP_UTF7 decoder only supports flags=0 */ 6713 return 0; 6714 } 6715 else 6716 return MB_ERR_INVALID_CHARS; 6717} 6718 6719/* 6720 * Decode a byte string from a Windows code page into unicode object in strict 6721 * mode. 
6722 * 6723 * Returns consumed size if succeed, returns -2 on decode error, or raise a 6724 * WindowsError and returns -1 on other error. 6725 */ 6726static int 6727decode_code_page_strict(UINT code_page, 6728 PyObject **v, 6729 const char *in, 6730 int insize) 6731{ 6732 const DWORD flags = decode_code_page_flags(code_page); 6733 wchar_t *out; 6734 DWORD outsize; 6735 6736 /* First get the size of the result */ 6737 assert(insize > 0); 6738 outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0); 6739 if (outsize <= 0) 6740 goto error; 6741 6742 if (*v == NULL) { 6743 /* Create unicode object */ 6744 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */ 6745 *v = (PyObject*)_PyUnicode_New(outsize); 6746 if (*v == NULL) 6747 return -1; 6748 out = PyUnicode_AS_UNICODE(*v); 6749 } 6750 else { 6751 /* Extend unicode object */ 6752 Py_ssize_t n = PyUnicode_GET_SIZE(*v); 6753 if (unicode_resize(v, n + outsize) < 0) 6754 return -1; 6755 out = PyUnicode_AS_UNICODE(*v) + n; 6756 } 6757 6758 /* Do the conversion */ 6759 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize); 6760 if (outsize <= 0) 6761 goto error; 6762 return insize; 6763 6764error: 6765 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION) 6766 return -2; 6767 PyErr_SetFromWindowsErr(0); 6768 return -1; 6769} 6770 6771/* 6772 * Decode a byte string from a code page into unicode object with an error 6773 * handler. 6774 * 6775 * Returns consumed size if succeed, or raise a WindowsError or 6776 * UnicodeDecodeError exception and returns -1 on error. 6777 */ 6778static int 6779decode_code_page_errors(UINT code_page, 6780 PyObject **v, 6781 const char *in, const int size, 6782 const char *errors) 6783{ 6784 const char *startin = in; 6785 const char *endin = in + size; 6786 const DWORD flags = decode_code_page_flags(code_page); 6787 /* Ideally, we should get reason from FormatMessage. This is the Windows 6788 2000 English version of the message. 
*/ 6789 const char *reason = "No mapping for the Unicode character exists " 6790 "in the target code page."; 6791 /* each step cannot decode more than 1 character, but a character can be 6792 represented as a surrogate pair */ 6793 wchar_t buffer[2], *startout, *out; 6794 int insize, outsize; 6795 PyObject *errorHandler = NULL; 6796 PyObject *exc = NULL; 6797 PyObject *encoding_obj = NULL; 6798 char *encoding; 6799 DWORD err; 6800 int ret = -1; 6801 6802 assert(size > 0); 6803 6804 encoding = code_page_name(code_page, &encoding_obj); 6805 if (encoding == NULL) 6806 return -1; 6807 6808 if (errors == NULL || strcmp(errors, "strict") == 0) { 6809 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a 6810 UnicodeDecodeError. */ 6811 make_decode_exception(&exc, encoding, in, size, 0, 0, reason); 6812 if (exc != NULL) { 6813 PyCodec_StrictErrors(exc); 6814 Py_CLEAR(exc); 6815 } 6816 goto error; 6817 } 6818 6819 if (*v == NULL) { 6820 /* Create unicode object */ 6821 if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) { 6822 PyErr_NoMemory(); 6823 goto error; 6824 } 6825 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */ 6826 *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer)); 6827 if (*v == NULL) 6828 goto error; 6829 startout = PyUnicode_AS_UNICODE(*v); 6830 } 6831 else { 6832 /* Extend unicode object */ 6833 Py_ssize_t n = PyUnicode_GET_SIZE(*v); 6834 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) { 6835 PyErr_NoMemory(); 6836 goto error; 6837 } 6838 if (unicode_resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0) 6839 goto error; 6840 startout = PyUnicode_AS_UNICODE(*v) + n; 6841 } 6842 6843 /* Decode the byte string character per character */ 6844 out = startout; 6845 while (in < endin) 6846 { 6847 /* Decode a character */ 6848 insize = 1; 6849 do 6850 { 6851 outsize = MultiByteToWideChar(code_page, flags, 6852 in, insize, 6853 buffer, Py_ARRAY_LENGTH(buffer)); 6854 if (outsize > 0) 
6855 break; 6856 err = GetLastError(); 6857 if (err != ERROR_NO_UNICODE_TRANSLATION 6858 && err != ERROR_INSUFFICIENT_BUFFER) 6859 { 6860 PyErr_SetFromWindowsErr(0); 6861 goto error; 6862 } 6863 insize++; 6864 } 6865 /* 4=maximum length of a UTF-8 sequence */ 6866 while (insize <= 4 && (in + insize) <= endin); 6867 6868 if (outsize <= 0) { 6869 Py_ssize_t startinpos, endinpos, outpos; 6870 6871 startinpos = in - startin; 6872 endinpos = startinpos + 1; 6873 outpos = out - PyUnicode_AS_UNICODE(*v); 6874 if (unicode_decode_call_errorhandler( 6875 errors, &errorHandler, 6876 encoding, reason, 6877 &startin, &endin, &startinpos, &endinpos, &exc, &in, 6878 v, &outpos)) 6879 { 6880 goto error; 6881 } 6882 out = PyUnicode_AS_UNICODE(*v) + outpos; 6883 } 6884 else { 6885 in += insize; 6886 memcpy(out, buffer, outsize * sizeof(wchar_t)); 6887 out += outsize; 6888 } 6889 } 6890 6891 /* write a NUL character at the end */ 6892 *out = 0; 6893 6894 /* Extend unicode object */ 6895 outsize = out - startout; 6896 assert(outsize <= PyUnicode_WSTR_LENGTH(*v)); 6897 if (unicode_resize(v, outsize) < 0) 6898 goto error; 6899 ret = size; 6900 6901error: 6902 Py_XDECREF(encoding_obj); 6903 Py_XDECREF(errorHandler); 6904 Py_XDECREF(exc); 6905 return ret; 6906} 6907 6908static PyObject * 6909decode_code_page_stateful(int code_page, 6910 const char *s, Py_ssize_t size, 6911 const char *errors, Py_ssize_t *consumed) 6912{ 6913 PyObject *v = NULL; 6914 int chunk_size, final, converted, done; 6915 6916 if (code_page < 0) { 6917 PyErr_SetString(PyExc_ValueError, "invalid code page number"); 6918 return NULL; 6919 } 6920 6921 if (consumed) 6922 *consumed = 0; 6923 6924 do 6925 { 6926#ifdef NEED_RETRY 6927 if (size > INT_MAX) { 6928 chunk_size = INT_MAX; 6929 final = 0; 6930 done = 0; 6931 } 6932 else 6933#endif 6934 { 6935 chunk_size = (int)size; 6936 final = (consumed == NULL); 6937 done = 1; 6938 } 6939 6940 /* Skip trailing lead-byte unless 'final' is set */ 6941 if (!final && 
is_dbcs_lead_byte(code_page, s, chunk_size - 1)) 6942 --chunk_size; 6943 6944 if (chunk_size == 0 && done) { 6945 if (v != NULL) 6946 break; 6947 Py_INCREF(unicode_empty); 6948 return unicode_empty; 6949 } 6950 6951 6952 converted = decode_code_page_strict(code_page, &v, 6953 s, chunk_size); 6954 if (converted == -2) 6955 converted = decode_code_page_errors(code_page, &v, 6956 s, chunk_size, 6957 errors); 6958 assert(converted != 0); 6959 6960 if (converted < 0) { 6961 Py_XDECREF(v); 6962 return NULL; 6963 } 6964 6965 if (consumed) 6966 *consumed += converted; 6967 6968 s += converted; 6969 size -= converted; 6970 } while (!done); 6971 6972 return unicode_result(v); 6973} 6974 6975PyObject * 6976PyUnicode_DecodeCodePageStateful(int code_page, 6977 const char *s, 6978 Py_ssize_t size, 6979 const char *errors, 6980 Py_ssize_t *consumed) 6981{ 6982 return decode_code_page_stateful(code_page, s, size, errors, consumed); 6983} 6984 6985PyObject * 6986PyUnicode_DecodeMBCSStateful(const char *s, 6987 Py_ssize_t size, 6988 const char *errors, 6989 Py_ssize_t *consumed) 6990{ 6991 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed); 6992} 6993 6994PyObject * 6995PyUnicode_DecodeMBCS(const char *s, 6996 Py_ssize_t size, 6997 const char *errors) 6998{ 6999 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL); 7000} 7001 7002static DWORD 7003encode_code_page_flags(UINT code_page, const char *errors) 7004{ 7005 if (code_page == CP_UTF8) { 7006 if (winver.dwMajorVersion >= 6) 7007 /* CP_UTF8 supports WC_ERR_INVALID_CHARS on Windows Vista 7008 and later */ 7009 return WC_ERR_INVALID_CHARS; 7010 else 7011 /* CP_UTF8 only supports flags=0 on Windows older than Vista */ 7012 return 0; 7013 } 7014 else if (code_page == CP_UTF7) { 7015 /* CP_UTF7 only supports flags=0 */ 7016 return 0; 7017 } 7018 else { 7019 if (errors != NULL && strcmp(errors, "replace") == 0) 7020 return 0; 7021 else 7022 return WC_NO_BEST_FIT_CHARS; 7023 } 7024} 7025 7026/* 7027 * Encode a 
Unicode string to a Windows code page into a byte string in strict 7028 * mode. 7029 * 7030 * Returns consumed characters if succeed, returns -2 on encode error, or raise 7031 * a WindowsError and returns -1 on other error. 7032 */ 7033static int 7034encode_code_page_strict(UINT code_page, PyObject **outbytes, 7035 PyObject *unicode, Py_ssize_t offset, int len, 7036 const char* errors) 7037{ 7038 BOOL usedDefaultChar = FALSE; 7039 BOOL *pusedDefaultChar = &usedDefaultChar; 7040 int outsize; 7041 PyObject *exc = NULL; 7042 wchar_t *p; 7043 Py_ssize_t size; 7044 const DWORD flags = encode_code_page_flags(code_page, NULL); 7045 char *out; 7046 /* Create a substring so that we can get the UTF-16 representation 7047 of just the slice under consideration. */ 7048 PyObject *substring; 7049 7050 assert(len > 0); 7051 7052 if (code_page != CP_UTF8 && code_page != CP_UTF7) 7053 pusedDefaultChar = &usedDefaultChar; 7054 else 7055 pusedDefaultChar = NULL; 7056 7057 substring = PyUnicode_Substring(unicode, offset, offset+len); 7058 if (substring == NULL) 7059 return -1; 7060 p = PyUnicode_AsUnicodeAndSize(substring, &size); 7061 if (p == NULL) { 7062 Py_DECREF(substring); 7063 return -1; 7064 } 7065 7066 /* First get the size of the result */ 7067 outsize = WideCharToMultiByte(code_page, flags, 7068 p, size, 7069 NULL, 0, 7070 NULL, pusedDefaultChar); 7071 if (outsize <= 0) 7072 goto error; 7073 /* If we used a default char, then we failed! 
*/ 7074 if (pusedDefaultChar && *pusedDefaultChar) { 7075 Py_DECREF(substring); 7076 return -2; 7077 } 7078 7079 if (*outbytes == NULL) { 7080 /* Create string object */ 7081 *outbytes = PyBytes_FromStringAndSize(NULL, outsize); 7082 if (*outbytes == NULL) { 7083 Py_DECREF(substring); 7084 return -1; 7085 } 7086 out = PyBytes_AS_STRING(*outbytes); 7087 } 7088 else { 7089 /* Extend string object */ 7090 const Py_ssize_t n = PyBytes_Size(*outbytes); 7091 if (outsize > PY_SSIZE_T_MAX - n) { 7092 PyErr_NoMemory(); 7093 Py_DECREF(substring); 7094 return -1; 7095 } 7096 if (_PyBytes_Resize(outbytes, n + outsize) < 0) { 7097 Py_DECREF(substring); 7098 return -1; 7099 } 7100 out = PyBytes_AS_STRING(*outbytes) + n; 7101 } 7102 7103 /* Do the conversion */ 7104 outsize = WideCharToMultiByte(code_page, flags, 7105 p, size, 7106 out, outsize, 7107 NULL, pusedDefaultChar); 7108 Py_CLEAR(substring); 7109 if (outsize <= 0) 7110 goto error; 7111 if (pusedDefaultChar && *pusedDefaultChar) 7112 return -2; 7113 return 0; 7114 7115error: 7116 Py_XDECREF(substring); 7117 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION) 7118 return -2; 7119 PyErr_SetFromWindowsErr(0); 7120 return -1; 7121} 7122 7123/* 7124 * Encode a Unicode string to a Windows code page into a byte string using a 7125 * error handler. 7126 * 7127 * Returns consumed characters if succeed, or raise a WindowsError and returns 7128 * -1 on other error. 7129 */ 7130static int 7131encode_code_page_errors(UINT code_page, PyObject **outbytes, 7132 PyObject *unicode, Py_ssize_t unicode_offset, 7133 Py_ssize_t insize, const char* errors) 7134{ 7135 const DWORD flags = encode_code_page_flags(code_page, errors); 7136 Py_ssize_t pos = unicode_offset; 7137 Py_ssize_t endin = unicode_offset + insize; 7138 /* Ideally, we should get reason from FormatMessage. This is the Windows 7139 2000 English version of the message. 
*/ 7140 const char *reason = "invalid character"; 7141 /* 4=maximum length of a UTF-8 sequence */ 7142 char buffer[4]; 7143 BOOL usedDefaultChar = FALSE, *pusedDefaultChar; 7144 Py_ssize_t outsize; 7145 char *out; 7146 PyObject *errorHandler = NULL; 7147 PyObject *exc = NULL; 7148 PyObject *encoding_obj = NULL; 7149 char *encoding; 7150 Py_ssize_t newpos, newoutsize; 7151 PyObject *rep; 7152 int ret = -1; 7153 7154 assert(insize > 0); 7155 7156 encoding = code_page_name(code_page, &encoding_obj); 7157 if (encoding == NULL) 7158 return -1; 7159 7160 if (errors == NULL || strcmp(errors, "strict") == 0) { 7161 /* The last error was ERROR_NO_UNICODE_TRANSLATION, 7162 then we raise a UnicodeEncodeError. */ 7163 make_encode_exception(&exc, encoding, unicode, 0, 0, reason); 7164 if (exc != NULL) { 7165 PyCodec_StrictErrors(exc); 7166 Py_DECREF(exc); 7167 } 7168 Py_XDECREF(encoding_obj); 7169 return -1; 7170 } 7171 7172 if (code_page != CP_UTF8 && code_page != CP_UTF7) 7173 pusedDefaultChar = &usedDefaultChar; 7174 else 7175 pusedDefaultChar = NULL; 7176 7177 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) { 7178 PyErr_NoMemory(); 7179 goto error; 7180 } 7181 outsize = insize * Py_ARRAY_LENGTH(buffer); 7182 7183 if (*outbytes == NULL) { 7184 /* Create string object */ 7185 *outbytes = PyBytes_FromStringAndSize(NULL, outsize); 7186 if (*outbytes == NULL) 7187 goto error; 7188 out = PyBytes_AS_STRING(*outbytes); 7189 } 7190 else { 7191 /* Extend string object */ 7192 Py_ssize_t n = PyBytes_Size(*outbytes); 7193 if (n > PY_SSIZE_T_MAX - outsize) { 7194 PyErr_NoMemory(); 7195 goto error; 7196 } 7197 if (_PyBytes_Resize(outbytes, n + outsize) < 0) 7198 goto error; 7199 out = PyBytes_AS_STRING(*outbytes) + n; 7200 } 7201 7202 /* Encode the string character per character */ 7203 while (pos < endin) 7204 { 7205 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos); 7206 wchar_t chars[2]; 7207 int charsize; 7208 if (ch < 0x10000) { 7209 chars[0] = (wchar_t)ch; 7210 charsize = 1; 
7211 } 7212 else { 7213 ch -= 0x10000; 7214 chars[0] = 0xd800 + (ch >> 10); 7215 chars[1] = 0xdc00 + (ch & 0x3ff); 7216 charsize = 2; 7217 } 7218 7219 outsize = WideCharToMultiByte(code_page, flags, 7220 chars, charsize, 7221 buffer, Py_ARRAY_LENGTH(buffer), 7222 NULL, pusedDefaultChar); 7223 if (outsize > 0) { 7224 if (pusedDefaultChar == NULL || !(*pusedDefaultChar)) 7225 { 7226 pos++; 7227 memcpy(out, buffer, outsize); 7228 out += outsize; 7229 continue; 7230 } 7231 } 7232 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) { 7233 PyErr_SetFromWindowsErr(0); 7234 goto error; 7235 } 7236 7237 rep = unicode_encode_call_errorhandler( 7238 errors, &errorHandler, encoding, reason, 7239 unicode, &exc, 7240 pos, pos + 1, &newpos); 7241 if (rep == NULL) 7242 goto error; 7243 pos = newpos; 7244 7245 if (PyBytes_Check(rep)) { 7246 outsize = PyBytes_GET_SIZE(rep); 7247 if (outsize != 1) { 7248 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes); 7249 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1); 7250 if (_PyBytes_Resize(outbytes, newoutsize) < 0) { 7251 Py_DECREF(rep); 7252 goto error; 7253 } 7254 out = PyBytes_AS_STRING(*outbytes) + offset; 7255 } 7256 memcpy(out, PyBytes_AS_STRING(rep), outsize); 7257 out += outsize; 7258 } 7259 else { 7260 Py_ssize_t i; 7261 enum PyUnicode_Kind kind; 7262 void *data; 7263 7264 if (PyUnicode_READY(rep) == -1) { 7265 Py_DECREF(rep); 7266 goto error; 7267 } 7268 7269 outsize = PyUnicode_GET_LENGTH(rep); 7270 if (outsize != 1) { 7271 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes); 7272 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1); 7273 if (_PyBytes_Resize(outbytes, newoutsize) < 0) { 7274 Py_DECREF(rep); 7275 goto error; 7276 } 7277 out = PyBytes_AS_STRING(*outbytes) + offset; 7278 } 7279 kind = PyUnicode_KIND(rep); 7280 data = PyUnicode_DATA(rep); 7281 for (i=0; i < outsize; i++) { 7282 Py_UCS4 ch = PyUnicode_READ(kind, data, i); 7283 if (ch > 127) { 7284 raise_encode_exception(&exc, 7285 encoding, 
unicode, 7286 pos, pos + 1, 7287 "unable to encode error handler result to ASCII"); 7288 Py_DECREF(rep); 7289 goto error; 7290 } 7291 *out = (unsigned char)ch; 7292 out++; 7293 } 7294 } 7295 Py_DECREF(rep); 7296 } 7297 /* write a NUL byte */ 7298 *out = 0; 7299 outsize = out - PyBytes_AS_STRING(*outbytes); 7300 assert(outsize <= PyBytes_GET_SIZE(*outbytes)); 7301 if (_PyBytes_Resize(outbytes, outsize) < 0) 7302 goto error; 7303 ret = 0; 7304 7305error: 7306 Py_XDECREF(encoding_obj); 7307 Py_XDECREF(errorHandler); 7308 Py_XDECREF(exc); 7309 return ret; 7310} 7311 7312static PyObject * 7313encode_code_page(int code_page, 7314 PyObject *unicode, 7315 const char *errors) 7316{ 7317 Py_ssize_t len; 7318 PyObject *outbytes = NULL; 7319 Py_ssize_t offset; 7320 int chunk_len, ret, done; 7321 7322 if (PyUnicode_READY(unicode) == -1) 7323 return NULL; 7324 len = PyUnicode_GET_LENGTH(unicode); 7325 7326 if (code_page < 0) { 7327 PyErr_SetString(PyExc_ValueError, "invalid code page number"); 7328 return NULL; 7329 } 7330 7331 if (len == 0) 7332 return PyBytes_FromStringAndSize(NULL, 0); 7333 7334 offset = 0; 7335 do 7336 { 7337#ifdef NEED_RETRY 7338 /* UTF-16 encoding may double the size, so use only INT_MAX/2 7339 chunks. 
*/ 7340 if (len > INT_MAX/2) { 7341 chunk_len = INT_MAX/2; 7342 done = 0; 7343 } 7344 else 7345#endif 7346 { 7347 chunk_len = (int)len; 7348 done = 1; 7349 } 7350 7351 ret = encode_code_page_strict(code_page, &outbytes, 7352 unicode, offset, chunk_len, 7353 errors); 7354 if (ret == -2) 7355 ret = encode_code_page_errors(code_page, &outbytes, 7356 unicode, offset, 7357 chunk_len, errors); 7358 if (ret < 0) { 7359 Py_XDECREF(outbytes); 7360 return NULL; 7361 } 7362 7363 offset += chunk_len; 7364 len -= chunk_len; 7365 } while (!done); 7366 7367 return outbytes; 7368} 7369 7370PyObject * 7371PyUnicode_EncodeMBCS(const Py_UNICODE *p, 7372 Py_ssize_t size, 7373 const char *errors) 7374{ 7375 PyObject *unicode, *res; 7376 unicode = PyUnicode_FromUnicode(p, size); 7377 if (unicode == NULL) 7378 return NULL; 7379 res = encode_code_page(CP_ACP, unicode, errors); 7380 Py_DECREF(unicode); 7381 return res; 7382} 7383 7384PyObject * 7385PyUnicode_EncodeCodePage(int code_page, 7386 PyObject *unicode, 7387 const char *errors) 7388{ 7389 return encode_code_page(code_page, unicode, errors); 7390} 7391 7392PyObject * 7393PyUnicode_AsMBCSString(PyObject *unicode) 7394{ 7395 if (!PyUnicode_Check(unicode)) { 7396 PyErr_BadArgument(); 7397 return NULL; 7398 } 7399 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL); 7400} 7401 7402#undef NEED_RETRY 7403 7404#endif /* HAVE_MBCS */ 7405 7406/* --- Character Mapping Codec -------------------------------------------- */ 7407 7408PyObject * 7409PyUnicode_DecodeCharmap(const char *s, 7410 Py_ssize_t size, 7411 PyObject *mapping, 7412 const char *errors) 7413{ 7414 const char *starts = s; 7415 Py_ssize_t startinpos; 7416 Py_ssize_t endinpos; 7417 Py_ssize_t outpos; 7418 const char *e; 7419 PyObject *v; 7420 Py_ssize_t extrachars = 0; 7421 PyObject *errorHandler = NULL; 7422 PyObject *exc = NULL; 7423 7424 /* Default to Latin-1 */ 7425 if (mapping == NULL) 7426 return PyUnicode_DecodeLatin1(s, size, errors); 7427 7428 v = PyUnicode_New(size, 
127); 7429 if (v == NULL) 7430 goto onError; 7431 if (size == 0) 7432 return v; 7433 outpos = 0; 7434 e = s + size; 7435 if (PyUnicode_CheckExact(mapping)) { 7436 Py_ssize_t maplen; 7437 enum PyUnicode_Kind mapkind; 7438 void *mapdata; 7439 Py_UCS4 x; 7440 7441 if (PyUnicode_READY(mapping) == -1) 7442 return NULL; 7443 7444 maplen = PyUnicode_GET_LENGTH(mapping); 7445 mapdata = PyUnicode_DATA(mapping); 7446 mapkind = PyUnicode_KIND(mapping); 7447 while (s < e) { 7448 unsigned char ch; 7449 if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) { 7450 enum PyUnicode_Kind outkind = PyUnicode_KIND(v); 7451 if (outkind == PyUnicode_1BYTE_KIND) { 7452 void *outdata = PyUnicode_DATA(v); 7453 Py_UCS4 maxchar = PyUnicode_MAX_CHAR_VALUE(v); 7454 while (s < e) { 7455 unsigned char ch = *s; 7456 x = PyUnicode_READ(PyUnicode_2BYTE_KIND, mapdata, ch); 7457 if (x > maxchar) 7458 goto Error; 7459 PyUnicode_WRITE(PyUnicode_1BYTE_KIND, outdata, outpos++, x); 7460 ++s; 7461 } 7462 break; 7463 } 7464 else if (outkind == PyUnicode_2BYTE_KIND) { 7465 void *outdata = PyUnicode_DATA(v); 7466 while (s < e) { 7467 unsigned char ch = *s; 7468 x = PyUnicode_READ(PyUnicode_2BYTE_KIND, mapdata, ch); 7469 if (x == 0xFFFE) 7470 goto Error; 7471 PyUnicode_WRITE(PyUnicode_2BYTE_KIND, outdata, outpos++, x); 7472 ++s; 7473 } 7474 break; 7475 } 7476 } 7477 ch = *s; 7478 7479 if (ch < maplen) 7480 x = PyUnicode_READ(mapkind, mapdata, ch); 7481 else 7482 x = 0xfffe; /* invalid value */ 7483Error: 7484 if (x == 0xfffe) 7485 { 7486 /* undefined mapping */ 7487 startinpos = s-starts; 7488 endinpos = startinpos+1; 7489 if (unicode_decode_call_errorhandler( 7490 errors, &errorHandler, 7491 "charmap", "character maps to <undefined>", 7492 &starts, &e, &startinpos, &endinpos, &exc, &s, 7493 &v, &outpos)) { 7494 goto onError; 7495 } 7496 continue; 7497 } 7498 7499 if (unicode_putchar(&v, &outpos, x) < 0) 7500 goto onError; 7501 ++s; 7502 } 7503 } 7504 else { 7505 while (s < e) { 7506 unsigned char ch = *s; 7507 
PyObject *w, *x; 7508 7509 /* Get mapping (char ordinal -> integer, Unicode char or None) */ 7510 w = PyLong_FromLong((long)ch); 7511 if (w == NULL) 7512 goto onError; 7513 x = PyObject_GetItem(mapping, w); 7514 Py_DECREF(w); 7515 if (x == NULL) { 7516 if (PyErr_ExceptionMatches(PyExc_LookupError)) { 7517 /* No mapping found means: mapping is undefined. */ 7518 PyErr_Clear(); 7519 x = Py_None; 7520 Py_INCREF(x); 7521 } else 7522 goto onError; 7523 } 7524 7525 /* Apply mapping */ 7526 if (PyLong_Check(x)) { 7527 long value = PyLong_AS_LONG(x); 7528 if (value < 0 || value > MAX_UNICODE) { 7529 PyErr_Format(PyExc_TypeError, 7530 "character mapping must be in range(0x%lx)", 7531 (unsigned long)MAX_UNICODE + 1); 7532 Py_DECREF(x); 7533 goto onError; 7534 } 7535 if (unicode_putchar(&v, &outpos, value) < 0) 7536 goto onError; 7537 } 7538 else if (x == Py_None) { 7539 /* undefined mapping */ 7540 startinpos = s-starts; 7541 endinpos = startinpos+1; 7542 if (unicode_decode_call_errorhandler( 7543 errors, &errorHandler, 7544 "charmap", "character maps to <undefined>", 7545 &starts, &e, &startinpos, &endinpos, &exc, &s, 7546 &v, &outpos)) { 7547 Py_DECREF(x); 7548 goto onError; 7549 } 7550 Py_DECREF(x); 7551 continue; 7552 } 7553 else if (PyUnicode_Check(x)) { 7554 Py_ssize_t targetsize; 7555 7556 if (PyUnicode_READY(x) == -1) 7557 goto onError; 7558 targetsize = PyUnicode_GET_LENGTH(x); 7559 7560 if (targetsize == 1) { 7561 /* 1-1 mapping */ 7562 if (unicode_putchar(&v, &outpos, 7563 PyUnicode_READ_CHAR(x, 0)) < 0) 7564 goto onError; 7565 } 7566 else if (targetsize > 1) { 7567 /* 1-n mapping */ 7568 if (targetsize > extrachars) { 7569 /* resize first */ 7570 Py_ssize_t needed = (targetsize - extrachars) + \ 7571 (targetsize << 2); 7572 extrachars += needed; 7573 /* XXX overflow detection missing */ 7574 if (unicode_resize(&v, 7575 PyUnicode_GET_LENGTH(v) + needed) < 0) 7576 { 7577 Py_DECREF(x); 7578 goto onError; 7579 } 7580 } 7581 if (unicode_widen(&v, outpos, 
PyUnicode_MAX_CHAR_VALUE(x)) < 0) 7582 goto onError; 7583 PyUnicode_CopyCharacters(v, outpos, x, 0, targetsize); 7584 outpos += targetsize; 7585 extrachars -= targetsize; 7586 } 7587 /* 1-0 mapping: skip the character */ 7588 } 7589 else { 7590 /* wrong return value */ 7591 PyErr_SetString(PyExc_TypeError, 7592 "character mapping must return integer, None or str"); 7593 Py_DECREF(x); 7594 goto onError; 7595 } 7596 Py_DECREF(x); 7597 ++s; 7598 } 7599 } 7600 if (unicode_resize(&v, outpos) < 0) 7601 goto onError; 7602 Py_XDECREF(errorHandler); 7603 Py_XDECREF(exc); 7604 return unicode_result(v); 7605 7606 onError: 7607 Py_XDECREF(errorHandler); 7608 Py_XDECREF(exc); 7609 Py_XDECREF(v); 7610 return NULL; 7611} 7612 7613/* Charmap encoding: the lookup table */ 7614 7615struct encoding_map { 7616 PyObject_HEAD 7617 unsigned char level1[32]; 7618 int count2, count3; 7619 unsigned char level23[1]; 7620}; 7621 7622static PyObject* 7623encoding_map_size(PyObject *obj, PyObject* args) 7624{ 7625 struct encoding_map *map = (struct encoding_map*)obj; 7626 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 + 7627 128*map->count3); 7628} 7629 7630static PyMethodDef encoding_map_methods[] = { 7631 {"size", encoding_map_size, METH_NOARGS, 7632 PyDoc_STR("Return the size (in bytes) of this object") }, 7633 { 0 } 7634}; 7635 7636static void 7637encoding_map_dealloc(PyObject* o) 7638{ 7639 PyObject_FREE(o); 7640} 7641 7642static PyTypeObject EncodingMapType = { 7643 PyVarObject_HEAD_INIT(NULL, 0) 7644 "EncodingMap", /*tp_name*/ 7645 sizeof(struct encoding_map), /*tp_basicsize*/ 7646 0, /*tp_itemsize*/ 7647 /* methods */ 7648 encoding_map_dealloc, /*tp_dealloc*/ 7649 0, /*tp_print*/ 7650 0, /*tp_getattr*/ 7651 0, /*tp_setattr*/ 7652 0, /*tp_reserved*/ 7653 0, /*tp_repr*/ 7654 0, /*tp_as_number*/ 7655 0, /*tp_as_sequence*/ 7656 0, /*tp_as_mapping*/ 7657 0, /*tp_hash*/ 7658 0, /*tp_call*/ 7659 0, /*tp_str*/ 7660 0, /*tp_getattro*/ 7661 0, /*tp_setattro*/ 7662 0, /*tp_as_buffer*/ 
7663 Py_TPFLAGS_DEFAULT, /*tp_flags*/ 7664 0, /*tp_doc*/ 7665 0, /*tp_traverse*/ 7666 0, /*tp_clear*/ 7667 0, /*tp_richcompare*/ 7668 0, /*tp_weaklistoffset*/ 7669 0, /*tp_iter*/ 7670 0, /*tp_iternext*/ 7671 encoding_map_methods, /*tp_methods*/ 7672 0, /*tp_members*/ 7673 0, /*tp_getset*/ 7674 0, /*tp_base*/ 7675 0, /*tp_dict*/ 7676 0, /*tp_descr_get*/ 7677 0, /*tp_descr_set*/ 7678 0, /*tp_dictoffset*/ 7679 0, /*tp_init*/ 7680 0, /*tp_alloc*/ 7681 0, /*tp_new*/ 7682 0, /*tp_free*/ 7683 0, /*tp_is_gc*/ 7684}; 7685 7686PyObject* 7687PyUnicode_BuildEncodingMap(PyObject* string) 7688{ 7689 PyObject *result; 7690 struct encoding_map *mresult; 7691 int i; 7692 int need_dict = 0; 7693 unsigned char level1[32]; 7694 unsigned char level2[512]; 7695 unsigned char *mlevel1, *mlevel2, *mlevel3; 7696 int count2 = 0, count3 = 0; 7697 int kind; 7698 void *data; 7699 Py_ssize_t length; 7700 Py_UCS4 ch; 7701 7702 if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) { 7703 PyErr_BadArgument(); 7704 return NULL; 7705 } 7706 kind = PyUnicode_KIND(string); 7707 data = PyUnicode_DATA(string); 7708 length = PyUnicode_GET_LENGTH(string); 7709 length = Py_MIN(length, 256); 7710 memset(level1, 0xFF, sizeof level1); 7711 memset(level2, 0xFF, sizeof level2); 7712 7713 /* If there isn't a one-to-one mapping of NULL to \0, 7714 or if there are non-BMP characters, we need to use 7715 a mapping dictionary. 
*/ 7716 if (PyUnicode_READ(kind, data, 0) != 0) 7717 need_dict = 1; 7718 for (i = 1; i < length; i++) { 7719 int l1, l2; 7720 ch = PyUnicode_READ(kind, data, i); 7721 if (ch == 0 || ch > 0xFFFF) { 7722 need_dict = 1; 7723 break; 7724 } 7725 if (ch == 0xFFFE) 7726 /* unmapped character */ 7727 continue; 7728 l1 = ch >> 11; 7729 l2 = ch >> 7; 7730 if (level1[l1] == 0xFF) 7731 level1[l1] = count2++; 7732 if (level2[l2] == 0xFF) 7733 level2[l2] = count3++; 7734 } 7735 7736 if (count2 >= 0xFF || count3 >= 0xFF) 7737 need_dict = 1; 7738 7739 if (need_dict) { 7740 PyObject *result = PyDict_New(); 7741 PyObject *key, *value; 7742 if (!result) 7743 return NULL; 7744 for (i = 0; i < length; i++) { 7745 key = PyLong_FromLong(PyUnicode_READ(kind, data, i)); 7746 value = PyLong_FromLong(i); 7747 if (!key || !value) 7748 goto failed1; 7749 if (PyDict_SetItem(result, key, value) == -1) 7750 goto failed1; 7751 Py_DECREF(key); 7752 Py_DECREF(value); 7753 } 7754 return result; 7755 failed1: 7756 Py_XDECREF(key); 7757 Py_XDECREF(value); 7758 Py_DECREF(result); 7759 return NULL; 7760 } 7761 7762 /* Create a three-level trie */ 7763 result = PyObject_MALLOC(sizeof(struct encoding_map) + 7764 16*count2 + 128*count3 - 1); 7765 if (!result) 7766 return PyErr_NoMemory(); 7767 PyObject_Init(result, &EncodingMapType); 7768 mresult = (struct encoding_map*)result; 7769 mresult->count2 = count2; 7770 mresult->count3 = count3; 7771 mlevel1 = mresult->level1; 7772 mlevel2 = mresult->level23; 7773 mlevel3 = mresult->level23 + 16*count2; 7774 memcpy(mlevel1, level1, 32); 7775 memset(mlevel2, 0xFF, 16*count2); 7776 memset(mlevel3, 0, 128*count3); 7777 count3 = 0; 7778 for (i = 1; i < length; i++) { 7779 int o1, o2, o3, i2, i3; 7780 Py_UCS4 ch = PyUnicode_READ(kind, data, i); 7781 if (ch == 0xFFFE) 7782 /* unmapped character */ 7783 continue; 7784 o1 = ch>>11; 7785 o2 = (ch>>7) & 0xF; 7786 i2 = 16*mlevel1[o1] + o2; 7787 if (mlevel2[i2] == 0xFF) 7788 mlevel2[i2] = count3++; 7789 o3 = ch & 0x7F; 7790 
i3 = 128*mlevel2[i2] + o3; 7791 mlevel3[i3] = i; 7792 } 7793 return result; 7794} 7795 7796static int 7797encoding_map_lookup(Py_UCS4 c, PyObject *mapping) 7798{ 7799 struct encoding_map *map = (struct encoding_map*)mapping; 7800 int l1 = c>>11; 7801 int l2 = (c>>7) & 0xF; 7802 int l3 = c & 0x7F; 7803 int i; 7804 7805 if (c > 0xFFFF) 7806 return -1; 7807 if (c == 0) 7808 return 0; 7809 /* level 1*/ 7810 i = map->level1[l1]; 7811 if (i == 0xFF) { 7812 return -1; 7813 } 7814 /* level 2*/ 7815 i = map->level23[16*i+l2]; 7816 if (i == 0xFF) { 7817 return -1; 7818 } 7819 /* level 3 */ 7820 i = map->level23[16*map->count2 + 128*i + l3]; 7821 if (i == 0) { 7822 return -1; 7823 } 7824 return i; 7825} 7826 7827/* Lookup the character ch in the mapping. If the character 7828 can't be found, Py_None is returned (or NULL, if another 7829 error occurred). */ 7830static PyObject * 7831charmapencode_lookup(Py_UCS4 c, PyObject *mapping) 7832{ 7833 PyObject *w = PyLong_FromLong((long)c); 7834 PyObject *x; 7835 7836 if (w == NULL) 7837 return NULL; 7838 x = PyObject_GetItem(mapping, w); 7839 Py_DECREF(w); 7840 if (x == NULL) { 7841 if (PyErr_ExceptionMatches(PyExc_LookupError)) { 7842 /* No mapping found means: mapping is undefined. 
*/ 7843 PyErr_Clear(); 7844 x = Py_None; 7845 Py_INCREF(x); 7846 return x; 7847 } else 7848 return NULL; 7849 } 7850 else if (x == Py_None) 7851 return x; 7852 else if (PyLong_Check(x)) { 7853 long value = PyLong_AS_LONG(x); 7854 if (value < 0 || value > 255) { 7855 PyErr_SetString(PyExc_TypeError, 7856 "character mapping must be in range(256)"); 7857 Py_DECREF(x); 7858 return NULL; 7859 } 7860 return x; 7861 } 7862 else if (PyBytes_Check(x)) 7863 return x; 7864 else { 7865 /* wrong return value */ 7866 PyErr_Format(PyExc_TypeError, 7867 "character mapping must return integer, bytes or None, not %.400s", 7868 x->ob_type->tp_name); 7869 Py_DECREF(x); 7870 return NULL; 7871 } 7872} 7873 7874static int 7875charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize) 7876{ 7877 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj); 7878 /* exponentially overallocate to minimize reallocations */ 7879 if (requiredsize < 2*outsize) 7880 requiredsize = 2*outsize; 7881 if (_PyBytes_Resize(outobj, requiredsize)) 7882 return -1; 7883 return 0; 7884} 7885 7886typedef enum charmapencode_result { 7887 enc_SUCCESS, enc_FAILED, enc_EXCEPTION 7888} charmapencode_result; 7889/* lookup the character, put the result in the output string and adjust 7890 various state variables. Resize the output bytes object if not enough 7891 space is available. Return a new reference to the object that 7892 was put in the output buffer, or Py_None, if the mapping was undefined 7893 (in which case no character was written) or NULL, if a 7894 reallocation error occurred. 
The caller must decref the result */ 7895static charmapencode_result 7896charmapencode_output(Py_UCS4 c, PyObject *mapping, 7897 PyObject **outobj, Py_ssize_t *outpos) 7898{ 7899 PyObject *rep; 7900 char *outstart; 7901 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj); 7902 7903 if (Py_TYPE(mapping) == &EncodingMapType) { 7904 int res = encoding_map_lookup(c, mapping); 7905 Py_ssize_t requiredsize = *outpos+1; 7906 if (res == -1) 7907 return enc_FAILED; 7908 if (outsize<requiredsize) 7909 if (charmapencode_resize(outobj, outpos, requiredsize)) 7910 return enc_EXCEPTION; 7911 outstart = PyBytes_AS_STRING(*outobj); 7912 outstart[(*outpos)++] = (char)res; 7913 return enc_SUCCESS; 7914 } 7915 7916 rep = charmapencode_lookup(c, mapping); 7917 if (rep==NULL) 7918 return enc_EXCEPTION; 7919 else if (rep==Py_None) { 7920 Py_DECREF(rep); 7921 return enc_FAILED; 7922 } else { 7923 if (PyLong_Check(rep)) { 7924 Py_ssize_t requiredsize = *outpos+1; 7925 if (outsize<requiredsize) 7926 if (charmapencode_resize(outobj, outpos, requiredsize)) { 7927 Py_DECREF(rep); 7928 return enc_EXCEPTION; 7929 } 7930 outstart = PyBytes_AS_STRING(*outobj); 7931 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep); 7932 } 7933 else { 7934 const char *repchars = PyBytes_AS_STRING(rep); 7935 Py_ssize_t repsize = PyBytes_GET_SIZE(rep); 7936 Py_ssize_t requiredsize = *outpos+repsize; 7937 if (outsize<requiredsize) 7938 if (charmapencode_resize(outobj, outpos, requiredsize)) { 7939 Py_DECREF(rep); 7940 return enc_EXCEPTION; 7941 } 7942 outstart = PyBytes_AS_STRING(*outobj); 7943 memcpy(outstart + *outpos, repchars, repsize); 7944 *outpos += repsize; 7945 } 7946 } 7947 Py_DECREF(rep); 7948 return enc_SUCCESS; 7949} 7950 7951/* handle an error in PyUnicode_EncodeCharmap 7952 Return 0 on success, -1 on error */ 7953static int 7954charmap_encoding_error( 7955 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping, 7956 PyObject **exceptionObject, 7957 int *known_errorHandler, PyObject **errorHandler, const 
char *errors, 7958 PyObject **res, Py_ssize_t *respos) 7959{ 7960 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */ 7961 Py_ssize_t size, repsize; 7962 Py_ssize_t newpos; 7963 enum PyUnicode_Kind kind; 7964 void *data; 7965 Py_ssize_t index; 7966 /* startpos for collecting unencodable chars */ 7967 Py_ssize_t collstartpos = *inpos; 7968 Py_ssize_t collendpos = *inpos+1; 7969 Py_ssize_t collpos; 7970 char *encoding = "charmap"; 7971 char *reason = "character maps to <undefined>"; 7972 charmapencode_result x; 7973 Py_UCS4 ch; 7974 int val; 7975 7976 if (PyUnicode_READY(unicode) == -1) 7977 return -1; 7978 size = PyUnicode_GET_LENGTH(unicode); 7979 /* find all unencodable characters */ 7980 while (collendpos < size) { 7981 PyObject *rep; 7982 if (Py_TYPE(mapping) == &EncodingMapType) { 7983 ch = PyUnicode_READ_CHAR(unicode, collendpos); 7984 val = encoding_map_lookup(ch, mapping); 7985 if (val != -1) 7986 break; 7987 ++collendpos; 7988 continue; 7989 } 7990 7991 ch = PyUnicode_READ_CHAR(unicode, collendpos); 7992 rep = charmapencode_lookup(ch, mapping); 7993 if (rep==NULL) 7994 return -1; 7995 else if (rep!=Py_None) { 7996 Py_DECREF(rep); 7997 break; 7998 } 7999 Py_DECREF(rep); 8000 ++collendpos; 8001 } 8002 /* cache callback name lookup 8003 * (if not done yet, i.e. 
it's the first error) */ 8004 if (*known_errorHandler==-1) { 8005 if ((errors==NULL) || (!strcmp(errors, "strict"))) 8006 *known_errorHandler = 1; 8007 else if (!strcmp(errors, "replace")) 8008 *known_errorHandler = 2; 8009 else if (!strcmp(errors, "ignore")) 8010 *known_errorHandler = 3; 8011 else if (!strcmp(errors, "xmlcharrefreplace")) 8012 *known_errorHandler = 4; 8013 else 8014 *known_errorHandler = 0; 8015 } 8016 switch (*known_errorHandler) { 8017 case 1: /* strict */ 8018 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason); 8019 return -1; 8020 case 2: /* replace */ 8021 for (collpos = collstartpos; collpos<collendpos; ++collpos) { 8022 x = charmapencode_output('?', mapping, res, respos); 8023 if (x==enc_EXCEPTION) { 8024 return -1; 8025 } 8026 else if (x==enc_FAILED) { 8027 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason); 8028 return -1; 8029 } 8030 } 8031 /* fall through */ 8032 case 3: /* ignore */ 8033 *inpos = collendpos; 8034 break; 8035 case 4: /* xmlcharrefreplace */ 8036 /* generate replacement (temporarily (mis)uses p) */ 8037 for (collpos = collstartpos; collpos < collendpos; ++collpos) { 8038 char buffer[2+29+1+1]; 8039 char *cp; 8040 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos)); 8041 for (cp = buffer; *cp; ++cp) { 8042 x = charmapencode_output(*cp, mapping, res, respos); 8043 if (x==enc_EXCEPTION) 8044 return -1; 8045 else if (x==enc_FAILED) { 8046 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason); 8047 return -1; 8048 } 8049 } 8050 } 8051 *inpos = collendpos; 8052 break; 8053 default: 8054 repunicode = unicode_encode_call_errorhandler(errors, errorHandler, 8055 encoding, reason, unicode, exceptionObject, 8056 collstartpos, collendpos, &newpos); 8057 if (repunicode == NULL) 8058 return -1; 8059 if (PyBytes_Check(repunicode)) { 8060 /* Directly copy bytes result to output. 
*/ 8061 Py_ssize_t outsize = PyBytes_Size(*res); 8062 Py_ssize_t requiredsize; 8063 repsize = PyBytes_Size(repunicode); 8064 requiredsize = *respos + repsize; 8065 if (requiredsize > outsize) 8066 /* Make room for all additional bytes. */ 8067 if (charmapencode_resize(res, respos, requiredsize)) { 8068 Py_DECREF(repunicode); 8069 return -1; 8070 } 8071 memcpy(PyBytes_AsString(*res) + *respos, 8072 PyBytes_AsString(repunicode), repsize); 8073 *respos += repsize; 8074 *inpos = newpos; 8075 Py_DECREF(repunicode); 8076 break; 8077 } 8078 /* generate replacement */ 8079 if (PyUnicode_READY(repunicode) == -1) { 8080 Py_DECREF(repunicode); 8081 return -1; 8082 } 8083 repsize = PyUnicode_GET_LENGTH(repunicode); 8084 data = PyUnicode_DATA(repunicode); 8085 kind = PyUnicode_KIND(repunicode); 8086 for (index = 0; index < repsize; index++) { 8087 Py_UCS4 repch = PyUnicode_READ(kind, data, index); 8088 x = charmapencode_output(repch, mapping, res, respos); 8089 if (x==enc_EXCEPTION) { 8090 Py_DECREF(repunicode); 8091 return -1; 8092 } 8093 else if (x==enc_FAILED) { 8094 Py_DECREF(repunicode); 8095 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason); 8096 return -1; 8097 } 8098 } 8099 *inpos = newpos; 8100 Py_DECREF(repunicode); 8101 } 8102 return 0; 8103} 8104 8105PyObject * 8106_PyUnicode_EncodeCharmap(PyObject *unicode, 8107 PyObject *mapping, 8108 const char *errors) 8109{ 8110 /* output object */ 8111 PyObject *res = NULL; 8112 /* current input position */ 8113 Py_ssize_t inpos = 0; 8114 Py_ssize_t size; 8115 /* current output position */ 8116 Py_ssize_t respos = 0; 8117 PyObject *errorHandler = NULL; 8118 PyObject *exc = NULL; 8119 /* the following variable is used for caching string comparisons 8120 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 8121 * 3=ignore, 4=xmlcharrefreplace */ 8122 int known_errorHandler = -1; 8123 8124 if (PyUnicode_READY(unicode) == -1) 8125 return NULL; 8126 size = PyUnicode_GET_LENGTH(unicode); 
8127 8128 /* Default to Latin-1 */ 8129 if (mapping == NULL) 8130 return unicode_encode_ucs1(unicode, errors, 256); 8131 8132 /* allocate enough for a simple encoding without 8133 replacements, if we need more, we'll resize */ 8134 res = PyBytes_FromStringAndSize(NULL, size); 8135 if (res == NULL) 8136 goto onError; 8137 if (size == 0) 8138 return res; 8139 8140 while (inpos<size) { 8141 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, inpos); 8142 /* try to encode it */ 8143 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos); 8144 if (x==enc_EXCEPTION) /* error */ 8145 goto onError; 8146 if (x==enc_FAILED) { /* unencodable character */ 8147 if (charmap_encoding_error(unicode, &inpos, mapping, 8148 &exc, 8149 &known_errorHandler, &errorHandler, errors, 8150 &res, &respos)) { 8151 goto onError; 8152 } 8153 } 8154 else 8155 /* done with this character => adjust input position */ 8156 ++inpos; 8157 } 8158 8159 /* Resize if we allocated to much */ 8160 if (respos<PyBytes_GET_SIZE(res)) 8161 if (_PyBytes_Resize(&res, respos) < 0) 8162 goto onError; 8163 8164 Py_XDECREF(exc); 8165 Py_XDECREF(errorHandler); 8166 return res; 8167 8168 onError: 8169 Py_XDECREF(res); 8170 Py_XDECREF(exc); 8171 Py_XDECREF(errorHandler); 8172 return NULL; 8173} 8174 8175/* Deprecated */ 8176PyObject * 8177PyUnicode_EncodeCharmap(const Py_UNICODE *p, 8178 Py_ssize_t size, 8179 PyObject *mapping, 8180 const char *errors) 8181{ 8182 PyObject *result; 8183 PyObject *unicode = PyUnicode_FromUnicode(p, size); 8184 if (unicode == NULL) 8185 return NULL; 8186 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors); 8187 Py_DECREF(unicode); 8188 return result; 8189} 8190 8191PyObject * 8192PyUnicode_AsCharmapString(PyObject *unicode, 8193 PyObject *mapping) 8194{ 8195 if (!PyUnicode_Check(unicode) || mapping == NULL) { 8196 PyErr_BadArgument(); 8197 return NULL; 8198 } 8199 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL); 8200} 8201 8202/* create or adjust a 
UnicodeTranslateError */ 8203static void 8204make_translate_exception(PyObject **exceptionObject, 8205 PyObject *unicode, 8206 Py_ssize_t startpos, Py_ssize_t endpos, 8207 const char *reason) 8208{ 8209 if (*exceptionObject == NULL) { 8210 *exceptionObject = _PyUnicodeTranslateError_Create( 8211 unicode, startpos, endpos, reason); 8212 } 8213 else { 8214 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos)) 8215 goto onError; 8216 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos)) 8217 goto onError; 8218 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason)) 8219 goto onError; 8220 return; 8221 onError: 8222 Py_DECREF(*exceptionObject); 8223 *exceptionObject = NULL; 8224 } 8225} 8226 8227/* raises a UnicodeTranslateError */ 8228static void 8229raise_translate_exception(PyObject **exceptionObject, 8230 PyObject *unicode, 8231 Py_ssize_t startpos, Py_ssize_t endpos, 8232 const char *reason) 8233{ 8234 make_translate_exception(exceptionObject, 8235 unicode, startpos, endpos, reason); 8236 if (*exceptionObject != NULL) 8237 PyCodec_StrictErrors(*exceptionObject); 8238} 8239 8240/* error handling callback helper: 8241 build arguments, call the callback and check the arguments, 8242 put the result into newpos and return the replacement string, which 8243 has to be freed by the caller */ 8244static PyObject * 8245unicode_translate_call_errorhandler(const char *errors, 8246 PyObject **errorHandler, 8247 const char *reason, 8248 PyObject *unicode, PyObject **exceptionObject, 8249 Py_ssize_t startpos, Py_ssize_t endpos, 8250 Py_ssize_t *newpos) 8251{ 8252 static char *argparse = "O!n;translating error handler must return (str, int) tuple"; 8253 8254 Py_ssize_t i_newpos; 8255 PyObject *restuple; 8256 PyObject *resunicode; 8257 8258 if (*errorHandler == NULL) { 8259 *errorHandler = PyCodec_LookupError(errors); 8260 if (*errorHandler == NULL) 8261 return NULL; 8262 } 8263 8264 make_translate_exception(exceptionObject, 8265 unicode, startpos, 
endpos, reason); 8266 if (*exceptionObject == NULL) 8267 return NULL; 8268 8269 restuple = PyObject_CallFunctionObjArgs( 8270 *errorHandler, *exceptionObject, NULL); 8271 if (restuple == NULL) 8272 return NULL; 8273 if (!PyTuple_Check(restuple)) { 8274 PyErr_SetString(PyExc_TypeError, &argparse[4]); 8275 Py_DECREF(restuple); 8276 return NULL; 8277 } 8278 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, 8279 &resunicode, &i_newpos)) { 8280 Py_DECREF(restuple); 8281 return NULL; 8282 } 8283 if (i_newpos<0) 8284 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos; 8285 else 8286 *newpos = i_newpos; 8287 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) { 8288 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos); 8289 Py_DECREF(restuple); 8290 return NULL; 8291 } 8292 Py_INCREF(resunicode); 8293 Py_DECREF(restuple); 8294 return resunicode; 8295} 8296 8297/* Lookup the character ch in the mapping and put the result in result, 8298 which must be decrefed by the caller. 8299 Return 0 on success, -1 on error */ 8300static int 8301charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result) 8302{ 8303 PyObject *w = PyLong_FromLong((long)c); 8304 PyObject *x; 8305 8306 if (w == NULL) 8307 return -1; 8308 x = PyObject_GetItem(mapping, w); 8309 Py_DECREF(w); 8310 if (x == NULL) { 8311 if (PyErr_ExceptionMatches(PyExc_LookupError)) { 8312 /* No mapping found means: use 1:1 mapping. 
*/ 8313 PyErr_Clear(); 8314 *result = NULL; 8315 return 0; 8316 } else 8317 return -1; 8318 } 8319 else if (x == Py_None) { 8320 *result = x; 8321 return 0; 8322 } 8323 else if (PyLong_Check(x)) { 8324 long value = PyLong_AS_LONG(x); 8325 long max = PyUnicode_GetMax(); 8326 if (value < 0 || value > max) { 8327 PyErr_Format(PyExc_TypeError, 8328 "character mapping must be in range(0x%x)", max+1); 8329 Py_DECREF(x); 8330 return -1; 8331 } 8332 *result = x; 8333 return 0; 8334 } 8335 else if (PyUnicode_Check(x)) { 8336 *result = x; 8337 return 0; 8338 } 8339 else { 8340 /* wrong return value */ 8341 PyErr_SetString(PyExc_TypeError, 8342 "character mapping must return integer, None or str"); 8343 Py_DECREF(x); 8344 return -1; 8345 } 8346} 8347/* ensure that *outobj is at least requiredsize characters long, 8348 if not reallocate and adjust various state variables. 8349 Return 0 on success, -1 on error */ 8350static int 8351charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize, 8352 Py_ssize_t requiredsize) 8353{ 8354 Py_ssize_t oldsize = *psize; 8355 Py_UCS4 *new_outobj; 8356 if (requiredsize > oldsize) { 8357 /* exponentially overallocate to minimize reallocations */ 8358 if (requiredsize < 2 * oldsize) 8359 requiredsize = 2 * oldsize; 8360 new_outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4)); 8361 if (new_outobj == 0) 8362 return -1; 8363 *outobj = new_outobj; 8364 *psize = requiredsize; 8365 } 8366 return 0; 8367} 8368/* lookup the character, put the result in the output string and adjust 8369 various state variables. Return a new reference to the object that 8370 was put in the output buffer in *result, or Py_None, if the mapping was 8371 undefined (in which case no character was written). 8372 The called must decref result. 8373 Return 0 on success, -1 on error. 
*/ 8374static int 8375charmaptranslate_output(PyObject *input, Py_ssize_t ipos, 8376 PyObject *mapping, Py_UCS4 **output, 8377 Py_ssize_t *osize, Py_ssize_t *opos, 8378 PyObject **res) 8379{ 8380 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos); 8381 if (charmaptranslate_lookup(curinp, mapping, res)) 8382 return -1; 8383 if (*res==NULL) { 8384 /* not found => default to 1:1 mapping */ 8385 (*output)[(*opos)++] = curinp; 8386 } 8387 else if (*res==Py_None) 8388 ; 8389 else if (PyLong_Check(*res)) { 8390 /* no overflow check, because we know that the space is enough */ 8391 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res); 8392 } 8393 else if (PyUnicode_Check(*res)) { 8394 Py_ssize_t repsize; 8395 if (PyUnicode_READY(*res) == -1) 8396 return -1; 8397 repsize = PyUnicode_GET_LENGTH(*res); 8398 if (repsize==1) { 8399 /* no overflow check, because we know that the space is enough */ 8400 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0); 8401 } 8402 else if (repsize!=0) { 8403 /* more than one character */ 8404 Py_ssize_t requiredsize = *opos + 8405 (PyUnicode_GET_LENGTH(input) - ipos) + 8406 repsize - 1; 8407 Py_ssize_t i; 8408 if (charmaptranslate_makespace(output, osize, requiredsize)) 8409 return -1; 8410 for(i = 0; i < repsize; i++) 8411 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i); 8412 } 8413 } 8414 else 8415 return -1; 8416 return 0; 8417} 8418 8419PyObject * 8420_PyUnicode_TranslateCharmap(PyObject *input, 8421 PyObject *mapping, 8422 const char *errors) 8423{ 8424 /* input object */ 8425 char *idata; 8426 Py_ssize_t size, i; 8427 int kind; 8428 /* output buffer */ 8429 Py_UCS4 *output = NULL; 8430 Py_ssize_t osize; 8431 PyObject *res; 8432 /* current output position */ 8433 Py_ssize_t opos; 8434 char *reason = "character maps to <undefined>"; 8435 PyObject *errorHandler = NULL; 8436 PyObject *exc = NULL; 8437 /* the following variable is used for caching string comparisons 8438 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 8439 * 
3=ignore, 4=xmlcharrefreplace */ 8440 int known_errorHandler = -1; 8441 8442 if (mapping == NULL) { 8443 PyErr_BadArgument(); 8444 return NULL; 8445 } 8446 8447 if (PyUnicode_READY(input) == -1) 8448 return NULL; 8449 idata = (char*)PyUnicode_DATA(input); 8450 kind = PyUnicode_KIND(input); 8451 size = PyUnicode_GET_LENGTH(input); 8452 i = 0; 8453 8454 if (size == 0) { 8455 Py_INCREF(input); 8456 return input; 8457 } 8458 8459 /* allocate enough for a simple 1:1 translation without 8460 replacements, if we need more, we'll resize */ 8461 osize = size; 8462 output = PyMem_Malloc(osize * sizeof(Py_UCS4)); 8463 opos = 0; 8464 if (output == NULL) { 8465 PyErr_NoMemory(); 8466 goto onError; 8467 } 8468 8469 while (i<size) { 8470 /* try to encode it */ 8471 PyObject *x = NULL; 8472 if (charmaptranslate_output(input, i, mapping, 8473 &output, &osize, &opos, &x)) { 8474 Py_XDECREF(x); 8475 goto onError; 8476 } 8477 Py_XDECREF(x); 8478 if (x!=Py_None) /* it worked => adjust input pointer */ 8479 ++i; 8480 else { /* untranslatable character */ 8481 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */ 8482 Py_ssize_t repsize; 8483 Py_ssize_t newpos; 8484 Py_ssize_t uni2; 8485 /* startpos for collecting untranslatable chars */ 8486 Py_ssize_t collstart = i; 8487 Py_ssize_t collend = i+1; 8488 Py_ssize_t coll; 8489 8490 /* find all untranslatable characters */ 8491 while (collend < size) { 8492 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x)) 8493 goto onError; 8494 Py_XDECREF(x); 8495 if (x!=Py_None) 8496 break; 8497 ++collend; 8498 } 8499 /* cache callback name lookup 8500 * (if not done yet, i.e. 
it's the first error) */ 8501 if (known_errorHandler==-1) { 8502 if ((errors==NULL) || (!strcmp(errors, "strict"))) 8503 known_errorHandler = 1; 8504 else if (!strcmp(errors, "replace")) 8505 known_errorHandler = 2; 8506 else if (!strcmp(errors, "ignore")) 8507 known_errorHandler = 3; 8508 else if (!strcmp(errors, "xmlcharrefreplace")) 8509 known_errorHandler = 4; 8510 else 8511 known_errorHandler = 0; 8512 } 8513 switch (known_errorHandler) { 8514 case 1: /* strict */ 8515 raise_translate_exception(&exc, input, collstart, 8516 collend, reason); 8517 goto onError; 8518 case 2: /* replace */ 8519 /* No need to check for space, this is a 1:1 replacement */ 8520 for (coll = collstart; coll<collend; coll++) 8521 output[opos++] = '?'; 8522 /* fall through */ 8523 case 3: /* ignore */ 8524 i = collend; 8525 break; 8526 case 4: /* xmlcharrefreplace */ 8527 /* generate replacement (temporarily (mis)uses i) */ 8528 for (i = collstart; i < collend; ++i) { 8529 char buffer[2+29+1+1]; 8530 char *cp; 8531 sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i)); 8532 if (charmaptranslate_makespace(&output, &osize, 8533 opos+strlen(buffer)+(size-collend))) 8534 goto onError; 8535 for (cp = buffer; *cp; ++cp) 8536 output[opos++] = *cp; 8537 } 8538 i = collend; 8539 break; 8540 default: 8541 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler, 8542 reason, input, &exc, 8543 collstart, collend, &newpos); 8544 if (repunicode == NULL) 8545 goto onError; 8546 if (PyUnicode_READY(repunicode) == -1) { 8547 Py_DECREF(repunicode); 8548 goto onError; 8549 } 8550 /* generate replacement */ 8551 repsize = PyUnicode_GET_LENGTH(repunicode); 8552 if (charmaptranslate_makespace(&output, &osize, 8553 opos+repsize+(size-collend))) { 8554 Py_DECREF(repunicode); 8555 goto onError; 8556 } 8557 for (uni2 = 0; repsize-->0; ++uni2) 8558 output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2); 8559 i = newpos; 8560 Py_DECREF(repunicode); 8561 } 8562 } 8563 } 8564 res = 
PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos); 8565 if (!res) 8566 goto onError; 8567 PyMem_Free(output); 8568 Py_XDECREF(exc); 8569 Py_XDECREF(errorHandler); 8570 return res; 8571 8572 onError: 8573 PyMem_Free(output); 8574 Py_XDECREF(exc); 8575 Py_XDECREF(errorHandler); 8576 return NULL; 8577} 8578 8579/* Deprecated. Use PyUnicode_Translate instead. */ 8580PyObject * 8581PyUnicode_TranslateCharmap(const Py_UNICODE *p, 8582 Py_ssize_t size, 8583 PyObject *mapping, 8584 const char *errors) 8585{ 8586 PyObject *result; 8587 PyObject *unicode = PyUnicode_FromUnicode(p, size); 8588 if (!unicode) 8589 return NULL; 8590 result = _PyUnicode_TranslateCharmap(unicode, mapping, errors); 8591 Py_DECREF(unicode); 8592 return result; 8593} 8594 8595PyObject * 8596PyUnicode_Translate(PyObject *str, 8597 PyObject *mapping, 8598 const char *errors) 8599{ 8600 PyObject *result; 8601 8602 str = PyUnicode_FromObject(str); 8603 if (str == NULL) 8604 return NULL; 8605 result = _PyUnicode_TranslateCharmap(str, mapping, errors); 8606 Py_DECREF(str); 8607 return result; 8608} 8609 8610static Py_UCS4 8611fix_decimal_and_space_to_ascii(PyObject *self) 8612{ 8613 /* No need to call PyUnicode_READY(self) because this function is only 8614 called as a callback from fixup() which does it already. 
*/ 8615 const Py_ssize_t len = PyUnicode_GET_LENGTH(self); 8616 const int kind = PyUnicode_KIND(self); 8617 void *data = PyUnicode_DATA(self); 8618 Py_UCS4 maxchar = 127, ch, fixed; 8619 int modified = 0; 8620 Py_ssize_t i; 8621 8622 for (i = 0; i < len; ++i) { 8623 ch = PyUnicode_READ(kind, data, i); 8624 fixed = 0; 8625 if (ch > 127) { 8626 if (Py_UNICODE_ISSPACE(ch)) 8627 fixed = ' '; 8628 else { 8629 const int decimal = Py_UNICODE_TODECIMAL(ch); 8630 if (decimal >= 0) 8631 fixed = '0' + decimal; 8632 } 8633 if (fixed != 0) { 8634 modified = 1; 8635 maxchar = MAX_MAXCHAR(maxchar, fixed); 8636 PyUnicode_WRITE(kind, data, i, fixed); 8637 } 8638 else 8639 maxchar = MAX_MAXCHAR(maxchar, ch); 8640 } 8641 } 8642 8643 return (modified) ? maxchar : 0; 8644} 8645 8646PyObject * 8647_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode) 8648{ 8649 if (!PyUnicode_Check(unicode)) { 8650 PyErr_BadInternalCall(); 8651 return NULL; 8652 } 8653 if (PyUnicode_READY(unicode) == -1) 8654 return NULL; 8655 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) { 8656 /* If the string is already ASCII, just return the same string */ 8657 Py_INCREF(unicode); 8658 return unicode; 8659 } 8660 return fixup(unicode, fix_decimal_and_space_to_ascii); 8661} 8662 8663PyObject * 8664PyUnicode_TransformDecimalToASCII(Py_UNICODE *s, 8665 Py_ssize_t length) 8666{ 8667 PyObject *decimal; 8668 Py_ssize_t i; 8669 Py_UCS4 maxchar; 8670 enum PyUnicode_Kind kind; 8671 void *data; 8672 8673 maxchar = 127; 8674 for (i = 0; i < length; i++) { 8675 Py_UNICODE ch = s[i]; 8676 if (ch > 127) { 8677 int decimal = Py_UNICODE_TODECIMAL(ch); 8678 if (decimal >= 0) 8679 ch = '0' + decimal; 8680 maxchar = MAX_MAXCHAR(maxchar, ch); 8681 } 8682 } 8683 8684 /* Copy to a new string */ 8685 decimal = PyUnicode_New(length, maxchar); 8686 if (decimal == NULL) 8687 return decimal; 8688 kind = PyUnicode_KIND(decimal); 8689 data = PyUnicode_DATA(decimal); 8690 /* Iterate over code points */ 8691 for (i = 0; i < length; i++) { 
8692 Py_UNICODE ch = s[i]; 8693 if (ch > 127) { 8694 int decimal = Py_UNICODE_TODECIMAL(ch); 8695 if (decimal >= 0) 8696 ch = '0' + decimal; 8697 } 8698 PyUnicode_WRITE(kind, data, i, ch); 8699 } 8700 return unicode_result(decimal); 8701} 8702/* --- Decimal Encoder ---------------------------------------------------- */ 8703 8704int 8705PyUnicode_EncodeDecimal(Py_UNICODE *s, 8706 Py_ssize_t length, 8707 char *output, 8708 const char *errors) 8709{ 8710 PyObject *unicode; 8711 Py_ssize_t i; 8712 enum PyUnicode_Kind kind; 8713 void *data; 8714 8715 if (output == NULL) { 8716 PyErr_BadArgument(); 8717 return -1; 8718 } 8719 8720 unicode = PyUnicode_FromUnicode(s, length); 8721 if (unicode == NULL) 8722 return -1; 8723 8724 if (PyUnicode_READY(unicode) == -1) { 8725 Py_DECREF(unicode); 8726 return -1; 8727 } 8728 kind = PyUnicode_KIND(unicode); 8729 data = PyUnicode_DATA(unicode); 8730 8731 for (i=0; i < length; ) { 8732 PyObject *exc; 8733 Py_UCS4 ch; 8734 int decimal; 8735 Py_ssize_t startpos; 8736 8737 ch = PyUnicode_READ(kind, data, i); 8738 8739 if (Py_UNICODE_ISSPACE(ch)) { 8740 *output++ = ' '; 8741 i++; 8742 continue; 8743 } 8744 decimal = Py_UNICODE_TODECIMAL(ch); 8745 if (decimal >= 0) { 8746 *output++ = '0' + decimal; 8747 i++; 8748 continue; 8749 } 8750 if (0 < ch && ch < 256) { 8751 *output++ = (char)ch; 8752 i++; 8753 continue; 8754 } 8755 8756 startpos = i; 8757 exc = NULL; 8758 raise_encode_exception(&exc, "decimal", unicode, 8759 startpos, startpos+1, 8760 "invalid decimal Unicode string"); 8761 Py_XDECREF(exc); 8762 Py_DECREF(unicode); 8763 return -1; 8764 } 8765 /* 0-terminate the output string */ 8766 *output++ = '\0'; 8767 Py_DECREF(unicode); 8768 return 0; 8769} 8770 8771/* --- Helpers ------------------------------------------------------------ */ 8772 8773static Py_ssize_t 8774any_find_slice(int direction, PyObject* s1, PyObject* s2, 8775 Py_ssize_t start, 8776 Py_ssize_t end) 8777{ 8778 int kind1, kind2, kind; 8779 void *buf1, *buf2; 8780 
Py_ssize_t len1, len2, result; 8781 8782 kind1 = PyUnicode_KIND(s1); 8783 kind2 = PyUnicode_KIND(s2); 8784 kind = kind1 > kind2 ? kind1 : kind2; 8785 buf1 = PyUnicode_DATA(s1); 8786 buf2 = PyUnicode_DATA(s2); 8787 if (kind1 != kind) 8788 buf1 = _PyUnicode_AsKind(s1, kind); 8789 if (!buf1) 8790 return -2; 8791 if (kind2 != kind) 8792 buf2 = _PyUnicode_AsKind(s2, kind); 8793 if (!buf2) { 8794 if (kind1 != kind) PyMem_Free(buf1); 8795 return -2; 8796 } 8797 len1 = PyUnicode_GET_LENGTH(s1); 8798 len2 = PyUnicode_GET_LENGTH(s2); 8799 8800 if (direction > 0) { 8801 switch (kind) { 8802 case PyUnicode_1BYTE_KIND: 8803 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2)) 8804 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end); 8805 else 8806 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end); 8807 break; 8808 case PyUnicode_2BYTE_KIND: 8809 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end); 8810 break; 8811 case PyUnicode_4BYTE_KIND: 8812 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end); 8813 break; 8814 default: 8815 assert(0); result = -2; 8816 } 8817 } 8818 else { 8819 switch (kind) { 8820 case PyUnicode_1BYTE_KIND: 8821 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2)) 8822 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end); 8823 else 8824 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end); 8825 break; 8826 case PyUnicode_2BYTE_KIND: 8827 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end); 8828 break; 8829 case PyUnicode_4BYTE_KIND: 8830 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end); 8831 break; 8832 default: 8833 assert(0); result = -2; 8834 } 8835 } 8836 8837 if (kind1 != kind) 8838 PyMem_Free(buf1); 8839 if (kind2 != kind) 8840 PyMem_Free(buf2); 8841 8842 return result; 8843} 8844 8845Py_ssize_t 8846_PyUnicode_InsertThousandsGrouping( 8847 PyObject *unicode, Py_ssize_t index, 8848 Py_ssize_t n_buffer, 8849 void *digits, Py_ssize_t n_digits, 8850 
Py_ssize_t min_width, 8851 const char *grouping, PyObject *thousands_sep, 8852 Py_UCS4 *maxchar) 8853{ 8854 unsigned int kind, thousands_sep_kind; 8855 char *data, *thousands_sep_data; 8856 Py_ssize_t thousands_sep_len; 8857 Py_ssize_t len; 8858 8859 if (unicode != NULL) { 8860 kind = PyUnicode_KIND(unicode); 8861 data = (char *) PyUnicode_DATA(unicode) + index * kind; 8862 } 8863 else { 8864 kind = PyUnicode_1BYTE_KIND; 8865 data = NULL; 8866 } 8867 thousands_sep_kind = PyUnicode_KIND(thousands_sep); 8868 thousands_sep_data = PyUnicode_DATA(thousands_sep); 8869 thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep); 8870 if (unicode != NULL && thousands_sep_kind != kind) { 8871 if (thousands_sep_kind < kind) { 8872 thousands_sep_data = _PyUnicode_AsKind(thousands_sep, kind); 8873 if (!thousands_sep_data) 8874 return -1; 8875 } 8876 else { 8877 data = _PyUnicode_AsKind(unicode, thousands_sep_kind); 8878 if (!data) 8879 return -1; 8880 } 8881 } 8882 8883 switch (kind) { 8884 case PyUnicode_1BYTE_KIND: 8885 if (unicode != NULL && PyUnicode_IS_ASCII(unicode)) 8886 len = asciilib_InsertThousandsGrouping( 8887 (Py_UCS1 *) data, n_buffer, (Py_UCS1 *) digits, n_digits, 8888 min_width, grouping, 8889 (Py_UCS1 *) thousands_sep_data, thousands_sep_len); 8890 else 8891 len = ucs1lib_InsertThousandsGrouping( 8892 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits, 8893 min_width, grouping, 8894 (Py_UCS1 *) thousands_sep_data, thousands_sep_len); 8895 break; 8896 case PyUnicode_2BYTE_KIND: 8897 len = ucs2lib_InsertThousandsGrouping( 8898 (Py_UCS2 *) data, n_buffer, (Py_UCS2 *) digits, n_digits, 8899 min_width, grouping, 8900 (Py_UCS2 *) thousands_sep_data, thousands_sep_len); 8901 break; 8902 case PyUnicode_4BYTE_KIND: 8903 len = ucs4lib_InsertThousandsGrouping( 8904 (Py_UCS4 *) data, n_buffer, (Py_UCS4 *) digits, n_digits, 8905 min_width, grouping, 8906 (Py_UCS4 *) thousands_sep_data, thousands_sep_len); 8907 break; 8908 default: 8909 assert(0); 8910 return -1; 8911 } 8912 
if (unicode != NULL && thousands_sep_kind != kind) { 8913 if (thousands_sep_kind < kind) 8914 PyMem_Free(thousands_sep_data); 8915 else 8916 PyMem_Free(data); 8917 } 8918 if (unicode == NULL) { 8919 *maxchar = 127; 8920 if (len != n_digits) { 8921 *maxchar = MAX_MAXCHAR(*maxchar, 8922 PyUnicode_MAX_CHAR_VALUE(thousands_sep)); 8923 } 8924 } 8925 return len; 8926} 8927 8928 8929/* helper macro to fixup start/end slice values */ 8930#define ADJUST_INDICES(start, end, len) \ 8931 if (end > len) \ 8932 end = len; \ 8933 else if (end < 0) { \ 8934 end += len; \ 8935 if (end < 0) \ 8936 end = 0; \ 8937 } \ 8938 if (start < 0) { \ 8939 start += len; \ 8940 if (start < 0) \ 8941 start = 0; \ 8942 } 8943 8944Py_ssize_t 8945PyUnicode_Count(PyObject *str, 8946 PyObject *substr, 8947 Py_ssize_t start, 8948 Py_ssize_t end) 8949{ 8950 Py_ssize_t result; 8951 PyObject* str_obj; 8952 PyObject* sub_obj; 8953 int kind1, kind2, kind; 8954 void *buf1 = NULL, *buf2 = NULL; 8955 Py_ssize_t len1, len2; 8956 8957 str_obj = PyUnicode_FromObject(str); 8958 if (!str_obj) 8959 return -1; 8960 sub_obj = PyUnicode_FromObject(substr); 8961 if (!sub_obj) { 8962 Py_DECREF(str_obj); 8963 return -1; 8964 } 8965 if (PyUnicode_READY(sub_obj) == -1 || PyUnicode_READY(str_obj) == -1) { 8966 Py_DECREF(sub_obj); 8967 Py_DECREF(str_obj); 8968 return -1; 8969 } 8970 8971 kind1 = PyUnicode_KIND(str_obj); 8972 kind2 = PyUnicode_KIND(sub_obj); 8973 kind = kind1; 8974 buf1 = PyUnicode_DATA(str_obj); 8975 buf2 = PyUnicode_DATA(sub_obj); 8976 if (kind2 != kind) { 8977 if (kind2 > kind) { 8978 Py_DECREF(sub_obj); 8979 Py_DECREF(str_obj); 8980 return 0; 8981 } 8982 buf2 = _PyUnicode_AsKind(sub_obj, kind); 8983 } 8984 if (!buf2) 8985 goto onError; 8986 len1 = PyUnicode_GET_LENGTH(str_obj); 8987 len2 = PyUnicode_GET_LENGTH(sub_obj); 8988 8989 ADJUST_INDICES(start, end, len1); 8990 switch (kind) { 8991 case PyUnicode_1BYTE_KIND: 8992 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sub_obj)) 8993 result = 
asciilib_count( 8994 ((Py_UCS1*)buf1) + start, end - start, 8995 buf2, len2, PY_SSIZE_T_MAX 8996 ); 8997 else 8998 result = ucs1lib_count( 8999 ((Py_UCS1*)buf1) + start, end - start, 9000 buf2, len2, PY_SSIZE_T_MAX 9001 ); 9002 break; 9003 case PyUnicode_2BYTE_KIND: 9004 result = ucs2lib_count( 9005 ((Py_UCS2*)buf1) + start, end - start, 9006 buf2, len2, PY_SSIZE_T_MAX 9007 ); 9008 break; 9009 case PyUnicode_4BYTE_KIND: 9010 result = ucs4lib_count( 9011 ((Py_UCS4*)buf1) + start, end - start, 9012 buf2, len2, PY_SSIZE_T_MAX 9013 ); 9014 break; 9015 default: 9016 assert(0); result = 0; 9017 } 9018 9019 Py_DECREF(sub_obj); 9020 Py_DECREF(str_obj); 9021 9022 if (kind2 != kind) 9023 PyMem_Free(buf2); 9024 9025 return result; 9026 onError: 9027 Py_DECREF(sub_obj); 9028 Py_DECREF(str_obj); 9029 if (kind2 != kind && buf2) 9030 PyMem_Free(buf2); 9031 return -1; 9032} 9033 9034Py_ssize_t 9035PyUnicode_Find(PyObject *str, 9036 PyObject *sub, 9037 Py_ssize_t start, 9038 Py_ssize_t end, 9039 int direction) 9040{ 9041 Py_ssize_t result; 9042 9043 str = PyUnicode_FromObject(str); 9044 if (!str) 9045 return -2; 9046 sub = PyUnicode_FromObject(sub); 9047 if (!sub) { 9048 Py_DECREF(str); 9049 return -2; 9050 } 9051 if (PyUnicode_READY(sub) == -1 || PyUnicode_READY(str) == -1) { 9052 Py_DECREF(sub); 9053 Py_DECREF(str); 9054 return -2; 9055 } 9056 9057 result = any_find_slice(direction, 9058 str, sub, start, end 9059 ); 9060 9061 Py_DECREF(str); 9062 Py_DECREF(sub); 9063 9064 return result; 9065} 9066 9067Py_ssize_t 9068PyUnicode_FindChar(PyObject *str, Py_UCS4 ch, 9069 Py_ssize_t start, Py_ssize_t end, 9070 int direction) 9071{ 9072 int kind; 9073 Py_ssize_t result; 9074 if (PyUnicode_READY(str) == -1) 9075 return -2; 9076 if (start < 0 || end < 0) { 9077 PyErr_SetString(PyExc_IndexError, "string index out of range"); 9078 return -2; 9079 } 9080 if (end > PyUnicode_GET_LENGTH(str)) 9081 end = PyUnicode_GET_LENGTH(str); 9082 kind = PyUnicode_KIND(str); 9083 result = 
findchar(PyUnicode_1BYTE_DATA(str) + kind*start, 9084 kind, end-start, ch, direction); 9085 if (result == -1) 9086 return -1; 9087 else 9088 return start + result; 9089} 9090 9091static int 9092tailmatch(PyObject *self, 9093 PyObject *substring, 9094 Py_ssize_t start, 9095 Py_ssize_t end, 9096 int direction) 9097{ 9098 int kind_self; 9099 int kind_sub; 9100 void *data_self; 9101 void *data_sub; 9102 Py_ssize_t offset; 9103 Py_ssize_t i; 9104 Py_ssize_t end_sub; 9105 9106 if (PyUnicode_READY(self) == -1 || 9107 PyUnicode_READY(substring) == -1) 9108 return 0; 9109 9110 if (PyUnicode_GET_LENGTH(substring) == 0) 9111 return 1; 9112 9113 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self)); 9114 end -= PyUnicode_GET_LENGTH(substring); 9115 if (end < start) 9116 return 0; 9117 9118 kind_self = PyUnicode_KIND(self); 9119 data_self = PyUnicode_DATA(self); 9120 kind_sub = PyUnicode_KIND(substring); 9121 data_sub = PyUnicode_DATA(substring); 9122 end_sub = PyUnicode_GET_LENGTH(substring) - 1; 9123 9124 if (direction > 0) 9125 offset = end; 9126 else 9127 offset = start; 9128 9129 if (PyUnicode_READ(kind_self, data_self, offset) == 9130 PyUnicode_READ(kind_sub, data_sub, 0) && 9131 PyUnicode_READ(kind_self, data_self, offset + end_sub) == 9132 PyUnicode_READ(kind_sub, data_sub, end_sub)) { 9133 /* If both are of the same kind, memcmp is sufficient */ 9134 if (kind_self == kind_sub) { 9135 return ! memcmp((char *)data_self + 9136 (offset * PyUnicode_KIND(substring)), 9137 data_sub, 9138 PyUnicode_GET_LENGTH(substring) * 9139 PyUnicode_KIND(substring)); 9140 } 9141 /* otherwise we have to compare each character by first accesing it */ 9142 else { 9143 /* We do not need to compare 0 and len(substring)-1 because 9144 the if statement above ensured already that they are equal 9145 when we end up here. 
*/ 9146 /* TODO: honor direction and do a forward or backwards search */ 9147 for (i = 1; i < end_sub; ++i) { 9148 if (PyUnicode_READ(kind_self, data_self, offset + i) != 9149 PyUnicode_READ(kind_sub, data_sub, i)) 9150 return 0; 9151 } 9152 return 1; 9153 } 9154 } 9155 9156 return 0; 9157} 9158 9159Py_ssize_t 9160PyUnicode_Tailmatch(PyObject *str, 9161 PyObject *substr, 9162 Py_ssize_t start, 9163 Py_ssize_t end, 9164 int direction) 9165{ 9166 Py_ssize_t result; 9167 9168 str = PyUnicode_FromObject(str); 9169 if (str == NULL) 9170 return -1; 9171 substr = PyUnicode_FromObject(substr); 9172 if (substr == NULL) { 9173 Py_DECREF(str); 9174 return -1; 9175 } 9176 9177 result = tailmatch(str, substr, 9178 start, end, direction); 9179 Py_DECREF(str); 9180 Py_DECREF(substr); 9181 return result; 9182} 9183 9184/* Apply fixfct filter to the Unicode object self and return a 9185 reference to the modified object */ 9186 9187static PyObject * 9188fixup(PyObject *self, 9189 Py_UCS4 (*fixfct)(PyObject *s)) 9190{ 9191 PyObject *u; 9192 Py_UCS4 maxchar_old, maxchar_new = 0; 9193 PyObject *v; 9194 9195 u = _PyUnicode_Copy(self); 9196 if (u == NULL) 9197 return NULL; 9198 maxchar_old = PyUnicode_MAX_CHAR_VALUE(u); 9199 9200 /* fix functions return the new maximum character in a string, 9201 if the kind of the resulting unicode object does not change, 9202 everything is fine. Otherwise we need to change the string kind 9203 and re-run the fix function. */ 9204 maxchar_new = fixfct(u); 9205 9206 if (maxchar_new == 0) { 9207 /* no changes */; 9208 if (PyUnicode_CheckExact(self)) { 9209 Py_DECREF(u); 9210 Py_INCREF(self); 9211 return self; 9212 } 9213 else 9214 return u; 9215 } 9216 9217 maxchar_new = align_maxchar(maxchar_new); 9218 9219 if (maxchar_new == maxchar_old) 9220 return u; 9221 9222 /* In case the maximum character changed, we need to 9223 convert the string to the new category. 
*/ 9224 v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new); 9225 if (v == NULL) { 9226 Py_DECREF(u); 9227 return NULL; 9228 } 9229 if (maxchar_new > maxchar_old) { 9230 /* If the maxchar increased so that the kind changed, not all 9231 characters are representable anymore and we need to fix the 9232 string again. This only happens in very few cases. */ 9233 _PyUnicode_FastCopyCharacters(v, 0, 9234 self, 0, PyUnicode_GET_LENGTH(self)); 9235 maxchar_old = fixfct(v); 9236 assert(maxchar_old > 0 && maxchar_old <= maxchar_new); 9237 } 9238 else { 9239 _PyUnicode_FastCopyCharacters(v, 0, 9240 u, 0, PyUnicode_GET_LENGTH(self)); 9241 } 9242 Py_DECREF(u); 9243 assert(_PyUnicode_CheckConsistency(v, 1)); 9244 return v; 9245} 9246 9247static PyObject * 9248ascii_upper_or_lower(PyObject *self, int lower) 9249{ 9250 Py_ssize_t len = PyUnicode_GET_LENGTH(self); 9251 char *resdata, *data = PyUnicode_DATA(self); 9252 PyObject *res; 9253 9254 res = PyUnicode_New(len, 127); 9255 if (res == NULL) 9256 return NULL; 9257 resdata = PyUnicode_DATA(res); 9258 if (lower) 9259 _Py_bytes_lower(resdata, data, len); 9260 else 9261 _Py_bytes_upper(resdata, data, len); 9262 return res; 9263} 9264 9265static Py_UCS4 9266handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i) 9267{ 9268 Py_ssize_t j; 9269 int final_sigma; 9270 Py_UCS4 c; 9271 /* U+03A3 is in the Final_Sigma context when, it is found like this: 9272 9273 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased}) 9274 9275 where ! is a negation and \p{xxx} is a character with property xxx. 
9276 */ 9277 for (j = i - 1; j >= 0; j--) { 9278 c = PyUnicode_READ(kind, data, j); 9279 if (!_PyUnicode_IsCaseIgnorable(c)) 9280 break; 9281 } 9282 final_sigma = j >= 0 && _PyUnicode_IsCased(c); 9283 if (final_sigma) { 9284 for (j = i + 1; j < length; j++) { 9285 c = PyUnicode_READ(kind, data, j); 9286 if (!_PyUnicode_IsCaseIgnorable(c)) 9287 break; 9288 } 9289 final_sigma = j == length || !_PyUnicode_IsCased(c); 9290 } 9291 return (final_sigma) ? 0x3C2 : 0x3C3; 9292} 9293 9294static int 9295lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i, 9296 Py_UCS4 c, Py_UCS4 *mapped) 9297{ 9298 /* Obscure special case. */ 9299 if (c == 0x3A3) { 9300 mapped[0] = handle_capital_sigma(kind, data, length, i); 9301 return 1; 9302 } 9303 return _PyUnicode_ToLowerFull(c, mapped); 9304} 9305 9306static Py_ssize_t 9307do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) 9308{ 9309 Py_ssize_t i, k = 0; 9310 int n_res, j; 9311 Py_UCS4 c, mapped[3]; 9312 9313 c = PyUnicode_READ(kind, data, 0); 9314 n_res = _PyUnicode_ToUpperFull(c, mapped); 9315 for (j = 0; j < n_res; j++) { 9316 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]); 9317 res[k++] = mapped[j]; 9318 } 9319 for (i = 1; i < length; i++) { 9320 c = PyUnicode_READ(kind, data, i); 9321 n_res = lower_ucs4(kind, data, length, i, c, mapped); 9322 for (j = 0; j < n_res; j++) { 9323 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]); 9324 res[k++] = mapped[j]; 9325 } 9326 } 9327 return k; 9328} 9329 9330static Py_ssize_t 9331do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) { 9332 Py_ssize_t i, k = 0; 9333 9334 for (i = 0; i < length; i++) { 9335 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3]; 9336 int n_res, j; 9337 if (Py_UNICODE_ISUPPER(c)) { 9338 n_res = lower_ucs4(kind, data, length, i, c, mapped); 9339 } 9340 else if (Py_UNICODE_ISLOWER(c)) { 9341 n_res = _PyUnicode_ToUpperFull(c, mapped); 9342 } 9343 else { 9344 n_res = 1; 9345 mapped[0] = c; 9346 } 
9347 for (j = 0; j < n_res; j++) { 9348 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]); 9349 res[k++] = mapped[j]; 9350 } 9351 } 9352 return k; 9353} 9354 9355static Py_ssize_t 9356do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, 9357 Py_UCS4 *maxchar, int lower) 9358{ 9359 Py_ssize_t i, k = 0; 9360 9361 for (i = 0; i < length; i++) { 9362 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3]; 9363 int n_res, j; 9364 if (lower) 9365 n_res = lower_ucs4(kind, data, length, i, c, mapped); 9366 else 9367 n_res = _PyUnicode_ToUpperFull(c, mapped); 9368 for (j = 0; j < n_res; j++) { 9369 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]); 9370 res[k++] = mapped[j]; 9371 } 9372 } 9373 return k; 9374} 9375 9376static Py_ssize_t 9377do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) 9378{ 9379 return do_upper_or_lower(kind, data, length, res, maxchar, 0); 9380} 9381 9382static Py_ssize_t 9383do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) 9384{ 9385 return do_upper_or_lower(kind, data, length, res, maxchar, 1); 9386} 9387 9388static Py_ssize_t 9389do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) 9390{ 9391 Py_ssize_t i, k = 0; 9392 9393 for (i = 0; i < length; i++) { 9394 Py_UCS4 c = PyUnicode_READ(kind, data, i); 9395 Py_UCS4 mapped[3]; 9396 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped); 9397 for (j = 0; j < n_res; j++) { 9398 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]); 9399 res[k++] = mapped[j]; 9400 } 9401 } 9402 return k; 9403} 9404 9405static Py_ssize_t 9406do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) 9407{ 9408 Py_ssize_t i, k = 0; 9409 int previous_is_cased; 9410 9411 previous_is_cased = 0; 9412 for (i = 0; i < length; i++) { 9413 const Py_UCS4 c = PyUnicode_READ(kind, data, i); 9414 Py_UCS4 mapped[3]; 9415 int n_res, j; 9416 9417 if (previous_is_cased) 9418 n_res = lower_ucs4(kind, data, length, i, c, mapped); 
9419 else 9420 n_res = _PyUnicode_ToTitleFull(c, mapped); 9421 9422 for (j = 0; j < n_res; j++) { 9423 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]); 9424 res[k++] = mapped[j]; 9425 } 9426 9427 previous_is_cased = _PyUnicode_IsCased(c); 9428 } 9429 return k; 9430} 9431 9432static PyObject * 9433case_operation(PyObject *self, 9434 Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *)) 9435{ 9436 PyObject *res = NULL; 9437 Py_ssize_t length, newlength = 0; 9438 int kind, outkind; 9439 void *data, *outdata; 9440 Py_UCS4 maxchar = 0, *tmp, *tmpend; 9441 9442 assert(PyUnicode_IS_READY(self)); 9443 9444 kind = PyUnicode_KIND(self); 9445 data = PyUnicode_DATA(self); 9446 length = PyUnicode_GET_LENGTH(self); 9447 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length); 9448 if (tmp == NULL) 9449 return PyErr_NoMemory(); 9450 newlength = perform(kind, data, length, tmp, &maxchar); 9451 res = PyUnicode_New(newlength, maxchar); 9452 if (res == NULL) 9453 goto leave; 9454 tmpend = tmp + newlength; 9455 outdata = PyUnicode_DATA(res); 9456 outkind = PyUnicode_KIND(res); 9457 switch (outkind) { 9458 case PyUnicode_1BYTE_KIND: 9459 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata); 9460 break; 9461 case PyUnicode_2BYTE_KIND: 9462 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata); 9463 break; 9464 case PyUnicode_4BYTE_KIND: 9465 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength); 9466 break; 9467 default: 9468 assert(0); 9469 break; 9470 } 9471 leave: 9472 PyMem_FREE(tmp); 9473 return res; 9474} 9475 9476PyObject * 9477PyUnicode_Join(PyObject *separator, PyObject *seq) 9478{ 9479 PyObject *sep = NULL; 9480 Py_ssize_t seplen; 9481 PyObject *res = NULL; /* the result */ 9482 PyObject *fseq; /* PySequence_Fast(seq) */ 9483 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */ 9484 PyObject **items; 9485 PyObject *item; 9486 Py_ssize_t sz, i, res_offset; 9487 Py_UCS4 maxchar; 9488 Py_UCS4 item_maxchar; 9489 int use_memcpy; 9490 unsigned 
char *res_data = NULL, *sep_data = NULL; 9491 PyObject *last_obj; 9492 unsigned int kind = 0; 9493 9494 fseq = PySequence_Fast(seq, ""); 9495 if (fseq == NULL) { 9496 return NULL; 9497 } 9498 9499 /* NOTE: the following code can't call back into Python code, 9500 * so we are sure that fseq won't be mutated. 9501 */ 9502 9503 seqlen = PySequence_Fast_GET_SIZE(fseq); 9504 /* If empty sequence, return u"". */ 9505 if (seqlen == 0) { 9506 Py_DECREF(fseq); 9507 Py_INCREF(unicode_empty); 9508 res = unicode_empty; 9509 return res; 9510 } 9511 9512 /* If singleton sequence with an exact Unicode, return that. */ 9513 last_obj = NULL; 9514 items = PySequence_Fast_ITEMS(fseq); 9515 if (seqlen == 1) { 9516 if (PyUnicode_CheckExact(items[0])) { 9517 res = items[0]; 9518 Py_INCREF(res); 9519 Py_DECREF(fseq); 9520 return res; 9521 } 9522 seplen = 0; 9523 maxchar = 0; 9524 } 9525 else { 9526 /* Set up sep and seplen */ 9527 if (separator == NULL) { 9528 /* fall back to a blank space separator */ 9529 sep = PyUnicode_FromOrdinal(' '); 9530 if (!sep) 9531 goto onError; 9532 seplen = 1; 9533 maxchar = 32; 9534 } 9535 else { 9536 if (!PyUnicode_Check(separator)) { 9537 PyErr_Format(PyExc_TypeError, 9538 "separator: expected str instance," 9539 " %.80s found", 9540 Py_TYPE(separator)->tp_name); 9541 goto onError; 9542 } 9543 if (PyUnicode_READY(separator)) 9544 goto onError; 9545 sep = separator; 9546 seplen = PyUnicode_GET_LENGTH(separator); 9547 maxchar = PyUnicode_MAX_CHAR_VALUE(separator); 9548 /* inc refcount to keep this code path symmetric with the 9549 above case of a blank separator */ 9550 Py_INCREF(sep); 9551 } 9552 last_obj = sep; 9553 } 9554 9555 /* There are at least two things to join, or else we have a subclass 9556 * of str in the sequence. 9557 * Do a pre-pass to figure out the total amount of space we'll 9558 * need (sz), and see whether all argument are strings. 
9559 */ 9560 sz = 0; 9561#ifdef Py_DEBUG 9562 use_memcpy = 0; 9563#else 9564 use_memcpy = 1; 9565#endif 9566 for (i = 0; i < seqlen; i++) { 9567 const Py_ssize_t old_sz = sz; 9568 item = items[i]; 9569 if (!PyUnicode_Check(item)) { 9570 PyErr_Format(PyExc_TypeError, 9571 "sequence item %zd: expected str instance," 9572 " %.80s found", 9573 i, Py_TYPE(item)->tp_name); 9574 goto onError; 9575 } 9576 if (PyUnicode_READY(item) == -1) 9577 goto onError; 9578 sz += PyUnicode_GET_LENGTH(item); 9579 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item); 9580 maxchar = MAX_MAXCHAR(maxchar, item_maxchar); 9581 if (i != 0) 9582 sz += seplen; 9583 if (sz < old_sz || sz > PY_SSIZE_T_MAX) { 9584 PyErr_SetString(PyExc_OverflowError, 9585 "join() result is too long for a Python string"); 9586 goto onError; 9587 } 9588 if (use_memcpy && last_obj != NULL) { 9589 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item)) 9590 use_memcpy = 0; 9591 } 9592 last_obj = item; 9593 } 9594 9595 res = PyUnicode_New(sz, maxchar); 9596 if (res == NULL) 9597 goto onError; 9598 9599 /* Catenate everything. */ 9600#ifdef Py_DEBUG 9601 use_memcpy = 0; 9602#else 9603 if (use_memcpy) { 9604 res_data = PyUnicode_1BYTE_DATA(res); 9605 kind = PyUnicode_KIND(res); 9606 if (seplen != 0) 9607 sep_data = PyUnicode_1BYTE_DATA(sep); 9608 } 9609#endif 9610 for (i = 0, res_offset = 0; i < seqlen; ++i) { 9611 Py_ssize_t itemlen; 9612 item = items[i]; 9613 /* Copy item, and maybe the separator. 
/* Store `value` into `length` consecutive characters of the character
   buffer `data`, starting at character index `start`.  `kind` selects the
   element width and must be one of the 1, 2 or 4 byte kinds (asserted).

   `start`, `length` and `value` are each evaluated exactly once; `kind`
   and `data` are expected to be side-effect free expressions. */
#define FILL(kind, data, value, start, length) \
    do { \
        const Py_ssize_t start_ = (start); \
        const Py_ssize_t length_ = (length); \
        const Py_UCS4 value_ = (value); \
        Py_ssize_t i_; \
        assert((kind) != PyUnicode_WCHAR_KIND); \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            unsigned char *to_ = (unsigned char *)(data) + start_; \
            memset(to_, (unsigned char)value_, length_); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            Py_UCS2 *to_ = (Py_UCS2 *)(data) + start_; \
            for (i_ = 0; i_ < length_; ++i_) \
                to_[i_] = (Py_UCS2)value_; \
            break; \
        } \
        case PyUnicode_4BYTE_KIND: { \
            Py_UCS4 *to_ = (Py_UCS4 *)(data) + start_; \
            for (i_ = 0; i_ < length_; ++i_) \
                to_[i_] = value_; \
            break; \
        } \
        default: \
            assert(0); \
        } \
    } while (0)
assert(PyUnicode_IS_READY(unicode)); 9689 assert(unicode_modifiable(unicode)); 9690 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode)); 9691 assert(start >= 0); 9692 assert(start + length <= PyUnicode_GET_LENGTH(unicode)); 9693 FILL(kind, data, fill_char, start, length); 9694} 9695 9696Py_ssize_t 9697PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length, 9698 Py_UCS4 fill_char) 9699{ 9700 Py_ssize_t maxlen; 9701 9702 if (!PyUnicode_Check(unicode)) { 9703 PyErr_BadInternalCall(); 9704 return -1; 9705 } 9706 if (PyUnicode_READY(unicode) == -1) 9707 return -1; 9708 if (unicode_check_modifiable(unicode)) 9709 return -1; 9710 9711 if (start < 0) { 9712 PyErr_SetString(PyExc_IndexError, "string index out of range"); 9713 return -1; 9714 } 9715 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) { 9716 PyErr_SetString(PyExc_ValueError, 9717 "fill character is bigger than " 9718 "the string maximum character"); 9719 return -1; 9720 } 9721 9722 maxlen = PyUnicode_GET_LENGTH(unicode) - start; 9723 length = Py_MIN(maxlen, length); 9724 if (length <= 0) 9725 return 0; 9726 9727 _PyUnicode_FastFill(unicode, start, length, fill_char); 9728 return length; 9729} 9730 9731static PyObject * 9732pad(PyObject *self, 9733 Py_ssize_t left, 9734 Py_ssize_t right, 9735 Py_UCS4 fill) 9736{ 9737 PyObject *u; 9738 Py_UCS4 maxchar; 9739 int kind; 9740 void *data; 9741 9742 if (left < 0) 9743 left = 0; 9744 if (right < 0) 9745 right = 0; 9746 9747 if (left == 0 && right == 0) 9748 return unicode_result_unchanged(self); 9749 9750 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) || 9751 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) { 9752 PyErr_SetString(PyExc_OverflowError, "padded string is too long"); 9753 return NULL; 9754 } 9755 maxchar = PyUnicode_MAX_CHAR_VALUE(self); 9756 maxchar = MAX_MAXCHAR(maxchar, fill); 9757 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar); 9758 if (!u) 9759 return NULL; 9760 9761 kind = PyUnicode_KIND(u); 9762 
data = PyUnicode_DATA(u); 9763 if (left) 9764 FILL(kind, data, fill, 0, left); 9765 if (right) 9766 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right); 9767 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self)); 9768 assert(_PyUnicode_CheckConsistency(u, 1)); 9769 return u; 9770} 9771 9772PyObject * 9773PyUnicode_Splitlines(PyObject *string, int keepends) 9774{ 9775 PyObject *list; 9776 9777 string = PyUnicode_FromObject(string); 9778 if (string == NULL) 9779 return NULL; 9780 if (PyUnicode_READY(string) == -1) { 9781 Py_DECREF(string); 9782 return NULL; 9783 } 9784 9785 switch (PyUnicode_KIND(string)) { 9786 case PyUnicode_1BYTE_KIND: 9787 if (PyUnicode_IS_ASCII(string)) 9788 list = asciilib_splitlines( 9789 string, PyUnicode_1BYTE_DATA(string), 9790 PyUnicode_GET_LENGTH(string), keepends); 9791 else 9792 list = ucs1lib_splitlines( 9793 string, PyUnicode_1BYTE_DATA(string), 9794 PyUnicode_GET_LENGTH(string), keepends); 9795 break; 9796 case PyUnicode_2BYTE_KIND: 9797 list = ucs2lib_splitlines( 9798 string, PyUnicode_2BYTE_DATA(string), 9799 PyUnicode_GET_LENGTH(string), keepends); 9800 break; 9801 case PyUnicode_4BYTE_KIND: 9802 list = ucs4lib_splitlines( 9803 string, PyUnicode_4BYTE_DATA(string), 9804 PyUnicode_GET_LENGTH(string), keepends); 9805 break; 9806 default: 9807 assert(0); 9808 list = 0; 9809 } 9810 Py_DECREF(string); 9811 return list; 9812} 9813 9814static PyObject * 9815split(PyObject *self, 9816 PyObject *substring, 9817 Py_ssize_t maxcount) 9818{ 9819 int kind1, kind2, kind; 9820 void *buf1, *buf2; 9821 Py_ssize_t len1, len2; 9822 PyObject* out; 9823 9824 if (maxcount < 0) 9825 maxcount = PY_SSIZE_T_MAX; 9826 9827 if (PyUnicode_READY(self) == -1) 9828 return NULL; 9829 9830 if (substring == NULL) 9831 switch (PyUnicode_KIND(self)) { 9832 case PyUnicode_1BYTE_KIND: 9833 if (PyUnicode_IS_ASCII(self)) 9834 return asciilib_split_whitespace( 9835 self, PyUnicode_1BYTE_DATA(self), 9836 PyUnicode_GET_LENGTH(self), maxcount 
9837 ); 9838 else 9839 return ucs1lib_split_whitespace( 9840 self, PyUnicode_1BYTE_DATA(self), 9841 PyUnicode_GET_LENGTH(self), maxcount 9842 ); 9843 case PyUnicode_2BYTE_KIND: 9844 return ucs2lib_split_whitespace( 9845 self, PyUnicode_2BYTE_DATA(self), 9846 PyUnicode_GET_LENGTH(self), maxcount 9847 ); 9848 case PyUnicode_4BYTE_KIND: 9849 return ucs4lib_split_whitespace( 9850 self, PyUnicode_4BYTE_DATA(self), 9851 PyUnicode_GET_LENGTH(self), maxcount 9852 ); 9853 default: 9854 assert(0); 9855 return NULL; 9856 } 9857 9858 if (PyUnicode_READY(substring) == -1) 9859 return NULL; 9860 9861 kind1 = PyUnicode_KIND(self); 9862 kind2 = PyUnicode_KIND(substring); 9863 kind = kind1 > kind2 ? kind1 : kind2; 9864 buf1 = PyUnicode_DATA(self); 9865 buf2 = PyUnicode_DATA(substring); 9866 if (kind1 != kind) 9867 buf1 = _PyUnicode_AsKind(self, kind); 9868 if (!buf1) 9869 return NULL; 9870 if (kind2 != kind) 9871 buf2 = _PyUnicode_AsKind(substring, kind); 9872 if (!buf2) { 9873 if (kind1 != kind) PyMem_Free(buf1); 9874 return NULL; 9875 } 9876 len1 = PyUnicode_GET_LENGTH(self); 9877 len2 = PyUnicode_GET_LENGTH(substring); 9878 9879 switch (kind) { 9880 case PyUnicode_1BYTE_KIND: 9881 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring)) 9882 out = asciilib_split( 9883 self, buf1, len1, buf2, len2, maxcount); 9884 else 9885 out = ucs1lib_split( 9886 self, buf1, len1, buf2, len2, maxcount); 9887 break; 9888 case PyUnicode_2BYTE_KIND: 9889 out = ucs2lib_split( 9890 self, buf1, len1, buf2, len2, maxcount); 9891 break; 9892 case PyUnicode_4BYTE_KIND: 9893 out = ucs4lib_split( 9894 self, buf1, len1, buf2, len2, maxcount); 9895 break; 9896 default: 9897 out = NULL; 9898 } 9899 if (kind1 != kind) 9900 PyMem_Free(buf1); 9901 if (kind2 != kind) 9902 PyMem_Free(buf2); 9903 return out; 9904} 9905 9906static PyObject * 9907rsplit(PyObject *self, 9908 PyObject *substring, 9909 Py_ssize_t maxcount) 9910{ 9911 int kind1, kind2, kind; 9912 void *buf1, *buf2; 9913 Py_ssize_t len1, len2; 
9914 PyObject* out; 9915 9916 if (maxcount < 0) 9917 maxcount = PY_SSIZE_T_MAX; 9918 9919 if (PyUnicode_READY(self) == -1) 9920 return NULL; 9921 9922 if (substring == NULL) 9923 switch (PyUnicode_KIND(self)) { 9924 case PyUnicode_1BYTE_KIND: 9925 if (PyUnicode_IS_ASCII(self)) 9926 return asciilib_rsplit_whitespace( 9927 self, PyUnicode_1BYTE_DATA(self), 9928 PyUnicode_GET_LENGTH(self), maxcount 9929 ); 9930 else 9931 return ucs1lib_rsplit_whitespace( 9932 self, PyUnicode_1BYTE_DATA(self), 9933 PyUnicode_GET_LENGTH(self), maxcount 9934 ); 9935 case PyUnicode_2BYTE_KIND: 9936 return ucs2lib_rsplit_whitespace( 9937 self, PyUnicode_2BYTE_DATA(self), 9938 PyUnicode_GET_LENGTH(self), maxcount 9939 ); 9940 case PyUnicode_4BYTE_KIND: 9941 return ucs4lib_rsplit_whitespace( 9942 self, PyUnicode_4BYTE_DATA(self), 9943 PyUnicode_GET_LENGTH(self), maxcount 9944 ); 9945 default: 9946 assert(0); 9947 return NULL; 9948 } 9949 9950 if (PyUnicode_READY(substring) == -1) 9951 return NULL; 9952 9953 kind1 = PyUnicode_KIND(self); 9954 kind2 = PyUnicode_KIND(substring); 9955 kind = kind1 > kind2 ? 
kind1 : kind2; 9956 buf1 = PyUnicode_DATA(self); 9957 buf2 = PyUnicode_DATA(substring); 9958 if (kind1 != kind) 9959 buf1 = _PyUnicode_AsKind(self, kind); 9960 if (!buf1) 9961 return NULL; 9962 if (kind2 != kind) 9963 buf2 = _PyUnicode_AsKind(substring, kind); 9964 if (!buf2) { 9965 if (kind1 != kind) PyMem_Free(buf1); 9966 return NULL; 9967 } 9968 len1 = PyUnicode_GET_LENGTH(self); 9969 len2 = PyUnicode_GET_LENGTH(substring); 9970 9971 switch (kind) { 9972 case PyUnicode_1BYTE_KIND: 9973 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring)) 9974 out = asciilib_rsplit( 9975 self, buf1, len1, buf2, len2, maxcount); 9976 else 9977 out = ucs1lib_rsplit( 9978 self, buf1, len1, buf2, len2, maxcount); 9979 break; 9980 case PyUnicode_2BYTE_KIND: 9981 out = ucs2lib_rsplit( 9982 self, buf1, len1, buf2, len2, maxcount); 9983 break; 9984 case PyUnicode_4BYTE_KIND: 9985 out = ucs4lib_rsplit( 9986 self, buf1, len1, buf2, len2, maxcount); 9987 break; 9988 default: 9989 out = NULL; 9990 } 9991 if (kind1 != kind) 9992 PyMem_Free(buf1); 9993 if (kind2 != kind) 9994 PyMem_Free(buf2); 9995 return out; 9996} 9997 9998static Py_ssize_t 9999anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1, 10000 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset) 10001{ 10002 switch (kind) { 10003 case PyUnicode_1BYTE_KIND: 10004 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2)) 10005 return asciilib_find(buf1, len1, buf2, len2, offset); 10006 else 10007 return ucs1lib_find(buf1, len1, buf2, len2, offset); 10008 case PyUnicode_2BYTE_KIND: 10009 return ucs2lib_find(buf1, len1, buf2, len2, offset); 10010 case PyUnicode_4BYTE_KIND: 10011 return ucs4lib_find(buf1, len1, buf2, len2, offset); 10012 } 10013 assert(0); 10014 return -1; 10015} 10016 10017static Py_ssize_t 10018anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen, 10019 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount) 10020{ 10021 switch (kind) { 10022 case 
PyUnicode_1BYTE_KIND: 10023 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1)) 10024 return asciilib_count(sbuf, slen, buf1, len1, maxcount); 10025 else 10026 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount); 10027 case PyUnicode_2BYTE_KIND: 10028 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount); 10029 case PyUnicode_4BYTE_KIND: 10030 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount); 10031 } 10032 assert(0); 10033 return 0; 10034} 10035 10036static PyObject * 10037replace(PyObject *self, PyObject *str1, 10038 PyObject *str2, Py_ssize_t maxcount) 10039{ 10040 PyObject *u; 10041 char *sbuf = PyUnicode_DATA(self); 10042 char *buf1 = PyUnicode_DATA(str1); 10043 char *buf2 = PyUnicode_DATA(str2); 10044 int srelease = 0, release1 = 0, release2 = 0; 10045 int skind = PyUnicode_KIND(self); 10046 int kind1 = PyUnicode_KIND(str1); 10047 int kind2 = PyUnicode_KIND(str2); 10048 Py_ssize_t slen = PyUnicode_GET_LENGTH(self); 10049 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1); 10050 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2); 10051 int mayshrink; 10052 Py_UCS4 maxchar, maxchar_str2; 10053 10054 if (maxcount < 0) 10055 maxcount = PY_SSIZE_T_MAX; 10056 else if (maxcount == 0 || slen == 0) 10057 goto nothing; 10058 10059 if (str1 == str2) 10060 goto nothing; 10061 if (skind < kind1) 10062 /* substring too wide to be present */ 10063 goto nothing; 10064 10065 maxchar = PyUnicode_MAX_CHAR_VALUE(self); 10066 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2); 10067 /* Replacing str1 with str2 may cause a maxchar reduction in the 10068 result string. 
*/ 10069 mayshrink = (maxchar_str2 < maxchar); 10070 maxchar = MAX_MAXCHAR(maxchar, maxchar_str2); 10071 10072 if (len1 == len2) { 10073 /* same length */ 10074 if (len1 == 0) 10075 goto nothing; 10076 if (len1 == 1) { 10077 /* replace characters */ 10078 Py_UCS4 u1, u2; 10079 int rkind; 10080 Py_ssize_t index, pos; 10081 char *src; 10082 10083 u1 = PyUnicode_READ_CHAR(str1, 0); 10084 pos = findchar(sbuf, PyUnicode_KIND(self), slen, u1, 1); 10085 if (pos < 0) 10086 goto nothing; 10087 u2 = PyUnicode_READ_CHAR(str2, 0); 10088 u = PyUnicode_New(slen, maxchar); 10089 if (!u) 10090 goto error; 10091 _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen); 10092 rkind = PyUnicode_KIND(u); 10093 10094 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), pos, u2); 10095 index = 0; 10096 src = sbuf; 10097 while (--maxcount) 10098 { 10099 pos++; 10100 src += pos * PyUnicode_KIND(self); 10101 slen -= pos; 10102 index += pos; 10103 pos = findchar(src, PyUnicode_KIND(self), slen, u1, 1); 10104 if (pos < 0) 10105 break; 10106 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), index + pos, u2); 10107 } 10108 } 10109 else { 10110 int rkind = skind; 10111 char *res; 10112 Py_ssize_t i; 10113 10114 if (kind1 < rkind) { 10115 /* widen substring */ 10116 buf1 = _PyUnicode_AsKind(str1, rkind); 10117 if (!buf1) goto error; 10118 release1 = 1; 10119 } 10120 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0); 10121 if (i < 0) 10122 goto nothing; 10123 if (rkind > kind2) { 10124 /* widen replacement */ 10125 buf2 = _PyUnicode_AsKind(str2, rkind); 10126 if (!buf2) goto error; 10127 release2 = 1; 10128 } 10129 else if (rkind < kind2) { 10130 /* widen self and buf1 */ 10131 rkind = kind2; 10132 if (release1) PyMem_Free(buf1); 10133 sbuf = _PyUnicode_AsKind(self, rkind); 10134 if (!sbuf) goto error; 10135 srelease = 1; 10136 buf1 = _PyUnicode_AsKind(str1, rkind); 10137 if (!buf1) goto error; 10138 release1 = 1; 10139 } 10140 u = PyUnicode_New(slen, maxchar); 10141 if (!u) 10142 goto error; 10143 
assert(PyUnicode_KIND(u) == rkind); 10144 res = PyUnicode_DATA(u); 10145 10146 memcpy(res, sbuf, rkind * slen); 10147 /* change everything in-place, starting with this one */ 10148 memcpy(res + rkind * i, 10149 buf2, 10150 rkind * len2); 10151 i += len1; 10152 10153 while ( --maxcount > 0) { 10154 i = anylib_find(rkind, self, 10155 sbuf+rkind*i, slen-i, 10156 str1, buf1, len1, i); 10157 if (i == -1) 10158 break; 10159 memcpy(res + rkind * i, 10160 buf2, 10161 rkind * len2); 10162 i += len1; 10163 } 10164 } 10165 } 10166 else { 10167 Py_ssize_t n, i, j, ires; 10168 Py_ssize_t product, new_size; 10169 int rkind = skind; 10170 char *res; 10171 10172 if (kind1 < rkind) { 10173 /* widen substring */ 10174 buf1 = _PyUnicode_AsKind(str1, rkind); 10175 if (!buf1) goto error; 10176 release1 = 1; 10177 } 10178 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount); 10179 if (n == 0) 10180 goto nothing; 10181 if (kind2 < rkind) { 10182 /* widen replacement */ 10183 buf2 = _PyUnicode_AsKind(str2, rkind); 10184 if (!buf2) goto error; 10185 release2 = 1; 10186 } 10187 else if (kind2 > rkind) { 10188 /* widen self and buf1 */ 10189 rkind = kind2; 10190 sbuf = _PyUnicode_AsKind(self, rkind); 10191 if (!sbuf) goto error; 10192 srelease = 1; 10193 if (release1) PyMem_Free(buf1); 10194 buf1 = _PyUnicode_AsKind(str1, rkind); 10195 if (!buf1) goto error; 10196 release1 = 1; 10197 } 10198 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) - 10199 PyUnicode_GET_LENGTH(str1))); */ 10200 product = n * (len2-len1); 10201 if ((product / (len2-len1)) != n) { 10202 PyErr_SetString(PyExc_OverflowError, 10203 "replace string is too long"); 10204 goto error; 10205 } 10206 new_size = slen + product; 10207 if (new_size == 0) { 10208 Py_INCREF(unicode_empty); 10209 u = unicode_empty; 10210 goto done; 10211 } 10212 if (new_size < 0 || new_size > (PY_SSIZE_T_MAX >> (rkind-1))) { 10213 PyErr_SetString(PyExc_OverflowError, 10214 "replace string is too long"); 10215 
goto error; 10216 } 10217 u = PyUnicode_New(new_size, maxchar); 10218 if (!u) 10219 goto error; 10220 assert(PyUnicode_KIND(u) == rkind); 10221 res = PyUnicode_DATA(u); 10222 ires = i = 0; 10223 if (len1 > 0) { 10224 while (n-- > 0) { 10225 /* look for next match */ 10226 j = anylib_find(rkind, self, 10227 sbuf + rkind * i, slen-i, 10228 str1, buf1, len1, i); 10229 if (j == -1) 10230 break; 10231 else if (j > i) { 10232 /* copy unchanged part [i:j] */ 10233 memcpy(res + rkind * ires, 10234 sbuf + rkind * i, 10235 rkind * (j-i)); 10236 ires += j - i; 10237 } 10238 /* copy substitution string */ 10239 if (len2 > 0) { 10240 memcpy(res + rkind * ires, 10241 buf2, 10242 rkind * len2); 10243 ires += len2; 10244 } 10245 i = j + len1; 10246 } 10247 if (i < slen) 10248 /* copy tail [i:] */ 10249 memcpy(res + rkind * ires, 10250 sbuf + rkind * i, 10251 rkind * (slen-i)); 10252 } 10253 else { 10254 /* interleave */ 10255 while (n > 0) { 10256 memcpy(res + rkind * ires, 10257 buf2, 10258 rkind * len2); 10259 ires += len2; 10260 if (--n <= 0) 10261 break; 10262 memcpy(res + rkind * ires, 10263 sbuf + rkind * i, 10264 rkind); 10265 ires++; 10266 i++; 10267 } 10268 memcpy(res + rkind * ires, 10269 sbuf + rkind * i, 10270 rkind * (slen-i)); 10271 } 10272 } 10273 10274 if (mayshrink) { 10275 unicode_adjust_maxchar(&u); 10276 if (u == NULL) 10277 goto error; 10278 } 10279 10280 done: 10281 if (srelease) 10282 PyMem_FREE(sbuf); 10283 if (release1) 10284 PyMem_FREE(buf1); 10285 if (release2) 10286 PyMem_FREE(buf2); 10287 assert(_PyUnicode_CheckConsistency(u, 1)); 10288 return u; 10289 10290 nothing: 10291 /* nothing to replace; return original string (when possible) */ 10292 if (srelease) 10293 PyMem_FREE(sbuf); 10294 if (release1) 10295 PyMem_FREE(buf1); 10296 if (release2) 10297 PyMem_FREE(buf2); 10298 return unicode_result_unchanged(self); 10299 10300 error: 10301 if (srelease && sbuf) 10302 PyMem_FREE(sbuf); 10303 if (release1 && buf1) 10304 PyMem_FREE(buf1); 10305 if (release2 && 
buf2) 10306 PyMem_FREE(buf2); 10307 return NULL; 10308} 10309 10310/* --- Unicode Object Methods --------------------------------------------- */ 10311 10312PyDoc_STRVAR(title__doc__, 10313 "S.title() -> str\n\ 10314\n\ 10315Return a titlecased version of S, i.e. words start with title case\n\ 10316characters, all remaining cased characters have lower case."); 10317 10318static PyObject* 10319unicode_title(PyObject *self) 10320{ 10321 if (PyUnicode_READY(self) == -1) 10322 return NULL; 10323 return case_operation(self, do_title); 10324} 10325 10326PyDoc_STRVAR(capitalize__doc__, 10327 "S.capitalize() -> str\n\ 10328\n\ 10329Return a capitalized version of S, i.e. make the first character\n\ 10330have upper case and the rest lower case."); 10331 10332static PyObject* 10333unicode_capitalize(PyObject *self) 10334{ 10335 if (PyUnicode_READY(self) == -1) 10336 return NULL; 10337 if (PyUnicode_GET_LENGTH(self) == 0) 10338 return unicode_result_unchanged(self); 10339 return case_operation(self, do_capitalize); 10340} 10341 10342PyDoc_STRVAR(casefold__doc__, 10343 "S.casefold() -> str\n\ 10344\n\ 10345Return a version of S suitable for caseless comparisons."); 10346 10347static PyObject * 10348unicode_casefold(PyObject *self) 10349{ 10350 if (PyUnicode_READY(self) == -1) 10351 return NULL; 10352 if (PyUnicode_IS_ASCII(self)) 10353 return ascii_upper_or_lower(self, 1); 10354 return case_operation(self, do_casefold); 10355} 10356 10357 10358/* Argument converter. 
Coerces to a single unicode character */ 10359 10360static int 10361convert_uc(PyObject *obj, void *addr) 10362{ 10363 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr; 10364 PyObject *uniobj; 10365 10366 uniobj = PyUnicode_FromObject(obj); 10367 if (uniobj == NULL) { 10368 PyErr_SetString(PyExc_TypeError, 10369 "The fill character cannot be converted to Unicode"); 10370 return 0; 10371 } 10372 if (PyUnicode_GET_LENGTH(uniobj) != 1) { 10373 PyErr_SetString(PyExc_TypeError, 10374 "The fill character must be exactly one character long"); 10375 Py_DECREF(uniobj); 10376 return 0; 10377 } 10378 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0); 10379 Py_DECREF(uniobj); 10380 return 1; 10381} 10382 10383PyDoc_STRVAR(center__doc__, 10384 "S.center(width[, fillchar]) -> str\n\ 10385\n\ 10386Return S centered in a string of length width. Padding is\n\ 10387done using the specified fill character (default is a space)"); 10388 10389static PyObject * 10390unicode_center(PyObject *self, PyObject *args) 10391{ 10392 Py_ssize_t marg, left; 10393 Py_ssize_t width; 10394 Py_UCS4 fillchar = ' '; 10395 10396 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar)) 10397 return NULL; 10398 10399 if (PyUnicode_READY(self) == -1) 10400 return NULL; 10401 10402 if (PyUnicode_GET_LENGTH(self) >= width) 10403 return unicode_result_unchanged(self); 10404 10405 marg = width - PyUnicode_GET_LENGTH(self); 10406 left = marg / 2 + (marg & width & 1); 10407 10408 return pad(self, left, marg - left, fillchar); 10409} 10410 10411/* This function assumes that str1 and str2 are readied by the caller. 
*/ 10412 10413static int 10414unicode_compare(PyObject *str1, PyObject *str2) 10415{ 10416 int kind1, kind2; 10417 void *data1, *data2; 10418 Py_ssize_t len1, len2, i; 10419 10420 kind1 = PyUnicode_KIND(str1); 10421 kind2 = PyUnicode_KIND(str2); 10422 data1 = PyUnicode_DATA(str1); 10423 data2 = PyUnicode_DATA(str2); 10424 len1 = PyUnicode_GET_LENGTH(str1); 10425 len2 = PyUnicode_GET_LENGTH(str2); 10426 10427 for (i = 0; i < len1 && i < len2; ++i) { 10428 Py_UCS4 c1, c2; 10429 c1 = PyUnicode_READ(kind1, data1, i); 10430 c2 = PyUnicode_READ(kind2, data2, i); 10431 10432 if (c1 != c2) 10433 return (c1 < c2) ? -1 : 1; 10434 } 10435 10436 return (len1 < len2) ? -1 : (len1 != len2); 10437} 10438 10439int 10440PyUnicode_Compare(PyObject *left, PyObject *right) 10441{ 10442 if (PyUnicode_Check(left) && PyUnicode_Check(right)) { 10443 if (PyUnicode_READY(left) == -1 || 10444 PyUnicode_READY(right) == -1) 10445 return -1; 10446 return unicode_compare(left, right); 10447 } 10448 PyErr_Format(PyExc_TypeError, 10449 "Can't compare %.100s and %.100s", 10450 left->ob_type->tp_name, 10451 right->ob_type->tp_name); 10452 return -1; 10453} 10454 10455int 10456PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str) 10457{ 10458 Py_ssize_t i; 10459 int kind; 10460 void *data; 10461 Py_UCS4 chr; 10462 10463 assert(_PyUnicode_CHECK(uni)); 10464 if (PyUnicode_READY(uni) == -1) 10465 return -1; 10466 kind = PyUnicode_KIND(uni); 10467 data = PyUnicode_DATA(uni); 10468 /* Compare Unicode string and source character set string */ 10469 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++) 10470 if (chr != str[i]) 10471 return (chr < (unsigned char)(str[i])) ? -1 : 1; 10472 /* This check keeps Python strings that end in '\0' from comparing equal 10473 to C strings identical up to that point. 
*/ 10474 if (PyUnicode_GET_LENGTH(uni) != i || chr) 10475 return 1; /* uni is longer */ 10476 if (str[i]) 10477 return -1; /* str is longer */ 10478 return 0; 10479} 10480 10481 10482#define TEST_COND(cond) \ 10483 ((cond) ? Py_True : Py_False) 10484 10485PyObject * 10486PyUnicode_RichCompare(PyObject *left, PyObject *right, int op) 10487{ 10488 int result; 10489 10490 if (PyUnicode_Check(left) && PyUnicode_Check(right)) { 10491 PyObject *v; 10492 if (PyUnicode_READY(left) == -1 || 10493 PyUnicode_READY(right) == -1) 10494 return NULL; 10495 if (PyUnicode_GET_LENGTH(left) != PyUnicode_GET_LENGTH(right) || 10496 PyUnicode_KIND(left) != PyUnicode_KIND(right)) { 10497 if (op == Py_EQ) { 10498 Py_INCREF(Py_False); 10499 return Py_False; 10500 } 10501 if (op == Py_NE) { 10502 Py_INCREF(Py_True); 10503 return Py_True; 10504 } 10505 } 10506 if (left == right) 10507 result = 0; 10508 else 10509 result = unicode_compare(left, right); 10510 10511 /* Convert the return value to a Boolean */ 10512 switch (op) { 10513 case Py_EQ: 10514 v = TEST_COND(result == 0); 10515 break; 10516 case Py_NE: 10517 v = TEST_COND(result != 0); 10518 break; 10519 case Py_LE: 10520 v = TEST_COND(result <= 0); 10521 break; 10522 case Py_GE: 10523 v = TEST_COND(result >= 0); 10524 break; 10525 case Py_LT: 10526 v = TEST_COND(result == -1); 10527 break; 10528 case Py_GT: 10529 v = TEST_COND(result == 1); 10530 break; 10531 default: 10532 PyErr_BadArgument(); 10533 return NULL; 10534 } 10535 Py_INCREF(v); 10536 return v; 10537 } 10538 10539 Py_RETURN_NOTIMPLEMENTED; 10540} 10541 10542int 10543PyUnicode_Contains(PyObject *container, PyObject *element) 10544{ 10545 PyObject *str, *sub; 10546 int kind1, kind2, kind; 10547 void *buf1, *buf2; 10548 Py_ssize_t len1, len2; 10549 int result; 10550 10551 /* Coerce the two arguments */ 10552 sub = PyUnicode_FromObject(element); 10553 if (!sub) { 10554 PyErr_Format(PyExc_TypeError, 10555 "'in <string>' requires string as left operand, not %s", 10556 
element->ob_type->tp_name); 10557 return -1; 10558 } 10559 10560 str = PyUnicode_FromObject(container); 10561 if (!str) { 10562 Py_DECREF(sub); 10563 return -1; 10564 } 10565 if (PyUnicode_READY(sub) == -1 || PyUnicode_READY(str) == -1) { 10566 Py_DECREF(sub); 10567 Py_DECREF(str); 10568 } 10569 10570 kind1 = PyUnicode_KIND(str); 10571 kind2 = PyUnicode_KIND(sub); 10572 kind = kind1; 10573 buf1 = PyUnicode_DATA(str); 10574 buf2 = PyUnicode_DATA(sub); 10575 if (kind2 != kind) { 10576 if (kind2 > kind) { 10577 Py_DECREF(sub); 10578 Py_DECREF(str); 10579 return 0; 10580 } 10581 buf2 = _PyUnicode_AsKind(sub, kind); 10582 } 10583 if (!buf2) { 10584 Py_DECREF(sub); 10585 Py_DECREF(str); 10586 return -1; 10587 } 10588 len1 = PyUnicode_GET_LENGTH(str); 10589 len2 = PyUnicode_GET_LENGTH(sub); 10590 10591 switch (kind) { 10592 case PyUnicode_1BYTE_KIND: 10593 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1; 10594 break; 10595 case PyUnicode_2BYTE_KIND: 10596 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1; 10597 break; 10598 case PyUnicode_4BYTE_KIND: 10599 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1; 10600 break; 10601 default: 10602 result = -1; 10603 assert(0); 10604 } 10605 10606 Py_DECREF(str); 10607 Py_DECREF(sub); 10608 10609 if (kind2 != kind) 10610 PyMem_Free(buf2); 10611 10612 return result; 10613} 10614 10615/* Concat to string or Unicode object giving a new Unicode object. 
*/ 10616 10617PyObject * 10618PyUnicode_Concat(PyObject *left, PyObject *right) 10619{ 10620 PyObject *u = NULL, *v = NULL, *w; 10621 Py_UCS4 maxchar, maxchar2; 10622 Py_ssize_t u_len, v_len, new_len; 10623 10624 /* Coerce the two arguments */ 10625 u = PyUnicode_FromObject(left); 10626 if (u == NULL) 10627 goto onError; 10628 v = PyUnicode_FromObject(right); 10629 if (v == NULL) 10630 goto onError; 10631 10632 /* Shortcuts */ 10633 if (v == unicode_empty) { 10634 Py_DECREF(v); 10635 return u; 10636 } 10637 if (u == unicode_empty) { 10638 Py_DECREF(u); 10639 return v; 10640 } 10641 10642 u_len = PyUnicode_GET_LENGTH(u); 10643 v_len = PyUnicode_GET_LENGTH(v); 10644 if (u_len > PY_SSIZE_T_MAX - v_len) { 10645 PyErr_SetString(PyExc_OverflowError, 10646 "strings are too large to concat"); 10647 goto onError; 10648 } 10649 new_len = u_len + v_len; 10650 10651 maxchar = PyUnicode_MAX_CHAR_VALUE(u); 10652 maxchar2 = PyUnicode_MAX_CHAR_VALUE(v); 10653 maxchar = MAX_MAXCHAR(maxchar, maxchar2); 10654 10655 /* Concat the two Unicode strings */ 10656 w = PyUnicode_New(new_len, maxchar); 10657 if (w == NULL) 10658 goto onError; 10659 _PyUnicode_FastCopyCharacters(w, 0, u, 0, u_len); 10660 _PyUnicode_FastCopyCharacters(w, u_len, v, 0, v_len); 10661 Py_DECREF(u); 10662 Py_DECREF(v); 10663 assert(_PyUnicode_CheckConsistency(w, 1)); 10664 return w; 10665 10666 onError: 10667 Py_XDECREF(u); 10668 Py_XDECREF(v); 10669 return NULL; 10670} 10671 10672void 10673PyUnicode_Append(PyObject **p_left, PyObject *right) 10674{ 10675 PyObject *left, *res; 10676 Py_UCS4 maxchar, maxchar2; 10677 Py_ssize_t left_len, right_len, new_len; 10678 10679 if (p_left == NULL) { 10680 if (!PyErr_Occurred()) 10681 PyErr_BadInternalCall(); 10682 return; 10683 } 10684 left = *p_left; 10685 if (right == NULL || !PyUnicode_Check(left)) { 10686 if (!PyErr_Occurred()) 10687 PyErr_BadInternalCall(); 10688 goto error; 10689 } 10690 10691 if (PyUnicode_READY(left) == -1) 10692 goto error; 10693 if 
(PyUnicode_READY(right) == -1) 10694 goto error; 10695 10696 /* Shortcuts */ 10697 if (left == unicode_empty) { 10698 Py_DECREF(left); 10699 Py_INCREF(right); 10700 *p_left = right; 10701 return; 10702 } 10703 if (right == unicode_empty) 10704 return; 10705 10706 left_len = PyUnicode_GET_LENGTH(left); 10707 right_len = PyUnicode_GET_LENGTH(right); 10708 if (left_len > PY_SSIZE_T_MAX - right_len) { 10709 PyErr_SetString(PyExc_OverflowError, 10710 "strings are too large to concat"); 10711 goto error; 10712 } 10713 new_len = left_len + right_len; 10714 10715 if (unicode_modifiable(left) 10716 && PyUnicode_CheckExact(right) 10717 && PyUnicode_KIND(right) <= PyUnicode_KIND(left) 10718 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires 10719 to change the structure size, but characters are stored just after 10720 the structure, and so it requires to move all characters which is 10721 not so different than duplicating the string. */ 10722 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right))) 10723 { 10724 /* append inplace */ 10725 if (unicode_resize(p_left, new_len) != 0) { 10726 /* XXX if _PyUnicode_Resize() fails, 'left' has been 10727 * deallocated so it cannot be put back into 10728 * 'variable'. The MemoryError is raised when there 10729 * is no value in 'variable', which might (very 10730 * remotely) be a cause of incompatibilities. 
10731 */ 10732 goto error; 10733 } 10734 /* copy 'right' into the newly allocated area of 'left' */ 10735 _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len); 10736 } 10737 else { 10738 maxchar = PyUnicode_MAX_CHAR_VALUE(left); 10739 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right); 10740 maxchar = MAX_MAXCHAR(maxchar, maxchar2); 10741 10742 /* Concat the two Unicode strings */ 10743 res = PyUnicode_New(new_len, maxchar); 10744 if (res == NULL) 10745 goto error; 10746 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len); 10747 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len); 10748 Py_DECREF(left); 10749 *p_left = res; 10750 } 10751 assert(_PyUnicode_CheckConsistency(*p_left, 1)); 10752 return; 10753 10754error: 10755 Py_CLEAR(*p_left); 10756} 10757 10758void 10759PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right) 10760{ 10761 PyUnicode_Append(pleft, right); 10762 Py_XDECREF(right); 10763} 10764 10765PyDoc_STRVAR(count__doc__, 10766 "S.count(sub[, start[, end]]) -> int\n\ 10767\n\ 10768Return the number of non-overlapping occurrences of substring sub in\n\ 10769string S[start:end]. 
Optional arguments start and end are\n\ 10770interpreted as in slice notation."); 10771 10772static PyObject * 10773unicode_count(PyObject *self, PyObject *args) 10774{ 10775 PyObject *substring; 10776 Py_ssize_t start = 0; 10777 Py_ssize_t end = PY_SSIZE_T_MAX; 10778 PyObject *result; 10779 int kind1, kind2, kind; 10780 void *buf1, *buf2; 10781 Py_ssize_t len1, len2, iresult; 10782 10783 if (!stringlib_parse_args_finds_unicode("count", args, &substring, 10784 &start, &end)) 10785 return NULL; 10786 10787 kind1 = PyUnicode_KIND(self); 10788 kind2 = PyUnicode_KIND(substring); 10789 if (kind2 > kind1) 10790 return PyLong_FromLong(0); 10791 kind = kind1; 10792 buf1 = PyUnicode_DATA(self); 10793 buf2 = PyUnicode_DATA(substring); 10794 if (kind2 != kind) 10795 buf2 = _PyUnicode_AsKind(substring, kind); 10796 if (!buf2) { 10797 Py_DECREF(substring); 10798 return NULL; 10799 } 10800 len1 = PyUnicode_GET_LENGTH(self); 10801 len2 = PyUnicode_GET_LENGTH(substring); 10802 10803 ADJUST_INDICES(start, end, len1); 10804 switch (kind) { 10805 case PyUnicode_1BYTE_KIND: 10806 iresult = ucs1lib_count( 10807 ((Py_UCS1*)buf1) + start, end - start, 10808 buf2, len2, PY_SSIZE_T_MAX 10809 ); 10810 break; 10811 case PyUnicode_2BYTE_KIND: 10812 iresult = ucs2lib_count( 10813 ((Py_UCS2*)buf1) + start, end - start, 10814 buf2, len2, PY_SSIZE_T_MAX 10815 ); 10816 break; 10817 case PyUnicode_4BYTE_KIND: 10818 iresult = ucs4lib_count( 10819 ((Py_UCS4*)buf1) + start, end - start, 10820 buf2, len2, PY_SSIZE_T_MAX 10821 ); 10822 break; 10823 default: 10824 assert(0); iresult = 0; 10825 } 10826 10827 result = PyLong_FromSsize_t(iresult); 10828 10829 if (kind2 != kind) 10830 PyMem_Free(buf2); 10831 10832 Py_DECREF(substring); 10833 10834 return result; 10835} 10836 10837PyDoc_STRVAR(encode__doc__, 10838 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\ 10839\n\ 10840Encode S using the codec registered for encoding. Default encoding\n\ 10841is 'utf-8'. 
errors may be given to set a different error\n\ 10842handling scheme. Default is 'strict' meaning that encoding errors raise\n\ 10843a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\ 10844'xmlcharrefreplace' as well as any other name registered with\n\ 10845codecs.register_error that can handle UnicodeEncodeErrors."); 10846 10847static PyObject * 10848unicode_encode(PyObject *self, PyObject *args, PyObject *kwargs) 10849{ 10850 static char *kwlist[] = {"encoding", "errors", 0}; 10851 char *encoding = NULL; 10852 char *errors = NULL; 10853 10854 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode", 10855 kwlist, &encoding, &errors)) 10856 return NULL; 10857 return PyUnicode_AsEncodedString(self, encoding, errors); 10858} 10859 10860PyDoc_STRVAR(expandtabs__doc__, 10861 "S.expandtabs([tabsize]) -> str\n\ 10862\n\ 10863Return a copy of S where all tab characters are expanded using spaces.\n\ 10864If tabsize is not given, a tab size of 8 characters is assumed."); 10865 10866static PyObject* 10867unicode_expandtabs(PyObject *self, PyObject *args) 10868{ 10869 Py_ssize_t i, j, line_pos, src_len, incr; 10870 Py_UCS4 ch; 10871 PyObject *u; 10872 void *src_data, *dest_data; 10873 int tabsize = 8; 10874 int kind; 10875 int found; 10876 10877 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize)) 10878 return NULL; 10879 10880 if (PyUnicode_READY(self) == -1) 10881 return NULL; 10882 10883 /* First pass: determine size of output string */ 10884 src_len = PyUnicode_GET_LENGTH(self); 10885 i = j = line_pos = 0; 10886 kind = PyUnicode_KIND(self); 10887 src_data = PyUnicode_DATA(self); 10888 found = 0; 10889 for (; i < src_len; i++) { 10890 ch = PyUnicode_READ(kind, src_data, i); 10891 if (ch == '\t') { 10892 found = 1; 10893 if (tabsize > 0) { 10894 incr = tabsize - (line_pos % tabsize); /* cannot overflow */ 10895 if (j > PY_SSIZE_T_MAX - incr) 10896 goto overflow; 10897 line_pos += incr; 10898 j += incr; 10899 } 10900 } 10901 else { 10902 if (j 
> PY_SSIZE_T_MAX - 1) 10903 goto overflow; 10904 line_pos++; 10905 j++; 10906 if (ch == '\n' || ch == '\r') 10907 line_pos = 0; 10908 } 10909 } 10910 if (!found) 10911 return unicode_result_unchanged(self); 10912 10913 /* Second pass: create output string and fill it */ 10914 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self)); 10915 if (!u) 10916 return NULL; 10917 dest_data = PyUnicode_DATA(u); 10918 10919 i = j = line_pos = 0; 10920 10921 for (; i < src_len; i++) { 10922 ch = PyUnicode_READ(kind, src_data, i); 10923 if (ch == '\t') { 10924 if (tabsize > 0) { 10925 incr = tabsize - (line_pos % tabsize); 10926 line_pos += incr; 10927 FILL(kind, dest_data, ' ', j, incr); 10928 j += incr; 10929 } 10930 } 10931 else { 10932 line_pos++; 10933 PyUnicode_WRITE(kind, dest_data, j, ch); 10934 j++; 10935 if (ch == '\n' || ch == '\r') 10936 line_pos = 0; 10937 } 10938 } 10939 assert (j == PyUnicode_GET_LENGTH(u)); 10940 return unicode_result(u); 10941 10942 overflow: 10943 PyErr_SetString(PyExc_OverflowError, "new string is too long"); 10944 return NULL; 10945} 10946 10947PyDoc_STRVAR(find__doc__, 10948 "S.find(sub[, start[, end]]) -> int\n\ 10949\n\ 10950Return the lowest index in S where substring sub is found,\n\ 10951such that sub is contained within S[start:end]. 
Optional\n\ 10952arguments start and end are interpreted as in slice notation.\n\ 10953\n\ 10954Return -1 on failure."); 10955 10956static PyObject * 10957unicode_find(PyObject *self, PyObject *args) 10958{ 10959 PyObject *substring; 10960 Py_ssize_t start; 10961 Py_ssize_t end; 10962 Py_ssize_t result; 10963 10964 if (!stringlib_parse_args_finds_unicode("find", args, &substring, 10965 &start, &end)) 10966 return NULL; 10967 10968 if (PyUnicode_READY(self) == -1) 10969 return NULL; 10970 if (PyUnicode_READY(substring) == -1) 10971 return NULL; 10972 10973 result = any_find_slice(1, self, substring, start, end); 10974 10975 Py_DECREF(substring); 10976 10977 if (result == -2) 10978 return NULL; 10979 10980 return PyLong_FromSsize_t(result); 10981} 10982 10983static PyObject * 10984unicode_getitem(PyObject *self, Py_ssize_t index) 10985{ 10986 void *data; 10987 enum PyUnicode_Kind kind; 10988 Py_UCS4 ch; 10989 PyObject *res; 10990 10991 if (!PyUnicode_Check(self) || PyUnicode_READY(self) == -1) { 10992 PyErr_BadArgument(); 10993 return NULL; 10994 } 10995 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) { 10996 PyErr_SetString(PyExc_IndexError, "string index out of range"); 10997 return NULL; 10998 } 10999 kind = PyUnicode_KIND(self); 11000 data = PyUnicode_DATA(self); 11001 ch = PyUnicode_READ(kind, data, index); 11002 if (ch < 256) 11003 return get_latin1_char(ch); 11004 11005 res = PyUnicode_New(1, ch); 11006 if (res == NULL) 11007 return NULL; 11008 kind = PyUnicode_KIND(res); 11009 data = PyUnicode_DATA(res); 11010 PyUnicode_WRITE(kind, data, 0, ch); 11011 assert(_PyUnicode_CheckConsistency(res, 1)); 11012 return res; 11013} 11014 11015/* Believe it or not, this produces the same value for ASCII strings 11016 as bytes_hash(). 
*/ 11017static Py_hash_t 11018unicode_hash(PyObject *self) 11019{ 11020 Py_ssize_t len; 11021 Py_uhash_t x; 11022 11023#ifdef Py_DEBUG 11024 assert(_Py_HashSecret_Initialized); 11025#endif 11026 if (_PyUnicode_HASH(self) != -1) 11027 return _PyUnicode_HASH(self); 11028 if (PyUnicode_READY(self) == -1) 11029 return -1; 11030 len = PyUnicode_GET_LENGTH(self); 11031 /* 11032 We make the hash of the empty string be 0, rather than using 11033 (prefix ^ suffix), since this slightly obfuscates the hash secret 11034 */ 11035 if (len == 0) { 11036 _PyUnicode_HASH(self) = 0; 11037 return 0; 11038 } 11039 11040 /* The hash function as a macro, gets expanded three times below. */ 11041#define HASH(P) \ 11042 x ^= (Py_uhash_t) *P << 7; \ 11043 while (--len >= 0) \ 11044 x = (_PyHASH_MULTIPLIER * x) ^ (Py_uhash_t) *P++; \ 11045 11046 x = (Py_uhash_t) _Py_HashSecret.prefix; 11047 switch (PyUnicode_KIND(self)) { 11048 case PyUnicode_1BYTE_KIND: { 11049 const unsigned char *c = PyUnicode_1BYTE_DATA(self); 11050 HASH(c); 11051 break; 11052 } 11053 case PyUnicode_2BYTE_KIND: { 11054 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self); 11055 HASH(s); 11056 break; 11057 } 11058 default: { 11059 Py_UCS4 *l; 11060 assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND && 11061 "Impossible switch case in unicode_hash"); 11062 l = PyUnicode_4BYTE_DATA(self); 11063 HASH(l); 11064 break; 11065 } 11066 } 11067 x ^= (Py_uhash_t) PyUnicode_GET_LENGTH(self); 11068 x ^= (Py_uhash_t) _Py_HashSecret.suffix; 11069 11070 if (x == -1) 11071 x = -2; 11072 _PyUnicode_HASH(self) = x; 11073 return x; 11074} 11075#undef HASH 11076 11077PyDoc_STRVAR(index__doc__, 11078 "S.index(sub[, start[, end]]) -> int\n\ 11079\n\ 11080Like S.find() but raise ValueError when the substring is not found."); 11081 11082static PyObject * 11083unicode_index(PyObject *self, PyObject *args) 11084{ 11085 Py_ssize_t result; 11086 PyObject *substring; 11087 Py_ssize_t start; 11088 Py_ssize_t end; 11089 11090 if 
(!stringlib_parse_args_finds_unicode("index", args, &substring, 11091 &start, &end)) 11092 return NULL; 11093 11094 if (PyUnicode_READY(self) == -1) 11095 return NULL; 11096 if (PyUnicode_READY(substring) == -1) 11097 return NULL; 11098 11099 result = any_find_slice(1, self, substring, start, end); 11100 11101 Py_DECREF(substring); 11102 11103 if (result == -2) 11104 return NULL; 11105 11106 if (result < 0) { 11107 PyErr_SetString(PyExc_ValueError, "substring not found"); 11108 return NULL; 11109 } 11110 11111 return PyLong_FromSsize_t(result); 11112} 11113 11114PyDoc_STRVAR(islower__doc__, 11115 "S.islower() -> bool\n\ 11116\n\ 11117Return True if all cased characters in S are lowercase and there is\n\ 11118at least one cased character in S, False otherwise."); 11119 11120static PyObject* 11121unicode_islower(PyObject *self) 11122{ 11123 Py_ssize_t i, length; 11124 int kind; 11125 void *data; 11126 int cased; 11127 11128 if (PyUnicode_READY(self) == -1) 11129 return NULL; 11130 length = PyUnicode_GET_LENGTH(self); 11131 kind = PyUnicode_KIND(self); 11132 data = PyUnicode_DATA(self); 11133 11134 /* Shortcut for single character strings */ 11135 if (length == 1) 11136 return PyBool_FromLong( 11137 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0))); 11138 11139 /* Special case for empty strings */ 11140 if (length == 0) 11141 return PyBool_FromLong(0); 11142 11143 cased = 0; 11144 for (i = 0; i < length; i++) { 11145 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); 11146 11147 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) 11148 return PyBool_FromLong(0); 11149 else if (!cased && Py_UNICODE_ISLOWER(ch)) 11150 cased = 1; 11151 } 11152 return PyBool_FromLong(cased); 11153} 11154 11155PyDoc_STRVAR(isupper__doc__, 11156 "S.isupper() -> bool\n\ 11157\n\ 11158Return True if all cased characters in S are uppercase and there is\n\ 11159at least one cased character in S, False otherwise."); 11160 11161static PyObject* 11162unicode_isupper(PyObject *self) 11163{ 11164 
Py_ssize_t i, length; 11165 int kind; 11166 void *data; 11167 int cased; 11168 11169 if (PyUnicode_READY(self) == -1) 11170 return NULL; 11171 length = PyUnicode_GET_LENGTH(self); 11172 kind = PyUnicode_KIND(self); 11173 data = PyUnicode_DATA(self); 11174 11175 /* Shortcut for single character strings */ 11176 if (length == 1) 11177 return PyBool_FromLong( 11178 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0); 11179 11180 /* Special case for empty strings */ 11181 if (length == 0) 11182 return PyBool_FromLong(0); 11183 11184 cased = 0; 11185 for (i = 0; i < length; i++) { 11186 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); 11187 11188 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch)) 11189 return PyBool_FromLong(0); 11190 else if (!cased && Py_UNICODE_ISUPPER(ch)) 11191 cased = 1; 11192 } 11193 return PyBool_FromLong(cased); 11194} 11195 11196PyDoc_STRVAR(istitle__doc__, 11197 "S.istitle() -> bool\n\ 11198\n\ 11199Return True if S is a titlecased string and there is at least one\n\ 11200character in S, i.e. 
upper- and titlecase characters may only\n\ 11201follow uncased characters and lowercase characters only cased ones.\n\ 11202Return False otherwise."); 11203 11204static PyObject* 11205unicode_istitle(PyObject *self) 11206{ 11207 Py_ssize_t i, length; 11208 int kind; 11209 void *data; 11210 int cased, previous_is_cased; 11211 11212 if (PyUnicode_READY(self) == -1) 11213 return NULL; 11214 length = PyUnicode_GET_LENGTH(self); 11215 kind = PyUnicode_KIND(self); 11216 data = PyUnicode_DATA(self); 11217 11218 /* Shortcut for single character strings */ 11219 if (length == 1) { 11220 Py_UCS4 ch = PyUnicode_READ(kind, data, 0); 11221 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) || 11222 (Py_UNICODE_ISUPPER(ch) != 0)); 11223 } 11224 11225 /* Special case for empty strings */ 11226 if (length == 0) 11227 return PyBool_FromLong(0); 11228 11229 cased = 0; 11230 previous_is_cased = 0; 11231 for (i = 0; i < length; i++) { 11232 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); 11233 11234 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) { 11235 if (previous_is_cased) 11236 return PyBool_FromLong(0); 11237 previous_is_cased = 1; 11238 cased = 1; 11239 } 11240 else if (Py_UNICODE_ISLOWER(ch)) { 11241 if (!previous_is_cased) 11242 return PyBool_FromLong(0); 11243 previous_is_cased = 1; 11244 cased = 1; 11245 } 11246 else 11247 previous_is_cased = 0; 11248 } 11249 return PyBool_FromLong(cased); 11250} 11251 11252PyDoc_STRVAR(isspace__doc__, 11253 "S.isspace() -> bool\n\ 11254\n\ 11255Return True if all characters in S are whitespace\n\ 11256and there is at least one character in S, False otherwise."); 11257 11258static PyObject* 11259unicode_isspace(PyObject *self) 11260{ 11261 Py_ssize_t i, length; 11262 int kind; 11263 void *data; 11264 11265 if (PyUnicode_READY(self) == -1) 11266 return NULL; 11267 length = PyUnicode_GET_LENGTH(self); 11268 kind = PyUnicode_KIND(self); 11269 data = PyUnicode_DATA(self); 11270 11271 /* Shortcut for single character strings */ 11272 
if (length == 1) 11273 return PyBool_FromLong( 11274 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0))); 11275 11276 /* Special case for empty strings */ 11277 if (length == 0) 11278 return PyBool_FromLong(0); 11279 11280 for (i = 0; i < length; i++) { 11281 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); 11282 if (!Py_UNICODE_ISSPACE(ch)) 11283 return PyBool_FromLong(0); 11284 } 11285 return PyBool_FromLong(1); 11286} 11287 11288PyDoc_STRVAR(isalpha__doc__, 11289 "S.isalpha() -> bool\n\ 11290\n\ 11291Return True if all characters in S are alphabetic\n\ 11292and there is at least one character in S, False otherwise."); 11293 11294static PyObject* 11295unicode_isalpha(PyObject *self) 11296{ 11297 Py_ssize_t i, length; 11298 int kind; 11299 void *data; 11300 11301 if (PyUnicode_READY(self) == -1) 11302 return NULL; 11303 length = PyUnicode_GET_LENGTH(self); 11304 kind = PyUnicode_KIND(self); 11305 data = PyUnicode_DATA(self); 11306 11307 /* Shortcut for single character strings */ 11308 if (length == 1) 11309 return PyBool_FromLong( 11310 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0))); 11311 11312 /* Special case for empty strings */ 11313 if (length == 0) 11314 return PyBool_FromLong(0); 11315 11316 for (i = 0; i < length; i++) { 11317 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i))) 11318 return PyBool_FromLong(0); 11319 } 11320 return PyBool_FromLong(1); 11321} 11322 11323PyDoc_STRVAR(isalnum__doc__, 11324 "S.isalnum() -> bool\n\ 11325\n\ 11326Return True if all characters in S are alphanumeric\n\ 11327and there is at least one character in S, False otherwise."); 11328 11329static PyObject* 11330unicode_isalnum(PyObject *self) 11331{ 11332 int kind; 11333 void *data; 11334 Py_ssize_t len, i; 11335 11336 if (PyUnicode_READY(self) == -1) 11337 return NULL; 11338 11339 kind = PyUnicode_KIND(self); 11340 data = PyUnicode_DATA(self); 11341 len = PyUnicode_GET_LENGTH(self); 11342 11343 /* Shortcut for single character strings */ 11344 if (len == 1) { 11345 
const Py_UCS4 ch = PyUnicode_READ(kind, data, 0); 11346 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch)); 11347 } 11348 11349 /* Special case for empty strings */ 11350 if (len == 0) 11351 return PyBool_FromLong(0); 11352 11353 for (i = 0; i < len; i++) { 11354 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); 11355 if (!Py_UNICODE_ISALNUM(ch)) 11356 return PyBool_FromLong(0); 11357 } 11358 return PyBool_FromLong(1); 11359} 11360 11361PyDoc_STRVAR(isdecimal__doc__, 11362 "S.isdecimal() -> bool\n\ 11363\n\ 11364Return True if there are only decimal characters in S,\n\ 11365False otherwise."); 11366 11367static PyObject* 11368unicode_isdecimal(PyObject *self) 11369{ 11370 Py_ssize_t i, length; 11371 int kind; 11372 void *data; 11373 11374 if (PyUnicode_READY(self) == -1) 11375 return NULL; 11376 length = PyUnicode_GET_LENGTH(self); 11377 kind = PyUnicode_KIND(self); 11378 data = PyUnicode_DATA(self); 11379 11380 /* Shortcut for single character strings */ 11381 if (length == 1) 11382 return PyBool_FromLong( 11383 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0))); 11384 11385 /* Special case for empty strings */ 11386 if (length == 0) 11387 return PyBool_FromLong(0); 11388 11389 for (i = 0; i < length; i++) { 11390 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i))) 11391 return PyBool_FromLong(0); 11392 } 11393 return PyBool_FromLong(1); 11394} 11395 11396PyDoc_STRVAR(isdigit__doc__, 11397 "S.isdigit() -> bool\n\ 11398\n\ 11399Return True if all characters in S are digits\n\ 11400and there is at least one character in S, False otherwise."); 11401 11402static PyObject* 11403unicode_isdigit(PyObject *self) 11404{ 11405 Py_ssize_t i, length; 11406 int kind; 11407 void *data; 11408 11409 if (PyUnicode_READY(self) == -1) 11410 return NULL; 11411 length = PyUnicode_GET_LENGTH(self); 11412 kind = PyUnicode_KIND(self); 11413 data = PyUnicode_DATA(self); 11414 11415 /* Shortcut for single character strings */ 11416 if (length == 1) { 11417 const Py_UCS4 ch = 
PyUnicode_READ(kind, data, 0); 11418 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch)); 11419 } 11420 11421 /* Special case for empty strings */ 11422 if (length == 0) 11423 return PyBool_FromLong(0); 11424 11425 for (i = 0; i < length; i++) { 11426 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i))) 11427 return PyBool_FromLong(0); 11428 } 11429 return PyBool_FromLong(1); 11430} 11431 11432PyDoc_STRVAR(isnumeric__doc__, 11433 "S.isnumeric() -> bool\n\ 11434\n\ 11435Return True if there are only numeric characters in S,\n\ 11436False otherwise."); 11437 11438static PyObject* 11439unicode_isnumeric(PyObject *self) 11440{ 11441 Py_ssize_t i, length; 11442 int kind; 11443 void *data; 11444 11445 if (PyUnicode_READY(self) == -1) 11446 return NULL; 11447 length = PyUnicode_GET_LENGTH(self); 11448 kind = PyUnicode_KIND(self); 11449 data = PyUnicode_DATA(self); 11450 11451 /* Shortcut for single character strings */ 11452 if (length == 1) 11453 return PyBool_FromLong( 11454 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0))); 11455 11456 /* Special case for empty strings */ 11457 if (length == 0) 11458 return PyBool_FromLong(0); 11459 11460 for (i = 0; i < length; i++) { 11461 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i))) 11462 return PyBool_FromLong(0); 11463 } 11464 return PyBool_FromLong(1); 11465} 11466 11467int 11468PyUnicode_IsIdentifier(PyObject *self) 11469{ 11470 int kind; 11471 void *data; 11472 Py_ssize_t i; 11473 Py_UCS4 first; 11474 11475 if (PyUnicode_READY(self) == -1) { 11476 Py_FatalError("identifier not ready"); 11477 return 0; 11478 } 11479 11480 /* Special case for empty strings */ 11481 if (PyUnicode_GET_LENGTH(self) == 0) 11482 return 0; 11483 kind = PyUnicode_KIND(self); 11484 data = PyUnicode_DATA(self); 11485 11486 /* PEP 3131 says that the first character must be in 11487 XID_Start and subsequent characters in XID_Continue, 11488 and for the ASCII range, the 2.x rules apply (i.e 11489 start with letters and underscore, continue with 
11490 letters, digits, underscore). However, given the current 11491 definition of XID_Start and XID_Continue, it is sufficient 11492 to check just for these, except that _ must be allowed 11493 as starting an identifier. */ 11494 first = PyUnicode_READ(kind, data, 0); 11495 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */) 11496 return 0; 11497 11498 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++) 11499 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i))) 11500 return 0; 11501 return 1; 11502} 11503 11504PyDoc_STRVAR(isidentifier__doc__, 11505 "S.isidentifier() -> bool\n\ 11506\n\ 11507Return True if S is a valid identifier according\n\ 11508to the language definition."); 11509 11510static PyObject* 11511unicode_isidentifier(PyObject *self) 11512{ 11513 return PyBool_FromLong(PyUnicode_IsIdentifier(self)); 11514} 11515 11516PyDoc_STRVAR(isprintable__doc__, 11517 "S.isprintable() -> bool\n\ 11518\n\ 11519Return True if all characters in S are considered\n\ 11520printable in repr() or S is empty, False otherwise."); 11521 11522static PyObject* 11523unicode_isprintable(PyObject *self) 11524{ 11525 Py_ssize_t i, length; 11526 int kind; 11527 void *data; 11528 11529 if (PyUnicode_READY(self) == -1) 11530 return NULL; 11531 length = PyUnicode_GET_LENGTH(self); 11532 kind = PyUnicode_KIND(self); 11533 data = PyUnicode_DATA(self); 11534 11535 /* Shortcut for single character strings */ 11536 if (length == 1) 11537 return PyBool_FromLong( 11538 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0))); 11539 11540 for (i = 0; i < length; i++) { 11541 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) { 11542 Py_RETURN_FALSE; 11543 } 11544 } 11545 Py_RETURN_TRUE; 11546} 11547 11548PyDoc_STRVAR(join__doc__, 11549 "S.join(iterable) -> str\n\ 11550\n\ 11551Return a string which is the concatenation of the strings in the\n\ 11552iterable. 
The separator between elements is S."); 11553 11554static PyObject* 11555unicode_join(PyObject *self, PyObject *data) 11556{ 11557 return PyUnicode_Join(self, data); 11558} 11559 11560static Py_ssize_t 11561unicode_length(PyObject *self) 11562{ 11563 if (PyUnicode_READY(self) == -1) 11564 return -1; 11565 return PyUnicode_GET_LENGTH(self); 11566} 11567 11568PyDoc_STRVAR(ljust__doc__, 11569 "S.ljust(width[, fillchar]) -> str\n\ 11570\n\ 11571Return S left-justified in a Unicode string of length width. Padding is\n\ 11572done using the specified fill character (default is a space)."); 11573 11574static PyObject * 11575unicode_ljust(PyObject *self, PyObject *args) 11576{ 11577 Py_ssize_t width; 11578 Py_UCS4 fillchar = ' '; 11579 11580 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar)) 11581 return NULL; 11582 11583 if (PyUnicode_READY(self) == -1) 11584 return NULL; 11585 11586 if (PyUnicode_GET_LENGTH(self) >= width) 11587 return unicode_result_unchanged(self); 11588 11589 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar); 11590} 11591 11592PyDoc_STRVAR(lower__doc__, 11593 "S.lower() -> str\n\ 11594\n\ 11595Return a copy of the string S converted to lowercase."); 11596 11597static PyObject* 11598unicode_lower(PyObject *self) 11599{ 11600 if (PyUnicode_READY(self) == -1) 11601 return NULL; 11602 if (PyUnicode_IS_ASCII(self)) 11603 return ascii_upper_or_lower(self, 1); 11604 return case_operation(self, do_lower); 11605} 11606 11607#define LEFTSTRIP 0 11608#define RIGHTSTRIP 1 11609#define BOTHSTRIP 2 11610 11611/* Arrays indexed by above */ 11612static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"}; 11613 11614#define STRIPNAME(i) (stripformat[i]+3) 11615 11616/* externally visible for str.strip(unicode) */ 11617PyObject * 11618_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj) 11619{ 11620 void *data; 11621 int kind; 11622 Py_ssize_t i, j, len; 11623 BLOOM_MASK sepmask; 11624 11625 if 
(PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1) 11626 return NULL; 11627 11628 kind = PyUnicode_KIND(self); 11629 data = PyUnicode_DATA(self); 11630 len = PyUnicode_GET_LENGTH(self); 11631 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj), 11632 PyUnicode_DATA(sepobj), 11633 PyUnicode_GET_LENGTH(sepobj)); 11634 11635 i = 0; 11636 if (striptype != RIGHTSTRIP) { 11637 while (i < len && 11638 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) { 11639 i++; 11640 } 11641 } 11642 11643 j = len; 11644 if (striptype != LEFTSTRIP) { 11645 do { 11646 j--; 11647 } while (j >= i && 11648 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj)); 11649 j++; 11650 } 11651 11652 return PyUnicode_Substring(self, i, j); 11653} 11654 11655PyObject* 11656PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end) 11657{ 11658 unsigned char *data; 11659 int kind; 11660 Py_ssize_t length; 11661 11662 if (PyUnicode_READY(self) == -1) 11663 return NULL; 11664 11665 length = PyUnicode_GET_LENGTH(self); 11666 end = Py_MIN(end, length); 11667 11668 if (start == 0 && end == length) 11669 return unicode_result_unchanged(self); 11670 11671 if (start < 0 || end < 0) { 11672 PyErr_SetString(PyExc_IndexError, "string index out of range"); 11673 return NULL; 11674 } 11675 if (start >= length || end < start) { 11676 Py_INCREF(unicode_empty); 11677 return unicode_empty; 11678 } 11679 11680 length = end - start; 11681 if (PyUnicode_IS_ASCII(self)) { 11682 data = PyUnicode_1BYTE_DATA(self); 11683 return _PyUnicode_FromASCII((char*)(data + start), length); 11684 } 11685 else { 11686 kind = PyUnicode_KIND(self); 11687 data = PyUnicode_1BYTE_DATA(self); 11688 return PyUnicode_FromKindAndData(kind, 11689 data + kind * start, 11690 length); 11691 } 11692} 11693 11694static PyObject * 11695do_strip(PyObject *self, int striptype) 11696{ 11697 int kind; 11698 void *data; 11699 Py_ssize_t len, i, j; 11700 11701 if (PyUnicode_READY(self) == -1) 11702 return NULL; 11703 
11704 kind = PyUnicode_KIND(self); 11705 data = PyUnicode_DATA(self); 11706 len = PyUnicode_GET_LENGTH(self); 11707 11708 i = 0; 11709 if (striptype != RIGHTSTRIP) { 11710 while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) { 11711 i++; 11712 } 11713 } 11714 11715 j = len; 11716 if (striptype != LEFTSTRIP) { 11717 do { 11718 j--; 11719 } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j))); 11720 j++; 11721 } 11722 11723 return PyUnicode_Substring(self, i, j); 11724} 11725 11726 11727static PyObject * 11728do_argstrip(PyObject *self, int striptype, PyObject *args) 11729{ 11730 PyObject *sep = NULL; 11731 11732 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep)) 11733 return NULL; 11734 11735 if (sep != NULL && sep != Py_None) { 11736 if (PyUnicode_Check(sep)) 11737 return _PyUnicode_XStrip(self, striptype, sep); 11738 else { 11739 PyErr_Format(PyExc_TypeError, 11740 "%s arg must be None or str", 11741 STRIPNAME(striptype)); 11742 return NULL; 11743 } 11744 } 11745 11746 return do_strip(self, striptype); 11747} 11748 11749 11750PyDoc_STRVAR(strip__doc__, 11751 "S.strip([chars]) -> str\n\ 11752\n\ 11753Return a copy of the string S with leading and trailing\n\ 11754whitespace removed.\n\ 11755If chars is given and not None, remove characters in chars instead."); 11756 11757static PyObject * 11758unicode_strip(PyObject *self, PyObject *args) 11759{ 11760 if (PyTuple_GET_SIZE(args) == 0) 11761 return do_strip(self, BOTHSTRIP); /* Common case */ 11762 else 11763 return do_argstrip(self, BOTHSTRIP, args); 11764} 11765 11766 11767PyDoc_STRVAR(lstrip__doc__, 11768 "S.lstrip([chars]) -> str\n\ 11769\n\ 11770Return a copy of the string S with leading whitespace removed.\n\ 11771If chars is given and not None, remove characters in chars instead."); 11772 11773static PyObject * 11774unicode_lstrip(PyObject *self, PyObject *args) 11775{ 11776 if (PyTuple_GET_SIZE(args) == 0) 11777 return do_strip(self, LEFTSTRIP); /* Common case */ 
11778 else 11779 return do_argstrip(self, LEFTSTRIP, args); 11780} 11781 11782 11783PyDoc_STRVAR(rstrip__doc__, 11784 "S.rstrip([chars]) -> str\n\ 11785\n\ 11786Return a copy of the string S with trailing whitespace removed.\n\ 11787If chars is given and not None, remove characters in chars instead."); 11788 11789static PyObject * 11790unicode_rstrip(PyObject *self, PyObject *args) 11791{ 11792 if (PyTuple_GET_SIZE(args) == 0) 11793 return do_strip(self, RIGHTSTRIP); /* Common case */ 11794 else 11795 return do_argstrip(self, RIGHTSTRIP, args); 11796} 11797 11798 11799static PyObject* 11800unicode_repeat(PyObject *str, Py_ssize_t len) 11801{ 11802 PyObject *u; 11803 Py_ssize_t nchars, n; 11804 11805 if (len < 1) { 11806 Py_INCREF(unicode_empty); 11807 return unicode_empty; 11808 } 11809 11810 /* no repeat, return original string */ 11811 if (len == 1) 11812 return unicode_result_unchanged(str); 11813 11814 if (PyUnicode_READY(str) == -1) 11815 return NULL; 11816 11817 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) { 11818 PyErr_SetString(PyExc_OverflowError, 11819 "repeated string is too long"); 11820 return NULL; 11821 } 11822 nchars = len * PyUnicode_GET_LENGTH(str); 11823 11824 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str)); 11825 if (!u) 11826 return NULL; 11827 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str)); 11828 11829 if (PyUnicode_GET_LENGTH(str) == 1) { 11830 const int kind = PyUnicode_KIND(str); 11831 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0); 11832 if (kind == PyUnicode_1BYTE_KIND) { 11833 void *to = PyUnicode_DATA(u); 11834 memset(to, (unsigned char)fill_char, len); 11835 } 11836 else if (kind == PyUnicode_2BYTE_KIND) { 11837 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u); 11838 for (n = 0; n < len; ++n) 11839 ucs2[n] = fill_char; 11840 } else { 11841 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u); 11842 assert(kind == PyUnicode_4BYTE_KIND); 11843 for (n = 0; n < len; ++n) 11844 ucs4[n] = fill_char; 11845 } 11846 } 
11847 else { 11848 /* number of characters copied this far */ 11849 Py_ssize_t done = PyUnicode_GET_LENGTH(str); 11850 const Py_ssize_t char_size = PyUnicode_KIND(str); 11851 char *to = (char *) PyUnicode_DATA(u); 11852 Py_MEMCPY(to, PyUnicode_DATA(str), 11853 PyUnicode_GET_LENGTH(str) * char_size); 11854 while (done < nchars) { 11855 n = (done <= nchars-done) ? done : nchars-done; 11856 Py_MEMCPY(to + (done * char_size), to, n * char_size); 11857 done += n; 11858 } 11859 } 11860 11861 assert(_PyUnicode_CheckConsistency(u, 1)); 11862 return u; 11863} 11864 11865PyObject * 11866PyUnicode_Replace(PyObject *obj, 11867 PyObject *subobj, 11868 PyObject *replobj, 11869 Py_ssize_t maxcount) 11870{ 11871 PyObject *self; 11872 PyObject *str1; 11873 PyObject *str2; 11874 PyObject *result; 11875 11876 self = PyUnicode_FromObject(obj); 11877 if (self == NULL) 11878 return NULL; 11879 str1 = PyUnicode_FromObject(subobj); 11880 if (str1 == NULL) { 11881 Py_DECREF(self); 11882 return NULL; 11883 } 11884 str2 = PyUnicode_FromObject(replobj); 11885 if (str2 == NULL) { 11886 Py_DECREF(self); 11887 Py_DECREF(str1); 11888 return NULL; 11889 } 11890 if (PyUnicode_READY(self) == -1 || 11891 PyUnicode_READY(str1) == -1 || 11892 PyUnicode_READY(str2) == -1) 11893 result = NULL; 11894 else 11895 result = replace(self, str1, str2, maxcount); 11896 Py_DECREF(self); 11897 Py_DECREF(str1); 11898 Py_DECREF(str2); 11899 return result; 11900} 11901 11902PyDoc_STRVAR(replace__doc__, 11903 "S.replace(old, new[, count]) -> str\n\ 11904\n\ 11905Return a copy of S with all occurrences of substring\n\ 11906old replaced by new. 
If the optional argument count is\n\ 11907given, only the first count occurrences are replaced."); 11908 11909static PyObject* 11910unicode_replace(PyObject *self, PyObject *args) 11911{ 11912 PyObject *str1; 11913 PyObject *str2; 11914 Py_ssize_t maxcount = -1; 11915 PyObject *result; 11916 11917 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount)) 11918 return NULL; 11919 if (PyUnicode_READY(self) == -1) 11920 return NULL; 11921 str1 = PyUnicode_FromObject(str1); 11922 if (str1 == NULL) 11923 return NULL; 11924 str2 = PyUnicode_FromObject(str2); 11925 if (str2 == NULL) { 11926 Py_DECREF(str1); 11927 return NULL; 11928 } 11929 if (PyUnicode_READY(str1) == -1 || PyUnicode_READY(str2) == -1) 11930 result = NULL; 11931 else 11932 result = replace(self, str1, str2, maxcount); 11933 11934 Py_DECREF(str1); 11935 Py_DECREF(str2); 11936 return result; 11937} 11938 11939static PyObject * 11940unicode_repr(PyObject *unicode) 11941{ 11942 PyObject *repr; 11943 Py_ssize_t isize; 11944 Py_ssize_t osize, squote, dquote, i, o; 11945 Py_UCS4 max, quote; 11946 int ikind, okind; 11947 void *idata, *odata; 11948 11949 if (PyUnicode_READY(unicode) == -1) 11950 return NULL; 11951 11952 isize = PyUnicode_GET_LENGTH(unicode); 11953 idata = PyUnicode_DATA(unicode); 11954 11955 /* Compute length of output, quote characters, and 11956 maximum character */ 11957 osize = 2; /* quotes */ 11958 max = 127; 11959 squote = dquote = 0; 11960 ikind = PyUnicode_KIND(unicode); 11961 for (i = 0; i < isize; i++) { 11962 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i); 11963 switch (ch) { 11964 case '\'': squote++; osize++; break; 11965 case '"': dquote++; osize++; break; 11966 case '\\': case '\t': case '\r': case '\n': 11967 osize += 2; break; 11968 default: 11969 /* Fast-path ASCII */ 11970 if (ch < ' ' || ch == 0x7f) 11971 osize += 4; /* \xHH */ 11972 else if (ch < 0x7f) 11973 osize++; 11974 else if (Py_UNICODE_ISPRINTABLE(ch)) { 11975 osize++; 11976 max = ch > max ? 
ch : max; 11977 } 11978 else if (ch < 0x100) 11979 osize += 4; /* \xHH */ 11980 else if (ch < 0x10000) 11981 osize += 6; /* \uHHHH */ 11982 else 11983 osize += 10; /* \uHHHHHHHH */ 11984 } 11985 } 11986 11987 quote = '\''; 11988 if (squote) { 11989 if (dquote) 11990 /* Both squote and dquote present. Use squote, 11991 and escape them */ 11992 osize += squote; 11993 else 11994 quote = '"'; 11995 } 11996 11997 repr = PyUnicode_New(osize, max); 11998 if (repr == NULL) 11999 return NULL; 12000 okind = PyUnicode_KIND(repr); 12001 odata = PyUnicode_DATA(repr); 12002 12003 PyUnicode_WRITE(okind, odata, 0, quote); 12004 PyUnicode_WRITE(okind, odata, osize-1, quote); 12005 12006 for (i = 0, o = 1; i < isize; i++) { 12007 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i); 12008 12009 /* Escape quotes and backslashes */ 12010 if ((ch == quote) || (ch == '\\')) { 12011 PyUnicode_WRITE(okind, odata, o++, '\\'); 12012 PyUnicode_WRITE(okind, odata, o++, ch); 12013 continue; 12014 } 12015 12016 /* Map special whitespace to '\t', \n', '\r' */ 12017 if (ch == '\t') { 12018 PyUnicode_WRITE(okind, odata, o++, '\\'); 12019 PyUnicode_WRITE(okind, odata, o++, 't'); 12020 } 12021 else if (ch == '\n') { 12022 PyUnicode_WRITE(okind, odata, o++, '\\'); 12023 PyUnicode_WRITE(okind, odata, o++, 'n'); 12024 } 12025 else if (ch == '\r') { 12026 PyUnicode_WRITE(okind, odata, o++, '\\'); 12027 PyUnicode_WRITE(okind, odata, o++, 'r'); 12028 } 12029 12030 /* Map non-printable US ASCII to '\xhh' */ 12031 else if (ch < ' ' || ch == 0x7F) { 12032 PyUnicode_WRITE(okind, odata, o++, '\\'); 12033 PyUnicode_WRITE(okind, odata, o++, 'x'); 12034 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]); 12035 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]); 12036 } 12037 12038 /* Copy ASCII characters as-is */ 12039 else if (ch < 0x7F) { 12040 PyUnicode_WRITE(okind, odata, o++, ch); 12041 } 12042 12043 /* Non-ASCII characters */ 12044 else { 12045 /* Map Unicode whitespace and control 
characters 12046 (categories Z* and C* except ASCII space) 12047 */ 12048 if (!Py_UNICODE_ISPRINTABLE(ch)) { 12049 PyUnicode_WRITE(okind, odata, o++, '\\'); 12050 /* Map 8-bit characters to '\xhh' */ 12051 if (ch <= 0xff) { 12052 PyUnicode_WRITE(okind, odata, o++, 'x'); 12053 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]); 12054 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]); 12055 } 12056 /* Map 16-bit characters to '\uxxxx' */ 12057 else if (ch <= 0xffff) { 12058 PyUnicode_WRITE(okind, odata, o++, 'u'); 12059 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]); 12060 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]); 12061 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]); 12062 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]); 12063 } 12064 /* Map 21-bit characters to '\U00xxxxxx' */ 12065 else { 12066 PyUnicode_WRITE(okind, odata, o++, 'U'); 12067 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]); 12068 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]); 12069 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]); 12070 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]); 12071 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]); 12072 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]); 12073 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]); 12074 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]); 12075 } 12076 } 12077 /* Copy characters as-is */ 12078 else { 12079 PyUnicode_WRITE(okind, odata, o++, ch); 12080 } 12081 } 12082 } 12083 /* Closing quote already added at the beginning */ 12084 assert(_PyUnicode_CheckConsistency(repr, 1)); 12085 return repr; 12086} 12087 12088PyDoc_STRVAR(rfind__doc__, 12089 "S.rfind(sub[, start[, end]]) -> int\n\ 12090\n\ 12091Return the highest index in S where substring sub is found,\n\ 12092such that sub is contained 
within S[start:end]. Optional\n\ 12093arguments start and end are interpreted as in slice notation.\n\ 12094\n\ 12095Return -1 on failure."); 12096 12097static PyObject * 12098unicode_rfind(PyObject *self, PyObject *args) 12099{ 12100 PyObject *substring; 12101 Py_ssize_t start; 12102 Py_ssize_t end; 12103 Py_ssize_t result; 12104 12105 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring, 12106 &start, &end)) 12107 return NULL; 12108 12109 if (PyUnicode_READY(self) == -1) 12110 return NULL; 12111 if (PyUnicode_READY(substring) == -1) 12112 return NULL; 12113 12114 result = any_find_slice(-1, self, substring, start, end); 12115 12116 Py_DECREF(substring); 12117 12118 if (result == -2) 12119 return NULL; 12120 12121 return PyLong_FromSsize_t(result); 12122} 12123 12124PyDoc_STRVAR(rindex__doc__, 12125 "S.rindex(sub[, start[, end]]) -> int\n\ 12126\n\ 12127Like S.rfind() but raise ValueError when the substring is not found."); 12128 12129static PyObject * 12130unicode_rindex(PyObject *self, PyObject *args) 12131{ 12132 PyObject *substring; 12133 Py_ssize_t start; 12134 Py_ssize_t end; 12135 Py_ssize_t result; 12136 12137 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring, 12138 &start, &end)) 12139 return NULL; 12140 12141 if (PyUnicode_READY(self) == -1) 12142 return NULL; 12143 if (PyUnicode_READY(substring) == -1) 12144 return NULL; 12145 12146 result = any_find_slice(-1, self, substring, start, end); 12147 12148 Py_DECREF(substring); 12149 12150 if (result == -2) 12151 return NULL; 12152 12153 if (result < 0) { 12154 PyErr_SetString(PyExc_ValueError, "substring not found"); 12155 return NULL; 12156 } 12157 12158 return PyLong_FromSsize_t(result); 12159} 12160 12161PyDoc_STRVAR(rjust__doc__, 12162 "S.rjust(width[, fillchar]) -> str\n\ 12163\n\ 12164Return S right-justified in a string of length width. 
Padding is\n\ 12165done using the specified fill character (default is a space)."); 12166 12167static PyObject * 12168unicode_rjust(PyObject *self, PyObject *args) 12169{ 12170 Py_ssize_t width; 12171 Py_UCS4 fillchar = ' '; 12172 12173 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar)) 12174 return NULL; 12175 12176 if (PyUnicode_READY(self) == -1) 12177 return NULL; 12178 12179 if (PyUnicode_GET_LENGTH(self) >= width) 12180 return unicode_result_unchanged(self); 12181 12182 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar); 12183} 12184 12185PyObject * 12186PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit) 12187{ 12188 PyObject *result; 12189 12190 s = PyUnicode_FromObject(s); 12191 if (s == NULL) 12192 return NULL; 12193 if (sep != NULL) { 12194 sep = PyUnicode_FromObject(sep); 12195 if (sep == NULL) { 12196 Py_DECREF(s); 12197 return NULL; 12198 } 12199 } 12200 12201 result = split(s, sep, maxsplit); 12202 12203 Py_DECREF(s); 12204 Py_XDECREF(sep); 12205 return result; 12206} 12207 12208PyDoc_STRVAR(split__doc__, 12209 "S.split(sep=None, maxsplit=-1) -> list of strings\n\ 12210\n\ 12211Return a list of the words in S, using sep as the\n\ 12212delimiter string. If maxsplit is given, at most maxsplit\n\ 12213splits are done. 
If sep is not specified or is None, any\n\ 12214whitespace string is a separator and empty strings are\n\ 12215removed from the result."); 12216 12217static PyObject* 12218unicode_split(PyObject *self, PyObject *args, PyObject *kwds) 12219{ 12220 static char *kwlist[] = {"sep", "maxsplit", 0}; 12221 PyObject *substring = Py_None; 12222 Py_ssize_t maxcount = -1; 12223 12224 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:split", 12225 kwlist, &substring, &maxcount)) 12226 return NULL; 12227 12228 if (substring == Py_None) 12229 return split(self, NULL, maxcount); 12230 else if (PyUnicode_Check(substring)) 12231 return split(self, substring, maxcount); 12232 else 12233 return PyUnicode_Split(self, substring, maxcount); 12234} 12235 12236PyObject * 12237PyUnicode_Partition(PyObject *str_in, PyObject *sep_in) 12238{ 12239 PyObject* str_obj; 12240 PyObject* sep_obj; 12241 PyObject* out; 12242 int kind1, kind2, kind; 12243 void *buf1 = NULL, *buf2 = NULL; 12244 Py_ssize_t len1, len2; 12245 12246 str_obj = PyUnicode_FromObject(str_in); 12247 if (!str_obj) 12248 return NULL; 12249 sep_obj = PyUnicode_FromObject(sep_in); 12250 if (!sep_obj) { 12251 Py_DECREF(str_obj); 12252 return NULL; 12253 } 12254 if (PyUnicode_READY(sep_obj) == -1 || PyUnicode_READY(str_obj) == -1) { 12255 Py_DECREF(sep_obj); 12256 Py_DECREF(str_obj); 12257 return NULL; 12258 } 12259 12260 kind1 = PyUnicode_KIND(str_obj); 12261 kind2 = PyUnicode_KIND(sep_obj); 12262 kind = Py_MAX(kind1, kind2); 12263 buf1 = PyUnicode_DATA(str_obj); 12264 if (kind1 != kind) 12265 buf1 = _PyUnicode_AsKind(str_obj, kind); 12266 if (!buf1) 12267 goto onError; 12268 buf2 = PyUnicode_DATA(sep_obj); 12269 if (kind2 != kind) 12270 buf2 = _PyUnicode_AsKind(sep_obj, kind); 12271 if (!buf2) 12272 goto onError; 12273 len1 = PyUnicode_GET_LENGTH(str_obj); 12274 len2 = PyUnicode_GET_LENGTH(sep_obj); 12275 12276 switch (PyUnicode_KIND(str_obj)) { 12277 case PyUnicode_1BYTE_KIND: 12278 if (PyUnicode_IS_ASCII(str_obj) && 
PyUnicode_IS_ASCII(sep_obj)) 12279 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2); 12280 else 12281 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2); 12282 break; 12283 case PyUnicode_2BYTE_KIND: 12284 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2); 12285 break; 12286 case PyUnicode_4BYTE_KIND: 12287 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2); 12288 break; 12289 default: 12290 assert(0); 12291 out = 0; 12292 } 12293 12294 Py_DECREF(sep_obj); 12295 Py_DECREF(str_obj); 12296 if (kind1 != kind) 12297 PyMem_Free(buf1); 12298 if (kind2 != kind) 12299 PyMem_Free(buf2); 12300 12301 return out; 12302 onError: 12303 Py_DECREF(sep_obj); 12304 Py_DECREF(str_obj); 12305 if (kind1 != kind && buf1) 12306 PyMem_Free(buf1); 12307 if (kind2 != kind && buf2) 12308 PyMem_Free(buf2); 12309 return NULL; 12310} 12311 12312 12313PyObject * 12314PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in) 12315{ 12316 PyObject* str_obj; 12317 PyObject* sep_obj; 12318 PyObject* out; 12319 int kind1, kind2, kind; 12320 void *buf1 = NULL, *buf2 = NULL; 12321 Py_ssize_t len1, len2; 12322 12323 str_obj = PyUnicode_FromObject(str_in); 12324 if (!str_obj) 12325 return NULL; 12326 sep_obj = PyUnicode_FromObject(sep_in); 12327 if (!sep_obj) { 12328 Py_DECREF(str_obj); 12329 return NULL; 12330 } 12331 12332 kind1 = PyUnicode_KIND(str_in); 12333 kind2 = PyUnicode_KIND(sep_obj); 12334 kind = Py_MAX(kind1, kind2); 12335 buf1 = PyUnicode_DATA(str_in); 12336 if (kind1 != kind) 12337 buf1 = _PyUnicode_AsKind(str_in, kind); 12338 if (!buf1) 12339 goto onError; 12340 buf2 = PyUnicode_DATA(sep_obj); 12341 if (kind2 != kind) 12342 buf2 = _PyUnicode_AsKind(sep_obj, kind); 12343 if (!buf2) 12344 goto onError; 12345 len1 = PyUnicode_GET_LENGTH(str_obj); 12346 len2 = PyUnicode_GET_LENGTH(sep_obj); 12347 12348 switch (PyUnicode_KIND(str_in)) { 12349 case PyUnicode_1BYTE_KIND: 12350 if (PyUnicode_IS_ASCII(str_obj) && 
PyUnicode_IS_ASCII(sep_obj)) 12351 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2); 12352 else 12353 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2); 12354 break; 12355 case PyUnicode_2BYTE_KIND: 12356 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2); 12357 break; 12358 case PyUnicode_4BYTE_KIND: 12359 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2); 12360 break; 12361 default: 12362 assert(0); 12363 out = 0; 12364 } 12365 12366 Py_DECREF(sep_obj); 12367 Py_DECREF(str_obj); 12368 if (kind1 != kind) 12369 PyMem_Free(buf1); 12370 if (kind2 != kind) 12371 PyMem_Free(buf2); 12372 12373 return out; 12374 onError: 12375 Py_DECREF(sep_obj); 12376 Py_DECREF(str_obj); 12377 if (kind1 != kind && buf1) 12378 PyMem_Free(buf1); 12379 if (kind2 != kind && buf2) 12380 PyMem_Free(buf2); 12381 return NULL; 12382} 12383 12384PyDoc_STRVAR(partition__doc__, 12385 "S.partition(sep) -> (head, sep, tail)\n\ 12386\n\ 12387Search for the separator sep in S, and return the part before it,\n\ 12388the separator itself, and the part after it. If the separator is not\n\ 12389found, return S and two empty strings."); 12390 12391static PyObject* 12392unicode_partition(PyObject *self, PyObject *separator) 12393{ 12394 return PyUnicode_Partition(self, separator); 12395} 12396 12397PyDoc_STRVAR(rpartition__doc__, 12398 "S.rpartition(sep) -> (head, sep, tail)\n\ 12399\n\ 12400Search for the separator sep in S, starting at the end of S, and return\n\ 12401the part before it, the separator itself, and the part after it. 
If the\n\ 12402separator is not found, return two empty strings and S."); 12403 12404static PyObject* 12405unicode_rpartition(PyObject *self, PyObject *separator) 12406{ 12407 return PyUnicode_RPartition(self, separator); 12408} 12409 12410PyObject * 12411PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit) 12412{ 12413 PyObject *result; 12414 12415 s = PyUnicode_FromObject(s); 12416 if (s == NULL) 12417 return NULL; 12418 if (sep != NULL) { 12419 sep = PyUnicode_FromObject(sep); 12420 if (sep == NULL) { 12421 Py_DECREF(s); 12422 return NULL; 12423 } 12424 } 12425 12426 result = rsplit(s, sep, maxsplit); 12427 12428 Py_DECREF(s); 12429 Py_XDECREF(sep); 12430 return result; 12431} 12432 12433PyDoc_STRVAR(rsplit__doc__, 12434 "S.rsplit(sep=None, maxsplit=-1) -> list of strings\n\ 12435\n\ 12436Return a list of the words in S, using sep as the\n\ 12437delimiter string, starting at the end of the string and\n\ 12438working to the front. If maxsplit is given, at most maxsplit\n\ 12439splits are done. 
If sep is not specified, any whitespace string\n\ 12440is a separator."); 12441 12442static PyObject* 12443unicode_rsplit(PyObject *self, PyObject *args, PyObject *kwds) 12444{ 12445 static char *kwlist[] = {"sep", "maxsplit", 0}; 12446 PyObject *substring = Py_None; 12447 Py_ssize_t maxcount = -1; 12448 12449 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:rsplit", 12450 kwlist, &substring, &maxcount)) 12451 return NULL; 12452 12453 if (substring == Py_None) 12454 return rsplit(self, NULL, maxcount); 12455 else if (PyUnicode_Check(substring)) 12456 return rsplit(self, substring, maxcount); 12457 else 12458 return PyUnicode_RSplit(self, substring, maxcount); 12459} 12460 12461PyDoc_STRVAR(splitlines__doc__, 12462 "S.splitlines([keepends]) -> list of strings\n\ 12463\n\ 12464Return a list of the lines in S, breaking at line boundaries.\n\ 12465Line breaks are not included in the resulting list unless keepends\n\ 12466is given and true."); 12467 12468static PyObject* 12469unicode_splitlines(PyObject *self, PyObject *args, PyObject *kwds) 12470{ 12471 static char *kwlist[] = {"keepends", 0}; 12472 int keepends = 0; 12473 12474 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines", 12475 kwlist, &keepends)) 12476 return NULL; 12477 12478 return PyUnicode_Splitlines(self, keepends); 12479} 12480 12481static 12482PyObject *unicode_str(PyObject *self) 12483{ 12484 return unicode_result_unchanged(self); 12485} 12486 12487PyDoc_STRVAR(swapcase__doc__, 12488 "S.swapcase() -> str\n\ 12489\n\ 12490Return a copy of S with uppercase characters converted to lowercase\n\ 12491and vice versa."); 12492 12493static PyObject* 12494unicode_swapcase(PyObject *self) 12495{ 12496 if (PyUnicode_READY(self) == -1) 12497 return NULL; 12498 return case_operation(self, do_swapcase); 12499} 12500 12501PyDoc_STRVAR(maketrans__doc__, 12502 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\ 12503\n\ 12504Return a translation table usable for str.translate().\n\ 12505If there is only 
one argument, it must be a dictionary mapping Unicode\n\ 12506ordinals (integers) or characters to Unicode ordinals, strings or None.\n\ 12507Character keys will be then converted to ordinals.\n\ 12508If there are two arguments, they must be strings of equal length, and\n\ 12509in the resulting dictionary, each character in x will be mapped to the\n\ 12510character at the same position in y. If there is a third argument, it\n\ 12511must be a string, whose characters will be mapped to None in the result."); 12512 12513static PyObject* 12514unicode_maketrans(PyObject *null, PyObject *args) 12515{ 12516 PyObject *x, *y = NULL, *z = NULL; 12517 PyObject *new = NULL, *key, *value; 12518 Py_ssize_t i = 0; 12519 int res; 12520 12521 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z)) 12522 return NULL; 12523 new = PyDict_New(); 12524 if (!new) 12525 return NULL; 12526 if (y != NULL) { 12527 int x_kind, y_kind, z_kind; 12528 void *x_data, *y_data, *z_data; 12529 12530 /* x must be a string too, of equal length */ 12531 if (!PyUnicode_Check(x)) { 12532 PyErr_SetString(PyExc_TypeError, "first maketrans argument must " 12533 "be a string if there is a second argument"); 12534 goto err; 12535 } 12536 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) { 12537 PyErr_SetString(PyExc_ValueError, "the first two maketrans " 12538 "arguments must have equal length"); 12539 goto err; 12540 } 12541 /* create entries for translating chars in x to those in y */ 12542 x_kind = PyUnicode_KIND(x); 12543 y_kind = PyUnicode_KIND(y); 12544 x_data = PyUnicode_DATA(x); 12545 y_data = PyUnicode_DATA(y); 12546 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) { 12547 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i)); 12548 if (!key) 12549 goto err; 12550 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i)); 12551 if (!value) { 12552 Py_DECREF(key); 12553 goto err; 12554 } 12555 res = PyDict_SetItem(new, key, value); 12556 Py_DECREF(key); 12557 Py_DECREF(value); 12558 if (res < 
0) 12559 goto err; 12560 } 12561 /* create entries for deleting chars in z */ 12562 if (z != NULL) { 12563 z_kind = PyUnicode_KIND(z); 12564 z_data = PyUnicode_DATA(z); 12565 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) { 12566 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i)); 12567 if (!key) 12568 goto err; 12569 res = PyDict_SetItem(new, key, Py_None); 12570 Py_DECREF(key); 12571 if (res < 0) 12572 goto err; 12573 } 12574 } 12575 } else { 12576 int kind; 12577 void *data; 12578 12579 /* x must be a dict */ 12580 if (!PyDict_CheckExact(x)) { 12581 PyErr_SetString(PyExc_TypeError, "if you give only one argument " 12582 "to maketrans it must be a dict"); 12583 goto err; 12584 } 12585 /* copy entries into the new dict, converting string keys to int keys */ 12586 while (PyDict_Next(x, &i, &key, &value)) { 12587 if (PyUnicode_Check(key)) { 12588 /* convert string keys to integer keys */ 12589 PyObject *newkey; 12590 if (PyUnicode_GET_LENGTH(key) != 1) { 12591 PyErr_SetString(PyExc_ValueError, "string keys in translate " 12592 "table must be of length 1"); 12593 goto err; 12594 } 12595 kind = PyUnicode_KIND(key); 12596 data = PyUnicode_DATA(key); 12597 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0)); 12598 if (!newkey) 12599 goto err; 12600 res = PyDict_SetItem(new, newkey, value); 12601 Py_DECREF(newkey); 12602 if (res < 0) 12603 goto err; 12604 } else if (PyLong_Check(key)) { 12605 /* just keep integer keys */ 12606 if (PyDict_SetItem(new, key, value) < 0) 12607 goto err; 12608 } else { 12609 PyErr_SetString(PyExc_TypeError, "keys in translate table must " 12610 "be strings or integers"); 12611 goto err; 12612 } 12613 } 12614 } 12615 return new; 12616 err: 12617 Py_DECREF(new); 12618 return NULL; 12619} 12620 12621PyDoc_STRVAR(translate__doc__, 12622 "S.translate(table) -> str\n\ 12623\n\ 12624Return a copy of the string S, where all characters have been mapped\n\ 12625through the given translation table, which must be a mapping of\n\ 12626Unicode 
ordinals to Unicode ordinals, strings, or None.\n\ 12627Unmapped characters are left untouched. Characters mapped to None\n\ 12628are deleted."); 12629 12630static PyObject* 12631unicode_translate(PyObject *self, PyObject *table) 12632{ 12633 return _PyUnicode_TranslateCharmap(self, table, "ignore"); 12634} 12635 12636PyDoc_STRVAR(upper__doc__, 12637 "S.upper() -> str\n\ 12638\n\ 12639Return a copy of S converted to uppercase."); 12640 12641static PyObject* 12642unicode_upper(PyObject *self) 12643{ 12644 if (PyUnicode_READY(self) == -1) 12645 return NULL; 12646 if (PyUnicode_IS_ASCII(self)) 12647 return ascii_upper_or_lower(self, 0); 12648 return case_operation(self, do_upper); 12649} 12650 12651PyDoc_STRVAR(zfill__doc__, 12652 "S.zfill(width) -> str\n\ 12653\n\ 12654Pad a numeric string S with zeros on the left, to fill a field\n\ 12655of the specified width. The string S is never truncated."); 12656 12657static PyObject * 12658unicode_zfill(PyObject *self, PyObject *args) 12659{ 12660 Py_ssize_t fill; 12661 PyObject *u; 12662 Py_ssize_t width; 12663 int kind; 12664 void *data; 12665 Py_UCS4 chr; 12666 12667 if (!PyArg_ParseTuple(args, "n:zfill", &width)) 12668 return NULL; 12669 12670 if (PyUnicode_READY(self) == -1) 12671 return NULL; 12672 12673 if (PyUnicode_GET_LENGTH(self) >= width) 12674 return unicode_result_unchanged(self); 12675 12676 fill = width - PyUnicode_GET_LENGTH(self); 12677 12678 u = pad(self, fill, 0, '0'); 12679 12680 if (u == NULL) 12681 return NULL; 12682 12683 kind = PyUnicode_KIND(u); 12684 data = PyUnicode_DATA(u); 12685 chr = PyUnicode_READ(kind, data, fill); 12686 12687 if (chr == '+' || chr == '-') { 12688 /* move sign to beginning of string */ 12689 PyUnicode_WRITE(kind, data, 0, chr); 12690 PyUnicode_WRITE(kind, data, fill, '0'); 12691 } 12692 12693 assert(_PyUnicode_CheckConsistency(u, 1)); 12694 return u; 12695} 12696 12697#if 0 12698static PyObject * 12699unicode__decimal2ascii(PyObject *self) 12700{ 12701 return 
PyUnicode_TransformDecimalAndSpaceToASCII(self); 12702} 12703#endif 12704 12705PyDoc_STRVAR(startswith__doc__, 12706 "S.startswith(prefix[, start[, end]]) -> bool\n\ 12707\n\ 12708Return True if S starts with the specified prefix, False otherwise.\n\ 12709With optional start, test S beginning at that position.\n\ 12710With optional end, stop comparing S at that position.\n\ 12711prefix can also be a tuple of strings to try."); 12712 12713static PyObject * 12714unicode_startswith(PyObject *self, 12715 PyObject *args) 12716{ 12717 PyObject *subobj; 12718 PyObject *substring; 12719 Py_ssize_t start = 0; 12720 Py_ssize_t end = PY_SSIZE_T_MAX; 12721 int result; 12722 12723 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end)) 12724 return NULL; 12725 if (PyTuple_Check(subobj)) { 12726 Py_ssize_t i; 12727 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) { 12728 substring = PyUnicode_FromObject(PyTuple_GET_ITEM(subobj, i)); 12729 if (substring == NULL) 12730 return NULL; 12731 result = tailmatch(self, substring, start, end, -1); 12732 Py_DECREF(substring); 12733 if (result) { 12734 Py_RETURN_TRUE; 12735 } 12736 } 12737 /* nothing matched */ 12738 Py_RETURN_FALSE; 12739 } 12740 substring = PyUnicode_FromObject(subobj); 12741 if (substring == NULL) { 12742 if (PyErr_ExceptionMatches(PyExc_TypeError)) 12743 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or " 12744 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name); 12745 return NULL; 12746 } 12747 result = tailmatch(self, substring, start, end, -1); 12748 Py_DECREF(substring); 12749 return PyBool_FromLong(result); 12750} 12751 12752 12753PyDoc_STRVAR(endswith__doc__, 12754 "S.endswith(suffix[, start[, end]]) -> bool\n\ 12755\n\ 12756Return True if S ends with the specified suffix, False otherwise.\n\ 12757With optional start, test S beginning at that position.\n\ 12758With optional end, stop comparing S at that position.\n\ 12759suffix can also be a tuple of strings to try."); 12760 
12761static PyObject * 12762unicode_endswith(PyObject *self, 12763 PyObject *args) 12764{ 12765 PyObject *subobj; 12766 PyObject *substring; 12767 Py_ssize_t start = 0; 12768 Py_ssize_t end = PY_SSIZE_T_MAX; 12769 int result; 12770 12771 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end)) 12772 return NULL; 12773 if (PyTuple_Check(subobj)) { 12774 Py_ssize_t i; 12775 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) { 12776 substring = PyUnicode_FromObject( 12777 PyTuple_GET_ITEM(subobj, i)); 12778 if (substring == NULL) 12779 return NULL; 12780 result = tailmatch(self, substring, start, end, +1); 12781 Py_DECREF(substring); 12782 if (result) { 12783 Py_RETURN_TRUE; 12784 } 12785 } 12786 Py_RETURN_FALSE; 12787 } 12788 substring = PyUnicode_FromObject(subobj); 12789 if (substring == NULL) { 12790 if (PyErr_ExceptionMatches(PyExc_TypeError)) 12791 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or " 12792 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name); 12793 return NULL; 12794 } 12795 result = tailmatch(self, substring, start, end, +1); 12796 Py_DECREF(substring); 12797 return PyBool_FromLong(result); 12798} 12799 12800Py_LOCAL_INLINE(void) 12801_PyUnicodeWriter_Update(_PyUnicodeWriter *writer) 12802{ 12803 writer->size = PyUnicode_GET_LENGTH(writer->buffer); 12804 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer); 12805 writer->data = PyUnicode_DATA(writer->buffer); 12806 writer->kind = PyUnicode_KIND(writer->buffer); 12807} 12808 12809void 12810_PyUnicodeWriter_Init(_PyUnicodeWriter *writer, Py_ssize_t min_length) 12811{ 12812 memset(writer, 0, sizeof(*writer)); 12813#ifdef Py_DEBUG 12814 writer->kind = 5; /* invalid kind */ 12815#endif 12816 writer->min_length = Py_MAX(min_length, 100); 12817 writer->overallocate = (min_length > 0); 12818} 12819 12820int 12821_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer, 12822 Py_ssize_t length, Py_UCS4 maxchar) 12823{ 12824 Py_ssize_t newlen; 12825 PyObject *newbuffer; 
12826 12827 assert(length > 0); 12828 12829 if (length > PY_SSIZE_T_MAX - writer->pos) { 12830 PyErr_NoMemory(); 12831 return -1; 12832 } 12833 newlen = writer->pos + length; 12834 12835 if (writer->buffer == NULL) { 12836 if (writer->overallocate) { 12837 /* overallocate 25% to limit the number of resize */ 12838 if (newlen <= (PY_SSIZE_T_MAX - newlen / 4)) 12839 newlen += newlen / 4; 12840 if (newlen < writer->min_length) 12841 newlen = writer->min_length; 12842 } 12843 writer->buffer = PyUnicode_New(newlen, maxchar); 12844 if (writer->buffer == NULL) 12845 return -1; 12846 _PyUnicodeWriter_Update(writer); 12847 return 0; 12848 } 12849 12850 if (newlen > writer->size) { 12851 if (writer->overallocate) { 12852 /* overallocate 25% to limit the number of resize */ 12853 if (newlen <= (PY_SSIZE_T_MAX - newlen / 4)) 12854 newlen += newlen / 4; 12855 if (newlen < writer->min_length) 12856 newlen = writer->min_length; 12857 } 12858 12859 if (maxchar > writer->maxchar || writer->readonly) { 12860 /* resize + widen */ 12861 newbuffer = PyUnicode_New(newlen, maxchar); 12862 if (newbuffer == NULL) 12863 return -1; 12864 _PyUnicode_FastCopyCharacters(newbuffer, 0, 12865 writer->buffer, 0, writer->pos); 12866 Py_DECREF(writer->buffer); 12867 writer->readonly = 0; 12868 } 12869 else { 12870 newbuffer = resize_compact(writer->buffer, newlen); 12871 if (newbuffer == NULL) 12872 return -1; 12873 } 12874 writer->buffer = newbuffer; 12875 _PyUnicodeWriter_Update(writer); 12876 } 12877 else if (maxchar > writer->maxchar) { 12878 assert(!writer->readonly); 12879 newbuffer = PyUnicode_New(writer->size, maxchar); 12880 if (newbuffer == NULL) 12881 return -1; 12882 _PyUnicode_FastCopyCharacters(newbuffer, 0, 12883 writer->buffer, 0, writer->pos); 12884 Py_DECREF(writer->buffer); 12885 writer->buffer = newbuffer; 12886 _PyUnicodeWriter_Update(writer); 12887 } 12888 return 0; 12889} 12890 12891int 12892_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str) 12893{ 12894 Py_UCS4 
maxchar; 12895 Py_ssize_t len; 12896 12897 if (PyUnicode_READY(str) == -1) 12898 return -1; 12899 len = PyUnicode_GET_LENGTH(str); 12900 if (len == 0) 12901 return 0; 12902 maxchar = PyUnicode_MAX_CHAR_VALUE(str); 12903 if (maxchar > writer->maxchar || len > writer->size - writer->pos) { 12904 if (writer->buffer == NULL && !writer->overallocate) { 12905 Py_INCREF(str); 12906 writer->buffer = str; 12907 _PyUnicodeWriter_Update(writer); 12908 writer->readonly = 1; 12909 writer->size = 0; 12910 writer->pos += len; 12911 return 0; 12912 } 12913 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1) 12914 return -1; 12915 } 12916 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos, 12917 str, 0, len); 12918 writer->pos += len; 12919 return 0; 12920} 12921 12922PyObject * 12923_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer) 12924{ 12925 if (writer->pos == 0) { 12926 Py_XDECREF(writer->buffer); 12927 Py_INCREF(unicode_empty); 12928 return unicode_empty; 12929 } 12930 if (writer->readonly) { 12931 assert(PyUnicode_GET_LENGTH(writer->buffer) == writer->pos); 12932 return writer->buffer; 12933 } 12934 if (PyUnicode_GET_LENGTH(writer->buffer) != writer->pos) { 12935 PyObject *newbuffer; 12936 newbuffer = resize_compact(writer->buffer, writer->pos); 12937 if (newbuffer == NULL) { 12938 Py_DECREF(writer->buffer); 12939 return NULL; 12940 } 12941 writer->buffer = newbuffer; 12942 } 12943 assert(_PyUnicode_CheckConsistency(writer->buffer, 1)); 12944 return writer->buffer; 12945} 12946 12947void 12948_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer) 12949{ 12950 Py_CLEAR(writer->buffer); 12951} 12952 12953#include "stringlib/unicode_format.h" 12954 12955PyDoc_STRVAR(format__doc__, 12956 "S.format(*args, **kwargs) -> str\n\ 12957\n\ 12958Return a formatted version of S, using substitutions from args and kwargs.\n\ 12959The substitutions are identified by braces ('{' and '}')."); 12960 12961PyDoc_STRVAR(format_map__doc__, 12962 "S.format_map(mapping) -> str\n\ 
12963\n\ 12964Return a formatted version of S, using substitutions from mapping.\n\ 12965The substitutions are identified by braces ('{' and '}')."); 12966 12967static PyObject * 12968unicode__format__(PyObject* self, PyObject* args) 12969{ 12970 PyObject *format_spec; 12971 _PyUnicodeWriter writer; 12972 int ret; 12973 12974 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec)) 12975 return NULL; 12976 12977 if (PyUnicode_READY(self) == -1) 12978 return NULL; 12979 _PyUnicodeWriter_Init(&writer, 0); 12980 ret = _PyUnicode_FormatAdvancedWriter(&writer, 12981 self, format_spec, 0, 12982 PyUnicode_GET_LENGTH(format_spec)); 12983 if (ret == -1) { 12984 _PyUnicodeWriter_Dealloc(&writer); 12985 return NULL; 12986 } 12987 return _PyUnicodeWriter_Finish(&writer); 12988} 12989 12990PyDoc_STRVAR(p_format__doc__, 12991 "S.__format__(format_spec) -> str\n\ 12992\n\ 12993Return a formatted version of S as described by format_spec."); 12994 12995static PyObject * 12996unicode__sizeof__(PyObject *v) 12997{ 12998 Py_ssize_t size; 12999 13000 /* If it's a compact object, account for base structure + 13001 character data. */ 13002 if (PyUnicode_IS_COMPACT_ASCII(v)) 13003 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1; 13004 else if (PyUnicode_IS_COMPACT(v)) 13005 size = sizeof(PyCompactUnicodeObject) + 13006 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_KIND(v); 13007 else { 13008 /* If it is a two-block object, account for base object, and 13009 for character block if present. */ 13010 size = sizeof(PyUnicodeObject); 13011 if (_PyUnicode_DATA_ANY(v)) 13012 size += (PyUnicode_GET_LENGTH(v) + 1) * 13013 PyUnicode_KIND(v); 13014 } 13015 /* If the wstr pointer is present, account for it unless it is shared 13016 with the data pointer. Check if the data is not shared. 
*/ 13017 if (_PyUnicode_HAS_WSTR_MEMORY(v)) 13018 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t); 13019 if (_PyUnicode_HAS_UTF8_MEMORY(v)) 13020 size += PyUnicode_UTF8_LENGTH(v) + 1; 13021 13022 return PyLong_FromSsize_t(size); 13023} 13024 13025PyDoc_STRVAR(sizeof__doc__, 13026 "S.__sizeof__() -> size of S in memory, in bytes"); 13027 13028static PyObject * 13029unicode_getnewargs(PyObject *v) 13030{ 13031 PyObject *copy = _PyUnicode_Copy(v); 13032 if (!copy) 13033 return NULL; 13034 return Py_BuildValue("(N)", copy); 13035} 13036 13037static PyMethodDef unicode_methods[] = { 13038 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__}, 13039 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__}, 13040 {"split", (PyCFunction) unicode_split, METH_VARARGS | METH_KEYWORDS, split__doc__}, 13041 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS | METH_KEYWORDS, rsplit__doc__}, 13042 {"join", (PyCFunction) unicode_join, METH_O, join__doc__}, 13043 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__}, 13044 {"casefold", (PyCFunction) unicode_casefold, METH_NOARGS, casefold__doc__}, 13045 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__}, 13046 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__}, 13047 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__}, 13048 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__}, 13049 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__}, 13050 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__}, 13051 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__}, 13052 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__}, 13053 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__}, 13054 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__}, 13055 {"rfind", (PyCFunction) unicode_rfind, 
METH_VARARGS, rfind__doc__}, 13056 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__}, 13057 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__}, 13058 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__}, 13059 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__}, 13060 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__}, 13061 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__}, 13062 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__}, 13063 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__}, 13064 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__}, 13065 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__}, 13066 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__}, 13067 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__}, 13068 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__}, 13069 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__}, 13070 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__}, 13071 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__}, 13072 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__}, 13073 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__}, 13074 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__}, 13075 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__}, 13076 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__}, 13077 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__}, 13078 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__}, 13079 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, 
format__doc__}, 13080 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__}, 13081 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__}, 13082 {"maketrans", (PyCFunction) unicode_maketrans, 13083 METH_VARARGS | METH_STATIC, maketrans__doc__}, 13084 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__}, 13085#if 0 13086 /* These methods are just used for debugging the implementation. */ 13087 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS}, 13088#endif 13089 13090 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS}, 13091 {NULL, NULL} 13092}; 13093 13094static PyObject * 13095unicode_mod(PyObject *v, PyObject *w) 13096{ 13097 if (!PyUnicode_Check(v)) 13098 Py_RETURN_NOTIMPLEMENTED; 13099 return PyUnicode_Format(v, w); 13100} 13101 13102static PyNumberMethods unicode_as_number = { 13103 0, /*nb_add*/ 13104 0, /*nb_subtract*/ 13105 0, /*nb_multiply*/ 13106 unicode_mod, /*nb_remainder*/ 13107}; 13108 13109static PySequenceMethods unicode_as_sequence = { 13110 (lenfunc) unicode_length, /* sq_length */ 13111 PyUnicode_Concat, /* sq_concat */ 13112 (ssizeargfunc) unicode_repeat, /* sq_repeat */ 13113 (ssizeargfunc) unicode_getitem, /* sq_item */ 13114 0, /* sq_slice */ 13115 0, /* sq_ass_item */ 13116 0, /* sq_ass_slice */ 13117 PyUnicode_Contains, /* sq_contains */ 13118}; 13119 13120static PyObject* 13121unicode_subscript(PyObject* self, PyObject* item) 13122{ 13123 if (PyUnicode_READY(self) == -1) 13124 return NULL; 13125 13126 if (PyIndex_Check(item)) { 13127 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError); 13128 if (i == -1 && PyErr_Occurred()) 13129 return NULL; 13130 if (i < 0) 13131 i += PyUnicode_GET_LENGTH(self); 13132 return unicode_getitem(self, i); 13133 } else if (PySlice_Check(item)) { 13134 Py_ssize_t start, stop, step, slicelength, cur, i; 13135 PyObject *result; 13136 void *src_data, *dest_data; 13137 int src_kind, dest_kind; 13138 Py_UCS4 
ch, max_char, kind_limit; 13139 13140 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self), 13141 &start, &stop, &step, &slicelength) < 0) { 13142 return NULL; 13143 } 13144 13145 if (slicelength <= 0) { 13146 Py_INCREF(unicode_empty); 13147 return unicode_empty; 13148 } else if (start == 0 && step == 1 && 13149 slicelength == PyUnicode_GET_LENGTH(self)) { 13150 return unicode_result_unchanged(self); 13151 } else if (step == 1) { 13152 return PyUnicode_Substring(self, 13153 start, start + slicelength); 13154 } 13155 /* General case */ 13156 src_kind = PyUnicode_KIND(self); 13157 src_data = PyUnicode_DATA(self); 13158 if (!PyUnicode_IS_ASCII(self)) { 13159 kind_limit = kind_maxchar_limit(src_kind); 13160 max_char = 0; 13161 for (cur = start, i = 0; i < slicelength; cur += step, i++) { 13162 ch = PyUnicode_READ(src_kind, src_data, cur); 13163 if (ch > max_char) { 13164 max_char = ch; 13165 if (max_char >= kind_limit) 13166 break; 13167 } 13168 } 13169 } 13170 else 13171 max_char = 127; 13172 result = PyUnicode_New(slicelength, max_char); 13173 if (result == NULL) 13174 return NULL; 13175 dest_kind = PyUnicode_KIND(result); 13176 dest_data = PyUnicode_DATA(result); 13177 13178 for (cur = start, i = 0; i < slicelength; cur += step, i++) { 13179 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur); 13180 PyUnicode_WRITE(dest_kind, dest_data, i, ch); 13181 } 13182 assert(_PyUnicode_CheckConsistency(result, 1)); 13183 return result; 13184 } else { 13185 PyErr_SetString(PyExc_TypeError, "string indices must be integers"); 13186 return NULL; 13187 } 13188} 13189 13190static PyMappingMethods unicode_as_mapping = { 13191 (lenfunc)unicode_length, /* mp_length */ 13192 (binaryfunc)unicode_subscript, /* mp_subscript */ 13193 (objobjargproc)0, /* mp_ass_subscript */ 13194}; 13195 13196 13197/* Helpers for PyUnicode_Format() */ 13198 13199static PyObject * 13200getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx) 13201{ 13202 Py_ssize_t argidx = *p_argidx; 
13203 if (argidx < arglen) { 13204 (*p_argidx)++; 13205 if (arglen < 0) 13206 return args; 13207 else 13208 return PyTuple_GetItem(args, argidx); 13209 } 13210 PyErr_SetString(PyExc_TypeError, 13211 "not enough arguments for format string"); 13212 return NULL; 13213} 13214 13215/* Returns a new reference to a PyUnicode object, or NULL on failure. */ 13216 13217static int 13218formatfloat(PyObject *v, int flags, int prec, int type, 13219 PyObject **p_output, _PyUnicodeWriter *writer) 13220{ 13221 char *p; 13222 double x; 13223 Py_ssize_t len; 13224 13225 x = PyFloat_AsDouble(v); 13226 if (x == -1.0 && PyErr_Occurred()) 13227 return -1; 13228 13229 if (prec < 0) 13230 prec = 6; 13231 13232 p = PyOS_double_to_string(x, type, prec, 13233 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL); 13234 if (p == NULL) 13235 return -1; 13236 len = strlen(p); 13237 if (writer) { 13238 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1) { 13239 PyMem_Free(p); 13240 return -1; 13241 } 13242 unicode_write_cstr(writer->buffer, writer->pos, p, len); 13243 writer->pos += len; 13244 } 13245 else 13246 *p_output = _PyUnicode_FromASCII(p, len); 13247 PyMem_Free(p); 13248 return 0; 13249} 13250 13251/* formatlong() emulates the format codes d, u, o, x and X, and 13252 * the F_ALT flag, for Python's long (unbounded) ints. It's not used for 13253 * Python's regular ints. 13254 * Return value: a new PyUnicodeObject*, or NULL if error. 13255 * The output string is of the form 13256 * "-"? ("0x" | "0X")? digit+ 13257 * "0x"/"0X" are present only for x and X conversions, with F_ALT 13258 * set in flags. The case of hex digits will be correct, 13259 * There will be at least prec digits, zero-filled on the left if 13260 * necessary to get that many. 
13261 * val object to be converted 13262 * flags bitmask of format flags; only F_ALT is looked at 13263 * prec minimum number of digits; 0-fill on left if needed 13264 * type a character in [duoxX]; u acts the same as d 13265 * 13266 * CAUTION: o, x and X conversions on regular ints can never 13267 * produce a '-' sign, but can for Python's unbounded ints. 13268 */ 13269static PyObject* 13270formatlong(PyObject *val, int flags, int prec, int type) 13271{ 13272 PyObject *result = NULL; 13273 char *buf; 13274 Py_ssize_t i; 13275 int sign; /* 1 if '-', else 0 */ 13276 int len; /* number of characters */ 13277 Py_ssize_t llen; 13278 int numdigits; /* len == numnondigits + numdigits */ 13279 int numnondigits = 0; 13280 13281 /* Avoid exceeding SSIZE_T_MAX */ 13282 if (prec > INT_MAX-3) { 13283 PyErr_SetString(PyExc_OverflowError, 13284 "precision too large"); 13285 return NULL; 13286 } 13287 13288 assert(PyLong_Check(val)); 13289 13290 switch (type) { 13291 case 'd': 13292 case 'u': 13293 /* Special-case boolean: we want 0/1 */ 13294 if (PyBool_Check(val)) 13295 result = PyNumber_ToBase(val, 10); 13296 else 13297 result = Py_TYPE(val)->tp_str(val); 13298 break; 13299 case 'o': 13300 numnondigits = 2; 13301 result = PyNumber_ToBase(val, 8); 13302 break; 13303 case 'x': 13304 case 'X': 13305 numnondigits = 2; 13306 result = PyNumber_ToBase(val, 16); 13307 break; 13308 default: 13309 assert(!"'type' not in [duoxX]"); 13310 } 13311 if (!result) 13312 return NULL; 13313 13314 assert(unicode_modifiable(result)); 13315 assert(PyUnicode_IS_READY(result)); 13316 assert(PyUnicode_IS_ASCII(result)); 13317 13318 /* To modify the string in-place, there can only be one reference. 
*/ 13319 if (Py_REFCNT(result) != 1) { 13320 PyErr_BadInternalCall(); 13321 return NULL; 13322 } 13323 buf = PyUnicode_DATA(result); 13324 llen = PyUnicode_GET_LENGTH(result); 13325 if (llen > INT_MAX) { 13326 PyErr_SetString(PyExc_ValueError, 13327 "string too large in _PyBytes_FormatLong"); 13328 return NULL; 13329 } 13330 len = (int)llen; 13331 sign = buf[0] == '-'; 13332 numnondigits += sign; 13333 numdigits = len - numnondigits; 13334 assert(numdigits > 0); 13335 13336 /* Get rid of base marker unless F_ALT */ 13337 if (((flags & F_ALT) == 0 && 13338 (type == 'o' || type == 'x' || type == 'X'))) { 13339 assert(buf[sign] == '0'); 13340 assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' || 13341 buf[sign+1] == 'o'); 13342 numnondigits -= 2; 13343 buf += 2; 13344 len -= 2; 13345 if (sign) 13346 buf[0] = '-'; 13347 assert(len == numnondigits + numdigits); 13348 assert(numdigits > 0); 13349 } 13350 13351 /* Fill with leading zeroes to meet minimum width. */ 13352 if (prec > numdigits) { 13353 PyObject *r1 = PyBytes_FromStringAndSize(NULL, 13354 numnondigits + prec); 13355 char *b1; 13356 if (!r1) { 13357 Py_DECREF(result); 13358 return NULL; 13359 } 13360 b1 = PyBytes_AS_STRING(r1); 13361 for (i = 0; i < numnondigits; ++i) 13362 *b1++ = *buf++; 13363 for (i = 0; i < prec - numdigits; i++) 13364 *b1++ = '0'; 13365 for (i = 0; i < numdigits; i++) 13366 *b1++ = *buf++; 13367 *b1 = '\0'; 13368 Py_DECREF(result); 13369 result = r1; 13370 buf = PyBytes_AS_STRING(result); 13371 len = numnondigits + prec; 13372 } 13373 13374 /* Fix up case for hex conversions. */ 13375 if (type == 'X') { 13376 /* Need to convert all lower case letters to upper case. 13377 and need to convert 0x to 0X (and -0x to -0X). 
*/ 13378 for (i = 0; i < len; i++) 13379 if (buf[i] >= 'a' && buf[i] <= 'x') 13380 buf[i] -= 'a'-'A'; 13381 } 13382 if (!PyUnicode_Check(result) || len != PyUnicode_GET_LENGTH(result)) { 13383 PyObject *unicode; 13384 unicode = _PyUnicode_FromASCII(buf, len); 13385 Py_DECREF(result); 13386 result = unicode; 13387 } 13388 return result; 13389} 13390 13391static Py_UCS4 13392formatchar(PyObject *v) 13393{ 13394 /* presume that the buffer is at least 3 characters long */ 13395 if (PyUnicode_Check(v)) { 13396 if (PyUnicode_GET_LENGTH(v) == 1) { 13397 return PyUnicode_READ_CHAR(v, 0); 13398 } 13399 goto onError; 13400 } 13401 else { 13402 /* Integer input truncated to a character */ 13403 long x; 13404 x = PyLong_AsLong(v); 13405 if (x == -1 && PyErr_Occurred()) 13406 goto onError; 13407 13408 if (x < 0 || x > MAX_UNICODE) { 13409 PyErr_SetString(PyExc_OverflowError, 13410 "%c arg not in range(0x110000)"); 13411 return (Py_UCS4) -1; 13412 } 13413 13414 return (Py_UCS4) x; 13415 } 13416 13417 onError: 13418 PyErr_SetString(PyExc_TypeError, 13419 "%c requires int or char"); 13420 return (Py_UCS4) -1; 13421} 13422 13423PyObject * 13424PyUnicode_Format(PyObject *format, PyObject *args) 13425{ 13426 Py_ssize_t fmtcnt, fmtpos, arglen, argidx; 13427 int args_owned = 0; 13428 PyObject *dict = NULL; 13429 PyObject *temp = NULL; 13430 PyObject *second = NULL; 13431 PyObject *uformat; 13432 void *fmt; 13433 enum PyUnicode_Kind kind, fmtkind; 13434 _PyUnicodeWriter writer; 13435 Py_ssize_t sublen; 13436 Py_UCS4 maxchar; 13437 13438 if (format == NULL || args == NULL) { 13439 PyErr_BadInternalCall(); 13440 return NULL; 13441 } 13442 uformat = PyUnicode_FromObject(format); 13443 if (uformat == NULL) 13444 return NULL; 13445 if (PyUnicode_READY(uformat) == -1) { 13446 Py_DECREF(uformat); 13447 return NULL; 13448 } 13449 13450 fmt = PyUnicode_DATA(uformat); 13451 fmtkind = PyUnicode_KIND(uformat); 13452 fmtcnt = PyUnicode_GET_LENGTH(uformat); 13453 fmtpos = 0; 13454 13455 
_PyUnicodeWriter_Init(&writer, fmtcnt + 100); 13456 13457 if (PyTuple_Check(args)) { 13458 arglen = PyTuple_Size(args); 13459 argidx = 0; 13460 } 13461 else { 13462 arglen = -1; 13463 argidx = -2; 13464 } 13465 if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args)) 13466 dict = args; 13467 13468 while (--fmtcnt >= 0) { 13469 if (PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') { 13470 Py_ssize_t nonfmtpos; 13471 nonfmtpos = fmtpos++; 13472 while (fmtcnt >= 0 && 13473 PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') { 13474 fmtpos++; 13475 fmtcnt--; 13476 } 13477 if (fmtcnt < 0) 13478 fmtpos--; 13479 sublen = fmtpos - nonfmtpos; 13480 maxchar = _PyUnicode_FindMaxChar(uformat, 13481 nonfmtpos, nonfmtpos + sublen); 13482 if (_PyUnicodeWriter_Prepare(&writer, sublen, maxchar) == -1) 13483 goto onError; 13484 13485 _PyUnicode_FastCopyCharacters(writer.buffer, writer.pos, 13486 uformat, nonfmtpos, sublen); 13487 writer.pos += sublen; 13488 } 13489 else { 13490 /* Got a format specifier */ 13491 int flags = 0; 13492 Py_ssize_t width = -1; 13493 int prec = -1; 13494 Py_UCS4 c = '\0'; 13495 Py_UCS4 fill; 13496 int sign; 13497 Py_UCS4 signchar; 13498 int isnumok; 13499 PyObject *v = NULL; 13500 void *pbuf = NULL; 13501 Py_ssize_t pindex, len; 13502 Py_UCS4 bufmaxchar; 13503 Py_ssize_t buflen; 13504 13505 fmtpos++; 13506 c = PyUnicode_READ(fmtkind, fmt, fmtpos); 13507 if (c == '(') { 13508 Py_ssize_t keystart; 13509 Py_ssize_t keylen; 13510 PyObject *key; 13511 int pcount = 1; 13512 13513 if (dict == NULL) { 13514 PyErr_SetString(PyExc_TypeError, 13515 "format requires a mapping"); 13516 goto onError; 13517 } 13518 ++fmtpos; 13519 --fmtcnt; 13520 keystart = fmtpos; 13521 /* Skip over balanced parentheses */ 13522 while (pcount > 0 && --fmtcnt >= 0) { 13523 c = PyUnicode_READ(fmtkind, fmt, fmtpos); 13524 if (c == ')') 13525 --pcount; 13526 else if (c == '(') 13527 ++pcount; 13528 fmtpos++; 13529 } 13530 keylen = fmtpos - keystart - 1; 13531 if (fmtcnt < 0 || 
pcount > 0) { 13532 PyErr_SetString(PyExc_ValueError, 13533 "incomplete format key"); 13534 goto onError; 13535 } 13536 key = PyUnicode_Substring(uformat, 13537 keystart, keystart + keylen); 13538 if (key == NULL) 13539 goto onError; 13540 if (args_owned) { 13541 Py_DECREF(args); 13542 args_owned = 0; 13543 } 13544 args = PyObject_GetItem(dict, key); 13545 Py_DECREF(key); 13546 if (args == NULL) { 13547 goto onError; 13548 } 13549 args_owned = 1; 13550 arglen = -1; 13551 argidx = -2; 13552 } 13553 while (--fmtcnt >= 0) { 13554 c = PyUnicode_READ(fmtkind, fmt, fmtpos++); 13555 switch (c) { 13556 case '-': flags |= F_LJUST; continue; 13557 case '+': flags |= F_SIGN; continue; 13558 case ' ': flags |= F_BLANK; continue; 13559 case '#': flags |= F_ALT; continue; 13560 case '0': flags |= F_ZERO; continue; 13561 } 13562 break; 13563 } 13564 if (c == '*') { 13565 v = getnextarg(args, arglen, &argidx); 13566 if (v == NULL) 13567 goto onError; 13568 if (!PyLong_Check(v)) { 13569 PyErr_SetString(PyExc_TypeError, 13570 "* wants int"); 13571 goto onError; 13572 } 13573 width = PyLong_AsLong(v); 13574 if (width == -1 && PyErr_Occurred()) 13575 goto onError; 13576 if (width < 0) { 13577 flags |= F_LJUST; 13578 width = -width; 13579 } 13580 if (--fmtcnt >= 0) 13581 c = PyUnicode_READ(fmtkind, fmt, fmtpos++); 13582 } 13583 else if (c >= '0' && c <= '9') { 13584 width = c - '0'; 13585 while (--fmtcnt >= 0) { 13586 c = PyUnicode_READ(fmtkind, fmt, fmtpos++); 13587 if (c < '0' || c > '9') 13588 break; 13589 /* Since c is unsigned, the RHS would end up as unsigned, 13590 mixing signed and unsigned comparison. Since c is between 13591 '0' and '9', casting to int is safe. 
*/ 13592 if (width > (PY_SSIZE_T_MAX - ((int)c - '0')) / 10) { 13593 PyErr_SetString(PyExc_ValueError, 13594 "width too big"); 13595 goto onError; 13596 } 13597 width = width*10 + (c - '0'); 13598 } 13599 } 13600 if (c == '.') { 13601 prec = 0; 13602 if (--fmtcnt >= 0) 13603 c = PyUnicode_READ(fmtkind, fmt, fmtpos++); 13604 if (c == '*') { 13605 v = getnextarg(args, arglen, &argidx); 13606 if (v == NULL) 13607 goto onError; 13608 if (!PyLong_Check(v)) { 13609 PyErr_SetString(PyExc_TypeError, 13610 "* wants int"); 13611 goto onError; 13612 } 13613 prec = PyLong_AsLong(v); 13614 if (prec == -1 && PyErr_Occurred()) 13615 goto onError; 13616 if (prec < 0) 13617 prec = 0; 13618 if (--fmtcnt >= 0) 13619 c = PyUnicode_READ(fmtkind, fmt, fmtpos++); 13620 } 13621 else if (c >= '0' && c <= '9') { 13622 prec = c - '0'; 13623 while (--fmtcnt >= 0) { 13624 c = PyUnicode_READ(fmtkind, fmt, fmtpos++); 13625 if (c < '0' || c > '9') 13626 break; 13627 if (prec > (INT_MAX - ((int)c - '0')) / 10) { 13628 PyErr_SetString(PyExc_ValueError, 13629 "prec too big"); 13630 goto onError; 13631 } 13632 prec = prec*10 + (c - '0'); 13633 } 13634 } 13635 } /* prec */ 13636 if (fmtcnt >= 0) { 13637 if (c == 'h' || c == 'l' || c == 'L') { 13638 if (--fmtcnt >= 0) 13639 c = PyUnicode_READ(fmtkind, fmt, fmtpos++); 13640 } 13641 } 13642 if (fmtcnt < 0) { 13643 PyErr_SetString(PyExc_ValueError, 13644 "incomplete format"); 13645 goto onError; 13646 } 13647 if (fmtcnt == 0) 13648 writer.overallocate = 0; 13649 13650 if (c == '%') { 13651 if (_PyUnicodeWriter_Prepare(&writer, 1, '%') == -1) 13652 goto onError; 13653 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, '%'); 13654 writer.pos += 1; 13655 continue; 13656 } 13657 13658 v = getnextarg(args, arglen, &argidx); 13659 if (v == NULL) 13660 goto onError; 13661 13662 sign = 0; 13663 signchar = '\0'; 13664 fill = ' '; 13665 switch (c) { 13666 13667 case 's': 13668 case 'r': 13669 case 'a': 13670 if (PyLong_CheckExact(v) && width == -1 && prec == -1) 
{ 13671 /* Fast path */ 13672 if (_PyLong_FormatWriter(&writer, v, 10, flags & F_ALT) == -1) 13673 goto onError; 13674 goto nextarg; 13675 } 13676 13677 if (PyUnicode_CheckExact(v) && c == 's') { 13678 temp = v; 13679 Py_INCREF(temp); 13680 } 13681 else { 13682 if (c == 's') 13683 temp = PyObject_Str(v); 13684 else if (c == 'r') 13685 temp = PyObject_Repr(v); 13686 else 13687 temp = PyObject_ASCII(v); 13688 } 13689 break; 13690 13691 case 'i': 13692 case 'd': 13693 case 'u': 13694 case 'o': 13695 case 'x': 13696 case 'X': 13697 if (PyLong_CheckExact(v) 13698 && width == -1 && prec == -1 13699 && !(flags & (F_SIGN | F_BLANK))) 13700 { 13701 /* Fast path */ 13702 switch(c) 13703 { 13704 case 'd': 13705 case 'i': 13706 case 'u': 13707 if (_PyLong_FormatWriter(&writer, v, 10, flags & F_ALT) == -1) 13708 goto onError; 13709 goto nextarg; 13710 case 'x': 13711 if (_PyLong_FormatWriter(&writer, v, 16, flags & F_ALT) == -1) 13712 goto onError; 13713 goto nextarg; 13714 case 'o': 13715 if (_PyLong_FormatWriter(&writer, v, 8, flags & F_ALT) == -1) 13716 goto onError; 13717 goto nextarg; 13718 default: 13719 break; 13720 } 13721 } 13722 13723 isnumok = 0; 13724 if (PyNumber_Check(v)) { 13725 PyObject *iobj=NULL; 13726 13727 if (PyLong_Check(v)) { 13728 iobj = v; 13729 Py_INCREF(iobj); 13730 } 13731 else { 13732 iobj = PyNumber_Long(v); 13733 } 13734 if (iobj!=NULL) { 13735 if (PyLong_Check(iobj)) { 13736 isnumok = 1; 13737 sign = 1; 13738 temp = formatlong(iobj, flags, prec, (c == 'i'? 
'd': c)); 13739 Py_DECREF(iobj); 13740 } 13741 else { 13742 Py_DECREF(iobj); 13743 } 13744 } 13745 } 13746 if (!isnumok) { 13747 PyErr_Format(PyExc_TypeError, 13748 "%%%c format: a number is required, " 13749 "not %.200s", (char)c, Py_TYPE(v)->tp_name); 13750 goto onError; 13751 } 13752 if (flags & F_ZERO) 13753 fill = '0'; 13754 break; 13755 13756 case 'e': 13757 case 'E': 13758 case 'f': 13759 case 'F': 13760 case 'g': 13761 case 'G': 13762 if (width == -1 && prec == -1 13763 && !(flags & (F_SIGN | F_BLANK))) 13764 { 13765 /* Fast path */ 13766 if (formatfloat(v, flags, prec, c, NULL, &writer) == -1) 13767 goto onError; 13768 goto nextarg; 13769 } 13770 13771 sign = 1; 13772 if (flags & F_ZERO) 13773 fill = '0'; 13774 if (formatfloat(v, flags, prec, c, &temp, NULL) == -1) 13775 temp = NULL; 13776 break; 13777 13778 case 'c': 13779 { 13780 Py_UCS4 ch = formatchar(v); 13781 if (ch == (Py_UCS4) -1) 13782 goto onError; 13783 if (width == -1 && prec == -1) { 13784 /* Fast path */ 13785 if (_PyUnicodeWriter_Prepare(&writer, 1, ch) == -1) 13786 goto onError; 13787 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, ch); 13788 writer.pos += 1; 13789 goto nextarg; 13790 } 13791 temp = PyUnicode_FromOrdinal(ch); 13792 break; 13793 } 13794 13795 default: 13796 PyErr_Format(PyExc_ValueError, 13797 "unsupported format character '%c' (0x%x) " 13798 "at index %zd", 13799 (31<=c && c<=126) ? 
(char)c : '?', 13800 (int)c, 13801 fmtpos - 1); 13802 goto onError; 13803 } 13804 if (temp == NULL) 13805 goto onError; 13806 assert (PyUnicode_Check(temp)); 13807 13808 if (width == -1 && prec == -1 13809 && !(flags & (F_SIGN | F_BLANK))) 13810 { 13811 /* Fast path */ 13812 if (_PyUnicodeWriter_WriteStr(&writer, temp) == -1) 13813 goto onError; 13814 goto nextarg; 13815 } 13816 13817 if (PyUnicode_READY(temp) == -1) { 13818 Py_CLEAR(temp); 13819 goto onError; 13820 } 13821 kind = PyUnicode_KIND(temp); 13822 pbuf = PyUnicode_DATA(temp); 13823 len = PyUnicode_GET_LENGTH(temp); 13824 13825 if (c == 's' || c == 'r' || c == 'a') { 13826 if (prec >= 0 && len > prec) 13827 len = prec; 13828 } 13829 13830 /* pbuf is initialized here. */ 13831 pindex = 0; 13832 if (sign) { 13833 Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex); 13834 if (ch == '-' || ch == '+') { 13835 signchar = ch; 13836 len--; 13837 pindex++; 13838 } 13839 else if (flags & F_SIGN) 13840 signchar = '+'; 13841 else if (flags & F_BLANK) 13842 signchar = ' '; 13843 else 13844 sign = 0; 13845 } 13846 if (width < len) 13847 width = len; 13848 13849 /* Compute the length and maximum character of the 13850 written characters */ 13851 bufmaxchar = 127; 13852 if (!(flags & F_LJUST)) { 13853 if (sign) { 13854 if ((width-1) > len) 13855 bufmaxchar = MAX_MAXCHAR(bufmaxchar, fill); 13856 } 13857 else { 13858 if (width > len) 13859 bufmaxchar = MAX_MAXCHAR(bufmaxchar, fill); 13860 } 13861 } 13862 maxchar = _PyUnicode_FindMaxChar(temp, 0, pindex+len); 13863 bufmaxchar = MAX_MAXCHAR(bufmaxchar, maxchar); 13864 13865 buflen = width; 13866 if (sign && len == width) 13867 buflen++; 13868 13869 if (_PyUnicodeWriter_Prepare(&writer, buflen, bufmaxchar) == -1) 13870 goto onError; 13871 13872 /* Write characters */ 13873 if (sign) { 13874 if (fill != ' ') { 13875 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, signchar); 13876 writer.pos += 1; 13877 } 13878 if (width > len) 13879 width--; 13880 } 13881 if ((flags & F_ALT) 
&& (c == 'x' || c == 'X' || c == 'o')) { 13882 assert(PyUnicode_READ(kind, pbuf, pindex) == '0'); 13883 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == c); 13884 if (fill != ' ') { 13885 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, '0'); 13886 PyUnicode_WRITE(writer.kind, writer.data, writer.pos+1, c); 13887 writer.pos += 2; 13888 pindex += 2; 13889 } 13890 width -= 2; 13891 if (width < 0) 13892 width = 0; 13893 len -= 2; 13894 } 13895 if (width > len && !(flags & F_LJUST)) { 13896 sublen = width - len; 13897 FILL(writer.kind, writer.data, fill, writer.pos, sublen); 13898 writer.pos += sublen; 13899 width = len; 13900 } 13901 if (fill == ' ') { 13902 if (sign) { 13903 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, signchar); 13904 writer.pos += 1; 13905 } 13906 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) { 13907 assert(PyUnicode_READ(kind, pbuf, pindex) == '0'); 13908 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c); 13909 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, '0'); 13910 PyUnicode_WRITE(writer.kind, writer.data, writer.pos+1, c); 13911 writer.pos += 2; 13912 pindex += 2; 13913 } 13914 } 13915 13916 if (len) { 13917 _PyUnicode_FastCopyCharacters(writer.buffer, writer.pos, 13918 temp, pindex, len); 13919 writer.pos += len; 13920 } 13921 if (width > len) { 13922 sublen = width - len; 13923 FILL(writer.kind, writer.data, ' ', writer.pos, sublen); 13924 writer.pos += sublen; 13925 } 13926 13927nextarg: 13928 if (dict && (argidx < arglen) && c != '%') { 13929 PyErr_SetString(PyExc_TypeError, 13930 "not all arguments converted during string formatting"); 13931 goto onError; 13932 } 13933 Py_CLEAR(temp); 13934 } /* '%' */ 13935 } /* until end */ 13936 if (argidx < arglen && !dict) { 13937 PyErr_SetString(PyExc_TypeError, 13938 "not all arguments converted during string formatting"); 13939 goto onError; 13940 } 13941 13942 if (args_owned) { 13943 Py_DECREF(args); 13944 } 13945 Py_DECREF(uformat); 13946 Py_XDECREF(temp); 
13947 Py_XDECREF(second); 13948 return _PyUnicodeWriter_Finish(&writer); 13949 13950 onError: 13951 Py_DECREF(uformat); 13952 Py_XDECREF(temp); 13953 Py_XDECREF(second); 13954 _PyUnicodeWriter_Dealloc(&writer); 13955 if (args_owned) { 13956 Py_DECREF(args); 13957 } 13958 return NULL; 13959} 13960 13961static PyObject * 13962unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds); 13963 13964static PyObject * 13965unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds) 13966{ 13967 PyObject *x = NULL; 13968 static char *kwlist[] = {"object", "encoding", "errors", 0}; 13969 char *encoding = NULL; 13970 char *errors = NULL; 13971 13972 if (type != &PyUnicode_Type) 13973 return unicode_subtype_new(type, args, kwds); 13974 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str", 13975 kwlist, &x, &encoding, &errors)) 13976 return NULL; 13977 if (x == NULL) { 13978 Py_INCREF(unicode_empty); 13979 return unicode_empty; 13980 } 13981 if (encoding == NULL && errors == NULL) 13982 return PyObject_Str(x); 13983 else 13984 return PyUnicode_FromEncodedObject(x, encoding, errors); 13985} 13986 13987static PyObject * 13988unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds) 13989{ 13990 PyObject *unicode, *self; 13991 Py_ssize_t length, char_size; 13992 int share_wstr, share_utf8; 13993 unsigned int kind; 13994 void *data; 13995 13996 assert(PyType_IsSubtype(type, &PyUnicode_Type)); 13997 13998 unicode = unicode_new(&PyUnicode_Type, args, kwds); 13999 if (unicode == NULL) 14000 return NULL; 14001 assert(_PyUnicode_CHECK(unicode)); 14002 if (PyUnicode_READY(unicode) == -1) { 14003 Py_DECREF(unicode); 14004 return NULL; 14005 } 14006 14007 self = type->tp_alloc(type, 0); 14008 if (self == NULL) { 14009 Py_DECREF(unicode); 14010 return NULL; 14011 } 14012 kind = PyUnicode_KIND(unicode); 14013 length = PyUnicode_GET_LENGTH(unicode); 14014 14015 _PyUnicode_LENGTH(self) = length; 14016#ifdef Py_DEBUG 14017 _PyUnicode_HASH(self) = -1; 
14018#else 14019 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode); 14020#endif 14021 _PyUnicode_STATE(self).interned = 0; 14022 _PyUnicode_STATE(self).kind = kind; 14023 _PyUnicode_STATE(self).compact = 0; 14024 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii; 14025 _PyUnicode_STATE(self).ready = 1; 14026 _PyUnicode_WSTR(self) = NULL; 14027 _PyUnicode_UTF8_LENGTH(self) = 0; 14028 _PyUnicode_UTF8(self) = NULL; 14029 _PyUnicode_WSTR_LENGTH(self) = 0; 14030 _PyUnicode_DATA_ANY(self) = NULL; 14031 14032 share_utf8 = 0; 14033 share_wstr = 0; 14034 if (kind == PyUnicode_1BYTE_KIND) { 14035 char_size = 1; 14036 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128) 14037 share_utf8 = 1; 14038 } 14039 else if (kind == PyUnicode_2BYTE_KIND) { 14040 char_size = 2; 14041 if (sizeof(wchar_t) == 2) 14042 share_wstr = 1; 14043 } 14044 else { 14045 assert(kind == PyUnicode_4BYTE_KIND); 14046 char_size = 4; 14047 if (sizeof(wchar_t) == 4) 14048 share_wstr = 1; 14049 } 14050 14051 /* Ensure we won't overflow the length. 
*/ 14052 if (length > (PY_SSIZE_T_MAX / char_size - 1)) { 14053 PyErr_NoMemory(); 14054 goto onError; 14055 } 14056 data = PyObject_MALLOC((length + 1) * char_size); 14057 if (data == NULL) { 14058 PyErr_NoMemory(); 14059 goto onError; 14060 } 14061 14062 _PyUnicode_DATA_ANY(self) = data; 14063 if (share_utf8) { 14064 _PyUnicode_UTF8_LENGTH(self) = length; 14065 _PyUnicode_UTF8(self) = data; 14066 } 14067 if (share_wstr) { 14068 _PyUnicode_WSTR_LENGTH(self) = length; 14069 _PyUnicode_WSTR(self) = (wchar_t *)data; 14070 } 14071 14072 Py_MEMCPY(data, PyUnicode_DATA(unicode), 14073 kind * (length + 1)); 14074 assert(_PyUnicode_CheckConsistency(self, 1)); 14075#ifdef Py_DEBUG 14076 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode); 14077#endif 14078 Py_DECREF(unicode); 14079 return self; 14080 14081onError: 14082 Py_DECREF(unicode); 14083 Py_DECREF(self); 14084 return NULL; 14085} 14086 14087PyDoc_STRVAR(unicode_doc, 14088 "str(object[, encoding[, errors]]) -> str\n\ 14089\n\ 14090Create a new string object from the given object. 
If encoding or\n\ 14091errors is specified, then the object must expose a data buffer\n\ 14092that will be decoded using the given encoding and error handler.\n\ 14093Otherwise, returns the result of object.__str__() (if defined)\n\ 14094or repr(object).\n\ 14095encoding defaults to sys.getdefaultencoding().\n\ 14096errors defaults to 'strict'."); 14097 14098static PyObject *unicode_iter(PyObject *seq); 14099 14100PyTypeObject PyUnicode_Type = { 14101 PyVarObject_HEAD_INIT(&PyType_Type, 0) 14102 "str", /* tp_name */ 14103 sizeof(PyUnicodeObject), /* tp_size */ 14104 0, /* tp_itemsize */ 14105 /* Slots */ 14106 (destructor)unicode_dealloc, /* tp_dealloc */ 14107 0, /* tp_print */ 14108 0, /* tp_getattr */ 14109 0, /* tp_setattr */ 14110 0, /* tp_reserved */ 14111 unicode_repr, /* tp_repr */ 14112 &unicode_as_number, /* tp_as_number */ 14113 &unicode_as_sequence, /* tp_as_sequence */ 14114 &unicode_as_mapping, /* tp_as_mapping */ 14115 (hashfunc) unicode_hash, /* tp_hash*/ 14116 0, /* tp_call*/ 14117 (reprfunc) unicode_str, /* tp_str */ 14118 PyObject_GenericGetAttr, /* tp_getattro */ 14119 0, /* tp_setattro */ 14120 0, /* tp_as_buffer */ 14121 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE | 14122 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */ 14123 unicode_doc, /* tp_doc */ 14124 0, /* tp_traverse */ 14125 0, /* tp_clear */ 14126 PyUnicode_RichCompare, /* tp_richcompare */ 14127 0, /* tp_weaklistoffset */ 14128 unicode_iter, /* tp_iter */ 14129 0, /* tp_iternext */ 14130 unicode_methods, /* tp_methods */ 14131 0, /* tp_members */ 14132 0, /* tp_getset */ 14133 &PyBaseObject_Type, /* tp_base */ 14134 0, /* tp_dict */ 14135 0, /* tp_descr_get */ 14136 0, /* tp_descr_set */ 14137 0, /* tp_dictoffset */ 14138 0, /* tp_init */ 14139 0, /* tp_alloc */ 14140 unicode_new, /* tp_new */ 14141 PyObject_Del, /* tp_free */ 14142}; 14143 14144/* Initialize the Unicode implementation */ 14145 14146int _PyUnicode_Init(void) 14147{ 14148 int i; 14149 14150 /* XXX - move this array to 
unicodectype.c ? */ 14151 Py_UCS2 linebreak[] = { 14152 0x000A, /* LINE FEED */ 14153 0x000D, /* CARRIAGE RETURN */ 14154 0x001C, /* FILE SEPARATOR */ 14155 0x001D, /* GROUP SEPARATOR */ 14156 0x001E, /* RECORD SEPARATOR */ 14157 0x0085, /* NEXT LINE */ 14158 0x2028, /* LINE SEPARATOR */ 14159 0x2029, /* PARAGRAPH SEPARATOR */ 14160 }; 14161 14162 /* Init the implementation */ 14163 unicode_empty = PyUnicode_New(0, 0); 14164 if (!unicode_empty) 14165 Py_FatalError("Can't create empty string"); 14166 assert(_PyUnicode_CheckConsistency(unicode_empty, 1)); 14167 14168 for (i = 0; i < 256; i++) 14169 unicode_latin1[i] = NULL; 14170 if (PyType_Ready(&PyUnicode_Type) < 0) 14171 Py_FatalError("Can't initialize 'unicode'"); 14172 14173 /* initialize the linebreak bloom filter */ 14174 bloom_linebreak = make_bloom_mask( 14175 PyUnicode_2BYTE_KIND, linebreak, 14176 Py_ARRAY_LENGTH(linebreak)); 14177 14178 PyType_Ready(&EncodingMapType); 14179 14180#ifdef HAVE_MBCS 14181 winver.dwOSVersionInfoSize = sizeof(winver); 14182 if (!GetVersionEx((OSVERSIONINFO*)&winver)) { 14183 PyErr_SetFromWindowsErr(0); 14184 return -1; 14185 } 14186#endif 14187 return 0; 14188} 14189 14190/* Finalize the Unicode implementation */ 14191 14192int 14193PyUnicode_ClearFreeList(void) 14194{ 14195 return 0; 14196} 14197 14198void 14199_PyUnicode_Fini(void) 14200{ 14201 int i; 14202 14203 Py_XDECREF(unicode_empty); 14204 unicode_empty = NULL; 14205 14206 for (i = 0; i < 256; i++) { 14207 if (unicode_latin1[i]) { 14208 Py_DECREF(unicode_latin1[i]); 14209 unicode_latin1[i] = NULL; 14210 } 14211 } 14212 _PyUnicode_ClearStaticStrings(); 14213 (void)PyUnicode_ClearFreeList(); 14214} 14215 14216void 14217PyUnicode_InternInPlace(PyObject **p) 14218{ 14219 register PyObject *s = *p; 14220 PyObject *t; 14221#ifdef Py_DEBUG 14222 assert(s != NULL); 14223 assert(_PyUnicode_CHECK(s)); 14224#else 14225 if (s == NULL || !PyUnicode_Check(s)) 14226 return; 14227#endif 14228 /* If it's a subclass, we don't really know 
what putting 14229 it in the interned dict might do. */ 14230 if (!PyUnicode_CheckExact(s)) 14231 return; 14232 if (PyUnicode_CHECK_INTERNED(s)) 14233 return; 14234 if (interned == NULL) { 14235 interned = PyDict_New(); 14236 if (interned == NULL) { 14237 PyErr_Clear(); /* Don't leave an exception */ 14238 return; 14239 } 14240 } 14241 /* It might be that the GetItem call fails even 14242 though the key is present in the dictionary, 14243 namely when this happens during a stack overflow. */ 14244 Py_ALLOW_RECURSION 14245 t = PyDict_GetItem(interned, s); 14246 Py_END_ALLOW_RECURSION 14247 14248 if (t) { 14249 Py_INCREF(t); 14250 Py_DECREF(*p); 14251 *p = t; 14252 return; 14253 } 14254 14255 PyThreadState_GET()->recursion_critical = 1; 14256 if (PyDict_SetItem(interned, s, s) < 0) { 14257 PyErr_Clear(); 14258 PyThreadState_GET()->recursion_critical = 0; 14259 return; 14260 } 14261 PyThreadState_GET()->recursion_critical = 0; 14262 /* The two references in interned are not counted by refcnt. 
14263 The deallocator will take care of this */ 14264 Py_REFCNT(s) -= 2; 14265 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL; 14266} 14267 14268void 14269PyUnicode_InternImmortal(PyObject **p) 14270{ 14271 PyUnicode_InternInPlace(p); 14272 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) { 14273 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL; 14274 Py_INCREF(*p); 14275 } 14276} 14277 14278PyObject * 14279PyUnicode_InternFromString(const char *cp) 14280{ 14281 PyObject *s = PyUnicode_FromString(cp); 14282 if (s == NULL) 14283 return NULL; 14284 PyUnicode_InternInPlace(&s); 14285 return s; 14286} 14287 14288void 14289_Py_ReleaseInternedUnicodeStrings(void) 14290{ 14291 PyObject *keys; 14292 PyObject *s; 14293 Py_ssize_t i, n; 14294 Py_ssize_t immortal_size = 0, mortal_size = 0; 14295 14296 if (interned == NULL || !PyDict_Check(interned)) 14297 return; 14298 keys = PyDict_Keys(interned); 14299 if (keys == NULL || !PyList_Check(keys)) { 14300 PyErr_Clear(); 14301 return; 14302 } 14303 14304 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak 14305 detector, interned unicode strings are not forcibly deallocated; 14306 rather, we give them their stolen references back, and then clear 14307 and DECREF the interned dict. 
*/ 14308 14309 n = PyList_GET_SIZE(keys); 14310 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n", 14311 n); 14312 for (i = 0; i < n; i++) { 14313 s = PyList_GET_ITEM(keys, i); 14314 if (PyUnicode_READY(s) == -1) { 14315 assert(0 && "could not ready string"); 14316 fprintf(stderr, "could not ready string\n"); 14317 } 14318 switch (PyUnicode_CHECK_INTERNED(s)) { 14319 case SSTATE_NOT_INTERNED: 14320 /* XXX Shouldn't happen */ 14321 break; 14322 case SSTATE_INTERNED_IMMORTAL: 14323 Py_REFCNT(s) += 1; 14324 immortal_size += PyUnicode_GET_LENGTH(s); 14325 break; 14326 case SSTATE_INTERNED_MORTAL: 14327 Py_REFCNT(s) += 2; 14328 mortal_size += PyUnicode_GET_LENGTH(s); 14329 break; 14330 default: 14331 Py_FatalError("Inconsistent interned string state."); 14332 } 14333 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED; 14334 } 14335 fprintf(stderr, "total size of all interned strings: " 14336 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d " 14337 "mortal/immortal\n", mortal_size, immortal_size); 14338 Py_DECREF(keys); 14339 PyDict_Clear(interned); 14340 Py_DECREF(interned); 14341 interned = NULL; 14342} 14343 14344 14345/********************* Unicode Iterator **************************/ 14346 14347typedef struct { 14348 PyObject_HEAD 14349 Py_ssize_t it_index; 14350 PyObject *it_seq; /* Set to NULL when iterator is exhausted */ 14351} unicodeiterobject; 14352 14353static void 14354unicodeiter_dealloc(unicodeiterobject *it) 14355{ 14356 _PyObject_GC_UNTRACK(it); 14357 Py_XDECREF(it->it_seq); 14358 PyObject_GC_Del(it); 14359} 14360 14361static int 14362unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg) 14363{ 14364 Py_VISIT(it->it_seq); 14365 return 0; 14366} 14367 14368static PyObject * 14369unicodeiter_next(unicodeiterobject *it) 14370{ 14371 PyObject *seq, *item; 14372 14373 assert(it != NULL); 14374 seq = it->it_seq; 14375 if (seq == NULL) 14376 return NULL; 14377 assert(_PyUnicode_CHECK(seq)); 14378 14379 if (it->it_index < 
PyUnicode_GET_LENGTH(seq)) { 14380 int kind = PyUnicode_KIND(seq); 14381 void *data = PyUnicode_DATA(seq); 14382 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index); 14383 item = PyUnicode_FromOrdinal(chr); 14384 if (item != NULL) 14385 ++it->it_index; 14386 return item; 14387 } 14388 14389 Py_DECREF(seq); 14390 it->it_seq = NULL; 14391 return NULL; 14392} 14393 14394static PyObject * 14395unicodeiter_len(unicodeiterobject *it) 14396{ 14397 Py_ssize_t len = 0; 14398 if (it->it_seq) 14399 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index; 14400 return PyLong_FromSsize_t(len); 14401} 14402 14403PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it))."); 14404 14405static PyObject * 14406unicodeiter_reduce(unicodeiterobject *it) 14407{ 14408 if (it->it_seq != NULL) { 14409 return Py_BuildValue("N(O)n", _PyObject_GetBuiltin("iter"), 14410 it->it_seq, it->it_index); 14411 } else { 14412 PyObject *u = PyUnicode_FromUnicode(NULL, 0); 14413 if (u == NULL) 14414 return NULL; 14415 return Py_BuildValue("N(N)", _PyObject_GetBuiltin("iter"), u); 14416 } 14417} 14418 14419PyDoc_STRVAR(reduce_doc, "Return state information for pickling."); 14420 14421static PyObject * 14422unicodeiter_setstate(unicodeiterobject *it, PyObject *state) 14423{ 14424 Py_ssize_t index = PyLong_AsSsize_t(state); 14425 if (index == -1 && PyErr_Occurred()) 14426 return NULL; 14427 if (index < 0) 14428 index = 0; 14429 it->it_index = index; 14430 Py_RETURN_NONE; 14431} 14432 14433PyDoc_STRVAR(setstate_doc, "Set state information for unpickling."); 14434 14435static PyMethodDef unicodeiter_methods[] = { 14436 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS, 14437 length_hint_doc}, 14438 {"__reduce__", (PyCFunction)unicodeiter_reduce, METH_NOARGS, 14439 reduce_doc}, 14440 {"__setstate__", (PyCFunction)unicodeiter_setstate, METH_O, 14441 setstate_doc}, 14442 {NULL, NULL} /* sentinel */ 14443}; 14444 14445PyTypeObject PyUnicodeIter_Type = { 14446 
PyVarObject_HEAD_INIT(&PyType_Type, 0) 14447 "str_iterator", /* tp_name */ 14448 sizeof(unicodeiterobject), /* tp_basicsize */ 14449 0, /* tp_itemsize */ 14450 /* methods */ 14451 (destructor)unicodeiter_dealloc, /* tp_dealloc */ 14452 0, /* tp_print */ 14453 0, /* tp_getattr */ 14454 0, /* tp_setattr */ 14455 0, /* tp_reserved */ 14456 0, /* tp_repr */ 14457 0, /* tp_as_number */ 14458 0, /* tp_as_sequence */ 14459 0, /* tp_as_mapping */ 14460 0, /* tp_hash */ 14461 0, /* tp_call */ 14462 0, /* tp_str */ 14463 PyObject_GenericGetAttr, /* tp_getattro */ 14464 0, /* tp_setattro */ 14465 0, /* tp_as_buffer */ 14466 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */ 14467 0, /* tp_doc */ 14468 (traverseproc)unicodeiter_traverse, /* tp_traverse */ 14469 0, /* tp_clear */ 14470 0, /* tp_richcompare */ 14471 0, /* tp_weaklistoffset */ 14472 PyObject_SelfIter, /* tp_iter */ 14473 (iternextfunc)unicodeiter_next, /* tp_iternext */ 14474 unicodeiter_methods, /* tp_methods */ 14475 0, 14476}; 14477 14478static PyObject * 14479unicode_iter(PyObject *seq) 14480{ 14481 unicodeiterobject *it; 14482 14483 if (!PyUnicode_Check(seq)) { 14484 PyErr_BadInternalCall(); 14485 return NULL; 14486 } 14487 if (PyUnicode_READY(seq) == -1) 14488 return NULL; 14489 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type); 14490 if (it == NULL) 14491 return NULL; 14492 it->it_index = 0; 14493 Py_INCREF(seq); 14494 it->it_seq = seq; 14495 _PyObject_GC_TRACK(it); 14496 return (PyObject *)it; 14497} 14498 14499 14500size_t 14501Py_UNICODE_strlen(const Py_UNICODE *u) 14502{ 14503 int res = 0; 14504 while(*u++) 14505 res++; 14506 return res; 14507} 14508 14509Py_UNICODE* 14510Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2) 14511{ 14512 Py_UNICODE *u = s1; 14513 while ((*u++ = *s2++)); 14514 return s1; 14515} 14516 14517Py_UNICODE* 14518Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n) 14519{ 14520 Py_UNICODE *u = s1; 14521 while ((*u++ = *s2++)) 14522 if (n-- == 0) 
14523 break; 14524 return s1; 14525} 14526 14527Py_UNICODE* 14528Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2) 14529{ 14530 Py_UNICODE *u1 = s1; 14531 u1 += Py_UNICODE_strlen(u1); 14532 Py_UNICODE_strcpy(u1, s2); 14533 return s1; 14534} 14535 14536int 14537Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2) 14538{ 14539 while (*s1 && *s2 && *s1 == *s2) 14540 s1++, s2++; 14541 if (*s1 && *s2) 14542 return (*s1 < *s2) ? -1 : +1; 14543 if (*s1) 14544 return 1; 14545 if (*s2) 14546 return -1; 14547 return 0; 14548} 14549 14550int 14551Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n) 14552{ 14553 register Py_UNICODE u1, u2; 14554 for (; n != 0; n--) { 14555 u1 = *s1; 14556 u2 = *s2; 14557 if (u1 != u2) 14558 return (u1 < u2) ? -1 : +1; 14559 if (u1 == '\0') 14560 return 0; 14561 s1++; 14562 s2++; 14563 } 14564 return 0; 14565} 14566 14567Py_UNICODE* 14568Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c) 14569{ 14570 const Py_UNICODE *p; 14571 for (p = s; *p; p++) 14572 if (*p == c) 14573 return (Py_UNICODE*)p; 14574 return NULL; 14575} 14576 14577Py_UNICODE* 14578Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c) 14579{ 14580 const Py_UNICODE *p; 14581 p = s + Py_UNICODE_strlen(s); 14582 while (p != s) { 14583 p--; 14584 if (*p == c) 14585 return (Py_UNICODE*)p; 14586 } 14587 return NULL; 14588} 14589 14590Py_UNICODE* 14591PyUnicode_AsUnicodeCopy(PyObject *unicode) 14592{ 14593 Py_UNICODE *u, *copy; 14594 Py_ssize_t len, size; 14595 14596 if (!PyUnicode_Check(unicode)) { 14597 PyErr_BadArgument(); 14598 return NULL; 14599 } 14600 u = PyUnicode_AsUnicodeAndSize(unicode, &len); 14601 if (u == NULL) 14602 return NULL; 14603 /* Ensure we won't overflow the size. 
*/ 14604 if (len > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) { 14605 PyErr_NoMemory(); 14606 return NULL; 14607 } 14608 size = len + 1; /* copy the null character */ 14609 size *= sizeof(Py_UNICODE); 14610 copy = PyMem_Malloc(size); 14611 if (copy == NULL) { 14612 PyErr_NoMemory(); 14613 return NULL; 14614 } 14615 memcpy(copy, u, size); 14616 return copy; 14617} 14618 14619/* A _string module, to export formatter_parser and formatter_field_name_split 14620 to the string.Formatter class implemented in Python. */ 14621 14622static PyMethodDef _string_methods[] = { 14623 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split, 14624 METH_O, PyDoc_STR("split the argument as a field name")}, 14625 {"formatter_parser", (PyCFunction) formatter_parser, 14626 METH_O, PyDoc_STR("parse the argument as a format string")}, 14627 {NULL, NULL} 14628}; 14629 14630static struct PyModuleDef _string_module = { 14631 PyModuleDef_HEAD_INIT, 14632 "_string", 14633 PyDoc_STR("string helper module"), 14634 0, 14635 _string_methods, 14636 NULL, 14637 NULL, 14638 NULL, 14639 NULL 14640}; 14641 14642PyMODINIT_FUNC 14643PyInit__string(void) 14644{ 14645 return PyModule_Create(&_string_module); 14646} 14647 14648 14649#ifdef __cplusplus 14650} 14651#endif 14652