unicodeobject.c revision 0030cd52dacdd95d2017a0947d661feb737449af
1/* 2 3Unicode implementation based on original code by Fredrik Lundh, 4modified by Marc-Andre Lemburg <mal@lemburg.com>. 5 6Major speed upgrades to the method implementations at the Reykjavik 7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke. 8 9Copyright (c) Corporation for National Research Initiatives. 10 11-------------------------------------------------------------------- 12The original string type implementation is: 13 14 Copyright (c) 1999 by Secret Labs AB 15 Copyright (c) 1999 by Fredrik Lundh 16 17By obtaining, using, and/or copying this software and/or its 18associated documentation, you agree that you have read, understood, 19and will comply with the following terms and conditions: 20 21Permission to use, copy, modify, and distribute this software and its 22associated documentation for any purpose and without fee is hereby 23granted, provided that the above copyright notice appears in all 24copies, and that both that copyright notice and this permission notice 25appear in supporting documentation, and that the name of Secret Labs 26AB or the author not be used in advertising or publicity pertaining to 27distribution of the software without specific, written prior 28permission. 29 30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO 31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND 32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR 33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT 36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
37-------------------------------------------------------------------- 38 39*/ 40 41#define PY_SSIZE_T_CLEAN 42#include "Python.h" 43#include "ucnhash.h" 44#include "bytes_methods.h" 45#include "stringlib/eq.h" 46 47#ifdef MS_WINDOWS 48#include <windows.h> 49#endif 50 51/*[clinic input] 52class str "PyUnicodeObject *" "&PyUnicode_Type" 53[clinic start generated code]*/ 54/*[clinic end generated code: output=da39a3ee5e6b4b0d input=604e916854800fa8]*/ 55 56/* --- Globals ------------------------------------------------------------ 57 58NOTE: In the interpreter's initialization phase, some globals are currently 59 initialized dynamically as needed. In the process Unicode objects may 60 be created before the Unicode type is ready. 61 62*/ 63 64 65#ifdef __cplusplus 66extern "C" { 67#endif 68 69/* Maximum code point of Unicode 6.0: 0x10ffff (1,114,111) */ 70#define MAX_UNICODE 0x10ffff 71 72#ifdef Py_DEBUG 73# define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0) 74#else 75# define _PyUnicode_CHECK(op) PyUnicode_Check(op) 76#endif 77 78#define _PyUnicode_UTF8(op) \ 79 (((PyCompactUnicodeObject*)(op))->utf8) 80#define PyUnicode_UTF8(op) \ 81 (assert(_PyUnicode_CHECK(op)), \ 82 assert(PyUnicode_IS_READY(op)), \ 83 PyUnicode_IS_COMPACT_ASCII(op) ? \ 84 ((char*)((PyASCIIObject*)(op) + 1)) : \ 85 _PyUnicode_UTF8(op)) 86#define _PyUnicode_UTF8_LENGTH(op) \ 87 (((PyCompactUnicodeObject*)(op))->utf8_length) 88#define PyUnicode_UTF8_LENGTH(op) \ 89 (assert(_PyUnicode_CHECK(op)), \ 90 assert(PyUnicode_IS_READY(op)), \ 91 PyUnicode_IS_COMPACT_ASCII(op) ? 
\ 92 ((PyASCIIObject*)(op))->length : \ 93 _PyUnicode_UTF8_LENGTH(op)) 94#define _PyUnicode_WSTR(op) \ 95 (((PyASCIIObject*)(op))->wstr) 96#define _PyUnicode_WSTR_LENGTH(op) \ 97 (((PyCompactUnicodeObject*)(op))->wstr_length) 98#define _PyUnicode_LENGTH(op) \ 99 (((PyASCIIObject *)(op))->length) 100#define _PyUnicode_STATE(op) \ 101 (((PyASCIIObject *)(op))->state) 102#define _PyUnicode_HASH(op) \ 103 (((PyASCIIObject *)(op))->hash) 104#define _PyUnicode_KIND(op) \ 105 (assert(_PyUnicode_CHECK(op)), \ 106 ((PyASCIIObject *)(op))->state.kind) 107#define _PyUnicode_GET_LENGTH(op) \ 108 (assert(_PyUnicode_CHECK(op)), \ 109 ((PyASCIIObject *)(op))->length) 110#define _PyUnicode_DATA_ANY(op) \ 111 (((PyUnicodeObject*)(op))->data.any) 112 113#undef PyUnicode_READY 114#define PyUnicode_READY(op) \ 115 (assert(_PyUnicode_CHECK(op)), \ 116 (PyUnicode_IS_READY(op) ? \ 117 0 : \ 118 _PyUnicode_Ready(op))) 119 120#define _PyUnicode_SHARE_UTF8(op) \ 121 (assert(_PyUnicode_CHECK(op)), \ 122 assert(!PyUnicode_IS_COMPACT_ASCII(op)), \ 123 (_PyUnicode_UTF8(op) == PyUnicode_DATA(op))) 124#define _PyUnicode_SHARE_WSTR(op) \ 125 (assert(_PyUnicode_CHECK(op)), \ 126 (_PyUnicode_WSTR(unicode) == PyUnicode_DATA(op))) 127 128/* true if the Unicode object has an allocated UTF-8 memory block 129 (not shared with other data) */ 130#define _PyUnicode_HAS_UTF8_MEMORY(op) \ 131 ((!PyUnicode_IS_COMPACT_ASCII(op) \ 132 && _PyUnicode_UTF8(op) \ 133 && _PyUnicode_UTF8(op) != PyUnicode_DATA(op))) 134 135/* true if the Unicode object has an allocated wstr memory block 136 (not shared with other data) */ 137#define _PyUnicode_HAS_WSTR_MEMORY(op) \ 138 ((_PyUnicode_WSTR(op) && \ 139 (!PyUnicode_IS_READY(op) || \ 140 _PyUnicode_WSTR(op) != PyUnicode_DATA(op)))) 141 142/* Generic helper macro to convert characters of different types. 143 from_type and to_type have to be valid type names, begin and end 144 are pointers to the source characters which should be of type 145 "from_type *". 
to is a pointer of type "to_type *" and points to the 146 buffer where the result characters are written to. */ 147#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \ 148 do { \ 149 to_type *_to = (to_type *)(to); \ 150 const from_type *_iter = (from_type *)(begin); \ 151 const from_type *_end = (from_type *)(end); \ 152 Py_ssize_t n = (_end) - (_iter); \ 153 const from_type *_unrolled_end = \ 154 _iter + _Py_SIZE_ROUND_DOWN(n, 4); \ 155 while (_iter < (_unrolled_end)) { \ 156 _to[0] = (to_type) _iter[0]; \ 157 _to[1] = (to_type) _iter[1]; \ 158 _to[2] = (to_type) _iter[2]; \ 159 _to[3] = (to_type) _iter[3]; \ 160 _iter += 4; _to += 4; \ 161 } \ 162 while (_iter < (_end)) \ 163 *_to++ = (to_type) *_iter++; \ 164 } while (0) 165 166/* This dictionary holds all interned unicode strings. Note that references 167 to strings in this dictionary are *not* counted in the string's ob_refcnt. 168 When the interned string reaches a refcnt of 0 the string deallocation 169 function will delete the reference from this dictionary. 170 171 Another way to look at this is that to say that the actual reference 172 count of a string is: s->ob_refcnt + (s->state ? 2 : 0) 173*/ 174static PyObject *interned = NULL; 175 176/* The empty Unicode object is shared to improve performance. */ 177static PyObject *unicode_empty = NULL; 178 179#define _Py_INCREF_UNICODE_EMPTY() \ 180 do { \ 181 if (unicode_empty != NULL) \ 182 Py_INCREF(unicode_empty); \ 183 else { \ 184 unicode_empty = PyUnicode_New(0, 0); \ 185 if (unicode_empty != NULL) { \ 186 Py_INCREF(unicode_empty); \ 187 assert(_PyUnicode_CheckConsistency(unicode_empty, 1)); \ 188 } \ 189 } \ 190 } while (0) 191 192#define _Py_RETURN_UNICODE_EMPTY() \ 193 do { \ 194 _Py_INCREF_UNICODE_EMPTY(); \ 195 return unicode_empty; \ 196 } while (0) 197 198/* Forward declaration */ 199Py_LOCAL_INLINE(int) 200_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch); 201 202/* List of static strings. 
*/ 203static _Py_Identifier *static_strings = NULL; 204 205/* Single character Unicode strings in the Latin-1 range are being 206 shared as well. */ 207static PyObject *unicode_latin1[256] = {NULL}; 208 209/* Fast detection of the most frequent whitespace characters */ 210const unsigned char _Py_ascii_whitespace[] = { 211 0, 0, 0, 0, 0, 0, 0, 0, 212/* case 0x0009: * CHARACTER TABULATION */ 213/* case 0x000A: * LINE FEED */ 214/* case 0x000B: * LINE TABULATION */ 215/* case 0x000C: * FORM FEED */ 216/* case 0x000D: * CARRIAGE RETURN */ 217 0, 1, 1, 1, 1, 1, 0, 0, 218 0, 0, 0, 0, 0, 0, 0, 0, 219/* case 0x001C: * FILE SEPARATOR */ 220/* case 0x001D: * GROUP SEPARATOR */ 221/* case 0x001E: * RECORD SEPARATOR */ 222/* case 0x001F: * UNIT SEPARATOR */ 223 0, 0, 0, 0, 1, 1, 1, 1, 224/* case 0x0020: * SPACE */ 225 1, 0, 0, 0, 0, 0, 0, 0, 226 0, 0, 0, 0, 0, 0, 0, 0, 227 0, 0, 0, 0, 0, 0, 0, 0, 228 0, 0, 0, 0, 0, 0, 0, 0, 229 230 0, 0, 0, 0, 0, 0, 0, 0, 231 0, 0, 0, 0, 0, 0, 0, 0, 232 0, 0, 0, 0, 0, 0, 0, 0, 233 0, 0, 0, 0, 0, 0, 0, 0, 234 0, 0, 0, 0, 0, 0, 0, 0, 235 0, 0, 0, 0, 0, 0, 0, 0, 236 0, 0, 0, 0, 0, 0, 0, 0, 237 0, 0, 0, 0, 0, 0, 0, 0 238}; 239 240/* forward */ 241static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length); 242static PyObject* get_latin1_char(unsigned char ch); 243static int unicode_modifiable(PyObject *unicode); 244 245 246static PyObject * 247_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size); 248static PyObject * 249_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size); 250static PyObject * 251_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size); 252 253static PyObject * 254unicode_encode_call_errorhandler(const char *errors, 255 PyObject **errorHandler,const char *encoding, const char *reason, 256 PyObject *unicode, PyObject **exceptionObject, 257 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos); 258 259static void 260raise_encode_exception(PyObject **exceptionObject, 261 const char *encoding, 262 PyObject *unicode, 263 
Py_ssize_t startpos, Py_ssize_t endpos, 264 const char *reason); 265 266/* Same for linebreaks */ 267static unsigned char ascii_linebreak[] = { 268 0, 0, 0, 0, 0, 0, 0, 0, 269/* 0x000A, * LINE FEED */ 270/* 0x000B, * LINE TABULATION */ 271/* 0x000C, * FORM FEED */ 272/* 0x000D, * CARRIAGE RETURN */ 273 0, 0, 1, 1, 1, 1, 0, 0, 274 0, 0, 0, 0, 0, 0, 0, 0, 275/* 0x001C, * FILE SEPARATOR */ 276/* 0x001D, * GROUP SEPARATOR */ 277/* 0x001E, * RECORD SEPARATOR */ 278 0, 0, 0, 0, 1, 1, 1, 0, 279 0, 0, 0, 0, 0, 0, 0, 0, 280 0, 0, 0, 0, 0, 0, 0, 0, 281 0, 0, 0, 0, 0, 0, 0, 0, 282 0, 0, 0, 0, 0, 0, 0, 0, 283 284 0, 0, 0, 0, 0, 0, 0, 0, 285 0, 0, 0, 0, 0, 0, 0, 0, 286 0, 0, 0, 0, 0, 0, 0, 0, 287 0, 0, 0, 0, 0, 0, 0, 0, 288 0, 0, 0, 0, 0, 0, 0, 0, 289 0, 0, 0, 0, 0, 0, 0, 0, 290 0, 0, 0, 0, 0, 0, 0, 0, 291 0, 0, 0, 0, 0, 0, 0, 0 292}; 293 294#include "clinic/unicodeobject.c.h" 295 296typedef enum { 297 _Py_ERROR_UNKNOWN=0, 298 _Py_ERROR_STRICT, 299 _Py_ERROR_SURROGATEESCAPE, 300 _Py_ERROR_REPLACE, 301 _Py_ERROR_IGNORE, 302 _Py_ERROR_XMLCHARREFREPLACE, 303 _Py_ERROR_OTHER 304} _Py_error_handler; 305 306static _Py_error_handler 307get_error_handler(const char *errors) 308{ 309 if (errors == NULL) 310 return _Py_ERROR_STRICT; 311 if (strcmp(errors, "strict") == 0) 312 return _Py_ERROR_STRICT; 313 if (strcmp(errors, "surrogateescape") == 0) 314 return _Py_ERROR_SURROGATEESCAPE; 315 if (strcmp(errors, "ignore") == 0) 316 return _Py_ERROR_IGNORE; 317 if (strcmp(errors, "replace") == 0) 318 return _Py_ERROR_REPLACE; 319 if (strcmp(errors, "xmlcharrefreplace") == 0) 320 return _Py_ERROR_XMLCHARREFREPLACE; 321 return _Py_ERROR_OTHER; 322} 323 324/* The max unicode value is always 0x10FFFF while using the PEP-393 API. 325 This function is kept for backward compatibility with the old API. */ 326Py_UNICODE 327PyUnicode_GetMax(void) 328{ 329#ifdef Py_UNICODE_WIDE 330 return 0x10FFFF; 331#else 332 /* This is actually an illegal character, so it should 333 not be passed to unichr. 
*/ 334 return 0xFFFF; 335#endif 336} 337 338#ifdef Py_DEBUG 339int 340_PyUnicode_CheckConsistency(PyObject *op, int check_content) 341{ 342 PyASCIIObject *ascii; 343 unsigned int kind; 344 345 assert(PyUnicode_Check(op)); 346 347 ascii = (PyASCIIObject *)op; 348 kind = ascii->state.kind; 349 350 if (ascii->state.ascii == 1 && ascii->state.compact == 1) { 351 assert(kind == PyUnicode_1BYTE_KIND); 352 assert(ascii->state.ready == 1); 353 } 354 else { 355 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op; 356 void *data; 357 358 if (ascii->state.compact == 1) { 359 data = compact + 1; 360 assert(kind == PyUnicode_1BYTE_KIND 361 || kind == PyUnicode_2BYTE_KIND 362 || kind == PyUnicode_4BYTE_KIND); 363 assert(ascii->state.ascii == 0); 364 assert(ascii->state.ready == 1); 365 assert (compact->utf8 != data); 366 } 367 else { 368 PyUnicodeObject *unicode = (PyUnicodeObject *)op; 369 370 data = unicode->data.any; 371 if (kind == PyUnicode_WCHAR_KIND) { 372 assert(ascii->length == 0); 373 assert(ascii->hash == -1); 374 assert(ascii->state.compact == 0); 375 assert(ascii->state.ascii == 0); 376 assert(ascii->state.ready == 0); 377 assert(ascii->state.interned == SSTATE_NOT_INTERNED); 378 assert(ascii->wstr != NULL); 379 assert(data == NULL); 380 assert(compact->utf8 == NULL); 381 } 382 else { 383 assert(kind == PyUnicode_1BYTE_KIND 384 || kind == PyUnicode_2BYTE_KIND 385 || kind == PyUnicode_4BYTE_KIND); 386 assert(ascii->state.compact == 0); 387 assert(ascii->state.ready == 1); 388 assert(data != NULL); 389 if (ascii->state.ascii) { 390 assert (compact->utf8 == data); 391 assert (compact->utf8_length == ascii->length); 392 } 393 else 394 assert (compact->utf8 != data); 395 } 396 } 397 if (kind != PyUnicode_WCHAR_KIND) { 398 if ( 399#if SIZEOF_WCHAR_T == 2 400 kind == PyUnicode_2BYTE_KIND 401#else 402 kind == PyUnicode_4BYTE_KIND 403#endif 404 ) 405 { 406 assert(ascii->wstr == data); 407 assert(compact->wstr_length == ascii->length); 408 } else 409 
assert(ascii->wstr != data); 410 } 411 412 if (compact->utf8 == NULL) 413 assert(compact->utf8_length == 0); 414 if (ascii->wstr == NULL) 415 assert(compact->wstr_length == 0); 416 } 417 /* check that the best kind is used */ 418 if (check_content && kind != PyUnicode_WCHAR_KIND) 419 { 420 Py_ssize_t i; 421 Py_UCS4 maxchar = 0; 422 void *data; 423 Py_UCS4 ch; 424 425 data = PyUnicode_DATA(ascii); 426 for (i=0; i < ascii->length; i++) 427 { 428 ch = PyUnicode_READ(kind, data, i); 429 if (ch > maxchar) 430 maxchar = ch; 431 } 432 if (kind == PyUnicode_1BYTE_KIND) { 433 if (ascii->state.ascii == 0) { 434 assert(maxchar >= 128); 435 assert(maxchar <= 255); 436 } 437 else 438 assert(maxchar < 128); 439 } 440 else if (kind == PyUnicode_2BYTE_KIND) { 441 assert(maxchar >= 0x100); 442 assert(maxchar <= 0xFFFF); 443 } 444 else { 445 assert(maxchar >= 0x10000); 446 assert(maxchar <= MAX_UNICODE); 447 } 448 assert(PyUnicode_READ(kind, data, ascii->length) == 0); 449 } 450 return 1; 451} 452#endif 453 454static PyObject* 455unicode_result_wchar(PyObject *unicode) 456{ 457#ifndef Py_DEBUG 458 Py_ssize_t len; 459 460 len = _PyUnicode_WSTR_LENGTH(unicode); 461 if (len == 0) { 462 Py_DECREF(unicode); 463 _Py_RETURN_UNICODE_EMPTY(); 464 } 465 466 if (len == 1) { 467 wchar_t ch = _PyUnicode_WSTR(unicode)[0]; 468 if ((Py_UCS4)ch < 256) { 469 PyObject *latin1_char = get_latin1_char((unsigned char)ch); 470 Py_DECREF(unicode); 471 return latin1_char; 472 } 473 } 474 475 if (_PyUnicode_Ready(unicode) < 0) { 476 Py_DECREF(unicode); 477 return NULL; 478 } 479#else 480 assert(Py_REFCNT(unicode) == 1); 481 482 /* don't make the result ready in debug mode to ensure that the caller 483 makes the string ready before using it */ 484 assert(_PyUnicode_CheckConsistency(unicode, 1)); 485#endif 486 return unicode; 487} 488 489static PyObject* 490unicode_result_ready(PyObject *unicode) 491{ 492 Py_ssize_t length; 493 494 length = PyUnicode_GET_LENGTH(unicode); 495 if (length == 0) { 496 if (unicode 
!= unicode_empty) { 497 Py_DECREF(unicode); 498 _Py_RETURN_UNICODE_EMPTY(); 499 } 500 return unicode_empty; 501 } 502 503 if (length == 1) { 504 void *data = PyUnicode_DATA(unicode); 505 int kind = PyUnicode_KIND(unicode); 506 Py_UCS4 ch = PyUnicode_READ(kind, data, 0); 507 if (ch < 256) { 508 PyObject *latin1_char = unicode_latin1[ch]; 509 if (latin1_char != NULL) { 510 if (unicode != latin1_char) { 511 Py_INCREF(latin1_char); 512 Py_DECREF(unicode); 513 } 514 return latin1_char; 515 } 516 else { 517 assert(_PyUnicode_CheckConsistency(unicode, 1)); 518 Py_INCREF(unicode); 519 unicode_latin1[ch] = unicode; 520 return unicode; 521 } 522 } 523 } 524 525 assert(_PyUnicode_CheckConsistency(unicode, 1)); 526 return unicode; 527} 528 529static PyObject* 530unicode_result(PyObject *unicode) 531{ 532 assert(_PyUnicode_CHECK(unicode)); 533 if (PyUnicode_IS_READY(unicode)) 534 return unicode_result_ready(unicode); 535 else 536 return unicode_result_wchar(unicode); 537} 538 539static PyObject* 540unicode_result_unchanged(PyObject *unicode) 541{ 542 if (PyUnicode_CheckExact(unicode)) { 543 if (PyUnicode_READY(unicode) == -1) 544 return NULL; 545 Py_INCREF(unicode); 546 return unicode; 547 } 548 else 549 /* Subtype -- return genuine unicode string with the same value. */ 550 return _PyUnicode_Copy(unicode); 551} 552 553/* --- Bloom Filters ----------------------------------------------------- */ 554 555/* stuff to implement simple "bloom filters" for Unicode characters. 556 to keep things simple, we use a single bitmask, using the least 5 557 bits from each unicode characters as the bit index. 
*/ 558 559/* the linebreak mask is set up by Unicode_Init below */ 560 561#if LONG_BIT >= 128 562#define BLOOM_WIDTH 128 563#elif LONG_BIT >= 64 564#define BLOOM_WIDTH 64 565#elif LONG_BIT >= 32 566#define BLOOM_WIDTH 32 567#else 568#error "LONG_BIT is smaller than 32" 569#endif 570 571#define BLOOM_MASK unsigned long 572 573static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0; 574 575#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1))))) 576 577#define BLOOM_LINEBREAK(ch) \ 578 ((ch) < 128U ? ascii_linebreak[(ch)] : \ 579 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch))) 580 581Py_LOCAL_INLINE(BLOOM_MASK) 582make_bloom_mask(int kind, void* ptr, Py_ssize_t len) 583{ 584#define BLOOM_UPDATE(TYPE, MASK, PTR, LEN) \ 585 do { \ 586 TYPE *data = (TYPE *)PTR; \ 587 TYPE *end = data + LEN; \ 588 Py_UCS4 ch; \ 589 for (; data != end; data++) { \ 590 ch = *data; \ 591 MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \ 592 } \ 593 break; \ 594 } while (0) 595 596 /* calculate simple bloom-style bitmask for a given unicode string */ 597 598 BLOOM_MASK mask; 599 600 mask = 0; 601 switch (kind) { 602 case PyUnicode_1BYTE_KIND: 603 BLOOM_UPDATE(Py_UCS1, mask, ptr, len); 604 break; 605 case PyUnicode_2BYTE_KIND: 606 BLOOM_UPDATE(Py_UCS2, mask, ptr, len); 607 break; 608 case PyUnicode_4BYTE_KIND: 609 BLOOM_UPDATE(Py_UCS4, mask, ptr, len); 610 break; 611 default: 612 assert(0); 613 } 614 return mask; 615 616#undef BLOOM_UPDATE 617} 618 619/* Compilation of templated routines */ 620 621#include "stringlib/asciilib.h" 622#include "stringlib/fastsearch.h" 623#include "stringlib/partition.h" 624#include "stringlib/split.h" 625#include "stringlib/count.h" 626#include "stringlib/find.h" 627#include "stringlib/find_max_char.h" 628#include "stringlib/localeutil.h" 629#include "stringlib/undef.h" 630 631#include "stringlib/ucs1lib.h" 632#include "stringlib/fastsearch.h" 633#include "stringlib/partition.h" 634#include "stringlib/split.h" 635#include "stringlib/count.h" 
636#include "stringlib/find.h" 637#include "stringlib/replace.h" 638#include "stringlib/find_max_char.h" 639#include "stringlib/localeutil.h" 640#include "stringlib/undef.h" 641 642#include "stringlib/ucs2lib.h" 643#include "stringlib/fastsearch.h" 644#include "stringlib/partition.h" 645#include "stringlib/split.h" 646#include "stringlib/count.h" 647#include "stringlib/find.h" 648#include "stringlib/replace.h" 649#include "stringlib/find_max_char.h" 650#include "stringlib/localeutil.h" 651#include "stringlib/undef.h" 652 653#include "stringlib/ucs4lib.h" 654#include "stringlib/fastsearch.h" 655#include "stringlib/partition.h" 656#include "stringlib/split.h" 657#include "stringlib/count.h" 658#include "stringlib/find.h" 659#include "stringlib/replace.h" 660#include "stringlib/find_max_char.h" 661#include "stringlib/localeutil.h" 662#include "stringlib/undef.h" 663 664#include "stringlib/unicodedefs.h" 665#include "stringlib/fastsearch.h" 666#include "stringlib/count.h" 667#include "stringlib/find.h" 668#include "stringlib/undef.h" 669 670/* --- Unicode Object ----------------------------------------------------- */ 671 672static PyObject * 673fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s)); 674 675Py_LOCAL_INLINE(Py_ssize_t) findchar(const void *s, int kind, 676 Py_ssize_t size, Py_UCS4 ch, 677 int direction) 678{ 679 int mode = (direction == 1) ? 
FAST_SEARCH : FAST_RSEARCH; 680 681 switch (kind) { 682 case PyUnicode_1BYTE_KIND: 683 { 684 Py_UCS1 ch1 = (Py_UCS1) ch; 685 if (ch1 == ch) 686 return ucs1lib_fastsearch((Py_UCS1 *) s, size, &ch1, 1, 0, mode); 687 else 688 return -1; 689 } 690 case PyUnicode_2BYTE_KIND: 691 { 692 Py_UCS2 ch2 = (Py_UCS2) ch; 693 if (ch2 == ch) 694 return ucs2lib_fastsearch((Py_UCS2 *) s, size, &ch2, 1, 0, mode); 695 else 696 return -1; 697 } 698 case PyUnicode_4BYTE_KIND: 699 return ucs4lib_fastsearch((Py_UCS4 *) s, size, &ch, 1, 0, mode); 700 default: 701 assert(0); 702 return -1; 703 } 704} 705 706#ifdef Py_DEBUG 707/* Fill the data of an Unicode string with invalid characters to detect bugs 708 earlier. 709 710 _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least for 711 ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is an 712 invalid character in Unicode 6.0. */ 713static void 714unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length) 715{ 716 int kind = PyUnicode_KIND(unicode); 717 Py_UCS1 *data = PyUnicode_1BYTE_DATA(unicode); 718 Py_ssize_t length = _PyUnicode_LENGTH(unicode); 719 if (length <= old_length) 720 return; 721 memset(data + old_length * kind, 0xff, (length - old_length) * kind); 722} 723#endif 724 725static PyObject* 726resize_compact(PyObject *unicode, Py_ssize_t length) 727{ 728 Py_ssize_t char_size; 729 Py_ssize_t struct_size; 730 Py_ssize_t new_size; 731 int share_wstr; 732 PyObject *new_unicode; 733#ifdef Py_DEBUG 734 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode); 735#endif 736 737 assert(unicode_modifiable(unicode)); 738 assert(PyUnicode_IS_READY(unicode)); 739 assert(PyUnicode_IS_COMPACT(unicode)); 740 741 char_size = PyUnicode_KIND(unicode); 742 if (PyUnicode_IS_ASCII(unicode)) 743 struct_size = sizeof(PyASCIIObject); 744 else 745 struct_size = sizeof(PyCompactUnicodeObject); 746 share_wstr = _PyUnicode_SHARE_WSTR(unicode); 747 748 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) { 749 
PyErr_NoMemory(); 750 return NULL; 751 } 752 new_size = (struct_size + (length + 1) * char_size); 753 754 _Py_DEC_REFTOTAL; 755 _Py_ForgetReference(unicode); 756 757 new_unicode = (PyObject *)PyObject_REALLOC(unicode, new_size); 758 if (new_unicode == NULL) { 759 _Py_NewReference(unicode); 760 PyErr_NoMemory(); 761 return NULL; 762 } 763 unicode = new_unicode; 764 _Py_NewReference(unicode); 765 766 _PyUnicode_LENGTH(unicode) = length; 767 if (share_wstr) { 768 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode); 769 if (!PyUnicode_IS_ASCII(unicode)) 770 _PyUnicode_WSTR_LENGTH(unicode) = length; 771 } 772 else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) { 773 PyObject_DEL(_PyUnicode_WSTR(unicode)); 774 _PyUnicode_WSTR(unicode) = NULL; 775 } 776#ifdef Py_DEBUG 777 unicode_fill_invalid(unicode, old_length); 778#endif 779 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode), 780 length, 0); 781 assert(_PyUnicode_CheckConsistency(unicode, 0)); 782 return unicode; 783} 784 785static int 786resize_inplace(PyObject *unicode, Py_ssize_t length) 787{ 788 wchar_t *wstr; 789 Py_ssize_t new_size; 790 assert(!PyUnicode_IS_COMPACT(unicode)); 791 assert(Py_REFCNT(unicode) == 1); 792 793 if (PyUnicode_IS_READY(unicode)) { 794 Py_ssize_t char_size; 795 int share_wstr, share_utf8; 796 void *data; 797#ifdef Py_DEBUG 798 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode); 799#endif 800 801 data = _PyUnicode_DATA_ANY(unicode); 802 char_size = PyUnicode_KIND(unicode); 803 share_wstr = _PyUnicode_SHARE_WSTR(unicode); 804 share_utf8 = _PyUnicode_SHARE_UTF8(unicode); 805 806 if (length > (PY_SSIZE_T_MAX / char_size - 1)) { 807 PyErr_NoMemory(); 808 return -1; 809 } 810 new_size = (length + 1) * char_size; 811 812 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode)) 813 { 814 PyObject_DEL(_PyUnicode_UTF8(unicode)); 815 _PyUnicode_UTF8(unicode) = NULL; 816 _PyUnicode_UTF8_LENGTH(unicode) = 0; 817 } 818 819 data = (PyObject *)PyObject_REALLOC(data, new_size); 820 if (data == NULL) { 
821 PyErr_NoMemory(); 822 return -1; 823 } 824 _PyUnicode_DATA_ANY(unicode) = data; 825 if (share_wstr) { 826 _PyUnicode_WSTR(unicode) = data; 827 _PyUnicode_WSTR_LENGTH(unicode) = length; 828 } 829 if (share_utf8) { 830 _PyUnicode_UTF8(unicode) = data; 831 _PyUnicode_UTF8_LENGTH(unicode) = length; 832 } 833 _PyUnicode_LENGTH(unicode) = length; 834 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0); 835#ifdef Py_DEBUG 836 unicode_fill_invalid(unicode, old_length); 837#endif 838 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) { 839 assert(_PyUnicode_CheckConsistency(unicode, 0)); 840 return 0; 841 } 842 } 843 assert(_PyUnicode_WSTR(unicode) != NULL); 844 845 /* check for integer overflow */ 846 if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) { 847 PyErr_NoMemory(); 848 return -1; 849 } 850 new_size = sizeof(wchar_t) * (length + 1); 851 wstr = _PyUnicode_WSTR(unicode); 852 wstr = PyObject_REALLOC(wstr, new_size); 853 if (!wstr) { 854 PyErr_NoMemory(); 855 return -1; 856 } 857 _PyUnicode_WSTR(unicode) = wstr; 858 _PyUnicode_WSTR(unicode)[length] = 0; 859 _PyUnicode_WSTR_LENGTH(unicode) = length; 860 assert(_PyUnicode_CheckConsistency(unicode, 0)); 861 return 0; 862} 863 864static PyObject* 865resize_copy(PyObject *unicode, Py_ssize_t length) 866{ 867 Py_ssize_t copy_length; 868 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) { 869 PyObject *copy; 870 871 if (PyUnicode_READY(unicode) == -1) 872 return NULL; 873 874 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode)); 875 if (copy == NULL) 876 return NULL; 877 878 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode)); 879 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length); 880 return copy; 881 } 882 else { 883 PyObject *w; 884 885 w = (PyObject*)_PyUnicode_New(length); 886 if (w == NULL) 887 return NULL; 888 copy_length = _PyUnicode_WSTR_LENGTH(unicode); 889 copy_length = Py_MIN(copy_length, length); 890 Py_MEMCPY(_PyUnicode_WSTR(w), 
_PyUnicode_WSTR(unicode), 891 copy_length * sizeof(wchar_t)); 892 return w; 893 } 894} 895 896/* We allocate one more byte to make sure the string is 897 Ux0000 terminated; some code (e.g. new_identifier) 898 relies on that. 899 900 XXX This allocator could further be enhanced by assuring that the 901 free list never reduces its size below 1. 902 903*/ 904 905static PyUnicodeObject * 906_PyUnicode_New(Py_ssize_t length) 907{ 908 PyUnicodeObject *unicode; 909 size_t new_size; 910 911 /* Optimization for empty strings */ 912 if (length == 0 && unicode_empty != NULL) { 913 Py_INCREF(unicode_empty); 914 return (PyUnicodeObject*)unicode_empty; 915 } 916 917 /* Ensure we won't overflow the size. */ 918 if (length > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) { 919 return (PyUnicodeObject *)PyErr_NoMemory(); 920 } 921 if (length < 0) { 922 PyErr_SetString(PyExc_SystemError, 923 "Negative size passed to _PyUnicode_New"); 924 return NULL; 925 } 926 927 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type); 928 if (unicode == NULL) 929 return NULL; 930 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1); 931 932 _PyUnicode_WSTR_LENGTH(unicode) = length; 933 _PyUnicode_HASH(unicode) = -1; 934 _PyUnicode_STATE(unicode).interned = 0; 935 _PyUnicode_STATE(unicode).kind = 0; 936 _PyUnicode_STATE(unicode).compact = 0; 937 _PyUnicode_STATE(unicode).ready = 0; 938 _PyUnicode_STATE(unicode).ascii = 0; 939 _PyUnicode_DATA_ANY(unicode) = NULL; 940 _PyUnicode_LENGTH(unicode) = 0; 941 _PyUnicode_UTF8(unicode) = NULL; 942 _PyUnicode_UTF8_LENGTH(unicode) = 0; 943 944 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size); 945 if (!_PyUnicode_WSTR(unicode)) { 946 Py_DECREF(unicode); 947 PyErr_NoMemory(); 948 return NULL; 949 } 950 951 /* Initialize the first element to guard against cases where 952 * the caller fails before initializing str -- unicode_resize() 953 * reads str[0], and the Keep-Alive optimization can keep memory 954 * allocated for str alive 
across a call to unicode_dealloc(unicode). 955 * We don't want unicode_resize to read uninitialized memory in 956 * that case. 957 */ 958 _PyUnicode_WSTR(unicode)[0] = 0; 959 _PyUnicode_WSTR(unicode)[length] = 0; 960 961 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0)); 962 return unicode; 963} 964 965static const char* 966unicode_kind_name(PyObject *unicode) 967{ 968 /* don't check consistency: unicode_kind_name() is called from 969 _PyUnicode_Dump() */ 970 if (!PyUnicode_IS_COMPACT(unicode)) 971 { 972 if (!PyUnicode_IS_READY(unicode)) 973 return "wstr"; 974 switch (PyUnicode_KIND(unicode)) 975 { 976 case PyUnicode_1BYTE_KIND: 977 if (PyUnicode_IS_ASCII(unicode)) 978 return "legacy ascii"; 979 else 980 return "legacy latin1"; 981 case PyUnicode_2BYTE_KIND: 982 return "legacy UCS2"; 983 case PyUnicode_4BYTE_KIND: 984 return "legacy UCS4"; 985 default: 986 return "<legacy invalid kind>"; 987 } 988 } 989 assert(PyUnicode_IS_READY(unicode)); 990 switch (PyUnicode_KIND(unicode)) { 991 case PyUnicode_1BYTE_KIND: 992 if (PyUnicode_IS_ASCII(unicode)) 993 return "ascii"; 994 else 995 return "latin1"; 996 case PyUnicode_2BYTE_KIND: 997 return "UCS2"; 998 case PyUnicode_4BYTE_KIND: 999 return "UCS4"; 1000 default: 1001 return "<invalid compact kind>"; 1002 } 1003} 1004 1005#ifdef Py_DEBUG 1006/* Functions wrapping macros for use in debugger */ 1007char *_PyUnicode_utf8(void *unicode){ 1008 return PyUnicode_UTF8(unicode); 1009} 1010 1011void *_PyUnicode_compact_data(void *unicode) { 1012 return _PyUnicode_COMPACT_DATA(unicode); 1013} 1014void *_PyUnicode_data(void *unicode){ 1015 printf("obj %p\n", unicode); 1016 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode)); 1017 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode)); 1018 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1))); 1019 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1))); 1020 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode)); 1021 
return PyUnicode_DATA(unicode); 1022} 1023 1024void 1025_PyUnicode_Dump(PyObject *op) 1026{ 1027 PyASCIIObject *ascii = (PyASCIIObject *)op; 1028 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op; 1029 PyUnicodeObject *unicode = (PyUnicodeObject *)op; 1030 void *data; 1031 1032 if (ascii->state.compact) 1033 { 1034 if (ascii->state.ascii) 1035 data = (ascii + 1); 1036 else 1037 data = (compact + 1); 1038 } 1039 else 1040 data = unicode->data.any; 1041 printf("%s: len=%" PY_FORMAT_SIZE_T "u, ", 1042 unicode_kind_name(op), ascii->length); 1043 1044 if (ascii->wstr == data) 1045 printf("shared "); 1046 printf("wstr=%p", ascii->wstr); 1047 1048 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) { 1049 printf(" (%" PY_FORMAT_SIZE_T "u), ", compact->wstr_length); 1050 if (!ascii->state.compact && compact->utf8 == unicode->data.any) 1051 printf("shared "); 1052 printf("utf8=%p (%" PY_FORMAT_SIZE_T "u)", 1053 compact->utf8, compact->utf8_length); 1054 } 1055 printf(", data=%p\n", data); 1056} 1057#endif 1058 1059PyObject * 1060PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar) 1061{ 1062 PyObject *obj; 1063 PyCompactUnicodeObject *unicode; 1064 void *data; 1065 enum PyUnicode_Kind kind; 1066 int is_sharing, is_ascii; 1067 Py_ssize_t char_size; 1068 Py_ssize_t struct_size; 1069 1070 /* Optimization for empty strings */ 1071 if (size == 0 && unicode_empty != NULL) { 1072 Py_INCREF(unicode_empty); 1073 return unicode_empty; 1074 } 1075 1076 is_ascii = 0; 1077 is_sharing = 0; 1078 struct_size = sizeof(PyCompactUnicodeObject); 1079 if (maxchar < 128) { 1080 kind = PyUnicode_1BYTE_KIND; 1081 char_size = 1; 1082 is_ascii = 1; 1083 struct_size = sizeof(PyASCIIObject); 1084 } 1085 else if (maxchar < 256) { 1086 kind = PyUnicode_1BYTE_KIND; 1087 char_size = 1; 1088 } 1089 else if (maxchar < 65536) { 1090 kind = PyUnicode_2BYTE_KIND; 1091 char_size = 2; 1092 if (sizeof(wchar_t) == 2) 1093 is_sharing = 1; 1094 } 1095 else { 1096 if (maxchar > MAX_UNICODE) { 1097 
PyErr_SetString(PyExc_SystemError, 1098 "invalid maximum character passed to PyUnicode_New"); 1099 return NULL; 1100 } 1101 kind = PyUnicode_4BYTE_KIND; 1102 char_size = 4; 1103 if (sizeof(wchar_t) == 4) 1104 is_sharing = 1; 1105 } 1106 1107 /* Ensure we won't overflow the size. */ 1108 if (size < 0) { 1109 PyErr_SetString(PyExc_SystemError, 1110 "Negative size passed to PyUnicode_New"); 1111 return NULL; 1112 } 1113 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) 1114 return PyErr_NoMemory(); 1115 1116 /* Duplicated allocation code from _PyObject_New() instead of a call to 1117 * PyObject_New() so we are able to allocate space for the object and 1118 * it's data buffer. 1119 */ 1120 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size); 1121 if (obj == NULL) 1122 return PyErr_NoMemory(); 1123 obj = PyObject_INIT(obj, &PyUnicode_Type); 1124 if (obj == NULL) 1125 return NULL; 1126 1127 unicode = (PyCompactUnicodeObject *)obj; 1128 if (is_ascii) 1129 data = ((PyASCIIObject*)obj) + 1; 1130 else 1131 data = unicode + 1; 1132 _PyUnicode_LENGTH(unicode) = size; 1133 _PyUnicode_HASH(unicode) = -1; 1134 _PyUnicode_STATE(unicode).interned = 0; 1135 _PyUnicode_STATE(unicode).kind = kind; 1136 _PyUnicode_STATE(unicode).compact = 1; 1137 _PyUnicode_STATE(unicode).ready = 1; 1138 _PyUnicode_STATE(unicode).ascii = is_ascii; 1139 if (is_ascii) { 1140 ((char*)data)[size] = 0; 1141 _PyUnicode_WSTR(unicode) = NULL; 1142 } 1143 else if (kind == PyUnicode_1BYTE_KIND) { 1144 ((char*)data)[size] = 0; 1145 _PyUnicode_WSTR(unicode) = NULL; 1146 _PyUnicode_WSTR_LENGTH(unicode) = 0; 1147 unicode->utf8 = NULL; 1148 unicode->utf8_length = 0; 1149 } 1150 else { 1151 unicode->utf8 = NULL; 1152 unicode->utf8_length = 0; 1153 if (kind == PyUnicode_2BYTE_KIND) 1154 ((Py_UCS2*)data)[size] = 0; 1155 else /* kind == PyUnicode_4BYTE_KIND */ 1156 ((Py_UCS4*)data)[size] = 0; 1157 if (is_sharing) { 1158 _PyUnicode_WSTR_LENGTH(unicode) = size; 1159 _PyUnicode_WSTR(unicode) 
= (wchar_t *)data; 1160 } 1161 else { 1162 _PyUnicode_WSTR_LENGTH(unicode) = 0; 1163 _PyUnicode_WSTR(unicode) = NULL; 1164 } 1165 } 1166#ifdef Py_DEBUG 1167 unicode_fill_invalid((PyObject*)unicode, 0); 1168#endif 1169 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0)); 1170 return obj; 1171} 1172 1173#if SIZEOF_WCHAR_T == 2 1174/* Helper function to convert a 16-bits wchar_t representation to UCS4, this 1175 will decode surrogate pairs, the other conversions are implemented as macros 1176 for efficiency. 1177 1178 This function assumes that unicode can hold one more code point than wstr 1179 characters for a terminating null character. */ 1180static void 1181unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end, 1182 PyObject *unicode) 1183{ 1184 const wchar_t *iter; 1185 Py_UCS4 *ucs4_out; 1186 1187 assert(unicode != NULL); 1188 assert(_PyUnicode_CHECK(unicode)); 1189 assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND); 1190 ucs4_out = PyUnicode_4BYTE_DATA(unicode); 1191 1192 for (iter = begin; iter < end; ) { 1193 assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) + 1194 _PyUnicode_GET_LENGTH(unicode))); 1195 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0]) 1196 && (iter+1) < end 1197 && Py_UNICODE_IS_LOW_SURROGATE(iter[1])) 1198 { 1199 *ucs4_out++ = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]); 1200 iter += 2; 1201 } 1202 else { 1203 *ucs4_out++ = *iter; 1204 iter++; 1205 } 1206 } 1207 assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) + 1208 _PyUnicode_GET_LENGTH(unicode))); 1209 1210} 1211#endif 1212 1213static int 1214unicode_check_modifiable(PyObject *unicode) 1215{ 1216 if (!unicode_modifiable(unicode)) { 1217 PyErr_SetString(PyExc_SystemError, 1218 "Cannot modify a string currently used"); 1219 return -1; 1220 } 1221 return 0; 1222} 1223 1224static int 1225_copy_characters(PyObject *to, Py_ssize_t to_start, 1226 PyObject *from, Py_ssize_t from_start, 1227 Py_ssize_t how_many, int check_maxchar) 1228{ 1229 unsigned int from_kind, 
to_kind; 1230 void *from_data, *to_data; 1231 1232 assert(0 <= how_many); 1233 assert(0 <= from_start); 1234 assert(0 <= to_start); 1235 assert(PyUnicode_Check(from)); 1236 assert(PyUnicode_IS_READY(from)); 1237 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from)); 1238 1239 assert(PyUnicode_Check(to)); 1240 assert(PyUnicode_IS_READY(to)); 1241 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to)); 1242 1243 if (how_many == 0) 1244 return 0; 1245 1246 from_kind = PyUnicode_KIND(from); 1247 from_data = PyUnicode_DATA(from); 1248 to_kind = PyUnicode_KIND(to); 1249 to_data = PyUnicode_DATA(to); 1250 1251#ifdef Py_DEBUG 1252 if (!check_maxchar 1253 && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to)) 1254 { 1255 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to); 1256 Py_UCS4 ch; 1257 Py_ssize_t i; 1258 for (i=0; i < how_many; i++) { 1259 ch = PyUnicode_READ(from_kind, from_data, from_start + i); 1260 assert(ch <= to_maxchar); 1261 } 1262 } 1263#endif 1264 1265 if (from_kind == to_kind) { 1266 if (check_maxchar 1267 && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)) 1268 { 1269 /* Writing Latin-1 characters into an ASCII string requires to 1270 check that all written characters are pure ASCII */ 1271 Py_UCS4 max_char; 1272 max_char = ucs1lib_find_max_char(from_data, 1273 (Py_UCS1*)from_data + how_many); 1274 if (max_char >= 128) 1275 return -1; 1276 } 1277 Py_MEMCPY((char*)to_data + to_kind * to_start, 1278 (char*)from_data + from_kind * from_start, 1279 to_kind * how_many); 1280 } 1281 else if (from_kind == PyUnicode_1BYTE_KIND 1282 && to_kind == PyUnicode_2BYTE_KIND) 1283 { 1284 _PyUnicode_CONVERT_BYTES( 1285 Py_UCS1, Py_UCS2, 1286 PyUnicode_1BYTE_DATA(from) + from_start, 1287 PyUnicode_1BYTE_DATA(from) + from_start + how_many, 1288 PyUnicode_2BYTE_DATA(to) + to_start 1289 ); 1290 } 1291 else if (from_kind == PyUnicode_1BYTE_KIND 1292 && to_kind == PyUnicode_4BYTE_KIND) 1293 { 1294 _PyUnicode_CONVERT_BYTES( 1295 Py_UCS1, Py_UCS4, 1296 
PyUnicode_1BYTE_DATA(from) + from_start, 1297 PyUnicode_1BYTE_DATA(from) + from_start + how_many, 1298 PyUnicode_4BYTE_DATA(to) + to_start 1299 ); 1300 } 1301 else if (from_kind == PyUnicode_2BYTE_KIND 1302 && to_kind == PyUnicode_4BYTE_KIND) 1303 { 1304 _PyUnicode_CONVERT_BYTES( 1305 Py_UCS2, Py_UCS4, 1306 PyUnicode_2BYTE_DATA(from) + from_start, 1307 PyUnicode_2BYTE_DATA(from) + from_start + how_many, 1308 PyUnicode_4BYTE_DATA(to) + to_start 1309 ); 1310 } 1311 else { 1312 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to)); 1313 1314 if (!check_maxchar) { 1315 if (from_kind == PyUnicode_2BYTE_KIND 1316 && to_kind == PyUnicode_1BYTE_KIND) 1317 { 1318 _PyUnicode_CONVERT_BYTES( 1319 Py_UCS2, Py_UCS1, 1320 PyUnicode_2BYTE_DATA(from) + from_start, 1321 PyUnicode_2BYTE_DATA(from) + from_start + how_many, 1322 PyUnicode_1BYTE_DATA(to) + to_start 1323 ); 1324 } 1325 else if (from_kind == PyUnicode_4BYTE_KIND 1326 && to_kind == PyUnicode_1BYTE_KIND) 1327 { 1328 _PyUnicode_CONVERT_BYTES( 1329 Py_UCS4, Py_UCS1, 1330 PyUnicode_4BYTE_DATA(from) + from_start, 1331 PyUnicode_4BYTE_DATA(from) + from_start + how_many, 1332 PyUnicode_1BYTE_DATA(to) + to_start 1333 ); 1334 } 1335 else if (from_kind == PyUnicode_4BYTE_KIND 1336 && to_kind == PyUnicode_2BYTE_KIND) 1337 { 1338 _PyUnicode_CONVERT_BYTES( 1339 Py_UCS4, Py_UCS2, 1340 PyUnicode_4BYTE_DATA(from) + from_start, 1341 PyUnicode_4BYTE_DATA(from) + from_start + how_many, 1342 PyUnicode_2BYTE_DATA(to) + to_start 1343 ); 1344 } 1345 else { 1346 assert(0); 1347 return -1; 1348 } 1349 } 1350 else { 1351 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to); 1352 Py_UCS4 ch; 1353 Py_ssize_t i; 1354 1355 for (i=0; i < how_many; i++) { 1356 ch = PyUnicode_READ(from_kind, from_data, from_start + i); 1357 if (ch > to_maxchar) 1358 return -1; 1359 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch); 1360 } 1361 } 1362 } 1363 return 0; 1364} 1365 1366void 1367_PyUnicode_FastCopyCharacters( 1368 PyObject *to, Py_ssize_t 
to_start, 1369 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many) 1370{ 1371 (void)_copy_characters(to, to_start, from, from_start, how_many, 0); 1372} 1373 1374Py_ssize_t 1375PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start, 1376 PyObject *from, Py_ssize_t from_start, 1377 Py_ssize_t how_many) 1378{ 1379 int err; 1380 1381 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) { 1382 PyErr_BadInternalCall(); 1383 return -1; 1384 } 1385 1386 if (PyUnicode_READY(from) == -1) 1387 return -1; 1388 if (PyUnicode_READY(to) == -1) 1389 return -1; 1390 1391 if (from_start < 0) { 1392 PyErr_SetString(PyExc_IndexError, "string index out of range"); 1393 return -1; 1394 } 1395 if (to_start < 0) { 1396 PyErr_SetString(PyExc_IndexError, "string index out of range"); 1397 return -1; 1398 } 1399 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many); 1400 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) { 1401 PyErr_Format(PyExc_SystemError, 1402 "Cannot write %zi characters at %zi " 1403 "in a string of %zi characters", 1404 how_many, to_start, PyUnicode_GET_LENGTH(to)); 1405 return -1; 1406 } 1407 1408 if (how_many == 0) 1409 return 0; 1410 1411 if (unicode_check_modifiable(to)) 1412 return -1; 1413 1414 err = _copy_characters(to, to_start, from, from_start, how_many, 1); 1415 if (err) { 1416 PyErr_Format(PyExc_SystemError, 1417 "Cannot copy %s characters " 1418 "into a string of %s characters", 1419 unicode_kind_name(from), 1420 unicode_kind_name(to)); 1421 return -1; 1422 } 1423 return how_many; 1424} 1425 1426/* Find the maximum code point and count the number of surrogate pairs so a 1427 correct string length can be computed before converting a string to UCS4. 1428 This function counts single surrogates as a character and not as a pair. 1429 1430 Return 0 on success, or -1 on error. 
*/ 1431static int 1432find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end, 1433 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates) 1434{ 1435 const wchar_t *iter; 1436 Py_UCS4 ch; 1437 1438 assert(num_surrogates != NULL && maxchar != NULL); 1439 *num_surrogates = 0; 1440 *maxchar = 0; 1441 1442 for (iter = begin; iter < end; ) { 1443#if SIZEOF_WCHAR_T == 2 1444 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0]) 1445 && (iter+1) < end 1446 && Py_UNICODE_IS_LOW_SURROGATE(iter[1])) 1447 { 1448 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]); 1449 ++(*num_surrogates); 1450 iter += 2; 1451 } 1452 else 1453#endif 1454 { 1455 ch = *iter; 1456 iter++; 1457 } 1458 if (ch > *maxchar) { 1459 *maxchar = ch; 1460 if (*maxchar > MAX_UNICODE) { 1461 PyErr_Format(PyExc_ValueError, 1462 "character U+%x is not in range [U+0000; U+10ffff]", 1463 ch); 1464 return -1; 1465 } 1466 } 1467 } 1468 return 0; 1469} 1470 1471int 1472_PyUnicode_Ready(PyObject *unicode) 1473{ 1474 wchar_t *end; 1475 Py_UCS4 maxchar = 0; 1476 Py_ssize_t num_surrogates; 1477#if SIZEOF_WCHAR_T == 2 1478 Py_ssize_t length_wo_surrogates; 1479#endif 1480 1481 /* _PyUnicode_Ready() is only intended for old-style API usage where 1482 strings were created using _PyObject_New() and where no canonical 1483 representation (the str field) has been set yet aka strings 1484 which are not yet ready. 
*/ 1485 assert(_PyUnicode_CHECK(unicode)); 1486 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND); 1487 assert(_PyUnicode_WSTR(unicode) != NULL); 1488 assert(_PyUnicode_DATA_ANY(unicode) == NULL); 1489 assert(_PyUnicode_UTF8(unicode) == NULL); 1490 /* Actually, it should neither be interned nor be anything else: */ 1491 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED); 1492 1493 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode); 1494 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end, 1495 &maxchar, &num_surrogates) == -1) 1496 return -1; 1497 1498 if (maxchar < 256) { 1499 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1); 1500 if (!_PyUnicode_DATA_ANY(unicode)) { 1501 PyErr_NoMemory(); 1502 return -1; 1503 } 1504 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char, 1505 _PyUnicode_WSTR(unicode), end, 1506 PyUnicode_1BYTE_DATA(unicode)); 1507 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0'; 1508 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode); 1509 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND; 1510 if (maxchar < 128) { 1511 _PyUnicode_STATE(unicode).ascii = 1; 1512 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode); 1513 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode); 1514 } 1515 else { 1516 _PyUnicode_STATE(unicode).ascii = 0; 1517 _PyUnicode_UTF8(unicode) = NULL; 1518 _PyUnicode_UTF8_LENGTH(unicode) = 0; 1519 } 1520 PyObject_FREE(_PyUnicode_WSTR(unicode)); 1521 _PyUnicode_WSTR(unicode) = NULL; 1522 _PyUnicode_WSTR_LENGTH(unicode) = 0; 1523 } 1524 /* In this case we might have to convert down from 4-byte native 1525 wchar_t to 2-byte unicode. */ 1526 else if (maxchar < 65536) { 1527 assert(num_surrogates == 0 && 1528 "FindMaxCharAndNumSurrogatePairs() messed up"); 1529 1530#if SIZEOF_WCHAR_T == 2 1531 /* We can share representations and are done. 
*/ 1532 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode); 1533 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0'; 1534 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode); 1535 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND; 1536 _PyUnicode_UTF8(unicode) = NULL; 1537 _PyUnicode_UTF8_LENGTH(unicode) = 0; 1538#else 1539 /* sizeof(wchar_t) == 4 */ 1540 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC( 1541 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1)); 1542 if (!_PyUnicode_DATA_ANY(unicode)) { 1543 PyErr_NoMemory(); 1544 return -1; 1545 } 1546 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2, 1547 _PyUnicode_WSTR(unicode), end, 1548 PyUnicode_2BYTE_DATA(unicode)); 1549 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0'; 1550 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode); 1551 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND; 1552 _PyUnicode_UTF8(unicode) = NULL; 1553 _PyUnicode_UTF8_LENGTH(unicode) = 0; 1554 PyObject_FREE(_PyUnicode_WSTR(unicode)); 1555 _PyUnicode_WSTR(unicode) = NULL; 1556 _PyUnicode_WSTR_LENGTH(unicode) = 0; 1557#endif 1558 } 1559 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */ 1560 else { 1561#if SIZEOF_WCHAR_T == 2 1562 /* in case the native representation is 2-bytes, we need to allocate a 1563 new normalized 4-byte version. 
*/ 1564 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates; 1565 if (length_wo_surrogates > PY_SSIZE_T_MAX / 4 - 1) { 1566 PyErr_NoMemory(); 1567 return -1; 1568 } 1569 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1)); 1570 if (!_PyUnicode_DATA_ANY(unicode)) { 1571 PyErr_NoMemory(); 1572 return -1; 1573 } 1574 _PyUnicode_LENGTH(unicode) = length_wo_surrogates; 1575 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND; 1576 _PyUnicode_UTF8(unicode) = NULL; 1577 _PyUnicode_UTF8_LENGTH(unicode) = 0; 1578 /* unicode_convert_wchar_to_ucs4() requires a ready string */ 1579 _PyUnicode_STATE(unicode).ready = 1; 1580 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode); 1581 PyObject_FREE(_PyUnicode_WSTR(unicode)); 1582 _PyUnicode_WSTR(unicode) = NULL; 1583 _PyUnicode_WSTR_LENGTH(unicode) = 0; 1584#else 1585 assert(num_surrogates == 0); 1586 1587 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode); 1588 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode); 1589 _PyUnicode_UTF8(unicode) = NULL; 1590 _PyUnicode_UTF8_LENGTH(unicode) = 0; 1591 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND; 1592#endif 1593 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0'; 1594 } 1595 _PyUnicode_STATE(unicode).ready = 1; 1596 assert(_PyUnicode_CheckConsistency(unicode, 1)); 1597 return 0; 1598} 1599 1600static void 1601unicode_dealloc(PyObject *unicode) 1602{ 1603 switch (PyUnicode_CHECK_INTERNED(unicode)) { 1604 case SSTATE_NOT_INTERNED: 1605 break; 1606 1607 case SSTATE_INTERNED_MORTAL: 1608 /* revive dead object temporarily for DelItem */ 1609 Py_REFCNT(unicode) = 3; 1610 if (PyDict_DelItem(interned, unicode) != 0) 1611 Py_FatalError( 1612 "deletion of interned string failed"); 1613 break; 1614 1615 case SSTATE_INTERNED_IMMORTAL: 1616 Py_FatalError("Immortal interned string died."); 1617 1618 default: 1619 Py_FatalError("Inconsistent interned string state."); 1620 } 1621 1622 if 
(_PyUnicode_HAS_WSTR_MEMORY(unicode)) 1623 PyObject_DEL(_PyUnicode_WSTR(unicode)); 1624 if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) 1625 PyObject_DEL(_PyUnicode_UTF8(unicode)); 1626 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode)) 1627 PyObject_DEL(_PyUnicode_DATA_ANY(unicode)); 1628 1629 Py_TYPE(unicode)->tp_free(unicode); 1630} 1631 1632#ifdef Py_DEBUG 1633static int 1634unicode_is_singleton(PyObject *unicode) 1635{ 1636 PyASCIIObject *ascii = (PyASCIIObject *)unicode; 1637 if (unicode == unicode_empty) 1638 return 1; 1639 if (ascii->state.kind != PyUnicode_WCHAR_KIND && ascii->length == 1) 1640 { 1641 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0); 1642 if (ch < 256 && unicode_latin1[ch] == unicode) 1643 return 1; 1644 } 1645 return 0; 1646} 1647#endif 1648 1649static int 1650unicode_modifiable(PyObject *unicode) 1651{ 1652 assert(_PyUnicode_CHECK(unicode)); 1653 if (Py_REFCNT(unicode) != 1) 1654 return 0; 1655 if (_PyUnicode_HASH(unicode) != -1) 1656 return 0; 1657 if (PyUnicode_CHECK_INTERNED(unicode)) 1658 return 0; 1659 if (!PyUnicode_CheckExact(unicode)) 1660 return 0; 1661#ifdef Py_DEBUG 1662 /* singleton refcount is greater than 1 */ 1663 assert(!unicode_is_singleton(unicode)); 1664#endif 1665 return 1; 1666} 1667 1668static int 1669unicode_resize(PyObject **p_unicode, Py_ssize_t length) 1670{ 1671 PyObject *unicode; 1672 Py_ssize_t old_length; 1673 1674 assert(p_unicode != NULL); 1675 unicode = *p_unicode; 1676 1677 assert(unicode != NULL); 1678 assert(PyUnicode_Check(unicode)); 1679 assert(0 <= length); 1680 1681 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND) 1682 old_length = PyUnicode_WSTR_LENGTH(unicode); 1683 else 1684 old_length = PyUnicode_GET_LENGTH(unicode); 1685 if (old_length == length) 1686 return 0; 1687 1688 if (length == 0) { 1689 _Py_INCREF_UNICODE_EMPTY(); 1690 if (!unicode_empty) 1691 return -1; 1692 Py_DECREF(*p_unicode); 1693 *p_unicode = unicode_empty; 1694 return 0; 1695 } 1696 1697 if 
(!unicode_modifiable(unicode)) { 1698 PyObject *copy = resize_copy(unicode, length); 1699 if (copy == NULL) 1700 return -1; 1701 Py_DECREF(*p_unicode); 1702 *p_unicode = copy; 1703 return 0; 1704 } 1705 1706 if (PyUnicode_IS_COMPACT(unicode)) { 1707 PyObject *new_unicode = resize_compact(unicode, length); 1708 if (new_unicode == NULL) 1709 return -1; 1710 *p_unicode = new_unicode; 1711 return 0; 1712 } 1713 return resize_inplace(unicode, length); 1714} 1715 1716int 1717PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length) 1718{ 1719 PyObject *unicode; 1720 if (p_unicode == NULL) { 1721 PyErr_BadInternalCall(); 1722 return -1; 1723 } 1724 unicode = *p_unicode; 1725 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0) 1726 { 1727 PyErr_BadInternalCall(); 1728 return -1; 1729 } 1730 return unicode_resize(p_unicode, length); 1731} 1732 1733/* Copy a ASCII or latin1 char* string into a Python Unicode string. 1734 1735 WARNING: The function doesn't copy the terminating null character and 1736 doesn't check the maximum character (may write a latin1 character in an 1737 ASCII string). 
*/ 1738static void 1739unicode_write_cstr(PyObject *unicode, Py_ssize_t index, 1740 const char *str, Py_ssize_t len) 1741{ 1742 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode); 1743 void *data = PyUnicode_DATA(unicode); 1744 const char *end = str + len; 1745 1746 switch (kind) { 1747 case PyUnicode_1BYTE_KIND: { 1748 assert(index + len <= PyUnicode_GET_LENGTH(unicode)); 1749#ifdef Py_DEBUG 1750 if (PyUnicode_IS_ASCII(unicode)) { 1751 Py_UCS4 maxchar = ucs1lib_find_max_char( 1752 (const Py_UCS1*)str, 1753 (const Py_UCS1*)str + len); 1754 assert(maxchar < 128); 1755 } 1756#endif 1757 memcpy((char *) data + index, str, len); 1758 break; 1759 } 1760 case PyUnicode_2BYTE_KIND: { 1761 Py_UCS2 *start = (Py_UCS2 *)data + index; 1762 Py_UCS2 *ucs2 = start; 1763 assert(index <= PyUnicode_GET_LENGTH(unicode)); 1764 1765 for (; str < end; ++ucs2, ++str) 1766 *ucs2 = (Py_UCS2)*str; 1767 1768 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode)); 1769 break; 1770 } 1771 default: { 1772 Py_UCS4 *start = (Py_UCS4 *)data + index; 1773 Py_UCS4 *ucs4 = start; 1774 assert(kind == PyUnicode_4BYTE_KIND); 1775 assert(index <= PyUnicode_GET_LENGTH(unicode)); 1776 1777 for (; str < end; ++ucs4, ++str) 1778 *ucs4 = (Py_UCS4)*str; 1779 1780 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode)); 1781 } 1782 } 1783} 1784 1785static PyObject* 1786get_latin1_char(unsigned char ch) 1787{ 1788 PyObject *unicode = unicode_latin1[ch]; 1789 if (!unicode) { 1790 unicode = PyUnicode_New(1, ch); 1791 if (!unicode) 1792 return NULL; 1793 PyUnicode_1BYTE_DATA(unicode)[0] = ch; 1794 assert(_PyUnicode_CheckConsistency(unicode, 1)); 1795 unicode_latin1[ch] = unicode; 1796 } 1797 Py_INCREF(unicode); 1798 return unicode; 1799} 1800 1801static PyObject* 1802unicode_char(Py_UCS4 ch) 1803{ 1804 PyObject *unicode; 1805 1806 assert(ch <= MAX_UNICODE); 1807 1808 if (ch < 256) 1809 return get_latin1_char(ch); 1810 1811 unicode = PyUnicode_New(1, ch); 1812 if (unicode == NULL) 1813 return NULL; 1814 switch 
(PyUnicode_KIND(unicode)) { 1815 case PyUnicode_1BYTE_KIND: 1816 PyUnicode_1BYTE_DATA(unicode)[0] = (Py_UCS1)ch; 1817 break; 1818 case PyUnicode_2BYTE_KIND: 1819 PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch; 1820 break; 1821 default: 1822 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND); 1823 PyUnicode_4BYTE_DATA(unicode)[0] = ch; 1824 } 1825 assert(_PyUnicode_CheckConsistency(unicode, 1)); 1826 return unicode; 1827} 1828 1829PyObject * 1830PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size) 1831{ 1832 PyObject *unicode; 1833 Py_UCS4 maxchar = 0; 1834 Py_ssize_t num_surrogates; 1835 1836 if (u == NULL) 1837 return (PyObject*)_PyUnicode_New(size); 1838 1839 /* If the Unicode data is known at construction time, we can apply 1840 some optimizations which share commonly used objects. */ 1841 1842 /* Optimization for empty strings */ 1843 if (size == 0) 1844 _Py_RETURN_UNICODE_EMPTY(); 1845 1846 /* Single character Unicode objects in the Latin-1 range are 1847 shared when using this constructor */ 1848 if (size == 1 && (Py_UCS4)*u < 256) 1849 return get_latin1_char((unsigned char)*u); 1850 1851 /* If not empty and not single character, copy the Unicode data 1852 into the new object */ 1853 if (find_maxchar_surrogates(u, u + size, 1854 &maxchar, &num_surrogates) == -1) 1855 return NULL; 1856 1857 unicode = PyUnicode_New(size - num_surrogates, maxchar); 1858 if (!unicode) 1859 return NULL; 1860 1861 switch (PyUnicode_KIND(unicode)) { 1862 case PyUnicode_1BYTE_KIND: 1863 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char, 1864 u, u + size, PyUnicode_1BYTE_DATA(unicode)); 1865 break; 1866 case PyUnicode_2BYTE_KIND: 1867#if Py_UNICODE_SIZE == 2 1868 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2); 1869#else 1870 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2, 1871 u, u + size, PyUnicode_2BYTE_DATA(unicode)); 1872#endif 1873 break; 1874 case PyUnicode_4BYTE_KIND: 1875#if SIZEOF_WCHAR_T == 2 1876 /* This is the only case which has to process surrogates, 
thus 1877 a simple copy loop is not enough and we need a function. */ 1878 unicode_convert_wchar_to_ucs4(u, u + size, unicode); 1879#else 1880 assert(num_surrogates == 0); 1881 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4); 1882#endif 1883 break; 1884 default: 1885 assert(0 && "Impossible state"); 1886 } 1887 1888 return unicode_result(unicode); 1889} 1890 1891PyObject * 1892PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size) 1893{ 1894 if (size < 0) { 1895 PyErr_SetString(PyExc_SystemError, 1896 "Negative size passed to PyUnicode_FromStringAndSize"); 1897 return NULL; 1898 } 1899 if (u != NULL) 1900 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL); 1901 else 1902 return (PyObject *)_PyUnicode_New(size); 1903} 1904 1905PyObject * 1906PyUnicode_FromString(const char *u) 1907{ 1908 size_t size = strlen(u); 1909 if (size > PY_SSIZE_T_MAX) { 1910 PyErr_SetString(PyExc_OverflowError, "input too long"); 1911 return NULL; 1912 } 1913 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL); 1914} 1915 1916PyObject * 1917_PyUnicode_FromId(_Py_Identifier *id) 1918{ 1919 if (!id->object) { 1920 id->object = PyUnicode_DecodeUTF8Stateful(id->string, 1921 strlen(id->string), 1922 NULL, NULL); 1923 if (!id->object) 1924 return NULL; 1925 PyUnicode_InternInPlace(&id->object); 1926 assert(!id->next); 1927 id->next = static_strings; 1928 static_strings = id; 1929 } 1930 return id->object; 1931} 1932 1933void 1934_PyUnicode_ClearStaticStrings() 1935{ 1936 _Py_Identifier *tmp, *s = static_strings; 1937 while (s) { 1938 Py_CLEAR(s->object); 1939 tmp = s->next; 1940 s->next = NULL; 1941 s = tmp; 1942 } 1943 static_strings = NULL; 1944} 1945 1946/* Internal function, doesn't check maximum character */ 1947 1948PyObject* 1949_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size) 1950{ 1951 const unsigned char *s = (const unsigned char *)buffer; 1952 PyObject *unicode; 1953 if (size == 1) { 1954#ifdef Py_DEBUG 1955 assert((unsigned char)s[0] < 128); 
1956#endif 1957 return get_latin1_char(s[0]); 1958 } 1959 unicode = PyUnicode_New(size, 127); 1960 if (!unicode) 1961 return NULL; 1962 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size); 1963 assert(_PyUnicode_CheckConsistency(unicode, 1)); 1964 return unicode; 1965} 1966 1967static Py_UCS4 1968kind_maxchar_limit(unsigned int kind) 1969{ 1970 switch (kind) { 1971 case PyUnicode_1BYTE_KIND: 1972 return 0x80; 1973 case PyUnicode_2BYTE_KIND: 1974 return 0x100; 1975 case PyUnicode_4BYTE_KIND: 1976 return 0x10000; 1977 default: 1978 assert(0 && "invalid kind"); 1979 return MAX_UNICODE; 1980 } 1981} 1982 1983Py_LOCAL_INLINE(Py_UCS4) 1984align_maxchar(Py_UCS4 maxchar) 1985{ 1986 if (maxchar <= 127) 1987 return 127; 1988 else if (maxchar <= 255) 1989 return 255; 1990 else if (maxchar <= 65535) 1991 return 65535; 1992 else 1993 return MAX_UNICODE; 1994} 1995 1996static PyObject* 1997_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size) 1998{ 1999 PyObject *res; 2000 unsigned char max_char; 2001 2002 if (size == 0) 2003 _Py_RETURN_UNICODE_EMPTY(); 2004 assert(size > 0); 2005 if (size == 1) 2006 return get_latin1_char(u[0]); 2007 2008 max_char = ucs1lib_find_max_char(u, u + size); 2009 res = PyUnicode_New(size, max_char); 2010 if (!res) 2011 return NULL; 2012 memcpy(PyUnicode_1BYTE_DATA(res), u, size); 2013 assert(_PyUnicode_CheckConsistency(res, 1)); 2014 return res; 2015} 2016 2017static PyObject* 2018_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size) 2019{ 2020 PyObject *res; 2021 Py_UCS2 max_char; 2022 2023 if (size == 0) 2024 _Py_RETURN_UNICODE_EMPTY(); 2025 assert(size > 0); 2026 if (size == 1) 2027 return unicode_char(u[0]); 2028 2029 max_char = ucs2lib_find_max_char(u, u + size); 2030 res = PyUnicode_New(size, max_char); 2031 if (!res) 2032 return NULL; 2033 if (max_char >= 256) 2034 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size); 2035 else { 2036 _PyUnicode_CONVERT_BYTES( 2037 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res)); 2038 } 2039 
assert(_PyUnicode_CheckConsistency(res, 1)); 2040 return res; 2041} 2042 2043static PyObject* 2044_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size) 2045{ 2046 PyObject *res; 2047 Py_UCS4 max_char; 2048 2049 if (size == 0) 2050 _Py_RETURN_UNICODE_EMPTY(); 2051 assert(size > 0); 2052 if (size == 1) 2053 return unicode_char(u[0]); 2054 2055 max_char = ucs4lib_find_max_char(u, u + size); 2056 res = PyUnicode_New(size, max_char); 2057 if (!res) 2058 return NULL; 2059 if (max_char < 256) 2060 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size, 2061 PyUnicode_1BYTE_DATA(res)); 2062 else if (max_char < 0x10000) 2063 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size, 2064 PyUnicode_2BYTE_DATA(res)); 2065 else 2066 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size); 2067 assert(_PyUnicode_CheckConsistency(res, 1)); 2068 return res; 2069} 2070 2071PyObject* 2072PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size) 2073{ 2074 if (size < 0) { 2075 PyErr_SetString(PyExc_ValueError, "size must be positive"); 2076 return NULL; 2077 } 2078 switch (kind) { 2079 case PyUnicode_1BYTE_KIND: 2080 return _PyUnicode_FromUCS1(buffer, size); 2081 case PyUnicode_2BYTE_KIND: 2082 return _PyUnicode_FromUCS2(buffer, size); 2083 case PyUnicode_4BYTE_KIND: 2084 return _PyUnicode_FromUCS4(buffer, size); 2085 default: 2086 PyErr_SetString(PyExc_SystemError, "invalid kind"); 2087 return NULL; 2088 } 2089} 2090 2091Py_UCS4 2092_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end) 2093{ 2094 enum PyUnicode_Kind kind; 2095 void *startptr, *endptr; 2096 2097 assert(PyUnicode_IS_READY(unicode)); 2098 assert(0 <= start); 2099 assert(end <= PyUnicode_GET_LENGTH(unicode)); 2100 assert(start <= end); 2101 2102 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode)) 2103 return PyUnicode_MAX_CHAR_VALUE(unicode); 2104 2105 if (start == end) 2106 return 127; 2107 2108 if (PyUnicode_IS_ASCII(unicode)) 2109 return 127; 2110 2111 kind = 
PyUnicode_KIND(unicode); 2112 startptr = PyUnicode_DATA(unicode); 2113 endptr = (char *)startptr + end * kind; 2114 startptr = (char *)startptr + start * kind; 2115 switch(kind) { 2116 case PyUnicode_1BYTE_KIND: 2117 return ucs1lib_find_max_char(startptr, endptr); 2118 case PyUnicode_2BYTE_KIND: 2119 return ucs2lib_find_max_char(startptr, endptr); 2120 case PyUnicode_4BYTE_KIND: 2121 return ucs4lib_find_max_char(startptr, endptr); 2122 default: 2123 assert(0); 2124 return 0; 2125 } 2126} 2127 2128/* Ensure that a string uses the most efficient storage, if it is not the 2129 case: create a new string with of the right kind. Write NULL into *p_unicode 2130 on error. */ 2131static void 2132unicode_adjust_maxchar(PyObject **p_unicode) 2133{ 2134 PyObject *unicode, *copy; 2135 Py_UCS4 max_char; 2136 Py_ssize_t len; 2137 unsigned int kind; 2138 2139 assert(p_unicode != NULL); 2140 unicode = *p_unicode; 2141 assert(PyUnicode_IS_READY(unicode)); 2142 if (PyUnicode_IS_ASCII(unicode)) 2143 return; 2144 2145 len = PyUnicode_GET_LENGTH(unicode); 2146 kind = PyUnicode_KIND(unicode); 2147 if (kind == PyUnicode_1BYTE_KIND) { 2148 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode); 2149 max_char = ucs1lib_find_max_char(u, u + len); 2150 if (max_char >= 128) 2151 return; 2152 } 2153 else if (kind == PyUnicode_2BYTE_KIND) { 2154 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode); 2155 max_char = ucs2lib_find_max_char(u, u + len); 2156 if (max_char >= 256) 2157 return; 2158 } 2159 else { 2160 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode); 2161 assert(kind == PyUnicode_4BYTE_KIND); 2162 max_char = ucs4lib_find_max_char(u, u + len); 2163 if (max_char >= 0x10000) 2164 return; 2165 } 2166 copy = PyUnicode_New(len, max_char); 2167 if (copy != NULL) 2168 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len); 2169 Py_DECREF(unicode); 2170 *p_unicode = copy; 2171} 2172 2173PyObject* 2174_PyUnicode_Copy(PyObject *unicode) 2175{ 2176 Py_ssize_t length; 2177 PyObject *copy; 2178 2179 if 
(!PyUnicode_Check(unicode)) { 2180 PyErr_BadInternalCall(); 2181 return NULL; 2182 } 2183 if (PyUnicode_READY(unicode) == -1) 2184 return NULL; 2185 2186 length = PyUnicode_GET_LENGTH(unicode); 2187 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode)); 2188 if (!copy) 2189 return NULL; 2190 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode)); 2191 2192 Py_MEMCPY(PyUnicode_DATA(copy), PyUnicode_DATA(unicode), 2193 length * PyUnicode_KIND(unicode)); 2194 assert(_PyUnicode_CheckConsistency(copy, 1)); 2195 return copy; 2196} 2197 2198 2199/* Widen Unicode objects to larger buffers. Don't write terminating null 2200 character. Return NULL on error. */ 2201 2202void* 2203_PyUnicode_AsKind(PyObject *s, unsigned int kind) 2204{ 2205 Py_ssize_t len; 2206 void *result; 2207 unsigned int skind; 2208 2209 if (PyUnicode_READY(s) == -1) 2210 return NULL; 2211 2212 len = PyUnicode_GET_LENGTH(s); 2213 skind = PyUnicode_KIND(s); 2214 if (skind >= kind) { 2215 PyErr_SetString(PyExc_SystemError, "invalid widening attempt"); 2216 return NULL; 2217 } 2218 switch (kind) { 2219 case PyUnicode_2BYTE_KIND: 2220 result = PyMem_New(Py_UCS2, len); 2221 if (!result) 2222 return PyErr_NoMemory(); 2223 assert(skind == PyUnicode_1BYTE_KIND); 2224 _PyUnicode_CONVERT_BYTES( 2225 Py_UCS1, Py_UCS2, 2226 PyUnicode_1BYTE_DATA(s), 2227 PyUnicode_1BYTE_DATA(s) + len, 2228 result); 2229 return result; 2230 case PyUnicode_4BYTE_KIND: 2231 result = PyMem_New(Py_UCS4, len); 2232 if (!result) 2233 return PyErr_NoMemory(); 2234 if (skind == PyUnicode_2BYTE_KIND) { 2235 _PyUnicode_CONVERT_BYTES( 2236 Py_UCS2, Py_UCS4, 2237 PyUnicode_2BYTE_DATA(s), 2238 PyUnicode_2BYTE_DATA(s) + len, 2239 result); 2240 } 2241 else { 2242 assert(skind == PyUnicode_1BYTE_KIND); 2243 _PyUnicode_CONVERT_BYTES( 2244 Py_UCS1, Py_UCS4, 2245 PyUnicode_1BYTE_DATA(s), 2246 PyUnicode_1BYTE_DATA(s) + len, 2247 result); 2248 } 2249 return result; 2250 default: 2251 break; 2252 } 2253 PyErr_SetString(PyExc_SystemError, "invalid 
kind"); 2254 return NULL; 2255} 2256 2257static Py_UCS4* 2258as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize, 2259 int copy_null) 2260{ 2261 int kind; 2262 void *data; 2263 Py_ssize_t len, targetlen; 2264 if (PyUnicode_READY(string) == -1) 2265 return NULL; 2266 kind = PyUnicode_KIND(string); 2267 data = PyUnicode_DATA(string); 2268 len = PyUnicode_GET_LENGTH(string); 2269 targetlen = len; 2270 if (copy_null) 2271 targetlen++; 2272 if (!target) { 2273 target = PyMem_New(Py_UCS4, targetlen); 2274 if (!target) { 2275 PyErr_NoMemory(); 2276 return NULL; 2277 } 2278 } 2279 else { 2280 if (targetsize < targetlen) { 2281 PyErr_Format(PyExc_SystemError, 2282 "string is longer than the buffer"); 2283 if (copy_null && 0 < targetsize) 2284 target[0] = 0; 2285 return NULL; 2286 } 2287 } 2288 if (kind == PyUnicode_1BYTE_KIND) { 2289 Py_UCS1 *start = (Py_UCS1 *) data; 2290 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target); 2291 } 2292 else if (kind == PyUnicode_2BYTE_KIND) { 2293 Py_UCS2 *start = (Py_UCS2 *) data; 2294 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target); 2295 } 2296 else { 2297 assert(kind == PyUnicode_4BYTE_KIND); 2298 Py_MEMCPY(target, data, len * sizeof(Py_UCS4)); 2299 } 2300 if (copy_null) 2301 target[len] = 0; 2302 return target; 2303} 2304 2305Py_UCS4* 2306PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize, 2307 int copy_null) 2308{ 2309 if (target == NULL || targetsize < 0) { 2310 PyErr_BadInternalCall(); 2311 return NULL; 2312 } 2313 return as_ucs4(string, target, targetsize, copy_null); 2314} 2315 2316Py_UCS4* 2317PyUnicode_AsUCS4Copy(PyObject *string) 2318{ 2319 return as_ucs4(string, NULL, 0, 1); 2320} 2321 2322#ifdef HAVE_WCHAR_H 2323 2324PyObject * 2325PyUnicode_FromWideChar(const wchar_t *w, Py_ssize_t size) 2326{ 2327 if (w == NULL) { 2328 if (size == 0) 2329 _Py_RETURN_UNICODE_EMPTY(); 2330 PyErr_BadInternalCall(); 2331 return NULL; 2332 } 2333 2334 if (size == -1) { 
2335 size = wcslen(w); 2336 } 2337 2338 return PyUnicode_FromUnicode(w, size); 2339} 2340 2341#endif /* HAVE_WCHAR_H */ 2342 2343/* maximum number of characters required for output of %lld or %p. 2344 We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits, 2345 plus 1 for the sign. 53/22 is an upper bound for log10(256). */ 2346#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22) 2347 2348static int 2349unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str, 2350 Py_ssize_t width, Py_ssize_t precision) 2351{ 2352 Py_ssize_t length, fill, arglen; 2353 Py_UCS4 maxchar; 2354 2355 if (PyUnicode_READY(str) == -1) 2356 return -1; 2357 2358 length = PyUnicode_GET_LENGTH(str); 2359 if ((precision == -1 || precision >= length) 2360 && width <= length) 2361 return _PyUnicodeWriter_WriteStr(writer, str); 2362 2363 if (precision != -1) 2364 length = Py_MIN(precision, length); 2365 2366 arglen = Py_MAX(length, width); 2367 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar) 2368 maxchar = _PyUnicode_FindMaxChar(str, 0, length); 2369 else 2370 maxchar = writer->maxchar; 2371 2372 if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1) 2373 return -1; 2374 2375 if (width > length) { 2376 fill = width - length; 2377 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1) 2378 return -1; 2379 writer->pos += fill; 2380 } 2381 2382 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos, 2383 str, 0, length); 2384 writer->pos += length; 2385 return 0; 2386} 2387 2388static int 2389unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str, 2390 Py_ssize_t width, Py_ssize_t precision) 2391{ 2392 /* UTF-8 */ 2393 Py_ssize_t length; 2394 PyObject *unicode; 2395 int res; 2396 2397 length = strlen(str); 2398 if (precision != -1) 2399 length = Py_MIN(length, precision); 2400 unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL); 2401 if (unicode == NULL) 2402 return -1; 2403 2404 res = 
unicode_fromformat_write_str(writer, unicode, width, -1); 2405 Py_DECREF(unicode); 2406 return res; 2407} 2408 2409static const char* 2410unicode_fromformat_arg(_PyUnicodeWriter *writer, 2411 const char *f, va_list *vargs) 2412{ 2413 const char *p; 2414 Py_ssize_t len; 2415 int zeropad; 2416 Py_ssize_t width; 2417 Py_ssize_t precision; 2418 int longflag; 2419 int longlongflag; 2420 int size_tflag; 2421 Py_ssize_t fill; 2422 2423 p = f; 2424 f++; 2425 zeropad = 0; 2426 if (*f == '0') { 2427 zeropad = 1; 2428 f++; 2429 } 2430 2431 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */ 2432 width = -1; 2433 if (Py_ISDIGIT((unsigned)*f)) { 2434 width = *f - '0'; 2435 f++; 2436 while (Py_ISDIGIT((unsigned)*f)) { 2437 if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) { 2438 PyErr_SetString(PyExc_ValueError, 2439 "width too big"); 2440 return NULL; 2441 } 2442 width = (width * 10) + (*f - '0'); 2443 f++; 2444 } 2445 } 2446 precision = -1; 2447 if (*f == '.') { 2448 f++; 2449 if (Py_ISDIGIT((unsigned)*f)) { 2450 precision = (*f - '0'); 2451 f++; 2452 while (Py_ISDIGIT((unsigned)*f)) { 2453 if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) { 2454 PyErr_SetString(PyExc_ValueError, 2455 "precision too big"); 2456 return NULL; 2457 } 2458 precision = (precision * 10) + (*f - '0'); 2459 f++; 2460 } 2461 } 2462 if (*f == '%') { 2463 /* "%.3%s" => f points to "3" */ 2464 f--; 2465 } 2466 } 2467 if (*f == '\0') { 2468 /* bogus format "%.123" => go backward, f points to "3" */ 2469 f--; 2470 } 2471 2472 /* Handle %ld, %lu, %lld and %llu. */ 2473 longflag = 0; 2474 longlongflag = 0; 2475 size_tflag = 0; 2476 if (*f == 'l') { 2477 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') { 2478 longflag = 1; 2479 ++f; 2480 } 2481#ifdef HAVE_LONG_LONG 2482 else if (f[1] == 'l' && 2483 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) { 2484 longlongflag = 1; 2485 f += 2; 2486 } 2487#endif 2488 } 2489 /* handle the size_t flag. 
*/ 2490 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) { 2491 size_tflag = 1; 2492 ++f; 2493 } 2494 2495 if (f[1] == '\0') 2496 writer->overallocate = 0; 2497 2498 switch (*f) { 2499 case 'c': 2500 { 2501 int ordinal = va_arg(*vargs, int); 2502 if (ordinal < 0 || ordinal > MAX_UNICODE) { 2503 PyErr_SetString(PyExc_OverflowError, 2504 "character argument not in range(0x110000)"); 2505 return NULL; 2506 } 2507 if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0) 2508 return NULL; 2509 break; 2510 } 2511 2512 case 'i': 2513 case 'd': 2514 case 'u': 2515 case 'x': 2516 { 2517 /* used by sprintf */ 2518 char buffer[MAX_LONG_LONG_CHARS]; 2519 Py_ssize_t arglen; 2520 2521 if (*f == 'u') { 2522 if (longflag) 2523 len = sprintf(buffer, "%lu", 2524 va_arg(*vargs, unsigned long)); 2525#ifdef HAVE_LONG_LONG 2526 else if (longlongflag) 2527 len = sprintf(buffer, "%" PY_FORMAT_LONG_LONG "u", 2528 va_arg(*vargs, unsigned PY_LONG_LONG)); 2529#endif 2530 else if (size_tflag) 2531 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "u", 2532 va_arg(*vargs, size_t)); 2533 else 2534 len = sprintf(buffer, "%u", 2535 va_arg(*vargs, unsigned int)); 2536 } 2537 else if (*f == 'x') { 2538 len = sprintf(buffer, "%x", va_arg(*vargs, int)); 2539 } 2540 else { 2541 if (longflag) 2542 len = sprintf(buffer, "%li", 2543 va_arg(*vargs, long)); 2544#ifdef HAVE_LONG_LONG 2545 else if (longlongflag) 2546 len = sprintf(buffer, "%" PY_FORMAT_LONG_LONG "i", 2547 va_arg(*vargs, PY_LONG_LONG)); 2548#endif 2549 else if (size_tflag) 2550 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "i", 2551 va_arg(*vargs, Py_ssize_t)); 2552 else 2553 len = sprintf(buffer, "%i", 2554 va_arg(*vargs, int)); 2555 } 2556 assert(len >= 0); 2557 2558 if (precision < len) 2559 precision = len; 2560 2561 arglen = Py_MAX(precision, width); 2562 if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1) 2563 return NULL; 2564 2565 if (width > precision) { 2566 Py_UCS4 fillchar; 2567 fill = width - precision; 2568 
fillchar = zeropad?'0':' '; 2569 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1) 2570 return NULL; 2571 writer->pos += fill; 2572 } 2573 if (precision > len) { 2574 fill = precision - len; 2575 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1) 2576 return NULL; 2577 writer->pos += fill; 2578 } 2579 2580 if (_PyUnicodeWriter_WriteASCIIString(writer, buffer, len) < 0) 2581 return NULL; 2582 break; 2583 } 2584 2585 case 'p': 2586 { 2587 char number[MAX_LONG_LONG_CHARS]; 2588 2589 len = sprintf(number, "%p", va_arg(*vargs, void*)); 2590 assert(len >= 0); 2591 2592 /* %p is ill-defined: ensure leading 0x. */ 2593 if (number[1] == 'X') 2594 number[1] = 'x'; 2595 else if (number[1] != 'x') { 2596 memmove(number + 2, number, 2597 strlen(number) + 1); 2598 number[0] = '0'; 2599 number[1] = 'x'; 2600 len += 2; 2601 } 2602 2603 if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0) 2604 return NULL; 2605 break; 2606 } 2607 2608 case 's': 2609 { 2610 /* UTF-8 */ 2611 const char *s = va_arg(*vargs, const char*); 2612 if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0) 2613 return NULL; 2614 break; 2615 } 2616 2617 case 'U': 2618 { 2619 PyObject *obj = va_arg(*vargs, PyObject *); 2620 assert(obj && _PyUnicode_CHECK(obj)); 2621 2622 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1) 2623 return NULL; 2624 break; 2625 } 2626 2627 case 'V': 2628 { 2629 PyObject *obj = va_arg(*vargs, PyObject *); 2630 const char *str = va_arg(*vargs, const char *); 2631 if (obj) { 2632 assert(_PyUnicode_CHECK(obj)); 2633 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1) 2634 return NULL; 2635 } 2636 else { 2637 assert(str != NULL); 2638 if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0) 2639 return NULL; 2640 } 2641 break; 2642 } 2643 2644 case 'S': 2645 { 2646 PyObject *obj = va_arg(*vargs, PyObject *); 2647 PyObject *str; 2648 assert(obj); 2649 str = PyObject_Str(obj); 2650 
if (!str) 2651 return NULL; 2652 if (unicode_fromformat_write_str(writer, str, width, precision) == -1) { 2653 Py_DECREF(str); 2654 return NULL; 2655 } 2656 Py_DECREF(str); 2657 break; 2658 } 2659 2660 case 'R': 2661 { 2662 PyObject *obj = va_arg(*vargs, PyObject *); 2663 PyObject *repr; 2664 assert(obj); 2665 repr = PyObject_Repr(obj); 2666 if (!repr) 2667 return NULL; 2668 if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) { 2669 Py_DECREF(repr); 2670 return NULL; 2671 } 2672 Py_DECREF(repr); 2673 break; 2674 } 2675 2676 case 'A': 2677 { 2678 PyObject *obj = va_arg(*vargs, PyObject *); 2679 PyObject *ascii; 2680 assert(obj); 2681 ascii = PyObject_ASCII(obj); 2682 if (!ascii) 2683 return NULL; 2684 if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) { 2685 Py_DECREF(ascii); 2686 return NULL; 2687 } 2688 Py_DECREF(ascii); 2689 break; 2690 } 2691 2692 case '%': 2693 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0) 2694 return NULL; 2695 break; 2696 2697 default: 2698 /* if we stumble upon an unknown formatting code, copy the rest 2699 of the format string to the output string. (we cannot just 2700 skip the code, since there's no way to know what's in the 2701 argument list) */ 2702 len = strlen(p); 2703 if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1) 2704 return NULL; 2705 f = p+len; 2706 return f; 2707 } 2708 2709 f++; 2710 return f; 2711} 2712 2713PyObject * 2714PyUnicode_FromFormatV(const char *format, va_list vargs) 2715{ 2716 va_list vargs2; 2717 const char *f; 2718 _PyUnicodeWriter writer; 2719 2720 _PyUnicodeWriter_Init(&writer); 2721 writer.min_length = strlen(format) + 100; 2722 writer.overallocate = 1; 2723 2724 /* va_list may be an array (of 1 item) on some platforms (ex: AMD64). 2725 Copy it to be able to pass a reference to a subfunction. 
*/ 2726 Py_VA_COPY(vargs2, vargs); 2727 2728 for (f = format; *f; ) { 2729 if (*f == '%') { 2730 f = unicode_fromformat_arg(&writer, f, &vargs2); 2731 if (f == NULL) 2732 goto fail; 2733 } 2734 else { 2735 const char *p; 2736 Py_ssize_t len; 2737 2738 p = f; 2739 do 2740 { 2741 if ((unsigned char)*p > 127) { 2742 PyErr_Format(PyExc_ValueError, 2743 "PyUnicode_FromFormatV() expects an ASCII-encoded format " 2744 "string, got a non-ASCII byte: 0x%02x", 2745 (unsigned char)*p); 2746 return NULL; 2747 } 2748 p++; 2749 } 2750 while (*p != '\0' && *p != '%'); 2751 len = p - f; 2752 2753 if (*p == '\0') 2754 writer.overallocate = 0; 2755 2756 if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0) 2757 goto fail; 2758 2759 f = p; 2760 } 2761 } 2762 return _PyUnicodeWriter_Finish(&writer); 2763 2764 fail: 2765 _PyUnicodeWriter_Dealloc(&writer); 2766 return NULL; 2767} 2768 2769PyObject * 2770PyUnicode_FromFormat(const char *format, ...) 2771{ 2772 PyObject* ret; 2773 va_list vargs; 2774 2775#ifdef HAVE_STDARG_PROTOTYPES 2776 va_start(vargs, format); 2777#else 2778 va_start(vargs); 2779#endif 2780 ret = PyUnicode_FromFormatV(format, vargs); 2781 va_end(vargs); 2782 return ret; 2783} 2784 2785#ifdef HAVE_WCHAR_H 2786 2787/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString(): 2788 convert a Unicode object to a wide character string. 2789 2790 - If w is NULL: return the number of wide characters (including the null 2791 character) required to convert the unicode object. Ignore size argument. 2792 2793 - Otherwise: return the number of wide characters (excluding the null 2794 character) written into w. Write at most size wide characters (including 2795 the null character). 
*/ 2796static Py_ssize_t 2797unicode_aswidechar(PyObject *unicode, 2798 wchar_t *w, 2799 Py_ssize_t size) 2800{ 2801 Py_ssize_t res; 2802 const wchar_t *wstr; 2803 2804 wstr = PyUnicode_AsUnicodeAndSize(unicode, &res); 2805 if (wstr == NULL) 2806 return -1; 2807 2808 if (w != NULL) { 2809 if (size > res) 2810 size = res + 1; 2811 else 2812 res = size; 2813 Py_MEMCPY(w, wstr, size * sizeof(wchar_t)); 2814 return res; 2815 } 2816 else 2817 return res + 1; 2818} 2819 2820Py_ssize_t 2821PyUnicode_AsWideChar(PyObject *unicode, 2822 wchar_t *w, 2823 Py_ssize_t size) 2824{ 2825 if (unicode == NULL) { 2826 PyErr_BadInternalCall(); 2827 return -1; 2828 } 2829 return unicode_aswidechar(unicode, w, size); 2830} 2831 2832wchar_t* 2833PyUnicode_AsWideCharString(PyObject *unicode, 2834 Py_ssize_t *size) 2835{ 2836 wchar_t* buffer; 2837 Py_ssize_t buflen; 2838 2839 if (unicode == NULL) { 2840 PyErr_BadInternalCall(); 2841 return NULL; 2842 } 2843 2844 buflen = unicode_aswidechar(unicode, NULL, 0); 2845 if (buflen == -1) 2846 return NULL; 2847 buffer = PyMem_NEW(wchar_t, buflen); 2848 if (buffer == NULL) { 2849 PyErr_NoMemory(); 2850 return NULL; 2851 } 2852 buflen = unicode_aswidechar(unicode, buffer, buflen); 2853 if (buflen == -1) { 2854 PyMem_FREE(buffer); 2855 return NULL; 2856 } 2857 if (size != NULL) 2858 *size = buflen; 2859 return buffer; 2860} 2861 2862#endif /* HAVE_WCHAR_H */ 2863 2864PyObject * 2865PyUnicode_FromOrdinal(int ordinal) 2866{ 2867 if (ordinal < 0 || ordinal > MAX_UNICODE) { 2868 PyErr_SetString(PyExc_ValueError, 2869 "chr() arg not in range(0x110000)"); 2870 return NULL; 2871 } 2872 2873 return unicode_char((Py_UCS4)ordinal); 2874} 2875 2876PyObject * 2877PyUnicode_FromObject(PyObject *obj) 2878{ 2879 /* XXX Perhaps we should make this API an alias of 2880 PyObject_Str() instead ?! 
*/ 2881 if (PyUnicode_CheckExact(obj)) { 2882 if (PyUnicode_READY(obj) == -1) 2883 return NULL; 2884 Py_INCREF(obj); 2885 return obj; 2886 } 2887 if (PyUnicode_Check(obj)) { 2888 /* For a Unicode subtype that's not a Unicode object, 2889 return a true Unicode object with the same data. */ 2890 return _PyUnicode_Copy(obj); 2891 } 2892 PyErr_Format(PyExc_TypeError, 2893 "Can't convert '%.100s' object to str implicitly", 2894 Py_TYPE(obj)->tp_name); 2895 return NULL; 2896} 2897 2898PyObject * 2899PyUnicode_FromEncodedObject(PyObject *obj, 2900 const char *encoding, 2901 const char *errors) 2902{ 2903 Py_buffer buffer; 2904 PyObject *v; 2905 2906 if (obj == NULL) { 2907 PyErr_BadInternalCall(); 2908 return NULL; 2909 } 2910 2911 /* Decoding bytes objects is the most common case and should be fast */ 2912 if (PyBytes_Check(obj)) { 2913 if (PyBytes_GET_SIZE(obj) == 0) 2914 _Py_RETURN_UNICODE_EMPTY(); 2915 v = PyUnicode_Decode( 2916 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj), 2917 encoding, errors); 2918 return v; 2919 } 2920 2921 if (PyUnicode_Check(obj)) { 2922 PyErr_SetString(PyExc_TypeError, 2923 "decoding str is not supported"); 2924 return NULL; 2925 } 2926 2927 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */ 2928 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) { 2929 PyErr_Format(PyExc_TypeError, 2930 "coercing to str: need a bytes-like object, %.80s found", 2931 Py_TYPE(obj)->tp_name); 2932 return NULL; 2933 } 2934 2935 if (buffer.len == 0) { 2936 PyBuffer_Release(&buffer); 2937 _Py_RETURN_UNICODE_EMPTY(); 2938 } 2939 2940 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors); 2941 PyBuffer_Release(&buffer); 2942 return v; 2943} 2944 2945/* Convert encoding to lower case and replace '_' with '-' in order to 2946 catch e.g. UTF_8. Return 0 on error (encoding is longer than lower_len-1), 2947 1 on success. 
*/ 2948int 2949_Py_normalize_encoding(const char *encoding, 2950 char *lower, 2951 size_t lower_len) 2952{ 2953 const char *e; 2954 char *l; 2955 char *l_end; 2956 2957 if (encoding == NULL) { 2958 /* 6 == strlen("utf-8") + 1 */ 2959 if (lower_len < 6) 2960 return 0; 2961 strcpy(lower, "utf-8"); 2962 return 1; 2963 } 2964 e = encoding; 2965 l = lower; 2966 l_end = &lower[lower_len - 1]; 2967 while (*e) { 2968 if (l == l_end) 2969 return 0; 2970 if (Py_ISUPPER(*e)) { 2971 *l++ = Py_TOLOWER(*e++); 2972 } 2973 else if (*e == '_') { 2974 *l++ = '-'; 2975 e++; 2976 } 2977 else { 2978 *l++ = *e++; 2979 } 2980 } 2981 *l = '\0'; 2982 return 1; 2983} 2984 2985PyObject * 2986PyUnicode_Decode(const char *s, 2987 Py_ssize_t size, 2988 const char *encoding, 2989 const char *errors) 2990{ 2991 PyObject *buffer = NULL, *unicode; 2992 Py_buffer info; 2993 char lower[11]; /* Enough for any encoding shortcut */ 2994 2995 /* Shortcuts for common default encodings */ 2996 if (_Py_normalize_encoding(encoding, lower, sizeof(lower))) { 2997 if ((strcmp(lower, "utf-8") == 0) || 2998 (strcmp(lower, "utf8") == 0)) 2999 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL); 3000 else if ((strcmp(lower, "latin-1") == 0) || 3001 (strcmp(lower, "latin1") == 0) || 3002 (strcmp(lower, "iso-8859-1") == 0) || 3003 (strcmp(lower, "iso8859-1") == 0)) 3004 return PyUnicode_DecodeLatin1(s, size, errors); 3005#ifdef HAVE_MBCS 3006 else if (strcmp(lower, "mbcs") == 0) 3007 return PyUnicode_DecodeMBCS(s, size, errors); 3008#endif 3009 else if (strcmp(lower, "ascii") == 0) 3010 return PyUnicode_DecodeASCII(s, size, errors); 3011 else if (strcmp(lower, "utf-16") == 0) 3012 return PyUnicode_DecodeUTF16(s, size, errors, 0); 3013 else if (strcmp(lower, "utf-32") == 0) 3014 return PyUnicode_DecodeUTF32(s, size, errors, 0); 3015 } 3016 3017 /* Decode via the codec registry */ 3018 buffer = NULL; 3019 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0) 3020 goto onError; 3021 buffer = 
PyMemoryView_FromBuffer(&info); 3022 if (buffer == NULL) 3023 goto onError; 3024 unicode = _PyCodec_DecodeText(buffer, encoding, errors); 3025 if (unicode == NULL) 3026 goto onError; 3027 if (!PyUnicode_Check(unicode)) { 3028 PyErr_Format(PyExc_TypeError, 3029 "'%.400s' decoder returned '%.400s' instead of 'str'; " 3030 "use codecs.decode() to decode to arbitrary types", 3031 encoding, 3032 Py_TYPE(unicode)->tp_name, Py_TYPE(unicode)->tp_name); 3033 Py_DECREF(unicode); 3034 goto onError; 3035 } 3036 Py_DECREF(buffer); 3037 return unicode_result(unicode); 3038 3039 onError: 3040 Py_XDECREF(buffer); 3041 return NULL; 3042} 3043 3044PyObject * 3045PyUnicode_AsDecodedObject(PyObject *unicode, 3046 const char *encoding, 3047 const char *errors) 3048{ 3049 PyObject *v; 3050 3051 if (!PyUnicode_Check(unicode)) { 3052 PyErr_BadArgument(); 3053 goto onError; 3054 } 3055 3056 if (encoding == NULL) 3057 encoding = PyUnicode_GetDefaultEncoding(); 3058 3059 /* Decode via the codec registry */ 3060 v = PyCodec_Decode(unicode, encoding, errors); 3061 if (v == NULL) 3062 goto onError; 3063 return unicode_result(v); 3064 3065 onError: 3066 return NULL; 3067} 3068 3069PyObject * 3070PyUnicode_AsDecodedUnicode(PyObject *unicode, 3071 const char *encoding, 3072 const char *errors) 3073{ 3074 PyObject *v; 3075 3076 if (!PyUnicode_Check(unicode)) { 3077 PyErr_BadArgument(); 3078 goto onError; 3079 } 3080 3081 if (encoding == NULL) 3082 encoding = PyUnicode_GetDefaultEncoding(); 3083 3084 /* Decode via the codec registry */ 3085 v = PyCodec_Decode(unicode, encoding, errors); 3086 if (v == NULL) 3087 goto onError; 3088 if (!PyUnicode_Check(v)) { 3089 PyErr_Format(PyExc_TypeError, 3090 "'%.400s' decoder returned '%.400s' instead of 'str'; " 3091 "use codecs.decode() to decode to arbitrary types", 3092 encoding, 3093 Py_TYPE(unicode)->tp_name, Py_TYPE(unicode)->tp_name); 3094 Py_DECREF(v); 3095 goto onError; 3096 } 3097 return unicode_result(v); 3098 3099 onError: 3100 return NULL; 3101} 
/* Encode the `size` Py_UNICODE characters at `s`: build a temporary str
   object from them and hand it to PyUnicode_AsEncodedString().  Return
   the encoder's result, or NULL with an exception set on error. */
PyObject *
PyUnicode_Encode(const Py_UNICODE *s,
                 Py_ssize_t size,
                 const char *encoding,
                 const char *errors)
{
    PyObject *v, *unicode;

    unicode = PyUnicode_FromUnicode(s, size);
    if (unicode == NULL)
        return NULL;
    v = PyUnicode_AsEncodedString(unicode, encoding, errors);
    Py_DECREF(unicode);
    return v;
}

/* Run the str object `unicode` through the encoder registered for
   `encoding` (the default encoding when NULL) and return whatever object
   the encoder produced, without checking its type.  Return NULL with an
   exception set on error. */
PyObject *
PyUnicode_AsEncodedObject(PyObject *unicode,
                          const char *encoding,
                          const char *errors)
{
    PyObject *v;

    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        goto onError;
    }

    if (encoding == NULL)
        encoding = PyUnicode_GetDefaultEncoding();

    /* Encode via the codec registry */
    v = PyCodec_Encode(unicode, encoding, errors);
    if (v == NULL)
        goto onError;
    return v;

  onError:
    return NULL;
}

/* Locate the first character of the null-terminated wide string `wstr`
   that wcstombs() refuses to convert, by converting one character (or,
   with a 2-byte wchar_t, one surrogate pair) at a time.  Return its index
   in `wstr`, or 0 when no such character is found. */
static size_t
wcstombs_errorpos(const wchar_t *wstr)
{
    size_t len;
    /* One character (two units for a surrogate pair) plus a terminator. */
#if SIZEOF_WCHAR_T == 2
    wchar_t buf[3];
#else
    wchar_t buf[2];
#endif
    char outbuf[MB_LEN_MAX];
    const wchar_t *start, *previous;

#if SIZEOF_WCHAR_T == 2
    buf[2] = 0;
#else
    buf[1] = 0;
#endif
    start = wstr;
    while (*wstr != L'\0')
    {
        /* Remember where this character starts in case it fails. */
        previous = wstr;
#if SIZEOF_WCHAR_T == 2
        if (Py_UNICODE_IS_HIGH_SURROGATE(wstr[0])
            && Py_UNICODE_IS_LOW_SURROGATE(wstr[1]))
        {
            buf[0] = wstr[0];
            buf[1] = wstr[1];
            wstr += 2;
        }
        else {
            buf[0] = *wstr;
            buf[1] = 0;
            wstr++;
        }
#else
        buf[0] = *wstr;
        wstr++;
#endif
        len = wcstombs(outbuf, buf, sizeof(outbuf));
        if (len == (size_t)-1)
            return previous - start;
    }

    /* failed to find the unencodable character */
    return 0;
}

/* Translate the error handler name `errors` for the locale codecs.
   Store 0 in *surrogateescape for the strict handler and 1 for the
   surrogateescape handler, and return 0.  Any other handler sets
   ValueError and returns -1. */
static int
locale_error_handler(const char *errors, int *surrogateescape)
{
    _Py_error_handler error_handler = get_error_handler(errors);
    switch (error_handler)
    {
    case _Py_ERROR_STRICT:
        *surrogateescape = 0;
        return 0;
    case _Py_ERROR_SURROGATEESCAPE:
        *surrogateescape = 1;
        return 0;
    default:
        PyErr_Format(PyExc_ValueError,
                     "only 'strict' and 'surrogateescape' error handlers "
                     "are supported, not '%s'",
                     errors);
        return -1;
    }
}

/* Encode the str object `unicode` to bytes using the current locale
   encoding.

   `errors` must select the strict or the surrogateescape handler (see
   locale_error_handler()).  Strings containing an embedded null character
   are rejected with ValueError.  On an unencodable character a
   UnicodeEncodeError is raised.  Return a new bytes object, or NULL with
   an exception set. */
PyObject *
PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
{
    Py_ssize_t wlen, wlen2;
    wchar_t *wstr;
    PyObject *bytes = NULL;
    char *errmsg;
    PyObject *reason = NULL;
    PyObject *exc;
    size_t error_pos;
    int surrogateescape;

    if (locale_error_handler(errors, &surrogateescape) < 0)
        return NULL;

    wstr = PyUnicode_AsWideCharString(unicode, &wlen);
    if (wstr == NULL)
        return NULL;

    /* wcslen() stops at the first null: a shorter length means the string
       contains an embedded null character. */
    wlen2 = wcslen(wstr);
    if (wlen2 != wlen) {
        PyMem_Free(wstr);
        PyErr_SetString(PyExc_ValueError, "embedded null character");
        return NULL;
    }

    if (surrogateescape) {
        /* "surrogateescape" error handler */
        char *str;

        str = Py_EncodeLocale(wstr, &error_pos);
        if (str == NULL) {
            /* error_pos == (size_t)-1 is treated as a memory error; any
               other value is the index of the offending character. */
            if (error_pos == (size_t)-1) {
                PyErr_NoMemory();
                PyMem_Free(wstr);
                return NULL;
            }
            else {
                goto encode_error;
            }
        }
        PyMem_Free(wstr);

        bytes = PyBytes_FromString(str);
        PyMem_Free(str);
    }
    else {
        /* strict mode */
        size_t len, len2;

        /* First call only computes the length of the encoded string. */
        len = wcstombs(NULL, wstr, 0);
        if (len == (size_t)-1) {
            error_pos = (size_t)-1;
            goto encode_error;
        }

        bytes = PyBytes_FromStringAndSize(NULL, len);
        if (bytes == NULL) {
            PyMem_Free(wstr);
            return NULL;
        }

        len2 = wcstombs(PyBytes_AS_STRING(bytes), wstr, len+1);
        if (len2 == (size_t)-1 || len2 > len) {
            error_pos = (size_t)-1;
            goto encode_error;
        }
        PyMem_Free(wstr);
    }
    return bytes;

encode_error:
    /* Reached with wstr still allocated; bytes may or may not be set. */
    errmsg = strerror(errno);
    assert(errmsg != NULL);

    /* Position not known yet: search for it one character at a time. */
    if (error_pos == (size_t)-1)
        error_pos = wcstombs_errorpos(wstr);

    PyMem_Free(wstr);
    Py_XDECREF(bytes);

    /* Build the exception reason from the C library message when it can
       be decoded, otherwise from a fixed fallback text. */
    if (errmsg != NULL) {
        size_t errlen;
        wstr = Py_DecodeLocale(errmsg, &errlen);
        if (wstr != NULL) {
            reason = PyUnicode_FromWideChar(wstr, errlen);
            PyMem_RawFree(wstr);
        } else
            errmsg = NULL;
    }
    if (errmsg == NULL)
        reason = PyUnicode_FromString(
            "wcstombs() encountered an unencodable "
            "wide character");
    if (reason == NULL)
        return NULL;

    exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnnO",
                                "locale", unicode,
                                (Py_ssize_t)error_pos,
                                (Py_ssize_t)(error_pos+1),
                                reason);
    Py_DECREF(reason);
    if (exc != NULL) {
        /* Hand the exception instance to the strict handler so that it
           becomes the current error. */
        PyCodec_StrictErrors(exc);
        Py_XDECREF(exc);
    }
    return NULL;
}

PyObject *
PyUnicode_EncodeFSDefault(PyObject *unicode)
{
#ifdef HAVE_MBCS
    return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
#elif defined(__APPLE__)
    return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
#else
    PyInterpreterState *interp = PyThreadState_GET()->interp;
    /* Bootstrap check: if the filesystem codec is implemented in Python, we
       cannot use it to encode and decode filenames before it is loaded. Load
       the Python codec requires to encode at least its own filename. Use the C
       version of the locale codec until the codec registry is initialized and
       the Python codec is loaded.

       Py_FileSystemDefaultEncoding is shared between all interpreters, we
       cannot only rely on it: check also interp->fscodec_initialized for
       subinterpreters.
*/ 3340 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) { 3341 return PyUnicode_AsEncodedString(unicode, 3342 Py_FileSystemDefaultEncoding, 3343 "surrogateescape"); 3344 } 3345 else { 3346 return PyUnicode_EncodeLocale(unicode, "surrogateescape"); 3347 } 3348#endif 3349} 3350 3351PyObject * 3352PyUnicode_AsEncodedString(PyObject *unicode, 3353 const char *encoding, 3354 const char *errors) 3355{ 3356 PyObject *v; 3357 char lower[11]; /* Enough for any encoding shortcut */ 3358 3359 if (!PyUnicode_Check(unicode)) { 3360 PyErr_BadArgument(); 3361 return NULL; 3362 } 3363 3364 /* Shortcuts for common default encodings */ 3365 if (_Py_normalize_encoding(encoding, lower, sizeof(lower))) { 3366 if ((strcmp(lower, "utf-8") == 0) || 3367 (strcmp(lower, "utf8") == 0)) 3368 { 3369 if (errors == NULL || strcmp(errors, "strict") == 0) 3370 return _PyUnicode_AsUTF8String(unicode, NULL); 3371 else 3372 return _PyUnicode_AsUTF8String(unicode, errors); 3373 } 3374 else if ((strcmp(lower, "latin-1") == 0) || 3375 (strcmp(lower, "latin1") == 0) || 3376 (strcmp(lower, "iso-8859-1") == 0) || 3377 (strcmp(lower, "iso8859-1") == 0)) 3378 return _PyUnicode_AsLatin1String(unicode, errors); 3379#ifdef HAVE_MBCS 3380 else if (strcmp(lower, "mbcs") == 0) 3381 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors); 3382#endif 3383 else if (strcmp(lower, "ascii") == 0) 3384 return _PyUnicode_AsASCIIString(unicode, errors); 3385 } 3386 3387 /* Encode via the codec registry */ 3388 v = _PyCodec_EncodeText(unicode, encoding, errors); 3389 if (v == NULL) 3390 return NULL; 3391 3392 /* The normal path */ 3393 if (PyBytes_Check(v)) 3394 return v; 3395 3396 /* If the codec returns a buffer, raise a warning and convert to bytes */ 3397 if (PyByteArray_Check(v)) { 3398 int error; 3399 PyObject *b; 3400 3401 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1, 3402 "encoder %s returned bytearray instead of bytes; " 3403 "use codecs.encode() to encode to arbitrary types", 3404 encoding); 
3405 if (error) { 3406 Py_DECREF(v); 3407 return NULL; 3408 } 3409 3410 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v)); 3411 Py_DECREF(v); 3412 return b; 3413 } 3414 3415 PyErr_Format(PyExc_TypeError, 3416 "'%.400s' encoder returned '%.400s' instead of 'bytes'; " 3417 "use codecs.encode() to encode to arbitrary types", 3418 encoding, 3419 Py_TYPE(v)->tp_name, Py_TYPE(v)->tp_name); 3420 Py_DECREF(v); 3421 return NULL; 3422} 3423 3424PyObject * 3425PyUnicode_AsEncodedUnicode(PyObject *unicode, 3426 const char *encoding, 3427 const char *errors) 3428{ 3429 PyObject *v; 3430 3431 if (!PyUnicode_Check(unicode)) { 3432 PyErr_BadArgument(); 3433 goto onError; 3434 } 3435 3436 if (encoding == NULL) 3437 encoding = PyUnicode_GetDefaultEncoding(); 3438 3439 /* Encode via the codec registry */ 3440 v = PyCodec_Encode(unicode, encoding, errors); 3441 if (v == NULL) 3442 goto onError; 3443 if (!PyUnicode_Check(v)) { 3444 PyErr_Format(PyExc_TypeError, 3445 "'%.400s' encoder returned '%.400s' instead of 'str'; " 3446 "use codecs.encode() to encode to arbitrary types", 3447 encoding, 3448 Py_TYPE(v)->tp_name, Py_TYPE(v)->tp_name); 3449 Py_DECREF(v); 3450 goto onError; 3451 } 3452 return v; 3453 3454 onError: 3455 return NULL; 3456} 3457 3458static size_t 3459mbstowcs_errorpos(const char *str, size_t len) 3460{ 3461#ifdef HAVE_MBRTOWC 3462 const char *start = str; 3463 mbstate_t mbs; 3464 size_t converted; 3465 wchar_t ch; 3466 3467 memset(&mbs, 0, sizeof mbs); 3468 while (len) 3469 { 3470 converted = mbrtowc(&ch, str, len, &mbs); 3471 if (converted == 0) 3472 /* Reached end of string */ 3473 break; 3474 if (converted == (size_t)-1 || converted == (size_t)-2) { 3475 /* Conversion error or incomplete character */ 3476 return str - start; 3477 } 3478 else { 3479 str += converted; 3480 len -= converted; 3481 } 3482 } 3483 /* failed to find the undecodable byte sequence */ 3484 return 0; 3485#endif 3486 return 0; 3487} 3488 3489PyObject* 
3490PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len, 3491 const char *errors) 3492{ 3493 wchar_t smallbuf[256]; 3494 size_t smallbuf_len = Py_ARRAY_LENGTH(smallbuf); 3495 wchar_t *wstr; 3496 size_t wlen, wlen2; 3497 PyObject *unicode; 3498 int surrogateescape; 3499 size_t error_pos; 3500 char *errmsg; 3501 PyObject *reason = NULL; /* initialize to prevent gcc warning */ 3502 PyObject *exc; 3503 3504 if (locale_error_handler(errors, &surrogateescape) < 0) 3505 return NULL; 3506 3507 if (str[len] != '\0' || (size_t)len != strlen(str)) { 3508 PyErr_SetString(PyExc_ValueError, "embedded null byte"); 3509 return NULL; 3510 } 3511 3512 if (surrogateescape) { 3513 /* "surrogateescape" error handler */ 3514 wstr = Py_DecodeLocale(str, &wlen); 3515 if (wstr == NULL) { 3516 if (wlen == (size_t)-1) 3517 PyErr_NoMemory(); 3518 else 3519 PyErr_SetFromErrno(PyExc_OSError); 3520 return NULL; 3521 } 3522 3523 unicode = PyUnicode_FromWideChar(wstr, wlen); 3524 PyMem_RawFree(wstr); 3525 } 3526 else { 3527 /* strict mode */ 3528#ifndef HAVE_BROKEN_MBSTOWCS 3529 wlen = mbstowcs(NULL, str, 0); 3530#else 3531 wlen = len; 3532#endif 3533 if (wlen == (size_t)-1) 3534 goto decode_error; 3535 if (wlen+1 <= smallbuf_len) { 3536 wstr = smallbuf; 3537 } 3538 else { 3539 wstr = PyMem_New(wchar_t, wlen+1); 3540 if (!wstr) 3541 return PyErr_NoMemory(); 3542 } 3543 3544 wlen2 = mbstowcs(wstr, str, wlen+1); 3545 if (wlen2 == (size_t)-1) { 3546 if (wstr != smallbuf) 3547 PyMem_Free(wstr); 3548 goto decode_error; 3549 } 3550#ifdef HAVE_BROKEN_MBSTOWCS 3551 assert(wlen2 == wlen); 3552#endif 3553 unicode = PyUnicode_FromWideChar(wstr, wlen2); 3554 if (wstr != smallbuf) 3555 PyMem_Free(wstr); 3556 } 3557 return unicode; 3558 3559decode_error: 3560 reason = NULL; 3561 errmsg = strerror(errno); 3562 assert(errmsg != NULL); 3563 3564 error_pos = mbstowcs_errorpos(str, len); 3565 if (errmsg != NULL) { 3566 size_t errlen; 3567 wstr = Py_DecodeLocale(errmsg, &errlen); 3568 if (wstr != NULL) { 
3569 reason = PyUnicode_FromWideChar(wstr, errlen); 3570 PyMem_RawFree(wstr); 3571 } 3572 } 3573 if (reason == NULL) 3574 reason = PyUnicode_FromString( 3575 "mbstowcs() encountered an invalid multibyte sequence"); 3576 if (reason == NULL) 3577 return NULL; 3578 3579 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nnO", 3580 "locale", str, len, 3581 (Py_ssize_t)error_pos, 3582 (Py_ssize_t)(error_pos+1), 3583 reason); 3584 Py_DECREF(reason); 3585 if (exc != NULL) { 3586 PyCodec_StrictErrors(exc); 3587 Py_XDECREF(exc); 3588 } 3589 return NULL; 3590} 3591 3592PyObject* 3593PyUnicode_DecodeLocale(const char *str, const char *errors) 3594{ 3595 Py_ssize_t size = (Py_ssize_t)strlen(str); 3596 return PyUnicode_DecodeLocaleAndSize(str, size, errors); 3597} 3598 3599 3600PyObject* 3601PyUnicode_DecodeFSDefault(const char *s) { 3602 Py_ssize_t size = (Py_ssize_t)strlen(s); 3603 return PyUnicode_DecodeFSDefaultAndSize(s, size); 3604} 3605 3606PyObject* 3607PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size) 3608{ 3609#ifdef HAVE_MBCS 3610 return PyUnicode_DecodeMBCS(s, size, NULL); 3611#elif defined(__APPLE__) 3612 return PyUnicode_DecodeUTF8Stateful(s, size, "surrogateescape", NULL); 3613#else 3614 PyInterpreterState *interp = PyThreadState_GET()->interp; 3615 /* Bootstrap check: if the filesystem codec is implemented in Python, we 3616 cannot use it to encode and decode filenames before it is loaded. Load 3617 the Python codec requires to encode at least its own filename. Use the C 3618 version of the locale codec until the codec registry is initialized and 3619 the Python codec is loaded. 3620 3621 Py_FileSystemDefaultEncoding is shared between all interpreters, we 3622 cannot only rely on it: check also interp->fscodec_initialized for 3623 subinterpreters. 
*/ 3624 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) { 3625 return PyUnicode_Decode(s, size, 3626 Py_FileSystemDefaultEncoding, 3627 "surrogateescape"); 3628 } 3629 else { 3630 return PyUnicode_DecodeLocaleAndSize(s, size, "surrogateescape"); 3631 } 3632#endif 3633} 3634 3635 3636int 3637PyUnicode_FSConverter(PyObject* arg, void* addr) 3638{ 3639 PyObject *output = NULL; 3640 Py_ssize_t size; 3641 void *data; 3642 if (arg == NULL) { 3643 Py_DECREF(*(PyObject**)addr); 3644 return 1; 3645 } 3646 if (PyBytes_Check(arg)) { 3647 output = arg; 3648 Py_INCREF(output); 3649 } 3650 else { 3651 arg = PyUnicode_FromObject(arg); 3652 if (!arg) 3653 return 0; 3654 output = PyUnicode_EncodeFSDefault(arg); 3655 Py_DECREF(arg); 3656 if (!output) 3657 return 0; 3658 if (!PyBytes_Check(output)) { 3659 Py_DECREF(output); 3660 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes"); 3661 return 0; 3662 } 3663 } 3664 size = PyBytes_GET_SIZE(output); 3665 data = PyBytes_AS_STRING(output); 3666 if ((size_t)size != strlen(data)) { 3667 PyErr_SetString(PyExc_ValueError, "embedded null byte"); 3668 Py_DECREF(output); 3669 return 0; 3670 } 3671 *(PyObject**)addr = output; 3672 return Py_CLEANUP_SUPPORTED; 3673} 3674 3675 3676int 3677PyUnicode_FSDecoder(PyObject* arg, void* addr) 3678{ 3679 PyObject *output = NULL; 3680 if (arg == NULL) { 3681 Py_DECREF(*(PyObject**)addr); 3682 return 1; 3683 } 3684 if (PyUnicode_Check(arg)) { 3685 if (PyUnicode_READY(arg) == -1) 3686 return 0; 3687 output = arg; 3688 Py_INCREF(output); 3689 } 3690 else { 3691 arg = PyBytes_FromObject(arg); 3692 if (!arg) 3693 return 0; 3694 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg), 3695 PyBytes_GET_SIZE(arg)); 3696 Py_DECREF(arg); 3697 if (!output) 3698 return 0; 3699 if (!PyUnicode_Check(output)) { 3700 Py_DECREF(output); 3701 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode"); 3702 return 0; 3703 } 3704 } 3705 if (PyUnicode_READY(output) == -1) { 3706 
Py_DECREF(output); 3707 return 0; 3708 } 3709 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output), 3710 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) { 3711 PyErr_SetString(PyExc_ValueError, "embedded null character"); 3712 Py_DECREF(output); 3713 return 0; 3714 } 3715 *(PyObject**)addr = output; 3716 return Py_CLEANUP_SUPPORTED; 3717} 3718 3719 3720char* 3721PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize) 3722{ 3723 PyObject *bytes; 3724 3725 if (!PyUnicode_Check(unicode)) { 3726 PyErr_BadArgument(); 3727 return NULL; 3728 } 3729 if (PyUnicode_READY(unicode) == -1) 3730 return NULL; 3731 3732 if (PyUnicode_UTF8(unicode) == NULL) { 3733 assert(!PyUnicode_IS_COMPACT_ASCII(unicode)); 3734 bytes = _PyUnicode_AsUTF8String(unicode, "strict"); 3735 if (bytes == NULL) 3736 return NULL; 3737 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1); 3738 if (_PyUnicode_UTF8(unicode) == NULL) { 3739 PyErr_NoMemory(); 3740 Py_DECREF(bytes); 3741 return NULL; 3742 } 3743 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes); 3744 Py_MEMCPY(_PyUnicode_UTF8(unicode), 3745 PyBytes_AS_STRING(bytes), 3746 _PyUnicode_UTF8_LENGTH(unicode) + 1); 3747 Py_DECREF(bytes); 3748 } 3749 3750 if (psize) 3751 *psize = PyUnicode_UTF8_LENGTH(unicode); 3752 return PyUnicode_UTF8(unicode); 3753} 3754 3755char* 3756PyUnicode_AsUTF8(PyObject *unicode) 3757{ 3758 return PyUnicode_AsUTF8AndSize(unicode, NULL); 3759} 3760 3761Py_UNICODE * 3762PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size) 3763{ 3764 const unsigned char *one_byte; 3765#if SIZEOF_WCHAR_T == 4 3766 const Py_UCS2 *two_bytes; 3767#else 3768 const Py_UCS4 *four_bytes; 3769 const Py_UCS4 *ucs4_end; 3770 Py_ssize_t num_surrogates; 3771#endif 3772 wchar_t *w; 3773 wchar_t *wchar_end; 3774 3775 if (!PyUnicode_Check(unicode)) { 3776 PyErr_BadArgument(); 3777 return NULL; 3778 } 3779 if (_PyUnicode_WSTR(unicode) == NULL) { 3780 /* Non-ASCII compact unicode object */ 3781 
assert(_PyUnicode_KIND(unicode) != 0); 3782 assert(PyUnicode_IS_READY(unicode)); 3783 3784 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) { 3785#if SIZEOF_WCHAR_T == 2 3786 four_bytes = PyUnicode_4BYTE_DATA(unicode); 3787 ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode); 3788 num_surrogates = 0; 3789 3790 for (; four_bytes < ucs4_end; ++four_bytes) { 3791 if (*four_bytes > 0xFFFF) 3792 ++num_surrogates; 3793 } 3794 3795 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC( 3796 sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates)); 3797 if (!_PyUnicode_WSTR(unicode)) { 3798 PyErr_NoMemory(); 3799 return NULL; 3800 } 3801 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates; 3802 3803 w = _PyUnicode_WSTR(unicode); 3804 wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode); 3805 four_bytes = PyUnicode_4BYTE_DATA(unicode); 3806 for (; four_bytes < ucs4_end; ++four_bytes, ++w) { 3807 if (*four_bytes > 0xFFFF) { 3808 assert(*four_bytes <= MAX_UNICODE); 3809 /* encode surrogate pair in this case */ 3810 *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes); 3811 *w = Py_UNICODE_LOW_SURROGATE(*four_bytes); 3812 } 3813 else 3814 *w = *four_bytes; 3815 3816 if (w > wchar_end) { 3817 assert(0 && "Miscalculated string end"); 3818 } 3819 } 3820 *w = 0; 3821#else 3822 /* sizeof(wchar_t) == 4 */ 3823 Py_FatalError("Impossible unicode object state, wstr and str " 3824 "should share memory already."); 3825 return NULL; 3826#endif 3827 } 3828 else { 3829 if ((size_t)_PyUnicode_LENGTH(unicode) > 3830 PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) { 3831 PyErr_NoMemory(); 3832 return NULL; 3833 } 3834 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) * 3835 (_PyUnicode_LENGTH(unicode) + 1)); 3836 if (!_PyUnicode_WSTR(unicode)) { 3837 PyErr_NoMemory(); 3838 return NULL; 3839 } 3840 if (!PyUnicode_IS_COMPACT_ASCII(unicode)) 3841 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode); 3842 w = _PyUnicode_WSTR(unicode); 3843 wchar_end 
= w + _PyUnicode_LENGTH(unicode); 3844 3845 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) { 3846 one_byte = PyUnicode_1BYTE_DATA(unicode); 3847 for (; w < wchar_end; ++one_byte, ++w) 3848 *w = *one_byte; 3849 /* null-terminate the wstr */ 3850 *w = 0; 3851 } 3852 else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) { 3853#if SIZEOF_WCHAR_T == 4 3854 two_bytes = PyUnicode_2BYTE_DATA(unicode); 3855 for (; w < wchar_end; ++two_bytes, ++w) 3856 *w = *two_bytes; 3857 /* null-terminate the wstr */ 3858 *w = 0; 3859#else 3860 /* sizeof(wchar_t) == 2 */ 3861 PyObject_FREE(_PyUnicode_WSTR(unicode)); 3862 _PyUnicode_WSTR(unicode) = NULL; 3863 Py_FatalError("Impossible unicode object state, wstr " 3864 "and str should share memory already."); 3865 return NULL; 3866#endif 3867 } 3868 else { 3869 assert(0 && "This should never happen."); 3870 } 3871 } 3872 } 3873 if (size != NULL) 3874 *size = PyUnicode_WSTR_LENGTH(unicode); 3875 return _PyUnicode_WSTR(unicode); 3876} 3877 3878Py_UNICODE * 3879PyUnicode_AsUnicode(PyObject *unicode) 3880{ 3881 return PyUnicode_AsUnicodeAndSize(unicode, NULL); 3882} 3883 3884 3885Py_ssize_t 3886PyUnicode_GetSize(PyObject *unicode) 3887{ 3888 if (!PyUnicode_Check(unicode)) { 3889 PyErr_BadArgument(); 3890 goto onError; 3891 } 3892 return PyUnicode_GET_SIZE(unicode); 3893 3894 onError: 3895 return -1; 3896} 3897 3898Py_ssize_t 3899PyUnicode_GetLength(PyObject *unicode) 3900{ 3901 if (!PyUnicode_Check(unicode)) { 3902 PyErr_BadArgument(); 3903 return -1; 3904 } 3905 if (PyUnicode_READY(unicode) == -1) 3906 return -1; 3907 return PyUnicode_GET_LENGTH(unicode); 3908} 3909 3910Py_UCS4 3911PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index) 3912{ 3913 void *data; 3914 int kind; 3915 3916 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) { 3917 PyErr_BadArgument(); 3918 return (Py_UCS4)-1; 3919 } 3920 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) { 3921 PyErr_SetString(PyExc_IndexError, "string index out of 
range"); 3922 return (Py_UCS4)-1; 3923 } 3924 data = PyUnicode_DATA(unicode); 3925 kind = PyUnicode_KIND(unicode); 3926 return PyUnicode_READ(kind, data, index); 3927} 3928 3929int 3930PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch) 3931{ 3932 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) { 3933 PyErr_BadArgument(); 3934 return -1; 3935 } 3936 assert(PyUnicode_IS_READY(unicode)); 3937 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) { 3938 PyErr_SetString(PyExc_IndexError, "string index out of range"); 3939 return -1; 3940 } 3941 if (unicode_check_modifiable(unicode)) 3942 return -1; 3943 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) { 3944 PyErr_SetString(PyExc_ValueError, "character out of range"); 3945 return -1; 3946 } 3947 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode), 3948 index, ch); 3949 return 0; 3950} 3951 3952const char * 3953PyUnicode_GetDefaultEncoding(void) 3954{ 3955 return "utf-8"; 3956} 3957 3958/* create or adjust a UnicodeDecodeError */ 3959static void 3960make_decode_exception(PyObject **exceptionObject, 3961 const char *encoding, 3962 const char *input, Py_ssize_t length, 3963 Py_ssize_t startpos, Py_ssize_t endpos, 3964 const char *reason) 3965{ 3966 if (*exceptionObject == NULL) { 3967 *exceptionObject = PyUnicodeDecodeError_Create( 3968 encoding, input, length, startpos, endpos, reason); 3969 } 3970 else { 3971 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos)) 3972 goto onError; 3973 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos)) 3974 goto onError; 3975 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason)) 3976 goto onError; 3977 } 3978 return; 3979 3980onError: 3981 Py_CLEAR(*exceptionObject); 3982} 3983 3984#ifdef HAVE_MBCS 3985/* error handling callback helper: 3986 build arguments, call the callback and check the arguments, 3987 if no exception occurred, copy the replacement to the output 3988 and adjust various state variables. 
3989 return 0 on success, -1 on error 3990*/ 3991 3992static int 3993unicode_decode_call_errorhandler_wchar( 3994 const char *errors, PyObject **errorHandler, 3995 const char *encoding, const char *reason, 3996 const char **input, const char **inend, Py_ssize_t *startinpos, 3997 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr, 3998 PyObject **output, Py_ssize_t *outpos) 3999{ 4000 static char *argparse = "O!n;decoding error handler must return (str, int) tuple"; 4001 4002 PyObject *restuple = NULL; 4003 PyObject *repunicode = NULL; 4004 Py_ssize_t outsize; 4005 Py_ssize_t insize; 4006 Py_ssize_t requiredsize; 4007 Py_ssize_t newpos; 4008 PyObject *inputobj = NULL; 4009 wchar_t *repwstr; 4010 Py_ssize_t repwlen; 4011 4012 assert (_PyUnicode_KIND(*output) == PyUnicode_WCHAR_KIND); 4013 outsize = _PyUnicode_WSTR_LENGTH(*output); 4014 4015 if (*errorHandler == NULL) { 4016 *errorHandler = PyCodec_LookupError(errors); 4017 if (*errorHandler == NULL) 4018 goto onError; 4019 } 4020 4021 make_decode_exception(exceptionObject, 4022 encoding, 4023 *input, *inend - *input, 4024 *startinpos, *endinpos, 4025 reason); 4026 if (*exceptionObject == NULL) 4027 goto onError; 4028 4029 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL); 4030 if (restuple == NULL) 4031 goto onError; 4032 if (!PyTuple_Check(restuple)) { 4033 PyErr_SetString(PyExc_TypeError, &argparse[4]); 4034 goto onError; 4035 } 4036 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos)) 4037 goto onError; 4038 4039 /* Copy back the bytes variables, which might have been modified by the 4040 callback */ 4041 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject); 4042 if (!inputobj) 4043 goto onError; 4044 if (!PyBytes_Check(inputobj)) { 4045 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes"); 4046 } 4047 *input = PyBytes_AS_STRING(inputobj); 4048 insize = PyBytes_GET_SIZE(inputobj); 4049 *inend = *input + insize; 
4050 /* we can DECREF safely, as the exception has another reference, 4051 so the object won't go away. */ 4052 Py_DECREF(inputobj); 4053 4054 if (newpos<0) 4055 newpos = insize+newpos; 4056 if (newpos<0 || newpos>insize) { 4057 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos); 4058 goto onError; 4059 } 4060 4061 repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen); 4062 if (repwstr == NULL) 4063 goto onError; 4064 /* need more space? (at least enough for what we 4065 have+the replacement+the rest of the string (starting 4066 at the new input position), so we won't have to check space 4067 when there are no errors in the rest of the string) */ 4068 requiredsize = *outpos; 4069 if (requiredsize > PY_SSIZE_T_MAX - repwlen) 4070 goto overflow; 4071 requiredsize += repwlen; 4072 if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos)) 4073 goto overflow; 4074 requiredsize += insize - newpos; 4075 if (requiredsize > outsize) { 4076 if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize) 4077 requiredsize = 2*outsize; 4078 if (unicode_resize(output, requiredsize) < 0) 4079 goto onError; 4080 } 4081 wcsncpy(_PyUnicode_WSTR(*output) + *outpos, repwstr, repwlen); 4082 *outpos += repwlen; 4083 *endinpos = newpos; 4084 *inptr = *input + newpos; 4085 4086 /* we made it! 
*/ 4087 Py_XDECREF(restuple); 4088 return 0; 4089 4090 overflow: 4091 PyErr_SetString(PyExc_OverflowError, 4092 "decoded result is too long for a Python string"); 4093 4094 onError: 4095 Py_XDECREF(restuple); 4096 return -1; 4097} 4098#endif /* HAVE_MBCS */ 4099 4100static int 4101unicode_decode_call_errorhandler_writer( 4102 const char *errors, PyObject **errorHandler, 4103 const char *encoding, const char *reason, 4104 const char **input, const char **inend, Py_ssize_t *startinpos, 4105 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr, 4106 _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */) 4107{ 4108 static char *argparse = "O!n;decoding error handler must return (str, int) tuple"; 4109 4110 PyObject *restuple = NULL; 4111 PyObject *repunicode = NULL; 4112 Py_ssize_t insize; 4113 Py_ssize_t newpos; 4114 Py_ssize_t replen; 4115 PyObject *inputobj = NULL; 4116 4117 if (*errorHandler == NULL) { 4118 *errorHandler = PyCodec_LookupError(errors); 4119 if (*errorHandler == NULL) 4120 goto onError; 4121 } 4122 4123 make_decode_exception(exceptionObject, 4124 encoding, 4125 *input, *inend - *input, 4126 *startinpos, *endinpos, 4127 reason); 4128 if (*exceptionObject == NULL) 4129 goto onError; 4130 4131 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL); 4132 if (restuple == NULL) 4133 goto onError; 4134 if (!PyTuple_Check(restuple)) { 4135 PyErr_SetString(PyExc_TypeError, &argparse[4]); 4136 goto onError; 4137 } 4138 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos)) 4139 goto onError; 4140 4141 /* Copy back the bytes variables, which might have been modified by the 4142 callback */ 4143 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject); 4144 if (!inputobj) 4145 goto onError; 4146 if (!PyBytes_Check(inputobj)) { 4147 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes"); 4148 } 4149 *input = PyBytes_AS_STRING(inputobj); 4150 insize = 
PyBytes_GET_SIZE(inputobj); 4151 *inend = *input + insize; 4152 /* we can DECREF safely, as the exception has another reference, 4153 so the object won't go away. */ 4154 Py_DECREF(inputobj); 4155 4156 if (newpos<0) 4157 newpos = insize+newpos; 4158 if (newpos<0 || newpos>insize) { 4159 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos); 4160 goto onError; 4161 } 4162 4163 if (PyUnicode_READY(repunicode) < 0) 4164 goto onError; 4165 replen = PyUnicode_GET_LENGTH(repunicode); 4166 if (replen > 1) { 4167 writer->min_length += replen - 1; 4168 writer->overallocate = 1; 4169 if (_PyUnicodeWriter_Prepare(writer, writer->min_length, 4170 PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1) 4171 goto onError; 4172 } 4173 if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1) 4174 goto onError; 4175 4176 *endinpos = newpos; 4177 *inptr = *input + newpos; 4178 4179 /* we made it! */ 4180 Py_XDECREF(restuple); 4181 return 0; 4182 4183 onError: 4184 Py_XDECREF(restuple); 4185 return -1; 4186} 4187 4188/* --- UTF-7 Codec -------------------------------------------------------- */ 4189 4190/* See RFC2152 for details. We encode conservatively and decode liberally. */ 4191 4192/* Three simple macros defining base-64. */ 4193 4194/* Is c a base-64 character? */ 4195 4196#define IS_BASE64(c) \ 4197 (((c) >= 'A' && (c) <= 'Z') || \ 4198 ((c) >= 'a' && (c) <= 'z') || \ 4199 ((c) >= '0' && (c) <= '9') || \ 4200 (c) == '+' || (c) == '/') 4201 4202/* given that c is a base-64 character, what is its base-64 value? */ 4203 4204#define FROM_BASE64(c) \ 4205 (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' : \ 4206 ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 : \ 4207 ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 : \ 4208 (c) == '+' ? 62 : 63) 4209 4210/* What is the base-64 character of the bottom 6 bits of n? 
*/ 4211 4212#define TO_BASE64(n) \ 4213 ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f]) 4214 4215/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be 4216 * decoded as itself. We are permissive on decoding; the only ASCII 4217 * byte not decoding to itself is the + which begins a base64 4218 * string. */ 4219 4220#define DECODE_DIRECT(c) \ 4221 ((c) <= 127 && (c) != '+') 4222 4223/* The UTF-7 encoder treats ASCII characters differently according to 4224 * whether they are Set D, Set O, Whitespace, or special (i.e. none of 4225 * the above). See RFC2152. This array identifies these different 4226 * sets: 4227 * 0 : "Set D" 4228 * alphanumeric and '(),-./:? 4229 * 1 : "Set O" 4230 * !"#$%&*;<=>@[]^_`{|} 4231 * 2 : "whitespace" 4232 * ht nl cr sp 4233 * 3 : special (must be base64 encoded) 4234 * everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127) 4235 */ 4236 4237static 4238char utf7_category[128] = { 4239/* nul soh stx etx eot enq ack bel bs ht nl vt np cr so si */ 4240 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3, 4241/* dle dc1 dc2 dc3 dc4 nak syn etb can em sub esc fs gs rs us */ 4242 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4243/* sp ! " # $ % & ' ( ) * + , - . / */ 4244 2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 0, 4245/* 0 1 2 3 4 5 6 7 8 9 : ; < = > ? */ 4246 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 4247/* @ A B C D E F G H I J K L M N O */ 4248 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4249/* P Q R S T U V W X Y Z [ \ ] ^ _ */ 4250 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1, 4251/* ` a b c d e f g h i j k l m n o */ 4252 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4253/* p q r s t u v w x y z { | } ~ del */ 4254 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3, 4255}; 4256 4257/* ENCODE_DIRECT: this character should be encoded as itself. 
The 4258 * answer depends on whether we are encoding set O as itself, and also 4259 * on whether we are encoding whitespace as itself. RFC2152 makes it 4260 * clear that the answers to these questions vary between 4261 * applications, so this code needs to be flexible. */ 4262 4263#define ENCODE_DIRECT(c, directO, directWS) \ 4264 ((c) < 128 && (c) > 0 && \ 4265 ((utf7_category[(c)] == 0) || \ 4266 (directWS && (utf7_category[(c)] == 2)) || \ 4267 (directO && (utf7_category[(c)] == 1)))) 4268 4269PyObject * 4270PyUnicode_DecodeUTF7(const char *s, 4271 Py_ssize_t size, 4272 const char *errors) 4273{ 4274 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL); 4275} 4276 4277/* The decoder. The only state we preserve is our read position, 4278 * i.e. how many characters we have consumed. So if we end in the 4279 * middle of a shift sequence we have to back off the read position 4280 * and the output to the beginning of the sequence, otherwise we lose 4281 * all the shift state (seen bits, number of bits seen, high 4282 * surrogate). */ 4283 4284PyObject * 4285PyUnicode_DecodeUTF7Stateful(const char *s, 4286 Py_ssize_t size, 4287 const char *errors, 4288 Py_ssize_t *consumed) 4289{ 4290 const char *starts = s; 4291 Py_ssize_t startinpos; 4292 Py_ssize_t endinpos; 4293 const char *e; 4294 _PyUnicodeWriter writer; 4295 const char *errmsg = ""; 4296 int inShift = 0; 4297 Py_ssize_t shiftOutStart; 4298 unsigned int base64bits = 0; 4299 unsigned long base64buffer = 0; 4300 Py_UCS4 surrogate = 0; 4301 PyObject *errorHandler = NULL; 4302 PyObject *exc = NULL; 4303 4304 if (size == 0) { 4305 if (consumed) 4306 *consumed = 0; 4307 _Py_RETURN_UNICODE_EMPTY(); 4308 } 4309 4310 /* Start off assuming it's all ASCII. Widen later as necessary. 
*/ 4311 _PyUnicodeWriter_Init(&writer); 4312 writer.min_length = size; 4313 4314 shiftOutStart = 0; 4315 e = s + size; 4316 4317 while (s < e) { 4318 Py_UCS4 ch; 4319 restart: 4320 ch = (unsigned char) *s; 4321 4322 if (inShift) { /* in a base-64 section */ 4323 if (IS_BASE64(ch)) { /* consume a base-64 character */ 4324 base64buffer = (base64buffer << 6) | FROM_BASE64(ch); 4325 base64bits += 6; 4326 s++; 4327 if (base64bits >= 16) { 4328 /* we have enough bits for a UTF-16 value */ 4329 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16)); 4330 base64bits -= 16; 4331 base64buffer &= (1 << base64bits) - 1; /* clear high bits */ 4332 assert(outCh <= 0xffff); 4333 if (surrogate) { 4334 /* expecting a second surrogate */ 4335 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) { 4336 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh); 4337 if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0) 4338 goto onError; 4339 surrogate = 0; 4340 continue; 4341 } 4342 else { 4343 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0) 4344 goto onError; 4345 surrogate = 0; 4346 } 4347 } 4348 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) { 4349 /* first surrogate */ 4350 surrogate = outCh; 4351 } 4352 else { 4353 if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0) 4354 goto onError; 4355 } 4356 } 4357 } 4358 else { /* now leaving a base-64 section */ 4359 inShift = 0; 4360 s++; 4361 if (surrogate) { 4362 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0) 4363 goto onError; 4364 surrogate = 0; 4365 } 4366 if (base64bits > 0) { /* left-over bits */ 4367 if (base64bits >= 6) { 4368 /* We've seen at least one base-64 character */ 4369 errmsg = "partial character in shift sequence"; 4370 goto utf7Error; 4371 } 4372 else { 4373 /* Some bits remain; they should be zero */ 4374 if (base64buffer != 0) { 4375 errmsg = "non-zero padding bits in shift sequence"; 4376 goto utf7Error; 4377 } 4378 } 4379 } 4380 if (ch != '-') { 4381 /* '-' is absorbed; other terminating 
4382 characters are preserved */ 4383 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) 4384 goto onError; 4385 } 4386 } 4387 } 4388 else if ( ch == '+' ) { 4389 startinpos = s-starts; 4390 s++; /* consume '+' */ 4391 if (s < e && *s == '-') { /* '+-' encodes '+' */ 4392 s++; 4393 if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0) 4394 goto onError; 4395 } 4396 else { /* begin base64-encoded section */ 4397 inShift = 1; 4398 shiftOutStart = writer.pos; 4399 base64bits = 0; 4400 base64buffer = 0; 4401 } 4402 } 4403 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */ 4404 s++; 4405 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) 4406 goto onError; 4407 } 4408 else { 4409 startinpos = s-starts; 4410 s++; 4411 errmsg = "unexpected special character"; 4412 goto utf7Error; 4413 } 4414 continue; 4415utf7Error: 4416 endinpos = s-starts; 4417 if (unicode_decode_call_errorhandler_writer( 4418 errors, &errorHandler, 4419 "utf7", errmsg, 4420 &starts, &e, &startinpos, &endinpos, &exc, &s, 4421 &writer)) 4422 goto onError; 4423 } 4424 4425 /* end of string */ 4426 4427 if (inShift && !consumed) { /* in shift sequence, no more to follow */ 4428 /* if we're in an inconsistent state, that's an error */ 4429 if (surrogate || 4430 (base64bits >= 6) || 4431 (base64bits > 0 && base64buffer != 0)) { 4432 endinpos = size; 4433 if (unicode_decode_call_errorhandler_writer( 4434 errors, &errorHandler, 4435 "utf7", "unterminated shift sequence", 4436 &starts, &e, &startinpos, &endinpos, &exc, &s, 4437 &writer)) 4438 goto onError; 4439 if (s < e) 4440 goto restart; 4441 } 4442 } 4443 4444 /* return state */ 4445 if (consumed) { 4446 if (inShift) { 4447 *consumed = startinpos; 4448 if (writer.pos != shiftOutStart && writer.maxchar > 127) { 4449 PyObject *result = PyUnicode_FromKindAndData( 4450 writer.kind, writer.data, shiftOutStart); 4451 Py_XDECREF(errorHandler); 4452 Py_XDECREF(exc); 4453 _PyUnicodeWriter_Dealloc(&writer); 4454 return result; 4455 } 4456 
writer.pos = shiftOutStart; /* back off output */ 4457 } 4458 else { 4459 *consumed = s-starts; 4460 } 4461 } 4462 4463 Py_XDECREF(errorHandler); 4464 Py_XDECREF(exc); 4465 return _PyUnicodeWriter_Finish(&writer); 4466 4467 onError: 4468 Py_XDECREF(errorHandler); 4469 Py_XDECREF(exc); 4470 _PyUnicodeWriter_Dealloc(&writer); 4471 return NULL; 4472} 4473 4474 4475PyObject * 4476_PyUnicode_EncodeUTF7(PyObject *str, 4477 int base64SetO, 4478 int base64WhiteSpace, 4479 const char *errors) 4480{ 4481 int kind; 4482 void *data; 4483 Py_ssize_t len; 4484 PyObject *v; 4485 int inShift = 0; 4486 Py_ssize_t i; 4487 unsigned int base64bits = 0; 4488 unsigned long base64buffer = 0; 4489 char * out; 4490 char * start; 4491 4492 if (PyUnicode_READY(str) == -1) 4493 return NULL; 4494 kind = PyUnicode_KIND(str); 4495 data = PyUnicode_DATA(str); 4496 len = PyUnicode_GET_LENGTH(str); 4497 4498 if (len == 0) 4499 return PyBytes_FromStringAndSize(NULL, 0); 4500 4501 /* It might be possible to tighten this worst case */ 4502 if (len > PY_SSIZE_T_MAX / 8) 4503 return PyErr_NoMemory(); 4504 v = PyBytes_FromStringAndSize(NULL, len * 8); 4505 if (v == NULL) 4506 return NULL; 4507 4508 start = out = PyBytes_AS_STRING(v); 4509 for (i = 0; i < len; ++i) { 4510 Py_UCS4 ch = PyUnicode_READ(kind, data, i); 4511 4512 if (inShift) { 4513 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) { 4514 /* shifting out */ 4515 if (base64bits) { /* output remaining bits */ 4516 *out++ = TO_BASE64(base64buffer << (6-base64bits)); 4517 base64buffer = 0; 4518 base64bits = 0; 4519 } 4520 inShift = 0; 4521 /* Characters not in the BASE64 set implicitly unshift the sequence 4522 so no '-' is required, except if the character is itself a '-' */ 4523 if (IS_BASE64(ch) || ch == '-') { 4524 *out++ = '-'; 4525 } 4526 *out++ = (char) ch; 4527 } 4528 else { 4529 goto encode_char; 4530 } 4531 } 4532 else { /* not in a shift sequence */ 4533 if (ch == '+') { 4534 *out++ = '+'; 4535 *out++ = '-'; 4536 } 4537 else if 
(ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) { 4538 *out++ = (char) ch; 4539 } 4540 else { 4541 *out++ = '+'; 4542 inShift = 1; 4543 goto encode_char; 4544 } 4545 } 4546 continue; 4547encode_char: 4548 if (ch >= 0x10000) { 4549 assert(ch <= MAX_UNICODE); 4550 4551 /* code first surrogate */ 4552 base64bits += 16; 4553 base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch); 4554 while (base64bits >= 6) { 4555 *out++ = TO_BASE64(base64buffer >> (base64bits-6)); 4556 base64bits -= 6; 4557 } 4558 /* prepare second surrogate */ 4559 ch = Py_UNICODE_LOW_SURROGATE(ch); 4560 } 4561 base64bits += 16; 4562 base64buffer = (base64buffer << 16) | ch; 4563 while (base64bits >= 6) { 4564 *out++ = TO_BASE64(base64buffer >> (base64bits-6)); 4565 base64bits -= 6; 4566 } 4567 } 4568 if (base64bits) 4569 *out++= TO_BASE64(base64buffer << (6-base64bits) ); 4570 if (inShift) 4571 *out++ = '-'; 4572 if (_PyBytes_Resize(&v, out - start) < 0) 4573 return NULL; 4574 return v; 4575} 4576PyObject * 4577PyUnicode_EncodeUTF7(const Py_UNICODE *s, 4578 Py_ssize_t size, 4579 int base64SetO, 4580 int base64WhiteSpace, 4581 const char *errors) 4582{ 4583 PyObject *result; 4584 PyObject *tmp = PyUnicode_FromUnicode(s, size); 4585 if (tmp == NULL) 4586 return NULL; 4587 result = _PyUnicode_EncodeUTF7(tmp, base64SetO, 4588 base64WhiteSpace, errors); 4589 Py_DECREF(tmp); 4590 return result; 4591} 4592 4593#undef IS_BASE64 4594#undef FROM_BASE64 4595#undef TO_BASE64 4596#undef DECODE_DIRECT 4597#undef ENCODE_DIRECT 4598 4599/* --- UTF-8 Codec -------------------------------------------------------- */ 4600 4601PyObject * 4602PyUnicode_DecodeUTF8(const char *s, 4603 Py_ssize_t size, 4604 const char *errors) 4605{ 4606 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL); 4607} 4608 4609#include "stringlib/asciilib.h" 4610#include "stringlib/codecs.h" 4611#include "stringlib/undef.h" 4612 4613#include "stringlib/ucs1lib.h" 4614#include "stringlib/codecs.h" 4615#include 
"stringlib/undef.h" 4616 4617#include "stringlib/ucs2lib.h" 4618#include "stringlib/codecs.h" 4619#include "stringlib/undef.h" 4620 4621#include "stringlib/ucs4lib.h" 4622#include "stringlib/codecs.h" 4623#include "stringlib/undef.h" 4624 4625/* Mask to quickly check whether a C 'long' contains a 4626 non-ASCII, UTF8-encoded char. */ 4627#if (SIZEOF_LONG == 8) 4628# define ASCII_CHAR_MASK 0x8080808080808080UL 4629#elif (SIZEOF_LONG == 4) 4630# define ASCII_CHAR_MASK 0x80808080UL 4631#else 4632# error C 'long' size should be either 4 or 8! 4633#endif 4634 4635static Py_ssize_t 4636ascii_decode(const char *start, const char *end, Py_UCS1 *dest) 4637{ 4638 const char *p = start; 4639 const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG); 4640 4641 /* 4642 * Issue #17237: m68k is a bit different from most architectures in 4643 * that objects do not use "natural alignment" - for example, int and 4644 * long are only aligned at 2-byte boundaries. Therefore the assert() 4645 * won't work; also, tests have shown that skipping the "optimised 4646 * version" will even speed up m68k. 4647 */ 4648#if !defined(__m68k__) 4649#if SIZEOF_LONG <= SIZEOF_VOID_P 4650 assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG)); 4651 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) { 4652 /* Fast path, see in STRINGLIB(utf8_decode) for 4653 an explanation. */ 4654 /* Help allocation */ 4655 const char *_p = p; 4656 Py_UCS1 * q = dest; 4657 while (_p < aligned_end) { 4658 unsigned long value = *(const unsigned long *) _p; 4659 if (value & ASCII_CHAR_MASK) 4660 break; 4661 *((unsigned long *)q) = value; 4662 _p += SIZEOF_LONG; 4663 q += SIZEOF_LONG; 4664 } 4665 p = _p; 4666 while (p < end) { 4667 if ((unsigned char)*p & 0x80) 4668 break; 4669 *q++ = *p++; 4670 } 4671 return p - start; 4672 } 4673#endif 4674#endif 4675 while (p < end) { 4676 /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h 4677 for an explanation. 
*/ 4678 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) { 4679 /* Help allocation */ 4680 const char *_p = p; 4681 while (_p < aligned_end) { 4682 unsigned long value = *(unsigned long *) _p; 4683 if (value & ASCII_CHAR_MASK) 4684 break; 4685 _p += SIZEOF_LONG; 4686 } 4687 p = _p; 4688 if (_p == end) 4689 break; 4690 } 4691 if ((unsigned char)*p & 0x80) 4692 break; 4693 ++p; 4694 } 4695 memcpy(dest, start, p - start); 4696 return p - start; 4697} 4698 4699PyObject * 4700PyUnicode_DecodeUTF8Stateful(const char *s, 4701 Py_ssize_t size, 4702 const char *errors, 4703 Py_ssize_t *consumed) 4704{ 4705 _PyUnicodeWriter writer; 4706 const char *starts = s; 4707 const char *end = s + size; 4708 4709 Py_ssize_t startinpos; 4710 Py_ssize_t endinpos; 4711 const char *errmsg = ""; 4712 PyObject *errorHandler = NULL; 4713 PyObject *exc = NULL; 4714 4715 if (size == 0) { 4716 if (consumed) 4717 *consumed = 0; 4718 _Py_RETURN_UNICODE_EMPTY(); 4719 } 4720 4721 /* ASCII is equivalent to the first 128 ordinals in Unicode. 
*/ 4722 if (size == 1 && (unsigned char)s[0] < 128) { 4723 if (consumed) 4724 *consumed = 1; 4725 return get_latin1_char((unsigned char)s[0]); 4726 } 4727 4728 _PyUnicodeWriter_Init(&writer); 4729 writer.min_length = size; 4730 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1) 4731 goto onError; 4732 4733 writer.pos = ascii_decode(s, end, writer.data); 4734 s += writer.pos; 4735 while (s < end) { 4736 Py_UCS4 ch; 4737 int kind = writer.kind; 4738 if (kind == PyUnicode_1BYTE_KIND) { 4739 if (PyUnicode_IS_ASCII(writer.buffer)) 4740 ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos); 4741 else 4742 ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos); 4743 } else if (kind == PyUnicode_2BYTE_KIND) { 4744 ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos); 4745 } else { 4746 assert(kind == PyUnicode_4BYTE_KIND); 4747 ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos); 4748 } 4749 4750 switch (ch) { 4751 case 0: 4752 if (s == end || consumed) 4753 goto End; 4754 errmsg = "unexpected end of data"; 4755 startinpos = s - starts; 4756 endinpos = end - starts; 4757 break; 4758 case 1: 4759 errmsg = "invalid start byte"; 4760 startinpos = s - starts; 4761 endinpos = startinpos + 1; 4762 break; 4763 case 2: 4764 case 3: 4765 case 4: 4766 errmsg = "invalid continuation byte"; 4767 startinpos = s - starts; 4768 endinpos = startinpos + ch - 1; 4769 break; 4770 default: 4771 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) 4772 goto onError; 4773 continue; 4774 } 4775 4776 if (unicode_decode_call_errorhandler_writer( 4777 errors, &errorHandler, 4778 "utf-8", errmsg, 4779 &starts, &end, &startinpos, &endinpos, &exc, &s, 4780 &writer)) 4781 goto onError; 4782 } 4783 4784End: 4785 if (consumed) 4786 *consumed = s - starts; 4787 4788 Py_XDECREF(errorHandler); 4789 Py_XDECREF(exc); 4790 return _PyUnicodeWriter_Finish(&writer); 4791 4792onError: 4793 Py_XDECREF(errorHandler); 4794 Py_XDECREF(exc); 4795 
_PyUnicodeWriter_Dealloc(&writer); 4796 return NULL; 4797} 4798 4799#ifdef __APPLE__ 4800 4801/* Simplified UTF-8 decoder using surrogateescape error handler, 4802 used to decode the command line arguments on Mac OS X. 4803 4804 Return a pointer to a newly allocated wide character string (use 4805 PyMem_RawFree() to free the memory), or NULL on memory allocation error. */ 4806 4807wchar_t* 4808_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size) 4809{ 4810 const char *e; 4811 wchar_t *unicode; 4812 Py_ssize_t outpos; 4813 4814 /* Note: size will always be longer than the resulting Unicode 4815 character count */ 4816 if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) < (size + 1)) 4817 return NULL; 4818 unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t)); 4819 if (!unicode) 4820 return NULL; 4821 4822 /* Unpack UTF-8 encoded data */ 4823 e = s + size; 4824 outpos = 0; 4825 while (s < e) { 4826 Py_UCS4 ch; 4827#if SIZEOF_WCHAR_T == 4 4828 ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos); 4829#else 4830 ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos); 4831#endif 4832 if (ch > 0xFF) { 4833#if SIZEOF_WCHAR_T == 4 4834 assert(0); 4835#else 4836 assert(Py_UNICODE_IS_SURROGATE(ch)); 4837 /* compute and append the two surrogates: */ 4838 unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch); 4839 unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch); 4840#endif 4841 } 4842 else { 4843 if (!ch && s == e) 4844 break; 4845 /* surrogateescape */ 4846 unicode[outpos++] = 0xDC00 + (unsigned char)*s++; 4847 } 4848 } 4849 unicode[outpos] = L'\0'; 4850 return unicode; 4851} 4852 4853#endif /* __APPLE__ */ 4854 4855/* Primary internal function which creates utf8 encoded bytes objects. 4856 4857 Allocation strategy: if the string is short, convert into a stack buffer 4858 and allocate exactly as much space needed at the end. 
Else allocate the 4859 maximum possible needed (4 result bytes per Unicode character), and return 4860 the excess memory at the end. 4861*/ 4862PyObject * 4863_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors) 4864{ 4865 enum PyUnicode_Kind kind; 4866 void *data; 4867 Py_ssize_t size; 4868 4869 if (!PyUnicode_Check(unicode)) { 4870 PyErr_BadArgument(); 4871 return NULL; 4872 } 4873 4874 if (PyUnicode_READY(unicode) == -1) 4875 return NULL; 4876 4877 if (PyUnicode_UTF8(unicode)) 4878 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode), 4879 PyUnicode_UTF8_LENGTH(unicode)); 4880 4881 kind = PyUnicode_KIND(unicode); 4882 data = PyUnicode_DATA(unicode); 4883 size = PyUnicode_GET_LENGTH(unicode); 4884 4885 switch (kind) { 4886 default: 4887 assert(0); 4888 case PyUnicode_1BYTE_KIND: 4889 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */ 4890 assert(!PyUnicode_IS_ASCII(unicode)); 4891 return ucs1lib_utf8_encoder(unicode, data, size, errors); 4892 case PyUnicode_2BYTE_KIND: 4893 return ucs2lib_utf8_encoder(unicode, data, size, errors); 4894 case PyUnicode_4BYTE_KIND: 4895 return ucs4lib_utf8_encoder(unicode, data, size, errors); 4896 } 4897} 4898 4899PyObject * 4900PyUnicode_EncodeUTF8(const Py_UNICODE *s, 4901 Py_ssize_t size, 4902 const char *errors) 4903{ 4904 PyObject *v, *unicode; 4905 4906 unicode = PyUnicode_FromUnicode(s, size); 4907 if (unicode == NULL) 4908 return NULL; 4909 v = _PyUnicode_AsUTF8String(unicode, errors); 4910 Py_DECREF(unicode); 4911 return v; 4912} 4913 4914PyObject * 4915PyUnicode_AsUTF8String(PyObject *unicode) 4916{ 4917 return _PyUnicode_AsUTF8String(unicode, NULL); 4918} 4919 4920/* --- UTF-32 Codec ------------------------------------------------------- */ 4921 4922PyObject * 4923PyUnicode_DecodeUTF32(const char *s, 4924 Py_ssize_t size, 4925 const char *errors, 4926 int *byteorder) 4927{ 4928 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL); 4929} 4930 4931PyObject * 
4932PyUnicode_DecodeUTF32Stateful(const char *s, 4933 Py_ssize_t size, 4934 const char *errors, 4935 int *byteorder, 4936 Py_ssize_t *consumed) 4937{ 4938 const char *starts = s; 4939 Py_ssize_t startinpos; 4940 Py_ssize_t endinpos; 4941 _PyUnicodeWriter writer; 4942 const unsigned char *q, *e; 4943 int le, bo = 0; /* assume native ordering by default */ 4944 const char *encoding; 4945 const char *errmsg = ""; 4946 PyObject *errorHandler = NULL; 4947 PyObject *exc = NULL; 4948 4949 q = (unsigned char *)s; 4950 e = q + size; 4951 4952 if (byteorder) 4953 bo = *byteorder; 4954 4955 /* Check for BOM marks (U+FEFF) in the input and adjust current 4956 byte order setting accordingly. In native mode, the leading BOM 4957 mark is skipped, in all other modes, it is copied to the output 4958 stream as-is (giving a ZWNBSP character). */ 4959 if (bo == 0 && size >= 4) { 4960 Py_UCS4 bom = (q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0]; 4961 if (bom == 0x0000FEFF) { 4962 bo = -1; 4963 q += 4; 4964 } 4965 else if (bom == 0xFFFE0000) { 4966 bo = 1; 4967 q += 4; 4968 } 4969 if (byteorder) 4970 *byteorder = bo; 4971 } 4972 4973 if (q == e) { 4974 if (consumed) 4975 *consumed = size; 4976 _Py_RETURN_UNICODE_EMPTY(); 4977 } 4978 4979#ifdef WORDS_BIGENDIAN 4980 le = bo < 0; 4981#else 4982 le = bo <= 0; 4983#endif 4984 encoding = le ? 
"utf-32-le" : "utf-32-be"; 4985 4986 _PyUnicodeWriter_Init(&writer); 4987 writer.min_length = (e - q + 3) / 4; 4988 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1) 4989 goto onError; 4990 4991 while (1) { 4992 Py_UCS4 ch = 0; 4993 Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer); 4994 4995 if (e - q >= 4) { 4996 enum PyUnicode_Kind kind = writer.kind; 4997 void *data = writer.data; 4998 const unsigned char *last = e - 4; 4999 Py_ssize_t pos = writer.pos; 5000 if (le) { 5001 do { 5002 ch = (q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0]; 5003 if (ch > maxch) 5004 break; 5005 if (kind != PyUnicode_1BYTE_KIND && 5006 Py_UNICODE_IS_SURROGATE(ch)) 5007 break; 5008 PyUnicode_WRITE(kind, data, pos++, ch); 5009 q += 4; 5010 } while (q <= last); 5011 } 5012 else { 5013 do { 5014 ch = (q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3]; 5015 if (ch > maxch) 5016 break; 5017 if (kind != PyUnicode_1BYTE_KIND && 5018 Py_UNICODE_IS_SURROGATE(ch)) 5019 break; 5020 PyUnicode_WRITE(kind, data, pos++, ch); 5021 q += 4; 5022 } while (q <= last); 5023 } 5024 writer.pos = pos; 5025 } 5026 5027 if (Py_UNICODE_IS_SURROGATE(ch)) { 5028 errmsg = "code point in surrogate code point range(0xd800, 0xe000)"; 5029 startinpos = ((const char *)q) - starts; 5030 endinpos = startinpos + 4; 5031 } 5032 else if (ch <= maxch) { 5033 if (q == e || consumed) 5034 break; 5035 /* remaining bytes at the end? 
(size should be divisible by 4) */ 5036 errmsg = "truncated data"; 5037 startinpos = ((const char *)q) - starts; 5038 endinpos = ((const char *)e) - starts; 5039 } 5040 else { 5041 if (ch < 0x110000) { 5042 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) 5043 goto onError; 5044 q += 4; 5045 continue; 5046 } 5047 errmsg = "code point not in range(0x110000)"; 5048 startinpos = ((const char *)q) - starts; 5049 endinpos = startinpos + 4; 5050 } 5051 5052 /* The remaining input chars are ignored if the callback 5053 chooses to skip the input */ 5054 if (unicode_decode_call_errorhandler_writer( 5055 errors, &errorHandler, 5056 encoding, errmsg, 5057 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q, 5058 &writer)) 5059 goto onError; 5060 } 5061 5062 if (consumed) 5063 *consumed = (const char *)q-starts; 5064 5065 Py_XDECREF(errorHandler); 5066 Py_XDECREF(exc); 5067 return _PyUnicodeWriter_Finish(&writer); 5068 5069 onError: 5070 _PyUnicodeWriter_Dealloc(&writer); 5071 Py_XDECREF(errorHandler); 5072 Py_XDECREF(exc); 5073 return NULL; 5074} 5075 5076PyObject * 5077_PyUnicode_EncodeUTF32(PyObject *str, 5078 const char *errors, 5079 int byteorder) 5080{ 5081 enum PyUnicode_Kind kind; 5082 const void *data; 5083 Py_ssize_t len; 5084 PyObject *v; 5085 PY_UINT32_T *out; 5086#if PY_LITTLE_ENDIAN 5087 int native_ordering = byteorder <= 0; 5088#else 5089 int native_ordering = byteorder >= 0; 5090#endif 5091 const char *encoding; 5092 Py_ssize_t nsize, pos; 5093 PyObject *errorHandler = NULL; 5094 PyObject *exc = NULL; 5095 PyObject *rep = NULL; 5096 5097 if (!PyUnicode_Check(str)) { 5098 PyErr_BadArgument(); 5099 return NULL; 5100 } 5101 if (PyUnicode_READY(str) == -1) 5102 return NULL; 5103 kind = PyUnicode_KIND(str); 5104 data = PyUnicode_DATA(str); 5105 len = PyUnicode_GET_LENGTH(str); 5106 5107 if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0)) 5108 return PyErr_NoMemory(); 5109 nsize = len + (byteorder == 0); 5110 v = 
PyBytes_FromStringAndSize(NULL, nsize * 4); 5111 if (v == NULL) 5112 return NULL; 5113 5114 /* output buffer is 4-bytes aligned */ 5115 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4)); 5116 out = (PY_UINT32_T *)PyBytes_AS_STRING(v); 5117 if (byteorder == 0) 5118 *out++ = 0xFEFF; 5119 if (len == 0) 5120 goto done; 5121 5122 if (byteorder == -1) 5123 encoding = "utf-32-le"; 5124 else if (byteorder == 1) 5125 encoding = "utf-32-be"; 5126 else 5127 encoding = "utf-32"; 5128 5129 if (kind == PyUnicode_1BYTE_KIND) { 5130 ucs1lib_utf32_encode((const Py_UCS1 *)data, len, &out, native_ordering); 5131 goto done; 5132 } 5133 5134 pos = 0; 5135 while (pos < len) { 5136 Py_ssize_t repsize, moreunits; 5137 5138 if (kind == PyUnicode_2BYTE_KIND) { 5139 pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos, 5140 &out, native_ordering); 5141 } 5142 else { 5143 assert(kind == PyUnicode_4BYTE_KIND); 5144 pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos, 5145 &out, native_ordering); 5146 } 5147 if (pos == len) 5148 break; 5149 5150 rep = unicode_encode_call_errorhandler( 5151 errors, &errorHandler, 5152 encoding, "surrogates not allowed", 5153 str, &exc, pos, pos + 1, &pos); 5154 if (!rep) 5155 goto error; 5156 5157 if (PyBytes_Check(rep)) { 5158 repsize = PyBytes_GET_SIZE(rep); 5159 if (repsize & 3) { 5160 raise_encode_exception(&exc, encoding, 5161 str, pos - 1, pos, 5162 "surrogates not allowed"); 5163 goto error; 5164 } 5165 moreunits = repsize / 4; 5166 } 5167 else { 5168 assert(PyUnicode_Check(rep)); 5169 if (PyUnicode_READY(rep) < 0) 5170 goto error; 5171 moreunits = repsize = PyUnicode_GET_LENGTH(rep); 5172 if (!PyUnicode_IS_ASCII(rep)) { 5173 raise_encode_exception(&exc, encoding, 5174 str, pos - 1, pos, 5175 "surrogates not allowed"); 5176 goto error; 5177 } 5178 } 5179 5180 /* four bytes are reserved for each surrogate */ 5181 if (moreunits > 1) { 5182 Py_ssize_t outpos = out - (PY_UINT32_T*) PyBytes_AS_STRING(v); 5183 Py_ssize_t morebytes = 4 
* (moreunits - 1); 5184 if (PyBytes_GET_SIZE(v) > PY_SSIZE_T_MAX - morebytes) { 5185 /* integer overflow */ 5186 PyErr_NoMemory(); 5187 goto error; 5188 } 5189 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + morebytes) < 0) 5190 goto error; 5191 out = (PY_UINT32_T*) PyBytes_AS_STRING(v) + outpos; 5192 } 5193 5194 if (PyBytes_Check(rep)) { 5195 Py_MEMCPY(out, PyBytes_AS_STRING(rep), repsize); 5196 out += moreunits; 5197 } else /* rep is unicode */ { 5198 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND); 5199 ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize, 5200 &out, native_ordering); 5201 } 5202 5203 Py_CLEAR(rep); 5204 } 5205 5206 /* Cut back to size actually needed. This is necessary for, for example, 5207 encoding of a string containing isolated surrogates and the 'ignore' 5208 handler is used. */ 5209 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v); 5210 if (nsize != PyBytes_GET_SIZE(v)) 5211 _PyBytes_Resize(&v, nsize); 5212 Py_XDECREF(errorHandler); 5213 Py_XDECREF(exc); 5214 done: 5215 return v; 5216 error: 5217 Py_XDECREF(rep); 5218 Py_XDECREF(errorHandler); 5219 Py_XDECREF(exc); 5220 Py_XDECREF(v); 5221 return NULL; 5222} 5223 5224PyObject * 5225PyUnicode_EncodeUTF32(const Py_UNICODE *s, 5226 Py_ssize_t size, 5227 const char *errors, 5228 int byteorder) 5229{ 5230 PyObject *result; 5231 PyObject *tmp = PyUnicode_FromUnicode(s, size); 5232 if (tmp == NULL) 5233 return NULL; 5234 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder); 5235 Py_DECREF(tmp); 5236 return result; 5237} 5238 5239PyObject * 5240PyUnicode_AsUTF32String(PyObject *unicode) 5241{ 5242 return _PyUnicode_EncodeUTF32(unicode, NULL, 0); 5243} 5244 5245/* --- UTF-16 Codec ------------------------------------------------------- */ 5246 5247PyObject * 5248PyUnicode_DecodeUTF16(const char *s, 5249 Py_ssize_t size, 5250 const char *errors, 5251 int *byteorder) 5252{ 5253 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL); 5254} 5255 5256PyObject 
* 5257PyUnicode_DecodeUTF16Stateful(const char *s, 5258 Py_ssize_t size, 5259 const char *errors, 5260 int *byteorder, 5261 Py_ssize_t *consumed) 5262{ 5263 const char *starts = s; 5264 Py_ssize_t startinpos; 5265 Py_ssize_t endinpos; 5266 _PyUnicodeWriter writer; 5267 const unsigned char *q, *e; 5268 int bo = 0; /* assume native ordering by default */ 5269 int native_ordering; 5270 const char *errmsg = ""; 5271 PyObject *errorHandler = NULL; 5272 PyObject *exc = NULL; 5273 const char *encoding; 5274 5275 q = (unsigned char *)s; 5276 e = q + size; 5277 5278 if (byteorder) 5279 bo = *byteorder; 5280 5281 /* Check for BOM marks (U+FEFF) in the input and adjust current 5282 byte order setting accordingly. In native mode, the leading BOM 5283 mark is skipped, in all other modes, it is copied to the output 5284 stream as-is (giving a ZWNBSP character). */ 5285 if (bo == 0 && size >= 2) { 5286 const Py_UCS4 bom = (q[1] << 8) | q[0]; 5287 if (bom == 0xFEFF) { 5288 q += 2; 5289 bo = -1; 5290 } 5291 else if (bom == 0xFFFE) { 5292 q += 2; 5293 bo = 1; 5294 } 5295 if (byteorder) 5296 *byteorder = bo; 5297 } 5298 5299 if (q == e) { 5300 if (consumed) 5301 *consumed = size; 5302 _Py_RETURN_UNICODE_EMPTY(); 5303 } 5304 5305#if PY_LITTLE_ENDIAN 5306 native_ordering = bo <= 0; 5307 encoding = bo <= 0 ? "utf-16-le" : "utf-16-be"; 5308#else 5309 native_ordering = bo >= 0; 5310 encoding = bo >= 0 ? 
"utf-16-be" : "utf-16-le"; 5311#endif 5312 5313 /* Note: size will always be longer than the resulting Unicode 5314 character count */ 5315 _PyUnicodeWriter_Init(&writer); 5316 writer.min_length = (e - q + 1) / 2; 5317 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1) 5318 goto onError; 5319 5320 while (1) { 5321 Py_UCS4 ch = 0; 5322 if (e - q >= 2) { 5323 int kind = writer.kind; 5324 if (kind == PyUnicode_1BYTE_KIND) { 5325 if (PyUnicode_IS_ASCII(writer.buffer)) 5326 ch = asciilib_utf16_decode(&q, e, 5327 (Py_UCS1*)writer.data, &writer.pos, 5328 native_ordering); 5329 else 5330 ch = ucs1lib_utf16_decode(&q, e, 5331 (Py_UCS1*)writer.data, &writer.pos, 5332 native_ordering); 5333 } else if (kind == PyUnicode_2BYTE_KIND) { 5334 ch = ucs2lib_utf16_decode(&q, e, 5335 (Py_UCS2*)writer.data, &writer.pos, 5336 native_ordering); 5337 } else { 5338 assert(kind == PyUnicode_4BYTE_KIND); 5339 ch = ucs4lib_utf16_decode(&q, e, 5340 (Py_UCS4*)writer.data, &writer.pos, 5341 native_ordering); 5342 } 5343 } 5344 5345 switch (ch) 5346 { 5347 case 0: 5348 /* remaining byte at the end? 
(size should be even) */ 5349 if (q == e || consumed) 5350 goto End; 5351 errmsg = "truncated data"; 5352 startinpos = ((const char *)q) - starts; 5353 endinpos = ((const char *)e) - starts; 5354 break; 5355 /* The remaining input chars are ignored if the callback 5356 chooses to skip the input */ 5357 case 1: 5358 q -= 2; 5359 if (consumed) 5360 goto End; 5361 errmsg = "unexpected end of data"; 5362 startinpos = ((const char *)q) - starts; 5363 endinpos = ((const char *)e) - starts; 5364 break; 5365 case 2: 5366 errmsg = "illegal encoding"; 5367 startinpos = ((const char *)q) - 2 - starts; 5368 endinpos = startinpos + 2; 5369 break; 5370 case 3: 5371 errmsg = "illegal UTF-16 surrogate"; 5372 startinpos = ((const char *)q) - 4 - starts; 5373 endinpos = startinpos + 2; 5374 break; 5375 default: 5376 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) 5377 goto onError; 5378 continue; 5379 } 5380 5381 if (unicode_decode_call_errorhandler_writer( 5382 errors, 5383 &errorHandler, 5384 encoding, errmsg, 5385 &starts, 5386 (const char **)&e, 5387 &startinpos, 5388 &endinpos, 5389 &exc, 5390 (const char **)&q, 5391 &writer)) 5392 goto onError; 5393 } 5394 5395End: 5396 if (consumed) 5397 *consumed = (const char *)q-starts; 5398 5399 Py_XDECREF(errorHandler); 5400 Py_XDECREF(exc); 5401 return _PyUnicodeWriter_Finish(&writer); 5402 5403 onError: 5404 _PyUnicodeWriter_Dealloc(&writer); 5405 Py_XDECREF(errorHandler); 5406 Py_XDECREF(exc); 5407 return NULL; 5408} 5409 5410PyObject * 5411_PyUnicode_EncodeUTF16(PyObject *str, 5412 const char *errors, 5413 int byteorder) 5414{ 5415 enum PyUnicode_Kind kind; 5416 const void *data; 5417 Py_ssize_t len; 5418 PyObject *v; 5419 unsigned short *out; 5420 Py_ssize_t pairs; 5421#if PY_BIG_ENDIAN 5422 int native_ordering = byteorder >= 0; 5423#else 5424 int native_ordering = byteorder <= 0; 5425#endif 5426 const char *encoding; 5427 Py_ssize_t nsize, pos; 5428 PyObject *errorHandler = NULL; 5429 PyObject *exc = NULL; 5430 PyObject *rep 
= NULL; 5431 5432 if (!PyUnicode_Check(str)) { 5433 PyErr_BadArgument(); 5434 return NULL; 5435 } 5436 if (PyUnicode_READY(str) == -1) 5437 return NULL; 5438 kind = PyUnicode_KIND(str); 5439 data = PyUnicode_DATA(str); 5440 len = PyUnicode_GET_LENGTH(str); 5441 5442 pairs = 0; 5443 if (kind == PyUnicode_4BYTE_KIND) { 5444 const Py_UCS4 *in = (const Py_UCS4 *)data; 5445 const Py_UCS4 *end = in + len; 5446 while (in < end) 5447 if (*in++ >= 0x10000) 5448 pairs++; 5449 } 5450 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0)) 5451 return PyErr_NoMemory(); 5452 nsize = len + pairs + (byteorder == 0); 5453 v = PyBytes_FromStringAndSize(NULL, nsize * 2); 5454 if (v == NULL) 5455 return NULL; 5456 5457 /* output buffer is 2-bytes aligned */ 5458 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2)); 5459 out = (unsigned short *)PyBytes_AS_STRING(v); 5460 if (byteorder == 0) 5461 *out++ = 0xFEFF; 5462 if (len == 0) 5463 goto done; 5464 5465 if (kind == PyUnicode_1BYTE_KIND) { 5466 ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering); 5467 goto done; 5468 } 5469 5470 if (byteorder < 0) 5471 encoding = "utf-16-le"; 5472 else if (byteorder > 0) 5473 encoding = "utf-16-be"; 5474 else 5475 encoding = "utf-16"; 5476 5477 pos = 0; 5478 while (pos < len) { 5479 Py_ssize_t repsize, moreunits; 5480 5481 if (kind == PyUnicode_2BYTE_KIND) { 5482 pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos, 5483 &out, native_ordering); 5484 } 5485 else { 5486 assert(kind == PyUnicode_4BYTE_KIND); 5487 pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos, 5488 &out, native_ordering); 5489 } 5490 if (pos == len) 5491 break; 5492 5493 rep = unicode_encode_call_errorhandler( 5494 errors, &errorHandler, 5495 encoding, "surrogates not allowed", 5496 str, &exc, pos, pos + 1, &pos); 5497 if (!rep) 5498 goto error; 5499 5500 if (PyBytes_Check(rep)) { 5501 repsize = PyBytes_GET_SIZE(rep); 5502 if (repsize & 1) { 5503 raise_encode_exception(&exc, encoding, 
5504 str, pos - 1, pos, 5505 "surrogates not allowed"); 5506 goto error; 5507 } 5508 moreunits = repsize / 2; 5509 } 5510 else { 5511 assert(PyUnicode_Check(rep)); 5512 if (PyUnicode_READY(rep) < 0) 5513 goto error; 5514 moreunits = repsize = PyUnicode_GET_LENGTH(rep); 5515 if (!PyUnicode_IS_ASCII(rep)) { 5516 raise_encode_exception(&exc, encoding, 5517 str, pos - 1, pos, 5518 "surrogates not allowed"); 5519 goto error; 5520 } 5521 } 5522 5523 /* two bytes are reserved for each surrogate */ 5524 if (moreunits > 1) { 5525 Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v); 5526 Py_ssize_t morebytes = 2 * (moreunits - 1); 5527 if (PyBytes_GET_SIZE(v) > PY_SSIZE_T_MAX - morebytes) { 5528 /* integer overflow */ 5529 PyErr_NoMemory(); 5530 goto error; 5531 } 5532 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + morebytes) < 0) 5533 goto error; 5534 out = (unsigned short*) PyBytes_AS_STRING(v) + outpos; 5535 } 5536 5537 if (PyBytes_Check(rep)) { 5538 Py_MEMCPY(out, PyBytes_AS_STRING(rep), repsize); 5539 out += moreunits; 5540 } else /* rep is unicode */ { 5541 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND); 5542 ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize, 5543 &out, native_ordering); 5544 } 5545 5546 Py_CLEAR(rep); 5547 } 5548 5549 /* Cut back to size actually needed. This is necessary for, for example, 5550 encoding of a string containing isolated surrogates and the 'ignore' handler 5551 is used. 
*/ 5552 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v); 5553 if (nsize != PyBytes_GET_SIZE(v)) 5554 _PyBytes_Resize(&v, nsize); 5555 Py_XDECREF(errorHandler); 5556 Py_XDECREF(exc); 5557 done: 5558 return v; 5559 error: 5560 Py_XDECREF(rep); 5561 Py_XDECREF(errorHandler); 5562 Py_XDECREF(exc); 5563 Py_XDECREF(v); 5564 return NULL; 5565#undef STORECHAR 5566} 5567 5568PyObject * 5569PyUnicode_EncodeUTF16(const Py_UNICODE *s, 5570 Py_ssize_t size, 5571 const char *errors, 5572 int byteorder) 5573{ 5574 PyObject *result; 5575 PyObject *tmp = PyUnicode_FromUnicode(s, size); 5576 if (tmp == NULL) 5577 return NULL; 5578 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder); 5579 Py_DECREF(tmp); 5580 return result; 5581} 5582 5583PyObject * 5584PyUnicode_AsUTF16String(PyObject *unicode) 5585{ 5586 return _PyUnicode_EncodeUTF16(unicode, NULL, 0); 5587} 5588 5589/* --- Unicode Escape Codec ----------------------------------------------- */ 5590 5591/* Helper function for PyUnicode_DecodeUnicodeEscape, determines 5592 if all the escapes in the string make it still a valid ASCII string. 5593 Returns -1 if any escapes were found which cause the string to 5594 pop out of ASCII range. Otherwise returns the length of the 5595 required buffer to hold the string. 5596 */ 5597static Py_ssize_t 5598length_of_escaped_ascii_string(const char *s, Py_ssize_t size) 5599{ 5600 const unsigned char *p = (const unsigned char *)s; 5601 const unsigned char *end = p + size; 5602 Py_ssize_t length = 0; 5603 5604 if (size < 0) 5605 return -1; 5606 5607 for (; p < end; ++p) { 5608 if (*p > 127) { 5609 /* Non-ASCII */ 5610 return -1; 5611 } 5612 else if (*p != '\\') { 5613 /* Normal character */ 5614 ++length; 5615 } 5616 else { 5617 /* Backslash-escape, check next char */ 5618 ++p; 5619 /* Escape sequence reaches till end of string or 5620 non-ASCII follow-up. 
*/ 5621 if (p >= end || *p > 127) 5622 return -1; 5623 switch (*p) { 5624 case '\n': 5625 /* backslash + \n result in zero characters */ 5626 break; 5627 case '\\': case '\'': case '\"': 5628 case 'b': case 'f': case 't': 5629 case 'n': case 'r': case 'v': case 'a': 5630 ++length; 5631 break; 5632 case '0': case '1': case '2': case '3': 5633 case '4': case '5': case '6': case '7': 5634 case 'x': case 'u': case 'U': case 'N': 5635 /* these do not guarantee ASCII characters */ 5636 return -1; 5637 default: 5638 /* count the backslash + the other character */ 5639 length += 2; 5640 } 5641 } 5642 } 5643 return length; 5644} 5645 5646static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL; 5647 5648PyObject * 5649PyUnicode_DecodeUnicodeEscape(const char *s, 5650 Py_ssize_t size, 5651 const char *errors) 5652{ 5653 const char *starts = s; 5654 Py_ssize_t startinpos; 5655 Py_ssize_t endinpos; 5656 _PyUnicodeWriter writer; 5657 const char *end; 5658 char* message; 5659 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */ 5660 PyObject *errorHandler = NULL; 5661 PyObject *exc = NULL; 5662 Py_ssize_t len; 5663 5664 len = length_of_escaped_ascii_string(s, size); 5665 if (len == 0) 5666 _Py_RETURN_UNICODE_EMPTY(); 5667 5668 /* After length_of_escaped_ascii_string() there are two alternatives, 5669 either the string is pure ASCII with named escapes like \n, etc. 5670 and we determined it's exact size (common case) 5671 or it contains \x, \u, ... escape sequences. then we create a 5672 legacy wchar string and resize it at the end of this function. */ 5673 _PyUnicodeWriter_Init(&writer); 5674 if (len > 0) { 5675 writer.min_length = len; 5676 } 5677 else { 5678 /* Escaped strings will always be longer than the resulting 5679 Unicode string, so we start with size here and then reduce the 5680 length after conversion to the true value. 
5681 (but if the error callback returns a long replacement string 5682 we'll have to allocate more space) */ 5683 writer.min_length = size; 5684 } 5685 5686 if (size == 0) 5687 return _PyUnicodeWriter_Finish(&writer); 5688 end = s + size; 5689 5690 while (s < end) { 5691 unsigned char c; 5692 Py_UCS4 x; 5693 int digits; 5694 5695 /* Non-escape characters are interpreted as Unicode ordinals */ 5696 if (*s != '\\') { 5697 x = (unsigned char)*s; 5698 s++; 5699 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0) 5700 goto onError; 5701 continue; 5702 } 5703 5704 startinpos = s-starts; 5705 /* \ - Escapes */ 5706 s++; 5707 c = *s++; 5708 if (s > end) 5709 c = '\0'; /* Invalid after \ */ 5710 5711 switch (c) { 5712 5713 /* \x escapes */ 5714#define WRITECHAR(ch) \ 5715 do { \ 5716 if (_PyUnicodeWriter_WriteCharInline(&writer, (ch)) < 0) \ 5717 goto onError; \ 5718 } while(0) 5719 5720 case '\n': break; 5721 case '\\': WRITECHAR('\\'); break; 5722 case '\'': WRITECHAR('\''); break; 5723 case '\"': WRITECHAR('\"'); break; 5724 case 'b': WRITECHAR('\b'); break; 5725 /* FF */ 5726 case 'f': WRITECHAR('\014'); break; 5727 case 't': WRITECHAR('\t'); break; 5728 case 'n': WRITECHAR('\n'); break; 5729 case 'r': WRITECHAR('\r'); break; 5730 /* VT */ 5731 case 'v': WRITECHAR('\013'); break; 5732 /* BEL, not classic C */ 5733 case 'a': WRITECHAR('\007'); break; 5734 5735 /* \OOO (octal) escapes */ 5736 case '0': case '1': case '2': case '3': 5737 case '4': case '5': case '6': case '7': 5738 x = s[-1] - '0'; 5739 if (s < end && '0' <= *s && *s <= '7') { 5740 x = (x<<3) + *s++ - '0'; 5741 if (s < end && '0' <= *s && *s <= '7') 5742 x = (x<<3) + *s++ - '0'; 5743 } 5744 WRITECHAR(x); 5745 break; 5746 5747 /* hex escapes */ 5748 /* \xXX */ 5749 case 'x': 5750 digits = 2; 5751 message = "truncated \\xXX escape"; 5752 goto hexescape; 5753 5754 /* \uXXXX */ 5755 case 'u': 5756 digits = 4; 5757 message = "truncated \\uXXXX escape"; 5758 goto hexescape; 5759 5760 /* \UXXXXXXXX */ 5761 case 
'U': 5762 digits = 8; 5763 message = "truncated \\UXXXXXXXX escape"; 5764 hexescape: 5765 chr = 0; 5766 if (end - s < digits) { 5767 /* count only hex digits */ 5768 for (; s < end; ++s) { 5769 c = (unsigned char)*s; 5770 if (!Py_ISXDIGIT(c)) 5771 goto error; 5772 } 5773 goto error; 5774 } 5775 for (; digits--; ++s) { 5776 c = (unsigned char)*s; 5777 if (!Py_ISXDIGIT(c)) 5778 goto error; 5779 chr = (chr<<4) & ~0xF; 5780 if (c >= '0' && c <= '9') 5781 chr += c - '0'; 5782 else if (c >= 'a' && c <= 'f') 5783 chr += 10 + c - 'a'; 5784 else 5785 chr += 10 + c - 'A'; 5786 } 5787 if (chr == 0xffffffff && PyErr_Occurred()) 5788 /* _decoding_error will have already written into the 5789 target buffer. */ 5790 break; 5791 store: 5792 /* when we get here, chr is a 32-bit unicode character */ 5793 message = "illegal Unicode character"; 5794 if (chr > MAX_UNICODE) 5795 goto error; 5796 WRITECHAR(chr); 5797 break; 5798 5799 /* \N{name} */ 5800 case 'N': 5801 message = "malformed \\N character escape"; 5802 if (ucnhash_CAPI == NULL) { 5803 /* load the unicode data module */ 5804 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import( 5805 PyUnicodeData_CAPSULE_NAME, 1); 5806 if (ucnhash_CAPI == NULL) 5807 goto ucnhashError; 5808 } 5809 if (*s == '{') { 5810 const char *start = s+1; 5811 /* look for the closing brace */ 5812 while (*s != '}' && s < end) 5813 s++; 5814 if (s > start && s < end && *s == '}') { 5815 /* found a name. 
look it up in the unicode database */ 5816 message = "unknown Unicode character name"; 5817 s++; 5818 if (s - start - 1 <= INT_MAX && 5819 ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), 5820 &chr, 0)) 5821 goto store; 5822 } 5823 } 5824 goto error; 5825 5826 default: 5827 if (s > end) { 5828 message = "\\ at end of string"; 5829 s--; 5830 goto error; 5831 } 5832 else { 5833 WRITECHAR('\\'); 5834 WRITECHAR((unsigned char)s[-1]); 5835 } 5836 break; 5837 } 5838 continue; 5839 5840 error: 5841 endinpos = s-starts; 5842 if (unicode_decode_call_errorhandler_writer( 5843 errors, &errorHandler, 5844 "unicodeescape", message, 5845 &starts, &end, &startinpos, &endinpos, &exc, &s, 5846 &writer)) 5847 goto onError; 5848 continue; 5849 } 5850#undef WRITECHAR 5851 5852 Py_XDECREF(errorHandler); 5853 Py_XDECREF(exc); 5854 return _PyUnicodeWriter_Finish(&writer); 5855 5856 ucnhashError: 5857 PyErr_SetString( 5858 PyExc_UnicodeError, 5859 "\\N escapes not supported (can't load unicodedata module)" 5860 ); 5861 _PyUnicodeWriter_Dealloc(&writer); 5862 Py_XDECREF(errorHandler); 5863 Py_XDECREF(exc); 5864 return NULL; 5865 5866 onError: 5867 _PyUnicodeWriter_Dealloc(&writer); 5868 Py_XDECREF(errorHandler); 5869 Py_XDECREF(exc); 5870 return NULL; 5871} 5872 5873/* Return a Unicode-Escape string version of the Unicode object. 5874 5875 If quotes is true, the string is enclosed in u"" or u'' quotes as 5876 appropriate. 5877 5878*/ 5879 5880PyObject * 5881PyUnicode_AsUnicodeEscapeString(PyObject *unicode) 5882{ 5883 Py_ssize_t i, len; 5884 PyObject *repr; 5885 char *p; 5886 int kind; 5887 void *data; 5888 Py_ssize_t expandsize = 0; 5889 5890 /* Initial allocation is based on the longest-possible character 5891 escape. 5892 5893 For UCS1 strings it's '\xxx', 4 bytes per source character. 5894 For UCS2 strings it's '\uxxxx', 6 bytes per source character. 5895 For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character. 
5896 */ 5897 5898 if (!PyUnicode_Check(unicode)) { 5899 PyErr_BadArgument(); 5900 return NULL; 5901 } 5902 if (PyUnicode_READY(unicode) == -1) 5903 return NULL; 5904 len = PyUnicode_GET_LENGTH(unicode); 5905 kind = PyUnicode_KIND(unicode); 5906 data = PyUnicode_DATA(unicode); 5907 switch (kind) { 5908 case PyUnicode_1BYTE_KIND: expandsize = 4; break; 5909 case PyUnicode_2BYTE_KIND: expandsize = 6; break; 5910 case PyUnicode_4BYTE_KIND: expandsize = 10; break; 5911 } 5912 5913 if (len == 0) 5914 return PyBytes_FromStringAndSize(NULL, 0); 5915 5916 if (len > (PY_SSIZE_T_MAX - 2 - 1) / expandsize) 5917 return PyErr_NoMemory(); 5918 5919 repr = PyBytes_FromStringAndSize(NULL, 5920 2 5921 + expandsize*len 5922 + 1); 5923 if (repr == NULL) 5924 return NULL; 5925 5926 p = PyBytes_AS_STRING(repr); 5927 5928 for (i = 0; i < len; i++) { 5929 Py_UCS4 ch = PyUnicode_READ(kind, data, i); 5930 5931 /* Escape backslashes */ 5932 if (ch == '\\') { 5933 *p++ = '\\'; 5934 *p++ = (char) ch; 5935 continue; 5936 } 5937 5938 /* Map 21-bit characters to '\U00xxxxxx' */ 5939 else if (ch >= 0x10000) { 5940 assert(ch <= MAX_UNICODE); 5941 *p++ = '\\'; 5942 *p++ = 'U'; 5943 *p++ = Py_hexdigits[(ch >> 28) & 0x0000000F]; 5944 *p++ = Py_hexdigits[(ch >> 24) & 0x0000000F]; 5945 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F]; 5946 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F]; 5947 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F]; 5948 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F]; 5949 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F]; 5950 *p++ = Py_hexdigits[ch & 0x0000000F]; 5951 continue; 5952 } 5953 5954 /* Map 16-bit characters to '\uxxxx' */ 5955 if (ch >= 256) { 5956 *p++ = '\\'; 5957 *p++ = 'u'; 5958 *p++ = Py_hexdigits[(ch >> 12) & 0x000F]; 5959 *p++ = Py_hexdigits[(ch >> 8) & 0x000F]; 5960 *p++ = Py_hexdigits[(ch >> 4) & 0x000F]; 5961 *p++ = Py_hexdigits[ch & 0x000F]; 5962 } 5963 5964 /* Map special whitespace to '\t', \n', '\r' */ 5965 else if (ch == '\t') { 5966 *p++ = '\\'; 5967 *p++ = 't'; 5968 
} 5969 else if (ch == '\n') { 5970 *p++ = '\\'; 5971 *p++ = 'n'; 5972 } 5973 else if (ch == '\r') { 5974 *p++ = '\\'; 5975 *p++ = 'r'; 5976 } 5977 5978 /* Map non-printable US ASCII to '\xhh' */ 5979 else if (ch < ' ' || ch >= 0x7F) { 5980 *p++ = '\\'; 5981 *p++ = 'x'; 5982 *p++ = Py_hexdigits[(ch >> 4) & 0x000F]; 5983 *p++ = Py_hexdigits[ch & 0x000F]; 5984 } 5985 5986 /* Copy everything else as-is */ 5987 else 5988 *p++ = (char) ch; 5989 } 5990 5991 assert(p - PyBytes_AS_STRING(repr) > 0); 5992 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) 5993 return NULL; 5994 return repr; 5995} 5996 5997PyObject * 5998PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s, 5999 Py_ssize_t size) 6000{ 6001 PyObject *result; 6002 PyObject *tmp = PyUnicode_FromUnicode(s, size); 6003 if (tmp == NULL) 6004 return NULL; 6005 result = PyUnicode_AsUnicodeEscapeString(tmp); 6006 Py_DECREF(tmp); 6007 return result; 6008} 6009 6010/* --- Raw Unicode Escape Codec ------------------------------------------- */ 6011 6012PyObject * 6013PyUnicode_DecodeRawUnicodeEscape(const char *s, 6014 Py_ssize_t size, 6015 const char *errors) 6016{ 6017 const char *starts = s; 6018 Py_ssize_t startinpos; 6019 Py_ssize_t endinpos; 6020 _PyUnicodeWriter writer; 6021 const char *end; 6022 const char *bs; 6023 PyObject *errorHandler = NULL; 6024 PyObject *exc = NULL; 6025 6026 if (size == 0) 6027 _Py_RETURN_UNICODE_EMPTY(); 6028 6029 /* Escaped strings will always be longer than the resulting 6030 Unicode string, so we start with size here and then reduce the 6031 length after conversion to the true value. 
(But decoding error 6032 handler might have to resize the string) */ 6033 _PyUnicodeWriter_Init(&writer); 6034 writer.min_length = size; 6035 6036 end = s + size; 6037 while (s < end) { 6038 unsigned char c; 6039 Py_UCS4 x; 6040 int i; 6041 int count; 6042 6043 /* Non-escape characters are interpreted as Unicode ordinals */ 6044 if (*s != '\\') { 6045 x = (unsigned char)*s++; 6046 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0) 6047 goto onError; 6048 continue; 6049 } 6050 startinpos = s-starts; 6051 6052 /* \u-escapes are only interpreted iff the number of leading 6053 backslashes if odd */ 6054 bs = s; 6055 for (;s < end;) { 6056 if (*s != '\\') 6057 break; 6058 x = (unsigned char)*s++; 6059 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0) 6060 goto onError; 6061 } 6062 if (((s - bs) & 1) == 0 || 6063 s >= end || 6064 (*s != 'u' && *s != 'U')) { 6065 continue; 6066 } 6067 writer.pos--; 6068 count = *s=='u' ? 4 : 8; 6069 s++; 6070 6071 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */ 6072 for (x = 0, i = 0; i < count; ++i, ++s) { 6073 c = (unsigned char)*s; 6074 if (!Py_ISXDIGIT(c)) { 6075 endinpos = s-starts; 6076 if (unicode_decode_call_errorhandler_writer( 6077 errors, &errorHandler, 6078 "rawunicodeescape", "truncated \\uXXXX", 6079 &starts, &end, &startinpos, &endinpos, &exc, &s, 6080 &writer)) 6081 goto onError; 6082 goto nextByte; 6083 } 6084 x = (x<<4) & ~0xF; 6085 if (c >= '0' && c <= '9') 6086 x += c - '0'; 6087 else if (c >= 'a' && c <= 'f') 6088 x += 10 + c - 'a'; 6089 else 6090 x += 10 + c - 'A'; 6091 } 6092 if (x <= MAX_UNICODE) { 6093 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0) 6094 goto onError; 6095 } 6096 else { 6097 endinpos = s-starts; 6098 if (unicode_decode_call_errorhandler_writer( 6099 errors, &errorHandler, 6100 "rawunicodeescape", "\\Uxxxxxxxx out of range", 6101 &starts, &end, &startinpos, &endinpos, &exc, &s, 6102 &writer)) 6103 goto onError; 6104 } 6105 nextByte: 6106 ; 6107 } 6108 Py_XDECREF(errorHandler); 6109 
Py_XDECREF(exc); 6110 return _PyUnicodeWriter_Finish(&writer); 6111 6112 onError: 6113 _PyUnicodeWriter_Dealloc(&writer); 6114 Py_XDECREF(errorHandler); 6115 Py_XDECREF(exc); 6116 return NULL; 6117} 6118 6119 6120PyObject * 6121PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode) 6122{ 6123 PyObject *repr; 6124 char *p; 6125 char *q; 6126 Py_ssize_t expandsize, pos; 6127 int kind; 6128 void *data; 6129 Py_ssize_t len; 6130 6131 if (!PyUnicode_Check(unicode)) { 6132 PyErr_BadArgument(); 6133 return NULL; 6134 } 6135 if (PyUnicode_READY(unicode) == -1) 6136 return NULL; 6137 kind = PyUnicode_KIND(unicode); 6138 data = PyUnicode_DATA(unicode); 6139 len = PyUnicode_GET_LENGTH(unicode); 6140 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6 6141 bytes, and 1 byte characters 4. */ 6142 expandsize = kind * 2 + 2; 6143 6144 if (len > PY_SSIZE_T_MAX / expandsize) 6145 return PyErr_NoMemory(); 6146 6147 repr = PyBytes_FromStringAndSize(NULL, expandsize * len); 6148 if (repr == NULL) 6149 return NULL; 6150 if (len == 0) 6151 return repr; 6152 6153 p = q = PyBytes_AS_STRING(repr); 6154 for (pos = 0; pos < len; pos++) { 6155 Py_UCS4 ch = PyUnicode_READ(kind, data, pos); 6156 /* Map 32-bit characters to '\Uxxxxxxxx' */ 6157 if (ch >= 0x10000) { 6158 assert(ch <= MAX_UNICODE); 6159 *p++ = '\\'; 6160 *p++ = 'U'; 6161 *p++ = Py_hexdigits[(ch >> 28) & 0xf]; 6162 *p++ = Py_hexdigits[(ch >> 24) & 0xf]; 6163 *p++ = Py_hexdigits[(ch >> 20) & 0xf]; 6164 *p++ = Py_hexdigits[(ch >> 16) & 0xf]; 6165 *p++ = Py_hexdigits[(ch >> 12) & 0xf]; 6166 *p++ = Py_hexdigits[(ch >> 8) & 0xf]; 6167 *p++ = Py_hexdigits[(ch >> 4) & 0xf]; 6168 *p++ = Py_hexdigits[ch & 15]; 6169 } 6170 /* Map 16-bit characters to '\uxxxx' */ 6171 else if (ch >= 256) { 6172 *p++ = '\\'; 6173 *p++ = 'u'; 6174 *p++ = Py_hexdigits[(ch >> 12) & 0xf]; 6175 *p++ = Py_hexdigits[(ch >> 8) & 0xf]; 6176 *p++ = Py_hexdigits[(ch >> 4) & 0xf]; 6177 *p++ = Py_hexdigits[ch & 15]; 6178 } 6179 /* Copy everything else 
as-is */ 6180 else 6181 *p++ = (char) ch; 6182 } 6183 6184 assert(p > q); 6185 if (_PyBytes_Resize(&repr, p - q) < 0) 6186 return NULL; 6187 return repr; 6188} 6189 6190PyObject * 6191PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s, 6192 Py_ssize_t size) 6193{ 6194 PyObject *result; 6195 PyObject *tmp = PyUnicode_FromUnicode(s, size); 6196 if (tmp == NULL) 6197 return NULL; 6198 result = PyUnicode_AsRawUnicodeEscapeString(tmp); 6199 Py_DECREF(tmp); 6200 return result; 6201} 6202 6203/* --- Unicode Internal Codec ------------------------------------------- */ 6204 6205PyObject * 6206_PyUnicode_DecodeUnicodeInternal(const char *s, 6207 Py_ssize_t size, 6208 const char *errors) 6209{ 6210 const char *starts = s; 6211 Py_ssize_t startinpos; 6212 Py_ssize_t endinpos; 6213 _PyUnicodeWriter writer; 6214 const char *end; 6215 const char *reason; 6216 PyObject *errorHandler = NULL; 6217 PyObject *exc = NULL; 6218 6219 if (PyErr_WarnEx(PyExc_DeprecationWarning, 6220 "unicode_internal codec has been deprecated", 6221 1)) 6222 return NULL; 6223 6224 if (size == 0) 6225 _Py_RETURN_UNICODE_EMPTY(); 6226 6227 _PyUnicodeWriter_Init(&writer); 6228 if (size / Py_UNICODE_SIZE > PY_SSIZE_T_MAX - 1) { 6229 PyErr_NoMemory(); 6230 goto onError; 6231 } 6232 writer.min_length = (size + (Py_UNICODE_SIZE - 1)) / Py_UNICODE_SIZE; 6233 6234 end = s + size; 6235 while (s < end) { 6236 Py_UNICODE uch; 6237 Py_UCS4 ch; 6238 if (end - s < Py_UNICODE_SIZE) { 6239 endinpos = end-starts; 6240 reason = "truncated input"; 6241 goto error; 6242 } 6243 /* We copy the raw representation one byte at a time because the 6244 pointer may be unaligned (see test_codeccallbacks). */ 6245 ((char *) &uch)[0] = s[0]; 6246 ((char *) &uch)[1] = s[1]; 6247#ifdef Py_UNICODE_WIDE 6248 ((char *) &uch)[2] = s[2]; 6249 ((char *) &uch)[3] = s[3]; 6250#endif 6251 ch = uch; 6252#ifdef Py_UNICODE_WIDE 6253 /* We have to sanity check the raw data, otherwise doom looms for 6254 some malformed UCS-4 data. 
*/ 6255 if (ch > 0x10ffff) { 6256 endinpos = s - starts + Py_UNICODE_SIZE; 6257 reason = "illegal code point (> 0x10FFFF)"; 6258 goto error; 6259 } 6260#endif 6261 s += Py_UNICODE_SIZE; 6262#ifndef Py_UNICODE_WIDE 6263 if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && end - s >= Py_UNICODE_SIZE) 6264 { 6265 Py_UNICODE uch2; 6266 ((char *) &uch2)[0] = s[0]; 6267 ((char *) &uch2)[1] = s[1]; 6268 if (Py_UNICODE_IS_LOW_SURROGATE(uch2)) 6269 { 6270 ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2); 6271 s += Py_UNICODE_SIZE; 6272 } 6273 } 6274#endif 6275 6276 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) 6277 goto onError; 6278 continue; 6279 6280 error: 6281 startinpos = s - starts; 6282 if (unicode_decode_call_errorhandler_writer( 6283 errors, &errorHandler, 6284 "unicode_internal", reason, 6285 &starts, &end, &startinpos, &endinpos, &exc, &s, 6286 &writer)) 6287 goto onError; 6288 } 6289 6290 Py_XDECREF(errorHandler); 6291 Py_XDECREF(exc); 6292 return _PyUnicodeWriter_Finish(&writer); 6293 6294 onError: 6295 _PyUnicodeWriter_Dealloc(&writer); 6296 Py_XDECREF(errorHandler); 6297 Py_XDECREF(exc); 6298 return NULL; 6299} 6300 6301/* --- Latin-1 Codec ------------------------------------------------------ */ 6302 6303PyObject * 6304PyUnicode_DecodeLatin1(const char *s, 6305 Py_ssize_t size, 6306 const char *errors) 6307{ 6308 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. 
*/ 6309 return _PyUnicode_FromUCS1((unsigned char*)s, size); 6310} 6311 6312/* create or adjust a UnicodeEncodeError */ 6313static void 6314make_encode_exception(PyObject **exceptionObject, 6315 const char *encoding, 6316 PyObject *unicode, 6317 Py_ssize_t startpos, Py_ssize_t endpos, 6318 const char *reason) 6319{ 6320 if (*exceptionObject == NULL) { 6321 *exceptionObject = PyObject_CallFunction( 6322 PyExc_UnicodeEncodeError, "sOnns", 6323 encoding, unicode, startpos, endpos, reason); 6324 } 6325 else { 6326 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos)) 6327 goto onError; 6328 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos)) 6329 goto onError; 6330 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason)) 6331 goto onError; 6332 return; 6333 onError: 6334 Py_CLEAR(*exceptionObject); 6335 } 6336} 6337 6338/* raises a UnicodeEncodeError */ 6339static void 6340raise_encode_exception(PyObject **exceptionObject, 6341 const char *encoding, 6342 PyObject *unicode, 6343 Py_ssize_t startpos, Py_ssize_t endpos, 6344 const char *reason) 6345{ 6346 make_encode_exception(exceptionObject, 6347 encoding, unicode, startpos, endpos, reason); 6348 if (*exceptionObject != NULL) 6349 PyCodec_StrictErrors(*exceptionObject); 6350} 6351 6352/* error handling callback helper: 6353 build arguments, call the callback and check the arguments, 6354 put the result into newpos and return the replacement string, which 6355 has to be freed by the caller */ 6356static PyObject * 6357unicode_encode_call_errorhandler(const char *errors, 6358 PyObject **errorHandler, 6359 const char *encoding, const char *reason, 6360 PyObject *unicode, PyObject **exceptionObject, 6361 Py_ssize_t startpos, Py_ssize_t endpos, 6362 Py_ssize_t *newpos) 6363{ 6364 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple"; 6365 Py_ssize_t len; 6366 PyObject *restuple; 6367 PyObject *resunicode; 6368 6369 if (*errorHandler == NULL) { 6370 *errorHandler = 
PyCodec_LookupError(errors); 6371 if (*errorHandler == NULL) 6372 return NULL; 6373 } 6374 6375 if (PyUnicode_READY(unicode) == -1) 6376 return NULL; 6377 len = PyUnicode_GET_LENGTH(unicode); 6378 6379 make_encode_exception(exceptionObject, 6380 encoding, unicode, startpos, endpos, reason); 6381 if (*exceptionObject == NULL) 6382 return NULL; 6383 6384 restuple = PyObject_CallFunctionObjArgs( 6385 *errorHandler, *exceptionObject, NULL); 6386 if (restuple == NULL) 6387 return NULL; 6388 if (!PyTuple_Check(restuple)) { 6389 PyErr_SetString(PyExc_TypeError, &argparse[3]); 6390 Py_DECREF(restuple); 6391 return NULL; 6392 } 6393 if (!PyArg_ParseTuple(restuple, argparse, 6394 &resunicode, newpos)) { 6395 Py_DECREF(restuple); 6396 return NULL; 6397 } 6398 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) { 6399 PyErr_SetString(PyExc_TypeError, &argparse[3]); 6400 Py_DECREF(restuple); 6401 return NULL; 6402 } 6403 if (*newpos<0) 6404 *newpos = len + *newpos; 6405 if (*newpos<0 || *newpos>len) { 6406 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos); 6407 Py_DECREF(restuple); 6408 return NULL; 6409 } 6410 Py_INCREF(resunicode); 6411 Py_DECREF(restuple); 6412 return resunicode; 6413} 6414 6415static PyObject * 6416unicode_encode_ucs1(PyObject *unicode, 6417 const char *errors, 6418 const Py_UCS4 limit) 6419{ 6420 /* input state */ 6421 Py_ssize_t pos=0, size; 6422 int kind; 6423 void *data; 6424 /* output object */ 6425 PyObject *res; 6426 /* pointer into the output */ 6427 char *str; 6428 /* current output position */ 6429 Py_ssize_t ressize; 6430 const char *encoding = (limit == 256) ? "latin-1" : "ascii"; 6431 const char *reason = (limit == 256) ? 
"ordinal not in range(256)" : "ordinal not in range(128)"; 6432 PyObject *error_handler_obj = NULL; 6433 PyObject *exc = NULL; 6434 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN; 6435 6436 if (PyUnicode_READY(unicode) == -1) 6437 return NULL; 6438 size = PyUnicode_GET_LENGTH(unicode); 6439 kind = PyUnicode_KIND(unicode); 6440 data = PyUnicode_DATA(unicode); 6441 /* allocate enough for a simple encoding without 6442 replacements, if we need more, we'll resize */ 6443 if (size == 0) 6444 return PyBytes_FromStringAndSize(NULL, 0); 6445 res = PyBytes_FromStringAndSize(NULL, size); 6446 if (res == NULL) 6447 return NULL; 6448 str = PyBytes_AS_STRING(res); 6449 ressize = size; 6450 6451 while (pos < size) { 6452 Py_UCS4 ch = PyUnicode_READ(kind, data, pos); 6453 6454 /* can we encode this? */ 6455 if (ch < limit) { 6456 /* no overflow check, because we know that the space is enough */ 6457 *str++ = (char)ch; 6458 ++pos; 6459 } 6460 else { 6461 Py_ssize_t requiredsize; 6462 PyObject *repunicode; 6463 Py_ssize_t repsize, newpos, respos, i; 6464 /* startpos for collecting unencodable chars */ 6465 Py_ssize_t collstart = pos; 6466 Py_ssize_t collend = pos; 6467 /* find all unecodable characters */ 6468 6469 while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit)) 6470 ++collend; 6471 6472 /* cache callback name lookup (if not done yet, i.e. 
it's the first error) */ 6473 if (error_handler == _Py_ERROR_UNKNOWN) 6474 error_handler = get_error_handler(errors); 6475 6476 switch (error_handler) { 6477 case _Py_ERROR_STRICT: 6478 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason); 6479 goto onError; 6480 6481 case _Py_ERROR_REPLACE: 6482 while (collstart++ < collend) 6483 *str++ = '?'; 6484 /* fall through ignore error handler */ 6485 case _Py_ERROR_IGNORE: 6486 pos = collend; 6487 break; 6488 6489 case _Py_ERROR_XMLCHARREFREPLACE: 6490 respos = str - PyBytes_AS_STRING(res); 6491 requiredsize = respos; 6492 /* determine replacement size */ 6493 for (i = collstart; i < collend; ++i) { 6494 Py_ssize_t incr; 6495 6496 ch = PyUnicode_READ(kind, data, i); 6497 if (ch < 10) 6498 incr = 2+1+1; 6499 else if (ch < 100) 6500 incr = 2+2+1; 6501 else if (ch < 1000) 6502 incr = 2+3+1; 6503 else if (ch < 10000) 6504 incr = 2+4+1; 6505 else if (ch < 100000) 6506 incr = 2+5+1; 6507 else if (ch < 1000000) 6508 incr = 2+6+1; 6509 else { 6510 assert(ch <= MAX_UNICODE); 6511 incr = 2+7+1; 6512 } 6513 if (requiredsize > PY_SSIZE_T_MAX - incr) 6514 goto overflow; 6515 requiredsize += incr; 6516 } 6517 if (requiredsize > PY_SSIZE_T_MAX - (size - collend)) 6518 goto overflow; 6519 requiredsize += size - collend; 6520 if (requiredsize > ressize) { 6521 if (ressize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*ressize) 6522 requiredsize = 2*ressize; 6523 if (_PyBytes_Resize(&res, requiredsize)) 6524 goto onError; 6525 str = PyBytes_AS_STRING(res) + respos; 6526 ressize = requiredsize; 6527 } 6528 /* generate replacement */ 6529 for (i = collstart; i < collend; ++i) { 6530 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i)); 6531 } 6532 pos = collend; 6533 break; 6534 6535 default: 6536 repunicode = unicode_encode_call_errorhandler(errors, &error_handler_obj, 6537 encoding, reason, unicode, &exc, 6538 collstart, collend, &newpos); 6539 if (repunicode == NULL || (PyUnicode_Check(repunicode) && 6540 
PyUnicode_READY(repunicode) == -1)) 6541 goto onError; 6542 6543 if (PyBytes_Check(repunicode)) { 6544 /* Directly copy bytes result to output. */ 6545 repsize = PyBytes_Size(repunicode); 6546 if (repsize > 1) { 6547 /* Make room for all additional bytes. */ 6548 respos = str - PyBytes_AS_STRING(res); 6549 if (ressize > PY_SSIZE_T_MAX - repsize - 1) { 6550 Py_DECREF(repunicode); 6551 goto overflow; 6552 } 6553 if (_PyBytes_Resize(&res, ressize+repsize-1)) { 6554 Py_DECREF(repunicode); 6555 goto onError; 6556 } 6557 str = PyBytes_AS_STRING(res) + respos; 6558 ressize += repsize-1; 6559 } 6560 memcpy(str, PyBytes_AsString(repunicode), repsize); 6561 str += repsize; 6562 pos = newpos; 6563 Py_DECREF(repunicode); 6564 break; 6565 } 6566 6567 /* need more space? (at least enough for what we 6568 have+the replacement+the rest of the string, so 6569 we won't have to check space for encodable characters) */ 6570 respos = str - PyBytes_AS_STRING(res); 6571 repsize = PyUnicode_GET_LENGTH(repunicode); 6572 requiredsize = respos; 6573 if (requiredsize > PY_SSIZE_T_MAX - repsize) 6574 goto overflow; 6575 requiredsize += repsize; 6576 if (requiredsize > PY_SSIZE_T_MAX - (size - collend)) 6577 goto overflow; 6578 requiredsize += size - collend; 6579 if (requiredsize > ressize) { 6580 if (ressize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*ressize) 6581 requiredsize = 2*ressize; 6582 if (_PyBytes_Resize(&res, requiredsize)) { 6583 Py_DECREF(repunicode); 6584 goto onError; 6585 } 6586 str = PyBytes_AS_STRING(res) + respos; 6587 ressize = requiredsize; 6588 } 6589 6590 /* check if there is anything unencodable in the replacement 6591 and copy it to the output */ 6592 for (i = 0; repsize-->0; ++i, ++str) { 6593 ch = PyUnicode_READ_CHAR(repunicode, i); 6594 if (ch >= limit) { 6595 raise_encode_exception(&exc, encoding, unicode, 6596 pos, pos+1, reason); 6597 Py_DECREF(repunicode); 6598 goto onError; 6599 } 6600 *str = (char)ch; 6601 } 6602 pos = newpos; 6603 Py_DECREF(repunicode); 6604 } 
6605 } 6606 } 6607 /* Resize if we allocated to much */ 6608 size = str - PyBytes_AS_STRING(res); 6609 if (size < ressize) { /* If this falls res will be NULL */ 6610 assert(size >= 0); 6611 if (_PyBytes_Resize(&res, size) < 0) 6612 goto onError; 6613 } 6614 6615 Py_XDECREF(error_handler_obj); 6616 Py_XDECREF(exc); 6617 return res; 6618 6619 overflow: 6620 PyErr_SetString(PyExc_OverflowError, 6621 "encoded result is too long for a Python string"); 6622 6623 onError: 6624 Py_XDECREF(res); 6625 Py_XDECREF(error_handler_obj); 6626 Py_XDECREF(exc); 6627 return NULL; 6628} 6629 6630/* Deprecated */ 6631PyObject * 6632PyUnicode_EncodeLatin1(const Py_UNICODE *p, 6633 Py_ssize_t size, 6634 const char *errors) 6635{ 6636 PyObject *result; 6637 PyObject *unicode = PyUnicode_FromUnicode(p, size); 6638 if (unicode == NULL) 6639 return NULL; 6640 result = unicode_encode_ucs1(unicode, errors, 256); 6641 Py_DECREF(unicode); 6642 return result; 6643} 6644 6645PyObject * 6646_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors) 6647{ 6648 if (!PyUnicode_Check(unicode)) { 6649 PyErr_BadArgument(); 6650 return NULL; 6651 } 6652 if (PyUnicode_READY(unicode) == -1) 6653 return NULL; 6654 /* Fast path: if it is a one-byte string, construct 6655 bytes object directly. */ 6656 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) 6657 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode), 6658 PyUnicode_GET_LENGTH(unicode)); 6659 /* Non-Latin-1 characters present. Defer to above function to 6660 raise the exception. 
*/ 6661 return unicode_encode_ucs1(unicode, errors, 256); 6662} 6663 6664PyObject* 6665PyUnicode_AsLatin1String(PyObject *unicode) 6666{ 6667 return _PyUnicode_AsLatin1String(unicode, NULL); 6668} 6669 6670/* --- 7-bit ASCII Codec -------------------------------------------------- */ 6671 6672PyObject * 6673PyUnicode_DecodeASCII(const char *s, 6674 Py_ssize_t size, 6675 const char *errors) 6676{ 6677 const char *starts = s; 6678 _PyUnicodeWriter writer; 6679 int kind; 6680 void *data; 6681 Py_ssize_t startinpos; 6682 Py_ssize_t endinpos; 6683 Py_ssize_t outpos; 6684 const char *e; 6685 PyObject *error_handler_obj = NULL; 6686 PyObject *exc = NULL; 6687 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN; 6688 6689 if (size == 0) 6690 _Py_RETURN_UNICODE_EMPTY(); 6691 6692 /* ASCII is equivalent to the first 128 ordinals in Unicode. */ 6693 if (size == 1 && (unsigned char)s[0] < 128) 6694 return get_latin1_char((unsigned char)s[0]); 6695 6696 _PyUnicodeWriter_Init(&writer); 6697 writer.min_length = size; 6698 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0) 6699 return NULL; 6700 6701 e = s + size; 6702 data = writer.data; 6703 outpos = ascii_decode(s, e, (Py_UCS1 *)data); 6704 writer.pos = outpos; 6705 if (writer.pos == size) 6706 return _PyUnicodeWriter_Finish(&writer); 6707 6708 s += writer.pos; 6709 kind = writer.kind; 6710 while (s < e) { 6711 unsigned char c = (unsigned char)*s; 6712 if (c < 128) { 6713 PyUnicode_WRITE(kind, data, writer.pos, c); 6714 writer.pos++; 6715 ++s; 6716 continue; 6717 } 6718 6719 /* byte outsize range 0x00..0x7f: call the error handler */ 6720 6721 if (error_handler == _Py_ERROR_UNKNOWN) 6722 error_handler = get_error_handler(errors); 6723 6724 switch (error_handler) 6725 { 6726 case _Py_ERROR_REPLACE: 6727 case _Py_ERROR_SURROGATEESCAPE: 6728 /* Fast-path: the error handler only writes one character, 6729 but we may switch to UCS2 at the first write */ 6730 if (_PyUnicodeWriter_PrepareKind(&writer, 
PyUnicode_2BYTE_KIND) < 0) 6731 goto onError; 6732 kind = writer.kind; 6733 data = writer.data; 6734 6735 if (error_handler == _Py_ERROR_REPLACE) 6736 PyUnicode_WRITE(kind, data, writer.pos, 0xfffd); 6737 else 6738 PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00); 6739 writer.pos++; 6740 ++s; 6741 break; 6742 6743 case _Py_ERROR_IGNORE: 6744 ++s; 6745 break; 6746 6747 default: 6748 startinpos = s-starts; 6749 endinpos = startinpos + 1; 6750 if (unicode_decode_call_errorhandler_writer( 6751 errors, &error_handler_obj, 6752 "ascii", "ordinal not in range(128)", 6753 &starts, &e, &startinpos, &endinpos, &exc, &s, 6754 &writer)) 6755 goto onError; 6756 kind = writer.kind; 6757 data = writer.data; 6758 } 6759 } 6760 Py_XDECREF(error_handler_obj); 6761 Py_XDECREF(exc); 6762 return _PyUnicodeWriter_Finish(&writer); 6763 6764 onError: 6765 _PyUnicodeWriter_Dealloc(&writer); 6766 Py_XDECREF(error_handler_obj); 6767 Py_XDECREF(exc); 6768 return NULL; 6769} 6770 6771/* Deprecated */ 6772PyObject * 6773PyUnicode_EncodeASCII(const Py_UNICODE *p, 6774 Py_ssize_t size, 6775 const char *errors) 6776{ 6777 PyObject *result; 6778 PyObject *unicode = PyUnicode_FromUnicode(p, size); 6779 if (unicode == NULL) 6780 return NULL; 6781 result = unicode_encode_ucs1(unicode, errors, 128); 6782 Py_DECREF(unicode); 6783 return result; 6784} 6785 6786PyObject * 6787_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors) 6788{ 6789 if (!PyUnicode_Check(unicode)) { 6790 PyErr_BadArgument(); 6791 return NULL; 6792 } 6793 if (PyUnicode_READY(unicode) == -1) 6794 return NULL; 6795 /* Fast path: if it is an ASCII-only string, construct bytes object 6796 directly. Else defer to above function to raise the exception. 
*/ 6797 if (PyUnicode_IS_ASCII(unicode)) 6798 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode), 6799 PyUnicode_GET_LENGTH(unicode)); 6800 return unicode_encode_ucs1(unicode, errors, 128); 6801} 6802 6803PyObject * 6804PyUnicode_AsASCIIString(PyObject *unicode) 6805{ 6806 return _PyUnicode_AsASCIIString(unicode, NULL); 6807} 6808 6809#ifdef HAVE_MBCS 6810 6811/* --- MBCS codecs for Windows -------------------------------------------- */ 6812 6813#if SIZEOF_INT < SIZEOF_SIZE_T 6814#define NEED_RETRY 6815#endif 6816 6817#ifndef WC_ERR_INVALID_CHARS 6818# define WC_ERR_INVALID_CHARS 0x0080 6819#endif 6820 6821static char* 6822code_page_name(UINT code_page, PyObject **obj) 6823{ 6824 *obj = NULL; 6825 if (code_page == CP_ACP) 6826 return "mbcs"; 6827 if (code_page == CP_UTF7) 6828 return "CP_UTF7"; 6829 if (code_page == CP_UTF8) 6830 return "CP_UTF8"; 6831 6832 *obj = PyBytes_FromFormat("cp%u", code_page); 6833 if (*obj == NULL) 6834 return NULL; 6835 return PyBytes_AS_STRING(*obj); 6836} 6837 6838static DWORD 6839decode_code_page_flags(UINT code_page) 6840{ 6841 if (code_page == CP_UTF7) { 6842 /* The CP_UTF7 decoder only supports flags=0 */ 6843 return 0; 6844 } 6845 else 6846 return MB_ERR_INVALID_CHARS; 6847} 6848 6849/* 6850 * Decode a byte string from a Windows code page into unicode object in strict 6851 * mode. 6852 * 6853 * Returns consumed size if succeed, returns -2 on decode error, or raise an 6854 * OSError and returns -1 on other error. 
6855 */ 6856static int 6857decode_code_page_strict(UINT code_page, 6858 PyObject **v, 6859 const char *in, 6860 int insize) 6861{ 6862 const DWORD flags = decode_code_page_flags(code_page); 6863 wchar_t *out; 6864 DWORD outsize; 6865 6866 /* First get the size of the result */ 6867 assert(insize > 0); 6868 outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0); 6869 if (outsize <= 0) 6870 goto error; 6871 6872 if (*v == NULL) { 6873 /* Create unicode object */ 6874 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */ 6875 *v = (PyObject*)_PyUnicode_New(outsize); 6876 if (*v == NULL) 6877 return -1; 6878 out = PyUnicode_AS_UNICODE(*v); 6879 } 6880 else { 6881 /* Extend unicode object */ 6882 Py_ssize_t n = PyUnicode_GET_SIZE(*v); 6883 if (unicode_resize(v, n + outsize) < 0) 6884 return -1; 6885 out = PyUnicode_AS_UNICODE(*v) + n; 6886 } 6887 6888 /* Do the conversion */ 6889 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize); 6890 if (outsize <= 0) 6891 goto error; 6892 return insize; 6893 6894error: 6895 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION) 6896 return -2; 6897 PyErr_SetFromWindowsErr(0); 6898 return -1; 6899} 6900 6901/* 6902 * Decode a byte string from a code page into unicode object with an error 6903 * handler. 6904 * 6905 * Returns consumed size if succeed, or raise an OSError or 6906 * UnicodeDecodeError exception and returns -1 on error. 6907 */ 6908static int 6909decode_code_page_errors(UINT code_page, 6910 PyObject **v, 6911 const char *in, const int size, 6912 const char *errors, int final) 6913{ 6914 const char *startin = in; 6915 const char *endin = in + size; 6916 const DWORD flags = decode_code_page_flags(code_page); 6917 /* Ideally, we should get reason from FormatMessage. This is the Windows 6918 2000 English version of the message. 
*/ 6919 const char *reason = "No mapping for the Unicode character exists " 6920 "in the target code page."; 6921 /* each step cannot decode more than 1 character, but a character can be 6922 represented as a surrogate pair */ 6923 wchar_t buffer[2], *startout, *out; 6924 int insize; 6925 Py_ssize_t outsize; 6926 PyObject *errorHandler = NULL; 6927 PyObject *exc = NULL; 6928 PyObject *encoding_obj = NULL; 6929 char *encoding; 6930 DWORD err; 6931 int ret = -1; 6932 6933 assert(size > 0); 6934 6935 encoding = code_page_name(code_page, &encoding_obj); 6936 if (encoding == NULL) 6937 return -1; 6938 6939 if ((errors == NULL || strcmp(errors, "strict") == 0) && final) { 6940 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a 6941 UnicodeDecodeError. */ 6942 make_decode_exception(&exc, encoding, in, size, 0, 0, reason); 6943 if (exc != NULL) { 6944 PyCodec_StrictErrors(exc); 6945 Py_CLEAR(exc); 6946 } 6947 goto error; 6948 } 6949 6950 if (*v == NULL) { 6951 /* Create unicode object */ 6952 if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) { 6953 PyErr_NoMemory(); 6954 goto error; 6955 } 6956 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */ 6957 *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer)); 6958 if (*v == NULL) 6959 goto error; 6960 startout = PyUnicode_AS_UNICODE(*v); 6961 } 6962 else { 6963 /* Extend unicode object */ 6964 Py_ssize_t n = PyUnicode_GET_SIZE(*v); 6965 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) { 6966 PyErr_NoMemory(); 6967 goto error; 6968 } 6969 if (unicode_resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0) 6970 goto error; 6971 startout = PyUnicode_AS_UNICODE(*v) + n; 6972 } 6973 6974 /* Decode the byte string character per character */ 6975 out = startout; 6976 while (in < endin) 6977 { 6978 /* Decode a character */ 6979 insize = 1; 6980 do 6981 { 6982 outsize = MultiByteToWideChar(code_page, flags, 6983 in, insize, 6984 buffer, 
Py_ARRAY_LENGTH(buffer)); 6985 if (outsize > 0) 6986 break; 6987 err = GetLastError(); 6988 if (err != ERROR_NO_UNICODE_TRANSLATION 6989 && err != ERROR_INSUFFICIENT_BUFFER) 6990 { 6991 PyErr_SetFromWindowsErr(0); 6992 goto error; 6993 } 6994 insize++; 6995 } 6996 /* 4=maximum length of a UTF-8 sequence */ 6997 while (insize <= 4 && (in + insize) <= endin); 6998 6999 if (outsize <= 0) { 7000 Py_ssize_t startinpos, endinpos, outpos; 7001 7002 /* last character in partial decode? */ 7003 if (in + insize >= endin && !final) 7004 break; 7005 7006 startinpos = in - startin; 7007 endinpos = startinpos + 1; 7008 outpos = out - PyUnicode_AS_UNICODE(*v); 7009 if (unicode_decode_call_errorhandler_wchar( 7010 errors, &errorHandler, 7011 encoding, reason, 7012 &startin, &endin, &startinpos, &endinpos, &exc, &in, 7013 v, &outpos)) 7014 { 7015 goto error; 7016 } 7017 out = PyUnicode_AS_UNICODE(*v) + outpos; 7018 } 7019 else { 7020 in += insize; 7021 memcpy(out, buffer, outsize * sizeof(wchar_t)); 7022 out += outsize; 7023 } 7024 } 7025 7026 /* write a NUL character at the end */ 7027 *out = 0; 7028 7029 /* Extend unicode object */ 7030 outsize = out - startout; 7031 assert(outsize <= PyUnicode_WSTR_LENGTH(*v)); 7032 if (unicode_resize(v, outsize) < 0) 7033 goto error; 7034 /* (in - startin) <= size and size is an int */ 7035 ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int); 7036 7037error: 7038 Py_XDECREF(encoding_obj); 7039 Py_XDECREF(errorHandler); 7040 Py_XDECREF(exc); 7041 return ret; 7042} 7043 7044static PyObject * 7045decode_code_page_stateful(int code_page, 7046 const char *s, Py_ssize_t size, 7047 const char *errors, Py_ssize_t *consumed) 7048{ 7049 PyObject *v = NULL; 7050 int chunk_size, final, converted, done; 7051 7052 if (code_page < 0) { 7053 PyErr_SetString(PyExc_ValueError, "invalid code page number"); 7054 return NULL; 7055 } 7056 7057 if (consumed) 7058 *consumed = 0; 7059 7060 do 7061 { 7062#ifdef NEED_RETRY 7063 if (size > INT_MAX) { 7064 chunk_size = 
INT_MAX; 7065 final = 0; 7066 done = 0; 7067 } 7068 else 7069#endif 7070 { 7071 chunk_size = (int)size; 7072 final = (consumed == NULL); 7073 done = 1; 7074 } 7075 7076 if (chunk_size == 0 && done) { 7077 if (v != NULL) 7078 break; 7079 _Py_RETURN_UNICODE_EMPTY(); 7080 } 7081 7082 converted = decode_code_page_strict(code_page, &v, 7083 s, chunk_size); 7084 if (converted == -2) 7085 converted = decode_code_page_errors(code_page, &v, 7086 s, chunk_size, 7087 errors, final); 7088 assert(converted != 0 || done); 7089 7090 if (converted < 0) { 7091 Py_XDECREF(v); 7092 return NULL; 7093 } 7094 7095 if (consumed) 7096 *consumed += converted; 7097 7098 s += converted; 7099 size -= converted; 7100 } while (!done); 7101 7102 return unicode_result(v); 7103} 7104 7105PyObject * 7106PyUnicode_DecodeCodePageStateful(int code_page, 7107 const char *s, 7108 Py_ssize_t size, 7109 const char *errors, 7110 Py_ssize_t *consumed) 7111{ 7112 return decode_code_page_stateful(code_page, s, size, errors, consumed); 7113} 7114 7115PyObject * 7116PyUnicode_DecodeMBCSStateful(const char *s, 7117 Py_ssize_t size, 7118 const char *errors, 7119 Py_ssize_t *consumed) 7120{ 7121 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed); 7122} 7123 7124PyObject * 7125PyUnicode_DecodeMBCS(const char *s, 7126 Py_ssize_t size, 7127 const char *errors) 7128{ 7129 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL); 7130} 7131 7132static DWORD 7133encode_code_page_flags(UINT code_page, const char *errors) 7134{ 7135 if (code_page == CP_UTF8) { 7136 return WC_ERR_INVALID_CHARS; 7137 } 7138 else if (code_page == CP_UTF7) { 7139 /* CP_UTF7 only supports flags=0 */ 7140 return 0; 7141 } 7142 else { 7143 if (errors != NULL && strcmp(errors, "replace") == 0) 7144 return 0; 7145 else 7146 return WC_NO_BEST_FIT_CHARS; 7147 } 7148} 7149 7150/* 7151 * Encode a Unicode string to a Windows code page into a byte string in strict 7152 * mode. 
7153 * 7154 * Returns consumed characters if succeed, returns -2 on encode error, or raise 7155 * an OSError and returns -1 on other error. 7156 */ 7157static int 7158encode_code_page_strict(UINT code_page, PyObject **outbytes, 7159 PyObject *unicode, Py_ssize_t offset, int len, 7160 const char* errors) 7161{ 7162 BOOL usedDefaultChar = FALSE; 7163 BOOL *pusedDefaultChar = &usedDefaultChar; 7164 int outsize; 7165 PyObject *exc = NULL; 7166 wchar_t *p; 7167 Py_ssize_t size; 7168 const DWORD flags = encode_code_page_flags(code_page, NULL); 7169 char *out; 7170 /* Create a substring so that we can get the UTF-16 representation 7171 of just the slice under consideration. */ 7172 PyObject *substring; 7173 7174 assert(len > 0); 7175 7176 if (code_page != CP_UTF8 && code_page != CP_UTF7) 7177 pusedDefaultChar = &usedDefaultChar; 7178 else 7179 pusedDefaultChar = NULL; 7180 7181 substring = PyUnicode_Substring(unicode, offset, offset+len); 7182 if (substring == NULL) 7183 return -1; 7184 p = PyUnicode_AsUnicodeAndSize(substring, &size); 7185 if (p == NULL) { 7186 Py_DECREF(substring); 7187 return -1; 7188 } 7189 assert(size <= INT_MAX); 7190 7191 /* First get the size of the result */ 7192 outsize = WideCharToMultiByte(code_page, flags, 7193 p, (int)size, 7194 NULL, 0, 7195 NULL, pusedDefaultChar); 7196 if (outsize <= 0) 7197 goto error; 7198 /* If we used a default char, then we failed! 
*/ 7199 if (pusedDefaultChar && *pusedDefaultChar) { 7200 Py_DECREF(substring); 7201 return -2; 7202 } 7203 7204 if (*outbytes == NULL) { 7205 /* Create string object */ 7206 *outbytes = PyBytes_FromStringAndSize(NULL, outsize); 7207 if (*outbytes == NULL) { 7208 Py_DECREF(substring); 7209 return -1; 7210 } 7211 out = PyBytes_AS_STRING(*outbytes); 7212 } 7213 else { 7214 /* Extend string object */ 7215 const Py_ssize_t n = PyBytes_Size(*outbytes); 7216 if (outsize > PY_SSIZE_T_MAX - n) { 7217 PyErr_NoMemory(); 7218 Py_DECREF(substring); 7219 return -1; 7220 } 7221 if (_PyBytes_Resize(outbytes, n + outsize) < 0) { 7222 Py_DECREF(substring); 7223 return -1; 7224 } 7225 out = PyBytes_AS_STRING(*outbytes) + n; 7226 } 7227 7228 /* Do the conversion */ 7229 outsize = WideCharToMultiByte(code_page, flags, 7230 p, (int)size, 7231 out, outsize, 7232 NULL, pusedDefaultChar); 7233 Py_CLEAR(substring); 7234 if (outsize <= 0) 7235 goto error; 7236 if (pusedDefaultChar && *pusedDefaultChar) 7237 return -2; 7238 return 0; 7239 7240error: 7241 Py_XDECREF(substring); 7242 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION) 7243 return -2; 7244 PyErr_SetFromWindowsErr(0); 7245 return -1; 7246} 7247 7248/* 7249 * Encode a Unicode string to a Windows code page into a byte string using a 7250 * error handler. 7251 * 7252 * Returns consumed characters if succeed, or raise an OSError and returns 7253 * -1 on other error. 7254 */ 7255static int 7256encode_code_page_errors(UINT code_page, PyObject **outbytes, 7257 PyObject *unicode, Py_ssize_t unicode_offset, 7258 Py_ssize_t insize, const char* errors) 7259{ 7260 const DWORD flags = encode_code_page_flags(code_page, errors); 7261 Py_ssize_t pos = unicode_offset; 7262 Py_ssize_t endin = unicode_offset + insize; 7263 /* Ideally, we should get reason from FormatMessage. This is the Windows 7264 2000 English version of the message. 
*/ 7265 const char *reason = "invalid character"; 7266 /* 4=maximum length of a UTF-8 sequence */ 7267 char buffer[4]; 7268 BOOL usedDefaultChar = FALSE, *pusedDefaultChar; 7269 Py_ssize_t outsize; 7270 char *out; 7271 PyObject *errorHandler = NULL; 7272 PyObject *exc = NULL; 7273 PyObject *encoding_obj = NULL; 7274 char *encoding; 7275 Py_ssize_t newpos, newoutsize; 7276 PyObject *rep; 7277 int ret = -1; 7278 7279 assert(insize > 0); 7280 7281 encoding = code_page_name(code_page, &encoding_obj); 7282 if (encoding == NULL) 7283 return -1; 7284 7285 if (errors == NULL || strcmp(errors, "strict") == 0) { 7286 /* The last error was ERROR_NO_UNICODE_TRANSLATION, 7287 then we raise a UnicodeEncodeError. */ 7288 make_encode_exception(&exc, encoding, unicode, 0, 0, reason); 7289 if (exc != NULL) { 7290 PyCodec_StrictErrors(exc); 7291 Py_DECREF(exc); 7292 } 7293 Py_XDECREF(encoding_obj); 7294 return -1; 7295 } 7296 7297 if (code_page != CP_UTF8 && code_page != CP_UTF7) 7298 pusedDefaultChar = &usedDefaultChar; 7299 else 7300 pusedDefaultChar = NULL; 7301 7302 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) { 7303 PyErr_NoMemory(); 7304 goto error; 7305 } 7306 outsize = insize * Py_ARRAY_LENGTH(buffer); 7307 7308 if (*outbytes == NULL) { 7309 /* Create string object */ 7310 *outbytes = PyBytes_FromStringAndSize(NULL, outsize); 7311 if (*outbytes == NULL) 7312 goto error; 7313 out = PyBytes_AS_STRING(*outbytes); 7314 } 7315 else { 7316 /* Extend string object */ 7317 Py_ssize_t n = PyBytes_Size(*outbytes); 7318 if (n > PY_SSIZE_T_MAX - outsize) { 7319 PyErr_NoMemory(); 7320 goto error; 7321 } 7322 if (_PyBytes_Resize(outbytes, n + outsize) < 0) 7323 goto error; 7324 out = PyBytes_AS_STRING(*outbytes) + n; 7325 } 7326 7327 /* Encode the string character per character */ 7328 while (pos < endin) 7329 { 7330 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos); 7331 wchar_t chars[2]; 7332 int charsize; 7333 if (ch < 0x10000) { 7334 chars[0] = (wchar_t)ch; 7335 charsize = 1; 
7336 } 7337 else { 7338 chars[0] = Py_UNICODE_HIGH_SURROGATE(ch); 7339 chars[1] = Py_UNICODE_LOW_SURROGATE(ch); 7340 charsize = 2; 7341 } 7342 7343 outsize = WideCharToMultiByte(code_page, flags, 7344 chars, charsize, 7345 buffer, Py_ARRAY_LENGTH(buffer), 7346 NULL, pusedDefaultChar); 7347 if (outsize > 0) { 7348 if (pusedDefaultChar == NULL || !(*pusedDefaultChar)) 7349 { 7350 pos++; 7351 memcpy(out, buffer, outsize); 7352 out += outsize; 7353 continue; 7354 } 7355 } 7356 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) { 7357 PyErr_SetFromWindowsErr(0); 7358 goto error; 7359 } 7360 7361 rep = unicode_encode_call_errorhandler( 7362 errors, &errorHandler, encoding, reason, 7363 unicode, &exc, 7364 pos, pos + 1, &newpos); 7365 if (rep == NULL) 7366 goto error; 7367 pos = newpos; 7368 7369 if (PyBytes_Check(rep)) { 7370 outsize = PyBytes_GET_SIZE(rep); 7371 if (outsize != 1) { 7372 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes); 7373 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1); 7374 if (_PyBytes_Resize(outbytes, newoutsize) < 0) { 7375 Py_DECREF(rep); 7376 goto error; 7377 } 7378 out = PyBytes_AS_STRING(*outbytes) + offset; 7379 } 7380 memcpy(out, PyBytes_AS_STRING(rep), outsize); 7381 out += outsize; 7382 } 7383 else { 7384 Py_ssize_t i; 7385 enum PyUnicode_Kind kind; 7386 void *data; 7387 7388 if (PyUnicode_READY(rep) == -1) { 7389 Py_DECREF(rep); 7390 goto error; 7391 } 7392 7393 outsize = PyUnicode_GET_LENGTH(rep); 7394 if (outsize != 1) { 7395 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes); 7396 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1); 7397 if (_PyBytes_Resize(outbytes, newoutsize) < 0) { 7398 Py_DECREF(rep); 7399 goto error; 7400 } 7401 out = PyBytes_AS_STRING(*outbytes) + offset; 7402 } 7403 kind = PyUnicode_KIND(rep); 7404 data = PyUnicode_DATA(rep); 7405 for (i=0; i < outsize; i++) { 7406 Py_UCS4 ch = PyUnicode_READ(kind, data, i); 7407 if (ch > 127) { 7408 raise_encode_exception(&exc, 7409 encoding, 
unicode, 7410 pos, pos + 1, 7411 "unable to encode error handler result to ASCII"); 7412 Py_DECREF(rep); 7413 goto error; 7414 } 7415 *out = (unsigned char)ch; 7416 out++; 7417 } 7418 } 7419 Py_DECREF(rep); 7420 } 7421 /* write a NUL byte */ 7422 *out = 0; 7423 outsize = out - PyBytes_AS_STRING(*outbytes); 7424 assert(outsize <= PyBytes_GET_SIZE(*outbytes)); 7425 if (_PyBytes_Resize(outbytes, outsize) < 0) 7426 goto error; 7427 ret = 0; 7428 7429error: 7430 Py_XDECREF(encoding_obj); 7431 Py_XDECREF(errorHandler); 7432 Py_XDECREF(exc); 7433 return ret; 7434} 7435 7436static PyObject * 7437encode_code_page(int code_page, 7438 PyObject *unicode, 7439 const char *errors) 7440{ 7441 Py_ssize_t len; 7442 PyObject *outbytes = NULL; 7443 Py_ssize_t offset; 7444 int chunk_len, ret, done; 7445 7446 if (!PyUnicode_Check(unicode)) { 7447 PyErr_BadArgument(); 7448 return NULL; 7449 } 7450 7451 if (PyUnicode_READY(unicode) == -1) 7452 return NULL; 7453 len = PyUnicode_GET_LENGTH(unicode); 7454 7455 if (code_page < 0) { 7456 PyErr_SetString(PyExc_ValueError, "invalid code page number"); 7457 return NULL; 7458 } 7459 7460 if (len == 0) 7461 return PyBytes_FromStringAndSize(NULL, 0); 7462 7463 offset = 0; 7464 do 7465 { 7466#ifdef NEED_RETRY 7467 /* UTF-16 encoding may double the size, so use only INT_MAX/2 7468 chunks. 
*/ 7469 if (len > INT_MAX/2) { 7470 chunk_len = INT_MAX/2; 7471 done = 0; 7472 } 7473 else 7474#endif 7475 { 7476 chunk_len = (int)len; 7477 done = 1; 7478 } 7479 7480 ret = encode_code_page_strict(code_page, &outbytes, 7481 unicode, offset, chunk_len, 7482 errors); 7483 if (ret == -2) 7484 ret = encode_code_page_errors(code_page, &outbytes, 7485 unicode, offset, 7486 chunk_len, errors); 7487 if (ret < 0) { 7488 Py_XDECREF(outbytes); 7489 return NULL; 7490 } 7491 7492 offset += chunk_len; 7493 len -= chunk_len; 7494 } while (!done); 7495 7496 return outbytes; 7497} 7498 7499PyObject * 7500PyUnicode_EncodeMBCS(const Py_UNICODE *p, 7501 Py_ssize_t size, 7502 const char *errors) 7503{ 7504 PyObject *unicode, *res; 7505 unicode = PyUnicode_FromUnicode(p, size); 7506 if (unicode == NULL) 7507 return NULL; 7508 res = encode_code_page(CP_ACP, unicode, errors); 7509 Py_DECREF(unicode); 7510 return res; 7511} 7512 7513PyObject * 7514PyUnicode_EncodeCodePage(int code_page, 7515 PyObject *unicode, 7516 const char *errors) 7517{ 7518 return encode_code_page(code_page, unicode, errors); 7519} 7520 7521PyObject * 7522PyUnicode_AsMBCSString(PyObject *unicode) 7523{ 7524 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL); 7525} 7526 7527#undef NEED_RETRY 7528 7529#endif /* HAVE_MBCS */ 7530 7531/* --- Character Mapping Codec -------------------------------------------- */ 7532 7533static int 7534charmap_decode_string(const char *s, 7535 Py_ssize_t size, 7536 PyObject *mapping, 7537 const char *errors, 7538 _PyUnicodeWriter *writer) 7539{ 7540 const char *starts = s; 7541 const char *e; 7542 Py_ssize_t startinpos, endinpos; 7543 PyObject *errorHandler = NULL, *exc = NULL; 7544 Py_ssize_t maplen; 7545 enum PyUnicode_Kind mapkind; 7546 void *mapdata; 7547 Py_UCS4 x; 7548 unsigned char ch; 7549 7550 if (PyUnicode_READY(mapping) == -1) 7551 return -1; 7552 7553 maplen = PyUnicode_GET_LENGTH(mapping); 7554 mapdata = PyUnicode_DATA(mapping); 7555 mapkind = PyUnicode_KIND(mapping); 
7556 7557 e = s + size; 7558 7559 if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) { 7560 /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1 7561 * is disabled in encoding aliases, latin1 is preferred because 7562 * its implementation is faster. */ 7563 Py_UCS1 *mapdata_ucs1 = (Py_UCS1 *)mapdata; 7564 Py_UCS1 *outdata = (Py_UCS1 *)writer->data; 7565 Py_UCS4 maxchar = writer->maxchar; 7566 7567 assert (writer->kind == PyUnicode_1BYTE_KIND); 7568 while (s < e) { 7569 ch = *s; 7570 x = mapdata_ucs1[ch]; 7571 if (x > maxchar) { 7572 if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1) 7573 goto onError; 7574 maxchar = writer->maxchar; 7575 outdata = (Py_UCS1 *)writer->data; 7576 } 7577 outdata[writer->pos] = x; 7578 writer->pos++; 7579 ++s; 7580 } 7581 return 0; 7582 } 7583 7584 while (s < e) { 7585 if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) { 7586 enum PyUnicode_Kind outkind = writer->kind; 7587 Py_UCS2 *mapdata_ucs2 = (Py_UCS2 *)mapdata; 7588 if (outkind == PyUnicode_1BYTE_KIND) { 7589 Py_UCS1 *outdata = (Py_UCS1 *)writer->data; 7590 Py_UCS4 maxchar = writer->maxchar; 7591 while (s < e) { 7592 ch = *s; 7593 x = mapdata_ucs2[ch]; 7594 if (x > maxchar) 7595 goto Error; 7596 outdata[writer->pos] = x; 7597 writer->pos++; 7598 ++s; 7599 } 7600 break; 7601 } 7602 else if (outkind == PyUnicode_2BYTE_KIND) { 7603 Py_UCS2 *outdata = (Py_UCS2 *)writer->data; 7604 while (s < e) { 7605 ch = *s; 7606 x = mapdata_ucs2[ch]; 7607 if (x == 0xFFFE) 7608 goto Error; 7609 outdata[writer->pos] = x; 7610 writer->pos++; 7611 ++s; 7612 } 7613 break; 7614 } 7615 } 7616 ch = *s; 7617 7618 if (ch < maplen) 7619 x = PyUnicode_READ(mapkind, mapdata, ch); 7620 else 7621 x = 0xfffe; /* invalid value */ 7622Error: 7623 if (x == 0xfffe) 7624 { 7625 /* undefined mapping */ 7626 startinpos = s-starts; 7627 endinpos = startinpos+1; 7628 if (unicode_decode_call_errorhandler_writer( 7629 errors, &errorHandler, 7630 "charmap", "character maps to <undefined>", 7631 &starts, 
&e, &startinpos, &endinpos, &exc, &s, 7632 writer)) { 7633 goto onError; 7634 } 7635 continue; 7636 } 7637 7638 if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0) 7639 goto onError; 7640 ++s; 7641 } 7642 Py_XDECREF(errorHandler); 7643 Py_XDECREF(exc); 7644 return 0; 7645 7646onError: 7647 Py_XDECREF(errorHandler); 7648 Py_XDECREF(exc); 7649 return -1; 7650} 7651 7652static int 7653charmap_decode_mapping(const char *s, 7654 Py_ssize_t size, 7655 PyObject *mapping, 7656 const char *errors, 7657 _PyUnicodeWriter *writer) 7658{ 7659 const char *starts = s; 7660 const char *e; 7661 Py_ssize_t startinpos, endinpos; 7662 PyObject *errorHandler = NULL, *exc = NULL; 7663 unsigned char ch; 7664 PyObject *key, *item = NULL; 7665 7666 e = s + size; 7667 7668 while (s < e) { 7669 ch = *s; 7670 7671 /* Get mapping (char ordinal -> integer, Unicode char or None) */ 7672 key = PyLong_FromLong((long)ch); 7673 if (key == NULL) 7674 goto onError; 7675 7676 item = PyObject_GetItem(mapping, key); 7677 Py_DECREF(key); 7678 if (item == NULL) { 7679 if (PyErr_ExceptionMatches(PyExc_LookupError)) { 7680 /* No mapping found means: mapping is undefined. 
*/ 7681 PyErr_Clear(); 7682 goto Undefined; 7683 } else 7684 goto onError; 7685 } 7686 7687 /* Apply mapping */ 7688 if (item == Py_None) 7689 goto Undefined; 7690 if (PyLong_Check(item)) { 7691 long value = PyLong_AS_LONG(item); 7692 if (value == 0xFFFE) 7693 goto Undefined; 7694 if (value < 0 || value > MAX_UNICODE) { 7695 PyErr_Format(PyExc_TypeError, 7696 "character mapping must be in range(0x%lx)", 7697 (unsigned long)MAX_UNICODE + 1); 7698 goto onError; 7699 } 7700 7701 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0) 7702 goto onError; 7703 } 7704 else if (PyUnicode_Check(item)) { 7705 if (PyUnicode_READY(item) == -1) 7706 goto onError; 7707 if (PyUnicode_GET_LENGTH(item) == 1) { 7708 Py_UCS4 value = PyUnicode_READ_CHAR(item, 0); 7709 if (value == 0xFFFE) 7710 goto Undefined; 7711 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0) 7712 goto onError; 7713 } 7714 else { 7715 writer->overallocate = 1; 7716 if (_PyUnicodeWriter_WriteStr(writer, item) == -1) 7717 goto onError; 7718 } 7719 } 7720 else { 7721 /* wrong return value */ 7722 PyErr_SetString(PyExc_TypeError, 7723 "character mapping must return integer, None or str"); 7724 goto onError; 7725 } 7726 Py_CLEAR(item); 7727 ++s; 7728 continue; 7729 7730Undefined: 7731 /* undefined mapping */ 7732 Py_CLEAR(item); 7733 startinpos = s-starts; 7734 endinpos = startinpos+1; 7735 if (unicode_decode_call_errorhandler_writer( 7736 errors, &errorHandler, 7737 "charmap", "character maps to <undefined>", 7738 &starts, &e, &startinpos, &endinpos, &exc, &s, 7739 writer)) { 7740 goto onError; 7741 } 7742 } 7743 Py_XDECREF(errorHandler); 7744 Py_XDECREF(exc); 7745 return 0; 7746 7747onError: 7748 Py_XDECREF(item); 7749 Py_XDECREF(errorHandler); 7750 Py_XDECREF(exc); 7751 return -1; 7752} 7753 7754PyObject * 7755PyUnicode_DecodeCharmap(const char *s, 7756 Py_ssize_t size, 7757 PyObject *mapping, 7758 const char *errors) 7759{ 7760 _PyUnicodeWriter writer; 7761 7762 /* Default to Latin-1 */ 7763 if (mapping == 
NULL) 7764 return PyUnicode_DecodeLatin1(s, size, errors); 7765 7766 if (size == 0) 7767 _Py_RETURN_UNICODE_EMPTY(); 7768 _PyUnicodeWriter_Init(&writer); 7769 writer.min_length = size; 7770 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1) 7771 goto onError; 7772 7773 if (PyUnicode_CheckExact(mapping)) { 7774 if (charmap_decode_string(s, size, mapping, errors, &writer) < 0) 7775 goto onError; 7776 } 7777 else { 7778 if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0) 7779 goto onError; 7780 } 7781 return _PyUnicodeWriter_Finish(&writer); 7782 7783 onError: 7784 _PyUnicodeWriter_Dealloc(&writer); 7785 return NULL; 7786} 7787 7788/* Charmap encoding: the lookup table */ 7789 7790struct encoding_map { 7791 PyObject_HEAD 7792 unsigned char level1[32]; 7793 int count2, count3; 7794 unsigned char level23[1]; 7795}; 7796 7797static PyObject* 7798encoding_map_size(PyObject *obj, PyObject* args) 7799{ 7800 struct encoding_map *map = (struct encoding_map*)obj; 7801 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 + 7802 128*map->count3); 7803} 7804 7805static PyMethodDef encoding_map_methods[] = { 7806 {"size", encoding_map_size, METH_NOARGS, 7807 PyDoc_STR("Return the size (in bytes) of this object") }, 7808 { 0 } 7809}; 7810 7811static void 7812encoding_map_dealloc(PyObject* o) 7813{ 7814 PyObject_FREE(o); 7815} 7816 7817static PyTypeObject EncodingMapType = { 7818 PyVarObject_HEAD_INIT(NULL, 0) 7819 "EncodingMap", /*tp_name*/ 7820 sizeof(struct encoding_map), /*tp_basicsize*/ 7821 0, /*tp_itemsize*/ 7822 /* methods */ 7823 encoding_map_dealloc, /*tp_dealloc*/ 7824 0, /*tp_print*/ 7825 0, /*tp_getattr*/ 7826 0, /*tp_setattr*/ 7827 0, /*tp_reserved*/ 7828 0, /*tp_repr*/ 7829 0, /*tp_as_number*/ 7830 0, /*tp_as_sequence*/ 7831 0, /*tp_as_mapping*/ 7832 0, /*tp_hash*/ 7833 0, /*tp_call*/ 7834 0, /*tp_str*/ 7835 0, /*tp_getattro*/ 7836 0, /*tp_setattro*/ 7837 0, /*tp_as_buffer*/ 7838 Py_TPFLAGS_DEFAULT, /*tp_flags*/ 7839 0, /*tp_doc*/ 
7840 0, /*tp_traverse*/ 7841 0, /*tp_clear*/ 7842 0, /*tp_richcompare*/ 7843 0, /*tp_weaklistoffset*/ 7844 0, /*tp_iter*/ 7845 0, /*tp_iternext*/ 7846 encoding_map_methods, /*tp_methods*/ 7847 0, /*tp_members*/ 7848 0, /*tp_getset*/ 7849 0, /*tp_base*/ 7850 0, /*tp_dict*/ 7851 0, /*tp_descr_get*/ 7852 0, /*tp_descr_set*/ 7853 0, /*tp_dictoffset*/ 7854 0, /*tp_init*/ 7855 0, /*tp_alloc*/ 7856 0, /*tp_new*/ 7857 0, /*tp_free*/ 7858 0, /*tp_is_gc*/ 7859}; 7860 7861PyObject* 7862PyUnicode_BuildEncodingMap(PyObject* string) 7863{ 7864 PyObject *result; 7865 struct encoding_map *mresult; 7866 int i; 7867 int need_dict = 0; 7868 unsigned char level1[32]; 7869 unsigned char level2[512]; 7870 unsigned char *mlevel1, *mlevel2, *mlevel3; 7871 int count2 = 0, count3 = 0; 7872 int kind; 7873 void *data; 7874 Py_ssize_t length; 7875 Py_UCS4 ch; 7876 7877 if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) { 7878 PyErr_BadArgument(); 7879 return NULL; 7880 } 7881 kind = PyUnicode_KIND(string); 7882 data = PyUnicode_DATA(string); 7883 length = PyUnicode_GET_LENGTH(string); 7884 length = Py_MIN(length, 256); 7885 memset(level1, 0xFF, sizeof level1); 7886 memset(level2, 0xFF, sizeof level2); 7887 7888 /* If there isn't a one-to-one mapping of NULL to \0, 7889 or if there are non-BMP characters, we need to use 7890 a mapping dictionary. 
*/ 7891 if (PyUnicode_READ(kind, data, 0) != 0) 7892 need_dict = 1; 7893 for (i = 1; i < length; i++) { 7894 int l1, l2; 7895 ch = PyUnicode_READ(kind, data, i); 7896 if (ch == 0 || ch > 0xFFFF) { 7897 need_dict = 1; 7898 break; 7899 } 7900 if (ch == 0xFFFE) 7901 /* unmapped character */ 7902 continue; 7903 l1 = ch >> 11; 7904 l2 = ch >> 7; 7905 if (level1[l1] == 0xFF) 7906 level1[l1] = count2++; 7907 if (level2[l2] == 0xFF) 7908 level2[l2] = count3++; 7909 } 7910 7911 if (count2 >= 0xFF || count3 >= 0xFF) 7912 need_dict = 1; 7913 7914 if (need_dict) { 7915 PyObject *result = PyDict_New(); 7916 PyObject *key, *value; 7917 if (!result) 7918 return NULL; 7919 for (i = 0; i < length; i++) { 7920 key = PyLong_FromLong(PyUnicode_READ(kind, data, i)); 7921 value = PyLong_FromLong(i); 7922 if (!key || !value) 7923 goto failed1; 7924 if (PyDict_SetItem(result, key, value) == -1) 7925 goto failed1; 7926 Py_DECREF(key); 7927 Py_DECREF(value); 7928 } 7929 return result; 7930 failed1: 7931 Py_XDECREF(key); 7932 Py_XDECREF(value); 7933 Py_DECREF(result); 7934 return NULL; 7935 } 7936 7937 /* Create a three-level trie */ 7938 result = PyObject_MALLOC(sizeof(struct encoding_map) + 7939 16*count2 + 128*count3 - 1); 7940 if (!result) 7941 return PyErr_NoMemory(); 7942 PyObject_Init(result, &EncodingMapType); 7943 mresult = (struct encoding_map*)result; 7944 mresult->count2 = count2; 7945 mresult->count3 = count3; 7946 mlevel1 = mresult->level1; 7947 mlevel2 = mresult->level23; 7948 mlevel3 = mresult->level23 + 16*count2; 7949 memcpy(mlevel1, level1, 32); 7950 memset(mlevel2, 0xFF, 16*count2); 7951 memset(mlevel3, 0, 128*count3); 7952 count3 = 0; 7953 for (i = 1; i < length; i++) { 7954 int o1, o2, o3, i2, i3; 7955 Py_UCS4 ch = PyUnicode_READ(kind, data, i); 7956 if (ch == 0xFFFE) 7957 /* unmapped character */ 7958 continue; 7959 o1 = ch>>11; 7960 o2 = (ch>>7) & 0xF; 7961 i2 = 16*mlevel1[o1] + o2; 7962 if (mlevel2[i2] == 0xFF) 7963 mlevel2[i2] = count3++; 7964 o3 = ch & 0x7F; 7965 
i3 = 128*mlevel2[i2] + o3; 7966 mlevel3[i3] = i; 7967 } 7968 return result; 7969} 7970 7971static int 7972encoding_map_lookup(Py_UCS4 c, PyObject *mapping) 7973{ 7974 struct encoding_map *map = (struct encoding_map*)mapping; 7975 int l1 = c>>11; 7976 int l2 = (c>>7) & 0xF; 7977 int l3 = c & 0x7F; 7978 int i; 7979 7980 if (c > 0xFFFF) 7981 return -1; 7982 if (c == 0) 7983 return 0; 7984 /* level 1*/ 7985 i = map->level1[l1]; 7986 if (i == 0xFF) { 7987 return -1; 7988 } 7989 /* level 2*/ 7990 i = map->level23[16*i+l2]; 7991 if (i == 0xFF) { 7992 return -1; 7993 } 7994 /* level 3 */ 7995 i = map->level23[16*map->count2 + 128*i + l3]; 7996 if (i == 0) { 7997 return -1; 7998 } 7999 return i; 8000} 8001 8002/* Lookup the character ch in the mapping. If the character 8003 can't be found, Py_None is returned (or NULL, if another 8004 error occurred). */ 8005static PyObject * 8006charmapencode_lookup(Py_UCS4 c, PyObject *mapping) 8007{ 8008 PyObject *w = PyLong_FromLong((long)c); 8009 PyObject *x; 8010 8011 if (w == NULL) 8012 return NULL; 8013 x = PyObject_GetItem(mapping, w); 8014 Py_DECREF(w); 8015 if (x == NULL) { 8016 if (PyErr_ExceptionMatches(PyExc_LookupError)) { 8017 /* No mapping found means: mapping is undefined. 
*/ 8018 PyErr_Clear(); 8019 x = Py_None; 8020 Py_INCREF(x); 8021 return x; 8022 } else 8023 return NULL; 8024 } 8025 else if (x == Py_None) 8026 return x; 8027 else if (PyLong_Check(x)) { 8028 long value = PyLong_AS_LONG(x); 8029 if (value < 0 || value > 255) { 8030 PyErr_SetString(PyExc_TypeError, 8031 "character mapping must be in range(256)"); 8032 Py_DECREF(x); 8033 return NULL; 8034 } 8035 return x; 8036 } 8037 else if (PyBytes_Check(x)) 8038 return x; 8039 else { 8040 /* wrong return value */ 8041 PyErr_Format(PyExc_TypeError, 8042 "character mapping must return integer, bytes or None, not %.400s", 8043 x->ob_type->tp_name); 8044 Py_DECREF(x); 8045 return NULL; 8046 } 8047} 8048 8049static int 8050charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize) 8051{ 8052 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj); 8053 /* exponentially overallocate to minimize reallocations */ 8054 if (requiredsize < 2*outsize) 8055 requiredsize = 2*outsize; 8056 if (_PyBytes_Resize(outobj, requiredsize)) 8057 return -1; 8058 return 0; 8059} 8060 8061typedef enum charmapencode_result { 8062 enc_SUCCESS, enc_FAILED, enc_EXCEPTION 8063} charmapencode_result; 8064/* lookup the character, put the result in the output string and adjust 8065 various state variables. Resize the output bytes object if not enough 8066 space is available. Return a new reference to the object that 8067 was put in the output buffer, or Py_None, if the mapping was undefined 8068 (in which case no character was written) or NULL, if a 8069 reallocation error occurred. 
The caller must decref the result */ 8070static charmapencode_result 8071charmapencode_output(Py_UCS4 c, PyObject *mapping, 8072 PyObject **outobj, Py_ssize_t *outpos) 8073{ 8074 PyObject *rep; 8075 char *outstart; 8076 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj); 8077 8078 if (Py_TYPE(mapping) == &EncodingMapType) { 8079 int res = encoding_map_lookup(c, mapping); 8080 Py_ssize_t requiredsize = *outpos+1; 8081 if (res == -1) 8082 return enc_FAILED; 8083 if (outsize<requiredsize) 8084 if (charmapencode_resize(outobj, outpos, requiredsize)) 8085 return enc_EXCEPTION; 8086 outstart = PyBytes_AS_STRING(*outobj); 8087 outstart[(*outpos)++] = (char)res; 8088 return enc_SUCCESS; 8089 } 8090 8091 rep = charmapencode_lookup(c, mapping); 8092 if (rep==NULL) 8093 return enc_EXCEPTION; 8094 else if (rep==Py_None) { 8095 Py_DECREF(rep); 8096 return enc_FAILED; 8097 } else { 8098 if (PyLong_Check(rep)) { 8099 Py_ssize_t requiredsize = *outpos+1; 8100 if (outsize<requiredsize) 8101 if (charmapencode_resize(outobj, outpos, requiredsize)) { 8102 Py_DECREF(rep); 8103 return enc_EXCEPTION; 8104 } 8105 outstart = PyBytes_AS_STRING(*outobj); 8106 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep); 8107 } 8108 else { 8109 const char *repchars = PyBytes_AS_STRING(rep); 8110 Py_ssize_t repsize = PyBytes_GET_SIZE(rep); 8111 Py_ssize_t requiredsize = *outpos+repsize; 8112 if (outsize<requiredsize) 8113 if (charmapencode_resize(outobj, outpos, requiredsize)) { 8114 Py_DECREF(rep); 8115 return enc_EXCEPTION; 8116 } 8117 outstart = PyBytes_AS_STRING(*outobj); 8118 memcpy(outstart + *outpos, repchars, repsize); 8119 *outpos += repsize; 8120 } 8121 } 8122 Py_DECREF(rep); 8123 return enc_SUCCESS; 8124} 8125 8126/* handle an error in PyUnicode_EncodeCharmap 8127 Return 0 on success, -1 on error */ 8128static int 8129charmap_encoding_error( 8130 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping, 8131 PyObject **exceptionObject, 8132 _Py_error_handler *error_handler, PyObject 
**error_handler_obj, const char *errors, 8133 PyObject **res, Py_ssize_t *respos) 8134{ 8135 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */ 8136 Py_ssize_t size, repsize; 8137 Py_ssize_t newpos; 8138 enum PyUnicode_Kind kind; 8139 void *data; 8140 Py_ssize_t index; 8141 /* startpos for collecting unencodable chars */ 8142 Py_ssize_t collstartpos = *inpos; 8143 Py_ssize_t collendpos = *inpos+1; 8144 Py_ssize_t collpos; 8145 char *encoding = "charmap"; 8146 char *reason = "character maps to <undefined>"; 8147 charmapencode_result x; 8148 Py_UCS4 ch; 8149 int val; 8150 8151 if (PyUnicode_READY(unicode) == -1) 8152 return -1; 8153 size = PyUnicode_GET_LENGTH(unicode); 8154 /* find all unencodable characters */ 8155 while (collendpos < size) { 8156 PyObject *rep; 8157 if (Py_TYPE(mapping) == &EncodingMapType) { 8158 ch = PyUnicode_READ_CHAR(unicode, collendpos); 8159 val = encoding_map_lookup(ch, mapping); 8160 if (val != -1) 8161 break; 8162 ++collendpos; 8163 continue; 8164 } 8165 8166 ch = PyUnicode_READ_CHAR(unicode, collendpos); 8167 rep = charmapencode_lookup(ch, mapping); 8168 if (rep==NULL) 8169 return -1; 8170 else if (rep!=Py_None) { 8171 Py_DECREF(rep); 8172 break; 8173 } 8174 Py_DECREF(rep); 8175 ++collendpos; 8176 } 8177 /* cache callback name lookup 8178 * (if not done yet, i.e. 
it's the first error) */ 8179 if (*error_handler == _Py_ERROR_UNKNOWN) 8180 *error_handler = get_error_handler(errors); 8181 8182 switch (*error_handler) { 8183 case _Py_ERROR_STRICT: 8184 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason); 8185 return -1; 8186 8187 case _Py_ERROR_REPLACE: 8188 for (collpos = collstartpos; collpos<collendpos; ++collpos) { 8189 x = charmapencode_output('?', mapping, res, respos); 8190 if (x==enc_EXCEPTION) { 8191 return -1; 8192 } 8193 else if (x==enc_FAILED) { 8194 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason); 8195 return -1; 8196 } 8197 } 8198 /* fall through */ 8199 case _Py_ERROR_IGNORE: 8200 *inpos = collendpos; 8201 break; 8202 8203 case _Py_ERROR_XMLCHARREFREPLACE: 8204 /* generate replacement (temporarily (mis)uses p) */ 8205 for (collpos = collstartpos; collpos < collendpos; ++collpos) { 8206 char buffer[2+29+1+1]; 8207 char *cp; 8208 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos)); 8209 for (cp = buffer; *cp; ++cp) { 8210 x = charmapencode_output(*cp, mapping, res, respos); 8211 if (x==enc_EXCEPTION) 8212 return -1; 8213 else if (x==enc_FAILED) { 8214 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason); 8215 return -1; 8216 } 8217 } 8218 } 8219 *inpos = collendpos; 8220 break; 8221 8222 default: 8223 repunicode = unicode_encode_call_errorhandler(errors, error_handler_obj, 8224 encoding, reason, unicode, exceptionObject, 8225 collstartpos, collendpos, &newpos); 8226 if (repunicode == NULL) 8227 return -1; 8228 if (PyBytes_Check(repunicode)) { 8229 /* Directly copy bytes result to output. */ 8230 Py_ssize_t outsize = PyBytes_Size(*res); 8231 Py_ssize_t requiredsize; 8232 repsize = PyBytes_Size(repunicode); 8233 requiredsize = *respos + repsize; 8234 if (requiredsize > outsize) 8235 /* Make room for all additional bytes. 
*/ 8236 if (charmapencode_resize(res, respos, requiredsize)) { 8237 Py_DECREF(repunicode); 8238 return -1; 8239 } 8240 memcpy(PyBytes_AsString(*res) + *respos, 8241 PyBytes_AsString(repunicode), repsize); 8242 *respos += repsize; 8243 *inpos = newpos; 8244 Py_DECREF(repunicode); 8245 break; 8246 } 8247 /* generate replacement */ 8248 if (PyUnicode_READY(repunicode) == -1) { 8249 Py_DECREF(repunicode); 8250 return -1; 8251 } 8252 repsize = PyUnicode_GET_LENGTH(repunicode); 8253 data = PyUnicode_DATA(repunicode); 8254 kind = PyUnicode_KIND(repunicode); 8255 for (index = 0; index < repsize; index++) { 8256 Py_UCS4 repch = PyUnicode_READ(kind, data, index); 8257 x = charmapencode_output(repch, mapping, res, respos); 8258 if (x==enc_EXCEPTION) { 8259 Py_DECREF(repunicode); 8260 return -1; 8261 } 8262 else if (x==enc_FAILED) { 8263 Py_DECREF(repunicode); 8264 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason); 8265 return -1; 8266 } 8267 } 8268 *inpos = newpos; 8269 Py_DECREF(repunicode); 8270 } 8271 return 0; 8272} 8273 8274PyObject * 8275_PyUnicode_EncodeCharmap(PyObject *unicode, 8276 PyObject *mapping, 8277 const char *errors) 8278{ 8279 /* output object */ 8280 PyObject *res = NULL; 8281 /* current input position */ 8282 Py_ssize_t inpos = 0; 8283 Py_ssize_t size; 8284 /* current output position */ 8285 Py_ssize_t respos = 0; 8286 PyObject *error_handler_obj = NULL; 8287 PyObject *exc = NULL; 8288 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN; 8289 void *data; 8290 int kind; 8291 8292 if (PyUnicode_READY(unicode) == -1) 8293 return NULL; 8294 size = PyUnicode_GET_LENGTH(unicode); 8295 data = PyUnicode_DATA(unicode); 8296 kind = PyUnicode_KIND(unicode); 8297 8298 /* Default to Latin-1 */ 8299 if (mapping == NULL) 8300 return unicode_encode_ucs1(unicode, errors, 256); 8301 8302 /* allocate enough for a simple encoding without 8303 replacements, if we need more, we'll resize */ 8304 res = PyBytes_FromStringAndSize(NULL, 
size); 8305 if (res == NULL) 8306 goto onError; 8307 if (size == 0) 8308 return res; 8309 8310 while (inpos<size) { 8311 Py_UCS4 ch = PyUnicode_READ(kind, data, inpos); 8312 /* try to encode it */ 8313 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos); 8314 if (x==enc_EXCEPTION) /* error */ 8315 goto onError; 8316 if (x==enc_FAILED) { /* unencodable character */ 8317 if (charmap_encoding_error(unicode, &inpos, mapping, 8318 &exc, 8319 &error_handler, &error_handler_obj, errors, 8320 &res, &respos)) { 8321 goto onError; 8322 } 8323 } 8324 else 8325 /* done with this character => adjust input position */ 8326 ++inpos; 8327 } 8328 8329 /* Resize if we allocated to much */ 8330 if (respos<PyBytes_GET_SIZE(res)) 8331 if (_PyBytes_Resize(&res, respos) < 0) 8332 goto onError; 8333 8334 Py_XDECREF(exc); 8335 Py_XDECREF(error_handler_obj); 8336 return res; 8337 8338 onError: 8339 Py_XDECREF(res); 8340 Py_XDECREF(exc); 8341 Py_XDECREF(error_handler_obj); 8342 return NULL; 8343} 8344 8345/* Deprecated */ 8346PyObject * 8347PyUnicode_EncodeCharmap(const Py_UNICODE *p, 8348 Py_ssize_t size, 8349 PyObject *mapping, 8350 const char *errors) 8351{ 8352 PyObject *result; 8353 PyObject *unicode = PyUnicode_FromUnicode(p, size); 8354 if (unicode == NULL) 8355 return NULL; 8356 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors); 8357 Py_DECREF(unicode); 8358 return result; 8359} 8360 8361PyObject * 8362PyUnicode_AsCharmapString(PyObject *unicode, 8363 PyObject *mapping) 8364{ 8365 if (!PyUnicode_Check(unicode) || mapping == NULL) { 8366 PyErr_BadArgument(); 8367 return NULL; 8368 } 8369 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL); 8370} 8371 8372/* create or adjust a UnicodeTranslateError */ 8373static void 8374make_translate_exception(PyObject **exceptionObject, 8375 PyObject *unicode, 8376 Py_ssize_t startpos, Py_ssize_t endpos, 8377 const char *reason) 8378{ 8379 if (*exceptionObject == NULL) { 8380 *exceptionObject = 
_PyUnicodeTranslateError_Create( 8381 unicode, startpos, endpos, reason); 8382 } 8383 else { 8384 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos)) 8385 goto onError; 8386 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos)) 8387 goto onError; 8388 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason)) 8389 goto onError; 8390 return; 8391 onError: 8392 Py_CLEAR(*exceptionObject); 8393 } 8394} 8395 8396/* error handling callback helper: 8397 build arguments, call the callback and check the arguments, 8398 put the result into newpos and return the replacement string, which 8399 has to be freed by the caller */ 8400static PyObject * 8401unicode_translate_call_errorhandler(const char *errors, 8402 PyObject **errorHandler, 8403 const char *reason, 8404 PyObject *unicode, PyObject **exceptionObject, 8405 Py_ssize_t startpos, Py_ssize_t endpos, 8406 Py_ssize_t *newpos) 8407{ 8408 static char *argparse = "O!n;translating error handler must return (str, int) tuple"; 8409 8410 Py_ssize_t i_newpos; 8411 PyObject *restuple; 8412 PyObject *resunicode; 8413 8414 if (*errorHandler == NULL) { 8415 *errorHandler = PyCodec_LookupError(errors); 8416 if (*errorHandler == NULL) 8417 return NULL; 8418 } 8419 8420 make_translate_exception(exceptionObject, 8421 unicode, startpos, endpos, reason); 8422 if (*exceptionObject == NULL) 8423 return NULL; 8424 8425 restuple = PyObject_CallFunctionObjArgs( 8426 *errorHandler, *exceptionObject, NULL); 8427 if (restuple == NULL) 8428 return NULL; 8429 if (!PyTuple_Check(restuple)) { 8430 PyErr_SetString(PyExc_TypeError, &argparse[4]); 8431 Py_DECREF(restuple); 8432 return NULL; 8433 } 8434 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, 8435 &resunicode, &i_newpos)) { 8436 Py_DECREF(restuple); 8437 return NULL; 8438 } 8439 if (i_newpos<0) 8440 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos; 8441 else 8442 *newpos = i_newpos; 8443 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) { 8444 
PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos); 8445 Py_DECREF(restuple); 8446 return NULL; 8447 } 8448 Py_INCREF(resunicode); 8449 Py_DECREF(restuple); 8450 return resunicode; 8451} 8452 8453/* Lookup the character ch in the mapping and put the result in result, 8454 which must be decrefed by the caller. 8455 Return 0 on success, -1 on error */ 8456static int 8457charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result) 8458{ 8459 PyObject *w = PyLong_FromLong((long)c); 8460 PyObject *x; 8461 8462 if (w == NULL) 8463 return -1; 8464 x = PyObject_GetItem(mapping, w); 8465 Py_DECREF(w); 8466 if (x == NULL) { 8467 if (PyErr_ExceptionMatches(PyExc_LookupError)) { 8468 /* No mapping found means: use 1:1 mapping. */ 8469 PyErr_Clear(); 8470 *result = NULL; 8471 return 0; 8472 } else 8473 return -1; 8474 } 8475 else if (x == Py_None) { 8476 *result = x; 8477 return 0; 8478 } 8479 else if (PyLong_Check(x)) { 8480 long value = PyLong_AS_LONG(x); 8481 if (value < 0 || value > MAX_UNICODE) { 8482 PyErr_Format(PyExc_ValueError, 8483 "character mapping must be in range(0x%x)", 8484 MAX_UNICODE+1); 8485 Py_DECREF(x); 8486 return -1; 8487 } 8488 *result = x; 8489 return 0; 8490 } 8491 else if (PyUnicode_Check(x)) { 8492 *result = x; 8493 return 0; 8494 } 8495 else { 8496 /* wrong return value */ 8497 PyErr_SetString(PyExc_TypeError, 8498 "character mapping must return integer, None or str"); 8499 Py_DECREF(x); 8500 return -1; 8501 } 8502} 8503 8504/* lookup the character, write the result into the writer. 8505 Return 1 if the result was written into the writer, return 0 if the mapping 8506 was undefined, raise an exception return -1 on error. 
*/ 8507static int 8508charmaptranslate_output(Py_UCS4 ch, PyObject *mapping, 8509 _PyUnicodeWriter *writer) 8510{ 8511 PyObject *item; 8512 8513 if (charmaptranslate_lookup(ch, mapping, &item)) 8514 return -1; 8515 8516 if (item == NULL) { 8517 /* not found => default to 1:1 mapping */ 8518 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) { 8519 return -1; 8520 } 8521 return 1; 8522 } 8523 8524 if (item == Py_None) { 8525 Py_DECREF(item); 8526 return 0; 8527 } 8528 8529 if (PyLong_Check(item)) { 8530 long ch = (Py_UCS4)PyLong_AS_LONG(item); 8531 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already 8532 used it */ 8533 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) { 8534 Py_DECREF(item); 8535 return -1; 8536 } 8537 Py_DECREF(item); 8538 return 1; 8539 } 8540 8541 if (!PyUnicode_Check(item)) { 8542 Py_DECREF(item); 8543 return -1; 8544 } 8545 8546 if (_PyUnicodeWriter_WriteStr(writer, item) < 0) { 8547 Py_DECREF(item); 8548 return -1; 8549 } 8550 8551 Py_DECREF(item); 8552 return 1; 8553} 8554 8555static int 8556unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch, 8557 Py_UCS1 *translate) 8558{ 8559 PyObject *item = NULL; 8560 int ret = 0; 8561 8562 if (charmaptranslate_lookup(ch, mapping, &item)) { 8563 return -1; 8564 } 8565 8566 if (item == Py_None) { 8567 /* deletion */ 8568 translate[ch] = 0xfe; 8569 } 8570 else if (item == NULL) { 8571 /* not found => default to 1:1 mapping */ 8572 translate[ch] = ch; 8573 return 1; 8574 } 8575 else if (PyLong_Check(item)) { 8576 long replace = PyLong_AS_LONG(item); 8577 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already 8578 used it */ 8579 if (127 < replace) { 8580 /* invalid character or character outside ASCII: 8581 skip the fast translate */ 8582 goto exit; 8583 } 8584 translate[ch] = (Py_UCS1)replace; 8585 } 8586 else if (PyUnicode_Check(item)) { 8587 Py_UCS4 replace; 8588 8589 if (PyUnicode_READY(item) == -1) { 8590 Py_DECREF(item); 8591 return -1; 8592 } 8593 if 
(PyUnicode_GET_LENGTH(item) != 1) 8594 goto exit; 8595 8596 replace = PyUnicode_READ_CHAR(item, 0); 8597 if (replace > 127) 8598 goto exit; 8599 translate[ch] = (Py_UCS1)replace; 8600 } 8601 else { 8602 /* not None, NULL, long or unicode */ 8603 goto exit; 8604 } 8605 ret = 1; 8606 8607 exit: 8608 Py_DECREF(item); 8609 return ret; 8610} 8611 8612/* Fast path for ascii => ascii translation. Return 1 if the whole string 8613 was translated into writer, return 0 if the input string was partially 8614 translated into writer, raise an exception and return -1 on error. */ 8615static int 8616unicode_fast_translate(PyObject *input, PyObject *mapping, 8617 _PyUnicodeWriter *writer, int ignore) 8618{ 8619 Py_UCS1 ascii_table[128], ch, ch2; 8620 Py_ssize_t len; 8621 Py_UCS1 *in, *end, *out; 8622 int res = 0; 8623 8624 if (PyUnicode_READY(input) == -1) 8625 return -1; 8626 if (!PyUnicode_IS_ASCII(input)) 8627 return 0; 8628 len = PyUnicode_GET_LENGTH(input); 8629 8630 memset(ascii_table, 0xff, 128); 8631 8632 in = PyUnicode_1BYTE_DATA(input); 8633 end = in + len; 8634 8635 assert(PyUnicode_IS_ASCII(writer->buffer)); 8636 assert(PyUnicode_GET_LENGTH(writer->buffer) == len); 8637 out = PyUnicode_1BYTE_DATA(writer->buffer); 8638 8639 for (; in < end; in++) { 8640 ch = *in; 8641 ch2 = ascii_table[ch]; 8642 if (ch2 == 0xff) { 8643 int translate = unicode_fast_translate_lookup(mapping, ch, 8644 ascii_table); 8645 if (translate < 0) 8646 return -1; 8647 if (translate == 0) 8648 goto exit; 8649 ch2 = ascii_table[ch]; 8650 } 8651 if (ch2 == 0xfe) { 8652 if (ignore) 8653 continue; 8654 goto exit; 8655 } 8656 assert(ch2 < 128); 8657 *out = ch2; 8658 out++; 8659 } 8660 res = 1; 8661 8662exit: 8663 writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer); 8664 return res; 8665} 8666 8667PyObject * 8668_PyUnicode_TranslateCharmap(PyObject *input, 8669 PyObject *mapping, 8670 const char *errors) 8671{ 8672 /* input object */ 8673 char *data; 8674 Py_ssize_t size, i; 8675 int kind; 8676 /* 
output buffer */ 8677 _PyUnicodeWriter writer; 8678 /* error handler */ 8679 char *reason = "character maps to <undefined>"; 8680 PyObject *errorHandler = NULL; 8681 PyObject *exc = NULL; 8682 int ignore; 8683 int res; 8684 8685 if (mapping == NULL) { 8686 PyErr_BadArgument(); 8687 return NULL; 8688 } 8689 8690 if (PyUnicode_READY(input) == -1) 8691 return NULL; 8692 data = (char*)PyUnicode_DATA(input); 8693 kind = PyUnicode_KIND(input); 8694 size = PyUnicode_GET_LENGTH(input); 8695 8696 if (size == 0) { 8697 Py_INCREF(input); 8698 return input; 8699 } 8700 8701 /* allocate enough for a simple 1:1 translation without 8702 replacements, if we need more, we'll resize */ 8703 _PyUnicodeWriter_Init(&writer); 8704 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1) 8705 goto onError; 8706 8707 ignore = (errors != NULL && strcmp(errors, "ignore") == 0); 8708 8709 res = unicode_fast_translate(input, mapping, &writer, ignore); 8710 if (res < 0) { 8711 _PyUnicodeWriter_Dealloc(&writer); 8712 return NULL; 8713 } 8714 if (res == 1) 8715 return _PyUnicodeWriter_Finish(&writer); 8716 8717 i = writer.pos; 8718 while (i<size) { 8719 /* try to encode it */ 8720 int translate; 8721 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */ 8722 Py_ssize_t newpos; 8723 /* startpos for collecting untranslatable chars */ 8724 Py_ssize_t collstart; 8725 Py_ssize_t collend; 8726 Py_UCS4 ch; 8727 8728 ch = PyUnicode_READ(kind, data, i); 8729 translate = charmaptranslate_output(ch, mapping, &writer); 8730 if (translate < 0) 8731 goto onError; 8732 8733 if (translate != 0) { 8734 /* it worked => adjust input pointer */ 8735 ++i; 8736 continue; 8737 } 8738 8739 /* untranslatable character */ 8740 collstart = i; 8741 collend = i+1; 8742 8743 /* find all untranslatable characters */ 8744 while (collend < size) { 8745 PyObject *x; 8746 ch = PyUnicode_READ(kind, data, collend); 8747 if (charmaptranslate_lookup(ch, mapping, &x)) 8748 goto onError; 8749 Py_XDECREF(x); 8750 if (x != 
Py_None) 8751 break; 8752 ++collend; 8753 } 8754 8755 if (ignore) { 8756 i = collend; 8757 } 8758 else { 8759 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler, 8760 reason, input, &exc, 8761 collstart, collend, &newpos); 8762 if (repunicode == NULL) 8763 goto onError; 8764 if (_PyUnicodeWriter_WriteStr(&writer, repunicode) < 0) { 8765 Py_DECREF(repunicode); 8766 goto onError; 8767 } 8768 Py_DECREF(repunicode); 8769 i = newpos; 8770 } 8771 } 8772 Py_XDECREF(exc); 8773 Py_XDECREF(errorHandler); 8774 return _PyUnicodeWriter_Finish(&writer); 8775 8776 onError: 8777 _PyUnicodeWriter_Dealloc(&writer); 8778 Py_XDECREF(exc); 8779 Py_XDECREF(errorHandler); 8780 return NULL; 8781} 8782 8783/* Deprecated. Use PyUnicode_Translate instead. */ 8784PyObject * 8785PyUnicode_TranslateCharmap(const Py_UNICODE *p, 8786 Py_ssize_t size, 8787 PyObject *mapping, 8788 const char *errors) 8789{ 8790 PyObject *result; 8791 PyObject *unicode = PyUnicode_FromUnicode(p, size); 8792 if (!unicode) 8793 return NULL; 8794 result = _PyUnicode_TranslateCharmap(unicode, mapping, errors); 8795 Py_DECREF(unicode); 8796 return result; 8797} 8798 8799PyObject * 8800PyUnicode_Translate(PyObject *str, 8801 PyObject *mapping, 8802 const char *errors) 8803{ 8804 PyObject *result; 8805 8806 str = PyUnicode_FromObject(str); 8807 if (str == NULL) 8808 return NULL; 8809 result = _PyUnicode_TranslateCharmap(str, mapping, errors); 8810 Py_DECREF(str); 8811 return result; 8812} 8813 8814static Py_UCS4 8815fix_decimal_and_space_to_ascii(PyObject *self) 8816{ 8817 /* No need to call PyUnicode_READY(self) because this function is only 8818 called as a callback from fixup() which does it already. 
*/ 8819 const Py_ssize_t len = PyUnicode_GET_LENGTH(self); 8820 const int kind = PyUnicode_KIND(self); 8821 void *data = PyUnicode_DATA(self); 8822 Py_UCS4 maxchar = 127, ch, fixed; 8823 int modified = 0; 8824 Py_ssize_t i; 8825 8826 for (i = 0; i < len; ++i) { 8827 ch = PyUnicode_READ(kind, data, i); 8828 fixed = 0; 8829 if (ch > 127) { 8830 if (Py_UNICODE_ISSPACE(ch)) 8831 fixed = ' '; 8832 else { 8833 const int decimal = Py_UNICODE_TODECIMAL(ch); 8834 if (decimal >= 0) 8835 fixed = '0' + decimal; 8836 } 8837 if (fixed != 0) { 8838 modified = 1; 8839 maxchar = Py_MAX(maxchar, fixed); 8840 PyUnicode_WRITE(kind, data, i, fixed); 8841 } 8842 else 8843 maxchar = Py_MAX(maxchar, ch); 8844 } 8845 } 8846 8847 return (modified) ? maxchar : 0; 8848} 8849 8850PyObject * 8851_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode) 8852{ 8853 if (!PyUnicode_Check(unicode)) { 8854 PyErr_BadInternalCall(); 8855 return NULL; 8856 } 8857 if (PyUnicode_READY(unicode) == -1) 8858 return NULL; 8859 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) { 8860 /* If the string is already ASCII, just return the same string */ 8861 Py_INCREF(unicode); 8862 return unicode; 8863 } 8864 return fixup(unicode, fix_decimal_and_space_to_ascii); 8865} 8866 8867PyObject * 8868PyUnicode_TransformDecimalToASCII(Py_UNICODE *s, 8869 Py_ssize_t length) 8870{ 8871 PyObject *decimal; 8872 Py_ssize_t i; 8873 Py_UCS4 maxchar; 8874 enum PyUnicode_Kind kind; 8875 void *data; 8876 8877 maxchar = 127; 8878 for (i = 0; i < length; i++) { 8879 Py_UCS4 ch = s[i]; 8880 if (ch > 127) { 8881 int decimal = Py_UNICODE_TODECIMAL(ch); 8882 if (decimal >= 0) 8883 ch = '0' + decimal; 8884 maxchar = Py_MAX(maxchar, ch); 8885 } 8886 } 8887 8888 /* Copy to a new string */ 8889 decimal = PyUnicode_New(length, maxchar); 8890 if (decimal == NULL) 8891 return decimal; 8892 kind = PyUnicode_KIND(decimal); 8893 data = PyUnicode_DATA(decimal); 8894 /* Iterate over code points */ 8895 for (i = 0; i < length; i++) { 8896 Py_UCS4 ch = 
s[i]; 8897 if (ch > 127) { 8898 int decimal = Py_UNICODE_TODECIMAL(ch); 8899 if (decimal >= 0) 8900 ch = '0' + decimal; 8901 } 8902 PyUnicode_WRITE(kind, data, i, ch); 8903 } 8904 return unicode_result(decimal); 8905} 8906/* --- Decimal Encoder ---------------------------------------------------- */ 8907 8908int 8909PyUnicode_EncodeDecimal(Py_UNICODE *s, 8910 Py_ssize_t length, 8911 char *output, 8912 const char *errors) 8913{ 8914 PyObject *unicode; 8915 Py_ssize_t i; 8916 enum PyUnicode_Kind kind; 8917 void *data; 8918 8919 if (output == NULL) { 8920 PyErr_BadArgument(); 8921 return -1; 8922 } 8923 8924 unicode = PyUnicode_FromUnicode(s, length); 8925 if (unicode == NULL) 8926 return -1; 8927 8928 if (PyUnicode_READY(unicode) == -1) { 8929 Py_DECREF(unicode); 8930 return -1; 8931 } 8932 kind = PyUnicode_KIND(unicode); 8933 data = PyUnicode_DATA(unicode); 8934 8935 for (i=0; i < length; ) { 8936 PyObject *exc; 8937 Py_UCS4 ch; 8938 int decimal; 8939 Py_ssize_t startpos; 8940 8941 ch = PyUnicode_READ(kind, data, i); 8942 8943 if (Py_UNICODE_ISSPACE(ch)) { 8944 *output++ = ' '; 8945 i++; 8946 continue; 8947 } 8948 decimal = Py_UNICODE_TODECIMAL(ch); 8949 if (decimal >= 0) { 8950 *output++ = '0' + decimal; 8951 i++; 8952 continue; 8953 } 8954 if (0 < ch && ch < 256) { 8955 *output++ = (char)ch; 8956 i++; 8957 continue; 8958 } 8959 8960 startpos = i; 8961 exc = NULL; 8962 raise_encode_exception(&exc, "decimal", unicode, 8963 startpos, startpos+1, 8964 "invalid decimal Unicode string"); 8965 Py_XDECREF(exc); 8966 Py_DECREF(unicode); 8967 return -1; 8968 } 8969 /* 0-terminate the output string */ 8970 *output++ = '\0'; 8971 Py_DECREF(unicode); 8972 return 0; 8973} 8974 8975/* --- Helpers ------------------------------------------------------------ */ 8976 8977/* helper macro to fixup start/end slice values */ 8978#define ADJUST_INDICES(start, end, len) \ 8979 if (end > len) \ 8980 end = len; \ 8981 else if (end < 0) { \ 8982 end += len; \ 8983 if (end < 0) \ 8984 end = 
0; \ 8985 } \ 8986 if (start < 0) { \ 8987 start += len; \ 8988 if (start < 0) \ 8989 start = 0; \ 8990 } 8991 8992static Py_ssize_t 8993any_find_slice(int direction, PyObject* s1, PyObject* s2, 8994 Py_ssize_t start, 8995 Py_ssize_t end) 8996{ 8997 int kind1, kind2; 8998 void *buf1, *buf2; 8999 Py_ssize_t len1, len2, result; 9000 9001 kind1 = PyUnicode_KIND(s1); 9002 kind2 = PyUnicode_KIND(s2); 9003 if (kind1 < kind2) 9004 return -1; 9005 9006 len1 = PyUnicode_GET_LENGTH(s1); 9007 len2 = PyUnicode_GET_LENGTH(s2); 9008 ADJUST_INDICES(start, end, len1); 9009 if (end - start < len2) 9010 return -1; 9011 9012 buf1 = PyUnicode_DATA(s1); 9013 buf2 = PyUnicode_DATA(s2); 9014 if (len2 == 1) { 9015 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0); 9016 result = findchar((const char *)buf1 + kind1*start, 9017 kind1, end - start, ch, direction); 9018 if (result == -1) 9019 return -1; 9020 else 9021 return start + result; 9022 } 9023 9024 if (kind2 != kind1) { 9025 buf2 = _PyUnicode_AsKind(s2, kind1); 9026 if (!buf2) 9027 return -2; 9028 } 9029 9030 if (direction > 0) { 9031 switch (kind1) { 9032 case PyUnicode_1BYTE_KIND: 9033 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2)) 9034 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end); 9035 else 9036 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end); 9037 break; 9038 case PyUnicode_2BYTE_KIND: 9039 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end); 9040 break; 9041 case PyUnicode_4BYTE_KIND: 9042 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end); 9043 break; 9044 default: 9045 assert(0); result = -2; 9046 } 9047 } 9048 else { 9049 switch (kind1) { 9050 case PyUnicode_1BYTE_KIND: 9051 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2)) 9052 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end); 9053 else 9054 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end); 9055 break; 9056 case PyUnicode_2BYTE_KIND: 9057 result = ucs2lib_rfind_slice(buf1, len1, 
buf2, len2, start, end); 9058 break; 9059 case PyUnicode_4BYTE_KIND: 9060 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end); 9061 break; 9062 default: 9063 assert(0); result = -2; 9064 } 9065 } 9066 9067 if (kind2 != kind1) 9068 PyMem_Free(buf2); 9069 9070 return result; 9071} 9072 9073Py_ssize_t 9074_PyUnicode_InsertThousandsGrouping( 9075 PyObject *unicode, Py_ssize_t index, 9076 Py_ssize_t n_buffer, 9077 void *digits, Py_ssize_t n_digits, 9078 Py_ssize_t min_width, 9079 const char *grouping, PyObject *thousands_sep, 9080 Py_UCS4 *maxchar) 9081{ 9082 unsigned int kind, thousands_sep_kind; 9083 char *data, *thousands_sep_data; 9084 Py_ssize_t thousands_sep_len; 9085 Py_ssize_t len; 9086 9087 if (unicode != NULL) { 9088 kind = PyUnicode_KIND(unicode); 9089 data = (char *) PyUnicode_DATA(unicode) + index * kind; 9090 } 9091 else { 9092 kind = PyUnicode_1BYTE_KIND; 9093 data = NULL; 9094 } 9095 thousands_sep_kind = PyUnicode_KIND(thousands_sep); 9096 thousands_sep_data = PyUnicode_DATA(thousands_sep); 9097 thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep); 9098 if (unicode != NULL && thousands_sep_kind != kind) { 9099 if (thousands_sep_kind < kind) { 9100 thousands_sep_data = _PyUnicode_AsKind(thousands_sep, kind); 9101 if (!thousands_sep_data) 9102 return -1; 9103 } 9104 else { 9105 data = _PyUnicode_AsKind(unicode, thousands_sep_kind); 9106 if (!data) 9107 return -1; 9108 } 9109 } 9110 9111 switch (kind) { 9112 case PyUnicode_1BYTE_KIND: 9113 if (unicode != NULL && PyUnicode_IS_ASCII(unicode)) 9114 len = asciilib_InsertThousandsGrouping( 9115 (Py_UCS1 *) data, n_buffer, (Py_UCS1 *) digits, n_digits, 9116 min_width, grouping, 9117 (Py_UCS1 *) thousands_sep_data, thousands_sep_len); 9118 else 9119 len = ucs1lib_InsertThousandsGrouping( 9120 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits, 9121 min_width, grouping, 9122 (Py_UCS1 *) thousands_sep_data, thousands_sep_len); 9123 break; 9124 case PyUnicode_2BYTE_KIND: 9125 len = 
ucs2lib_InsertThousandsGrouping( 9126 (Py_UCS2 *) data, n_buffer, (Py_UCS2 *) digits, n_digits, 9127 min_width, grouping, 9128 (Py_UCS2 *) thousands_sep_data, thousands_sep_len); 9129 break; 9130 case PyUnicode_4BYTE_KIND: 9131 len = ucs4lib_InsertThousandsGrouping( 9132 (Py_UCS4 *) data, n_buffer, (Py_UCS4 *) digits, n_digits, 9133 min_width, grouping, 9134 (Py_UCS4 *) thousands_sep_data, thousands_sep_len); 9135 break; 9136 default: 9137 assert(0); 9138 return -1; 9139 } 9140 if (unicode != NULL && thousands_sep_kind != kind) { 9141 if (thousands_sep_kind < kind) 9142 PyMem_Free(thousands_sep_data); 9143 else 9144 PyMem_Free(data); 9145 } 9146 if (unicode == NULL) { 9147 *maxchar = 127; 9148 if (len != n_digits) { 9149 *maxchar = Py_MAX(*maxchar, 9150 PyUnicode_MAX_CHAR_VALUE(thousands_sep)); 9151 } 9152 } 9153 return len; 9154} 9155 9156 9157Py_ssize_t 9158PyUnicode_Count(PyObject *str, 9159 PyObject *substr, 9160 Py_ssize_t start, 9161 Py_ssize_t end) 9162{ 9163 Py_ssize_t result; 9164 PyObject* str_obj; 9165 PyObject* sub_obj; 9166 int kind1, kind2; 9167 void *buf1 = NULL, *buf2 = NULL; 9168 Py_ssize_t len1, len2; 9169 9170 str_obj = PyUnicode_FromObject(str); 9171 if (!str_obj) 9172 return -1; 9173 sub_obj = PyUnicode_FromObject(substr); 9174 if (!sub_obj) { 9175 Py_DECREF(str_obj); 9176 return -1; 9177 } 9178 if (PyUnicode_READY(sub_obj) == -1 || PyUnicode_READY(str_obj) == -1) { 9179 Py_DECREF(sub_obj); 9180 Py_DECREF(str_obj); 9181 return -1; 9182 } 9183 9184 kind1 = PyUnicode_KIND(str_obj); 9185 kind2 = PyUnicode_KIND(sub_obj); 9186 if (kind1 < kind2) { 9187 Py_DECREF(sub_obj); 9188 Py_DECREF(str_obj); 9189 return 0; 9190 } 9191 9192 len1 = PyUnicode_GET_LENGTH(str_obj); 9193 len2 = PyUnicode_GET_LENGTH(sub_obj); 9194 ADJUST_INDICES(start, end, len1); 9195 if (end - start < len2) { 9196 Py_DECREF(sub_obj); 9197 Py_DECREF(str_obj); 9198 return 0; 9199 } 9200 9201 buf1 = PyUnicode_DATA(str_obj); 9202 buf2 = PyUnicode_DATA(sub_obj); 9203 if (kind2 != kind1) 
{ 9204 buf2 = _PyUnicode_AsKind(sub_obj, kind1); 9205 if (!buf2) 9206 goto onError; 9207 } 9208 9209 switch (kind1) { 9210 case PyUnicode_1BYTE_KIND: 9211 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sub_obj)) 9212 result = asciilib_count( 9213 ((Py_UCS1*)buf1) + start, end - start, 9214 buf2, len2, PY_SSIZE_T_MAX 9215 ); 9216 else 9217 result = ucs1lib_count( 9218 ((Py_UCS1*)buf1) + start, end - start, 9219 buf2, len2, PY_SSIZE_T_MAX 9220 ); 9221 break; 9222 case PyUnicode_2BYTE_KIND: 9223 result = ucs2lib_count( 9224 ((Py_UCS2*)buf1) + start, end - start, 9225 buf2, len2, PY_SSIZE_T_MAX 9226 ); 9227 break; 9228 case PyUnicode_4BYTE_KIND: 9229 result = ucs4lib_count( 9230 ((Py_UCS4*)buf1) + start, end - start, 9231 buf2, len2, PY_SSIZE_T_MAX 9232 ); 9233 break; 9234 default: 9235 assert(0); result = 0; 9236 } 9237 9238 Py_DECREF(sub_obj); 9239 Py_DECREF(str_obj); 9240 9241 if (kind2 != kind1) 9242 PyMem_Free(buf2); 9243 9244 return result; 9245 onError: 9246 Py_DECREF(sub_obj); 9247 Py_DECREF(str_obj); 9248 if (kind2 != kind1 && buf2) 9249 PyMem_Free(buf2); 9250 return -1; 9251} 9252 9253Py_ssize_t 9254PyUnicode_Find(PyObject *str, 9255 PyObject *sub, 9256 Py_ssize_t start, 9257 Py_ssize_t end, 9258 int direction) 9259{ 9260 Py_ssize_t result; 9261 9262 str = PyUnicode_FromObject(str); 9263 if (!str) 9264 return -2; 9265 sub = PyUnicode_FromObject(sub); 9266 if (!sub) { 9267 Py_DECREF(str); 9268 return -2; 9269 } 9270 if (PyUnicode_READY(sub) == -1 || PyUnicode_READY(str) == -1) { 9271 Py_DECREF(sub); 9272 Py_DECREF(str); 9273 return -2; 9274 } 9275 9276 result = any_find_slice(direction, 9277 str, sub, start, end 9278 ); 9279 9280 Py_DECREF(str); 9281 Py_DECREF(sub); 9282 9283 return result; 9284} 9285 9286Py_ssize_t 9287PyUnicode_FindChar(PyObject *str, Py_UCS4 ch, 9288 Py_ssize_t start, Py_ssize_t end, 9289 int direction) 9290{ 9291 int kind; 9292 Py_ssize_t result; 9293 if (PyUnicode_READY(str) == -1) 9294 return -2; 9295 if (start < 0 || end < 0) { 
9296 PyErr_SetString(PyExc_IndexError, "string index out of range"); 9297 return -2; 9298 } 9299 if (end > PyUnicode_GET_LENGTH(str)) 9300 end = PyUnicode_GET_LENGTH(str); 9301 if (start >= end) 9302 return -1; 9303 kind = PyUnicode_KIND(str); 9304 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start, 9305 kind, end-start, ch, direction); 9306 if (result == -1) 9307 return -1; 9308 else 9309 return start + result; 9310} 9311 9312static int 9313tailmatch(PyObject *self, 9314 PyObject *substring, 9315 Py_ssize_t start, 9316 Py_ssize_t end, 9317 int direction) 9318{ 9319 int kind_self; 9320 int kind_sub; 9321 void *data_self; 9322 void *data_sub; 9323 Py_ssize_t offset; 9324 Py_ssize_t i; 9325 Py_ssize_t end_sub; 9326 9327 if (PyUnicode_READY(self) == -1 || 9328 PyUnicode_READY(substring) == -1) 9329 return -1; 9330 9331 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self)); 9332 end -= PyUnicode_GET_LENGTH(substring); 9333 if (end < start) 9334 return 0; 9335 9336 if (PyUnicode_GET_LENGTH(substring) == 0) 9337 return 1; 9338 9339 kind_self = PyUnicode_KIND(self); 9340 data_self = PyUnicode_DATA(self); 9341 kind_sub = PyUnicode_KIND(substring); 9342 data_sub = PyUnicode_DATA(substring); 9343 end_sub = PyUnicode_GET_LENGTH(substring) - 1; 9344 9345 if (direction > 0) 9346 offset = end; 9347 else 9348 offset = start; 9349 9350 if (PyUnicode_READ(kind_self, data_self, offset) == 9351 PyUnicode_READ(kind_sub, data_sub, 0) && 9352 PyUnicode_READ(kind_self, data_self, offset + end_sub) == 9353 PyUnicode_READ(kind_sub, data_sub, end_sub)) { 9354 /* If both are of the same kind, memcmp is sufficient */ 9355 if (kind_self == kind_sub) { 9356 return ! 
memcmp((char *)data_self + 9357 (offset * PyUnicode_KIND(substring)), 9358 data_sub, 9359 PyUnicode_GET_LENGTH(substring) * 9360 PyUnicode_KIND(substring)); 9361 } 9362 /* otherwise we have to compare each character by first accesing it */ 9363 else { 9364 /* We do not need to compare 0 and len(substring)-1 because 9365 the if statement above ensured already that they are equal 9366 when we end up here. */ 9367 for (i = 1; i < end_sub; ++i) { 9368 if (PyUnicode_READ(kind_self, data_self, offset + i) != 9369 PyUnicode_READ(kind_sub, data_sub, i)) 9370 return 0; 9371 } 9372 return 1; 9373 } 9374 } 9375 9376 return 0; 9377} 9378 9379Py_ssize_t 9380PyUnicode_Tailmatch(PyObject *str, 9381 PyObject *substr, 9382 Py_ssize_t start, 9383 Py_ssize_t end, 9384 int direction) 9385{ 9386 Py_ssize_t result; 9387 9388 str = PyUnicode_FromObject(str); 9389 if (str == NULL) 9390 return -1; 9391 substr = PyUnicode_FromObject(substr); 9392 if (substr == NULL) { 9393 Py_DECREF(str); 9394 return -1; 9395 } 9396 9397 result = tailmatch(str, substr, 9398 start, end, direction); 9399 Py_DECREF(str); 9400 Py_DECREF(substr); 9401 return result; 9402} 9403 9404/* Apply fixfct filter to the Unicode object self and return a 9405 reference to the modified object */ 9406 9407static PyObject * 9408fixup(PyObject *self, 9409 Py_UCS4 (*fixfct)(PyObject *s)) 9410{ 9411 PyObject *u; 9412 Py_UCS4 maxchar_old, maxchar_new = 0; 9413 PyObject *v; 9414 9415 u = _PyUnicode_Copy(self); 9416 if (u == NULL) 9417 return NULL; 9418 maxchar_old = PyUnicode_MAX_CHAR_VALUE(u); 9419 9420 /* fix functions return the new maximum character in a string, 9421 if the kind of the resulting unicode object does not change, 9422 everything is fine. Otherwise we need to change the string kind 9423 and re-run the fix function. 
*/ 9424 maxchar_new = fixfct(u); 9425 9426 if (maxchar_new == 0) { 9427 /* no changes */; 9428 if (PyUnicode_CheckExact(self)) { 9429 Py_DECREF(u); 9430 Py_INCREF(self); 9431 return self; 9432 } 9433 else 9434 return u; 9435 } 9436 9437 maxchar_new = align_maxchar(maxchar_new); 9438 9439 if (maxchar_new == maxchar_old) 9440 return u; 9441 9442 /* In case the maximum character changed, we need to 9443 convert the string to the new category. */ 9444 v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new); 9445 if (v == NULL) { 9446 Py_DECREF(u); 9447 return NULL; 9448 } 9449 if (maxchar_new > maxchar_old) { 9450 /* If the maxchar increased so that the kind changed, not all 9451 characters are representable anymore and we need to fix the 9452 string again. This only happens in very few cases. */ 9453 _PyUnicode_FastCopyCharacters(v, 0, 9454 self, 0, PyUnicode_GET_LENGTH(self)); 9455 maxchar_old = fixfct(v); 9456 assert(maxchar_old > 0 && maxchar_old <= maxchar_new); 9457 } 9458 else { 9459 _PyUnicode_FastCopyCharacters(v, 0, 9460 u, 0, PyUnicode_GET_LENGTH(self)); 9461 } 9462 Py_DECREF(u); 9463 assert(_PyUnicode_CheckConsistency(v, 1)); 9464 return v; 9465} 9466 9467static PyObject * 9468ascii_upper_or_lower(PyObject *self, int lower) 9469{ 9470 Py_ssize_t len = PyUnicode_GET_LENGTH(self); 9471 char *resdata, *data = PyUnicode_DATA(self); 9472 PyObject *res; 9473 9474 res = PyUnicode_New(len, 127); 9475 if (res == NULL) 9476 return NULL; 9477 resdata = PyUnicode_DATA(res); 9478 if (lower) 9479 _Py_bytes_lower(resdata, data, len); 9480 else 9481 _Py_bytes_upper(resdata, data, len); 9482 return res; 9483} 9484 9485static Py_UCS4 9486handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i) 9487{ 9488 Py_ssize_t j; 9489 int final_sigma; 9490 Py_UCS4 c = 0; /* initialize to prevent gcc warning */ 9491 /* U+03A3 is in the Final_Sigma context when, it is found like this: 9492 9493 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased}) 9494 
9495 where ! is a negation and \p{xxx} is a character with property xxx. 9496 */ 9497 for (j = i - 1; j >= 0; j--) { 9498 c = PyUnicode_READ(kind, data, j); 9499 if (!_PyUnicode_IsCaseIgnorable(c)) 9500 break; 9501 } 9502 final_sigma = j >= 0 && _PyUnicode_IsCased(c); 9503 if (final_sigma) { 9504 for (j = i + 1; j < length; j++) { 9505 c = PyUnicode_READ(kind, data, j); 9506 if (!_PyUnicode_IsCaseIgnorable(c)) 9507 break; 9508 } 9509 final_sigma = j == length || !_PyUnicode_IsCased(c); 9510 } 9511 return (final_sigma) ? 0x3C2 : 0x3C3; 9512} 9513 9514static int 9515lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i, 9516 Py_UCS4 c, Py_UCS4 *mapped) 9517{ 9518 /* Obscure special case. */ 9519 if (c == 0x3A3) { 9520 mapped[0] = handle_capital_sigma(kind, data, length, i); 9521 return 1; 9522 } 9523 return _PyUnicode_ToLowerFull(c, mapped); 9524} 9525 9526static Py_ssize_t 9527do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) 9528{ 9529 Py_ssize_t i, k = 0; 9530 int n_res, j; 9531 Py_UCS4 c, mapped[3]; 9532 9533 c = PyUnicode_READ(kind, data, 0); 9534 n_res = _PyUnicode_ToUpperFull(c, mapped); 9535 for (j = 0; j < n_res; j++) { 9536 *maxchar = Py_MAX(*maxchar, mapped[j]); 9537 res[k++] = mapped[j]; 9538 } 9539 for (i = 1; i < length; i++) { 9540 c = PyUnicode_READ(kind, data, i); 9541 n_res = lower_ucs4(kind, data, length, i, c, mapped); 9542 for (j = 0; j < n_res; j++) { 9543 *maxchar = Py_MAX(*maxchar, mapped[j]); 9544 res[k++] = mapped[j]; 9545 } 9546 } 9547 return k; 9548} 9549 9550static Py_ssize_t 9551do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) { 9552 Py_ssize_t i, k = 0; 9553 9554 for (i = 0; i < length; i++) { 9555 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3]; 9556 int n_res, j; 9557 if (Py_UNICODE_ISUPPER(c)) { 9558 n_res = lower_ucs4(kind, data, length, i, c, mapped); 9559 } 9560 else if (Py_UNICODE_ISLOWER(c)) { 9561 n_res = _PyUnicode_ToUpperFull(c, mapped); 
9562 } 9563 else { 9564 n_res = 1; 9565 mapped[0] = c; 9566 } 9567 for (j = 0; j < n_res; j++) { 9568 *maxchar = Py_MAX(*maxchar, mapped[j]); 9569 res[k++] = mapped[j]; 9570 } 9571 } 9572 return k; 9573} 9574 9575static Py_ssize_t 9576do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, 9577 Py_UCS4 *maxchar, int lower) 9578{ 9579 Py_ssize_t i, k = 0; 9580 9581 for (i = 0; i < length; i++) { 9582 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3]; 9583 int n_res, j; 9584 if (lower) 9585 n_res = lower_ucs4(kind, data, length, i, c, mapped); 9586 else 9587 n_res = _PyUnicode_ToUpperFull(c, mapped); 9588 for (j = 0; j < n_res; j++) { 9589 *maxchar = Py_MAX(*maxchar, mapped[j]); 9590 res[k++] = mapped[j]; 9591 } 9592 } 9593 return k; 9594} 9595 9596static Py_ssize_t 9597do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) 9598{ 9599 return do_upper_or_lower(kind, data, length, res, maxchar, 0); 9600} 9601 9602static Py_ssize_t 9603do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) 9604{ 9605 return do_upper_or_lower(kind, data, length, res, maxchar, 1); 9606} 9607 9608static Py_ssize_t 9609do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) 9610{ 9611 Py_ssize_t i, k = 0; 9612 9613 for (i = 0; i < length; i++) { 9614 Py_UCS4 c = PyUnicode_READ(kind, data, i); 9615 Py_UCS4 mapped[3]; 9616 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped); 9617 for (j = 0; j < n_res; j++) { 9618 *maxchar = Py_MAX(*maxchar, mapped[j]); 9619 res[k++] = mapped[j]; 9620 } 9621 } 9622 return k; 9623} 9624 9625static Py_ssize_t 9626do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) 9627{ 9628 Py_ssize_t i, k = 0; 9629 int previous_is_cased; 9630 9631 previous_is_cased = 0; 9632 for (i = 0; i < length; i++) { 9633 const Py_UCS4 c = PyUnicode_READ(kind, data, i); 9634 Py_UCS4 mapped[3]; 9635 int n_res, j; 9636 9637 if (previous_is_cased) 9638 n_res = 
lower_ucs4(kind, data, length, i, c, mapped); 9639 else 9640 n_res = _PyUnicode_ToTitleFull(c, mapped); 9641 9642 for (j = 0; j < n_res; j++) { 9643 *maxchar = Py_MAX(*maxchar, mapped[j]); 9644 res[k++] = mapped[j]; 9645 } 9646 9647 previous_is_cased = _PyUnicode_IsCased(c); 9648 } 9649 return k; 9650} 9651 9652static PyObject * 9653case_operation(PyObject *self, 9654 Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *)) 9655{ 9656 PyObject *res = NULL; 9657 Py_ssize_t length, newlength = 0; 9658 int kind, outkind; 9659 void *data, *outdata; 9660 Py_UCS4 maxchar = 0, *tmp, *tmpend; 9661 9662 assert(PyUnicode_IS_READY(self)); 9663 9664 kind = PyUnicode_KIND(self); 9665 data = PyUnicode_DATA(self); 9666 length = PyUnicode_GET_LENGTH(self); 9667 if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) { 9668 PyErr_SetString(PyExc_OverflowError, "string is too long"); 9669 return NULL; 9670 } 9671 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length); 9672 if (tmp == NULL) 9673 return PyErr_NoMemory(); 9674 newlength = perform(kind, data, length, tmp, &maxchar); 9675 res = PyUnicode_New(newlength, maxchar); 9676 if (res == NULL) 9677 goto leave; 9678 tmpend = tmp + newlength; 9679 outdata = PyUnicode_DATA(res); 9680 outkind = PyUnicode_KIND(res); 9681 switch (outkind) { 9682 case PyUnicode_1BYTE_KIND: 9683 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata); 9684 break; 9685 case PyUnicode_2BYTE_KIND: 9686 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata); 9687 break; 9688 case PyUnicode_4BYTE_KIND: 9689 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength); 9690 break; 9691 default: 9692 assert(0); 9693 break; 9694 } 9695 leave: 9696 PyMem_FREE(tmp); 9697 return res; 9698} 9699 9700PyObject * 9701PyUnicode_Join(PyObject *separator, PyObject *seq) 9702{ 9703 PyObject *sep = NULL; 9704 Py_ssize_t seplen; 9705 PyObject *res = NULL; /* the result */ 9706 PyObject *fseq; /* PySequence_Fast(seq) */ 9707 Py_ssize_t seqlen; /* len(fseq) 
-- number of items in sequence */ 9708 PyObject **items; 9709 PyObject *item; 9710 Py_ssize_t sz, i, res_offset; 9711 Py_UCS4 maxchar; 9712 Py_UCS4 item_maxchar; 9713 int use_memcpy; 9714 unsigned char *res_data = NULL, *sep_data = NULL; 9715 PyObject *last_obj; 9716 unsigned int kind = 0; 9717 9718 fseq = PySequence_Fast(seq, "can only join an iterable"); 9719 if (fseq == NULL) { 9720 return NULL; 9721 } 9722 9723 /* NOTE: the following code can't call back into Python code, 9724 * so we are sure that fseq won't be mutated. 9725 */ 9726 9727 seqlen = PySequence_Fast_GET_SIZE(fseq); 9728 /* If empty sequence, return u"". */ 9729 if (seqlen == 0) { 9730 Py_DECREF(fseq); 9731 _Py_RETURN_UNICODE_EMPTY(); 9732 } 9733 9734 /* If singleton sequence with an exact Unicode, return that. */ 9735 last_obj = NULL; 9736 items = PySequence_Fast_ITEMS(fseq); 9737 if (seqlen == 1) { 9738 if (PyUnicode_CheckExact(items[0])) { 9739 res = items[0]; 9740 Py_INCREF(res); 9741 Py_DECREF(fseq); 9742 return res; 9743 } 9744 seplen = 0; 9745 maxchar = 0; 9746 } 9747 else { 9748 /* Set up sep and seplen */ 9749 if (separator == NULL) { 9750 /* fall back to a blank space separator */ 9751 sep = PyUnicode_FromOrdinal(' '); 9752 if (!sep) 9753 goto onError; 9754 seplen = 1; 9755 maxchar = 32; 9756 } 9757 else { 9758 if (!PyUnicode_Check(separator)) { 9759 PyErr_Format(PyExc_TypeError, 9760 "separator: expected str instance," 9761 " %.80s found", 9762 Py_TYPE(separator)->tp_name); 9763 goto onError; 9764 } 9765 if (PyUnicode_READY(separator)) 9766 goto onError; 9767 sep = separator; 9768 seplen = PyUnicode_GET_LENGTH(separator); 9769 maxchar = PyUnicode_MAX_CHAR_VALUE(separator); 9770 /* inc refcount to keep this code path symmetric with the 9771 above case of a blank separator */ 9772 Py_INCREF(sep); 9773 } 9774 last_obj = sep; 9775 } 9776 9777 /* There are at least two things to join, or else we have a subclass 9778 * of str in the sequence. 
9779 * Do a pre-pass to figure out the total amount of space we'll 9780 * need (sz), and see whether all argument are strings. 9781 */ 9782 sz = 0; 9783#ifdef Py_DEBUG 9784 use_memcpy = 0; 9785#else 9786 use_memcpy = 1; 9787#endif 9788 for (i = 0; i < seqlen; i++) { 9789 const Py_ssize_t old_sz = sz; 9790 item = items[i]; 9791 if (!PyUnicode_Check(item)) { 9792 PyErr_Format(PyExc_TypeError, 9793 "sequence item %zd: expected str instance," 9794 " %.80s found", 9795 i, Py_TYPE(item)->tp_name); 9796 goto onError; 9797 } 9798 if (PyUnicode_READY(item) == -1) 9799 goto onError; 9800 sz += PyUnicode_GET_LENGTH(item); 9801 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item); 9802 maxchar = Py_MAX(maxchar, item_maxchar); 9803 if (i != 0) 9804 sz += seplen; 9805 if (sz < old_sz || sz > PY_SSIZE_T_MAX) { 9806 PyErr_SetString(PyExc_OverflowError, 9807 "join() result is too long for a Python string"); 9808 goto onError; 9809 } 9810 if (use_memcpy && last_obj != NULL) { 9811 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item)) 9812 use_memcpy = 0; 9813 } 9814 last_obj = item; 9815 } 9816 9817 res = PyUnicode_New(sz, maxchar); 9818 if (res == NULL) 9819 goto onError; 9820 9821 /* Catenate everything. */ 9822#ifdef Py_DEBUG 9823 use_memcpy = 0; 9824#else 9825 if (use_memcpy) { 9826 res_data = PyUnicode_1BYTE_DATA(res); 9827 kind = PyUnicode_KIND(res); 9828 if (seplen != 0) 9829 sep_data = PyUnicode_1BYTE_DATA(sep); 9830 } 9831#endif 9832 if (use_memcpy) { 9833 for (i = 0; i < seqlen; ++i) { 9834 Py_ssize_t itemlen; 9835 item = items[i]; 9836 9837 /* Copy item, and maybe the separator. 
*/ 9838 if (i && seplen != 0) { 9839 Py_MEMCPY(res_data, 9840 sep_data, 9841 kind * seplen); 9842 res_data += kind * seplen; 9843 } 9844 9845 itemlen = PyUnicode_GET_LENGTH(item); 9846 if (itemlen != 0) { 9847 Py_MEMCPY(res_data, 9848 PyUnicode_DATA(item), 9849 kind * itemlen); 9850 res_data += kind * itemlen; 9851 } 9852 } 9853 assert(res_data == PyUnicode_1BYTE_DATA(res) 9854 + kind * PyUnicode_GET_LENGTH(res)); 9855 } 9856 else { 9857 for (i = 0, res_offset = 0; i < seqlen; ++i) { 9858 Py_ssize_t itemlen; 9859 item = items[i]; 9860 9861 /* Copy item, and maybe the separator. */ 9862 if (i && seplen != 0) { 9863 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen); 9864 res_offset += seplen; 9865 } 9866 9867 itemlen = PyUnicode_GET_LENGTH(item); 9868 if (itemlen != 0) { 9869 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen); 9870 res_offset += itemlen; 9871 } 9872 } 9873 assert(res_offset == PyUnicode_GET_LENGTH(res)); 9874 } 9875 9876 Py_DECREF(fseq); 9877 Py_XDECREF(sep); 9878 assert(_PyUnicode_CheckConsistency(res, 1)); 9879 return res; 9880 9881 onError: 9882 Py_DECREF(fseq); 9883 Py_XDECREF(sep); 9884 Py_XDECREF(res); 9885 return NULL; 9886} 9887 9888#define FILL(kind, data, value, start, length) \ 9889 do { \ 9890 Py_ssize_t i_ = 0; \ 9891 assert(kind != PyUnicode_WCHAR_KIND); \ 9892 switch ((kind)) { \ 9893 case PyUnicode_1BYTE_KIND: { \ 9894 unsigned char * to_ = (unsigned char *)((data)) + (start); \ 9895 memset(to_, (unsigned char)value, (length)); \ 9896 break; \ 9897 } \ 9898 case PyUnicode_2BYTE_KIND: { \ 9899 Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \ 9900 for (; i_ < (length); ++i_, ++to_) *to_ = (value); \ 9901 break; \ 9902 } \ 9903 case PyUnicode_4BYTE_KIND: { \ 9904 Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \ 9905 for (; i_ < (length); ++i_, ++to_) *to_ = (value); \ 9906 break; \ 9907 } \ 9908 default: assert(0); \ 9909 } \ 9910 } while (0) 9911 9912void 9913_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t 
start, Py_ssize_t length, 9914 Py_UCS4 fill_char) 9915{ 9916 const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode); 9917 const void *data = PyUnicode_DATA(unicode); 9918 assert(PyUnicode_IS_READY(unicode)); 9919 assert(unicode_modifiable(unicode)); 9920 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode)); 9921 assert(start >= 0); 9922 assert(start + length <= PyUnicode_GET_LENGTH(unicode)); 9923 FILL(kind, data, fill_char, start, length); 9924} 9925 9926Py_ssize_t 9927PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length, 9928 Py_UCS4 fill_char) 9929{ 9930 Py_ssize_t maxlen; 9931 9932 if (!PyUnicode_Check(unicode)) { 9933 PyErr_BadInternalCall(); 9934 return -1; 9935 } 9936 if (PyUnicode_READY(unicode) == -1) 9937 return -1; 9938 if (unicode_check_modifiable(unicode)) 9939 return -1; 9940 9941 if (start < 0) { 9942 PyErr_SetString(PyExc_IndexError, "string index out of range"); 9943 return -1; 9944 } 9945 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) { 9946 PyErr_SetString(PyExc_ValueError, 9947 "fill character is bigger than " 9948 "the string maximum character"); 9949 return -1; 9950 } 9951 9952 maxlen = PyUnicode_GET_LENGTH(unicode) - start; 9953 length = Py_MIN(maxlen, length); 9954 if (length <= 0) 9955 return 0; 9956 9957 _PyUnicode_FastFill(unicode, start, length, fill_char); 9958 return length; 9959} 9960 9961static PyObject * 9962pad(PyObject *self, 9963 Py_ssize_t left, 9964 Py_ssize_t right, 9965 Py_UCS4 fill) 9966{ 9967 PyObject *u; 9968 Py_UCS4 maxchar; 9969 int kind; 9970 void *data; 9971 9972 if (left < 0) 9973 left = 0; 9974 if (right < 0) 9975 right = 0; 9976 9977 if (left == 0 && right == 0) 9978 return unicode_result_unchanged(self); 9979 9980 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) || 9981 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) { 9982 PyErr_SetString(PyExc_OverflowError, "padded string is too long"); 9983 return NULL; 9984 } 9985 maxchar = PyUnicode_MAX_CHAR_VALUE(self); 9986 maxchar = 
Py_MAX(maxchar, fill); 9987 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar); 9988 if (!u) 9989 return NULL; 9990 9991 kind = PyUnicode_KIND(u); 9992 data = PyUnicode_DATA(u); 9993 if (left) 9994 FILL(kind, data, fill, 0, left); 9995 if (right) 9996 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right); 9997 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self)); 9998 assert(_PyUnicode_CheckConsistency(u, 1)); 9999 return u; 10000} 10001 10002PyObject * 10003PyUnicode_Splitlines(PyObject *string, int keepends) 10004{ 10005 PyObject *list; 10006 10007 string = PyUnicode_FromObject(string); 10008 if (string == NULL) 10009 return NULL; 10010 if (PyUnicode_READY(string) == -1) { 10011 Py_DECREF(string); 10012 return NULL; 10013 } 10014 10015 switch (PyUnicode_KIND(string)) { 10016 case PyUnicode_1BYTE_KIND: 10017 if (PyUnicode_IS_ASCII(string)) 10018 list = asciilib_splitlines( 10019 string, PyUnicode_1BYTE_DATA(string), 10020 PyUnicode_GET_LENGTH(string), keepends); 10021 else 10022 list = ucs1lib_splitlines( 10023 string, PyUnicode_1BYTE_DATA(string), 10024 PyUnicode_GET_LENGTH(string), keepends); 10025 break; 10026 case PyUnicode_2BYTE_KIND: 10027 list = ucs2lib_splitlines( 10028 string, PyUnicode_2BYTE_DATA(string), 10029 PyUnicode_GET_LENGTH(string), keepends); 10030 break; 10031 case PyUnicode_4BYTE_KIND: 10032 list = ucs4lib_splitlines( 10033 string, PyUnicode_4BYTE_DATA(string), 10034 PyUnicode_GET_LENGTH(string), keepends); 10035 break; 10036 default: 10037 assert(0); 10038 list = 0; 10039 } 10040 Py_DECREF(string); 10041 return list; 10042} 10043 10044static PyObject * 10045split(PyObject *self, 10046 PyObject *substring, 10047 Py_ssize_t maxcount) 10048{ 10049 int kind1, kind2; 10050 void *buf1, *buf2; 10051 Py_ssize_t len1, len2; 10052 PyObject* out; 10053 10054 if (maxcount < 0) 10055 maxcount = PY_SSIZE_T_MAX; 10056 10057 if (PyUnicode_READY(self) == -1) 10058 return NULL; 10059 10060 if (substring == NULL) 
10061 switch (PyUnicode_KIND(self)) { 10062 case PyUnicode_1BYTE_KIND: 10063 if (PyUnicode_IS_ASCII(self)) 10064 return asciilib_split_whitespace( 10065 self, PyUnicode_1BYTE_DATA(self), 10066 PyUnicode_GET_LENGTH(self), maxcount 10067 ); 10068 else 10069 return ucs1lib_split_whitespace( 10070 self, PyUnicode_1BYTE_DATA(self), 10071 PyUnicode_GET_LENGTH(self), maxcount 10072 ); 10073 case PyUnicode_2BYTE_KIND: 10074 return ucs2lib_split_whitespace( 10075 self, PyUnicode_2BYTE_DATA(self), 10076 PyUnicode_GET_LENGTH(self), maxcount 10077 ); 10078 case PyUnicode_4BYTE_KIND: 10079 return ucs4lib_split_whitespace( 10080 self, PyUnicode_4BYTE_DATA(self), 10081 PyUnicode_GET_LENGTH(self), maxcount 10082 ); 10083 default: 10084 assert(0); 10085 return NULL; 10086 } 10087 10088 if (PyUnicode_READY(substring) == -1) 10089 return NULL; 10090 10091 kind1 = PyUnicode_KIND(self); 10092 kind2 = PyUnicode_KIND(substring); 10093 len1 = PyUnicode_GET_LENGTH(self); 10094 len2 = PyUnicode_GET_LENGTH(substring); 10095 if (kind1 < kind2 || len1 < len2) { 10096 out = PyList_New(1); 10097 if (out == NULL) 10098 return NULL; 10099 Py_INCREF(self); 10100 PyList_SET_ITEM(out, 0, self); 10101 return out; 10102 } 10103 buf1 = PyUnicode_DATA(self); 10104 buf2 = PyUnicode_DATA(substring); 10105 if (kind2 != kind1) { 10106 buf2 = _PyUnicode_AsKind(substring, kind1); 10107 if (!buf2) 10108 return NULL; 10109 } 10110 10111 switch (kind1) { 10112 case PyUnicode_1BYTE_KIND: 10113 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring)) 10114 out = asciilib_split( 10115 self, buf1, len1, buf2, len2, maxcount); 10116 else 10117 out = ucs1lib_split( 10118 self, buf1, len1, buf2, len2, maxcount); 10119 break; 10120 case PyUnicode_2BYTE_KIND: 10121 out = ucs2lib_split( 10122 self, buf1, len1, buf2, len2, maxcount); 10123 break; 10124 case PyUnicode_4BYTE_KIND: 10125 out = ucs4lib_split( 10126 self, buf1, len1, buf2, len2, maxcount); 10127 break; 10128 default: 10129 out = NULL; 10130 } 10131 if 
(kind2 != kind1) 10132 PyMem_Free(buf2); 10133 return out; 10134} 10135 10136static PyObject * 10137rsplit(PyObject *self, 10138 PyObject *substring, 10139 Py_ssize_t maxcount) 10140{ 10141 int kind1, kind2; 10142 void *buf1, *buf2; 10143 Py_ssize_t len1, len2; 10144 PyObject* out; 10145 10146 if (maxcount < 0) 10147 maxcount = PY_SSIZE_T_MAX; 10148 10149 if (PyUnicode_READY(self) == -1) 10150 return NULL; 10151 10152 if (substring == NULL) 10153 switch (PyUnicode_KIND(self)) { 10154 case PyUnicode_1BYTE_KIND: 10155 if (PyUnicode_IS_ASCII(self)) 10156 return asciilib_rsplit_whitespace( 10157 self, PyUnicode_1BYTE_DATA(self), 10158 PyUnicode_GET_LENGTH(self), maxcount 10159 ); 10160 else 10161 return ucs1lib_rsplit_whitespace( 10162 self, PyUnicode_1BYTE_DATA(self), 10163 PyUnicode_GET_LENGTH(self), maxcount 10164 ); 10165 case PyUnicode_2BYTE_KIND: 10166 return ucs2lib_rsplit_whitespace( 10167 self, PyUnicode_2BYTE_DATA(self), 10168 PyUnicode_GET_LENGTH(self), maxcount 10169 ); 10170 case PyUnicode_4BYTE_KIND: 10171 return ucs4lib_rsplit_whitespace( 10172 self, PyUnicode_4BYTE_DATA(self), 10173 PyUnicode_GET_LENGTH(self), maxcount 10174 ); 10175 default: 10176 assert(0); 10177 return NULL; 10178 } 10179 10180 if (PyUnicode_READY(substring) == -1) 10181 return NULL; 10182 10183 kind1 = PyUnicode_KIND(self); 10184 kind2 = PyUnicode_KIND(substring); 10185 len1 = PyUnicode_GET_LENGTH(self); 10186 len2 = PyUnicode_GET_LENGTH(substring); 10187 if (kind1 < kind2 || len1 < len2) { 10188 out = PyList_New(1); 10189 if (out == NULL) 10190 return NULL; 10191 Py_INCREF(self); 10192 PyList_SET_ITEM(out, 0, self); 10193 return out; 10194 } 10195 buf1 = PyUnicode_DATA(self); 10196 buf2 = PyUnicode_DATA(substring); 10197 if (kind2 != kind1) { 10198 buf2 = _PyUnicode_AsKind(substring, kind1); 10199 if (!buf2) 10200 return NULL; 10201 } 10202 10203 switch (kind1) { 10204 case PyUnicode_1BYTE_KIND: 10205 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring)) 10206 out = 
asciilib_rsplit( 10207 self, buf1, len1, buf2, len2, maxcount); 10208 else 10209 out = ucs1lib_rsplit( 10210 self, buf1, len1, buf2, len2, maxcount); 10211 break; 10212 case PyUnicode_2BYTE_KIND: 10213 out = ucs2lib_rsplit( 10214 self, buf1, len1, buf2, len2, maxcount); 10215 break; 10216 case PyUnicode_4BYTE_KIND: 10217 out = ucs4lib_rsplit( 10218 self, buf1, len1, buf2, len2, maxcount); 10219 break; 10220 default: 10221 out = NULL; 10222 } 10223 if (kind2 != kind1) 10224 PyMem_Free(buf2); 10225 return out; 10226} 10227 10228static Py_ssize_t 10229anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1, 10230 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset) 10231{ 10232 switch (kind) { 10233 case PyUnicode_1BYTE_KIND: 10234 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2)) 10235 return asciilib_find(buf1, len1, buf2, len2, offset); 10236 else 10237 return ucs1lib_find(buf1, len1, buf2, len2, offset); 10238 case PyUnicode_2BYTE_KIND: 10239 return ucs2lib_find(buf1, len1, buf2, len2, offset); 10240 case PyUnicode_4BYTE_KIND: 10241 return ucs4lib_find(buf1, len1, buf2, len2, offset); 10242 } 10243 assert(0); 10244 return -1; 10245} 10246 10247static Py_ssize_t 10248anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen, 10249 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount) 10250{ 10251 switch (kind) { 10252 case PyUnicode_1BYTE_KIND: 10253 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1)) 10254 return asciilib_count(sbuf, slen, buf1, len1, maxcount); 10255 else 10256 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount); 10257 case PyUnicode_2BYTE_KIND: 10258 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount); 10259 case PyUnicode_4BYTE_KIND: 10260 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount); 10261 } 10262 assert(0); 10263 return 0; 10264} 10265 10266static void 10267replace_1char_inplace(PyObject *u, Py_ssize_t pos, 10268 Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount) 10269{ 10270 
int kind = PyUnicode_KIND(u); 10271 void *data = PyUnicode_DATA(u); 10272 Py_ssize_t len = PyUnicode_GET_LENGTH(u); 10273 if (kind == PyUnicode_1BYTE_KIND) { 10274 ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos, 10275 (Py_UCS1 *)data + len, 10276 u1, u2, maxcount); 10277 } 10278 else if (kind == PyUnicode_2BYTE_KIND) { 10279 ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos, 10280 (Py_UCS2 *)data + len, 10281 u1, u2, maxcount); 10282 } 10283 else { 10284 assert(kind == PyUnicode_4BYTE_KIND); 10285 ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos, 10286 (Py_UCS4 *)data + len, 10287 u1, u2, maxcount); 10288 } 10289} 10290 10291static PyObject * 10292replace(PyObject *self, PyObject *str1, 10293 PyObject *str2, Py_ssize_t maxcount) 10294{ 10295 PyObject *u; 10296 char *sbuf = PyUnicode_DATA(self); 10297 char *buf1 = PyUnicode_DATA(str1); 10298 char *buf2 = PyUnicode_DATA(str2); 10299 int srelease = 0, release1 = 0, release2 = 0; 10300 int skind = PyUnicode_KIND(self); 10301 int kind1 = PyUnicode_KIND(str1); 10302 int kind2 = PyUnicode_KIND(str2); 10303 Py_ssize_t slen = PyUnicode_GET_LENGTH(self); 10304 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1); 10305 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2); 10306 int mayshrink; 10307 Py_UCS4 maxchar, maxchar_str1, maxchar_str2; 10308 10309 if (maxcount < 0) 10310 maxcount = PY_SSIZE_T_MAX; 10311 else if (maxcount == 0 || slen == 0) 10312 goto nothing; 10313 10314 if (str1 == str2) 10315 goto nothing; 10316 10317 maxchar = PyUnicode_MAX_CHAR_VALUE(self); 10318 maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1); 10319 if (maxchar < maxchar_str1) 10320 /* substring too wide to be present */ 10321 goto nothing; 10322 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2); 10323 /* Replacing str1 with str2 may cause a maxchar reduction in the 10324 result string. 
*/ 10325 mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1); 10326 maxchar = Py_MAX(maxchar, maxchar_str2); 10327 10328 if (len1 == len2) { 10329 /* same length */ 10330 if (len1 == 0) 10331 goto nothing; 10332 if (len1 == 1) { 10333 /* replace characters */ 10334 Py_UCS4 u1, u2; 10335 Py_ssize_t pos; 10336 10337 u1 = PyUnicode_READ(kind1, buf1, 0); 10338 pos = findchar(sbuf, skind, slen, u1, 1); 10339 if (pos < 0) 10340 goto nothing; 10341 u2 = PyUnicode_READ(kind2, buf2, 0); 10342 u = PyUnicode_New(slen, maxchar); 10343 if (!u) 10344 goto error; 10345 10346 _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen); 10347 replace_1char_inplace(u, pos, u1, u2, maxcount); 10348 } 10349 else { 10350 int rkind = skind; 10351 char *res; 10352 Py_ssize_t i; 10353 10354 if (kind1 < rkind) { 10355 /* widen substring */ 10356 buf1 = _PyUnicode_AsKind(str1, rkind); 10357 if (!buf1) goto error; 10358 release1 = 1; 10359 } 10360 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0); 10361 if (i < 0) 10362 goto nothing; 10363 if (rkind > kind2) { 10364 /* widen replacement */ 10365 buf2 = _PyUnicode_AsKind(str2, rkind); 10366 if (!buf2) goto error; 10367 release2 = 1; 10368 } 10369 else if (rkind < kind2) { 10370 /* widen self and buf1 */ 10371 rkind = kind2; 10372 if (release1) PyMem_Free(buf1); 10373 release1 = 0; 10374 sbuf = _PyUnicode_AsKind(self, rkind); 10375 if (!sbuf) goto error; 10376 srelease = 1; 10377 buf1 = _PyUnicode_AsKind(str1, rkind); 10378 if (!buf1) goto error; 10379 release1 = 1; 10380 } 10381 u = PyUnicode_New(slen, maxchar); 10382 if (!u) 10383 goto error; 10384 assert(PyUnicode_KIND(u) == rkind); 10385 res = PyUnicode_DATA(u); 10386 10387 memcpy(res, sbuf, rkind * slen); 10388 /* change everything in-place, starting with this one */ 10389 memcpy(res + rkind * i, 10390 buf2, 10391 rkind * len2); 10392 i += len1; 10393 10394 while ( --maxcount > 0) { 10395 i = anylib_find(rkind, self, 10396 sbuf+rkind*i, slen-i, 10397 str1, buf1, 
len1, i); 10398 if (i == -1) 10399 break; 10400 memcpy(res + rkind * i, 10401 buf2, 10402 rkind * len2); 10403 i += len1; 10404 } 10405 } 10406 } 10407 else { 10408 Py_ssize_t n, i, j, ires; 10409 Py_ssize_t new_size; 10410 int rkind = skind; 10411 char *res; 10412 10413 if (kind1 < rkind) { 10414 /* widen substring */ 10415 buf1 = _PyUnicode_AsKind(str1, rkind); 10416 if (!buf1) goto error; 10417 release1 = 1; 10418 } 10419 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount); 10420 if (n == 0) 10421 goto nothing; 10422 if (kind2 < rkind) { 10423 /* widen replacement */ 10424 buf2 = _PyUnicode_AsKind(str2, rkind); 10425 if (!buf2) goto error; 10426 release2 = 1; 10427 } 10428 else if (kind2 > rkind) { 10429 /* widen self and buf1 */ 10430 rkind = kind2; 10431 sbuf = _PyUnicode_AsKind(self, rkind); 10432 if (!sbuf) goto error; 10433 srelease = 1; 10434 if (release1) PyMem_Free(buf1); 10435 release1 = 0; 10436 buf1 = _PyUnicode_AsKind(str1, rkind); 10437 if (!buf1) goto error; 10438 release1 = 1; 10439 } 10440 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) - 10441 PyUnicode_GET_LENGTH(str1))); */ 10442 if (len1 < len2 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) { 10443 PyErr_SetString(PyExc_OverflowError, 10444 "replace string is too long"); 10445 goto error; 10446 } 10447 new_size = slen + n * (len2 - len1); 10448 if (new_size == 0) { 10449 _Py_INCREF_UNICODE_EMPTY(); 10450 if (!unicode_empty) 10451 goto error; 10452 u = unicode_empty; 10453 goto done; 10454 } 10455 if (new_size > (PY_SSIZE_T_MAX >> (rkind-1))) { 10456 PyErr_SetString(PyExc_OverflowError, 10457 "replace string is too long"); 10458 goto error; 10459 } 10460 u = PyUnicode_New(new_size, maxchar); 10461 if (!u) 10462 goto error; 10463 assert(PyUnicode_KIND(u) == rkind); 10464 res = PyUnicode_DATA(u); 10465 ires = i = 0; 10466 if (len1 > 0) { 10467 while (n-- > 0) { 10468 /* look for next match */ 10469 j = anylib_find(rkind, self, 10470 sbuf + rkind * i, 
slen-i, 10471 str1, buf1, len1, i); 10472 if (j == -1) 10473 break; 10474 else if (j > i) { 10475 /* copy unchanged part [i:j] */ 10476 memcpy(res + rkind * ires, 10477 sbuf + rkind * i, 10478 rkind * (j-i)); 10479 ires += j - i; 10480 } 10481 /* copy substitution string */ 10482 if (len2 > 0) { 10483 memcpy(res + rkind * ires, 10484 buf2, 10485 rkind * len2); 10486 ires += len2; 10487 } 10488 i = j + len1; 10489 } 10490 if (i < slen) 10491 /* copy tail [i:] */ 10492 memcpy(res + rkind * ires, 10493 sbuf + rkind * i, 10494 rkind * (slen-i)); 10495 } 10496 else { 10497 /* interleave */ 10498 while (n > 0) { 10499 memcpy(res + rkind * ires, 10500 buf2, 10501 rkind * len2); 10502 ires += len2; 10503 if (--n <= 0) 10504 break; 10505 memcpy(res + rkind * ires, 10506 sbuf + rkind * i, 10507 rkind); 10508 ires++; 10509 i++; 10510 } 10511 memcpy(res + rkind * ires, 10512 sbuf + rkind * i, 10513 rkind * (slen-i)); 10514 } 10515 } 10516 10517 if (mayshrink) { 10518 unicode_adjust_maxchar(&u); 10519 if (u == NULL) 10520 goto error; 10521 } 10522 10523 done: 10524 if (srelease) 10525 PyMem_FREE(sbuf); 10526 if (release1) 10527 PyMem_FREE(buf1); 10528 if (release2) 10529 PyMem_FREE(buf2); 10530 assert(_PyUnicode_CheckConsistency(u, 1)); 10531 return u; 10532 10533 nothing: 10534 /* nothing to replace; return original string (when possible) */ 10535 if (srelease) 10536 PyMem_FREE(sbuf); 10537 if (release1) 10538 PyMem_FREE(buf1); 10539 if (release2) 10540 PyMem_FREE(buf2); 10541 return unicode_result_unchanged(self); 10542 10543 error: 10544 if (srelease && sbuf) 10545 PyMem_FREE(sbuf); 10546 if (release1 && buf1) 10547 PyMem_FREE(buf1); 10548 if (release2 && buf2) 10549 PyMem_FREE(buf2); 10550 return NULL; 10551} 10552 10553/* --- Unicode Object Methods --------------------------------------------- */ 10554 10555PyDoc_STRVAR(title__doc__, 10556 "S.title() -> str\n\ 10557\n\ 10558Return a titlecased version of S, i.e. 
words start with title case\n\ 10559characters, all remaining cased characters have lower case."); 10560 10561static PyObject* 10562unicode_title(PyObject *self) 10563{ 10564 if (PyUnicode_READY(self) == -1) 10565 return NULL; 10566 return case_operation(self, do_title); 10567} 10568 10569PyDoc_STRVAR(capitalize__doc__, 10570 "S.capitalize() -> str\n\ 10571\n\ 10572Return a capitalized version of S, i.e. make the first character\n\ 10573have upper case and the rest lower case."); 10574 10575static PyObject* 10576unicode_capitalize(PyObject *self) 10577{ 10578 if (PyUnicode_READY(self) == -1) 10579 return NULL; 10580 if (PyUnicode_GET_LENGTH(self) == 0) 10581 return unicode_result_unchanged(self); 10582 return case_operation(self, do_capitalize); 10583} 10584 10585PyDoc_STRVAR(casefold__doc__, 10586 "S.casefold() -> str\n\ 10587\n\ 10588Return a version of S suitable for caseless comparisons."); 10589 10590static PyObject * 10591unicode_casefold(PyObject *self) 10592{ 10593 if (PyUnicode_READY(self) == -1) 10594 return NULL; 10595 if (PyUnicode_IS_ASCII(self)) 10596 return ascii_upper_or_lower(self, 1); 10597 return case_operation(self, do_casefold); 10598} 10599 10600 10601/* Argument converter. 
Coerces to a single unicode character */ 10602 10603static int 10604convert_uc(PyObject *obj, void *addr) 10605{ 10606 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr; 10607 PyObject *uniobj; 10608 10609 uniobj = PyUnicode_FromObject(obj); 10610 if (uniobj == NULL) { 10611 PyErr_SetString(PyExc_TypeError, 10612 "The fill character cannot be converted to Unicode"); 10613 return 0; 10614 } 10615 if (PyUnicode_GET_LENGTH(uniobj) != 1) { 10616 PyErr_SetString(PyExc_TypeError, 10617 "The fill character must be exactly one character long"); 10618 Py_DECREF(uniobj); 10619 return 0; 10620 } 10621 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0); 10622 Py_DECREF(uniobj); 10623 return 1; 10624} 10625 10626PyDoc_STRVAR(center__doc__, 10627 "S.center(width[, fillchar]) -> str\n\ 10628\n\ 10629Return S centered in a string of length width. Padding is\n\ 10630done using the specified fill character (default is a space)"); 10631 10632static PyObject * 10633unicode_center(PyObject *self, PyObject *args) 10634{ 10635 Py_ssize_t marg, left; 10636 Py_ssize_t width; 10637 Py_UCS4 fillchar = ' '; 10638 10639 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar)) 10640 return NULL; 10641 10642 if (PyUnicode_READY(self) == -1) 10643 return NULL; 10644 10645 if (PyUnicode_GET_LENGTH(self) >= width) 10646 return unicode_result_unchanged(self); 10647 10648 marg = width - PyUnicode_GET_LENGTH(self); 10649 left = marg / 2 + (marg & width & 1); 10650 10651 return pad(self, left, marg - left, fillchar); 10652} 10653 10654/* This function assumes that str1 and str2 are readied by the caller. */ 10655 10656static int 10657unicode_compare(PyObject *str1, PyObject *str2) 10658{ 10659#define COMPARE(TYPE1, TYPE2) \ 10660 do { \ 10661 TYPE1* p1 = (TYPE1 *)data1; \ 10662 TYPE2* p2 = (TYPE2 *)data2; \ 10663 TYPE1* end = p1 + len; \ 10664 Py_UCS4 c1, c2; \ 10665 for (; p1 != end; p1++, p2++) { \ 10666 c1 = *p1; \ 10667 c2 = *p2; \ 10668 if (c1 != c2) \ 10669 return (c1 < c2) ? 
-1 : 1; \ 10670 } \ 10671 } \ 10672 while (0) 10673 10674 int kind1, kind2; 10675 void *data1, *data2; 10676 Py_ssize_t len1, len2, len; 10677 10678 kind1 = PyUnicode_KIND(str1); 10679 kind2 = PyUnicode_KIND(str2); 10680 data1 = PyUnicode_DATA(str1); 10681 data2 = PyUnicode_DATA(str2); 10682 len1 = PyUnicode_GET_LENGTH(str1); 10683 len2 = PyUnicode_GET_LENGTH(str2); 10684 len = Py_MIN(len1, len2); 10685 10686 switch(kind1) { 10687 case PyUnicode_1BYTE_KIND: 10688 { 10689 switch(kind2) { 10690 case PyUnicode_1BYTE_KIND: 10691 { 10692 int cmp = memcmp(data1, data2, len); 10693 /* normalize result of memcmp() into the range [-1; 1] */ 10694 if (cmp < 0) 10695 return -1; 10696 if (cmp > 0) 10697 return 1; 10698 break; 10699 } 10700 case PyUnicode_2BYTE_KIND: 10701 COMPARE(Py_UCS1, Py_UCS2); 10702 break; 10703 case PyUnicode_4BYTE_KIND: 10704 COMPARE(Py_UCS1, Py_UCS4); 10705 break; 10706 default: 10707 assert(0); 10708 } 10709 break; 10710 } 10711 case PyUnicode_2BYTE_KIND: 10712 { 10713 switch(kind2) { 10714 case PyUnicode_1BYTE_KIND: 10715 COMPARE(Py_UCS2, Py_UCS1); 10716 break; 10717 case PyUnicode_2BYTE_KIND: 10718 { 10719 COMPARE(Py_UCS2, Py_UCS2); 10720 break; 10721 } 10722 case PyUnicode_4BYTE_KIND: 10723 COMPARE(Py_UCS2, Py_UCS4); 10724 break; 10725 default: 10726 assert(0); 10727 } 10728 break; 10729 } 10730 case PyUnicode_4BYTE_KIND: 10731 { 10732 switch(kind2) { 10733 case PyUnicode_1BYTE_KIND: 10734 COMPARE(Py_UCS4, Py_UCS1); 10735 break; 10736 case PyUnicode_2BYTE_KIND: 10737 COMPARE(Py_UCS4, Py_UCS2); 10738 break; 10739 case PyUnicode_4BYTE_KIND: 10740 { 10741#if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4 10742 int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len); 10743 /* normalize result of wmemcmp() into the range [-1; 1] */ 10744 if (cmp < 0) 10745 return -1; 10746 if (cmp > 0) 10747 return 1; 10748#else 10749 COMPARE(Py_UCS4, Py_UCS4); 10750#endif 10751 break; 10752 } 10753 default: 10754 assert(0); 10755 } 10756 break; 10757 } 10758 default: 
10759 assert(0); 10760 } 10761 10762 if (len1 == len2) 10763 return 0; 10764 if (len1 < len2) 10765 return -1; 10766 else 10767 return 1; 10768 10769#undef COMPARE 10770} 10771 10772Py_LOCAL(int) 10773unicode_compare_eq(PyObject *str1, PyObject *str2) 10774{ 10775 int kind; 10776 void *data1, *data2; 10777 Py_ssize_t len; 10778 int cmp; 10779 10780 len = PyUnicode_GET_LENGTH(str1); 10781 if (PyUnicode_GET_LENGTH(str2) != len) 10782 return 0; 10783 kind = PyUnicode_KIND(str1); 10784 if (PyUnicode_KIND(str2) != kind) 10785 return 0; 10786 data1 = PyUnicode_DATA(str1); 10787 data2 = PyUnicode_DATA(str2); 10788 10789 cmp = memcmp(data1, data2, len * kind); 10790 return (cmp == 0); 10791} 10792 10793 10794int 10795PyUnicode_Compare(PyObject *left, PyObject *right) 10796{ 10797 if (PyUnicode_Check(left) && PyUnicode_Check(right)) { 10798 if (PyUnicode_READY(left) == -1 || 10799 PyUnicode_READY(right) == -1) 10800 return -1; 10801 10802 /* a string is equal to itself */ 10803 if (left == right) 10804 return 0; 10805 10806 return unicode_compare(left, right); 10807 } 10808 PyErr_Format(PyExc_TypeError, 10809 "Can't compare %.100s and %.100s", 10810 left->ob_type->tp_name, 10811 right->ob_type->tp_name); 10812 return -1; 10813} 10814 10815int 10816_PyUnicode_CompareWithId(PyObject *left, _Py_Identifier *right) 10817{ 10818 PyObject *right_str = _PyUnicode_FromId(right); /* borrowed */ 10819 if (right_str == NULL) 10820 return -1; 10821 return PyUnicode_Compare(left, right_str); 10822} 10823 10824int 10825PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str) 10826{ 10827 Py_ssize_t i; 10828 int kind; 10829 Py_UCS4 chr; 10830 10831 assert(_PyUnicode_CHECK(uni)); 10832 if (PyUnicode_READY(uni) == -1) 10833 return -1; 10834 kind = PyUnicode_KIND(uni); 10835 if (kind == PyUnicode_1BYTE_KIND) { 10836 const void *data = PyUnicode_1BYTE_DATA(uni); 10837 size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni); 10838 size_t len, len2 = strlen(str); 10839 int cmp; 10840 10841 len = 
Py_MIN(len1, len2); 10842 cmp = memcmp(data, str, len); 10843 if (cmp != 0) { 10844 if (cmp < 0) 10845 return -1; 10846 else 10847 return 1; 10848 } 10849 if (len1 > len2) 10850 return 1; /* uni is longer */ 10851 if (len1 < len2) 10852 return -1; /* str is longer */ 10853 return 0; 10854 } 10855 else { 10856 void *data = PyUnicode_DATA(uni); 10857 /* Compare Unicode string and source character set string */ 10858 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++) 10859 if (chr != (unsigned char)str[i]) 10860 return (chr < (unsigned char)(str[i])) ? -1 : 1; 10861 /* This check keeps Python strings that end in '\0' from comparing equal 10862 to C strings identical up to that point. */ 10863 if (PyUnicode_GET_LENGTH(uni) != i || chr) 10864 return 1; /* uni is longer */ 10865 if (str[i]) 10866 return -1; /* str is longer */ 10867 return 0; 10868 } 10869} 10870 10871 10872#define TEST_COND(cond) \ 10873 ((cond) ? Py_True : Py_False) 10874 10875PyObject * 10876PyUnicode_RichCompare(PyObject *left, PyObject *right, int op) 10877{ 10878 int result; 10879 PyObject *v; 10880 10881 if (!PyUnicode_Check(left) || !PyUnicode_Check(right)) 10882 Py_RETURN_NOTIMPLEMENTED; 10883 10884 if (PyUnicode_READY(left) == -1 || 10885 PyUnicode_READY(right) == -1) 10886 return NULL; 10887 10888 if (left == right) { 10889 switch (op) { 10890 case Py_EQ: 10891 case Py_LE: 10892 case Py_GE: 10893 /* a string is equal to itself */ 10894 v = Py_True; 10895 break; 10896 case Py_NE: 10897 case Py_LT: 10898 case Py_GT: 10899 v = Py_False; 10900 break; 10901 default: 10902 PyErr_BadArgument(); 10903 return NULL; 10904 } 10905 } 10906 else if (op == Py_EQ || op == Py_NE) { 10907 result = unicode_compare_eq(left, right); 10908 result ^= (op == Py_NE); 10909 v = TEST_COND(result); 10910 } 10911 else { 10912 result = unicode_compare(left, right); 10913 10914 /* Convert the return value to a Boolean */ 10915 switch (op) { 10916 case Py_LE: 10917 v = TEST_COND(result <= 0); 10918 break; 
10919 case Py_GE: 10920 v = TEST_COND(result >= 0); 10921 break; 10922 case Py_LT: 10923 v = TEST_COND(result == -1); 10924 break; 10925 case Py_GT: 10926 v = TEST_COND(result == 1); 10927 break; 10928 default: 10929 PyErr_BadArgument(); 10930 return NULL; 10931 } 10932 } 10933 Py_INCREF(v); 10934 return v; 10935} 10936 10937int 10938_PyUnicode_EQ(PyObject *aa, PyObject *bb) 10939{ 10940 return unicode_eq(aa, bb); 10941} 10942 10943int 10944PyUnicode_Contains(PyObject *container, PyObject *element) 10945{ 10946 PyObject *str, *sub; 10947 int kind1, kind2; 10948 void *buf1, *buf2; 10949 Py_ssize_t len1, len2; 10950 int result; 10951 10952 /* Coerce the two arguments */ 10953 sub = PyUnicode_FromObject(element); 10954 if (!sub) { 10955 PyErr_Format(PyExc_TypeError, 10956 "'in <string>' requires string as left operand, not %s", 10957 element->ob_type->tp_name); 10958 return -1; 10959 } 10960 10961 str = PyUnicode_FromObject(container); 10962 if (!str) { 10963 Py_DECREF(sub); 10964 return -1; 10965 } 10966 10967 kind1 = PyUnicode_KIND(str); 10968 kind2 = PyUnicode_KIND(sub); 10969 if (kind1 < kind2) { 10970 Py_DECREF(sub); 10971 Py_DECREF(str); 10972 return 0; 10973 } 10974 len1 = PyUnicode_GET_LENGTH(str); 10975 len2 = PyUnicode_GET_LENGTH(sub); 10976 if (len1 < len2) { 10977 Py_DECREF(sub); 10978 Py_DECREF(str); 10979 return 0; 10980 } 10981 buf1 = PyUnicode_DATA(str); 10982 buf2 = PyUnicode_DATA(sub); 10983 if (len2 == 1) { 10984 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0); 10985 result = findchar((const char *)buf1, kind1, len1, ch, 1) != -1; 10986 Py_DECREF(sub); 10987 Py_DECREF(str); 10988 return result; 10989 } 10990 if (kind2 != kind1) { 10991 buf2 = _PyUnicode_AsKind(sub, kind1); 10992 if (!buf2) { 10993 Py_DECREF(sub); 10994 Py_DECREF(str); 10995 return -1; 10996 } 10997 } 10998 10999 switch (kind1) { 11000 case PyUnicode_1BYTE_KIND: 11001 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1; 11002 break; 11003 case PyUnicode_2BYTE_KIND: 11004 result = 
ucs2lib_find(buf1, len1, buf2, len2, 0) != -1; 11005 break; 11006 case PyUnicode_4BYTE_KIND: 11007 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1; 11008 break; 11009 default: 11010 result = -1; 11011 assert(0); 11012 } 11013 11014 Py_DECREF(str); 11015 Py_DECREF(sub); 11016 11017 if (kind2 != kind1) 11018 PyMem_Free(buf2); 11019 11020 return result; 11021} 11022 11023/* Concat to string or Unicode object giving a new Unicode object. */ 11024 11025PyObject * 11026PyUnicode_Concat(PyObject *left, PyObject *right) 11027{ 11028 PyObject *u = NULL, *v = NULL, *w; 11029 Py_UCS4 maxchar, maxchar2; 11030 Py_ssize_t u_len, v_len, new_len; 11031 11032 /* Coerce the two arguments */ 11033 u = PyUnicode_FromObject(left); 11034 if (u == NULL) 11035 goto onError; 11036 v = PyUnicode_FromObject(right); 11037 if (v == NULL) 11038 goto onError; 11039 11040 /* Shortcuts */ 11041 if (v == unicode_empty) { 11042 Py_DECREF(v); 11043 return u; 11044 } 11045 if (u == unicode_empty) { 11046 Py_DECREF(u); 11047 return v; 11048 } 11049 11050 u_len = PyUnicode_GET_LENGTH(u); 11051 v_len = PyUnicode_GET_LENGTH(v); 11052 if (u_len > PY_SSIZE_T_MAX - v_len) { 11053 PyErr_SetString(PyExc_OverflowError, 11054 "strings are too large to concat"); 11055 goto onError; 11056 } 11057 new_len = u_len + v_len; 11058 11059 maxchar = PyUnicode_MAX_CHAR_VALUE(u); 11060 maxchar2 = PyUnicode_MAX_CHAR_VALUE(v); 11061 maxchar = Py_MAX(maxchar, maxchar2); 11062 11063 /* Concat the two Unicode strings */ 11064 w = PyUnicode_New(new_len, maxchar); 11065 if (w == NULL) 11066 goto onError; 11067 _PyUnicode_FastCopyCharacters(w, 0, u, 0, u_len); 11068 _PyUnicode_FastCopyCharacters(w, u_len, v, 0, v_len); 11069 Py_DECREF(u); 11070 Py_DECREF(v); 11071 assert(_PyUnicode_CheckConsistency(w, 1)); 11072 return w; 11073 11074 onError: 11075 Py_XDECREF(u); 11076 Py_XDECREF(v); 11077 return NULL; 11078} 11079 11080void 11081PyUnicode_Append(PyObject **p_left, PyObject *right) 11082{ 11083 PyObject *left, *res; 11084 
Py_UCS4 maxchar, maxchar2; 11085 Py_ssize_t left_len, right_len, new_len; 11086 11087 if (p_left == NULL) { 11088 if (!PyErr_Occurred()) 11089 PyErr_BadInternalCall(); 11090 return; 11091 } 11092 left = *p_left; 11093 if (right == NULL || left == NULL 11094 || !PyUnicode_Check(left) || !PyUnicode_Check(right)) { 11095 if (!PyErr_Occurred()) 11096 PyErr_BadInternalCall(); 11097 goto error; 11098 } 11099 11100 if (PyUnicode_READY(left) == -1) 11101 goto error; 11102 if (PyUnicode_READY(right) == -1) 11103 goto error; 11104 11105 /* Shortcuts */ 11106 if (left == unicode_empty) { 11107 Py_DECREF(left); 11108 Py_INCREF(right); 11109 *p_left = right; 11110 return; 11111 } 11112 if (right == unicode_empty) 11113 return; 11114 11115 left_len = PyUnicode_GET_LENGTH(left); 11116 right_len = PyUnicode_GET_LENGTH(right); 11117 if (left_len > PY_SSIZE_T_MAX - right_len) { 11118 PyErr_SetString(PyExc_OverflowError, 11119 "strings are too large to concat"); 11120 goto error; 11121 } 11122 new_len = left_len + right_len; 11123 11124 if (unicode_modifiable(left) 11125 && PyUnicode_CheckExact(right) 11126 && PyUnicode_KIND(right) <= PyUnicode_KIND(left) 11127 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires 11128 to change the structure size, but characters are stored just after 11129 the structure, and so it requires to move all characters which is 11130 not so different than duplicating the string. 
*/ 11131 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right))) 11132 { 11133 /* append inplace */ 11134 if (unicode_resize(p_left, new_len) != 0) 11135 goto error; 11136 11137 /* copy 'right' into the newly allocated area of 'left' */ 11138 _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len); 11139 } 11140 else { 11141 maxchar = PyUnicode_MAX_CHAR_VALUE(left); 11142 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right); 11143 maxchar = Py_MAX(maxchar, maxchar2); 11144 11145 /* Concat the two Unicode strings */ 11146 res = PyUnicode_New(new_len, maxchar); 11147 if (res == NULL) 11148 goto error; 11149 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len); 11150 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len); 11151 Py_DECREF(left); 11152 *p_left = res; 11153 } 11154 assert(_PyUnicode_CheckConsistency(*p_left, 1)); 11155 return; 11156 11157error: 11158 Py_CLEAR(*p_left); 11159} 11160 11161void 11162PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right) 11163{ 11164 PyUnicode_Append(pleft, right); 11165 Py_XDECREF(right); 11166} 11167 11168PyDoc_STRVAR(count__doc__, 11169 "S.count(sub[, start[, end]]) -> int\n\ 11170\n\ 11171Return the number of non-overlapping occurrences of substring sub in\n\ 11172string S[start:end]. 
Optional arguments start and end are\n\ 11173interpreted as in slice notation."); 11174 11175static PyObject * 11176unicode_count(PyObject *self, PyObject *args) 11177{ 11178 PyObject *substring = NULL; /* initialize to fix a compiler warning */ 11179 Py_ssize_t start = 0; 11180 Py_ssize_t end = PY_SSIZE_T_MAX; 11181 PyObject *result; 11182 int kind1, kind2; 11183 void *buf1, *buf2; 11184 Py_ssize_t len1, len2, iresult; 11185 11186 if (!stringlib_parse_args_finds_unicode("count", args, &substring, 11187 &start, &end)) 11188 return NULL; 11189 11190 kind1 = PyUnicode_KIND(self); 11191 kind2 = PyUnicode_KIND(substring); 11192 if (kind1 < kind2) { 11193 Py_DECREF(substring); 11194 return PyLong_FromLong(0); 11195 } 11196 len1 = PyUnicode_GET_LENGTH(self); 11197 len2 = PyUnicode_GET_LENGTH(substring); 11198 ADJUST_INDICES(start, end, len1); 11199 if (end - start < len2) { 11200 Py_DECREF(substring); 11201 return PyLong_FromLong(0); 11202 } 11203 buf1 = PyUnicode_DATA(self); 11204 buf2 = PyUnicode_DATA(substring); 11205 if (kind2 != kind1) { 11206 buf2 = _PyUnicode_AsKind(substring, kind1); 11207 if (!buf2) { 11208 Py_DECREF(substring); 11209 return NULL; 11210 } 11211 } 11212 switch (kind1) { 11213 case PyUnicode_1BYTE_KIND: 11214 iresult = ucs1lib_count( 11215 ((Py_UCS1*)buf1) + start, end - start, 11216 buf2, len2, PY_SSIZE_T_MAX 11217 ); 11218 break; 11219 case PyUnicode_2BYTE_KIND: 11220 iresult = ucs2lib_count( 11221 ((Py_UCS2*)buf1) + start, end - start, 11222 buf2, len2, PY_SSIZE_T_MAX 11223 ); 11224 break; 11225 case PyUnicode_4BYTE_KIND: 11226 iresult = ucs4lib_count( 11227 ((Py_UCS4*)buf1) + start, end - start, 11228 buf2, len2, PY_SSIZE_T_MAX 11229 ); 11230 break; 11231 default: 11232 assert(0); iresult = 0; 11233 } 11234 11235 result = PyLong_FromSsize_t(iresult); 11236 11237 if (kind2 != kind1) 11238 PyMem_Free(buf2); 11239 11240 Py_DECREF(substring); 11241 11242 return result; 11243} 11244 11245PyDoc_STRVAR(encode__doc__, 11246 "S.encode(encoding='utf-8', 
errors='strict') -> bytes\n\ 11247\n\ 11248Encode S using the codec registered for encoding. Default encoding\n\ 11249is 'utf-8'. errors may be given to set a different error\n\ 11250handling scheme. Default is 'strict' meaning that encoding errors raise\n\ 11251a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\ 11252'xmlcharrefreplace' as well as any other name registered with\n\ 11253codecs.register_error that can handle UnicodeEncodeErrors."); 11254 11255static PyObject * 11256unicode_encode(PyObject *self, PyObject *args, PyObject *kwargs) 11257{ 11258 static char *kwlist[] = {"encoding", "errors", 0}; 11259 char *encoding = NULL; 11260 char *errors = NULL; 11261 11262 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode", 11263 kwlist, &encoding, &errors)) 11264 return NULL; 11265 return PyUnicode_AsEncodedString(self, encoding, errors); 11266} 11267 11268PyDoc_STRVAR(expandtabs__doc__, 11269 "S.expandtabs(tabsize=8) -> str\n\ 11270\n\ 11271Return a copy of S where all tab characters are expanded using spaces.\n\ 11272If tabsize is not given, a tab size of 8 characters is assumed."); 11273 11274static PyObject* 11275unicode_expandtabs(PyObject *self, PyObject *args, PyObject *kwds) 11276{ 11277 Py_ssize_t i, j, line_pos, src_len, incr; 11278 Py_UCS4 ch; 11279 PyObject *u; 11280 void *src_data, *dest_data; 11281 static char *kwlist[] = {"tabsize", 0}; 11282 int tabsize = 8; 11283 int kind; 11284 int found; 11285 11286 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:expandtabs", 11287 kwlist, &tabsize)) 11288 return NULL; 11289 11290 if (PyUnicode_READY(self) == -1) 11291 return NULL; 11292 11293 /* First pass: determine size of output string */ 11294 src_len = PyUnicode_GET_LENGTH(self); 11295 i = j = line_pos = 0; 11296 kind = PyUnicode_KIND(self); 11297 src_data = PyUnicode_DATA(self); 11298 found = 0; 11299 for (; i < src_len; i++) { 11300 ch = PyUnicode_READ(kind, src_data, i); 11301 if (ch == '\t') { 11302 found = 1; 11303 if 
(tabsize > 0) { 11304 incr = tabsize - (line_pos % tabsize); /* cannot overflow */ 11305 if (j > PY_SSIZE_T_MAX - incr) 11306 goto overflow; 11307 line_pos += incr; 11308 j += incr; 11309 } 11310 } 11311 else { 11312 if (j > PY_SSIZE_T_MAX - 1) 11313 goto overflow; 11314 line_pos++; 11315 j++; 11316 if (ch == '\n' || ch == '\r') 11317 line_pos = 0; 11318 } 11319 } 11320 if (!found) 11321 return unicode_result_unchanged(self); 11322 11323 /* Second pass: create output string and fill it */ 11324 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self)); 11325 if (!u) 11326 return NULL; 11327 dest_data = PyUnicode_DATA(u); 11328 11329 i = j = line_pos = 0; 11330 11331 for (; i < src_len; i++) { 11332 ch = PyUnicode_READ(kind, src_data, i); 11333 if (ch == '\t') { 11334 if (tabsize > 0) { 11335 incr = tabsize - (line_pos % tabsize); 11336 line_pos += incr; 11337 FILL(kind, dest_data, ' ', j, incr); 11338 j += incr; 11339 } 11340 } 11341 else { 11342 line_pos++; 11343 PyUnicode_WRITE(kind, dest_data, j, ch); 11344 j++; 11345 if (ch == '\n' || ch == '\r') 11346 line_pos = 0; 11347 } 11348 } 11349 assert (j == PyUnicode_GET_LENGTH(u)); 11350 return unicode_result(u); 11351 11352 overflow: 11353 PyErr_SetString(PyExc_OverflowError, "new string is too long"); 11354 return NULL; 11355} 11356 11357PyDoc_STRVAR(find__doc__, 11358 "S.find(sub[, start[, end]]) -> int\n\ 11359\n\ 11360Return the lowest index in S where substring sub is found,\n\ 11361such that sub is contained within S[start:end]. 
Optional\n\ 11362arguments start and end are interpreted as in slice notation.\n\ 11363\n\ 11364Return -1 on failure."); 11365 11366static PyObject * 11367unicode_find(PyObject *self, PyObject *args) 11368{ 11369 /* initialize variables to prevent gcc warning */ 11370 PyObject *substring = NULL; 11371 Py_ssize_t start = 0; 11372 Py_ssize_t end = 0; 11373 Py_ssize_t result; 11374 11375 if (!stringlib_parse_args_finds_unicode("find", args, &substring, 11376 &start, &end)) 11377 return NULL; 11378 11379 if (PyUnicode_READY(self) == -1) { 11380 Py_DECREF(substring); 11381 return NULL; 11382 } 11383 if (PyUnicode_READY(substring) == -1) { 11384 Py_DECREF(substring); 11385 return NULL; 11386 } 11387 11388 result = any_find_slice(1, self, substring, start, end); 11389 11390 Py_DECREF(substring); 11391 11392 if (result == -2) 11393 return NULL; 11394 11395 return PyLong_FromSsize_t(result); 11396} 11397 11398static PyObject * 11399unicode_getitem(PyObject *self, Py_ssize_t index) 11400{ 11401 void *data; 11402 enum PyUnicode_Kind kind; 11403 Py_UCS4 ch; 11404 11405 if (!PyUnicode_Check(self) || PyUnicode_READY(self) == -1) { 11406 PyErr_BadArgument(); 11407 return NULL; 11408 } 11409 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) { 11410 PyErr_SetString(PyExc_IndexError, "string index out of range"); 11411 return NULL; 11412 } 11413 kind = PyUnicode_KIND(self); 11414 data = PyUnicode_DATA(self); 11415 ch = PyUnicode_READ(kind, data, index); 11416 return unicode_char(ch); 11417} 11418 11419/* Believe it or not, this produces the same value for ASCII strings 11420 as bytes_hash(). */ 11421static Py_hash_t 11422unicode_hash(PyObject *self) 11423{ 11424 Py_ssize_t len; 11425 Py_uhash_t x; /* Unsigned for defined overflow behavior. 
*/ 11426 11427#ifdef Py_DEBUG 11428 assert(_Py_HashSecret_Initialized); 11429#endif 11430 if (_PyUnicode_HASH(self) != -1) 11431 return _PyUnicode_HASH(self); 11432 if (PyUnicode_READY(self) == -1) 11433 return -1; 11434 len = PyUnicode_GET_LENGTH(self); 11435 /* 11436 We make the hash of the empty string be 0, rather than using 11437 (prefix ^ suffix), since this slightly obfuscates the hash secret 11438 */ 11439 if (len == 0) { 11440 _PyUnicode_HASH(self) = 0; 11441 return 0; 11442 } 11443 x = _Py_HashBytes(PyUnicode_DATA(self), 11444 PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self)); 11445 _PyUnicode_HASH(self) = x; 11446 return x; 11447} 11448 11449PyDoc_STRVAR(index__doc__, 11450 "S.index(sub[, start[, end]]) -> int\n\ 11451\n\ 11452Like S.find() but raise ValueError when the substring is not found."); 11453 11454static PyObject * 11455unicode_index(PyObject *self, PyObject *args) 11456{ 11457 /* initialize variables to prevent gcc warning */ 11458 Py_ssize_t result; 11459 PyObject *substring = NULL; 11460 Py_ssize_t start = 0; 11461 Py_ssize_t end = 0; 11462 11463 if (!stringlib_parse_args_finds_unicode("index", args, &substring, 11464 &start, &end)) 11465 return NULL; 11466 11467 if (PyUnicode_READY(self) == -1) { 11468 Py_DECREF(substring); 11469 return NULL; 11470 } 11471 if (PyUnicode_READY(substring) == -1) { 11472 Py_DECREF(substring); 11473 return NULL; 11474 } 11475 11476 result = any_find_slice(1, self, substring, start, end); 11477 11478 Py_DECREF(substring); 11479 11480 if (result == -2) 11481 return NULL; 11482 11483 if (result < 0) { 11484 PyErr_SetString(PyExc_ValueError, "substring not found"); 11485 return NULL; 11486 } 11487 11488 return PyLong_FromSsize_t(result); 11489} 11490 11491PyDoc_STRVAR(islower__doc__, 11492 "S.islower() -> bool\n\ 11493\n\ 11494Return True if all cased characters in S are lowercase and there is\n\ 11495at least one cased character in S, False otherwise."); 11496 11497static PyObject* 11498unicode_islower(PyObject 
*self) 11499{ 11500 Py_ssize_t i, length; 11501 int kind; 11502 void *data; 11503 int cased; 11504 11505 if (PyUnicode_READY(self) == -1) 11506 return NULL; 11507 length = PyUnicode_GET_LENGTH(self); 11508 kind = PyUnicode_KIND(self); 11509 data = PyUnicode_DATA(self); 11510 11511 /* Shortcut for single character strings */ 11512 if (length == 1) 11513 return PyBool_FromLong( 11514 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0))); 11515 11516 /* Special case for empty strings */ 11517 if (length == 0) 11518 return PyBool_FromLong(0); 11519 11520 cased = 0; 11521 for (i = 0; i < length; i++) { 11522 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); 11523 11524 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) 11525 return PyBool_FromLong(0); 11526 else if (!cased && Py_UNICODE_ISLOWER(ch)) 11527 cased = 1; 11528 } 11529 return PyBool_FromLong(cased); 11530} 11531 11532PyDoc_STRVAR(isupper__doc__, 11533 "S.isupper() -> bool\n\ 11534\n\ 11535Return True if all cased characters in S are uppercase and there is\n\ 11536at least one cased character in S, False otherwise."); 11537 11538static PyObject* 11539unicode_isupper(PyObject *self) 11540{ 11541 Py_ssize_t i, length; 11542 int kind; 11543 void *data; 11544 int cased; 11545 11546 if (PyUnicode_READY(self) == -1) 11547 return NULL; 11548 length = PyUnicode_GET_LENGTH(self); 11549 kind = PyUnicode_KIND(self); 11550 data = PyUnicode_DATA(self); 11551 11552 /* Shortcut for single character strings */ 11553 if (length == 1) 11554 return PyBool_FromLong( 11555 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0); 11556 11557 /* Special case for empty strings */ 11558 if (length == 0) 11559 return PyBool_FromLong(0); 11560 11561 cased = 0; 11562 for (i = 0; i < length; i++) { 11563 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); 11564 11565 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch)) 11566 return PyBool_FromLong(0); 11567 else if (!cased && Py_UNICODE_ISUPPER(ch)) 11568 cased = 1; 11569 } 11570 return 
PyBool_FromLong(cased); 11571} 11572 11573PyDoc_STRVAR(istitle__doc__, 11574 "S.istitle() -> bool\n\ 11575\n\ 11576Return True if S is a titlecased string and there is at least one\n\ 11577character in S, i.e. upper- and titlecase characters may only\n\ 11578follow uncased characters and lowercase characters only cased ones.\n\ 11579Return False otherwise."); 11580 11581static PyObject* 11582unicode_istitle(PyObject *self) 11583{ 11584 Py_ssize_t i, length; 11585 int kind; 11586 void *data; 11587 int cased, previous_is_cased; 11588 11589 if (PyUnicode_READY(self) == -1) 11590 return NULL; 11591 length = PyUnicode_GET_LENGTH(self); 11592 kind = PyUnicode_KIND(self); 11593 data = PyUnicode_DATA(self); 11594 11595 /* Shortcut for single character strings */ 11596 if (length == 1) { 11597 Py_UCS4 ch = PyUnicode_READ(kind, data, 0); 11598 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) || 11599 (Py_UNICODE_ISUPPER(ch) != 0)); 11600 } 11601 11602 /* Special case for empty strings */ 11603 if (length == 0) 11604 return PyBool_FromLong(0); 11605 11606 cased = 0; 11607 previous_is_cased = 0; 11608 for (i = 0; i < length; i++) { 11609 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); 11610 11611 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) { 11612 if (previous_is_cased) 11613 return PyBool_FromLong(0); 11614 previous_is_cased = 1; 11615 cased = 1; 11616 } 11617 else if (Py_UNICODE_ISLOWER(ch)) { 11618 if (!previous_is_cased) 11619 return PyBool_FromLong(0); 11620 previous_is_cased = 1; 11621 cased = 1; 11622 } 11623 else 11624 previous_is_cased = 0; 11625 } 11626 return PyBool_FromLong(cased); 11627} 11628 11629PyDoc_STRVAR(isspace__doc__, 11630 "S.isspace() -> bool\n\ 11631\n\ 11632Return True if all characters in S are whitespace\n\ 11633and there is at least one character in S, False otherwise."); 11634 11635static PyObject* 11636unicode_isspace(PyObject *self) 11637{ 11638 Py_ssize_t i, length; 11639 int kind; 11640 void *data; 11641 11642 if 
(PyUnicode_READY(self) == -1) 11643 return NULL; 11644 length = PyUnicode_GET_LENGTH(self); 11645 kind = PyUnicode_KIND(self); 11646 data = PyUnicode_DATA(self); 11647 11648 /* Shortcut for single character strings */ 11649 if (length == 1) 11650 return PyBool_FromLong( 11651 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0))); 11652 11653 /* Special case for empty strings */ 11654 if (length == 0) 11655 return PyBool_FromLong(0); 11656 11657 for (i = 0; i < length; i++) { 11658 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); 11659 if (!Py_UNICODE_ISSPACE(ch)) 11660 return PyBool_FromLong(0); 11661 } 11662 return PyBool_FromLong(1); 11663} 11664 11665PyDoc_STRVAR(isalpha__doc__, 11666 "S.isalpha() -> bool\n\ 11667\n\ 11668Return True if all characters in S are alphabetic\n\ 11669and there is at least one character in S, False otherwise."); 11670 11671static PyObject* 11672unicode_isalpha(PyObject *self) 11673{ 11674 Py_ssize_t i, length; 11675 int kind; 11676 void *data; 11677 11678 if (PyUnicode_READY(self) == -1) 11679 return NULL; 11680 length = PyUnicode_GET_LENGTH(self); 11681 kind = PyUnicode_KIND(self); 11682 data = PyUnicode_DATA(self); 11683 11684 /* Shortcut for single character strings */ 11685 if (length == 1) 11686 return PyBool_FromLong( 11687 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0))); 11688 11689 /* Special case for empty strings */ 11690 if (length == 0) 11691 return PyBool_FromLong(0); 11692 11693 for (i = 0; i < length; i++) { 11694 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i))) 11695 return PyBool_FromLong(0); 11696 } 11697 return PyBool_FromLong(1); 11698} 11699 11700PyDoc_STRVAR(isalnum__doc__, 11701 "S.isalnum() -> bool\n\ 11702\n\ 11703Return True if all characters in S are alphanumeric\n\ 11704and there is at least one character in S, False otherwise."); 11705 11706static PyObject* 11707unicode_isalnum(PyObject *self) 11708{ 11709 int kind; 11710 void *data; 11711 Py_ssize_t len, i; 11712 11713 if (PyUnicode_READY(self) == 
-1) 11714 return NULL; 11715 11716 kind = PyUnicode_KIND(self); 11717 data = PyUnicode_DATA(self); 11718 len = PyUnicode_GET_LENGTH(self); 11719 11720 /* Shortcut for single character strings */ 11721 if (len == 1) { 11722 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0); 11723 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch)); 11724 } 11725 11726 /* Special case for empty strings */ 11727 if (len == 0) 11728 return PyBool_FromLong(0); 11729 11730 for (i = 0; i < len; i++) { 11731 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); 11732 if (!Py_UNICODE_ISALNUM(ch)) 11733 return PyBool_FromLong(0); 11734 } 11735 return PyBool_FromLong(1); 11736} 11737 11738PyDoc_STRVAR(isdecimal__doc__, 11739 "S.isdecimal() -> bool\n\ 11740\n\ 11741Return True if there are only decimal characters in S,\n\ 11742False otherwise."); 11743 11744static PyObject* 11745unicode_isdecimal(PyObject *self) 11746{ 11747 Py_ssize_t i, length; 11748 int kind; 11749 void *data; 11750 11751 if (PyUnicode_READY(self) == -1) 11752 return NULL; 11753 length = PyUnicode_GET_LENGTH(self); 11754 kind = PyUnicode_KIND(self); 11755 data = PyUnicode_DATA(self); 11756 11757 /* Shortcut for single character strings */ 11758 if (length == 1) 11759 return PyBool_FromLong( 11760 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0))); 11761 11762 /* Special case for empty strings */ 11763 if (length == 0) 11764 return PyBool_FromLong(0); 11765 11766 for (i = 0; i < length; i++) { 11767 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i))) 11768 return PyBool_FromLong(0); 11769 } 11770 return PyBool_FromLong(1); 11771} 11772 11773PyDoc_STRVAR(isdigit__doc__, 11774 "S.isdigit() -> bool\n\ 11775\n\ 11776Return True if all characters in S are digits\n\ 11777and there is at least one character in S, False otherwise."); 11778 11779static PyObject* 11780unicode_isdigit(PyObject *self) 11781{ 11782 Py_ssize_t i, length; 11783 int kind; 11784 void *data; 11785 11786 if (PyUnicode_READY(self) == -1) 11787 return NULL; 11788 
length = PyUnicode_GET_LENGTH(self); 11789 kind = PyUnicode_KIND(self); 11790 data = PyUnicode_DATA(self); 11791 11792 /* Shortcut for single character strings */ 11793 if (length == 1) { 11794 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0); 11795 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch)); 11796 } 11797 11798 /* Special case for empty strings */ 11799 if (length == 0) 11800 return PyBool_FromLong(0); 11801 11802 for (i = 0; i < length; i++) { 11803 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i))) 11804 return PyBool_FromLong(0); 11805 } 11806 return PyBool_FromLong(1); 11807} 11808 11809PyDoc_STRVAR(isnumeric__doc__, 11810 "S.isnumeric() -> bool\n\ 11811\n\ 11812Return True if there are only numeric characters in S,\n\ 11813False otherwise."); 11814 11815static PyObject* 11816unicode_isnumeric(PyObject *self) 11817{ 11818 Py_ssize_t i, length; 11819 int kind; 11820 void *data; 11821 11822 if (PyUnicode_READY(self) == -1) 11823 return NULL; 11824 length = PyUnicode_GET_LENGTH(self); 11825 kind = PyUnicode_KIND(self); 11826 data = PyUnicode_DATA(self); 11827 11828 /* Shortcut for single character strings */ 11829 if (length == 1) 11830 return PyBool_FromLong( 11831 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0))); 11832 11833 /* Special case for empty strings */ 11834 if (length == 0) 11835 return PyBool_FromLong(0); 11836 11837 for (i = 0; i < length; i++) { 11838 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i))) 11839 return PyBool_FromLong(0); 11840 } 11841 return PyBool_FromLong(1); 11842} 11843 11844int 11845PyUnicode_IsIdentifier(PyObject *self) 11846{ 11847 int kind; 11848 void *data; 11849 Py_ssize_t i; 11850 Py_UCS4 first; 11851 11852 if (PyUnicode_READY(self) == -1) { 11853 Py_FatalError("identifier not ready"); 11854 return 0; 11855 } 11856 11857 /* Special case for empty strings */ 11858 if (PyUnicode_GET_LENGTH(self) == 0) 11859 return 0; 11860 kind = PyUnicode_KIND(self); 11861 data = PyUnicode_DATA(self); 11862 11863 /* PEP 
3131 says that the first character must be in 11864 XID_Start and subsequent characters in XID_Continue, 11865 and for the ASCII range, the 2.x rules apply (i.e 11866 start with letters and underscore, continue with 11867 letters, digits, underscore). However, given the current 11868 definition of XID_Start and XID_Continue, it is sufficient 11869 to check just for these, except that _ must be allowed 11870 as starting an identifier. */ 11871 first = PyUnicode_READ(kind, data, 0); 11872 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */) 11873 return 0; 11874 11875 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++) 11876 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i))) 11877 return 0; 11878 return 1; 11879} 11880 11881PyDoc_STRVAR(isidentifier__doc__, 11882 "S.isidentifier() -> bool\n\ 11883\n\ 11884Return True if S is a valid identifier according\n\ 11885to the language definition.\n\ 11886\n\ 11887Use keyword.iskeyword() to test for reserved identifiers\n\ 11888such as \"def\" and \"class\".\n"); 11889 11890static PyObject* 11891unicode_isidentifier(PyObject *self) 11892{ 11893 return PyBool_FromLong(PyUnicode_IsIdentifier(self)); 11894} 11895 11896PyDoc_STRVAR(isprintable__doc__, 11897 "S.isprintable() -> bool\n\ 11898\n\ 11899Return True if all characters in S are considered\n\ 11900printable in repr() or S is empty, False otherwise."); 11901 11902static PyObject* 11903unicode_isprintable(PyObject *self) 11904{ 11905 Py_ssize_t i, length; 11906 int kind; 11907 void *data; 11908 11909 if (PyUnicode_READY(self) == -1) 11910 return NULL; 11911 length = PyUnicode_GET_LENGTH(self); 11912 kind = PyUnicode_KIND(self); 11913 data = PyUnicode_DATA(self); 11914 11915 /* Shortcut for single character strings */ 11916 if (length == 1) 11917 return PyBool_FromLong( 11918 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0))); 11919 11920 for (i = 0; i < length; i++) { 11921 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) { 11922 
Py_RETURN_FALSE; 11923 } 11924 } 11925 Py_RETURN_TRUE; 11926} 11927 11928PyDoc_STRVAR(join__doc__, 11929 "S.join(iterable) -> str\n\ 11930\n\ 11931Return a string which is the concatenation of the strings in the\n\ 11932iterable. The separator between elements is S."); 11933 11934static PyObject* 11935unicode_join(PyObject *self, PyObject *data) 11936{ 11937 return PyUnicode_Join(self, data); 11938} 11939 11940static Py_ssize_t 11941unicode_length(PyObject *self) 11942{ 11943 if (PyUnicode_READY(self) == -1) 11944 return -1; 11945 return PyUnicode_GET_LENGTH(self); 11946} 11947 11948PyDoc_STRVAR(ljust__doc__, 11949 "S.ljust(width[, fillchar]) -> str\n\ 11950\n\ 11951Return S left-justified in a Unicode string of length width. Padding is\n\ 11952done using the specified fill character (default is a space)."); 11953 11954static PyObject * 11955unicode_ljust(PyObject *self, PyObject *args) 11956{ 11957 Py_ssize_t width; 11958 Py_UCS4 fillchar = ' '; 11959 11960 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar)) 11961 return NULL; 11962 11963 if (PyUnicode_READY(self) == -1) 11964 return NULL; 11965 11966 if (PyUnicode_GET_LENGTH(self) >= width) 11967 return unicode_result_unchanged(self); 11968 11969 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar); 11970} 11971 11972PyDoc_STRVAR(lower__doc__, 11973 "S.lower() -> str\n\ 11974\n\ 11975Return a copy of the string S converted to lowercase."); 11976 11977static PyObject* 11978unicode_lower(PyObject *self) 11979{ 11980 if (PyUnicode_READY(self) == -1) 11981 return NULL; 11982 if (PyUnicode_IS_ASCII(self)) 11983 return ascii_upper_or_lower(self, 1); 11984 return case_operation(self, do_lower); 11985} 11986 11987#define LEFTSTRIP 0 11988#define RIGHTSTRIP 1 11989#define BOTHSTRIP 2 11990 11991/* Arrays indexed by above */ 11992static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"}; 11993 11994#define STRIPNAME(i) (stripformat[i]+3) 11995 11996/* externally visible 
for str.strip(unicode) */ 11997PyObject * 11998_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj) 11999{ 12000 void *data; 12001 int kind; 12002 Py_ssize_t i, j, len; 12003 BLOOM_MASK sepmask; 12004 Py_ssize_t seplen; 12005 12006 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1) 12007 return NULL; 12008 12009 kind = PyUnicode_KIND(self); 12010 data = PyUnicode_DATA(self); 12011 len = PyUnicode_GET_LENGTH(self); 12012 seplen = PyUnicode_GET_LENGTH(sepobj); 12013 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj), 12014 PyUnicode_DATA(sepobj), 12015 seplen); 12016 12017 i = 0; 12018 if (striptype != RIGHTSTRIP) { 12019 while (i < len) { 12020 Py_UCS4 ch = PyUnicode_READ(kind, data, i); 12021 if (!BLOOM(sepmask, ch)) 12022 break; 12023 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0) 12024 break; 12025 i++; 12026 } 12027 } 12028 12029 j = len; 12030 if (striptype != LEFTSTRIP) { 12031 j--; 12032 while (j >= i) { 12033 Py_UCS4 ch = PyUnicode_READ(kind, data, j); 12034 if (!BLOOM(sepmask, ch)) 12035 break; 12036 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0) 12037 break; 12038 j--; 12039 } 12040 12041 j++; 12042 } 12043 12044 return PyUnicode_Substring(self, i, j); 12045} 12046 12047PyObject* 12048PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end) 12049{ 12050 unsigned char *data; 12051 int kind; 12052 Py_ssize_t length; 12053 12054 if (PyUnicode_READY(self) == -1) 12055 return NULL; 12056 12057 length = PyUnicode_GET_LENGTH(self); 12058 end = Py_MIN(end, length); 12059 12060 if (start == 0 && end == length) 12061 return unicode_result_unchanged(self); 12062 12063 if (start < 0 || end < 0) { 12064 PyErr_SetString(PyExc_IndexError, "string index out of range"); 12065 return NULL; 12066 } 12067 if (start >= length || end < start) 12068 _Py_RETURN_UNICODE_EMPTY(); 12069 12070 length = end - start; 12071 if (PyUnicode_IS_ASCII(self)) { 12072 data = PyUnicode_1BYTE_DATA(self); 12073 return 
_PyUnicode_FromASCII((char*)(data + start), length); 12074 } 12075 else { 12076 kind = PyUnicode_KIND(self); 12077 data = PyUnicode_1BYTE_DATA(self); 12078 return PyUnicode_FromKindAndData(kind, 12079 data + kind * start, 12080 length); 12081 } 12082} 12083 12084static PyObject * 12085do_strip(PyObject *self, int striptype) 12086{ 12087 Py_ssize_t len, i, j; 12088 12089 if (PyUnicode_READY(self) == -1) 12090 return NULL; 12091 12092 len = PyUnicode_GET_LENGTH(self); 12093 12094 if (PyUnicode_IS_ASCII(self)) { 12095 Py_UCS1 *data = PyUnicode_1BYTE_DATA(self); 12096 12097 i = 0; 12098 if (striptype != RIGHTSTRIP) { 12099 while (i < len) { 12100 Py_UCS1 ch = data[i]; 12101 if (!_Py_ascii_whitespace[ch]) 12102 break; 12103 i++; 12104 } 12105 } 12106 12107 j = len; 12108 if (striptype != LEFTSTRIP) { 12109 j--; 12110 while (j >= i) { 12111 Py_UCS1 ch = data[j]; 12112 if (!_Py_ascii_whitespace[ch]) 12113 break; 12114 j--; 12115 } 12116 j++; 12117 } 12118 } 12119 else { 12120 int kind = PyUnicode_KIND(self); 12121 void *data = PyUnicode_DATA(self); 12122 12123 i = 0; 12124 if (striptype != RIGHTSTRIP) { 12125 while (i < len) { 12126 Py_UCS4 ch = PyUnicode_READ(kind, data, i); 12127 if (!Py_UNICODE_ISSPACE(ch)) 12128 break; 12129 i++; 12130 } 12131 } 12132 12133 j = len; 12134 if (striptype != LEFTSTRIP) { 12135 j--; 12136 while (j >= i) { 12137 Py_UCS4 ch = PyUnicode_READ(kind, data, j); 12138 if (!Py_UNICODE_ISSPACE(ch)) 12139 break; 12140 j--; 12141 } 12142 j++; 12143 } 12144 } 12145 12146 return PyUnicode_Substring(self, i, j); 12147} 12148 12149 12150static PyObject * 12151do_argstrip(PyObject *self, int striptype, PyObject *args) 12152{ 12153 PyObject *sep = NULL; 12154 12155 if (!PyArg_ParseTuple(args, stripformat[striptype], &sep)) 12156 return NULL; 12157 12158 if (sep != NULL && sep != Py_None) { 12159 if (PyUnicode_Check(sep)) 12160 return _PyUnicode_XStrip(self, striptype, sep); 12161 else { 12162 PyErr_Format(PyExc_TypeError, 12163 "%s arg must be None or 
str", 12164 STRIPNAME(striptype)); 12165 return NULL; 12166 } 12167 } 12168 12169 return do_strip(self, striptype); 12170} 12171 12172 12173PyDoc_STRVAR(strip__doc__, 12174 "S.strip([chars]) -> str\n\ 12175\n\ 12176Return a copy of the string S with leading and trailing\n\ 12177whitespace removed.\n\ 12178If chars is given and not None, remove characters in chars instead."); 12179 12180static PyObject * 12181unicode_strip(PyObject *self, PyObject *args) 12182{ 12183 if (PyTuple_GET_SIZE(args) == 0) 12184 return do_strip(self, BOTHSTRIP); /* Common case */ 12185 else 12186 return do_argstrip(self, BOTHSTRIP, args); 12187} 12188 12189 12190PyDoc_STRVAR(lstrip__doc__, 12191 "S.lstrip([chars]) -> str\n\ 12192\n\ 12193Return a copy of the string S with leading whitespace removed.\n\ 12194If chars is given and not None, remove characters in chars instead."); 12195 12196static PyObject * 12197unicode_lstrip(PyObject *self, PyObject *args) 12198{ 12199 if (PyTuple_GET_SIZE(args) == 0) 12200 return do_strip(self, LEFTSTRIP); /* Common case */ 12201 else 12202 return do_argstrip(self, LEFTSTRIP, args); 12203} 12204 12205 12206PyDoc_STRVAR(rstrip__doc__, 12207 "S.rstrip([chars]) -> str\n\ 12208\n\ 12209Return a copy of the string S with trailing whitespace removed.\n\ 12210If chars is given and not None, remove characters in chars instead."); 12211 12212static PyObject * 12213unicode_rstrip(PyObject *self, PyObject *args) 12214{ 12215 if (PyTuple_GET_SIZE(args) == 0) 12216 return do_strip(self, RIGHTSTRIP); /* Common case */ 12217 else 12218 return do_argstrip(self, RIGHTSTRIP, args); 12219} 12220 12221 12222static PyObject* 12223unicode_repeat(PyObject *str, Py_ssize_t len) 12224{ 12225 PyObject *u; 12226 Py_ssize_t nchars, n; 12227 12228 if (len < 1) 12229 _Py_RETURN_UNICODE_EMPTY(); 12230 12231 /* no repeat, return original string */ 12232 if (len == 1) 12233 return unicode_result_unchanged(str); 12234 12235 if (PyUnicode_READY(str) == -1) 12236 return NULL; 12237 12238 if 
(PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) { 12239 PyErr_SetString(PyExc_OverflowError, 12240 "repeated string is too long"); 12241 return NULL; 12242 } 12243 nchars = len * PyUnicode_GET_LENGTH(str); 12244 12245 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str)); 12246 if (!u) 12247 return NULL; 12248 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str)); 12249 12250 if (PyUnicode_GET_LENGTH(str) == 1) { 12251 const int kind = PyUnicode_KIND(str); 12252 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0); 12253 if (kind == PyUnicode_1BYTE_KIND) { 12254 void *to = PyUnicode_DATA(u); 12255 memset(to, (unsigned char)fill_char, len); 12256 } 12257 else if (kind == PyUnicode_2BYTE_KIND) { 12258 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u); 12259 for (n = 0; n < len; ++n) 12260 ucs2[n] = fill_char; 12261 } else { 12262 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u); 12263 assert(kind == PyUnicode_4BYTE_KIND); 12264 for (n = 0; n < len; ++n) 12265 ucs4[n] = fill_char; 12266 } 12267 } 12268 else { 12269 /* number of characters copied this far */ 12270 Py_ssize_t done = PyUnicode_GET_LENGTH(str); 12271 const Py_ssize_t char_size = PyUnicode_KIND(str); 12272 char *to = (char *) PyUnicode_DATA(u); 12273 Py_MEMCPY(to, PyUnicode_DATA(str), 12274 PyUnicode_GET_LENGTH(str) * char_size); 12275 while (done < nchars) { 12276 n = (done <= nchars-done) ? 
done : nchars-done; 12277 Py_MEMCPY(to + (done * char_size), to, n * char_size); 12278 done += n; 12279 } 12280 } 12281 12282 assert(_PyUnicode_CheckConsistency(u, 1)); 12283 return u; 12284} 12285 12286PyObject * 12287PyUnicode_Replace(PyObject *obj, 12288 PyObject *subobj, 12289 PyObject *replobj, 12290 Py_ssize_t maxcount) 12291{ 12292 PyObject *self; 12293 PyObject *str1; 12294 PyObject *str2; 12295 PyObject *result; 12296 12297 self = PyUnicode_FromObject(obj); 12298 if (self == NULL) 12299 return NULL; 12300 str1 = PyUnicode_FromObject(subobj); 12301 if (str1 == NULL) { 12302 Py_DECREF(self); 12303 return NULL; 12304 } 12305 str2 = PyUnicode_FromObject(replobj); 12306 if (str2 == NULL) { 12307 Py_DECREF(self); 12308 Py_DECREF(str1); 12309 return NULL; 12310 } 12311 if (PyUnicode_READY(self) == -1 || 12312 PyUnicode_READY(str1) == -1 || 12313 PyUnicode_READY(str2) == -1) 12314 result = NULL; 12315 else 12316 result = replace(self, str1, str2, maxcount); 12317 Py_DECREF(self); 12318 Py_DECREF(str1); 12319 Py_DECREF(str2); 12320 return result; 12321} 12322 12323PyDoc_STRVAR(replace__doc__, 12324 "S.replace(old, new[, count]) -> str\n\ 12325\n\ 12326Return a copy of S with all occurrences of substring\n\ 12327old replaced by new. 
If the optional argument count is\n\ 12328given, only the first count occurrences are replaced."); 12329 12330static PyObject* 12331unicode_replace(PyObject *self, PyObject *args) 12332{ 12333 PyObject *str1; 12334 PyObject *str2; 12335 Py_ssize_t maxcount = -1; 12336 PyObject *result; 12337 12338 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount)) 12339 return NULL; 12340 if (PyUnicode_READY(self) == -1) 12341 return NULL; 12342 str1 = PyUnicode_FromObject(str1); 12343 if (str1 == NULL) 12344 return NULL; 12345 str2 = PyUnicode_FromObject(str2); 12346 if (str2 == NULL) { 12347 Py_DECREF(str1); 12348 return NULL; 12349 } 12350 if (PyUnicode_READY(str1) == -1 || PyUnicode_READY(str2) == -1) 12351 result = NULL; 12352 else 12353 result = replace(self, str1, str2, maxcount); 12354 12355 Py_DECREF(str1); 12356 Py_DECREF(str2); 12357 return result; 12358} 12359 12360static PyObject * 12361unicode_repr(PyObject *unicode) 12362{ 12363 PyObject *repr; 12364 Py_ssize_t isize; 12365 Py_ssize_t osize, squote, dquote, i, o; 12366 Py_UCS4 max, quote; 12367 int ikind, okind, unchanged; 12368 void *idata, *odata; 12369 12370 if (PyUnicode_READY(unicode) == -1) 12371 return NULL; 12372 12373 isize = PyUnicode_GET_LENGTH(unicode); 12374 idata = PyUnicode_DATA(unicode); 12375 12376 /* Compute length of output, quote characters, and 12377 maximum character */ 12378 osize = 0; 12379 max = 127; 12380 squote = dquote = 0; 12381 ikind = PyUnicode_KIND(unicode); 12382 for (i = 0; i < isize; i++) { 12383 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i); 12384 Py_ssize_t incr = 1; 12385 switch (ch) { 12386 case '\'': squote++; break; 12387 case '"': dquote++; break; 12388 case '\\': case '\t': case '\r': case '\n': 12389 incr = 2; 12390 break; 12391 default: 12392 /* Fast-path ASCII */ 12393 if (ch < ' ' || ch == 0x7f) 12394 incr = 4; /* \xHH */ 12395 else if (ch < 0x7f) 12396 ; 12397 else if (Py_UNICODE_ISPRINTABLE(ch)) 12398 max = ch > max ? 
ch : max; 12399 else if (ch < 0x100) 12400 incr = 4; /* \xHH */ 12401 else if (ch < 0x10000) 12402 incr = 6; /* \uHHHH */ 12403 else 12404 incr = 10; /* \uHHHHHHHH */ 12405 } 12406 if (osize > PY_SSIZE_T_MAX - incr) { 12407 PyErr_SetString(PyExc_OverflowError, 12408 "string is too long to generate repr"); 12409 return NULL; 12410 } 12411 osize += incr; 12412 } 12413 12414 quote = '\''; 12415 unchanged = (osize == isize); 12416 if (squote) { 12417 unchanged = 0; 12418 if (dquote) 12419 /* Both squote and dquote present. Use squote, 12420 and escape them */ 12421 osize += squote; 12422 else 12423 quote = '"'; 12424 } 12425 osize += 2; /* quotes */ 12426 12427 repr = PyUnicode_New(osize, max); 12428 if (repr == NULL) 12429 return NULL; 12430 okind = PyUnicode_KIND(repr); 12431 odata = PyUnicode_DATA(repr); 12432 12433 PyUnicode_WRITE(okind, odata, 0, quote); 12434 PyUnicode_WRITE(okind, odata, osize-1, quote); 12435 if (unchanged) { 12436 _PyUnicode_FastCopyCharacters(repr, 1, 12437 unicode, 0, 12438 isize); 12439 } 12440 else { 12441 for (i = 0, o = 1; i < isize; i++) { 12442 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i); 12443 12444 /* Escape quotes and backslashes */ 12445 if ((ch == quote) || (ch == '\\')) { 12446 PyUnicode_WRITE(okind, odata, o++, '\\'); 12447 PyUnicode_WRITE(okind, odata, o++, ch); 12448 continue; 12449 } 12450 12451 /* Map special whitespace to '\t', \n', '\r' */ 12452 if (ch == '\t') { 12453 PyUnicode_WRITE(okind, odata, o++, '\\'); 12454 PyUnicode_WRITE(okind, odata, o++, 't'); 12455 } 12456 else if (ch == '\n') { 12457 PyUnicode_WRITE(okind, odata, o++, '\\'); 12458 PyUnicode_WRITE(okind, odata, o++, 'n'); 12459 } 12460 else if (ch == '\r') { 12461 PyUnicode_WRITE(okind, odata, o++, '\\'); 12462 PyUnicode_WRITE(okind, odata, o++, 'r'); 12463 } 12464 12465 /* Map non-printable US ASCII to '\xhh' */ 12466 else if (ch < ' ' || ch == 0x7F) { 12467 PyUnicode_WRITE(okind, odata, o++, '\\'); 12468 PyUnicode_WRITE(okind, odata, o++, 'x'); 12469 
PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]); 12470 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]); 12471 } 12472 12473 /* Copy ASCII characters as-is */ 12474 else if (ch < 0x7F) { 12475 PyUnicode_WRITE(okind, odata, o++, ch); 12476 } 12477 12478 /* Non-ASCII characters */ 12479 else { 12480 /* Map Unicode whitespace and control characters 12481 (categories Z* and C* except ASCII space) 12482 */ 12483 if (!Py_UNICODE_ISPRINTABLE(ch)) { 12484 PyUnicode_WRITE(okind, odata, o++, '\\'); 12485 /* Map 8-bit characters to '\xhh' */ 12486 if (ch <= 0xff) { 12487 PyUnicode_WRITE(okind, odata, o++, 'x'); 12488 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]); 12489 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]); 12490 } 12491 /* Map 16-bit characters to '\uxxxx' */ 12492 else if (ch <= 0xffff) { 12493 PyUnicode_WRITE(okind, odata, o++, 'u'); 12494 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]); 12495 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]); 12496 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]); 12497 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]); 12498 } 12499 /* Map 21-bit characters to '\U00xxxxxx' */ 12500 else { 12501 PyUnicode_WRITE(okind, odata, o++, 'U'); 12502 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]); 12503 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]); 12504 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]); 12505 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]); 12506 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]); 12507 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]); 12508 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]); 12509 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]); 12510 } 12511 } 12512 /* Copy characters as-is */ 12513 else { 12514 PyUnicode_WRITE(okind, 
odata, o++, ch); 12515 } 12516 } 12517 } 12518 } 12519 /* Closing quote already added at the beginning */ 12520 assert(_PyUnicode_CheckConsistency(repr, 1)); 12521 return repr; 12522} 12523 12524PyDoc_STRVAR(rfind__doc__, 12525 "S.rfind(sub[, start[, end]]) -> int\n\ 12526\n\ 12527Return the highest index in S where substring sub is found,\n\ 12528such that sub is contained within S[start:end]. Optional\n\ 12529arguments start and end are interpreted as in slice notation.\n\ 12530\n\ 12531Return -1 on failure."); 12532 12533static PyObject * 12534unicode_rfind(PyObject *self, PyObject *args) 12535{ 12536 /* initialize variables to prevent gcc warning */ 12537 PyObject *substring = NULL; 12538 Py_ssize_t start = 0; 12539 Py_ssize_t end = 0; 12540 Py_ssize_t result; 12541 12542 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring, 12543 &start, &end)) 12544 return NULL; 12545 12546 if (PyUnicode_READY(self) == -1) { 12547 Py_DECREF(substring); 12548 return NULL; 12549 } 12550 if (PyUnicode_READY(substring) == -1) { 12551 Py_DECREF(substring); 12552 return NULL; 12553 } 12554 12555 result = any_find_slice(-1, self, substring, start, end); 12556 12557 Py_DECREF(substring); 12558 12559 if (result == -2) 12560 return NULL; 12561 12562 return PyLong_FromSsize_t(result); 12563} 12564 12565PyDoc_STRVAR(rindex__doc__, 12566 "S.rindex(sub[, start[, end]]) -> int\n\ 12567\n\ 12568Like S.rfind() but raise ValueError when the substring is not found."); 12569 12570static PyObject * 12571unicode_rindex(PyObject *self, PyObject *args) 12572{ 12573 /* initialize variables to prevent gcc warning */ 12574 PyObject *substring = NULL; 12575 Py_ssize_t start = 0; 12576 Py_ssize_t end = 0; 12577 Py_ssize_t result; 12578 12579 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring, 12580 &start, &end)) 12581 return NULL; 12582 12583 if (PyUnicode_READY(self) == -1) { 12584 Py_DECREF(substring); 12585 return NULL; 12586 } 12587 if (PyUnicode_READY(substring) == -1) 
{ 12588 Py_DECREF(substring); 12589 return NULL; 12590 } 12591 12592 result = any_find_slice(-1, self, substring, start, end); 12593 12594 Py_DECREF(substring); 12595 12596 if (result == -2) 12597 return NULL; 12598 12599 if (result < 0) { 12600 PyErr_SetString(PyExc_ValueError, "substring not found"); 12601 return NULL; 12602 } 12603 12604 return PyLong_FromSsize_t(result); 12605} 12606 12607PyDoc_STRVAR(rjust__doc__, 12608 "S.rjust(width[, fillchar]) -> str\n\ 12609\n\ 12610Return S right-justified in a string of length width. Padding is\n\ 12611done using the specified fill character (default is a space)."); 12612 12613static PyObject * 12614unicode_rjust(PyObject *self, PyObject *args) 12615{ 12616 Py_ssize_t width; 12617 Py_UCS4 fillchar = ' '; 12618 12619 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar)) 12620 return NULL; 12621 12622 if (PyUnicode_READY(self) == -1) 12623 return NULL; 12624 12625 if (PyUnicode_GET_LENGTH(self) >= width) 12626 return unicode_result_unchanged(self); 12627 12628 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar); 12629} 12630 12631PyObject * 12632PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit) 12633{ 12634 PyObject *result; 12635 12636 s = PyUnicode_FromObject(s); 12637 if (s == NULL) 12638 return NULL; 12639 if (sep != NULL) { 12640 sep = PyUnicode_FromObject(sep); 12641 if (sep == NULL) { 12642 Py_DECREF(s); 12643 return NULL; 12644 } 12645 } 12646 12647 result = split(s, sep, maxsplit); 12648 12649 Py_DECREF(s); 12650 Py_XDECREF(sep); 12651 return result; 12652} 12653 12654PyDoc_STRVAR(split__doc__, 12655 "S.split(sep=None, maxsplit=-1) -> list of strings\n\ 12656\n\ 12657Return a list of the words in S, using sep as the\n\ 12658delimiter string. If maxsplit is given, at most maxsplit\n\ 12659splits are done. 
If sep is not specified or is None, any\n\ 12660whitespace string is a separator and empty strings are\n\ 12661removed from the result."); 12662 12663static PyObject* 12664unicode_split(PyObject *self, PyObject *args, PyObject *kwds) 12665{ 12666 static char *kwlist[] = {"sep", "maxsplit", 0}; 12667 PyObject *substring = Py_None; 12668 Py_ssize_t maxcount = -1; 12669 12670 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:split", 12671 kwlist, &substring, &maxcount)) 12672 return NULL; 12673 12674 if (substring == Py_None) 12675 return split(self, NULL, maxcount); 12676 else if (PyUnicode_Check(substring)) 12677 return split(self, substring, maxcount); 12678 else 12679 return PyUnicode_Split(self, substring, maxcount); 12680} 12681 12682PyObject * 12683PyUnicode_Partition(PyObject *str_in, PyObject *sep_in) 12684{ 12685 PyObject* str_obj; 12686 PyObject* sep_obj; 12687 PyObject* out; 12688 int kind1, kind2; 12689 void *buf1, *buf2; 12690 Py_ssize_t len1, len2; 12691 12692 str_obj = PyUnicode_FromObject(str_in); 12693 if (!str_obj) 12694 return NULL; 12695 sep_obj = PyUnicode_FromObject(sep_in); 12696 if (!sep_obj) { 12697 Py_DECREF(str_obj); 12698 return NULL; 12699 } 12700 if (PyUnicode_READY(sep_obj) == -1 || PyUnicode_READY(str_obj) == -1) { 12701 Py_DECREF(sep_obj); 12702 Py_DECREF(str_obj); 12703 return NULL; 12704 } 12705 12706 kind1 = PyUnicode_KIND(str_obj); 12707 kind2 = PyUnicode_KIND(sep_obj); 12708 len1 = PyUnicode_GET_LENGTH(str_obj); 12709 len2 = PyUnicode_GET_LENGTH(sep_obj); 12710 if (kind1 < kind2 || len1 < len2) { 12711 _Py_INCREF_UNICODE_EMPTY(); 12712 if (!unicode_empty) 12713 out = NULL; 12714 else { 12715 out = PyTuple_Pack(3, str_obj, unicode_empty, unicode_empty); 12716 Py_DECREF(unicode_empty); 12717 } 12718 Py_DECREF(sep_obj); 12719 Py_DECREF(str_obj); 12720 return out; 12721 } 12722 buf1 = PyUnicode_DATA(str_obj); 12723 buf2 = PyUnicode_DATA(sep_obj); 12724 if (kind2 != kind1) { 12725 buf2 = _PyUnicode_AsKind(sep_obj, kind1); 12726 if 
(!buf2) 12727 goto onError; 12728 } 12729 12730 switch (kind1) { 12731 case PyUnicode_1BYTE_KIND: 12732 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj)) 12733 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2); 12734 else 12735 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2); 12736 break; 12737 case PyUnicode_2BYTE_KIND: 12738 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2); 12739 break; 12740 case PyUnicode_4BYTE_KIND: 12741 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2); 12742 break; 12743 default: 12744 assert(0); 12745 out = 0; 12746 } 12747 12748 Py_DECREF(sep_obj); 12749 Py_DECREF(str_obj); 12750 if (kind2 != kind1) 12751 PyMem_Free(buf2); 12752 12753 return out; 12754 onError: 12755 Py_DECREF(sep_obj); 12756 Py_DECREF(str_obj); 12757 if (kind2 != kind1 && buf2) 12758 PyMem_Free(buf2); 12759 return NULL; 12760} 12761 12762 12763PyObject * 12764PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in) 12765{ 12766 PyObject* str_obj; 12767 PyObject* sep_obj; 12768 PyObject* out; 12769 int kind1, kind2; 12770 void *buf1, *buf2; 12771 Py_ssize_t len1, len2; 12772 12773 str_obj = PyUnicode_FromObject(str_in); 12774 if (!str_obj) 12775 return NULL; 12776 sep_obj = PyUnicode_FromObject(sep_in); 12777 if (!sep_obj) { 12778 Py_DECREF(str_obj); 12779 return NULL; 12780 } 12781 12782 kind1 = PyUnicode_KIND(str_obj); 12783 kind2 = PyUnicode_KIND(sep_obj); 12784 len1 = PyUnicode_GET_LENGTH(str_obj); 12785 len2 = PyUnicode_GET_LENGTH(sep_obj); 12786 if (kind1 < kind2 || len1 < len2) { 12787 _Py_INCREF_UNICODE_EMPTY(); 12788 if (!unicode_empty) 12789 out = NULL; 12790 else { 12791 out = PyTuple_Pack(3, unicode_empty, unicode_empty, str_obj); 12792 Py_DECREF(unicode_empty); 12793 } 12794 Py_DECREF(sep_obj); 12795 Py_DECREF(str_obj); 12796 return out; 12797 } 12798 buf1 = PyUnicode_DATA(str_obj); 12799 buf2 = PyUnicode_DATA(sep_obj); 12800 if (kind2 != kind1) { 12801 buf2 = 
_PyUnicode_AsKind(sep_obj, kind1); 12802 if (!buf2) 12803 goto onError; 12804 } 12805 12806 switch (kind1) { 12807 case PyUnicode_1BYTE_KIND: 12808 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj)) 12809 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2); 12810 else 12811 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2); 12812 break; 12813 case PyUnicode_2BYTE_KIND: 12814 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2); 12815 break; 12816 case PyUnicode_4BYTE_KIND: 12817 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2); 12818 break; 12819 default: 12820 assert(0); 12821 out = 0; 12822 } 12823 12824 Py_DECREF(sep_obj); 12825 Py_DECREF(str_obj); 12826 if (kind2 != kind1) 12827 PyMem_Free(buf2); 12828 12829 return out; 12830 onError: 12831 Py_DECREF(sep_obj); 12832 Py_DECREF(str_obj); 12833 if (kind2 != kind1 && buf2) 12834 PyMem_Free(buf2); 12835 return NULL; 12836} 12837 12838PyDoc_STRVAR(partition__doc__, 12839 "S.partition(sep) -> (head, sep, tail)\n\ 12840\n\ 12841Search for the separator sep in S, and return the part before it,\n\ 12842the separator itself, and the part after it. If the separator is not\n\ 12843found, return S and two empty strings."); 12844 12845static PyObject* 12846unicode_partition(PyObject *self, PyObject *separator) 12847{ 12848 return PyUnicode_Partition(self, separator); 12849} 12850 12851PyDoc_STRVAR(rpartition__doc__, 12852 "S.rpartition(sep) -> (head, sep, tail)\n\ 12853\n\ 12854Search for the separator sep in S, starting at the end of S, and return\n\ 12855the part before it, the separator itself, and the part after it. 
If the\n\ 12856separator is not found, return two empty strings and S."); 12857 12858static PyObject* 12859unicode_rpartition(PyObject *self, PyObject *separator) 12860{ 12861 return PyUnicode_RPartition(self, separator); 12862} 12863 12864PyObject * 12865PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit) 12866{ 12867 PyObject *result; 12868 12869 s = PyUnicode_FromObject(s); 12870 if (s == NULL) 12871 return NULL; 12872 if (sep != NULL) { 12873 sep = PyUnicode_FromObject(sep); 12874 if (sep == NULL) { 12875 Py_DECREF(s); 12876 return NULL; 12877 } 12878 } 12879 12880 result = rsplit(s, sep, maxsplit); 12881 12882 Py_DECREF(s); 12883 Py_XDECREF(sep); 12884 return result; 12885} 12886 12887PyDoc_STRVAR(rsplit__doc__, 12888 "S.rsplit(sep=None, maxsplit=-1) -> list of strings\n\ 12889\n\ 12890Return a list of the words in S, using sep as the\n\ 12891delimiter string, starting at the end of the string and\n\ 12892working to the front. If maxsplit is given, at most maxsplit\n\ 12893splits are done. 
If sep is not specified, any whitespace string\n\ 12894is a separator."); 12895 12896static PyObject* 12897unicode_rsplit(PyObject *self, PyObject *args, PyObject *kwds) 12898{ 12899 static char *kwlist[] = {"sep", "maxsplit", 0}; 12900 PyObject *substring = Py_None; 12901 Py_ssize_t maxcount = -1; 12902 12903 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:rsplit", 12904 kwlist, &substring, &maxcount)) 12905 return NULL; 12906 12907 if (substring == Py_None) 12908 return rsplit(self, NULL, maxcount); 12909 else if (PyUnicode_Check(substring)) 12910 return rsplit(self, substring, maxcount); 12911 else 12912 return PyUnicode_RSplit(self, substring, maxcount); 12913} 12914 12915PyDoc_STRVAR(splitlines__doc__, 12916 "S.splitlines([keepends]) -> list of strings\n\ 12917\n\ 12918Return a list of the lines in S, breaking at line boundaries.\n\ 12919Line breaks are not included in the resulting list unless keepends\n\ 12920is given and true."); 12921 12922static PyObject* 12923unicode_splitlines(PyObject *self, PyObject *args, PyObject *kwds) 12924{ 12925 static char *kwlist[] = {"keepends", 0}; 12926 int keepends = 0; 12927 12928 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines", 12929 kwlist, &keepends)) 12930 return NULL; 12931 12932 return PyUnicode_Splitlines(self, keepends); 12933} 12934 12935static 12936PyObject *unicode_str(PyObject *self) 12937{ 12938 return unicode_result_unchanged(self); 12939} 12940 12941PyDoc_STRVAR(swapcase__doc__, 12942 "S.swapcase() -> str\n\ 12943\n\ 12944Return a copy of S with uppercase characters converted to lowercase\n\ 12945and vice versa."); 12946 12947static PyObject* 12948unicode_swapcase(PyObject *self) 12949{ 12950 if (PyUnicode_READY(self) == -1) 12951 return NULL; 12952 return case_operation(self, do_swapcase); 12953} 12954 12955/*[clinic input] 12956 12957@staticmethod 12958str.maketrans as unicode_maketrans 12959 12960 x: object 12961 12962 y: unicode=NULL 12963 12964 z: unicode=NULL 12965 12966 / 12967 
12968Return a translation table usable for str.translate(). 12969 12970If there is only one argument, it must be a dictionary mapping Unicode 12971ordinals (integers) or characters to Unicode ordinals, strings or None. 12972Character keys will be then converted to ordinals. 12973If there are two arguments, they must be strings of equal length, and 12974in the resulting dictionary, each character in x will be mapped to the 12975character at the same position in y. If there is a third argument, it 12976must be a string, whose characters will be mapped to None in the result. 12977[clinic start generated code]*/ 12978 12979static PyObject * 12980unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z) 12981/*[clinic end generated code: output=a925c89452bd5881 input=7bfbf529a293c6c5]*/ 12982{ 12983 PyObject *new = NULL, *key, *value; 12984 Py_ssize_t i = 0; 12985 int res; 12986 12987 new = PyDict_New(); 12988 if (!new) 12989 return NULL; 12990 if (y != NULL) { 12991 int x_kind, y_kind, z_kind; 12992 void *x_data, *y_data, *z_data; 12993 12994 /* x must be a string too, of equal length */ 12995 if (!PyUnicode_Check(x)) { 12996 PyErr_SetString(PyExc_TypeError, "first maketrans argument must " 12997 "be a string if there is a second argument"); 12998 goto err; 12999 } 13000 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) { 13001 PyErr_SetString(PyExc_ValueError, "the first two maketrans " 13002 "arguments must have equal length"); 13003 goto err; 13004 } 13005 /* create entries for translating chars in x to those in y */ 13006 x_kind = PyUnicode_KIND(x); 13007 y_kind = PyUnicode_KIND(y); 13008 x_data = PyUnicode_DATA(x); 13009 y_data = PyUnicode_DATA(y); 13010 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) { 13011 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i)); 13012 if (!key) 13013 goto err; 13014 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i)); 13015 if (!value) { 13016 Py_DECREF(key); 13017 goto err; 13018 } 13019 res = 
PyDict_SetItem(new, key, value); 13020 Py_DECREF(key); 13021 Py_DECREF(value); 13022 if (res < 0) 13023 goto err; 13024 } 13025 /* create entries for deleting chars in z */ 13026 if (z != NULL) { 13027 z_kind = PyUnicode_KIND(z); 13028 z_data = PyUnicode_DATA(z); 13029 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) { 13030 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i)); 13031 if (!key) 13032 goto err; 13033 res = PyDict_SetItem(new, key, Py_None); 13034 Py_DECREF(key); 13035 if (res < 0) 13036 goto err; 13037 } 13038 } 13039 } else { 13040 int kind; 13041 void *data; 13042 13043 /* x must be a dict */ 13044 if (!PyDict_CheckExact(x)) { 13045 PyErr_SetString(PyExc_TypeError, "if you give only one argument " 13046 "to maketrans it must be a dict"); 13047 goto err; 13048 } 13049 /* copy entries into the new dict, converting string keys to int keys */ 13050 while (PyDict_Next(x, &i, &key, &value)) { 13051 if (PyUnicode_Check(key)) { 13052 /* convert string keys to integer keys */ 13053 PyObject *newkey; 13054 if (PyUnicode_GET_LENGTH(key) != 1) { 13055 PyErr_SetString(PyExc_ValueError, "string keys in translate " 13056 "table must be of length 1"); 13057 goto err; 13058 } 13059 kind = PyUnicode_KIND(key); 13060 data = PyUnicode_DATA(key); 13061 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0)); 13062 if (!newkey) 13063 goto err; 13064 res = PyDict_SetItem(new, newkey, value); 13065 Py_DECREF(newkey); 13066 if (res < 0) 13067 goto err; 13068 } else if (PyLong_Check(key)) { 13069 /* just keep integer keys */ 13070 if (PyDict_SetItem(new, key, value) < 0) 13071 goto err; 13072 } else { 13073 PyErr_SetString(PyExc_TypeError, "keys in translate table must " 13074 "be strings or integers"); 13075 goto err; 13076 } 13077 } 13078 } 13079 return new; 13080 err: 13081 Py_DECREF(new); 13082 return NULL; 13083} 13084 13085PyDoc_STRVAR(translate__doc__, 13086 "S.translate(table) -> str\n\ 13087\n\ 13088Return a copy of the string S in which each character has been 
mapped\n\ 13089through the given translation table. The table must implement\n\ 13090lookup/indexing via __getitem__, for instance a dictionary or list,\n\ 13091mapping Unicode ordinals to Unicode ordinals, strings, or None. If\n\ 13092this operation raises LookupError, the character is left untouched.\n\ 13093Characters mapped to None are deleted."); 13094 13095static PyObject* 13096unicode_translate(PyObject *self, PyObject *table) 13097{ 13098 return _PyUnicode_TranslateCharmap(self, table, "ignore"); 13099} 13100 13101PyDoc_STRVAR(upper__doc__, 13102 "S.upper() -> str\n\ 13103\n\ 13104Return a copy of S converted to uppercase."); 13105 13106static PyObject* 13107unicode_upper(PyObject *self) 13108{ 13109 if (PyUnicode_READY(self) == -1) 13110 return NULL; 13111 if (PyUnicode_IS_ASCII(self)) 13112 return ascii_upper_or_lower(self, 0); 13113 return case_operation(self, do_upper); 13114} 13115 13116PyDoc_STRVAR(zfill__doc__, 13117 "S.zfill(width) -> str\n\ 13118\n\ 13119Pad a numeric string S with zeros on the left, to fill a field\n\ 13120of the specified width. 
The string S is never truncated."); 13121 13122static PyObject * 13123unicode_zfill(PyObject *self, PyObject *args) 13124{ 13125 Py_ssize_t fill; 13126 PyObject *u; 13127 Py_ssize_t width; 13128 int kind; 13129 void *data; 13130 Py_UCS4 chr; 13131 13132 if (!PyArg_ParseTuple(args, "n:zfill", &width)) 13133 return NULL; 13134 13135 if (PyUnicode_READY(self) == -1) 13136 return NULL; 13137 13138 if (PyUnicode_GET_LENGTH(self) >= width) 13139 return unicode_result_unchanged(self); 13140 13141 fill = width - PyUnicode_GET_LENGTH(self); 13142 13143 u = pad(self, fill, 0, '0'); 13144 13145 if (u == NULL) 13146 return NULL; 13147 13148 kind = PyUnicode_KIND(u); 13149 data = PyUnicode_DATA(u); 13150 chr = PyUnicode_READ(kind, data, fill); 13151 13152 if (chr == '+' || chr == '-') { 13153 /* move sign to beginning of string */ 13154 PyUnicode_WRITE(kind, data, 0, chr); 13155 PyUnicode_WRITE(kind, data, fill, '0'); 13156 } 13157 13158 assert(_PyUnicode_CheckConsistency(u, 1)); 13159 return u; 13160} 13161 13162#if 0 13163static PyObject * 13164unicode__decimal2ascii(PyObject *self) 13165{ 13166 return PyUnicode_TransformDecimalAndSpaceToASCII(self); 13167} 13168#endif 13169 13170PyDoc_STRVAR(startswith__doc__, 13171 "S.startswith(prefix[, start[, end]]) -> bool\n\ 13172\n\ 13173Return True if S starts with the specified prefix, False otherwise.\n\ 13174With optional start, test S beginning at that position.\n\ 13175With optional end, stop comparing S at that position.\n\ 13176prefix can also be a tuple of strings to try."); 13177 13178static PyObject * 13179unicode_startswith(PyObject *self, 13180 PyObject *args) 13181{ 13182 PyObject *subobj; 13183 PyObject *substring; 13184 Py_ssize_t start = 0; 13185 Py_ssize_t end = PY_SSIZE_T_MAX; 13186 int result; 13187 13188 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end)) 13189 return NULL; 13190 if (PyTuple_Check(subobj)) { 13191 Py_ssize_t i; 13192 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) { 13193 
substring = PyUnicode_FromObject(PyTuple_GET_ITEM(subobj, i)); 13194 if (substring == NULL) 13195 return NULL; 13196 result = tailmatch(self, substring, start, end, -1); 13197 Py_DECREF(substring); 13198 if (result == -1) 13199 return NULL; 13200 if (result) { 13201 Py_RETURN_TRUE; 13202 } 13203 } 13204 /* nothing matched */ 13205 Py_RETURN_FALSE; 13206 } 13207 substring = PyUnicode_FromObject(subobj); 13208 if (substring == NULL) { 13209 if (PyErr_ExceptionMatches(PyExc_TypeError)) 13210 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or " 13211 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name); 13212 return NULL; 13213 } 13214 result = tailmatch(self, substring, start, end, -1); 13215 Py_DECREF(substring); 13216 if (result == -1) 13217 return NULL; 13218 return PyBool_FromLong(result); 13219} 13220 13221 13222PyDoc_STRVAR(endswith__doc__, 13223 "S.endswith(suffix[, start[, end]]) -> bool\n\ 13224\n\ 13225Return True if S ends with the specified suffix, False otherwise.\n\ 13226With optional start, test S beginning at that position.\n\ 13227With optional end, stop comparing S at that position.\n\ 13228suffix can also be a tuple of strings to try."); 13229 13230static PyObject * 13231unicode_endswith(PyObject *self, 13232 PyObject *args) 13233{ 13234 PyObject *subobj; 13235 PyObject *substring; 13236 Py_ssize_t start = 0; 13237 Py_ssize_t end = PY_SSIZE_T_MAX; 13238 int result; 13239 13240 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end)) 13241 return NULL; 13242 if (PyTuple_Check(subobj)) { 13243 Py_ssize_t i; 13244 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) { 13245 substring = PyUnicode_FromObject( 13246 PyTuple_GET_ITEM(subobj, i)); 13247 if (substring == NULL) 13248 return NULL; 13249 result = tailmatch(self, substring, start, end, +1); 13250 Py_DECREF(substring); 13251 if (result == -1) 13252 return NULL; 13253 if (result) { 13254 Py_RETURN_TRUE; 13255 } 13256 } 13257 Py_RETURN_FALSE; 13258 } 13259 substring = 
PyUnicode_FromObject(subobj); 13260 if (substring == NULL) { 13261 if (PyErr_ExceptionMatches(PyExc_TypeError)) 13262 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or " 13263 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name); 13264 return NULL; 13265 } 13266 result = tailmatch(self, substring, start, end, +1); 13267 Py_DECREF(substring); 13268 if (result == -1) 13269 return NULL; 13270 return PyBool_FromLong(result); 13271} 13272 13273Py_LOCAL_INLINE(void) 13274_PyUnicodeWriter_Update(_PyUnicodeWriter *writer) 13275{ 13276 if (!writer->readonly) 13277 writer->size = PyUnicode_GET_LENGTH(writer->buffer); 13278 else { 13279 /* Copy-on-write mode: set buffer size to 0 so 13280 * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on 13281 * next write. */ 13282 writer->size = 0; 13283 } 13284 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer); 13285 writer->data = PyUnicode_DATA(writer->buffer); 13286 writer->kind = PyUnicode_KIND(writer->buffer); 13287} 13288 13289void 13290_PyUnicodeWriter_Init(_PyUnicodeWriter *writer) 13291{ 13292 memset(writer, 0, sizeof(*writer)); 13293#ifdef Py_DEBUG 13294 writer->kind = 5; /* invalid kind */ 13295#endif 13296 writer->min_char = 127; 13297} 13298 13299int 13300_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer, 13301 Py_ssize_t length, Py_UCS4 maxchar) 13302{ 13303#ifdef MS_WINDOWS 13304 /* On Windows, overallocate by 50% is the best factor */ 13305# define OVERALLOCATE_FACTOR 2 13306#else 13307 /* On Linux, overallocate by 25% is the best factor */ 13308# define OVERALLOCATE_FACTOR 4 13309#endif 13310 Py_ssize_t newlen; 13311 PyObject *newbuffer; 13312 13313 /* ensure that the _PyUnicodeWriter_Prepare macro was used */ 13314 assert((maxchar > writer->maxchar && length >= 0) 13315 || length > 0); 13316 13317 if (length > PY_SSIZE_T_MAX - writer->pos) { 13318 PyErr_NoMemory(); 13319 return -1; 13320 } 13321 newlen = writer->pos + length; 13322 13323 maxchar = Py_MAX(maxchar, 
writer->min_char); 13324 13325 if (writer->buffer == NULL) { 13326 assert(!writer->readonly); 13327 if (writer->overallocate 13328 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) { 13329 /* overallocate to limit the number of realloc() */ 13330 newlen += newlen / OVERALLOCATE_FACTOR; 13331 } 13332 if (newlen < writer->min_length) 13333 newlen = writer->min_length; 13334 13335 writer->buffer = PyUnicode_New(newlen, maxchar); 13336 if (writer->buffer == NULL) 13337 return -1; 13338 } 13339 else if (newlen > writer->size) { 13340 if (writer->overallocate 13341 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) { 13342 /* overallocate to limit the number of realloc() */ 13343 newlen += newlen / OVERALLOCATE_FACTOR; 13344 } 13345 if (newlen < writer->min_length) 13346 newlen = writer->min_length; 13347 13348 if (maxchar > writer->maxchar || writer->readonly) { 13349 /* resize + widen */ 13350 newbuffer = PyUnicode_New(newlen, maxchar); 13351 if (newbuffer == NULL) 13352 return -1; 13353 _PyUnicode_FastCopyCharacters(newbuffer, 0, 13354 writer->buffer, 0, writer->pos); 13355 Py_DECREF(writer->buffer); 13356 writer->readonly = 0; 13357 } 13358 else { 13359 newbuffer = resize_compact(writer->buffer, newlen); 13360 if (newbuffer == NULL) 13361 return -1; 13362 } 13363 writer->buffer = newbuffer; 13364 } 13365 else if (maxchar > writer->maxchar) { 13366 assert(!writer->readonly); 13367 newbuffer = PyUnicode_New(writer->size, maxchar); 13368 if (newbuffer == NULL) 13369 return -1; 13370 _PyUnicode_FastCopyCharacters(newbuffer, 0, 13371 writer->buffer, 0, writer->pos); 13372 Py_DECREF(writer->buffer); 13373 writer->buffer = newbuffer; 13374 } 13375 _PyUnicodeWriter_Update(writer); 13376 return 0; 13377 13378#undef OVERALLOCATE_FACTOR 13379} 13380 13381int 13382_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer, 13383 enum PyUnicode_Kind kind) 13384{ 13385 Py_UCS4 maxchar; 13386 13387 /* ensure that the _PyUnicodeWriter_PrepareKind macro was 
used */ 13388 assert(writer->kind < kind); 13389 13390 switch (kind) 13391 { 13392 case PyUnicode_1BYTE_KIND: maxchar = 0xff; break; 13393 case PyUnicode_2BYTE_KIND: maxchar = 0xffff; break; 13394 case PyUnicode_4BYTE_KIND: maxchar = 0x10ffff; break; 13395 default: 13396 assert(0 && "invalid kind"); 13397 return -1; 13398 } 13399 13400 return _PyUnicodeWriter_PrepareInternal(writer, 0, maxchar); 13401} 13402 13403Py_LOCAL_INLINE(int) 13404_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch) 13405{ 13406 if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0) 13407 return -1; 13408 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch); 13409 writer->pos++; 13410 return 0; 13411} 13412 13413int 13414_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch) 13415{ 13416 return _PyUnicodeWriter_WriteCharInline(writer, ch); 13417} 13418 13419int 13420_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str) 13421{ 13422 Py_UCS4 maxchar; 13423 Py_ssize_t len; 13424 13425 if (PyUnicode_READY(str) == -1) 13426 return -1; 13427 len = PyUnicode_GET_LENGTH(str); 13428 if (len == 0) 13429 return 0; 13430 maxchar = PyUnicode_MAX_CHAR_VALUE(str); 13431 if (maxchar > writer->maxchar || len > writer->size - writer->pos) { 13432 if (writer->buffer == NULL && !writer->overallocate) { 13433 assert(_PyUnicode_CheckConsistency(str, 1)); 13434 writer->readonly = 1; 13435 Py_INCREF(str); 13436 writer->buffer = str; 13437 _PyUnicodeWriter_Update(writer); 13438 writer->pos += len; 13439 return 0; 13440 } 13441 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1) 13442 return -1; 13443 } 13444 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos, 13445 str, 0, len); 13446 writer->pos += len; 13447 return 0; 13448} 13449 13450int 13451_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str, 13452 Py_ssize_t start, Py_ssize_t end) 13453{ 13454 Py_UCS4 maxchar; 13455 Py_ssize_t len; 13456 13457 if (PyUnicode_READY(str) == -1) 
13458 return -1; 13459 13460 assert(0 <= start); 13461 assert(end <= PyUnicode_GET_LENGTH(str)); 13462 assert(start <= end); 13463 13464 if (end == 0) 13465 return 0; 13466 13467 if (start == 0 && end == PyUnicode_GET_LENGTH(str)) 13468 return _PyUnicodeWriter_WriteStr(writer, str); 13469 13470 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar) 13471 maxchar = _PyUnicode_FindMaxChar(str, start, end); 13472 else 13473 maxchar = writer->maxchar; 13474 len = end - start; 13475 13476 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0) 13477 return -1; 13478 13479 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos, 13480 str, start, len); 13481 writer->pos += len; 13482 return 0; 13483} 13484 13485int 13486_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer, 13487 const char *ascii, Py_ssize_t len) 13488{ 13489 if (len == -1) 13490 len = strlen(ascii); 13491 13492 assert(ucs1lib_find_max_char((Py_UCS1*)ascii, (Py_UCS1*)ascii + len) < 128); 13493 13494 if (writer->buffer == NULL && !writer->overallocate) { 13495 PyObject *str; 13496 13497 str = _PyUnicode_FromASCII(ascii, len); 13498 if (str == NULL) 13499 return -1; 13500 13501 writer->readonly = 1; 13502 writer->buffer = str; 13503 _PyUnicodeWriter_Update(writer); 13504 writer->pos += len; 13505 return 0; 13506 } 13507 13508 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1) 13509 return -1; 13510 13511 switch (writer->kind) 13512 { 13513 case PyUnicode_1BYTE_KIND: 13514 { 13515 const Py_UCS1 *str = (const Py_UCS1 *)ascii; 13516 Py_UCS1 *data = writer->data; 13517 13518 Py_MEMCPY(data + writer->pos, str, len); 13519 break; 13520 } 13521 case PyUnicode_2BYTE_KIND: 13522 { 13523 _PyUnicode_CONVERT_BYTES( 13524 Py_UCS1, Py_UCS2, 13525 ascii, ascii + len, 13526 (Py_UCS2 *)writer->data + writer->pos); 13527 break; 13528 } 13529 case PyUnicode_4BYTE_KIND: 13530 { 13531 _PyUnicode_CONVERT_BYTES( 13532 Py_UCS1, Py_UCS4, 13533 ascii, ascii + len, 13534 (Py_UCS4 *)writer->data + writer->pos); 13535 break; 
13536 } 13537 default: 13538 assert(0); 13539 } 13540 13541 writer->pos += len; 13542 return 0; 13543} 13544 13545int 13546_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer, 13547 const char *str, Py_ssize_t len) 13548{ 13549 Py_UCS4 maxchar; 13550 13551 maxchar = ucs1lib_find_max_char((Py_UCS1*)str, (Py_UCS1*)str + len); 13552 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1) 13553 return -1; 13554 unicode_write_cstr(writer->buffer, writer->pos, str, len); 13555 writer->pos += len; 13556 return 0; 13557} 13558 13559PyObject * 13560_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer) 13561{ 13562 PyObject *str; 13563 if (writer->pos == 0) { 13564 Py_CLEAR(writer->buffer); 13565 _Py_RETURN_UNICODE_EMPTY(); 13566 } 13567 if (writer->readonly) { 13568 str = writer->buffer; 13569 writer->buffer = NULL; 13570 assert(PyUnicode_GET_LENGTH(str) == writer->pos); 13571 return str; 13572 } 13573 if (PyUnicode_GET_LENGTH(writer->buffer) != writer->pos) { 13574 PyObject *newbuffer; 13575 newbuffer = resize_compact(writer->buffer, writer->pos); 13576 if (newbuffer == NULL) { 13577 Py_CLEAR(writer->buffer); 13578 return NULL; 13579 } 13580 writer->buffer = newbuffer; 13581 } 13582 str = writer->buffer; 13583 writer->buffer = NULL; 13584 assert(_PyUnicode_CheckConsistency(str, 1)); 13585 return unicode_result_ready(str); 13586} 13587 13588void 13589_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer) 13590{ 13591 Py_CLEAR(writer->buffer); 13592} 13593 13594#include "stringlib/unicode_format.h" 13595 13596PyDoc_STRVAR(format__doc__, 13597 "S.format(*args, **kwargs) -> str\n\ 13598\n\ 13599Return a formatted version of S, using substitutions from args and kwargs.\n\ 13600The substitutions are identified by braces ('{' and '}')."); 13601 13602PyDoc_STRVAR(format_map__doc__, 13603 "S.format_map(mapping) -> str\n\ 13604\n\ 13605Return a formatted version of S, using substitutions from mapping.\n\ 13606The substitutions are identified by braces ('{' and '}')."); 13607 
13608static PyObject * 13609unicode__format__(PyObject* self, PyObject* args) 13610{ 13611 PyObject *format_spec; 13612 _PyUnicodeWriter writer; 13613 int ret; 13614 13615 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec)) 13616 return NULL; 13617 13618 if (PyUnicode_READY(self) == -1) 13619 return NULL; 13620 _PyUnicodeWriter_Init(&writer); 13621 ret = _PyUnicode_FormatAdvancedWriter(&writer, 13622 self, format_spec, 0, 13623 PyUnicode_GET_LENGTH(format_spec)); 13624 if (ret == -1) { 13625 _PyUnicodeWriter_Dealloc(&writer); 13626 return NULL; 13627 } 13628 return _PyUnicodeWriter_Finish(&writer); 13629} 13630 13631PyDoc_STRVAR(p_format__doc__, 13632 "S.__format__(format_spec) -> str\n\ 13633\n\ 13634Return a formatted version of S as described by format_spec."); 13635 13636static PyObject * 13637unicode__sizeof__(PyObject *v) 13638{ 13639 Py_ssize_t size; 13640 13641 /* If it's a compact object, account for base structure + 13642 character data. */ 13643 if (PyUnicode_IS_COMPACT_ASCII(v)) 13644 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1; 13645 else if (PyUnicode_IS_COMPACT(v)) 13646 size = sizeof(PyCompactUnicodeObject) + 13647 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_KIND(v); 13648 else { 13649 /* If it is a two-block object, account for base object, and 13650 for character block if present. */ 13651 size = sizeof(PyUnicodeObject); 13652 if (_PyUnicode_DATA_ANY(v)) 13653 size += (PyUnicode_GET_LENGTH(v) + 1) * 13654 PyUnicode_KIND(v); 13655 } 13656 /* If the wstr pointer is present, account for it unless it is shared 13657 with the data pointer. Check if the data is not shared. 
*/ 13658 if (_PyUnicode_HAS_WSTR_MEMORY(v)) 13659 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t); 13660 if (_PyUnicode_HAS_UTF8_MEMORY(v)) 13661 size += PyUnicode_UTF8_LENGTH(v) + 1; 13662 13663 return PyLong_FromSsize_t(size); 13664} 13665 13666PyDoc_STRVAR(sizeof__doc__, 13667 "S.__sizeof__() -> size of S in memory, in bytes"); 13668 13669static PyObject * 13670unicode_getnewargs(PyObject *v) 13671{ 13672 PyObject *copy = _PyUnicode_Copy(v); 13673 if (!copy) 13674 return NULL; 13675 return Py_BuildValue("(N)", copy); 13676} 13677 13678static PyMethodDef unicode_methods[] = { 13679 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__}, 13680 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__}, 13681 {"split", (PyCFunction) unicode_split, METH_VARARGS | METH_KEYWORDS, split__doc__}, 13682 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS | METH_KEYWORDS, rsplit__doc__}, 13683 {"join", (PyCFunction) unicode_join, METH_O, join__doc__}, 13684 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__}, 13685 {"casefold", (PyCFunction) unicode_casefold, METH_NOARGS, casefold__doc__}, 13686 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__}, 13687 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__}, 13688 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__}, 13689 {"expandtabs", (PyCFunction) unicode_expandtabs, 13690 METH_VARARGS | METH_KEYWORDS, expandtabs__doc__}, 13691 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__}, 13692 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__}, 13693 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__}, 13694 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__}, 13695 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__}, 13696 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__}, 13697 {"rfind", (PyCFunction) 
unicode_rfind, METH_VARARGS, rfind__doc__}, 13698 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__}, 13699 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__}, 13700 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__}, 13701 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__}, 13702 {"splitlines", (PyCFunction) unicode_splitlines, 13703 METH_VARARGS | METH_KEYWORDS, splitlines__doc__}, 13704 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__}, 13705 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__}, 13706 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__}, 13707 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__}, 13708 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__}, 13709 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__}, 13710 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__}, 13711 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__}, 13712 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__}, 13713 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__}, 13714 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__}, 13715 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__}, 13716 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__}, 13717 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__}, 13718 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__}, 13719 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__}, 13720 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__}, 13721 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__}, 13722 {"format", (PyCFunction) do_string_format, METH_VARARGS | 
METH_KEYWORDS, format__doc__}, 13723 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__}, 13724 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__}, 13725 UNICODE_MAKETRANS_METHODDEF 13726 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__}, 13727#if 0 13728 /* These methods are just used for debugging the implementation. */ 13729 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS}, 13730#endif 13731 13732 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS}, 13733 {NULL, NULL} 13734}; 13735 13736static PyObject * 13737unicode_mod(PyObject *v, PyObject *w) 13738{ 13739 if (!PyUnicode_Check(v)) 13740 Py_RETURN_NOTIMPLEMENTED; 13741 return PyUnicode_Format(v, w); 13742} 13743 13744static PyNumberMethods unicode_as_number = { 13745 0, /*nb_add*/ 13746 0, /*nb_subtract*/ 13747 0, /*nb_multiply*/ 13748 unicode_mod, /*nb_remainder*/ 13749}; 13750 13751static PySequenceMethods unicode_as_sequence = { 13752 (lenfunc) unicode_length, /* sq_length */ 13753 PyUnicode_Concat, /* sq_concat */ 13754 (ssizeargfunc) unicode_repeat, /* sq_repeat */ 13755 (ssizeargfunc) unicode_getitem, /* sq_item */ 13756 0, /* sq_slice */ 13757 0, /* sq_ass_item */ 13758 0, /* sq_ass_slice */ 13759 PyUnicode_Contains, /* sq_contains */ 13760}; 13761 13762static PyObject* 13763unicode_subscript(PyObject* self, PyObject* item) 13764{ 13765 if (PyUnicode_READY(self) == -1) 13766 return NULL; 13767 13768 if (PyIndex_Check(item)) { 13769 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError); 13770 if (i == -1 && PyErr_Occurred()) 13771 return NULL; 13772 if (i < 0) 13773 i += PyUnicode_GET_LENGTH(self); 13774 return unicode_getitem(self, i); 13775 } else if (PySlice_Check(item)) { 13776 Py_ssize_t start, stop, step, slicelength, cur, i; 13777 PyObject *result; 13778 void *src_data, *dest_data; 13779 int src_kind, dest_kind; 13780 Py_UCS4 ch, max_char, kind_limit; 13781 13782 if 
(PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self), 13783 &start, &stop, &step, &slicelength) < 0) { 13784 return NULL; 13785 } 13786 13787 if (slicelength <= 0) { 13788 _Py_RETURN_UNICODE_EMPTY(); 13789 } else if (start == 0 && step == 1 && 13790 slicelength == PyUnicode_GET_LENGTH(self)) { 13791 return unicode_result_unchanged(self); 13792 } else if (step == 1) { 13793 return PyUnicode_Substring(self, 13794 start, start + slicelength); 13795 } 13796 /* General case */ 13797 src_kind = PyUnicode_KIND(self); 13798 src_data = PyUnicode_DATA(self); 13799 if (!PyUnicode_IS_ASCII(self)) { 13800 kind_limit = kind_maxchar_limit(src_kind); 13801 max_char = 0; 13802 for (cur = start, i = 0; i < slicelength; cur += step, i++) { 13803 ch = PyUnicode_READ(src_kind, src_data, cur); 13804 if (ch > max_char) { 13805 max_char = ch; 13806 if (max_char >= kind_limit) 13807 break; 13808 } 13809 } 13810 } 13811 else 13812 max_char = 127; 13813 result = PyUnicode_New(slicelength, max_char); 13814 if (result == NULL) 13815 return NULL; 13816 dest_kind = PyUnicode_KIND(result); 13817 dest_data = PyUnicode_DATA(result); 13818 13819 for (cur = start, i = 0; i < slicelength; cur += step, i++) { 13820 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur); 13821 PyUnicode_WRITE(dest_kind, dest_data, i, ch); 13822 } 13823 assert(_PyUnicode_CheckConsistency(result, 1)); 13824 return result; 13825 } else { 13826 PyErr_SetString(PyExc_TypeError, "string indices must be integers"); 13827 return NULL; 13828 } 13829} 13830 13831static PyMappingMethods unicode_as_mapping = { 13832 (lenfunc)unicode_length, /* mp_length */ 13833 (binaryfunc)unicode_subscript, /* mp_subscript */ 13834 (objobjargproc)0, /* mp_ass_subscript */ 13835}; 13836 13837 13838/* Helpers for PyUnicode_Format() */ 13839 13840struct unicode_formatter_t { 13841 PyObject *args; 13842 int args_owned; 13843 Py_ssize_t arglen, argidx; 13844 PyObject *dict; 13845 13846 enum PyUnicode_Kind fmtkind; 13847 Py_ssize_t fmtcnt, fmtpos; 
13848 void *fmtdata; 13849 PyObject *fmtstr; 13850 13851 _PyUnicodeWriter writer; 13852}; 13853 13854struct unicode_format_arg_t { 13855 Py_UCS4 ch; 13856 int flags; 13857 Py_ssize_t width; 13858 int prec; 13859 int sign; 13860}; 13861 13862static PyObject * 13863unicode_format_getnextarg(struct unicode_formatter_t *ctx) 13864{ 13865 Py_ssize_t argidx = ctx->argidx; 13866 13867 if (argidx < ctx->arglen) { 13868 ctx->argidx++; 13869 if (ctx->arglen < 0) 13870 return ctx->args; 13871 else 13872 return PyTuple_GetItem(ctx->args, argidx); 13873 } 13874 PyErr_SetString(PyExc_TypeError, 13875 "not enough arguments for format string"); 13876 return NULL; 13877} 13878 13879/* Returns a new reference to a PyUnicode object, or NULL on failure. */ 13880 13881/* Format a float into the writer if the writer is not NULL, or into *p_output 13882 otherwise. 13883 13884 Return 0 on success, raise an exception and return -1 on error. */ 13885static int 13886formatfloat(PyObject *v, struct unicode_format_arg_t *arg, 13887 PyObject **p_output, 13888 _PyUnicodeWriter *writer) 13889{ 13890 char *p; 13891 double x; 13892 Py_ssize_t len; 13893 int prec; 13894 int dtoa_flags; 13895 13896 x = PyFloat_AsDouble(v); 13897 if (x == -1.0 && PyErr_Occurred()) 13898 return -1; 13899 13900 prec = arg->prec; 13901 if (prec < 0) 13902 prec = 6; 13903 13904 if (arg->flags & F_ALT) 13905 dtoa_flags = Py_DTSF_ALT; 13906 else 13907 dtoa_flags = 0; 13908 p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL); 13909 if (p == NULL) 13910 return -1; 13911 len = strlen(p); 13912 if (writer) { 13913 if (_PyUnicodeWriter_WriteASCIIString(writer, p, len) < 0) { 13914 PyMem_Free(p); 13915 return -1; 13916 } 13917 } 13918 else 13919 *p_output = _PyUnicode_FromASCII(p, len); 13920 PyMem_Free(p); 13921 return 0; 13922} 13923 13924/* formatlong() emulates the format codes d, u, o, x and X, and 13925 * the F_ALT flag, for Python's long (unbounded) ints. It's not used for 13926 * Python's regular ints. 
13927 * Return value: a new PyUnicodeObject*, or NULL if error. 13928 * The output string is of the form 13929 * "-"? ("0x" | "0X")? digit+ 13930 * "0x"/"0X" are present only for x and X conversions, with F_ALT 13931 * set in flags. The case of hex digits will be correct, 13932 * There will be at least prec digits, zero-filled on the left if 13933 * necessary to get that many. 13934 * val object to be converted 13935 * flags bitmask of format flags; only F_ALT is looked at 13936 * prec minimum number of digits; 0-fill on left if needed 13937 * type a character in [duoxX]; u acts the same as d 13938 * 13939 * CAUTION: o, x and X conversions on regular ints can never 13940 * produce a '-' sign, but can for Python's unbounded ints. 13941 */ 13942PyObject * 13943_PyUnicode_FormatLong(PyObject *val, int alt, int prec, int type) 13944{ 13945 PyObject *result = NULL; 13946 char *buf; 13947 Py_ssize_t i; 13948 int sign; /* 1 if '-', else 0 */ 13949 int len; /* number of characters */ 13950 Py_ssize_t llen; 13951 int numdigits; /* len == numnondigits + numdigits */ 13952 int numnondigits = 0; 13953 13954 /* Avoid exceeding SSIZE_T_MAX */ 13955 if (prec > INT_MAX-3) { 13956 PyErr_SetString(PyExc_OverflowError, 13957 "precision too large"); 13958 return NULL; 13959 } 13960 13961 assert(PyLong_Check(val)); 13962 13963 switch (type) { 13964 default: 13965 assert(!"'type' not in [diuoxX]"); 13966 case 'd': 13967 case 'i': 13968 case 'u': 13969 /* int and int subclasses should print numerically when a numeric */ 13970 /* format code is used (see issue18780) */ 13971 result = PyNumber_ToBase(val, 10); 13972 break; 13973 case 'o': 13974 numnondigits = 2; 13975 result = PyNumber_ToBase(val, 8); 13976 break; 13977 case 'x': 13978 case 'X': 13979 numnondigits = 2; 13980 result = PyNumber_ToBase(val, 16); 13981 break; 13982 } 13983 if (!result) 13984 return NULL; 13985 13986 assert(unicode_modifiable(result)); 13987 assert(PyUnicode_IS_READY(result)); 13988 
assert(PyUnicode_IS_ASCII(result)); 13989 13990 /* To modify the string in-place, there can only be one reference. */ 13991 if (Py_REFCNT(result) != 1) { 13992 Py_DECREF(result); 13993 PyErr_BadInternalCall(); 13994 return NULL; 13995 } 13996 buf = PyUnicode_DATA(result); 13997 llen = PyUnicode_GET_LENGTH(result); 13998 if (llen > INT_MAX) { 13999 Py_DECREF(result); 14000 PyErr_SetString(PyExc_ValueError, 14001 "string too large in _PyUnicode_FormatLong"); 14002 return NULL; 14003 } 14004 len = (int)llen; 14005 sign = buf[0] == '-'; 14006 numnondigits += sign; 14007 numdigits = len - numnondigits; 14008 assert(numdigits > 0); 14009 14010 /* Get rid of base marker unless F_ALT */ 14011 if (((alt) == 0 && 14012 (type == 'o' || type == 'x' || type == 'X'))) { 14013 assert(buf[sign] == '0'); 14014 assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' || 14015 buf[sign+1] == 'o'); 14016 numnondigits -= 2; 14017 buf += 2; 14018 len -= 2; 14019 if (sign) 14020 buf[0] = '-'; 14021 assert(len == numnondigits + numdigits); 14022 assert(numdigits > 0); 14023 } 14024 14025 /* Fill with leading zeroes to meet minimum width. */ 14026 if (prec > numdigits) { 14027 PyObject *r1 = PyBytes_FromStringAndSize(NULL, 14028 numnondigits + prec); 14029 char *b1; 14030 if (!r1) { 14031 Py_DECREF(result); 14032 return NULL; 14033 } 14034 b1 = PyBytes_AS_STRING(r1); 14035 for (i = 0; i < numnondigits; ++i) 14036 *b1++ = *buf++; 14037 for (i = 0; i < prec - numdigits; i++) 14038 *b1++ = '0'; 14039 for (i = 0; i < numdigits; i++) 14040 *b1++ = *buf++; 14041 *b1 = '\0'; 14042 Py_DECREF(result); 14043 result = r1; 14044 buf = PyBytes_AS_STRING(result); 14045 len = numnondigits + prec; 14046 } 14047 14048 /* Fix up case for hex conversions. */ 14049 if (type == 'X') { 14050 /* Need to convert all lower case letters to upper case. 14051 and need to convert 0x to 0X (and -0x to -0X). 
*/ 14052 for (i = 0; i < len; i++) 14053 if (buf[i] >= 'a' && buf[i] <= 'x') 14054 buf[i] -= 'a'-'A'; 14055 } 14056 if (!PyUnicode_Check(result) 14057 || buf != PyUnicode_DATA(result)) { 14058 PyObject *unicode; 14059 unicode = _PyUnicode_FromASCII(buf, len); 14060 Py_DECREF(result); 14061 result = unicode; 14062 } 14063 else if (len != PyUnicode_GET_LENGTH(result)) { 14064 if (PyUnicode_Resize(&result, len) < 0) 14065 Py_CLEAR(result); 14066 } 14067 return result; 14068} 14069 14070/* Format an integer or a float as an integer. 14071 * Return 1 if the number has been formatted into the writer, 14072 * 0 if the number has been formatted into *p_output 14073 * -1 and raise an exception on error */ 14074static int 14075mainformatlong(PyObject *v, 14076 struct unicode_format_arg_t *arg, 14077 PyObject **p_output, 14078 _PyUnicodeWriter *writer) 14079{ 14080 PyObject *iobj, *res; 14081 char type = (char)arg->ch; 14082 14083 if (!PyNumber_Check(v)) 14084 goto wrongtype; 14085 14086 /* make sure number is a type of integer for o, x, and X */ 14087 if (!PyLong_Check(v)) { 14088 if (type == 'o' || type == 'x' || type == 'X') { 14089 iobj = PyNumber_Index(v); 14090 if (iobj == NULL) { 14091 if (PyErr_ExceptionMatches(PyExc_TypeError)) 14092 goto wrongtype; 14093 return -1; 14094 } 14095 } 14096 else { 14097 iobj = PyNumber_Long(v); 14098 if (iobj == NULL ) { 14099 if (PyErr_ExceptionMatches(PyExc_TypeError)) 14100 goto wrongtype; 14101 return -1; 14102 } 14103 } 14104 assert(PyLong_Check(iobj)); 14105 } 14106 else { 14107 iobj = v; 14108 Py_INCREF(iobj); 14109 } 14110 14111 if (PyLong_CheckExact(v) 14112 && arg->width == -1 && arg->prec == -1 14113 && !(arg->flags & (F_SIGN | F_BLANK)) 14114 && type != 'X') 14115 { 14116 /* Fast path */ 14117 int alternate = arg->flags & F_ALT; 14118 int base; 14119 14120 switch(type) 14121 { 14122 default: 14123 assert(0 && "'type' not in [diuoxX]"); 14124 case 'd': 14125 case 'i': 14126 case 'u': 14127 base = 10; 14128 break; 14129 case 
'o': 14130 base = 8; 14131 break; 14132 case 'x': 14133 case 'X': 14134 base = 16; 14135 break; 14136 } 14137 14138 if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) { 14139 Py_DECREF(iobj); 14140 return -1; 14141 } 14142 Py_DECREF(iobj); 14143 return 1; 14144 } 14145 14146 res = _PyUnicode_FormatLong(iobj, arg->flags & F_ALT, arg->prec, type); 14147 Py_DECREF(iobj); 14148 if (res == NULL) 14149 return -1; 14150 *p_output = res; 14151 return 0; 14152 14153wrongtype: 14154 switch(type) 14155 { 14156 case 'o': 14157 case 'x': 14158 case 'X': 14159 PyErr_Format(PyExc_TypeError, 14160 "%%%c format: an integer is required, " 14161 "not %.200s", 14162 type, Py_TYPE(v)->tp_name); 14163 break; 14164 default: 14165 PyErr_Format(PyExc_TypeError, 14166 "%%%c format: a number is required, " 14167 "not %.200s", 14168 type, Py_TYPE(v)->tp_name); 14169 break; 14170 } 14171 return -1; 14172} 14173 14174static Py_UCS4 14175formatchar(PyObject *v) 14176{ 14177 /* presume that the buffer is at least 3 characters long */ 14178 if (PyUnicode_Check(v)) { 14179 if (PyUnicode_GET_LENGTH(v) == 1) { 14180 return PyUnicode_READ_CHAR(v, 0); 14181 } 14182 goto onError; 14183 } 14184 else { 14185 PyObject *iobj; 14186 long x; 14187 /* make sure number is a type of integer */ 14188 if (!PyLong_Check(v)) { 14189 iobj = PyNumber_Index(v); 14190 if (iobj == NULL) { 14191 goto onError; 14192 } 14193 v = iobj; 14194 Py_DECREF(iobj); 14195 } 14196 /* Integer input truncated to a character */ 14197 x = PyLong_AsLong(v); 14198 if (x == -1 && PyErr_Occurred()) 14199 goto onError; 14200 14201 if (x < 0 || x > MAX_UNICODE) { 14202 PyErr_SetString(PyExc_OverflowError, 14203 "%c arg not in range(0x110000)"); 14204 return (Py_UCS4) -1; 14205 } 14206 14207 return (Py_UCS4) x; 14208 } 14209 14210 onError: 14211 PyErr_SetString(PyExc_TypeError, 14212 "%c requires int or char"); 14213 return (Py_UCS4) -1; 14214} 14215 14216/* Parse options of an argument: flags, width, precision. 
14217 Handle also "%(name)" syntax. 14218 14219 Return 0 if the argument has been formatted into arg->str. 14220 Return 1 if the argument has been written into ctx->writer, 14221 Raise an exception and return -1 on error. */ 14222static int 14223unicode_format_arg_parse(struct unicode_formatter_t *ctx, 14224 struct unicode_format_arg_t *arg) 14225{ 14226#define FORMAT_READ(ctx) \ 14227 PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos) 14228 14229 PyObject *v; 14230 14231 if (arg->ch == '(') { 14232 /* Get argument value from a dictionary. Example: "%(name)s". */ 14233 Py_ssize_t keystart; 14234 Py_ssize_t keylen; 14235 PyObject *key; 14236 int pcount = 1; 14237 14238 if (ctx->dict == NULL) { 14239 PyErr_SetString(PyExc_TypeError, 14240 "format requires a mapping"); 14241 return -1; 14242 } 14243 ++ctx->fmtpos; 14244 --ctx->fmtcnt; 14245 keystart = ctx->fmtpos; 14246 /* Skip over balanced parentheses */ 14247 while (pcount > 0 && --ctx->fmtcnt >= 0) { 14248 arg->ch = FORMAT_READ(ctx); 14249 if (arg->ch == ')') 14250 --pcount; 14251 else if (arg->ch == '(') 14252 ++pcount; 14253 ctx->fmtpos++; 14254 } 14255 keylen = ctx->fmtpos - keystart - 1; 14256 if (ctx->fmtcnt < 0 || pcount > 0) { 14257 PyErr_SetString(PyExc_ValueError, 14258 "incomplete format key"); 14259 return -1; 14260 } 14261 key = PyUnicode_Substring(ctx->fmtstr, 14262 keystart, keystart + keylen); 14263 if (key == NULL) 14264 return -1; 14265 if (ctx->args_owned) { 14266 Py_DECREF(ctx->args); 14267 ctx->args_owned = 0; 14268 } 14269 ctx->args = PyObject_GetItem(ctx->dict, key); 14270 Py_DECREF(key); 14271 if (ctx->args == NULL) 14272 return -1; 14273 ctx->args_owned = 1; 14274 ctx->arglen = -1; 14275 ctx->argidx = -2; 14276 } 14277 14278 /* Parse flags. Example: "%+i" => flags=F_SIGN. 
*/ 14279 while (--ctx->fmtcnt >= 0) { 14280 arg->ch = FORMAT_READ(ctx); 14281 ctx->fmtpos++; 14282 switch (arg->ch) { 14283 case '-': arg->flags |= F_LJUST; continue; 14284 case '+': arg->flags |= F_SIGN; continue; 14285 case ' ': arg->flags |= F_BLANK; continue; 14286 case '#': arg->flags |= F_ALT; continue; 14287 case '0': arg->flags |= F_ZERO; continue; 14288 } 14289 break; 14290 } 14291 14292 /* Parse width. Example: "%10s" => width=10 */ 14293 if (arg->ch == '*') { 14294 v = unicode_format_getnextarg(ctx); 14295 if (v == NULL) 14296 return -1; 14297 if (!PyLong_Check(v)) { 14298 PyErr_SetString(PyExc_TypeError, 14299 "* wants int"); 14300 return -1; 14301 } 14302 arg->width = PyLong_AsSsize_t(v); 14303 if (arg->width == -1 && PyErr_Occurred()) 14304 return -1; 14305 if (arg->width < 0) { 14306 arg->flags |= F_LJUST; 14307 arg->width = -arg->width; 14308 } 14309 if (--ctx->fmtcnt >= 0) { 14310 arg->ch = FORMAT_READ(ctx); 14311 ctx->fmtpos++; 14312 } 14313 } 14314 else if (arg->ch >= '0' && arg->ch <= '9') { 14315 arg->width = arg->ch - '0'; 14316 while (--ctx->fmtcnt >= 0) { 14317 arg->ch = FORMAT_READ(ctx); 14318 ctx->fmtpos++; 14319 if (arg->ch < '0' || arg->ch > '9') 14320 break; 14321 /* Since arg->ch is unsigned, the RHS would end up as unsigned, 14322 mixing signed and unsigned comparison. Since arg->ch is between 14323 '0' and '9', casting to int is safe. */ 14324 if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) { 14325 PyErr_SetString(PyExc_ValueError, 14326 "width too big"); 14327 return -1; 14328 } 14329 arg->width = arg->width*10 + (arg->ch - '0'); 14330 } 14331 } 14332 14333 /* Parse precision. 
Example: "%.3f" => prec=3 */ 14334 if (arg->ch == '.') { 14335 arg->prec = 0; 14336 if (--ctx->fmtcnt >= 0) { 14337 arg->ch = FORMAT_READ(ctx); 14338 ctx->fmtpos++; 14339 } 14340 if (arg->ch == '*') { 14341 v = unicode_format_getnextarg(ctx); 14342 if (v == NULL) 14343 return -1; 14344 if (!PyLong_Check(v)) { 14345 PyErr_SetString(PyExc_TypeError, 14346 "* wants int"); 14347 return -1; 14348 } 14349 arg->prec = _PyLong_AsInt(v); 14350 if (arg->prec == -1 && PyErr_Occurred()) 14351 return -1; 14352 if (arg->prec < 0) 14353 arg->prec = 0; 14354 if (--ctx->fmtcnt >= 0) { 14355 arg->ch = FORMAT_READ(ctx); 14356 ctx->fmtpos++; 14357 } 14358 } 14359 else if (arg->ch >= '0' && arg->ch <= '9') { 14360 arg->prec = arg->ch - '0'; 14361 while (--ctx->fmtcnt >= 0) { 14362 arg->ch = FORMAT_READ(ctx); 14363 ctx->fmtpos++; 14364 if (arg->ch < '0' || arg->ch > '9') 14365 break; 14366 if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) { 14367 PyErr_SetString(PyExc_ValueError, 14368 "precision too big"); 14369 return -1; 14370 } 14371 arg->prec = arg->prec*10 + (arg->ch - '0'); 14372 } 14373 } 14374 } 14375 14376 /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */ 14377 if (ctx->fmtcnt >= 0) { 14378 if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') { 14379 if (--ctx->fmtcnt >= 0) { 14380 arg->ch = FORMAT_READ(ctx); 14381 ctx->fmtpos++; 14382 } 14383 } 14384 } 14385 if (ctx->fmtcnt < 0) { 14386 PyErr_SetString(PyExc_ValueError, 14387 "incomplete format"); 14388 return -1; 14389 } 14390 return 0; 14391 14392#undef FORMAT_READ 14393} 14394 14395/* Format one argument. Supported conversion specifiers: 14396 14397 - "s", "r", "a": any type 14398 - "i", "d", "u": int or float 14399 - "o", "x", "X": int 14400 - "e", "E", "f", "F", "g", "G": float 14401 - "c": int or str (1 character) 14402 14403 When possible, the output is written directly into the Unicode writer 14404 (ctx->writer). A string is created when padding is required. 
14405 14406 Return 0 if the argument has been formatted into *p_str, 14407 1 if the argument has been written into ctx->writer, 14408 -1 on error. */ 14409static int 14410unicode_format_arg_format(struct unicode_formatter_t *ctx, 14411 struct unicode_format_arg_t *arg, 14412 PyObject **p_str) 14413{ 14414 PyObject *v; 14415 _PyUnicodeWriter *writer = &ctx->writer; 14416 14417 if (ctx->fmtcnt == 0) 14418 ctx->writer.overallocate = 0; 14419 14420 if (arg->ch == '%') { 14421 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0) 14422 return -1; 14423 return 1; 14424 } 14425 14426 v = unicode_format_getnextarg(ctx); 14427 if (v == NULL) 14428 return -1; 14429 14430 14431 switch (arg->ch) { 14432 case 's': 14433 case 'r': 14434 case 'a': 14435 if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) { 14436 /* Fast path */ 14437 if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1) 14438 return -1; 14439 return 1; 14440 } 14441 14442 if (PyUnicode_CheckExact(v) && arg->ch == 's') { 14443 *p_str = v; 14444 Py_INCREF(*p_str); 14445 } 14446 else { 14447 if (arg->ch == 's') 14448 *p_str = PyObject_Str(v); 14449 else if (arg->ch == 'r') 14450 *p_str = PyObject_Repr(v); 14451 else 14452 *p_str = PyObject_ASCII(v); 14453 } 14454 break; 14455 14456 case 'i': 14457 case 'd': 14458 case 'u': 14459 case 'o': 14460 case 'x': 14461 case 'X': 14462 { 14463 int ret = mainformatlong(v, arg, p_str, writer); 14464 if (ret != 0) 14465 return ret; 14466 arg->sign = 1; 14467 break; 14468 } 14469 14470 case 'e': 14471 case 'E': 14472 case 'f': 14473 case 'F': 14474 case 'g': 14475 case 'G': 14476 if (arg->width == -1 && arg->prec == -1 14477 && !(arg->flags & (F_SIGN | F_BLANK))) 14478 { 14479 /* Fast path */ 14480 if (formatfloat(v, arg, NULL, writer) == -1) 14481 return -1; 14482 return 1; 14483 } 14484 14485 arg->sign = 1; 14486 if (formatfloat(v, arg, p_str, NULL) == -1) 14487 return -1; 14488 break; 14489 14490 case 'c': 14491 { 14492 Py_UCS4 ch = formatchar(v); 
14493 if (ch == (Py_UCS4) -1) 14494 return -1; 14495 if (arg->width == -1 && arg->prec == -1) { 14496 /* Fast path */ 14497 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) 14498 return -1; 14499 return 1; 14500 } 14501 *p_str = PyUnicode_FromOrdinal(ch); 14502 break; 14503 } 14504 14505 default: 14506 PyErr_Format(PyExc_ValueError, 14507 "unsupported format character '%c' (0x%x) " 14508 "at index %zd", 14509 (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?', 14510 (int)arg->ch, 14511 ctx->fmtpos - 1); 14512 return -1; 14513 } 14514 if (*p_str == NULL) 14515 return -1; 14516 assert (PyUnicode_Check(*p_str)); 14517 return 0; 14518} 14519 14520static int 14521unicode_format_arg_output(struct unicode_formatter_t *ctx, 14522 struct unicode_format_arg_t *arg, 14523 PyObject *str) 14524{ 14525 Py_ssize_t len; 14526 enum PyUnicode_Kind kind; 14527 void *pbuf; 14528 Py_ssize_t pindex; 14529 Py_UCS4 signchar; 14530 Py_ssize_t buflen; 14531 Py_UCS4 maxchar; 14532 Py_ssize_t sublen; 14533 _PyUnicodeWriter *writer = &ctx->writer; 14534 Py_UCS4 fill; 14535 14536 fill = ' '; 14537 if (arg->sign && arg->flags & F_ZERO) 14538 fill = '0'; 14539 14540 if (PyUnicode_READY(str) == -1) 14541 return -1; 14542 14543 len = PyUnicode_GET_LENGTH(str); 14544 if ((arg->width == -1 || arg->width <= len) 14545 && (arg->prec == -1 || arg->prec >= len) 14546 && !(arg->flags & (F_SIGN | F_BLANK))) 14547 { 14548 /* Fast path */ 14549 if (_PyUnicodeWriter_WriteStr(writer, str) == -1) 14550 return -1; 14551 return 0; 14552 } 14553 14554 /* Truncate the string for "s", "r" and "a" formats 14555 if the precision is set */ 14556 if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') { 14557 if (arg->prec >= 0 && len > arg->prec) 14558 len = arg->prec; 14559 } 14560 14561 /* Adjust sign and width */ 14562 kind = PyUnicode_KIND(str); 14563 pbuf = PyUnicode_DATA(str); 14564 pindex = 0; 14565 signchar = '\0'; 14566 if (arg->sign) { 14567 Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex); 14568 if (ch 
== '-' || ch == '+') { 14569 signchar = ch; 14570 len--; 14571 pindex++; 14572 } 14573 else if (arg->flags & F_SIGN) 14574 signchar = '+'; 14575 else if (arg->flags & F_BLANK) 14576 signchar = ' '; 14577 else 14578 arg->sign = 0; 14579 } 14580 if (arg->width < len) 14581 arg->width = len; 14582 14583 /* Prepare the writer */ 14584 maxchar = writer->maxchar; 14585 if (!(arg->flags & F_LJUST)) { 14586 if (arg->sign) { 14587 if ((arg->width-1) > len) 14588 maxchar = Py_MAX(maxchar, fill); 14589 } 14590 else { 14591 if (arg->width > len) 14592 maxchar = Py_MAX(maxchar, fill); 14593 } 14594 } 14595 if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) { 14596 Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len); 14597 maxchar = Py_MAX(maxchar, strmaxchar); 14598 } 14599 14600 buflen = arg->width; 14601 if (arg->sign && len == arg->width) 14602 buflen++; 14603 if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1) 14604 return -1; 14605 14606 /* Write the sign if needed */ 14607 if (arg->sign) { 14608 if (fill != ' ') { 14609 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar); 14610 writer->pos += 1; 14611 } 14612 if (arg->width > len) 14613 arg->width--; 14614 } 14615 14616 /* Write the numeric prefix for "x", "X" and "o" formats 14617 if the alternate form is used. 14618 For example, write "0x" for the "%#x" format. 
*/ 14619 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) { 14620 assert(PyUnicode_READ(kind, pbuf, pindex) == '0'); 14621 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch); 14622 if (fill != ' ') { 14623 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0'); 14624 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch); 14625 writer->pos += 2; 14626 pindex += 2; 14627 } 14628 arg->width -= 2; 14629 if (arg->width < 0) 14630 arg->width = 0; 14631 len -= 2; 14632 } 14633 14634 /* Pad left with the fill character if needed */ 14635 if (arg->width > len && !(arg->flags & F_LJUST)) { 14636 sublen = arg->width - len; 14637 FILL(writer->kind, writer->data, fill, writer->pos, sublen); 14638 writer->pos += sublen; 14639 arg->width = len; 14640 } 14641 14642 /* If padding with spaces: write sign if needed and/or numeric prefix if 14643 the alternate form is used */ 14644 if (fill == ' ') { 14645 if (arg->sign) { 14646 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar); 14647 writer->pos += 1; 14648 } 14649 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) { 14650 assert(PyUnicode_READ(kind, pbuf, pindex) == '0'); 14651 assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch); 14652 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0'); 14653 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch); 14654 writer->pos += 2; 14655 pindex += 2; 14656 } 14657 } 14658 14659 /* Write characters */ 14660 if (len) { 14661 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos, 14662 str, pindex, len); 14663 writer->pos += len; 14664 } 14665 14666 /* Pad right with the fill character if needed */ 14667 if (arg->width > len) { 14668 sublen = arg->width - len; 14669 FILL(writer->kind, writer->data, ' ', writer->pos, sublen); 14670 writer->pos += sublen; 14671 } 14672 return 0; 14673} 14674 14675/* Helper of PyUnicode_Format(): format one arg. 
14676 Return 0 on success, raise an exception and return -1 on error. */ 14677static int 14678unicode_format_arg(struct unicode_formatter_t *ctx) 14679{ 14680 struct unicode_format_arg_t arg; 14681 PyObject *str; 14682 int ret; 14683 14684 arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos); 14685 arg.flags = 0; 14686 arg.width = -1; 14687 arg.prec = -1; 14688 arg.sign = 0; 14689 str = NULL; 14690 14691 ret = unicode_format_arg_parse(ctx, &arg); 14692 if (ret == -1) 14693 return -1; 14694 14695 ret = unicode_format_arg_format(ctx, &arg, &str); 14696 if (ret == -1) 14697 return -1; 14698 14699 if (ret != 1) { 14700 ret = unicode_format_arg_output(ctx, &arg, str); 14701 Py_DECREF(str); 14702 if (ret == -1) 14703 return -1; 14704 } 14705 14706 if (ctx->dict && (ctx->argidx < ctx->arglen) && arg.ch != '%') { 14707 PyErr_SetString(PyExc_TypeError, 14708 "not all arguments converted during string formatting"); 14709 return -1; 14710 } 14711 return 0; 14712} 14713 14714PyObject * 14715PyUnicode_Format(PyObject *format, PyObject *args) 14716{ 14717 struct unicode_formatter_t ctx; 14718 14719 if (format == NULL || args == NULL) { 14720 PyErr_BadInternalCall(); 14721 return NULL; 14722 } 14723 14724 ctx.fmtstr = PyUnicode_FromObject(format); 14725 if (ctx.fmtstr == NULL) 14726 return NULL; 14727 if (PyUnicode_READY(ctx.fmtstr) == -1) { 14728 Py_DECREF(ctx.fmtstr); 14729 return NULL; 14730 } 14731 ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr); 14732 ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr); 14733 ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr); 14734 ctx.fmtpos = 0; 14735 14736 _PyUnicodeWriter_Init(&ctx.writer); 14737 ctx.writer.min_length = ctx.fmtcnt + 100; 14738 ctx.writer.overallocate = 1; 14739 14740 if (PyTuple_Check(args)) { 14741 ctx.arglen = PyTuple_Size(args); 14742 ctx.argidx = 0; 14743 } 14744 else { 14745 ctx.arglen = -1; 14746 ctx.argidx = -2; 14747 } 14748 ctx.args_owned = 0; 14749 if (PyMapping_Check(args) && !PyTuple_Check(args) && 
!PyUnicode_Check(args)) 14750 ctx.dict = args; 14751 else 14752 ctx.dict = NULL; 14753 ctx.args = args; 14754 14755 while (--ctx.fmtcnt >= 0) { 14756 if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') { 14757 Py_ssize_t nonfmtpos; 14758 14759 nonfmtpos = ctx.fmtpos++; 14760 while (ctx.fmtcnt >= 0 && 14761 PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') { 14762 ctx.fmtpos++; 14763 ctx.fmtcnt--; 14764 } 14765 if (ctx.fmtcnt < 0) { 14766 ctx.fmtpos--; 14767 ctx.writer.overallocate = 0; 14768 } 14769 14770 if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr, 14771 nonfmtpos, ctx.fmtpos) < 0) 14772 goto onError; 14773 } 14774 else { 14775 ctx.fmtpos++; 14776 if (unicode_format_arg(&ctx) == -1) 14777 goto onError; 14778 } 14779 } 14780 14781 if (ctx.argidx < ctx.arglen && !ctx.dict) { 14782 PyErr_SetString(PyExc_TypeError, 14783 "not all arguments converted during string formatting"); 14784 goto onError; 14785 } 14786 14787 if (ctx.args_owned) { 14788 Py_DECREF(ctx.args); 14789 } 14790 Py_DECREF(ctx.fmtstr); 14791 return _PyUnicodeWriter_Finish(&ctx.writer); 14792 14793 onError: 14794 Py_DECREF(ctx.fmtstr); 14795 _PyUnicodeWriter_Dealloc(&ctx.writer); 14796 if (ctx.args_owned) { 14797 Py_DECREF(ctx.args); 14798 } 14799 return NULL; 14800} 14801 14802static PyObject * 14803unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds); 14804 14805static PyObject * 14806unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds) 14807{ 14808 PyObject *x = NULL; 14809 static char *kwlist[] = {"object", "encoding", "errors", 0}; 14810 char *encoding = NULL; 14811 char *errors = NULL; 14812 14813 if (type != &PyUnicode_Type) 14814 return unicode_subtype_new(type, args, kwds); 14815 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str", 14816 kwlist, &x, &encoding, &errors)) 14817 return NULL; 14818 if (x == NULL) 14819 _Py_RETURN_UNICODE_EMPTY(); 14820 if (encoding == NULL && errors == NULL) 14821 return PyObject_Str(x); 14822 
else 14823 return PyUnicode_FromEncodedObject(x, encoding, errors); 14824} 14825 14826static PyObject * 14827unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds) 14828{ 14829 PyObject *unicode, *self; 14830 Py_ssize_t length, char_size; 14831 int share_wstr, share_utf8; 14832 unsigned int kind; 14833 void *data; 14834 14835 assert(PyType_IsSubtype(type, &PyUnicode_Type)); 14836 14837 unicode = unicode_new(&PyUnicode_Type, args, kwds); 14838 if (unicode == NULL) 14839 return NULL; 14840 assert(_PyUnicode_CHECK(unicode)); 14841 if (PyUnicode_READY(unicode) == -1) { 14842 Py_DECREF(unicode); 14843 return NULL; 14844 } 14845 14846 self = type->tp_alloc(type, 0); 14847 if (self == NULL) { 14848 Py_DECREF(unicode); 14849 return NULL; 14850 } 14851 kind = PyUnicode_KIND(unicode); 14852 length = PyUnicode_GET_LENGTH(unicode); 14853 14854 _PyUnicode_LENGTH(self) = length; 14855#ifdef Py_DEBUG 14856 _PyUnicode_HASH(self) = -1; 14857#else 14858 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode); 14859#endif 14860 _PyUnicode_STATE(self).interned = 0; 14861 _PyUnicode_STATE(self).kind = kind; 14862 _PyUnicode_STATE(self).compact = 0; 14863 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii; 14864 _PyUnicode_STATE(self).ready = 1; 14865 _PyUnicode_WSTR(self) = NULL; 14866 _PyUnicode_UTF8_LENGTH(self) = 0; 14867 _PyUnicode_UTF8(self) = NULL; 14868 _PyUnicode_WSTR_LENGTH(self) = 0; 14869 _PyUnicode_DATA_ANY(self) = NULL; 14870 14871 share_utf8 = 0; 14872 share_wstr = 0; 14873 if (kind == PyUnicode_1BYTE_KIND) { 14874 char_size = 1; 14875 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128) 14876 share_utf8 = 1; 14877 } 14878 else if (kind == PyUnicode_2BYTE_KIND) { 14879 char_size = 2; 14880 if (sizeof(wchar_t) == 2) 14881 share_wstr = 1; 14882 } 14883 else { 14884 assert(kind == PyUnicode_4BYTE_KIND); 14885 char_size = 4; 14886 if (sizeof(wchar_t) == 4) 14887 share_wstr = 1; 14888 } 14889 14890 /* Ensure we won't overflow the length. 
*/ 14891 if (length > (PY_SSIZE_T_MAX / char_size - 1)) { 14892 PyErr_NoMemory(); 14893 goto onError; 14894 } 14895 data = PyObject_MALLOC((length + 1) * char_size); 14896 if (data == NULL) { 14897 PyErr_NoMemory(); 14898 goto onError; 14899 } 14900 14901 _PyUnicode_DATA_ANY(self) = data; 14902 if (share_utf8) { 14903 _PyUnicode_UTF8_LENGTH(self) = length; 14904 _PyUnicode_UTF8(self) = data; 14905 } 14906 if (share_wstr) { 14907 _PyUnicode_WSTR_LENGTH(self) = length; 14908 _PyUnicode_WSTR(self) = (wchar_t *)data; 14909 } 14910 14911 Py_MEMCPY(data, PyUnicode_DATA(unicode), 14912 kind * (length + 1)); 14913 assert(_PyUnicode_CheckConsistency(self, 1)); 14914#ifdef Py_DEBUG 14915 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode); 14916#endif 14917 Py_DECREF(unicode); 14918 return self; 14919 14920onError: 14921 Py_DECREF(unicode); 14922 Py_DECREF(self); 14923 return NULL; 14924} 14925 14926PyDoc_STRVAR(unicode_doc, 14927"str(object='') -> str\n\ 14928str(bytes_or_buffer[, encoding[, errors]]) -> str\n\ 14929\n\ 14930Create a new string object from the given object. 
If encoding or\n\ 14931errors is specified, then the object must expose a data buffer\n\ 14932that will be decoded using the given encoding and error handler.\n\ 14933Otherwise, returns the result of object.__str__() (if defined)\n\ 14934or repr(object).\n\ 14935encoding defaults to sys.getdefaultencoding().\n\ 14936errors defaults to 'strict'."); 14937 14938static PyObject *unicode_iter(PyObject *seq); 14939 14940PyTypeObject PyUnicode_Type = { 14941 PyVarObject_HEAD_INIT(&PyType_Type, 0) 14942 "str", /* tp_name */ 14943 sizeof(PyUnicodeObject), /* tp_size */ 14944 0, /* tp_itemsize */ 14945 /* Slots */ 14946 (destructor)unicode_dealloc, /* tp_dealloc */ 14947 0, /* tp_print */ 14948 0, /* tp_getattr */ 14949 0, /* tp_setattr */ 14950 0, /* tp_reserved */ 14951 unicode_repr, /* tp_repr */ 14952 &unicode_as_number, /* tp_as_number */ 14953 &unicode_as_sequence, /* tp_as_sequence */ 14954 &unicode_as_mapping, /* tp_as_mapping */ 14955 (hashfunc) unicode_hash, /* tp_hash*/ 14956 0, /* tp_call*/ 14957 (reprfunc) unicode_str, /* tp_str */ 14958 PyObject_GenericGetAttr, /* tp_getattro */ 14959 0, /* tp_setattro */ 14960 0, /* tp_as_buffer */ 14961 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE | 14962 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */ 14963 unicode_doc, /* tp_doc */ 14964 0, /* tp_traverse */ 14965 0, /* tp_clear */ 14966 PyUnicode_RichCompare, /* tp_richcompare */ 14967 0, /* tp_weaklistoffset */ 14968 unicode_iter, /* tp_iter */ 14969 0, /* tp_iternext */ 14970 unicode_methods, /* tp_methods */ 14971 0, /* tp_members */ 14972 0, /* tp_getset */ 14973 &PyBaseObject_Type, /* tp_base */ 14974 0, /* tp_dict */ 14975 0, /* tp_descr_get */ 14976 0, /* tp_descr_set */ 14977 0, /* tp_dictoffset */ 14978 0, /* tp_init */ 14979 0, /* tp_alloc */ 14980 unicode_new, /* tp_new */ 14981 PyObject_Del, /* tp_free */ 14982}; 14983 14984/* Initialize the Unicode implementation */ 14985 14986int _PyUnicode_Init(void) 14987{ 14988 /* XXX - move this array to unicodectype.c ? 
*/ 14989 Py_UCS2 linebreak[] = { 14990 0x000A, /* LINE FEED */ 14991 0x000D, /* CARRIAGE RETURN */ 14992 0x001C, /* FILE SEPARATOR */ 14993 0x001D, /* GROUP SEPARATOR */ 14994 0x001E, /* RECORD SEPARATOR */ 14995 0x0085, /* NEXT LINE */ 14996 0x2028, /* LINE SEPARATOR */ 14997 0x2029, /* PARAGRAPH SEPARATOR */ 14998 }; 14999 15000 /* Init the implementation */ 15001 _Py_INCREF_UNICODE_EMPTY(); 15002 if (!unicode_empty) 15003 Py_FatalError("Can't create empty string"); 15004 Py_DECREF(unicode_empty); 15005 15006 if (PyType_Ready(&PyUnicode_Type) < 0) 15007 Py_FatalError("Can't initialize 'unicode'"); 15008 15009 /* initialize the linebreak bloom filter */ 15010 bloom_linebreak = make_bloom_mask( 15011 PyUnicode_2BYTE_KIND, linebreak, 15012 Py_ARRAY_LENGTH(linebreak)); 15013 15014 if (PyType_Ready(&EncodingMapType) < 0) 15015 Py_FatalError("Can't initialize encoding map type"); 15016 15017 if (PyType_Ready(&PyFieldNameIter_Type) < 0) 15018 Py_FatalError("Can't initialize field name iterator type"); 15019 15020 if (PyType_Ready(&PyFormatterIter_Type) < 0) 15021 Py_FatalError("Can't initialize formatter iter type"); 15022 15023 return 0; 15024} 15025 15026/* Finalize the Unicode implementation */ 15027 15028int 15029PyUnicode_ClearFreeList(void) 15030{ 15031 return 0; 15032} 15033 15034void 15035_PyUnicode_Fini(void) 15036{ 15037 int i; 15038 15039 Py_CLEAR(unicode_empty); 15040 15041 for (i = 0; i < 256; i++) 15042 Py_CLEAR(unicode_latin1[i]); 15043 _PyUnicode_ClearStaticStrings(); 15044 (void)PyUnicode_ClearFreeList(); 15045} 15046 15047void 15048PyUnicode_InternInPlace(PyObject **p) 15049{ 15050 PyObject *s = *p; 15051 PyObject *t; 15052#ifdef Py_DEBUG 15053 assert(s != NULL); 15054 assert(_PyUnicode_CHECK(s)); 15055#else 15056 if (s == NULL || !PyUnicode_Check(s)) 15057 return; 15058#endif 15059 /* If it's a subclass, we don't really know what putting 15060 it in the interned dict might do. 
*/ 15061 if (!PyUnicode_CheckExact(s)) 15062 return; 15063 if (PyUnicode_CHECK_INTERNED(s)) 15064 return; 15065 if (interned == NULL) { 15066 interned = PyDict_New(); 15067 if (interned == NULL) { 15068 PyErr_Clear(); /* Don't leave an exception */ 15069 return; 15070 } 15071 } 15072 /* It might be that the GetItem call fails even 15073 though the key is present in the dictionary, 15074 namely when this happens during a stack overflow. */ 15075 Py_ALLOW_RECURSION 15076 t = PyDict_GetItem(interned, s); 15077 Py_END_ALLOW_RECURSION 15078 15079 if (t) { 15080 Py_INCREF(t); 15081 Py_DECREF(*p); 15082 *p = t; 15083 return; 15084 } 15085 15086 PyThreadState_GET()->recursion_critical = 1; 15087 if (PyDict_SetItem(interned, s, s) < 0) { 15088 PyErr_Clear(); 15089 PyThreadState_GET()->recursion_critical = 0; 15090 return; 15091 } 15092 PyThreadState_GET()->recursion_critical = 0; 15093 /* The two references in interned are not counted by refcnt. 15094 The deallocator will take care of this */ 15095 Py_REFCNT(s) -= 2; 15096 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL; 15097} 15098 15099void 15100PyUnicode_InternImmortal(PyObject **p) 15101{ 15102 PyUnicode_InternInPlace(p); 15103 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) { 15104 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL; 15105 Py_INCREF(*p); 15106 } 15107} 15108 15109PyObject * 15110PyUnicode_InternFromString(const char *cp) 15111{ 15112 PyObject *s = PyUnicode_FromString(cp); 15113 if (s == NULL) 15114 return NULL; 15115 PyUnicode_InternInPlace(&s); 15116 return s; 15117} 15118 15119void 15120_Py_ReleaseInternedUnicodeStrings(void) 15121{ 15122 PyObject *keys; 15123 PyObject *s; 15124 Py_ssize_t i, n; 15125 Py_ssize_t immortal_size = 0, mortal_size = 0; 15126 15127 if (interned == NULL || !PyDict_Check(interned)) 15128 return; 15129 keys = PyDict_Keys(interned); 15130 if (keys == NULL || !PyList_Check(keys)) { 15131 PyErr_Clear(); 15132 return; 15133 } 15134 15135 /* Since 
_Py_ReleaseInternedUnicodeStrings() is intended to help a leak 15136 detector, interned unicode strings are not forcibly deallocated; 15137 rather, we give them their stolen references back, and then clear 15138 and DECREF the interned dict. */ 15139 15140 n = PyList_GET_SIZE(keys); 15141 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n", 15142 n); 15143 for (i = 0; i < n; i++) { 15144 s = PyList_GET_ITEM(keys, i); 15145 if (PyUnicode_READY(s) == -1) { 15146 assert(0 && "could not ready string"); 15147 fprintf(stderr, "could not ready string\n"); 15148 } 15149 switch (PyUnicode_CHECK_INTERNED(s)) { 15150 case SSTATE_NOT_INTERNED: 15151 /* XXX Shouldn't happen */ 15152 break; 15153 case SSTATE_INTERNED_IMMORTAL: 15154 Py_REFCNT(s) += 1; 15155 immortal_size += PyUnicode_GET_LENGTH(s); 15156 break; 15157 case SSTATE_INTERNED_MORTAL: 15158 Py_REFCNT(s) += 2; 15159 mortal_size += PyUnicode_GET_LENGTH(s); 15160 break; 15161 default: 15162 Py_FatalError("Inconsistent interned string state."); 15163 } 15164 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED; 15165 } 15166 fprintf(stderr, "total size of all interned strings: " 15167 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d " 15168 "mortal/immortal\n", mortal_size, immortal_size); 15169 Py_DECREF(keys); 15170 PyDict_Clear(interned); 15171 Py_CLEAR(interned); 15172} 15173 15174 15175/********************* Unicode Iterator **************************/ 15176 15177typedef struct { 15178 PyObject_HEAD 15179 Py_ssize_t it_index; 15180 PyObject *it_seq; /* Set to NULL when iterator is exhausted */ 15181} unicodeiterobject; 15182 15183static void 15184unicodeiter_dealloc(unicodeiterobject *it) 15185{ 15186 _PyObject_GC_UNTRACK(it); 15187 Py_XDECREF(it->it_seq); 15188 PyObject_GC_Del(it); 15189} 15190 15191static int 15192unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg) 15193{ 15194 Py_VISIT(it->it_seq); 15195 return 0; 15196} 15197 15198static PyObject * 
15199unicodeiter_next(unicodeiterobject *it) 15200{ 15201 PyObject *seq, *item; 15202 15203 assert(it != NULL); 15204 seq = it->it_seq; 15205 if (seq == NULL) 15206 return NULL; 15207 assert(_PyUnicode_CHECK(seq)); 15208 15209 if (it->it_index < PyUnicode_GET_LENGTH(seq)) { 15210 int kind = PyUnicode_KIND(seq); 15211 void *data = PyUnicode_DATA(seq); 15212 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index); 15213 item = PyUnicode_FromOrdinal(chr); 15214 if (item != NULL) 15215 ++it->it_index; 15216 return item; 15217 } 15218 15219 Py_DECREF(seq); 15220 it->it_seq = NULL; 15221 return NULL; 15222} 15223 15224static PyObject * 15225unicodeiter_len(unicodeiterobject *it) 15226{ 15227 Py_ssize_t len = 0; 15228 if (it->it_seq) 15229 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index; 15230 return PyLong_FromSsize_t(len); 15231} 15232 15233PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it))."); 15234 15235static PyObject * 15236unicodeiter_reduce(unicodeiterobject *it) 15237{ 15238 if (it->it_seq != NULL) { 15239 return Py_BuildValue("N(O)n", _PyObject_GetBuiltin("iter"), 15240 it->it_seq, it->it_index); 15241 } else { 15242 PyObject *u = PyUnicode_FromUnicode(NULL, 0); 15243 if (u == NULL) 15244 return NULL; 15245 return Py_BuildValue("N(N)", _PyObject_GetBuiltin("iter"), u); 15246 } 15247} 15248 15249PyDoc_STRVAR(reduce_doc, "Return state information for pickling."); 15250 15251static PyObject * 15252unicodeiter_setstate(unicodeiterobject *it, PyObject *state) 15253{ 15254 Py_ssize_t index = PyLong_AsSsize_t(state); 15255 if (index == -1 && PyErr_Occurred()) 15256 return NULL; 15257 if (it->it_seq != NULL) { 15258 if (index < 0) 15259 index = 0; 15260 else if (index > PyUnicode_GET_LENGTH(it->it_seq)) 15261 index = PyUnicode_GET_LENGTH(it->it_seq); /* iterator truncated */ 15262 it->it_index = index; 15263 } 15264 Py_RETURN_NONE; 15265} 15266 15267PyDoc_STRVAR(setstate_doc, "Set state information for unpickling."); 15268 
15269static PyMethodDef unicodeiter_methods[] = { 15270 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS, 15271 length_hint_doc}, 15272 {"__reduce__", (PyCFunction)unicodeiter_reduce, METH_NOARGS, 15273 reduce_doc}, 15274 {"__setstate__", (PyCFunction)unicodeiter_setstate, METH_O, 15275 setstate_doc}, 15276 {NULL, NULL} /* sentinel */ 15277}; 15278 15279PyTypeObject PyUnicodeIter_Type = { 15280 PyVarObject_HEAD_INIT(&PyType_Type, 0) 15281 "str_iterator", /* tp_name */ 15282 sizeof(unicodeiterobject), /* tp_basicsize */ 15283 0, /* tp_itemsize */ 15284 /* methods */ 15285 (destructor)unicodeiter_dealloc, /* tp_dealloc */ 15286 0, /* tp_print */ 15287 0, /* tp_getattr */ 15288 0, /* tp_setattr */ 15289 0, /* tp_reserved */ 15290 0, /* tp_repr */ 15291 0, /* tp_as_number */ 15292 0, /* tp_as_sequence */ 15293 0, /* tp_as_mapping */ 15294 0, /* tp_hash */ 15295 0, /* tp_call */ 15296 0, /* tp_str */ 15297 PyObject_GenericGetAttr, /* tp_getattro */ 15298 0, /* tp_setattro */ 15299 0, /* tp_as_buffer */ 15300 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */ 15301 0, /* tp_doc */ 15302 (traverseproc)unicodeiter_traverse, /* tp_traverse */ 15303 0, /* tp_clear */ 15304 0, /* tp_richcompare */ 15305 0, /* tp_weaklistoffset */ 15306 PyObject_SelfIter, /* tp_iter */ 15307 (iternextfunc)unicodeiter_next, /* tp_iternext */ 15308 unicodeiter_methods, /* tp_methods */ 15309 0, 15310}; 15311 15312static PyObject * 15313unicode_iter(PyObject *seq) 15314{ 15315 unicodeiterobject *it; 15316 15317 if (!PyUnicode_Check(seq)) { 15318 PyErr_BadInternalCall(); 15319 return NULL; 15320 } 15321 if (PyUnicode_READY(seq) == -1) 15322 return NULL; 15323 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type); 15324 if (it == NULL) 15325 return NULL; 15326 it->it_index = 0; 15327 Py_INCREF(seq); 15328 it->it_seq = seq; 15329 _PyObject_GC_TRACK(it); 15330 return (PyObject *)it; 15331} 15332 15333 15334size_t 15335Py_UNICODE_strlen(const Py_UNICODE *u) 15336{ 15337 int res 
= 0; 15338 while(*u++) 15339 res++; 15340 return res; 15341} 15342 15343Py_UNICODE* 15344Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2) 15345{ 15346 Py_UNICODE *u = s1; 15347 while ((*u++ = *s2++)); 15348 return s1; 15349} 15350 15351Py_UNICODE* 15352Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n) 15353{ 15354 Py_UNICODE *u = s1; 15355 while ((*u++ = *s2++)) 15356 if (n-- == 0) 15357 break; 15358 return s1; 15359} 15360 15361Py_UNICODE* 15362Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2) 15363{ 15364 Py_UNICODE *u1 = s1; 15365 u1 += Py_UNICODE_strlen(u1); 15366 Py_UNICODE_strcpy(u1, s2); 15367 return s1; 15368} 15369 15370int 15371Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2) 15372{ 15373 while (*s1 && *s2 && *s1 == *s2) 15374 s1++, s2++; 15375 if (*s1 && *s2) 15376 return (*s1 < *s2) ? -1 : +1; 15377 if (*s1) 15378 return 1; 15379 if (*s2) 15380 return -1; 15381 return 0; 15382} 15383 15384int 15385Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n) 15386{ 15387 Py_UNICODE u1, u2; 15388 for (; n != 0; n--) { 15389 u1 = *s1; 15390 u2 = *s2; 15391 if (u1 != u2) 15392 return (u1 < u2) ? 
-1 : +1; 15393 if (u1 == '\0') 15394 return 0; 15395 s1++; 15396 s2++; 15397 } 15398 return 0; 15399} 15400 15401Py_UNICODE* 15402Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c) 15403{ 15404 const Py_UNICODE *p; 15405 for (p = s; *p; p++) 15406 if (*p == c) 15407 return (Py_UNICODE*)p; 15408 return NULL; 15409} 15410 15411Py_UNICODE* 15412Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c) 15413{ 15414 const Py_UNICODE *p; 15415 p = s + Py_UNICODE_strlen(s); 15416 while (p != s) { 15417 p--; 15418 if (*p == c) 15419 return (Py_UNICODE*)p; 15420 } 15421 return NULL; 15422} 15423 15424Py_UNICODE* 15425PyUnicode_AsUnicodeCopy(PyObject *unicode) 15426{ 15427 Py_UNICODE *u, *copy; 15428 Py_ssize_t len, size; 15429 15430 if (!PyUnicode_Check(unicode)) { 15431 PyErr_BadArgument(); 15432 return NULL; 15433 } 15434 u = PyUnicode_AsUnicodeAndSize(unicode, &len); 15435 if (u == NULL) 15436 return NULL; 15437 /* Ensure we won't overflow the size. */ 15438 if (len > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) { 15439 PyErr_NoMemory(); 15440 return NULL; 15441 } 15442 size = len + 1; /* copy the null character */ 15443 size *= sizeof(Py_UNICODE); 15444 copy = PyMem_Malloc(size); 15445 if (copy == NULL) { 15446 PyErr_NoMemory(); 15447 return NULL; 15448 } 15449 memcpy(copy, u, size); 15450 return copy; 15451} 15452 15453/* A _string module, to export formatter_parser and formatter_field_name_split 15454 to the string.Formatter class implemented in Python. 
*/ 15455 15456static PyMethodDef _string_methods[] = { 15457 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split, 15458 METH_O, PyDoc_STR("split the argument as a field name")}, 15459 {"formatter_parser", (PyCFunction) formatter_parser, 15460 METH_O, PyDoc_STR("parse the argument as a format string")}, 15461 {NULL, NULL} 15462}; 15463 15464static struct PyModuleDef _string_module = { 15465 PyModuleDef_HEAD_INIT, 15466 "_string", 15467 PyDoc_STR("string helper module"), 15468 0, 15469 _string_methods, 15470 NULL, 15471 NULL, 15472 NULL, 15473 NULL 15474}; 15475 15476PyMODINIT_FUNC 15477PyInit__string(void) 15478{ 15479 return PyModule_Create(&_string_module); 15480} 15481 15482 15483#ifdef __cplusplus 15484} 15485#endif 15486