unicodeobject.c revision c3713e9706e51bbd30958c27d35e7fda764b0c4a
1/* 2 3Unicode implementation based on original code by Fredrik Lundh, 4modified by Marc-Andre Lemburg <mal@lemburg.com>. 5 6Major speed upgrades to the method implementations at the Reykjavik 7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke. 8 9Copyright (c) Corporation for National Research Initiatives. 10 11-------------------------------------------------------------------- 12The original string type implementation is: 13 14 Copyright (c) 1999 by Secret Labs AB 15 Copyright (c) 1999 by Fredrik Lundh 16 17By obtaining, using, and/or copying this software and/or its 18associated documentation, you agree that you have read, understood, 19and will comply with the following terms and conditions: 20 21Permission to use, copy, modify, and distribute this software and its 22associated documentation for any purpose and without fee is hereby 23granted, provided that the above copyright notice appears in all 24copies, and that both that copyright notice and this permission notice 25appear in supporting documentation, and that the name of Secret Labs 26AB or the author not be used in advertising or publicity pertaining to 27distribution of the software without specific, written prior 28permission. 29 30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO 31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND 32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR 33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT 36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
37-------------------------------------------------------------------- 38 39*/ 40 41#define PY_SSIZE_T_CLEAN 42#include "Python.h" 43#include "ucnhash.h" 44#include "bytes_methods.h" 45#include "stringlib/eq.h" 46 47#ifdef MS_WINDOWS 48#include <windows.h> 49#endif 50 51/*[clinic input] 52class str "PyUnicodeObject *" "&PyUnicode_Type" 53[clinic start generated code]*/ 54/*[clinic end generated code: output=da39a3ee5e6b4b0d input=604e916854800fa8]*/ 55 56/* --- Globals ------------------------------------------------------------ 57 58NOTE: In the interpreter's initialization phase, some globals are currently 59 initialized dynamically as needed. In the process Unicode objects may 60 be created before the Unicode type is ready. 61 62*/ 63 64 65#ifdef __cplusplus 66extern "C" { 67#endif 68 69/* Maximum code point of Unicode 6.0: 0x10ffff (1,114,111) */ 70#define MAX_UNICODE 0x10ffff 71 72#ifdef Py_DEBUG 73# define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0) 74#else 75# define _PyUnicode_CHECK(op) PyUnicode_Check(op) 76#endif 77 78#define _PyUnicode_UTF8(op) \ 79 (((PyCompactUnicodeObject*)(op))->utf8) 80#define PyUnicode_UTF8(op) \ 81 (assert(_PyUnicode_CHECK(op)), \ 82 assert(PyUnicode_IS_READY(op)), \ 83 PyUnicode_IS_COMPACT_ASCII(op) ? \ 84 ((char*)((PyASCIIObject*)(op) + 1)) : \ 85 _PyUnicode_UTF8(op)) 86#define _PyUnicode_UTF8_LENGTH(op) \ 87 (((PyCompactUnicodeObject*)(op))->utf8_length) 88#define PyUnicode_UTF8_LENGTH(op) \ 89 (assert(_PyUnicode_CHECK(op)), \ 90 assert(PyUnicode_IS_READY(op)), \ 91 PyUnicode_IS_COMPACT_ASCII(op) ? 
\ 92 ((PyASCIIObject*)(op))->length : \ 93 _PyUnicode_UTF8_LENGTH(op)) 94#define _PyUnicode_WSTR(op) \ 95 (((PyASCIIObject*)(op))->wstr) 96#define _PyUnicode_WSTR_LENGTH(op) \ 97 (((PyCompactUnicodeObject*)(op))->wstr_length) 98#define _PyUnicode_LENGTH(op) \ 99 (((PyASCIIObject *)(op))->length) 100#define _PyUnicode_STATE(op) \ 101 (((PyASCIIObject *)(op))->state) 102#define _PyUnicode_HASH(op) \ 103 (((PyASCIIObject *)(op))->hash) 104#define _PyUnicode_KIND(op) \ 105 (assert(_PyUnicode_CHECK(op)), \ 106 ((PyASCIIObject *)(op))->state.kind) 107#define _PyUnicode_GET_LENGTH(op) \ 108 (assert(_PyUnicode_CHECK(op)), \ 109 ((PyASCIIObject *)(op))->length) 110#define _PyUnicode_DATA_ANY(op) \ 111 (((PyUnicodeObject*)(op))->data.any) 112 113#undef PyUnicode_READY 114#define PyUnicode_READY(op) \ 115 (assert(_PyUnicode_CHECK(op)), \ 116 (PyUnicode_IS_READY(op) ? \ 117 0 : \ 118 _PyUnicode_Ready(op))) 119 120#define _PyUnicode_SHARE_UTF8(op) \ 121 (assert(_PyUnicode_CHECK(op)), \ 122 assert(!PyUnicode_IS_COMPACT_ASCII(op)), \ 123 (_PyUnicode_UTF8(op) == PyUnicode_DATA(op))) 124#define _PyUnicode_SHARE_WSTR(op) \ 125 (assert(_PyUnicode_CHECK(op)), \ 126 (_PyUnicode_WSTR(unicode) == PyUnicode_DATA(op))) 127 128/* true if the Unicode object has an allocated UTF-8 memory block 129 (not shared with other data) */ 130#define _PyUnicode_HAS_UTF8_MEMORY(op) \ 131 ((!PyUnicode_IS_COMPACT_ASCII(op) \ 132 && _PyUnicode_UTF8(op) \ 133 && _PyUnicode_UTF8(op) != PyUnicode_DATA(op))) 134 135/* true if the Unicode object has an allocated wstr memory block 136 (not shared with other data) */ 137#define _PyUnicode_HAS_WSTR_MEMORY(op) \ 138 ((_PyUnicode_WSTR(op) && \ 139 (!PyUnicode_IS_READY(op) || \ 140 _PyUnicode_WSTR(op) != PyUnicode_DATA(op)))) 141 142/* Generic helper macro to convert characters of different types. 143 from_type and to_type have to be valid type names, begin and end 144 are pointers to the source characters which should be of type 145 "from_type *". 
to is a pointer of type "to_type *" and points to the 146 buffer where the result characters are written to. */ 147#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \ 148 do { \ 149 to_type *_to = (to_type *)(to); \ 150 const from_type *_iter = (from_type *)(begin); \ 151 const from_type *_end = (from_type *)(end); \ 152 Py_ssize_t n = (_end) - (_iter); \ 153 const from_type *_unrolled_end = \ 154 _iter + _Py_SIZE_ROUND_DOWN(n, 4); \ 155 while (_iter < (_unrolled_end)) { \ 156 _to[0] = (to_type) _iter[0]; \ 157 _to[1] = (to_type) _iter[1]; \ 158 _to[2] = (to_type) _iter[2]; \ 159 _to[3] = (to_type) _iter[3]; \ 160 _iter += 4; _to += 4; \ 161 } \ 162 while (_iter < (_end)) \ 163 *_to++ = (to_type) *_iter++; \ 164 } while (0) 165 166/* This dictionary holds all interned unicode strings. Note that references 167 to strings in this dictionary are *not* counted in the string's ob_refcnt. 168 When the interned string reaches a refcnt of 0 the string deallocation 169 function will delete the reference from this dictionary. 170 171 Another way to look at this is that to say that the actual reference 172 count of a string is: s->ob_refcnt + (s->state ? 2 : 0) 173*/ 174static PyObject *interned = NULL; 175 176/* The empty Unicode object is shared to improve performance. */ 177static PyObject *unicode_empty = NULL; 178 179#define _Py_INCREF_UNICODE_EMPTY() \ 180 do { \ 181 if (unicode_empty != NULL) \ 182 Py_INCREF(unicode_empty); \ 183 else { \ 184 unicode_empty = PyUnicode_New(0, 0); \ 185 if (unicode_empty != NULL) { \ 186 Py_INCREF(unicode_empty); \ 187 assert(_PyUnicode_CheckConsistency(unicode_empty, 1)); \ 188 } \ 189 } \ 190 } while (0) 191 192#define _Py_RETURN_UNICODE_EMPTY() \ 193 do { \ 194 _Py_INCREF_UNICODE_EMPTY(); \ 195 return unicode_empty; \ 196 } while (0) 197 198/* Forward declaration */ 199Py_LOCAL_INLINE(int) 200_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch); 201 202/* List of static strings. 
*/ 203static _Py_Identifier *static_strings = NULL; 204 205/* Single character Unicode strings in the Latin-1 range are being 206 shared as well. */ 207static PyObject *unicode_latin1[256] = {NULL}; 208 209/* Fast detection of the most frequent whitespace characters */ 210const unsigned char _Py_ascii_whitespace[] = { 211 0, 0, 0, 0, 0, 0, 0, 0, 212/* case 0x0009: * CHARACTER TABULATION */ 213/* case 0x000A: * LINE FEED */ 214/* case 0x000B: * LINE TABULATION */ 215/* case 0x000C: * FORM FEED */ 216/* case 0x000D: * CARRIAGE RETURN */ 217 0, 1, 1, 1, 1, 1, 0, 0, 218 0, 0, 0, 0, 0, 0, 0, 0, 219/* case 0x001C: * FILE SEPARATOR */ 220/* case 0x001D: * GROUP SEPARATOR */ 221/* case 0x001E: * RECORD SEPARATOR */ 222/* case 0x001F: * UNIT SEPARATOR */ 223 0, 0, 0, 0, 1, 1, 1, 1, 224/* case 0x0020: * SPACE */ 225 1, 0, 0, 0, 0, 0, 0, 0, 226 0, 0, 0, 0, 0, 0, 0, 0, 227 0, 0, 0, 0, 0, 0, 0, 0, 228 0, 0, 0, 0, 0, 0, 0, 0, 229 230 0, 0, 0, 0, 0, 0, 0, 0, 231 0, 0, 0, 0, 0, 0, 0, 0, 232 0, 0, 0, 0, 0, 0, 0, 0, 233 0, 0, 0, 0, 0, 0, 0, 0, 234 0, 0, 0, 0, 0, 0, 0, 0, 235 0, 0, 0, 0, 0, 0, 0, 0, 236 0, 0, 0, 0, 0, 0, 0, 0, 237 0, 0, 0, 0, 0, 0, 0, 0 238}; 239 240/* forward */ 241static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length); 242static PyObject* get_latin1_char(unsigned char ch); 243static int unicode_modifiable(PyObject *unicode); 244 245 246static PyObject * 247_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size); 248static PyObject * 249_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size); 250static PyObject * 251_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size); 252 253static PyObject * 254unicode_encode_call_errorhandler(const char *errors, 255 PyObject **errorHandler,const char *encoding, const char *reason, 256 PyObject *unicode, PyObject **exceptionObject, 257 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos); 258 259static void 260raise_encode_exception(PyObject **exceptionObject, 261 const char *encoding, 262 PyObject *unicode, 263 
Py_ssize_t startpos, Py_ssize_t endpos, 264 const char *reason); 265 266/* Same for linebreaks */ 267static unsigned char ascii_linebreak[] = { 268 0, 0, 0, 0, 0, 0, 0, 0, 269/* 0x000A, * LINE FEED */ 270/* 0x000B, * LINE TABULATION */ 271/* 0x000C, * FORM FEED */ 272/* 0x000D, * CARRIAGE RETURN */ 273 0, 0, 1, 1, 1, 1, 0, 0, 274 0, 0, 0, 0, 0, 0, 0, 0, 275/* 0x001C, * FILE SEPARATOR */ 276/* 0x001D, * GROUP SEPARATOR */ 277/* 0x001E, * RECORD SEPARATOR */ 278 0, 0, 0, 0, 1, 1, 1, 0, 279 0, 0, 0, 0, 0, 0, 0, 0, 280 0, 0, 0, 0, 0, 0, 0, 0, 281 0, 0, 0, 0, 0, 0, 0, 0, 282 0, 0, 0, 0, 0, 0, 0, 0, 283 284 0, 0, 0, 0, 0, 0, 0, 0, 285 0, 0, 0, 0, 0, 0, 0, 0, 286 0, 0, 0, 0, 0, 0, 0, 0, 287 0, 0, 0, 0, 0, 0, 0, 0, 288 0, 0, 0, 0, 0, 0, 0, 0, 289 0, 0, 0, 0, 0, 0, 0, 0, 290 0, 0, 0, 0, 0, 0, 0, 0, 291 0, 0, 0, 0, 0, 0, 0, 0 292}; 293 294#include "clinic/unicodeobject.c.h" 295 296typedef enum { 297 _Py_ERROR_UNKNOWN=0, 298 _Py_ERROR_STRICT, 299 _Py_ERROR_SURROGATEESCAPE, 300 _Py_ERROR_REPLACE, 301 _Py_ERROR_IGNORE, 302 _Py_ERROR_XMLCHARREFREPLACE, 303 _Py_ERROR_OTHER 304} _Py_error_handler; 305 306static _Py_error_handler 307get_error_handler(const char *errors) 308{ 309 if (errors == NULL) 310 return _Py_ERROR_STRICT; 311 if (strcmp(errors, "strict") == 0) 312 return _Py_ERROR_STRICT; 313 if (strcmp(errors, "surrogateescape") == 0) 314 return _Py_ERROR_SURROGATEESCAPE; 315 if (strcmp(errors, "ignore") == 0) 316 return _Py_ERROR_IGNORE; 317 if (strcmp(errors, "replace") == 0) 318 return _Py_ERROR_REPLACE; 319 if (strcmp(errors, "xmlcharrefreplace") == 0) 320 return _Py_ERROR_XMLCHARREFREPLACE; 321 return _Py_ERROR_OTHER; 322} 323 324/* The max unicode value is always 0x10FFFF while using the PEP-393 API. 325 This function is kept for backward compatibility with the old API. */ 326Py_UNICODE 327PyUnicode_GetMax(void) 328{ 329#ifdef Py_UNICODE_WIDE 330 return 0x10FFFF; 331#else 332 /* This is actually an illegal character, so it should 333 not be passed to unichr. 
*/ 334 return 0xFFFF; 335#endif 336} 337 338#ifdef Py_DEBUG 339int 340_PyUnicode_CheckConsistency(PyObject *op, int check_content) 341{ 342 PyASCIIObject *ascii; 343 unsigned int kind; 344 345 assert(PyUnicode_Check(op)); 346 347 ascii = (PyASCIIObject *)op; 348 kind = ascii->state.kind; 349 350 if (ascii->state.ascii == 1 && ascii->state.compact == 1) { 351 assert(kind == PyUnicode_1BYTE_KIND); 352 assert(ascii->state.ready == 1); 353 } 354 else { 355 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op; 356 void *data; 357 358 if (ascii->state.compact == 1) { 359 data = compact + 1; 360 assert(kind == PyUnicode_1BYTE_KIND 361 || kind == PyUnicode_2BYTE_KIND 362 || kind == PyUnicode_4BYTE_KIND); 363 assert(ascii->state.ascii == 0); 364 assert(ascii->state.ready == 1); 365 assert (compact->utf8 != data); 366 } 367 else { 368 PyUnicodeObject *unicode = (PyUnicodeObject *)op; 369 370 data = unicode->data.any; 371 if (kind == PyUnicode_WCHAR_KIND) { 372 assert(ascii->length == 0); 373 assert(ascii->hash == -1); 374 assert(ascii->state.compact == 0); 375 assert(ascii->state.ascii == 0); 376 assert(ascii->state.ready == 0); 377 assert(ascii->state.interned == SSTATE_NOT_INTERNED); 378 assert(ascii->wstr != NULL); 379 assert(data == NULL); 380 assert(compact->utf8 == NULL); 381 } 382 else { 383 assert(kind == PyUnicode_1BYTE_KIND 384 || kind == PyUnicode_2BYTE_KIND 385 || kind == PyUnicode_4BYTE_KIND); 386 assert(ascii->state.compact == 0); 387 assert(ascii->state.ready == 1); 388 assert(data != NULL); 389 if (ascii->state.ascii) { 390 assert (compact->utf8 == data); 391 assert (compact->utf8_length == ascii->length); 392 } 393 else 394 assert (compact->utf8 != data); 395 } 396 } 397 if (kind != PyUnicode_WCHAR_KIND) { 398 if ( 399#if SIZEOF_WCHAR_T == 2 400 kind == PyUnicode_2BYTE_KIND 401#else 402 kind == PyUnicode_4BYTE_KIND 403#endif 404 ) 405 { 406 assert(ascii->wstr == data); 407 assert(compact->wstr_length == ascii->length); 408 } else 409 
assert(ascii->wstr != data); 410 } 411 412 if (compact->utf8 == NULL) 413 assert(compact->utf8_length == 0); 414 if (ascii->wstr == NULL) 415 assert(compact->wstr_length == 0); 416 } 417 /* check that the best kind is used */ 418 if (check_content && kind != PyUnicode_WCHAR_KIND) 419 { 420 Py_ssize_t i; 421 Py_UCS4 maxchar = 0; 422 void *data; 423 Py_UCS4 ch; 424 425 data = PyUnicode_DATA(ascii); 426 for (i=0; i < ascii->length; i++) 427 { 428 ch = PyUnicode_READ(kind, data, i); 429 if (ch > maxchar) 430 maxchar = ch; 431 } 432 if (kind == PyUnicode_1BYTE_KIND) { 433 if (ascii->state.ascii == 0) { 434 assert(maxchar >= 128); 435 assert(maxchar <= 255); 436 } 437 else 438 assert(maxchar < 128); 439 } 440 else if (kind == PyUnicode_2BYTE_KIND) { 441 assert(maxchar >= 0x100); 442 assert(maxchar <= 0xFFFF); 443 } 444 else { 445 assert(maxchar >= 0x10000); 446 assert(maxchar <= MAX_UNICODE); 447 } 448 assert(PyUnicode_READ(kind, data, ascii->length) == 0); 449 } 450 return 1; 451} 452#endif 453 454static PyObject* 455unicode_result_wchar(PyObject *unicode) 456{ 457#ifndef Py_DEBUG 458 Py_ssize_t len; 459 460 len = _PyUnicode_WSTR_LENGTH(unicode); 461 if (len == 0) { 462 Py_DECREF(unicode); 463 _Py_RETURN_UNICODE_EMPTY(); 464 } 465 466 if (len == 1) { 467 wchar_t ch = _PyUnicode_WSTR(unicode)[0]; 468 if ((Py_UCS4)ch < 256) { 469 PyObject *latin1_char = get_latin1_char((unsigned char)ch); 470 Py_DECREF(unicode); 471 return latin1_char; 472 } 473 } 474 475 if (_PyUnicode_Ready(unicode) < 0) { 476 Py_DECREF(unicode); 477 return NULL; 478 } 479#else 480 assert(Py_REFCNT(unicode) == 1); 481 482 /* don't make the result ready in debug mode to ensure that the caller 483 makes the string ready before using it */ 484 assert(_PyUnicode_CheckConsistency(unicode, 1)); 485#endif 486 return unicode; 487} 488 489static PyObject* 490unicode_result_ready(PyObject *unicode) 491{ 492 Py_ssize_t length; 493 494 length = PyUnicode_GET_LENGTH(unicode); 495 if (length == 0) { 496 if (unicode 
!= unicode_empty) { 497 Py_DECREF(unicode); 498 _Py_RETURN_UNICODE_EMPTY(); 499 } 500 return unicode_empty; 501 } 502 503 if (length == 1) { 504 void *data = PyUnicode_DATA(unicode); 505 int kind = PyUnicode_KIND(unicode); 506 Py_UCS4 ch = PyUnicode_READ(kind, data, 0); 507 if (ch < 256) { 508 PyObject *latin1_char = unicode_latin1[ch]; 509 if (latin1_char != NULL) { 510 if (unicode != latin1_char) { 511 Py_INCREF(latin1_char); 512 Py_DECREF(unicode); 513 } 514 return latin1_char; 515 } 516 else { 517 assert(_PyUnicode_CheckConsistency(unicode, 1)); 518 Py_INCREF(unicode); 519 unicode_latin1[ch] = unicode; 520 return unicode; 521 } 522 } 523 } 524 525 assert(_PyUnicode_CheckConsistency(unicode, 1)); 526 return unicode; 527} 528 529static PyObject* 530unicode_result(PyObject *unicode) 531{ 532 assert(_PyUnicode_CHECK(unicode)); 533 if (PyUnicode_IS_READY(unicode)) 534 return unicode_result_ready(unicode); 535 else 536 return unicode_result_wchar(unicode); 537} 538 539static PyObject* 540unicode_result_unchanged(PyObject *unicode) 541{ 542 if (PyUnicode_CheckExact(unicode)) { 543 if (PyUnicode_READY(unicode) == -1) 544 return NULL; 545 Py_INCREF(unicode); 546 return unicode; 547 } 548 else 549 /* Subtype -- return genuine unicode string with the same value. */ 550 return _PyUnicode_Copy(unicode); 551} 552 553/* --- Bloom Filters ----------------------------------------------------- */ 554 555/* stuff to implement simple "bloom filters" for Unicode characters. 556 to keep things simple, we use a single bitmask, using the least 5 557 bits from each unicode characters as the bit index. 
*/ 558 559/* the linebreak mask is set up by Unicode_Init below */ 560 561#if LONG_BIT >= 128 562#define BLOOM_WIDTH 128 563#elif LONG_BIT >= 64 564#define BLOOM_WIDTH 64 565#elif LONG_BIT >= 32 566#define BLOOM_WIDTH 32 567#else 568#error "LONG_BIT is smaller than 32" 569#endif 570 571#define BLOOM_MASK unsigned long 572 573static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0; 574 575#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1))))) 576 577#define BLOOM_LINEBREAK(ch) \ 578 ((ch) < 128U ? ascii_linebreak[(ch)] : \ 579 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch))) 580 581Py_LOCAL_INLINE(BLOOM_MASK) 582make_bloom_mask(int kind, void* ptr, Py_ssize_t len) 583{ 584#define BLOOM_UPDATE(TYPE, MASK, PTR, LEN) \ 585 do { \ 586 TYPE *data = (TYPE *)PTR; \ 587 TYPE *end = data + LEN; \ 588 Py_UCS4 ch; \ 589 for (; data != end; data++) { \ 590 ch = *data; \ 591 MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \ 592 } \ 593 break; \ 594 } while (0) 595 596 /* calculate simple bloom-style bitmask for a given unicode string */ 597 598 BLOOM_MASK mask; 599 600 mask = 0; 601 switch (kind) { 602 case PyUnicode_1BYTE_KIND: 603 BLOOM_UPDATE(Py_UCS1, mask, ptr, len); 604 break; 605 case PyUnicode_2BYTE_KIND: 606 BLOOM_UPDATE(Py_UCS2, mask, ptr, len); 607 break; 608 case PyUnicode_4BYTE_KIND: 609 BLOOM_UPDATE(Py_UCS4, mask, ptr, len); 610 break; 611 default: 612 assert(0); 613 } 614 return mask; 615 616#undef BLOOM_UPDATE 617} 618 619/* Compilation of templated routines */ 620 621#include "stringlib/asciilib.h" 622#include "stringlib/fastsearch.h" 623#include "stringlib/partition.h" 624#include "stringlib/split.h" 625#include "stringlib/count.h" 626#include "stringlib/find.h" 627#include "stringlib/find_max_char.h" 628#include "stringlib/localeutil.h" 629#include "stringlib/undef.h" 630 631#include "stringlib/ucs1lib.h" 632#include "stringlib/fastsearch.h" 633#include "stringlib/partition.h" 634#include "stringlib/split.h" 635#include "stringlib/count.h" 
636#include "stringlib/find.h" 637#include "stringlib/replace.h" 638#include "stringlib/find_max_char.h" 639#include "stringlib/localeutil.h" 640#include "stringlib/undef.h" 641 642#include "stringlib/ucs2lib.h" 643#include "stringlib/fastsearch.h" 644#include "stringlib/partition.h" 645#include "stringlib/split.h" 646#include "stringlib/count.h" 647#include "stringlib/find.h" 648#include "stringlib/replace.h" 649#include "stringlib/find_max_char.h" 650#include "stringlib/localeutil.h" 651#include "stringlib/undef.h" 652 653#include "stringlib/ucs4lib.h" 654#include "stringlib/fastsearch.h" 655#include "stringlib/partition.h" 656#include "stringlib/split.h" 657#include "stringlib/count.h" 658#include "stringlib/find.h" 659#include "stringlib/replace.h" 660#include "stringlib/find_max_char.h" 661#include "stringlib/localeutil.h" 662#include "stringlib/undef.h" 663 664#include "stringlib/unicodedefs.h" 665#include "stringlib/fastsearch.h" 666#include "stringlib/count.h" 667#include "stringlib/find.h" 668#include "stringlib/undef.h" 669 670/* --- Unicode Object ----------------------------------------------------- */ 671 672static PyObject * 673fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s)); 674 675Py_LOCAL_INLINE(Py_ssize_t) findchar(const void *s, int kind, 676 Py_ssize_t size, Py_UCS4 ch, 677 int direction) 678{ 679 int mode = (direction == 1) ? 
FAST_SEARCH : FAST_RSEARCH; 680 681 switch (kind) { 682 case PyUnicode_1BYTE_KIND: 683 { 684 Py_UCS1 ch1 = (Py_UCS1) ch; 685 if (ch1 == ch) 686 return ucs1lib_fastsearch((Py_UCS1 *) s, size, &ch1, 1, 0, mode); 687 else 688 return -1; 689 } 690 case PyUnicode_2BYTE_KIND: 691 { 692 Py_UCS2 ch2 = (Py_UCS2) ch; 693 if (ch2 == ch) 694 return ucs2lib_fastsearch((Py_UCS2 *) s, size, &ch2, 1, 0, mode); 695 else 696 return -1; 697 } 698 case PyUnicode_4BYTE_KIND: 699 return ucs4lib_fastsearch((Py_UCS4 *) s, size, &ch, 1, 0, mode); 700 default: 701 assert(0); 702 return -1; 703 } 704} 705 706#ifdef Py_DEBUG 707/* Fill the data of an Unicode string with invalid characters to detect bugs 708 earlier. 709 710 _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least for 711 ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is an 712 invalid character in Unicode 6.0. */ 713static void 714unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length) 715{ 716 int kind = PyUnicode_KIND(unicode); 717 Py_UCS1 *data = PyUnicode_1BYTE_DATA(unicode); 718 Py_ssize_t length = _PyUnicode_LENGTH(unicode); 719 if (length <= old_length) 720 return; 721 memset(data + old_length * kind, 0xff, (length - old_length) * kind); 722} 723#endif 724 725static PyObject* 726resize_compact(PyObject *unicode, Py_ssize_t length) 727{ 728 Py_ssize_t char_size; 729 Py_ssize_t struct_size; 730 Py_ssize_t new_size; 731 int share_wstr; 732 PyObject *new_unicode; 733#ifdef Py_DEBUG 734 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode); 735#endif 736 737 assert(unicode_modifiable(unicode)); 738 assert(PyUnicode_IS_READY(unicode)); 739 assert(PyUnicode_IS_COMPACT(unicode)); 740 741 char_size = PyUnicode_KIND(unicode); 742 if (PyUnicode_IS_ASCII(unicode)) 743 struct_size = sizeof(PyASCIIObject); 744 else 745 struct_size = sizeof(PyCompactUnicodeObject); 746 share_wstr = _PyUnicode_SHARE_WSTR(unicode); 747 748 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) { 749 
PyErr_NoMemory(); 750 return NULL; 751 } 752 new_size = (struct_size + (length + 1) * char_size); 753 754 _Py_DEC_REFTOTAL; 755 _Py_ForgetReference(unicode); 756 757 new_unicode = (PyObject *)PyObject_REALLOC(unicode, new_size); 758 if (new_unicode == NULL) { 759 _Py_NewReference(unicode); 760 PyErr_NoMemory(); 761 return NULL; 762 } 763 unicode = new_unicode; 764 _Py_NewReference(unicode); 765 766 _PyUnicode_LENGTH(unicode) = length; 767 if (share_wstr) { 768 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode); 769 if (!PyUnicode_IS_ASCII(unicode)) 770 _PyUnicode_WSTR_LENGTH(unicode) = length; 771 } 772 else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) { 773 PyObject_DEL(_PyUnicode_WSTR(unicode)); 774 _PyUnicode_WSTR(unicode) = NULL; 775 } 776#ifdef Py_DEBUG 777 unicode_fill_invalid(unicode, old_length); 778#endif 779 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode), 780 length, 0); 781 assert(_PyUnicode_CheckConsistency(unicode, 0)); 782 return unicode; 783} 784 785static int 786resize_inplace(PyObject *unicode, Py_ssize_t length) 787{ 788 wchar_t *wstr; 789 Py_ssize_t new_size; 790 assert(!PyUnicode_IS_COMPACT(unicode)); 791 assert(Py_REFCNT(unicode) == 1); 792 793 if (PyUnicode_IS_READY(unicode)) { 794 Py_ssize_t char_size; 795 int share_wstr, share_utf8; 796 void *data; 797#ifdef Py_DEBUG 798 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode); 799#endif 800 801 data = _PyUnicode_DATA_ANY(unicode); 802 char_size = PyUnicode_KIND(unicode); 803 share_wstr = _PyUnicode_SHARE_WSTR(unicode); 804 share_utf8 = _PyUnicode_SHARE_UTF8(unicode); 805 806 if (length > (PY_SSIZE_T_MAX / char_size - 1)) { 807 PyErr_NoMemory(); 808 return -1; 809 } 810 new_size = (length + 1) * char_size; 811 812 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode)) 813 { 814 PyObject_DEL(_PyUnicode_UTF8(unicode)); 815 _PyUnicode_UTF8(unicode) = NULL; 816 _PyUnicode_UTF8_LENGTH(unicode) = 0; 817 } 818 819 data = (PyObject *)PyObject_REALLOC(data, new_size); 820 if (data == NULL) { 
821 PyErr_NoMemory(); 822 return -1; 823 } 824 _PyUnicode_DATA_ANY(unicode) = data; 825 if (share_wstr) { 826 _PyUnicode_WSTR(unicode) = data; 827 _PyUnicode_WSTR_LENGTH(unicode) = length; 828 } 829 if (share_utf8) { 830 _PyUnicode_UTF8(unicode) = data; 831 _PyUnicode_UTF8_LENGTH(unicode) = length; 832 } 833 _PyUnicode_LENGTH(unicode) = length; 834 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0); 835#ifdef Py_DEBUG 836 unicode_fill_invalid(unicode, old_length); 837#endif 838 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) { 839 assert(_PyUnicode_CheckConsistency(unicode, 0)); 840 return 0; 841 } 842 } 843 assert(_PyUnicode_WSTR(unicode) != NULL); 844 845 /* check for integer overflow */ 846 if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) { 847 PyErr_NoMemory(); 848 return -1; 849 } 850 new_size = sizeof(wchar_t) * (length + 1); 851 wstr = _PyUnicode_WSTR(unicode); 852 wstr = PyObject_REALLOC(wstr, new_size); 853 if (!wstr) { 854 PyErr_NoMemory(); 855 return -1; 856 } 857 _PyUnicode_WSTR(unicode) = wstr; 858 _PyUnicode_WSTR(unicode)[length] = 0; 859 _PyUnicode_WSTR_LENGTH(unicode) = length; 860 assert(_PyUnicode_CheckConsistency(unicode, 0)); 861 return 0; 862} 863 864static PyObject* 865resize_copy(PyObject *unicode, Py_ssize_t length) 866{ 867 Py_ssize_t copy_length; 868 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) { 869 PyObject *copy; 870 871 if (PyUnicode_READY(unicode) == -1) 872 return NULL; 873 874 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode)); 875 if (copy == NULL) 876 return NULL; 877 878 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode)); 879 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length); 880 return copy; 881 } 882 else { 883 PyObject *w; 884 885 w = (PyObject*)_PyUnicode_New(length); 886 if (w == NULL) 887 return NULL; 888 copy_length = _PyUnicode_WSTR_LENGTH(unicode); 889 copy_length = Py_MIN(copy_length, length); 890 Py_MEMCPY(_PyUnicode_WSTR(w), 
_PyUnicode_WSTR(unicode), 891 copy_length * sizeof(wchar_t)); 892 return w; 893 } 894} 895 896/* We allocate one more byte to make sure the string is 897 Ux0000 terminated; some code (e.g. new_identifier) 898 relies on that. 899 900 XXX This allocator could further be enhanced by assuring that the 901 free list never reduces its size below 1. 902 903*/ 904 905static PyUnicodeObject * 906_PyUnicode_New(Py_ssize_t length) 907{ 908 PyUnicodeObject *unicode; 909 size_t new_size; 910 911 /* Optimization for empty strings */ 912 if (length == 0 && unicode_empty != NULL) { 913 Py_INCREF(unicode_empty); 914 return (PyUnicodeObject*)unicode_empty; 915 } 916 917 /* Ensure we won't overflow the size. */ 918 if (length > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) { 919 return (PyUnicodeObject *)PyErr_NoMemory(); 920 } 921 if (length < 0) { 922 PyErr_SetString(PyExc_SystemError, 923 "Negative size passed to _PyUnicode_New"); 924 return NULL; 925 } 926 927 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type); 928 if (unicode == NULL) 929 return NULL; 930 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1); 931 932 _PyUnicode_WSTR_LENGTH(unicode) = length; 933 _PyUnicode_HASH(unicode) = -1; 934 _PyUnicode_STATE(unicode).interned = 0; 935 _PyUnicode_STATE(unicode).kind = 0; 936 _PyUnicode_STATE(unicode).compact = 0; 937 _PyUnicode_STATE(unicode).ready = 0; 938 _PyUnicode_STATE(unicode).ascii = 0; 939 _PyUnicode_DATA_ANY(unicode) = NULL; 940 _PyUnicode_LENGTH(unicode) = 0; 941 _PyUnicode_UTF8(unicode) = NULL; 942 _PyUnicode_UTF8_LENGTH(unicode) = 0; 943 944 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size); 945 if (!_PyUnicode_WSTR(unicode)) { 946 Py_DECREF(unicode); 947 PyErr_NoMemory(); 948 return NULL; 949 } 950 951 /* Initialize the first element to guard against cases where 952 * the caller fails before initializing str -- unicode_resize() 953 * reads str[0], and the Keep-Alive optimization can keep memory 954 * allocated for str alive 
across a call to unicode_dealloc(unicode). 955 * We don't want unicode_resize to read uninitialized memory in 956 * that case. 957 */ 958 _PyUnicode_WSTR(unicode)[0] = 0; 959 _PyUnicode_WSTR(unicode)[length] = 0; 960 961 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0)); 962 return unicode; 963} 964 965static const char* 966unicode_kind_name(PyObject *unicode) 967{ 968 /* don't check consistency: unicode_kind_name() is called from 969 _PyUnicode_Dump() */ 970 if (!PyUnicode_IS_COMPACT(unicode)) 971 { 972 if (!PyUnicode_IS_READY(unicode)) 973 return "wstr"; 974 switch (PyUnicode_KIND(unicode)) 975 { 976 case PyUnicode_1BYTE_KIND: 977 if (PyUnicode_IS_ASCII(unicode)) 978 return "legacy ascii"; 979 else 980 return "legacy latin1"; 981 case PyUnicode_2BYTE_KIND: 982 return "legacy UCS2"; 983 case PyUnicode_4BYTE_KIND: 984 return "legacy UCS4"; 985 default: 986 return "<legacy invalid kind>"; 987 } 988 } 989 assert(PyUnicode_IS_READY(unicode)); 990 switch (PyUnicode_KIND(unicode)) { 991 case PyUnicode_1BYTE_KIND: 992 if (PyUnicode_IS_ASCII(unicode)) 993 return "ascii"; 994 else 995 return "latin1"; 996 case PyUnicode_2BYTE_KIND: 997 return "UCS2"; 998 case PyUnicode_4BYTE_KIND: 999 return "UCS4"; 1000 default: 1001 return "<invalid compact kind>"; 1002 } 1003} 1004 1005#ifdef Py_DEBUG 1006/* Functions wrapping macros for use in debugger */ 1007char *_PyUnicode_utf8(void *unicode){ 1008 return PyUnicode_UTF8(unicode); 1009} 1010 1011void *_PyUnicode_compact_data(void *unicode) { 1012 return _PyUnicode_COMPACT_DATA(unicode); 1013} 1014void *_PyUnicode_data(void *unicode){ 1015 printf("obj %p\n", unicode); 1016 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode)); 1017 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode)); 1018 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1))); 1019 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1))); 1020 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode)); 1021 
return PyUnicode_DATA(unicode); 1022} 1023 1024void 1025_PyUnicode_Dump(PyObject *op) 1026{ 1027 PyASCIIObject *ascii = (PyASCIIObject *)op; 1028 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op; 1029 PyUnicodeObject *unicode = (PyUnicodeObject *)op; 1030 void *data; 1031 1032 if (ascii->state.compact) 1033 { 1034 if (ascii->state.ascii) 1035 data = (ascii + 1); 1036 else 1037 data = (compact + 1); 1038 } 1039 else 1040 data = unicode->data.any; 1041 printf("%s: len=%" PY_FORMAT_SIZE_T "u, ", 1042 unicode_kind_name(op), ascii->length); 1043 1044 if (ascii->wstr == data) 1045 printf("shared "); 1046 printf("wstr=%p", ascii->wstr); 1047 1048 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) { 1049 printf(" (%" PY_FORMAT_SIZE_T "u), ", compact->wstr_length); 1050 if (!ascii->state.compact && compact->utf8 == unicode->data.any) 1051 printf("shared "); 1052 printf("utf8=%p (%" PY_FORMAT_SIZE_T "u)", 1053 compact->utf8, compact->utf8_length); 1054 } 1055 printf(", data=%p\n", data); 1056} 1057#endif 1058 1059PyObject * 1060PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar) 1061{ 1062 PyObject *obj; 1063 PyCompactUnicodeObject *unicode; 1064 void *data; 1065 enum PyUnicode_Kind kind; 1066 int is_sharing, is_ascii; 1067 Py_ssize_t char_size; 1068 Py_ssize_t struct_size; 1069 1070 /* Optimization for empty strings */ 1071 if (size == 0 && unicode_empty != NULL) { 1072 Py_INCREF(unicode_empty); 1073 return unicode_empty; 1074 } 1075 1076 is_ascii = 0; 1077 is_sharing = 0; 1078 struct_size = sizeof(PyCompactUnicodeObject); 1079 if (maxchar < 128) { 1080 kind = PyUnicode_1BYTE_KIND; 1081 char_size = 1; 1082 is_ascii = 1; 1083 struct_size = sizeof(PyASCIIObject); 1084 } 1085 else if (maxchar < 256) { 1086 kind = PyUnicode_1BYTE_KIND; 1087 char_size = 1; 1088 } 1089 else if (maxchar < 65536) { 1090 kind = PyUnicode_2BYTE_KIND; 1091 char_size = 2; 1092 if (sizeof(wchar_t) == 2) 1093 is_sharing = 1; 1094 } 1095 else { 1096 if (maxchar > MAX_UNICODE) { 1097 
PyErr_SetString(PyExc_SystemError, 1098 "invalid maximum character passed to PyUnicode_New"); 1099 return NULL; 1100 } 1101 kind = PyUnicode_4BYTE_KIND; 1102 char_size = 4; 1103 if (sizeof(wchar_t) == 4) 1104 is_sharing = 1; 1105 } 1106 1107 /* Ensure we won't overflow the size. */ 1108 if (size < 0) { 1109 PyErr_SetString(PyExc_SystemError, 1110 "Negative size passed to PyUnicode_New"); 1111 return NULL; 1112 } 1113 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) 1114 return PyErr_NoMemory(); 1115 1116 /* Duplicated allocation code from _PyObject_New() instead of a call to 1117 * PyObject_New() so we are able to allocate space for the object and 1118 * it's data buffer. 1119 */ 1120 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size); 1121 if (obj == NULL) 1122 return PyErr_NoMemory(); 1123 obj = PyObject_INIT(obj, &PyUnicode_Type); 1124 if (obj == NULL) 1125 return NULL; 1126 1127 unicode = (PyCompactUnicodeObject *)obj; 1128 if (is_ascii) 1129 data = ((PyASCIIObject*)obj) + 1; 1130 else 1131 data = unicode + 1; 1132 _PyUnicode_LENGTH(unicode) = size; 1133 _PyUnicode_HASH(unicode) = -1; 1134 _PyUnicode_STATE(unicode).interned = 0; 1135 _PyUnicode_STATE(unicode).kind = kind; 1136 _PyUnicode_STATE(unicode).compact = 1; 1137 _PyUnicode_STATE(unicode).ready = 1; 1138 _PyUnicode_STATE(unicode).ascii = is_ascii; 1139 if (is_ascii) { 1140 ((char*)data)[size] = 0; 1141 _PyUnicode_WSTR(unicode) = NULL; 1142 } 1143 else if (kind == PyUnicode_1BYTE_KIND) { 1144 ((char*)data)[size] = 0; 1145 _PyUnicode_WSTR(unicode) = NULL; 1146 _PyUnicode_WSTR_LENGTH(unicode) = 0; 1147 unicode->utf8 = NULL; 1148 unicode->utf8_length = 0; 1149 } 1150 else { 1151 unicode->utf8 = NULL; 1152 unicode->utf8_length = 0; 1153 if (kind == PyUnicode_2BYTE_KIND) 1154 ((Py_UCS2*)data)[size] = 0; 1155 else /* kind == PyUnicode_4BYTE_KIND */ 1156 ((Py_UCS4*)data)[size] = 0; 1157 if (is_sharing) { 1158 _PyUnicode_WSTR_LENGTH(unicode) = size; 1159 _PyUnicode_WSTR(unicode) 
= (wchar_t *)data; 1160 } 1161 else { 1162 _PyUnicode_WSTR_LENGTH(unicode) = 0; 1163 _PyUnicode_WSTR(unicode) = NULL; 1164 } 1165 } 1166#ifdef Py_DEBUG 1167 unicode_fill_invalid((PyObject*)unicode, 0); 1168#endif 1169 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0)); 1170 return obj; 1171} 1172 1173#if SIZEOF_WCHAR_T == 2 1174/* Helper function to convert a 16-bits wchar_t representation to UCS4, this 1175 will decode surrogate pairs, the other conversions are implemented as macros 1176 for efficiency. 1177 1178 This function assumes that unicode can hold one more code point than wstr 1179 characters for a terminating null character. */ 1180static void 1181unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end, 1182 PyObject *unicode) 1183{ 1184 const wchar_t *iter; 1185 Py_UCS4 *ucs4_out; 1186 1187 assert(unicode != NULL); 1188 assert(_PyUnicode_CHECK(unicode)); 1189 assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND); 1190 ucs4_out = PyUnicode_4BYTE_DATA(unicode); 1191 1192 for (iter = begin; iter < end; ) { 1193 assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) + 1194 _PyUnicode_GET_LENGTH(unicode))); 1195 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0]) 1196 && (iter+1) < end 1197 && Py_UNICODE_IS_LOW_SURROGATE(iter[1])) 1198 { 1199 *ucs4_out++ = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]); 1200 iter += 2; 1201 } 1202 else { 1203 *ucs4_out++ = *iter; 1204 iter++; 1205 } 1206 } 1207 assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) + 1208 _PyUnicode_GET_LENGTH(unicode))); 1209 1210} 1211#endif 1212 1213static int 1214unicode_check_modifiable(PyObject *unicode) 1215{ 1216 if (!unicode_modifiable(unicode)) { 1217 PyErr_SetString(PyExc_SystemError, 1218 "Cannot modify a string currently used"); 1219 return -1; 1220 } 1221 return 0; 1222} 1223 1224static int 1225_copy_characters(PyObject *to, Py_ssize_t to_start, 1226 PyObject *from, Py_ssize_t from_start, 1227 Py_ssize_t how_many, int check_maxchar) 1228{ 1229 unsigned int from_kind, 
to_kind; 1230 void *from_data, *to_data; 1231 1232 assert(0 <= how_many); 1233 assert(0 <= from_start); 1234 assert(0 <= to_start); 1235 assert(PyUnicode_Check(from)); 1236 assert(PyUnicode_IS_READY(from)); 1237 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from)); 1238 1239 assert(PyUnicode_Check(to)); 1240 assert(PyUnicode_IS_READY(to)); 1241 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to)); 1242 1243 if (how_many == 0) 1244 return 0; 1245 1246 from_kind = PyUnicode_KIND(from); 1247 from_data = PyUnicode_DATA(from); 1248 to_kind = PyUnicode_KIND(to); 1249 to_data = PyUnicode_DATA(to); 1250 1251#ifdef Py_DEBUG 1252 if (!check_maxchar 1253 && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to)) 1254 { 1255 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to); 1256 Py_UCS4 ch; 1257 Py_ssize_t i; 1258 for (i=0; i < how_many; i++) { 1259 ch = PyUnicode_READ(from_kind, from_data, from_start + i); 1260 assert(ch <= to_maxchar); 1261 } 1262 } 1263#endif 1264 1265 if (from_kind == to_kind) { 1266 if (check_maxchar 1267 && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)) 1268 { 1269 /* Writing Latin-1 characters into an ASCII string requires to 1270 check that all written characters are pure ASCII */ 1271 Py_UCS4 max_char; 1272 max_char = ucs1lib_find_max_char(from_data, 1273 (Py_UCS1*)from_data + how_many); 1274 if (max_char >= 128) 1275 return -1; 1276 } 1277 Py_MEMCPY((char*)to_data + to_kind * to_start, 1278 (char*)from_data + from_kind * from_start, 1279 to_kind * how_many); 1280 } 1281 else if (from_kind == PyUnicode_1BYTE_KIND 1282 && to_kind == PyUnicode_2BYTE_KIND) 1283 { 1284 _PyUnicode_CONVERT_BYTES( 1285 Py_UCS1, Py_UCS2, 1286 PyUnicode_1BYTE_DATA(from) + from_start, 1287 PyUnicode_1BYTE_DATA(from) + from_start + how_many, 1288 PyUnicode_2BYTE_DATA(to) + to_start 1289 ); 1290 } 1291 else if (from_kind == PyUnicode_1BYTE_KIND 1292 && to_kind == PyUnicode_4BYTE_KIND) 1293 { 1294 _PyUnicode_CONVERT_BYTES( 1295 Py_UCS1, Py_UCS4, 1296 
PyUnicode_1BYTE_DATA(from) + from_start, 1297 PyUnicode_1BYTE_DATA(from) + from_start + how_many, 1298 PyUnicode_4BYTE_DATA(to) + to_start 1299 ); 1300 } 1301 else if (from_kind == PyUnicode_2BYTE_KIND 1302 && to_kind == PyUnicode_4BYTE_KIND) 1303 { 1304 _PyUnicode_CONVERT_BYTES( 1305 Py_UCS2, Py_UCS4, 1306 PyUnicode_2BYTE_DATA(from) + from_start, 1307 PyUnicode_2BYTE_DATA(from) + from_start + how_many, 1308 PyUnicode_4BYTE_DATA(to) + to_start 1309 ); 1310 } 1311 else { 1312 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to)); 1313 1314 if (!check_maxchar) { 1315 if (from_kind == PyUnicode_2BYTE_KIND 1316 && to_kind == PyUnicode_1BYTE_KIND) 1317 { 1318 _PyUnicode_CONVERT_BYTES( 1319 Py_UCS2, Py_UCS1, 1320 PyUnicode_2BYTE_DATA(from) + from_start, 1321 PyUnicode_2BYTE_DATA(from) + from_start + how_many, 1322 PyUnicode_1BYTE_DATA(to) + to_start 1323 ); 1324 } 1325 else if (from_kind == PyUnicode_4BYTE_KIND 1326 && to_kind == PyUnicode_1BYTE_KIND) 1327 { 1328 _PyUnicode_CONVERT_BYTES( 1329 Py_UCS4, Py_UCS1, 1330 PyUnicode_4BYTE_DATA(from) + from_start, 1331 PyUnicode_4BYTE_DATA(from) + from_start + how_many, 1332 PyUnicode_1BYTE_DATA(to) + to_start 1333 ); 1334 } 1335 else if (from_kind == PyUnicode_4BYTE_KIND 1336 && to_kind == PyUnicode_2BYTE_KIND) 1337 { 1338 _PyUnicode_CONVERT_BYTES( 1339 Py_UCS4, Py_UCS2, 1340 PyUnicode_4BYTE_DATA(from) + from_start, 1341 PyUnicode_4BYTE_DATA(from) + from_start + how_many, 1342 PyUnicode_2BYTE_DATA(to) + to_start 1343 ); 1344 } 1345 else { 1346 assert(0); 1347 return -1; 1348 } 1349 } 1350 else { 1351 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to); 1352 Py_UCS4 ch; 1353 Py_ssize_t i; 1354 1355 for (i=0; i < how_many; i++) { 1356 ch = PyUnicode_READ(from_kind, from_data, from_start + i); 1357 if (ch > to_maxchar) 1358 return -1; 1359 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch); 1360 } 1361 } 1362 } 1363 return 0; 1364} 1365 1366void 1367_PyUnicode_FastCopyCharacters( 1368 PyObject *to, Py_ssize_t 
to_start, 1369 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many) 1370{ 1371 (void)_copy_characters(to, to_start, from, from_start, how_many, 0); 1372} 1373 1374Py_ssize_t 1375PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start, 1376 PyObject *from, Py_ssize_t from_start, 1377 Py_ssize_t how_many) 1378{ 1379 int err; 1380 1381 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) { 1382 PyErr_BadInternalCall(); 1383 return -1; 1384 } 1385 1386 if (PyUnicode_READY(from) == -1) 1387 return -1; 1388 if (PyUnicode_READY(to) == -1) 1389 return -1; 1390 1391 if (from_start < 0) { 1392 PyErr_SetString(PyExc_IndexError, "string index out of range"); 1393 return -1; 1394 } 1395 if (to_start < 0) { 1396 PyErr_SetString(PyExc_IndexError, "string index out of range"); 1397 return -1; 1398 } 1399 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many); 1400 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) { 1401 PyErr_Format(PyExc_SystemError, 1402 "Cannot write %zi characters at %zi " 1403 "in a string of %zi characters", 1404 how_many, to_start, PyUnicode_GET_LENGTH(to)); 1405 return -1; 1406 } 1407 1408 if (how_many == 0) 1409 return 0; 1410 1411 if (unicode_check_modifiable(to)) 1412 return -1; 1413 1414 err = _copy_characters(to, to_start, from, from_start, how_many, 1); 1415 if (err) { 1416 PyErr_Format(PyExc_SystemError, 1417 "Cannot copy %s characters " 1418 "into a string of %s characters", 1419 unicode_kind_name(from), 1420 unicode_kind_name(to)); 1421 return -1; 1422 } 1423 return how_many; 1424} 1425 1426/* Find the maximum code point and count the number of surrogate pairs so a 1427 correct string length can be computed before converting a string to UCS4. 1428 This function counts single surrogates as a character and not as a pair. 1429 1430 Return 0 on success, or -1 on error. 
*/ 1431static int 1432find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end, 1433 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates) 1434{ 1435 const wchar_t *iter; 1436 Py_UCS4 ch; 1437 1438 assert(num_surrogates != NULL && maxchar != NULL); 1439 *num_surrogates = 0; 1440 *maxchar = 0; 1441 1442 for (iter = begin; iter < end; ) { 1443#if SIZEOF_WCHAR_T == 2 1444 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0]) 1445 && (iter+1) < end 1446 && Py_UNICODE_IS_LOW_SURROGATE(iter[1])) 1447 { 1448 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]); 1449 ++(*num_surrogates); 1450 iter += 2; 1451 } 1452 else 1453#endif 1454 { 1455 ch = *iter; 1456 iter++; 1457 } 1458 if (ch > *maxchar) { 1459 *maxchar = ch; 1460 if (*maxchar > MAX_UNICODE) { 1461 PyErr_Format(PyExc_ValueError, 1462 "character U+%x is not in range [U+0000; U+10ffff]", 1463 ch); 1464 return -1; 1465 } 1466 } 1467 } 1468 return 0; 1469} 1470 1471int 1472_PyUnicode_Ready(PyObject *unicode) 1473{ 1474 wchar_t *end; 1475 Py_UCS4 maxchar = 0; 1476 Py_ssize_t num_surrogates; 1477#if SIZEOF_WCHAR_T == 2 1478 Py_ssize_t length_wo_surrogates; 1479#endif 1480 1481 /* _PyUnicode_Ready() is only intended for old-style API usage where 1482 strings were created using _PyObject_New() and where no canonical 1483 representation (the str field) has been set yet aka strings 1484 which are not yet ready. 
*/ 1485 assert(_PyUnicode_CHECK(unicode)); 1486 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND); 1487 assert(_PyUnicode_WSTR(unicode) != NULL); 1488 assert(_PyUnicode_DATA_ANY(unicode) == NULL); 1489 assert(_PyUnicode_UTF8(unicode) == NULL); 1490 /* Actually, it should neither be interned nor be anything else: */ 1491 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED); 1492 1493 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode); 1494 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end, 1495 &maxchar, &num_surrogates) == -1) 1496 return -1; 1497 1498 if (maxchar < 256) { 1499 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1); 1500 if (!_PyUnicode_DATA_ANY(unicode)) { 1501 PyErr_NoMemory(); 1502 return -1; 1503 } 1504 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char, 1505 _PyUnicode_WSTR(unicode), end, 1506 PyUnicode_1BYTE_DATA(unicode)); 1507 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0'; 1508 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode); 1509 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND; 1510 if (maxchar < 128) { 1511 _PyUnicode_STATE(unicode).ascii = 1; 1512 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode); 1513 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode); 1514 } 1515 else { 1516 _PyUnicode_STATE(unicode).ascii = 0; 1517 _PyUnicode_UTF8(unicode) = NULL; 1518 _PyUnicode_UTF8_LENGTH(unicode) = 0; 1519 } 1520 PyObject_FREE(_PyUnicode_WSTR(unicode)); 1521 _PyUnicode_WSTR(unicode) = NULL; 1522 _PyUnicode_WSTR_LENGTH(unicode) = 0; 1523 } 1524 /* In this case we might have to convert down from 4-byte native 1525 wchar_t to 2-byte unicode. */ 1526 else if (maxchar < 65536) { 1527 assert(num_surrogates == 0 && 1528 "FindMaxCharAndNumSurrogatePairs() messed up"); 1529 1530#if SIZEOF_WCHAR_T == 2 1531 /* We can share representations and are done. 
*/ 1532 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode); 1533 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0'; 1534 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode); 1535 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND; 1536 _PyUnicode_UTF8(unicode) = NULL; 1537 _PyUnicode_UTF8_LENGTH(unicode) = 0; 1538#else 1539 /* sizeof(wchar_t) == 4 */ 1540 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC( 1541 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1)); 1542 if (!_PyUnicode_DATA_ANY(unicode)) { 1543 PyErr_NoMemory(); 1544 return -1; 1545 } 1546 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2, 1547 _PyUnicode_WSTR(unicode), end, 1548 PyUnicode_2BYTE_DATA(unicode)); 1549 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0'; 1550 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode); 1551 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND; 1552 _PyUnicode_UTF8(unicode) = NULL; 1553 _PyUnicode_UTF8_LENGTH(unicode) = 0; 1554 PyObject_FREE(_PyUnicode_WSTR(unicode)); 1555 _PyUnicode_WSTR(unicode) = NULL; 1556 _PyUnicode_WSTR_LENGTH(unicode) = 0; 1557#endif 1558 } 1559 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */ 1560 else { 1561#if SIZEOF_WCHAR_T == 2 1562 /* in case the native representation is 2-bytes, we need to allocate a 1563 new normalized 4-byte version. 
*/ 1564 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates; 1565 if (length_wo_surrogates > PY_SSIZE_T_MAX / 4 - 1) { 1566 PyErr_NoMemory(); 1567 return -1; 1568 } 1569 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1)); 1570 if (!_PyUnicode_DATA_ANY(unicode)) { 1571 PyErr_NoMemory(); 1572 return -1; 1573 } 1574 _PyUnicode_LENGTH(unicode) = length_wo_surrogates; 1575 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND; 1576 _PyUnicode_UTF8(unicode) = NULL; 1577 _PyUnicode_UTF8_LENGTH(unicode) = 0; 1578 /* unicode_convert_wchar_to_ucs4() requires a ready string */ 1579 _PyUnicode_STATE(unicode).ready = 1; 1580 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode); 1581 PyObject_FREE(_PyUnicode_WSTR(unicode)); 1582 _PyUnicode_WSTR(unicode) = NULL; 1583 _PyUnicode_WSTR_LENGTH(unicode) = 0; 1584#else 1585 assert(num_surrogates == 0); 1586 1587 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode); 1588 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode); 1589 _PyUnicode_UTF8(unicode) = NULL; 1590 _PyUnicode_UTF8_LENGTH(unicode) = 0; 1591 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND; 1592#endif 1593 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0'; 1594 } 1595 _PyUnicode_STATE(unicode).ready = 1; 1596 assert(_PyUnicode_CheckConsistency(unicode, 1)); 1597 return 0; 1598} 1599 1600static void 1601unicode_dealloc(PyObject *unicode) 1602{ 1603 switch (PyUnicode_CHECK_INTERNED(unicode)) { 1604 case SSTATE_NOT_INTERNED: 1605 break; 1606 1607 case SSTATE_INTERNED_MORTAL: 1608 /* revive dead object temporarily for DelItem */ 1609 Py_REFCNT(unicode) = 3; 1610 if (PyDict_DelItem(interned, unicode) != 0) 1611 Py_FatalError( 1612 "deletion of interned string failed"); 1613 break; 1614 1615 case SSTATE_INTERNED_IMMORTAL: 1616 Py_FatalError("Immortal interned string died."); 1617 1618 default: 1619 Py_FatalError("Inconsistent interned string state."); 1620 } 1621 1622 if 
(_PyUnicode_HAS_WSTR_MEMORY(unicode)) 1623 PyObject_DEL(_PyUnicode_WSTR(unicode)); 1624 if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) 1625 PyObject_DEL(_PyUnicode_UTF8(unicode)); 1626 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode)) 1627 PyObject_DEL(_PyUnicode_DATA_ANY(unicode)); 1628 1629 Py_TYPE(unicode)->tp_free(unicode); 1630} 1631 1632#ifdef Py_DEBUG 1633static int 1634unicode_is_singleton(PyObject *unicode) 1635{ 1636 PyASCIIObject *ascii = (PyASCIIObject *)unicode; 1637 if (unicode == unicode_empty) 1638 return 1; 1639 if (ascii->state.kind != PyUnicode_WCHAR_KIND && ascii->length == 1) 1640 { 1641 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0); 1642 if (ch < 256 && unicode_latin1[ch] == unicode) 1643 return 1; 1644 } 1645 return 0; 1646} 1647#endif 1648 1649static int 1650unicode_modifiable(PyObject *unicode) 1651{ 1652 assert(_PyUnicode_CHECK(unicode)); 1653 if (Py_REFCNT(unicode) != 1) 1654 return 0; 1655 if (_PyUnicode_HASH(unicode) != -1) 1656 return 0; 1657 if (PyUnicode_CHECK_INTERNED(unicode)) 1658 return 0; 1659 if (!PyUnicode_CheckExact(unicode)) 1660 return 0; 1661#ifdef Py_DEBUG 1662 /* singleton refcount is greater than 1 */ 1663 assert(!unicode_is_singleton(unicode)); 1664#endif 1665 return 1; 1666} 1667 1668static int 1669unicode_resize(PyObject **p_unicode, Py_ssize_t length) 1670{ 1671 PyObject *unicode; 1672 Py_ssize_t old_length; 1673 1674 assert(p_unicode != NULL); 1675 unicode = *p_unicode; 1676 1677 assert(unicode != NULL); 1678 assert(PyUnicode_Check(unicode)); 1679 assert(0 <= length); 1680 1681 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND) 1682 old_length = PyUnicode_WSTR_LENGTH(unicode); 1683 else 1684 old_length = PyUnicode_GET_LENGTH(unicode); 1685 if (old_length == length) 1686 return 0; 1687 1688 if (length == 0) { 1689 _Py_INCREF_UNICODE_EMPTY(); 1690 if (!unicode_empty) 1691 return -1; 1692 Py_DECREF(*p_unicode); 1693 *p_unicode = unicode_empty; 1694 return 0; 1695 } 1696 1697 if 
(!unicode_modifiable(unicode)) { 1698 PyObject *copy = resize_copy(unicode, length); 1699 if (copy == NULL) 1700 return -1; 1701 Py_DECREF(*p_unicode); 1702 *p_unicode = copy; 1703 return 0; 1704 } 1705 1706 if (PyUnicode_IS_COMPACT(unicode)) { 1707 PyObject *new_unicode = resize_compact(unicode, length); 1708 if (new_unicode == NULL) 1709 return -1; 1710 *p_unicode = new_unicode; 1711 return 0; 1712 } 1713 return resize_inplace(unicode, length); 1714} 1715 1716int 1717PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length) 1718{ 1719 PyObject *unicode; 1720 if (p_unicode == NULL) { 1721 PyErr_BadInternalCall(); 1722 return -1; 1723 } 1724 unicode = *p_unicode; 1725 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0) 1726 { 1727 PyErr_BadInternalCall(); 1728 return -1; 1729 } 1730 return unicode_resize(p_unicode, length); 1731} 1732 1733/* Copy a ASCII or latin1 char* string into a Python Unicode string. 1734 1735 WARNING: The function doesn't copy the terminating null character and 1736 doesn't check the maximum character (may write a latin1 character in an 1737 ASCII string). 
*/ 1738static void 1739unicode_write_cstr(PyObject *unicode, Py_ssize_t index, 1740 const char *str, Py_ssize_t len) 1741{ 1742 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode); 1743 void *data = PyUnicode_DATA(unicode); 1744 const char *end = str + len; 1745 1746 switch (kind) { 1747 case PyUnicode_1BYTE_KIND: { 1748 assert(index + len <= PyUnicode_GET_LENGTH(unicode)); 1749#ifdef Py_DEBUG 1750 if (PyUnicode_IS_ASCII(unicode)) { 1751 Py_UCS4 maxchar = ucs1lib_find_max_char( 1752 (const Py_UCS1*)str, 1753 (const Py_UCS1*)str + len); 1754 assert(maxchar < 128); 1755 } 1756#endif 1757 memcpy((char *) data + index, str, len); 1758 break; 1759 } 1760 case PyUnicode_2BYTE_KIND: { 1761 Py_UCS2 *start = (Py_UCS2 *)data + index; 1762 Py_UCS2 *ucs2 = start; 1763 assert(index <= PyUnicode_GET_LENGTH(unicode)); 1764 1765 for (; str < end; ++ucs2, ++str) 1766 *ucs2 = (Py_UCS2)*str; 1767 1768 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode)); 1769 break; 1770 } 1771 default: { 1772 Py_UCS4 *start = (Py_UCS4 *)data + index; 1773 Py_UCS4 *ucs4 = start; 1774 assert(kind == PyUnicode_4BYTE_KIND); 1775 assert(index <= PyUnicode_GET_LENGTH(unicode)); 1776 1777 for (; str < end; ++ucs4, ++str) 1778 *ucs4 = (Py_UCS4)*str; 1779 1780 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode)); 1781 } 1782 } 1783} 1784 1785static PyObject* 1786get_latin1_char(unsigned char ch) 1787{ 1788 PyObject *unicode = unicode_latin1[ch]; 1789 if (!unicode) { 1790 unicode = PyUnicode_New(1, ch); 1791 if (!unicode) 1792 return NULL; 1793 PyUnicode_1BYTE_DATA(unicode)[0] = ch; 1794 assert(_PyUnicode_CheckConsistency(unicode, 1)); 1795 unicode_latin1[ch] = unicode; 1796 } 1797 Py_INCREF(unicode); 1798 return unicode; 1799} 1800 1801static PyObject* 1802unicode_char(Py_UCS4 ch) 1803{ 1804 PyObject *unicode; 1805 1806 assert(ch <= MAX_UNICODE); 1807 1808 if (ch < 256) 1809 return get_latin1_char(ch); 1810 1811 unicode = PyUnicode_New(1, ch); 1812 if (unicode == NULL) 1813 return NULL; 1814 switch 
(PyUnicode_KIND(unicode)) { 1815 case PyUnicode_1BYTE_KIND: 1816 PyUnicode_1BYTE_DATA(unicode)[0] = (Py_UCS1)ch; 1817 break; 1818 case PyUnicode_2BYTE_KIND: 1819 PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch; 1820 break; 1821 default: 1822 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND); 1823 PyUnicode_4BYTE_DATA(unicode)[0] = ch; 1824 } 1825 assert(_PyUnicode_CheckConsistency(unicode, 1)); 1826 return unicode; 1827} 1828 1829PyObject * 1830PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size) 1831{ 1832 PyObject *unicode; 1833 Py_UCS4 maxchar = 0; 1834 Py_ssize_t num_surrogates; 1835 1836 if (u == NULL) 1837 return (PyObject*)_PyUnicode_New(size); 1838 1839 /* If the Unicode data is known at construction time, we can apply 1840 some optimizations which share commonly used objects. */ 1841 1842 /* Optimization for empty strings */ 1843 if (size == 0) 1844 _Py_RETURN_UNICODE_EMPTY(); 1845 1846 /* Single character Unicode objects in the Latin-1 range are 1847 shared when using this constructor */ 1848 if (size == 1 && (Py_UCS4)*u < 256) 1849 return get_latin1_char((unsigned char)*u); 1850 1851 /* If not empty and not single character, copy the Unicode data 1852 into the new object */ 1853 if (find_maxchar_surrogates(u, u + size, 1854 &maxchar, &num_surrogates) == -1) 1855 return NULL; 1856 1857 unicode = PyUnicode_New(size - num_surrogates, maxchar); 1858 if (!unicode) 1859 return NULL; 1860 1861 switch (PyUnicode_KIND(unicode)) { 1862 case PyUnicode_1BYTE_KIND: 1863 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char, 1864 u, u + size, PyUnicode_1BYTE_DATA(unicode)); 1865 break; 1866 case PyUnicode_2BYTE_KIND: 1867#if Py_UNICODE_SIZE == 2 1868 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2); 1869#else 1870 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2, 1871 u, u + size, PyUnicode_2BYTE_DATA(unicode)); 1872#endif 1873 break; 1874 case PyUnicode_4BYTE_KIND: 1875#if SIZEOF_WCHAR_T == 2 1876 /* This is the only case which has to process surrogates, 
thus 1877 a simple copy loop is not enough and we need a function. */ 1878 unicode_convert_wchar_to_ucs4(u, u + size, unicode); 1879#else 1880 assert(num_surrogates == 0); 1881 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4); 1882#endif 1883 break; 1884 default: 1885 assert(0 && "Impossible state"); 1886 } 1887 1888 return unicode_result(unicode); 1889} 1890 1891PyObject * 1892PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size) 1893{ 1894 if (size < 0) { 1895 PyErr_SetString(PyExc_SystemError, 1896 "Negative size passed to PyUnicode_FromStringAndSize"); 1897 return NULL; 1898 } 1899 if (u != NULL) 1900 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL); 1901 else 1902 return (PyObject *)_PyUnicode_New(size); 1903} 1904 1905PyObject * 1906PyUnicode_FromString(const char *u) 1907{ 1908 size_t size = strlen(u); 1909 if (size > PY_SSIZE_T_MAX) { 1910 PyErr_SetString(PyExc_OverflowError, "input too long"); 1911 return NULL; 1912 } 1913 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL); 1914} 1915 1916PyObject * 1917_PyUnicode_FromId(_Py_Identifier *id) 1918{ 1919 if (!id->object) { 1920 id->object = PyUnicode_DecodeUTF8Stateful(id->string, 1921 strlen(id->string), 1922 NULL, NULL); 1923 if (!id->object) 1924 return NULL; 1925 PyUnicode_InternInPlace(&id->object); 1926 assert(!id->next); 1927 id->next = static_strings; 1928 static_strings = id; 1929 } 1930 return id->object; 1931} 1932 1933void 1934_PyUnicode_ClearStaticStrings() 1935{ 1936 _Py_Identifier *tmp, *s = static_strings; 1937 while (s) { 1938 Py_CLEAR(s->object); 1939 tmp = s->next; 1940 s->next = NULL; 1941 s = tmp; 1942 } 1943 static_strings = NULL; 1944} 1945 1946/* Internal function, doesn't check maximum character */ 1947 1948PyObject* 1949_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size) 1950{ 1951 const unsigned char *s = (const unsigned char *)buffer; 1952 PyObject *unicode; 1953 if (size == 1) { 1954#ifdef Py_DEBUG 1955 assert((unsigned char)s[0] < 128); 
1956#endif 1957 return get_latin1_char(s[0]); 1958 } 1959 unicode = PyUnicode_New(size, 127); 1960 if (!unicode) 1961 return NULL; 1962 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size); 1963 assert(_PyUnicode_CheckConsistency(unicode, 1)); 1964 return unicode; 1965} 1966 1967static Py_UCS4 1968kind_maxchar_limit(unsigned int kind) 1969{ 1970 switch (kind) { 1971 case PyUnicode_1BYTE_KIND: 1972 return 0x80; 1973 case PyUnicode_2BYTE_KIND: 1974 return 0x100; 1975 case PyUnicode_4BYTE_KIND: 1976 return 0x10000; 1977 default: 1978 assert(0 && "invalid kind"); 1979 return MAX_UNICODE; 1980 } 1981} 1982 1983Py_LOCAL_INLINE(Py_UCS4) 1984align_maxchar(Py_UCS4 maxchar) 1985{ 1986 if (maxchar <= 127) 1987 return 127; 1988 else if (maxchar <= 255) 1989 return 255; 1990 else if (maxchar <= 65535) 1991 return 65535; 1992 else 1993 return MAX_UNICODE; 1994} 1995 1996static PyObject* 1997_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size) 1998{ 1999 PyObject *res; 2000 unsigned char max_char; 2001 2002 if (size == 0) 2003 _Py_RETURN_UNICODE_EMPTY(); 2004 assert(size > 0); 2005 if (size == 1) 2006 return get_latin1_char(u[0]); 2007 2008 max_char = ucs1lib_find_max_char(u, u + size); 2009 res = PyUnicode_New(size, max_char); 2010 if (!res) 2011 return NULL; 2012 memcpy(PyUnicode_1BYTE_DATA(res), u, size); 2013 assert(_PyUnicode_CheckConsistency(res, 1)); 2014 return res; 2015} 2016 2017static PyObject* 2018_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size) 2019{ 2020 PyObject *res; 2021 Py_UCS2 max_char; 2022 2023 if (size == 0) 2024 _Py_RETURN_UNICODE_EMPTY(); 2025 assert(size > 0); 2026 if (size == 1) 2027 return unicode_char(u[0]); 2028 2029 max_char = ucs2lib_find_max_char(u, u + size); 2030 res = PyUnicode_New(size, max_char); 2031 if (!res) 2032 return NULL; 2033 if (max_char >= 256) 2034 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size); 2035 else { 2036 _PyUnicode_CONVERT_BYTES( 2037 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res)); 2038 } 2039 
assert(_PyUnicode_CheckConsistency(res, 1)); 2040 return res; 2041} 2042 2043static PyObject* 2044_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size) 2045{ 2046 PyObject *res; 2047 Py_UCS4 max_char; 2048 2049 if (size == 0) 2050 _Py_RETURN_UNICODE_EMPTY(); 2051 assert(size > 0); 2052 if (size == 1) 2053 return unicode_char(u[0]); 2054 2055 max_char = ucs4lib_find_max_char(u, u + size); 2056 res = PyUnicode_New(size, max_char); 2057 if (!res) 2058 return NULL; 2059 if (max_char < 256) 2060 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size, 2061 PyUnicode_1BYTE_DATA(res)); 2062 else if (max_char < 0x10000) 2063 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size, 2064 PyUnicode_2BYTE_DATA(res)); 2065 else 2066 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size); 2067 assert(_PyUnicode_CheckConsistency(res, 1)); 2068 return res; 2069} 2070 2071PyObject* 2072PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size) 2073{ 2074 if (size < 0) { 2075 PyErr_SetString(PyExc_ValueError, "size must be positive"); 2076 return NULL; 2077 } 2078 switch (kind) { 2079 case PyUnicode_1BYTE_KIND: 2080 return _PyUnicode_FromUCS1(buffer, size); 2081 case PyUnicode_2BYTE_KIND: 2082 return _PyUnicode_FromUCS2(buffer, size); 2083 case PyUnicode_4BYTE_KIND: 2084 return _PyUnicode_FromUCS4(buffer, size); 2085 default: 2086 PyErr_SetString(PyExc_SystemError, "invalid kind"); 2087 return NULL; 2088 } 2089} 2090 2091Py_UCS4 2092_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end) 2093{ 2094 enum PyUnicode_Kind kind; 2095 void *startptr, *endptr; 2096 2097 assert(PyUnicode_IS_READY(unicode)); 2098 assert(0 <= start); 2099 assert(end <= PyUnicode_GET_LENGTH(unicode)); 2100 assert(start <= end); 2101 2102 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode)) 2103 return PyUnicode_MAX_CHAR_VALUE(unicode); 2104 2105 if (start == end) 2106 return 127; 2107 2108 if (PyUnicode_IS_ASCII(unicode)) 2109 return 127; 2110 2111 kind = 
PyUnicode_KIND(unicode); 2112 startptr = PyUnicode_DATA(unicode); 2113 endptr = (char *)startptr + end * kind; 2114 startptr = (char *)startptr + start * kind; 2115 switch(kind) { 2116 case PyUnicode_1BYTE_KIND: 2117 return ucs1lib_find_max_char(startptr, endptr); 2118 case PyUnicode_2BYTE_KIND: 2119 return ucs2lib_find_max_char(startptr, endptr); 2120 case PyUnicode_4BYTE_KIND: 2121 return ucs4lib_find_max_char(startptr, endptr); 2122 default: 2123 assert(0); 2124 return 0; 2125 } 2126} 2127 2128/* Ensure that a string uses the most efficient storage, if it is not the 2129 case: create a new string with of the right kind. Write NULL into *p_unicode 2130 on error. */ 2131static void 2132unicode_adjust_maxchar(PyObject **p_unicode) 2133{ 2134 PyObject *unicode, *copy; 2135 Py_UCS4 max_char; 2136 Py_ssize_t len; 2137 unsigned int kind; 2138 2139 assert(p_unicode != NULL); 2140 unicode = *p_unicode; 2141 assert(PyUnicode_IS_READY(unicode)); 2142 if (PyUnicode_IS_ASCII(unicode)) 2143 return; 2144 2145 len = PyUnicode_GET_LENGTH(unicode); 2146 kind = PyUnicode_KIND(unicode); 2147 if (kind == PyUnicode_1BYTE_KIND) { 2148 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode); 2149 max_char = ucs1lib_find_max_char(u, u + len); 2150 if (max_char >= 128) 2151 return; 2152 } 2153 else if (kind == PyUnicode_2BYTE_KIND) { 2154 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode); 2155 max_char = ucs2lib_find_max_char(u, u + len); 2156 if (max_char >= 256) 2157 return; 2158 } 2159 else { 2160 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode); 2161 assert(kind == PyUnicode_4BYTE_KIND); 2162 max_char = ucs4lib_find_max_char(u, u + len); 2163 if (max_char >= 0x10000) 2164 return; 2165 } 2166 copy = PyUnicode_New(len, max_char); 2167 if (copy != NULL) 2168 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len); 2169 Py_DECREF(unicode); 2170 *p_unicode = copy; 2171} 2172 2173PyObject* 2174_PyUnicode_Copy(PyObject *unicode) 2175{ 2176 Py_ssize_t length; 2177 PyObject *copy; 2178 2179 if 
(!PyUnicode_Check(unicode)) { 2180 PyErr_BadInternalCall(); 2181 return NULL; 2182 } 2183 if (PyUnicode_READY(unicode) == -1) 2184 return NULL; 2185 2186 length = PyUnicode_GET_LENGTH(unicode); 2187 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode)); 2188 if (!copy) 2189 return NULL; 2190 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode)); 2191 2192 Py_MEMCPY(PyUnicode_DATA(copy), PyUnicode_DATA(unicode), 2193 length * PyUnicode_KIND(unicode)); 2194 assert(_PyUnicode_CheckConsistency(copy, 1)); 2195 return copy; 2196} 2197 2198 2199/* Widen Unicode objects to larger buffers. Don't write terminating null 2200 character. Return NULL on error. */ 2201 2202void* 2203_PyUnicode_AsKind(PyObject *s, unsigned int kind) 2204{ 2205 Py_ssize_t len; 2206 void *result; 2207 unsigned int skind; 2208 2209 if (PyUnicode_READY(s) == -1) 2210 return NULL; 2211 2212 len = PyUnicode_GET_LENGTH(s); 2213 skind = PyUnicode_KIND(s); 2214 if (skind >= kind) { 2215 PyErr_SetString(PyExc_SystemError, "invalid widening attempt"); 2216 return NULL; 2217 } 2218 switch (kind) { 2219 case PyUnicode_2BYTE_KIND: 2220 result = PyMem_New(Py_UCS2, len); 2221 if (!result) 2222 return PyErr_NoMemory(); 2223 assert(skind == PyUnicode_1BYTE_KIND); 2224 _PyUnicode_CONVERT_BYTES( 2225 Py_UCS1, Py_UCS2, 2226 PyUnicode_1BYTE_DATA(s), 2227 PyUnicode_1BYTE_DATA(s) + len, 2228 result); 2229 return result; 2230 case PyUnicode_4BYTE_KIND: 2231 result = PyMem_New(Py_UCS4, len); 2232 if (!result) 2233 return PyErr_NoMemory(); 2234 if (skind == PyUnicode_2BYTE_KIND) { 2235 _PyUnicode_CONVERT_BYTES( 2236 Py_UCS2, Py_UCS4, 2237 PyUnicode_2BYTE_DATA(s), 2238 PyUnicode_2BYTE_DATA(s) + len, 2239 result); 2240 } 2241 else { 2242 assert(skind == PyUnicode_1BYTE_KIND); 2243 _PyUnicode_CONVERT_BYTES( 2244 Py_UCS1, Py_UCS4, 2245 PyUnicode_1BYTE_DATA(s), 2246 PyUnicode_1BYTE_DATA(s) + len, 2247 result); 2248 } 2249 return result; 2250 default: 2251 break; 2252 } 2253 PyErr_SetString(PyExc_SystemError, "invalid 
kind"); 2254 return NULL; 2255} 2256 2257static Py_UCS4* 2258as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize, 2259 int copy_null) 2260{ 2261 int kind; 2262 void *data; 2263 Py_ssize_t len, targetlen; 2264 if (PyUnicode_READY(string) == -1) 2265 return NULL; 2266 kind = PyUnicode_KIND(string); 2267 data = PyUnicode_DATA(string); 2268 len = PyUnicode_GET_LENGTH(string); 2269 targetlen = len; 2270 if (copy_null) 2271 targetlen++; 2272 if (!target) { 2273 target = PyMem_New(Py_UCS4, targetlen); 2274 if (!target) { 2275 PyErr_NoMemory(); 2276 return NULL; 2277 } 2278 } 2279 else { 2280 if (targetsize < targetlen) { 2281 PyErr_Format(PyExc_SystemError, 2282 "string is longer than the buffer"); 2283 if (copy_null && 0 < targetsize) 2284 target[0] = 0; 2285 return NULL; 2286 } 2287 } 2288 if (kind == PyUnicode_1BYTE_KIND) { 2289 Py_UCS1 *start = (Py_UCS1 *) data; 2290 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target); 2291 } 2292 else if (kind == PyUnicode_2BYTE_KIND) { 2293 Py_UCS2 *start = (Py_UCS2 *) data; 2294 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target); 2295 } 2296 else { 2297 assert(kind == PyUnicode_4BYTE_KIND); 2298 Py_MEMCPY(target, data, len * sizeof(Py_UCS4)); 2299 } 2300 if (copy_null) 2301 target[len] = 0; 2302 return target; 2303} 2304 2305Py_UCS4* 2306PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize, 2307 int copy_null) 2308{ 2309 if (target == NULL || targetsize < 0) { 2310 PyErr_BadInternalCall(); 2311 return NULL; 2312 } 2313 return as_ucs4(string, target, targetsize, copy_null); 2314} 2315 2316Py_UCS4* 2317PyUnicode_AsUCS4Copy(PyObject *string) 2318{ 2319 return as_ucs4(string, NULL, 0, 1); 2320} 2321 2322#ifdef HAVE_WCHAR_H 2323 2324PyObject * 2325PyUnicode_FromWideChar(const wchar_t *w, Py_ssize_t size) 2326{ 2327 if (w == NULL) { 2328 if (size == 0) 2329 _Py_RETURN_UNICODE_EMPTY(); 2330 PyErr_BadInternalCall(); 2331 return NULL; 2332 } 2333 2334 if (size == -1) { 
2335 size = wcslen(w); 2336 } 2337 2338 return PyUnicode_FromUnicode(w, size); 2339} 2340 2341#endif /* HAVE_WCHAR_H */ 2342 2343/* maximum number of characters required for output of %lld or %p. 2344 We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits, 2345 plus 1 for the sign. 53/22 is an upper bound for log10(256). */ 2346#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22) 2347 2348static int 2349unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str, 2350 Py_ssize_t width, Py_ssize_t precision) 2351{ 2352 Py_ssize_t length, fill, arglen; 2353 Py_UCS4 maxchar; 2354 2355 if (PyUnicode_READY(str) == -1) 2356 return -1; 2357 2358 length = PyUnicode_GET_LENGTH(str); 2359 if ((precision == -1 || precision >= length) 2360 && width <= length) 2361 return _PyUnicodeWriter_WriteStr(writer, str); 2362 2363 if (precision != -1) 2364 length = Py_MIN(precision, length); 2365 2366 arglen = Py_MAX(length, width); 2367 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar) 2368 maxchar = _PyUnicode_FindMaxChar(str, 0, length); 2369 else 2370 maxchar = writer->maxchar; 2371 2372 if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1) 2373 return -1; 2374 2375 if (width > length) { 2376 fill = width - length; 2377 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1) 2378 return -1; 2379 writer->pos += fill; 2380 } 2381 2382 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos, 2383 str, 0, length); 2384 writer->pos += length; 2385 return 0; 2386} 2387 2388static int 2389unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str, 2390 Py_ssize_t width, Py_ssize_t precision) 2391{ 2392 /* UTF-8 */ 2393 Py_ssize_t length; 2394 PyObject *unicode; 2395 int res; 2396 2397 length = strlen(str); 2398 if (precision != -1) 2399 length = Py_MIN(length, precision); 2400 unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL); 2401 if (unicode == NULL) 2402 return -1; 2403 2404 res = 
unicode_fromformat_write_str(writer, unicode, width, -1); 2405 Py_DECREF(unicode); 2406 return res; 2407} 2408 2409static const char* 2410unicode_fromformat_arg(_PyUnicodeWriter *writer, 2411 const char *f, va_list *vargs) 2412{ 2413 const char *p; 2414 Py_ssize_t len; 2415 int zeropad; 2416 Py_ssize_t width; 2417 Py_ssize_t precision; 2418 int longflag; 2419 int longlongflag; 2420 int size_tflag; 2421 Py_ssize_t fill; 2422 2423 p = f; 2424 f++; 2425 zeropad = 0; 2426 if (*f == '0') { 2427 zeropad = 1; 2428 f++; 2429 } 2430 2431 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */ 2432 width = -1; 2433 if (Py_ISDIGIT((unsigned)*f)) { 2434 width = *f - '0'; 2435 f++; 2436 while (Py_ISDIGIT((unsigned)*f)) { 2437 if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) { 2438 PyErr_SetString(PyExc_ValueError, 2439 "width too big"); 2440 return NULL; 2441 } 2442 width = (width * 10) + (*f - '0'); 2443 f++; 2444 } 2445 } 2446 precision = -1; 2447 if (*f == '.') { 2448 f++; 2449 if (Py_ISDIGIT((unsigned)*f)) { 2450 precision = (*f - '0'); 2451 f++; 2452 while (Py_ISDIGIT((unsigned)*f)) { 2453 if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) { 2454 PyErr_SetString(PyExc_ValueError, 2455 "precision too big"); 2456 return NULL; 2457 } 2458 precision = (precision * 10) + (*f - '0'); 2459 f++; 2460 } 2461 } 2462 if (*f == '%') { 2463 /* "%.3%s" => f points to "3" */ 2464 f--; 2465 } 2466 } 2467 if (*f == '\0') { 2468 /* bogus format "%.123" => go backward, f points to "3" */ 2469 f--; 2470 } 2471 2472 /* Handle %ld, %lu, %lld and %llu. */ 2473 longflag = 0; 2474 longlongflag = 0; 2475 size_tflag = 0; 2476 if (*f == 'l') { 2477 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') { 2478 longflag = 1; 2479 ++f; 2480 } 2481#ifdef HAVE_LONG_LONG 2482 else if (f[1] == 'l' && 2483 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) { 2484 longlongflag = 1; 2485 f += 2; 2486 } 2487#endif 2488 } 2489 /* handle the size_t flag. 
*/ 2490 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) { 2491 size_tflag = 1; 2492 ++f; 2493 } 2494 2495 if (f[1] == '\0') 2496 writer->overallocate = 0; 2497 2498 switch (*f) { 2499 case 'c': 2500 { 2501 int ordinal = va_arg(*vargs, int); 2502 if (ordinal < 0 || ordinal > MAX_UNICODE) { 2503 PyErr_SetString(PyExc_OverflowError, 2504 "character argument not in range(0x110000)"); 2505 return NULL; 2506 } 2507 if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0) 2508 return NULL; 2509 break; 2510 } 2511 2512 case 'i': 2513 case 'd': 2514 case 'u': 2515 case 'x': 2516 { 2517 /* used by sprintf */ 2518 char buffer[MAX_LONG_LONG_CHARS]; 2519 Py_ssize_t arglen; 2520 2521 if (*f == 'u') { 2522 if (longflag) 2523 len = sprintf(buffer, "%lu", 2524 va_arg(*vargs, unsigned long)); 2525#ifdef HAVE_LONG_LONG 2526 else if (longlongflag) 2527 len = sprintf(buffer, "%" PY_FORMAT_LONG_LONG "u", 2528 va_arg(*vargs, unsigned PY_LONG_LONG)); 2529#endif 2530 else if (size_tflag) 2531 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "u", 2532 va_arg(*vargs, size_t)); 2533 else 2534 len = sprintf(buffer, "%u", 2535 va_arg(*vargs, unsigned int)); 2536 } 2537 else if (*f == 'x') { 2538 len = sprintf(buffer, "%x", va_arg(*vargs, int)); 2539 } 2540 else { 2541 if (longflag) 2542 len = sprintf(buffer, "%li", 2543 va_arg(*vargs, long)); 2544#ifdef HAVE_LONG_LONG 2545 else if (longlongflag) 2546 len = sprintf(buffer, "%" PY_FORMAT_LONG_LONG "i", 2547 va_arg(*vargs, PY_LONG_LONG)); 2548#endif 2549 else if (size_tflag) 2550 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "i", 2551 va_arg(*vargs, Py_ssize_t)); 2552 else 2553 len = sprintf(buffer, "%i", 2554 va_arg(*vargs, int)); 2555 } 2556 assert(len >= 0); 2557 2558 if (precision < len) 2559 precision = len; 2560 2561 arglen = Py_MAX(precision, width); 2562 if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1) 2563 return NULL; 2564 2565 if (width > precision) { 2566 Py_UCS4 fillchar; 2567 fill = width - precision; 2568 
fillchar = zeropad?'0':' '; 2569 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1) 2570 return NULL; 2571 writer->pos += fill; 2572 } 2573 if (precision > len) { 2574 fill = precision - len; 2575 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1) 2576 return NULL; 2577 writer->pos += fill; 2578 } 2579 2580 if (_PyUnicodeWriter_WriteASCIIString(writer, buffer, len) < 0) 2581 return NULL; 2582 break; 2583 } 2584 2585 case 'p': 2586 { 2587 char number[MAX_LONG_LONG_CHARS]; 2588 2589 len = sprintf(number, "%p", va_arg(*vargs, void*)); 2590 assert(len >= 0); 2591 2592 /* %p is ill-defined: ensure leading 0x. */ 2593 if (number[1] == 'X') 2594 number[1] = 'x'; 2595 else if (number[1] != 'x') { 2596 memmove(number + 2, number, 2597 strlen(number) + 1); 2598 number[0] = '0'; 2599 number[1] = 'x'; 2600 len += 2; 2601 } 2602 2603 if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0) 2604 return NULL; 2605 break; 2606 } 2607 2608 case 's': 2609 { 2610 /* UTF-8 */ 2611 const char *s = va_arg(*vargs, const char*); 2612 if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0) 2613 return NULL; 2614 break; 2615 } 2616 2617 case 'U': 2618 { 2619 PyObject *obj = va_arg(*vargs, PyObject *); 2620 assert(obj && _PyUnicode_CHECK(obj)); 2621 2622 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1) 2623 return NULL; 2624 break; 2625 } 2626 2627 case 'V': 2628 { 2629 PyObject *obj = va_arg(*vargs, PyObject *); 2630 const char *str = va_arg(*vargs, const char *); 2631 if (obj) { 2632 assert(_PyUnicode_CHECK(obj)); 2633 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1) 2634 return NULL; 2635 } 2636 else { 2637 assert(str != NULL); 2638 if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0) 2639 return NULL; 2640 } 2641 break; 2642 } 2643 2644 case 'S': 2645 { 2646 PyObject *obj = va_arg(*vargs, PyObject *); 2647 PyObject *str; 2648 assert(obj); 2649 str = PyObject_Str(obj); 2650 
if (!str) 2651 return NULL; 2652 if (unicode_fromformat_write_str(writer, str, width, precision) == -1) { 2653 Py_DECREF(str); 2654 return NULL; 2655 } 2656 Py_DECREF(str); 2657 break; 2658 } 2659 2660 case 'R': 2661 { 2662 PyObject *obj = va_arg(*vargs, PyObject *); 2663 PyObject *repr; 2664 assert(obj); 2665 repr = PyObject_Repr(obj); 2666 if (!repr) 2667 return NULL; 2668 if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) { 2669 Py_DECREF(repr); 2670 return NULL; 2671 } 2672 Py_DECREF(repr); 2673 break; 2674 } 2675 2676 case 'A': 2677 { 2678 PyObject *obj = va_arg(*vargs, PyObject *); 2679 PyObject *ascii; 2680 assert(obj); 2681 ascii = PyObject_ASCII(obj); 2682 if (!ascii) 2683 return NULL; 2684 if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) { 2685 Py_DECREF(ascii); 2686 return NULL; 2687 } 2688 Py_DECREF(ascii); 2689 break; 2690 } 2691 2692 case '%': 2693 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0) 2694 return NULL; 2695 break; 2696 2697 default: 2698 /* if we stumble upon an unknown formatting code, copy the rest 2699 of the format string to the output string. (we cannot just 2700 skip the code, since there's no way to know what's in the 2701 argument list) */ 2702 len = strlen(p); 2703 if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1) 2704 return NULL; 2705 f = p+len; 2706 return f; 2707 } 2708 2709 f++; 2710 return f; 2711} 2712 2713PyObject * 2714PyUnicode_FromFormatV(const char *format, va_list vargs) 2715{ 2716 va_list vargs2; 2717 const char *f; 2718 _PyUnicodeWriter writer; 2719 2720 _PyUnicodeWriter_Init(&writer); 2721 writer.min_length = strlen(format) + 100; 2722 writer.overallocate = 1; 2723 2724 /* va_list may be an array (of 1 item) on some platforms (ex: AMD64). 2725 Copy it to be able to pass a reference to a subfunction. 
*/ 2726 Py_VA_COPY(vargs2, vargs); 2727 2728 for (f = format; *f; ) { 2729 if (*f == '%') { 2730 f = unicode_fromformat_arg(&writer, f, &vargs2); 2731 if (f == NULL) 2732 goto fail; 2733 } 2734 else { 2735 const char *p; 2736 Py_ssize_t len; 2737 2738 p = f; 2739 do 2740 { 2741 if ((unsigned char)*p > 127) { 2742 PyErr_Format(PyExc_ValueError, 2743 "PyUnicode_FromFormatV() expects an ASCII-encoded format " 2744 "string, got a non-ASCII byte: 0x%02x", 2745 (unsigned char)*p); 2746 return NULL; 2747 } 2748 p++; 2749 } 2750 while (*p != '\0' && *p != '%'); 2751 len = p - f; 2752 2753 if (*p == '\0') 2754 writer.overallocate = 0; 2755 2756 if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0) 2757 goto fail; 2758 2759 f = p; 2760 } 2761 } 2762 return _PyUnicodeWriter_Finish(&writer); 2763 2764 fail: 2765 _PyUnicodeWriter_Dealloc(&writer); 2766 return NULL; 2767} 2768 2769PyObject * 2770PyUnicode_FromFormat(const char *format, ...) 2771{ 2772 PyObject* ret; 2773 va_list vargs; 2774 2775#ifdef HAVE_STDARG_PROTOTYPES 2776 va_start(vargs, format); 2777#else 2778 va_start(vargs); 2779#endif 2780 ret = PyUnicode_FromFormatV(format, vargs); 2781 va_end(vargs); 2782 return ret; 2783} 2784 2785#ifdef HAVE_WCHAR_H 2786 2787/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString(): 2788 convert a Unicode object to a wide character string. 2789 2790 - If w is NULL: return the number of wide characters (including the null 2791 character) required to convert the unicode object. Ignore size argument. 2792 2793 - Otherwise: return the number of wide characters (excluding the null 2794 character) written into w. Write at most size wide characters (including 2795 the null character). 
*/ 2796static Py_ssize_t 2797unicode_aswidechar(PyObject *unicode, 2798 wchar_t *w, 2799 Py_ssize_t size) 2800{ 2801 Py_ssize_t res; 2802 const wchar_t *wstr; 2803 2804 wstr = PyUnicode_AsUnicodeAndSize(unicode, &res); 2805 if (wstr == NULL) 2806 return -1; 2807 2808 if (w != NULL) { 2809 if (size > res) 2810 size = res + 1; 2811 else 2812 res = size; 2813 Py_MEMCPY(w, wstr, size * sizeof(wchar_t)); 2814 return res; 2815 } 2816 else 2817 return res + 1; 2818} 2819 2820Py_ssize_t 2821PyUnicode_AsWideChar(PyObject *unicode, 2822 wchar_t *w, 2823 Py_ssize_t size) 2824{ 2825 if (unicode == NULL) { 2826 PyErr_BadInternalCall(); 2827 return -1; 2828 } 2829 return unicode_aswidechar(unicode, w, size); 2830} 2831 2832wchar_t* 2833PyUnicode_AsWideCharString(PyObject *unicode, 2834 Py_ssize_t *size) 2835{ 2836 wchar_t* buffer; 2837 Py_ssize_t buflen; 2838 2839 if (unicode == NULL) { 2840 PyErr_BadInternalCall(); 2841 return NULL; 2842 } 2843 2844 buflen = unicode_aswidechar(unicode, NULL, 0); 2845 if (buflen == -1) 2846 return NULL; 2847 buffer = PyMem_NEW(wchar_t, buflen); 2848 if (buffer == NULL) { 2849 PyErr_NoMemory(); 2850 return NULL; 2851 } 2852 buflen = unicode_aswidechar(unicode, buffer, buflen); 2853 if (buflen == -1) { 2854 PyMem_FREE(buffer); 2855 return NULL; 2856 } 2857 if (size != NULL) 2858 *size = buflen; 2859 return buffer; 2860} 2861 2862#endif /* HAVE_WCHAR_H */ 2863 2864PyObject * 2865PyUnicode_FromOrdinal(int ordinal) 2866{ 2867 if (ordinal < 0 || ordinal > MAX_UNICODE) { 2868 PyErr_SetString(PyExc_ValueError, 2869 "chr() arg not in range(0x110000)"); 2870 return NULL; 2871 } 2872 2873 return unicode_char((Py_UCS4)ordinal); 2874} 2875 2876PyObject * 2877PyUnicode_FromObject(PyObject *obj) 2878{ 2879 /* XXX Perhaps we should make this API an alias of 2880 PyObject_Str() instead ?! 
*/ 2881 if (PyUnicode_CheckExact(obj)) { 2882 if (PyUnicode_READY(obj) == -1) 2883 return NULL; 2884 Py_INCREF(obj); 2885 return obj; 2886 } 2887 if (PyUnicode_Check(obj)) { 2888 /* For a Unicode subtype that's not a Unicode object, 2889 return a true Unicode object with the same data. */ 2890 return _PyUnicode_Copy(obj); 2891 } 2892 PyErr_Format(PyExc_TypeError, 2893 "Can't convert '%.100s' object to str implicitly", 2894 Py_TYPE(obj)->tp_name); 2895 return NULL; 2896} 2897 2898PyObject * 2899PyUnicode_FromEncodedObject(PyObject *obj, 2900 const char *encoding, 2901 const char *errors) 2902{ 2903 Py_buffer buffer; 2904 PyObject *v; 2905 2906 if (obj == NULL) { 2907 PyErr_BadInternalCall(); 2908 return NULL; 2909 } 2910 2911 /* Decoding bytes objects is the most common case and should be fast */ 2912 if (PyBytes_Check(obj)) { 2913 if (PyBytes_GET_SIZE(obj) == 0) 2914 _Py_RETURN_UNICODE_EMPTY(); 2915 v = PyUnicode_Decode( 2916 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj), 2917 encoding, errors); 2918 return v; 2919 } 2920 2921 if (PyUnicode_Check(obj)) { 2922 PyErr_SetString(PyExc_TypeError, 2923 "decoding str is not supported"); 2924 return NULL; 2925 } 2926 2927 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */ 2928 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) { 2929 PyErr_Format(PyExc_TypeError, 2930 "coercing to str: need a bytes-like object, %.80s found", 2931 Py_TYPE(obj)->tp_name); 2932 return NULL; 2933 } 2934 2935 if (buffer.len == 0) { 2936 PyBuffer_Release(&buffer); 2937 _Py_RETURN_UNICODE_EMPTY(); 2938 } 2939 2940 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors); 2941 PyBuffer_Release(&buffer); 2942 return v; 2943} 2944 2945/* Convert encoding to lower case and replace '_' with '-' in order to 2946 catch e.g. UTF_8. Return 0 on error (encoding is longer than lower_len-1), 2947 1 on success. 
/* Normalize an encoding name into `lower`: ASCII upper case letters are
   converted to lower case and '_' is replaced with '-', so that e.g.
   "UTF_8" becomes "utf-8".  A NULL `encoding` is normalized to "utf-8".

   `lower` must have room for `lower_len` bytes, terminating null byte
   included.

   Return 1 on success, 0 if the result does not fit in `lower`
   (including the degenerate case lower_len == 0). */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    const char *e;
    char *l;
    char *l_end;

    if (encoding == NULL) {
        /* 6 == strlen("utf-8") + 1 */
        if (lower_len < 6)
            return 0;
        strcpy(lower, "utf-8");
        return 1;
    }
    /* Without this guard `lower_len - 1` below would wrap around and the
       bound check in the loop would never trigger. */
    if (lower_len == 0)
        return 0;
    e = encoding;
    l = lower;
    /* last writable slot, reserved for the terminating null byte */
    l_end = &lower[lower_len - 1];
    while (*e) {
        if (l == l_end)
            return 0;
        if (Py_ISUPPER(*e)) {
            *l++ = Py_TOLOWER(*e++);
        }
        else if (*e == '_') {
            *l++ = '-';
            e++;
        }
        else {
            *l++ = *e++;
        }
    }
    *l = '\0';
    return 1;
}
PyMemoryView_FromBuffer(&info); 3022 if (buffer == NULL) 3023 goto onError; 3024 unicode = _PyCodec_DecodeText(buffer, encoding, errors); 3025 if (unicode == NULL) 3026 goto onError; 3027 if (!PyUnicode_Check(unicode)) { 3028 PyErr_Format(PyExc_TypeError, 3029 "'%.400s' decoder returned '%.400s' instead of 'str'; " 3030 "use codecs.decode() to decode to arbitrary types", 3031 encoding, 3032 Py_TYPE(unicode)->tp_name, Py_TYPE(unicode)->tp_name); 3033 Py_DECREF(unicode); 3034 goto onError; 3035 } 3036 Py_DECREF(buffer); 3037 return unicode_result(unicode); 3038 3039 onError: 3040 Py_XDECREF(buffer); 3041 return NULL; 3042} 3043 3044PyObject * 3045PyUnicode_AsDecodedObject(PyObject *unicode, 3046 const char *encoding, 3047 const char *errors) 3048{ 3049 PyObject *v; 3050 3051 if (!PyUnicode_Check(unicode)) { 3052 PyErr_BadArgument(); 3053 goto onError; 3054 } 3055 3056 if (encoding == NULL) 3057 encoding = PyUnicode_GetDefaultEncoding(); 3058 3059 /* Decode via the codec registry */ 3060 v = PyCodec_Decode(unicode, encoding, errors); 3061 if (v == NULL) 3062 goto onError; 3063 return unicode_result(v); 3064 3065 onError: 3066 return NULL; 3067} 3068 3069PyObject * 3070PyUnicode_AsDecodedUnicode(PyObject *unicode, 3071 const char *encoding, 3072 const char *errors) 3073{ 3074 PyObject *v; 3075 3076 if (!PyUnicode_Check(unicode)) { 3077 PyErr_BadArgument(); 3078 goto onError; 3079 } 3080 3081 if (encoding == NULL) 3082 encoding = PyUnicode_GetDefaultEncoding(); 3083 3084 /* Decode via the codec registry */ 3085 v = PyCodec_Decode(unicode, encoding, errors); 3086 if (v == NULL) 3087 goto onError; 3088 if (!PyUnicode_Check(v)) { 3089 PyErr_Format(PyExc_TypeError, 3090 "'%.400s' decoder returned '%.400s' instead of 'str'; " 3091 "use codecs.decode() to decode to arbitrary types", 3092 encoding, 3093 Py_TYPE(unicode)->tp_name, Py_TYPE(unicode)->tp_name); 3094 Py_DECREF(v); 3095 goto onError; 3096 } 3097 return unicode_result(v); 3098 3099 onError: 3100 return NULL; 3101} 
/* Locate the first wide character of `wstr` that wcstombs() cannot encode
   in the current locale.

   The string is converted one character at a time (one surrogate pair at a
   time when wchar_t is 16 bits wide) until a conversion fails.

   Return the index (in wchar_t units) of the offending character, or 0 if
   every character could be converted. */
static size_t
wcstombs_errorpos(const wchar_t *wstr)
{
    const wchar_t *cursor;
    char scratch[MB_LEN_MAX];
#if SIZEOF_WCHAR_T == 2
    wchar_t unit[3];
#else
    wchar_t unit[2];
#endif

    for (cursor = wstr; *cursor != L'\0'; ) {
        const wchar_t *unit_start = cursor;

        /* Copy one character into a null-terminated mini string. */
#if SIZEOF_WCHAR_T == 2
        if (Py_UNICODE_IS_HIGH_SURROGATE(cursor[0])
            && Py_UNICODE_IS_LOW_SURROGATE(cursor[1]))
        {
            unit[0] = cursor[0];
            unit[1] = cursor[1];
            unit[2] = 0;
            cursor += 2;
        }
        else {
            unit[0] = cursor[0];
            unit[1] = 0;
            cursor += 1;
        }
#else
        unit[0] = cursor[0];
        unit[1] = 0;
        cursor += 1;
#endif
        if (wcstombs(scratch, unit, sizeof(scratch)) == (size_t)-1)
            return unit_start - wstr;
    }

    /* failed to find the unencodable character */
    return 0;
}
/* Encode the str object `unicode` to a bytes object using the current
   locale encoding.

   `errors` is validated by locale_error_handler(), which accepts only the
   "strict" and "surrogateescape" error handlers.  A string containing an
   embedded null character is rejected with ValueError.

   On an encoding failure a UnicodeEncodeError is built for the "locale"
   codec, covering the single character at the failing position.

   Return a new bytes object, or NULL with an exception set. */
PyObject *
PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
{
    Py_ssize_t wlen, wlen2;
    wchar_t *wstr;
    PyObject *bytes = NULL;
    char *errmsg;
    PyObject *reason = NULL;
    PyObject *exc;
    size_t error_pos;
    int surrogateescape;

    if (locale_error_handler(errors, &surrogateescape) < 0)
        return NULL;

    wstr = PyUnicode_AsWideCharString(unicode, &wlen);
    if (wstr == NULL)
        return NULL;

    /* wcslen() stops at the first null character: a mismatch with the
       reported length means the string contains an embedded null. */
    wlen2 = wcslen(wstr);
    if (wlen2 != wlen) {
        PyMem_Free(wstr);
        PyErr_SetString(PyExc_ValueError, "embedded null character");
        return NULL;
    }

    if (surrogateescape) {
        /* "surrogateescape" error handler */
        char *str;

        str = Py_EncodeLocale(wstr, &error_pos);
        if (str == NULL) {
            /* error_pos == (size_t)-1 is handled here as a memory error;
               any other value is taken as the failing position */
            if (error_pos == (size_t)-1) {
                PyErr_NoMemory();
                PyMem_Free(wstr);
                return NULL;
            }
            else {
                goto encode_error;
            }
        }
        PyMem_Free(wstr);

        bytes = PyBytes_FromString(str);
        PyMem_Free(str);
    }
    else {
        /* strict mode */
        size_t len, len2;

        /* first call: compute the size of the encoded string */
        len = wcstombs(NULL, wstr, 0);
        if (len == (size_t)-1) {
            /* position unknown: computed at encode_error */
            error_pos = (size_t)-1;
            goto encode_error;
        }

        bytes = PyBytes_FromStringAndSize(NULL, len);
        if (bytes == NULL) {
            PyMem_Free(wstr);
            return NULL;
        }

        /* second call: encode directly into the bytes buffer; len+1
           leaves room for the terminating null byte */
        len2 = wcstombs(PyBytes_AS_STRING(bytes), wstr, len+1);
        if (len2 == (size_t)-1 || len2 > len) {
            error_pos = (size_t)-1;
            goto encode_error;
        }
        PyMem_Free(wstr);
    }
    return bytes;

encode_error:
    /* Read errno first, before any other call can overwrite it. */
    errmsg = strerror(errno);
    assert(errmsg != NULL);

    if (error_pos == (size_t)-1)
        error_pos = wcstombs_errorpos(wstr);

    PyMem_Free(wstr);
    Py_XDECREF(bytes);

    /* Build the exception reason from the C library message; fall back to
       a generic message if that message cannot be decoded. */
    if (errmsg != NULL) {
        size_t errlen;
        wstr = Py_DecodeLocale(errmsg, &errlen);
        if (wstr != NULL) {
            reason = PyUnicode_FromWideChar(wstr, errlen);
            PyMem_RawFree(wstr);
        } else
            errmsg = NULL;
    }
    if (errmsg == NULL)
        reason = PyUnicode_FromString(
            "wcstombs() encountered an unencodable "
            "wide character");
    if (reason == NULL)
        return NULL;

    exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnnO",
                                "locale", unicode,
                                (Py_ssize_t)error_pos,
                                (Py_ssize_t)(error_pos+1),
                                reason);
    Py_DECREF(reason);
    if (exc != NULL) {
        /* NOTE(review): presumably the "strict" handler raises `exc` as
           the current exception -- confirm against PyCodec_StrictErrors */
        PyCodec_StrictErrors(exc);
        Py_XDECREF(exc);
    }
    return NULL;
}
*/ 3340 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) { 3341 return PyUnicode_AsEncodedString(unicode, 3342 Py_FileSystemDefaultEncoding, 3343 "surrogateescape"); 3344 } 3345 else { 3346 return PyUnicode_EncodeLocale(unicode, "surrogateescape"); 3347 } 3348#endif 3349} 3350 3351PyObject * 3352PyUnicode_AsEncodedString(PyObject *unicode, 3353 const char *encoding, 3354 const char *errors) 3355{ 3356 PyObject *v; 3357 char lower[11]; /* Enough for any encoding shortcut */ 3358 3359 if (!PyUnicode_Check(unicode)) { 3360 PyErr_BadArgument(); 3361 return NULL; 3362 } 3363 3364 /* Shortcuts for common default encodings */ 3365 if (_Py_normalize_encoding(encoding, lower, sizeof(lower))) { 3366 if ((strcmp(lower, "utf-8") == 0) || 3367 (strcmp(lower, "utf8") == 0)) 3368 { 3369 if (errors == NULL || strcmp(errors, "strict") == 0) 3370 return _PyUnicode_AsUTF8String(unicode, NULL); 3371 else 3372 return _PyUnicode_AsUTF8String(unicode, errors); 3373 } 3374 else if ((strcmp(lower, "latin-1") == 0) || 3375 (strcmp(lower, "latin1") == 0) || 3376 (strcmp(lower, "iso-8859-1") == 0) || 3377 (strcmp(lower, "iso8859-1") == 0)) 3378 return _PyUnicode_AsLatin1String(unicode, errors); 3379#ifdef HAVE_MBCS 3380 else if (strcmp(lower, "mbcs") == 0) 3381 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors); 3382#endif 3383 else if (strcmp(lower, "ascii") == 0) 3384 return _PyUnicode_AsASCIIString(unicode, errors); 3385 } 3386 3387 /* Encode via the codec registry */ 3388 v = _PyCodec_EncodeText(unicode, encoding, errors); 3389 if (v == NULL) 3390 return NULL; 3391 3392 /* The normal path */ 3393 if (PyBytes_Check(v)) 3394 return v; 3395 3396 /* If the codec returns a buffer, raise a warning and convert to bytes */ 3397 if (PyByteArray_Check(v)) { 3398 int error; 3399 PyObject *b; 3400 3401 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1, 3402 "encoder %s returned bytearray instead of bytes; " 3403 "use codecs.encode() to encode to arbitrary types", 3404 encoding); 
3405 if (error) { 3406 Py_DECREF(v); 3407 return NULL; 3408 } 3409 3410 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v)); 3411 Py_DECREF(v); 3412 return b; 3413 } 3414 3415 PyErr_Format(PyExc_TypeError, 3416 "'%.400s' encoder returned '%.400s' instead of 'bytes'; " 3417 "use codecs.encode() to encode to arbitrary types", 3418 encoding, 3419 Py_TYPE(v)->tp_name, Py_TYPE(v)->tp_name); 3420 Py_DECREF(v); 3421 return NULL; 3422} 3423 3424PyObject * 3425PyUnicode_AsEncodedUnicode(PyObject *unicode, 3426 const char *encoding, 3427 const char *errors) 3428{ 3429 PyObject *v; 3430 3431 if (!PyUnicode_Check(unicode)) { 3432 PyErr_BadArgument(); 3433 goto onError; 3434 } 3435 3436 if (encoding == NULL) 3437 encoding = PyUnicode_GetDefaultEncoding(); 3438 3439 /* Encode via the codec registry */ 3440 v = PyCodec_Encode(unicode, encoding, errors); 3441 if (v == NULL) 3442 goto onError; 3443 if (!PyUnicode_Check(v)) { 3444 PyErr_Format(PyExc_TypeError, 3445 "'%.400s' encoder returned '%.400s' instead of 'str'; " 3446 "use codecs.encode() to encode to arbitrary types", 3447 encoding, 3448 Py_TYPE(v)->tp_name, Py_TYPE(v)->tp_name); 3449 Py_DECREF(v); 3450 goto onError; 3451 } 3452 return v; 3453 3454 onError: 3455 return NULL; 3456} 3457 3458static size_t 3459mbstowcs_errorpos(const char *str, size_t len) 3460{ 3461#ifdef HAVE_MBRTOWC 3462 const char *start = str; 3463 mbstate_t mbs; 3464 size_t converted; 3465 wchar_t ch; 3466 3467 memset(&mbs, 0, sizeof mbs); 3468 while (len) 3469 { 3470 converted = mbrtowc(&ch, str, len, &mbs); 3471 if (converted == 0) 3472 /* Reached end of string */ 3473 break; 3474 if (converted == (size_t)-1 || converted == (size_t)-2) { 3475 /* Conversion error or incomplete character */ 3476 return str - start; 3477 } 3478 else { 3479 str += converted; 3480 len -= converted; 3481 } 3482 } 3483 /* failed to find the undecodable byte sequence */ 3484 return 0; 3485#endif 3486 return 0; 3487} 3488 3489PyObject* 
3490PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len, 3491 const char *errors) 3492{ 3493 wchar_t smallbuf[256]; 3494 size_t smallbuf_len = Py_ARRAY_LENGTH(smallbuf); 3495 wchar_t *wstr; 3496 size_t wlen, wlen2; 3497 PyObject *unicode; 3498 int surrogateescape; 3499 size_t error_pos; 3500 char *errmsg; 3501 PyObject *reason = NULL; /* initialize to prevent gcc warning */ 3502 PyObject *exc; 3503 3504 if (locale_error_handler(errors, &surrogateescape) < 0) 3505 return NULL; 3506 3507 if (str[len] != '\0' || (size_t)len != strlen(str)) { 3508 PyErr_SetString(PyExc_ValueError, "embedded null byte"); 3509 return NULL; 3510 } 3511 3512 if (surrogateescape) { 3513 /* "surrogateescape" error handler */ 3514 wstr = Py_DecodeLocale(str, &wlen); 3515 if (wstr == NULL) { 3516 if (wlen == (size_t)-1) 3517 PyErr_NoMemory(); 3518 else 3519 PyErr_SetFromErrno(PyExc_OSError); 3520 return NULL; 3521 } 3522 3523 unicode = PyUnicode_FromWideChar(wstr, wlen); 3524 PyMem_RawFree(wstr); 3525 } 3526 else { 3527 /* strict mode */ 3528#ifndef HAVE_BROKEN_MBSTOWCS 3529 wlen = mbstowcs(NULL, str, 0); 3530#else 3531 wlen = len; 3532#endif 3533 if (wlen == (size_t)-1) 3534 goto decode_error; 3535 if (wlen+1 <= smallbuf_len) { 3536 wstr = smallbuf; 3537 } 3538 else { 3539 wstr = PyMem_New(wchar_t, wlen+1); 3540 if (!wstr) 3541 return PyErr_NoMemory(); 3542 } 3543 3544 wlen2 = mbstowcs(wstr, str, wlen+1); 3545 if (wlen2 == (size_t)-1) { 3546 if (wstr != smallbuf) 3547 PyMem_Free(wstr); 3548 goto decode_error; 3549 } 3550#ifdef HAVE_BROKEN_MBSTOWCS 3551 assert(wlen2 == wlen); 3552#endif 3553 unicode = PyUnicode_FromWideChar(wstr, wlen2); 3554 if (wstr != smallbuf) 3555 PyMem_Free(wstr); 3556 } 3557 return unicode; 3558 3559decode_error: 3560 reason = NULL; 3561 errmsg = strerror(errno); 3562 assert(errmsg != NULL); 3563 3564 error_pos = mbstowcs_errorpos(str, len); 3565 if (errmsg != NULL) { 3566 size_t errlen; 3567 wstr = Py_DecodeLocale(errmsg, &errlen); 3568 if (wstr != NULL) { 
3569 reason = PyUnicode_FromWideChar(wstr, errlen); 3570 PyMem_RawFree(wstr); 3571 } 3572 } 3573 if (reason == NULL) 3574 reason = PyUnicode_FromString( 3575 "mbstowcs() encountered an invalid multibyte sequence"); 3576 if (reason == NULL) 3577 return NULL; 3578 3579 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nnO", 3580 "locale", str, len, 3581 (Py_ssize_t)error_pos, 3582 (Py_ssize_t)(error_pos+1), 3583 reason); 3584 Py_DECREF(reason); 3585 if (exc != NULL) { 3586 PyCodec_StrictErrors(exc); 3587 Py_XDECREF(exc); 3588 } 3589 return NULL; 3590} 3591 3592PyObject* 3593PyUnicode_DecodeLocale(const char *str, const char *errors) 3594{ 3595 Py_ssize_t size = (Py_ssize_t)strlen(str); 3596 return PyUnicode_DecodeLocaleAndSize(str, size, errors); 3597} 3598 3599 3600PyObject* 3601PyUnicode_DecodeFSDefault(const char *s) { 3602 Py_ssize_t size = (Py_ssize_t)strlen(s); 3603 return PyUnicode_DecodeFSDefaultAndSize(s, size); 3604} 3605 3606PyObject* 3607PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size) 3608{ 3609#ifdef HAVE_MBCS 3610 return PyUnicode_DecodeMBCS(s, size, NULL); 3611#elif defined(__APPLE__) 3612 return PyUnicode_DecodeUTF8Stateful(s, size, "surrogateescape", NULL); 3613#else 3614 PyInterpreterState *interp = PyThreadState_GET()->interp; 3615 /* Bootstrap check: if the filesystem codec is implemented in Python, we 3616 cannot use it to encode and decode filenames before it is loaded. Load 3617 the Python codec requires to encode at least its own filename. Use the C 3618 version of the locale codec until the codec registry is initialized and 3619 the Python codec is loaded. 3620 3621 Py_FileSystemDefaultEncoding is shared between all interpreters, we 3622 cannot only rely on it: check also interp->fscodec_initialized for 3623 subinterpreters. 
*/ 3624 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) { 3625 return PyUnicode_Decode(s, size, 3626 Py_FileSystemDefaultEncoding, 3627 "surrogateescape"); 3628 } 3629 else { 3630 return PyUnicode_DecodeLocaleAndSize(s, size, "surrogateescape"); 3631 } 3632#endif 3633} 3634 3635 3636int 3637PyUnicode_FSConverter(PyObject* arg, void* addr) 3638{ 3639 PyObject *output = NULL; 3640 Py_ssize_t size; 3641 void *data; 3642 if (arg == NULL) { 3643 Py_DECREF(*(PyObject**)addr); 3644 return 1; 3645 } 3646 if (PyBytes_Check(arg)) { 3647 output = arg; 3648 Py_INCREF(output); 3649 } 3650 else { 3651 arg = PyUnicode_FromObject(arg); 3652 if (!arg) 3653 return 0; 3654 output = PyUnicode_EncodeFSDefault(arg); 3655 Py_DECREF(arg); 3656 if (!output) 3657 return 0; 3658 if (!PyBytes_Check(output)) { 3659 Py_DECREF(output); 3660 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes"); 3661 return 0; 3662 } 3663 } 3664 size = PyBytes_GET_SIZE(output); 3665 data = PyBytes_AS_STRING(output); 3666 if ((size_t)size != strlen(data)) { 3667 PyErr_SetString(PyExc_ValueError, "embedded null byte"); 3668 Py_DECREF(output); 3669 return 0; 3670 } 3671 *(PyObject**)addr = output; 3672 return Py_CLEANUP_SUPPORTED; 3673} 3674 3675 3676int 3677PyUnicode_FSDecoder(PyObject* arg, void* addr) 3678{ 3679 PyObject *output = NULL; 3680 if (arg == NULL) { 3681 Py_DECREF(*(PyObject**)addr); 3682 return 1; 3683 } 3684 if (PyUnicode_Check(arg)) { 3685 if (PyUnicode_READY(arg) == -1) 3686 return 0; 3687 output = arg; 3688 Py_INCREF(output); 3689 } 3690 else { 3691 arg = PyBytes_FromObject(arg); 3692 if (!arg) 3693 return 0; 3694 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg), 3695 PyBytes_GET_SIZE(arg)); 3696 Py_DECREF(arg); 3697 if (!output) 3698 return 0; 3699 if (!PyUnicode_Check(output)) { 3700 Py_DECREF(output); 3701 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode"); 3702 return 0; 3703 } 3704 } 3705 if (PyUnicode_READY(output) == -1) { 3706 
Py_DECREF(output); 3707 return 0; 3708 } 3709 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output), 3710 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) { 3711 PyErr_SetString(PyExc_ValueError, "embedded null character"); 3712 Py_DECREF(output); 3713 return 0; 3714 } 3715 *(PyObject**)addr = output; 3716 return Py_CLEANUP_SUPPORTED; 3717} 3718 3719 3720char* 3721PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize) 3722{ 3723 PyObject *bytes; 3724 3725 if (!PyUnicode_Check(unicode)) { 3726 PyErr_BadArgument(); 3727 return NULL; 3728 } 3729 if (PyUnicode_READY(unicode) == -1) 3730 return NULL; 3731 3732 if (PyUnicode_UTF8(unicode) == NULL) { 3733 assert(!PyUnicode_IS_COMPACT_ASCII(unicode)); 3734 bytes = _PyUnicode_AsUTF8String(unicode, "strict"); 3735 if (bytes == NULL) 3736 return NULL; 3737 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1); 3738 if (_PyUnicode_UTF8(unicode) == NULL) { 3739 PyErr_NoMemory(); 3740 Py_DECREF(bytes); 3741 return NULL; 3742 } 3743 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes); 3744 Py_MEMCPY(_PyUnicode_UTF8(unicode), 3745 PyBytes_AS_STRING(bytes), 3746 _PyUnicode_UTF8_LENGTH(unicode) + 1); 3747 Py_DECREF(bytes); 3748 } 3749 3750 if (psize) 3751 *psize = PyUnicode_UTF8_LENGTH(unicode); 3752 return PyUnicode_UTF8(unicode); 3753} 3754 3755char* 3756PyUnicode_AsUTF8(PyObject *unicode) 3757{ 3758 return PyUnicode_AsUTF8AndSize(unicode, NULL); 3759} 3760 3761Py_UNICODE * 3762PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size) 3763{ 3764 const unsigned char *one_byte; 3765#if SIZEOF_WCHAR_T == 4 3766 const Py_UCS2 *two_bytes; 3767#else 3768 const Py_UCS4 *four_bytes; 3769 const Py_UCS4 *ucs4_end; 3770 Py_ssize_t num_surrogates; 3771#endif 3772 wchar_t *w; 3773 wchar_t *wchar_end; 3774 3775 if (!PyUnicode_Check(unicode)) { 3776 PyErr_BadArgument(); 3777 return NULL; 3778 } 3779 if (_PyUnicode_WSTR(unicode) == NULL) { 3780 /* Non-ASCII compact unicode object */ 3781 
assert(_PyUnicode_KIND(unicode) != 0); 3782 assert(PyUnicode_IS_READY(unicode)); 3783 3784 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) { 3785#if SIZEOF_WCHAR_T == 2 3786 four_bytes = PyUnicode_4BYTE_DATA(unicode); 3787 ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode); 3788 num_surrogates = 0; 3789 3790 for (; four_bytes < ucs4_end; ++four_bytes) { 3791 if (*four_bytes > 0xFFFF) 3792 ++num_surrogates; 3793 } 3794 3795 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC( 3796 sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates)); 3797 if (!_PyUnicode_WSTR(unicode)) { 3798 PyErr_NoMemory(); 3799 return NULL; 3800 } 3801 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates; 3802 3803 w = _PyUnicode_WSTR(unicode); 3804 wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode); 3805 four_bytes = PyUnicode_4BYTE_DATA(unicode); 3806 for (; four_bytes < ucs4_end; ++four_bytes, ++w) { 3807 if (*four_bytes > 0xFFFF) { 3808 assert(*four_bytes <= MAX_UNICODE); 3809 /* encode surrogate pair in this case */ 3810 *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes); 3811 *w = Py_UNICODE_LOW_SURROGATE(*four_bytes); 3812 } 3813 else 3814 *w = *four_bytes; 3815 3816 if (w > wchar_end) { 3817 assert(0 && "Miscalculated string end"); 3818 } 3819 } 3820 *w = 0; 3821#else 3822 /* sizeof(wchar_t) == 4 */ 3823 Py_FatalError("Impossible unicode object state, wstr and str " 3824 "should share memory already."); 3825 return NULL; 3826#endif 3827 } 3828 else { 3829 if ((size_t)_PyUnicode_LENGTH(unicode) > 3830 PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) { 3831 PyErr_NoMemory(); 3832 return NULL; 3833 } 3834 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) * 3835 (_PyUnicode_LENGTH(unicode) + 1)); 3836 if (!_PyUnicode_WSTR(unicode)) { 3837 PyErr_NoMemory(); 3838 return NULL; 3839 } 3840 if (!PyUnicode_IS_COMPACT_ASCII(unicode)) 3841 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode); 3842 w = _PyUnicode_WSTR(unicode); 3843 wchar_end 
= w + _PyUnicode_LENGTH(unicode); 3844 3845 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) { 3846 one_byte = PyUnicode_1BYTE_DATA(unicode); 3847 for (; w < wchar_end; ++one_byte, ++w) 3848 *w = *one_byte; 3849 /* null-terminate the wstr */ 3850 *w = 0; 3851 } 3852 else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) { 3853#if SIZEOF_WCHAR_T == 4 3854 two_bytes = PyUnicode_2BYTE_DATA(unicode); 3855 for (; w < wchar_end; ++two_bytes, ++w) 3856 *w = *two_bytes; 3857 /* null-terminate the wstr */ 3858 *w = 0; 3859#else 3860 /* sizeof(wchar_t) == 2 */ 3861 PyObject_FREE(_PyUnicode_WSTR(unicode)); 3862 _PyUnicode_WSTR(unicode) = NULL; 3863 Py_FatalError("Impossible unicode object state, wstr " 3864 "and str should share memory already."); 3865 return NULL; 3866#endif 3867 } 3868 else { 3869 assert(0 && "This should never happen."); 3870 } 3871 } 3872 } 3873 if (size != NULL) 3874 *size = PyUnicode_WSTR_LENGTH(unicode); 3875 return _PyUnicode_WSTR(unicode); 3876} 3877 3878Py_UNICODE * 3879PyUnicode_AsUnicode(PyObject *unicode) 3880{ 3881 return PyUnicode_AsUnicodeAndSize(unicode, NULL); 3882} 3883 3884 3885Py_ssize_t 3886PyUnicode_GetSize(PyObject *unicode) 3887{ 3888 if (!PyUnicode_Check(unicode)) { 3889 PyErr_BadArgument(); 3890 goto onError; 3891 } 3892 return PyUnicode_GET_SIZE(unicode); 3893 3894 onError: 3895 return -1; 3896} 3897 3898Py_ssize_t 3899PyUnicode_GetLength(PyObject *unicode) 3900{ 3901 if (!PyUnicode_Check(unicode)) { 3902 PyErr_BadArgument(); 3903 return -1; 3904 } 3905 if (PyUnicode_READY(unicode) == -1) 3906 return -1; 3907 return PyUnicode_GET_LENGTH(unicode); 3908} 3909 3910Py_UCS4 3911PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index) 3912{ 3913 void *data; 3914 int kind; 3915 3916 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) { 3917 PyErr_BadArgument(); 3918 return (Py_UCS4)-1; 3919 } 3920 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) { 3921 PyErr_SetString(PyExc_IndexError, "string index out of 
range"); 3922 return (Py_UCS4)-1; 3923 } 3924 data = PyUnicode_DATA(unicode); 3925 kind = PyUnicode_KIND(unicode); 3926 return PyUnicode_READ(kind, data, index); 3927} 3928 3929int 3930PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch) 3931{ 3932 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) { 3933 PyErr_BadArgument(); 3934 return -1; 3935 } 3936 assert(PyUnicode_IS_READY(unicode)); 3937 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) { 3938 PyErr_SetString(PyExc_IndexError, "string index out of range"); 3939 return -1; 3940 } 3941 if (unicode_check_modifiable(unicode)) 3942 return -1; 3943 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) { 3944 PyErr_SetString(PyExc_ValueError, "character out of range"); 3945 return -1; 3946 } 3947 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode), 3948 index, ch); 3949 return 0; 3950} 3951 3952const char * 3953PyUnicode_GetDefaultEncoding(void) 3954{ 3955 return "utf-8"; 3956} 3957 3958/* create or adjust a UnicodeDecodeError */ 3959static void 3960make_decode_exception(PyObject **exceptionObject, 3961 const char *encoding, 3962 const char *input, Py_ssize_t length, 3963 Py_ssize_t startpos, Py_ssize_t endpos, 3964 const char *reason) 3965{ 3966 if (*exceptionObject == NULL) { 3967 *exceptionObject = PyUnicodeDecodeError_Create( 3968 encoding, input, length, startpos, endpos, reason); 3969 } 3970 else { 3971 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos)) 3972 goto onError; 3973 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos)) 3974 goto onError; 3975 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason)) 3976 goto onError; 3977 } 3978 return; 3979 3980onError: 3981 Py_CLEAR(*exceptionObject); 3982} 3983 3984#ifdef HAVE_MBCS 3985/* error handling callback helper: 3986 build arguments, call the callback and check the arguments, 3987 if no exception occurred, copy the replacement to the output 3988 and adjust various state variables. 
3989 return 0 on success, -1 on error 3990*/ 3991 3992static int 3993unicode_decode_call_errorhandler_wchar( 3994 const char *errors, PyObject **errorHandler, 3995 const char *encoding, const char *reason, 3996 const char **input, const char **inend, Py_ssize_t *startinpos, 3997 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr, 3998 PyObject **output, Py_ssize_t *outpos) 3999{ 4000 static char *argparse = "O!n;decoding error handler must return (str, int) tuple"; 4001 4002 PyObject *restuple = NULL; 4003 PyObject *repunicode = NULL; 4004 Py_ssize_t outsize; 4005 Py_ssize_t insize; 4006 Py_ssize_t requiredsize; 4007 Py_ssize_t newpos; 4008 PyObject *inputobj = NULL; 4009 wchar_t *repwstr; 4010 Py_ssize_t repwlen; 4011 4012 assert (_PyUnicode_KIND(*output) == PyUnicode_WCHAR_KIND); 4013 outsize = _PyUnicode_WSTR_LENGTH(*output); 4014 4015 if (*errorHandler == NULL) { 4016 *errorHandler = PyCodec_LookupError(errors); 4017 if (*errorHandler == NULL) 4018 goto onError; 4019 } 4020 4021 make_decode_exception(exceptionObject, 4022 encoding, 4023 *input, *inend - *input, 4024 *startinpos, *endinpos, 4025 reason); 4026 if (*exceptionObject == NULL) 4027 goto onError; 4028 4029 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL); 4030 if (restuple == NULL) 4031 goto onError; 4032 if (!PyTuple_Check(restuple)) { 4033 PyErr_SetString(PyExc_TypeError, &argparse[4]); 4034 goto onError; 4035 } 4036 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos)) 4037 goto onError; 4038 4039 /* Copy back the bytes variables, which might have been modified by the 4040 callback */ 4041 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject); 4042 if (!inputobj) 4043 goto onError; 4044 if (!PyBytes_Check(inputobj)) { 4045 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes"); 4046 } 4047 *input = PyBytes_AS_STRING(inputobj); 4048 insize = PyBytes_GET_SIZE(inputobj); 4049 *inend = *input + insize; 
4050 /* we can DECREF safely, as the exception has another reference, 4051 so the object won't go away. */ 4052 Py_DECREF(inputobj); 4053 4054 if (newpos<0) 4055 newpos = insize+newpos; 4056 if (newpos<0 || newpos>insize) { 4057 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos); 4058 goto onError; 4059 } 4060 4061 repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen); 4062 if (repwstr == NULL) 4063 goto onError; 4064 /* need more space? (at least enough for what we 4065 have+the replacement+the rest of the string (starting 4066 at the new input position), so we won't have to check space 4067 when there are no errors in the rest of the string) */ 4068 requiredsize = *outpos; 4069 if (requiredsize > PY_SSIZE_T_MAX - repwlen) 4070 goto overflow; 4071 requiredsize += repwlen; 4072 if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos)) 4073 goto overflow; 4074 requiredsize += insize - newpos; 4075 if (requiredsize > outsize) { 4076 if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize) 4077 requiredsize = 2*outsize; 4078 if (unicode_resize(output, requiredsize) < 0) 4079 goto onError; 4080 } 4081 wcsncpy(_PyUnicode_WSTR(*output) + *outpos, repwstr, repwlen); 4082 *outpos += repwlen; 4083 *endinpos = newpos; 4084 *inptr = *input + newpos; 4085 4086 /* we made it! 
*/ 4087 Py_XDECREF(restuple); 4088 return 0; 4089 4090 overflow: 4091 PyErr_SetString(PyExc_OverflowError, 4092 "decoded result is too long for a Python string"); 4093 4094 onError: 4095 Py_XDECREF(restuple); 4096 return -1; 4097} 4098#endif /* HAVE_MBCS */ 4099 4100static int 4101unicode_decode_call_errorhandler_writer( 4102 const char *errors, PyObject **errorHandler, 4103 const char *encoding, const char *reason, 4104 const char **input, const char **inend, Py_ssize_t *startinpos, 4105 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr, 4106 _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */) 4107{ 4108 static char *argparse = "O!n;decoding error handler must return (str, int) tuple"; 4109 4110 PyObject *restuple = NULL; 4111 PyObject *repunicode = NULL; 4112 Py_ssize_t insize; 4113 Py_ssize_t newpos; 4114 Py_ssize_t replen; 4115 PyObject *inputobj = NULL; 4116 4117 if (*errorHandler == NULL) { 4118 *errorHandler = PyCodec_LookupError(errors); 4119 if (*errorHandler == NULL) 4120 goto onError; 4121 } 4122 4123 make_decode_exception(exceptionObject, 4124 encoding, 4125 *input, *inend - *input, 4126 *startinpos, *endinpos, 4127 reason); 4128 if (*exceptionObject == NULL) 4129 goto onError; 4130 4131 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL); 4132 if (restuple == NULL) 4133 goto onError; 4134 if (!PyTuple_Check(restuple)) { 4135 PyErr_SetString(PyExc_TypeError, &argparse[4]); 4136 goto onError; 4137 } 4138 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos)) 4139 goto onError; 4140 4141 /* Copy back the bytes variables, which might have been modified by the 4142 callback */ 4143 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject); 4144 if (!inputobj) 4145 goto onError; 4146 if (!PyBytes_Check(inputobj)) { 4147 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes"); 4148 } 4149 *input = PyBytes_AS_STRING(inputobj); 4150 insize = 
PyBytes_GET_SIZE(inputobj); 4151 *inend = *input + insize; 4152 /* we can DECREF safely, as the exception has another reference, 4153 so the object won't go away. */ 4154 Py_DECREF(inputobj); 4155 4156 if (newpos<0) 4157 newpos = insize+newpos; 4158 if (newpos<0 || newpos>insize) { 4159 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos); 4160 goto onError; 4161 } 4162 4163 if (PyUnicode_READY(repunicode) < 0) 4164 goto onError; 4165 replen = PyUnicode_GET_LENGTH(repunicode); 4166 if (replen > 1) { 4167 writer->min_length += replen - 1; 4168 writer->overallocate = 1; 4169 if (_PyUnicodeWriter_Prepare(writer, writer->min_length, 4170 PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1) 4171 goto onError; 4172 } 4173 if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1) 4174 goto onError; 4175 4176 *endinpos = newpos; 4177 *inptr = *input + newpos; 4178 4179 /* we made it! */ 4180 Py_XDECREF(restuple); 4181 return 0; 4182 4183 onError: 4184 Py_XDECREF(restuple); 4185 return -1; 4186} 4187 4188/* --- UTF-7 Codec -------------------------------------------------------- */ 4189 4190/* See RFC2152 for details. We encode conservatively and decode liberally. */ 4191 4192/* Three simple macros defining base-64. */ 4193 4194/* Is c a base-64 character? */ 4195 4196#define IS_BASE64(c) \ 4197 (((c) >= 'A' && (c) <= 'Z') || \ 4198 ((c) >= 'a' && (c) <= 'z') || \ 4199 ((c) >= '0' && (c) <= '9') || \ 4200 (c) == '+' || (c) == '/') 4201 4202/* given that c is a base-64 character, what is its base-64 value? */ 4203 4204#define FROM_BASE64(c) \ 4205 (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' : \ 4206 ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 : \ 4207 ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 : \ 4208 (c) == '+' ? 62 : 63) 4209 4210/* What is the base-64 character of the bottom 6 bits of n? 
*/

/* Only the low six bits of n are used (n & 0x3f) to index the alphabet. */
#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string. */

#define DECODE_DIRECT(c)                                \
    ((c) <= 127 && (c) != '+')

/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is indexed by the ASCII code (0..127); each row below covers
 * 16 consecutive codes, labelled by the comment above it.
 */

static
char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.
The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.  */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      (directWS && (utf7_category[(c)] == 2)) ||        \
      (directO && (utf7_category[(c)] == 1))))

/* Stateless UTF-7 decode: PyUnicode_DecodeUTF7Stateful() with
   consumed == NULL. */
PyObject *
PyUnicode_DecodeUTF7(const char *s,
                     Py_ssize_t size,
                     const char *errors)
{
    return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
}

/* The decoder.  The only state we preserve is our read position,
 * i.e. how many characters we have consumed.  So if we end in the
 * middle of a shift sequence we have to back off the read position
 * and the output to the beginning of the sequence, otherwise we lose
 * all the shift state (seen bits, number of bits seen, high
 * surrogate). */

/* Decode `size` bytes of UTF-7 starting at `s`.
 *
 * errors:   error handler name, forwarded to
 *           unicode_decode_call_errorhandler_writer().
 * consumed: if not NULL, receives the number of input bytes consumed; an
 *           input ending inside a shift sequence is then not an error and
 *           both input position and output are backed off to the start of
 *           that sequence.
 *
 * Returns a new str, or NULL with an exception set. */
PyObject *
PyUnicode_DecodeUTF7Stateful(const char *s,
                             Py_ssize_t size,
                             const char *errors,
                             Py_ssize_t *consumed)
{
    const char *starts = s;
    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    const char *e;
    _PyUnicodeWriter writer;
    const char *errmsg = "";
    int inShift = 0;                 /* non-zero while inside a '+...' section */
    Py_ssize_t shiftOutStart;        /* writer.pos when the section began */
    unsigned int base64bits = 0;     /* number of pending bits in base64buffer */
    unsigned long base64buffer = 0;
    Py_UCS4 surrogate = 0;           /* pending high surrogate, or 0 */
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;

    if (size == 0) {
        if (consumed)
            *consumed = 0;
        _Py_RETURN_UNICODE_EMPTY();
    }

    /* Start off assuming it's all ASCII. Widen later as necessary. */
    _PyUnicodeWriter_Init(&writer);
    writer.min_length = size;

    shiftOutStart = 0;
    e = s + size;

    while (s < e) {
        Py_UCS4 ch;
      restart:
        ch = (unsigned char) *s;

        if (inShift) { /* in a base-64 section */
            if (IS_BASE64(ch)) { /* consume a base-64 character */
                base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
                base64bits += 6;
                s++;
                if (base64bits >= 16) {
                    /* we have enough bits for a UTF-16 value */
                    Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
                    base64bits -= 16;
                    base64buffer &= (1 << base64bits) - 1; /* clear high bits */
                    assert(outCh <= 0xffff);
                    if (surrogate) {
                        /* expecting a second surrogate */
                        if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
                            Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
                            if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
                                goto onError;
                            surrogate = 0;
                            continue;
                        }
                        else {
                            /* unpaired high surrogate: emit it as-is */
                            if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
                                goto onError;
                            surrogate = 0;
                        }
                    }
                    if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
                        /* first surrogate */
                        surrogate = outCh;
                    }
                    else {
                        if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
                            goto onError;
                    }
                }
            }
            else { /* now leaving a base-64 section */
                inShift = 0;
                s++;
                if (surrogate) {
                    if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
                        goto onError;
                    surrogate = 0;
                }
                if (base64bits > 0) { /* left-over bits */
                    if (base64bits >= 6) {
                        /* We've seen at least one base-64 character */
                        errmsg = "partial character in shift sequence";
                        goto utf7Error;
                    }
                    else {
                        /* Some bits remain; they should be zero */
                        if (base64buffer != 0) {
                            errmsg = "non-zero padding bits in shift sequence";
                            goto utf7Error;
                        }
                    }
                }
                if (ch != '-') {
                    /* '-' is absorbed; other terminating
                       characters are preserved */
                    if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
                        goto onError;
                }
            }
        }
        else if ( ch == '+' ) {
            startinpos = s-starts;
            s++; /* consume '+' */
            if (s < e && *s == '-') { /* '+-' encodes '+' */
                s++;
                if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
                    goto onError;
            }
            else { /* begin base64-encoded section */
                inShift = 1;
                shiftOutStart = writer.pos;
                base64bits = 0;
                base64buffer = 0;
            }
        }
        else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
            s++;
            if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
                goto onError;
        }
        else {
            startinpos = s-starts;
            s++;
            errmsg = "unexpected special character";
            goto utf7Error;
        }
        continue;
utf7Error:
        /* The handler may replace the input buffer and reposition s. */
        endinpos = s-starts;
        if (unicode_decode_call_errorhandler_writer(
                errors, &errorHandler,
                "utf7", errmsg,
                &starts, &e, &startinpos, &endinpos, &exc, &s,
                &writer))
            goto onError;
    }

    /* end of string */

    if (inShift && !consumed) { /* in shift sequence, no more to follow */
        /* if we're in an inconsistent state, that's an error */
        if (surrogate ||
                (base64bits >= 6) ||
                (base64bits > 0 && base64buffer != 0)) {
            endinpos = size;
            if (unicode_decode_call_errorhandler_writer(
                    errors, &errorHandler,
                    "utf7", "unterminated shift sequence",
                    &starts, &e, &startinpos, &endinpos, &exc, &s,
                    &writer))
                goto onError;
            /* the handler moved s back inside the input: resume decoding */
            if (s < e)
                goto restart;
        }
    }

    /* return state */
    if (consumed) {
        if (inShift) {
            *consumed = startinpos;
            if (writer.pos != shiftOutStart && writer.maxchar > 127) {
                /* Build the result from only the characters written before
                   the shift sequence began. */
                PyObject *result = PyUnicode_FromKindAndData(
                        writer.kind, writer.data, shiftOutStart);
                Py_XDECREF(errorHandler);
                Py_XDECREF(exc);
                _PyUnicodeWriter_Dealloc(&writer);
                return result;
            }
            writer.pos = shiftOutStart; /* back off output */
        }
        else {
            *consumed = s-starts;
        }
    }

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return _PyUnicodeWriter_Finish(&writer);

  onError:
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    _PyUnicodeWriter_Dealloc(&writer);
    return NULL;
}


/* Encode the str object `str` as UTF-7 and return a new bytes object.
 *
 * base64SetO:       non-zero to base64-encode "Set O" characters instead
 *                   of writing them directly.
 * base64WhiteSpace: non-zero to base64-encode whitespace instead of
 *                   writing it directly.
 * errors:           not referenced in this function body.
 *
 * Returns NULL with an exception set on failure. */
PyObject *
_PyUnicode_EncodeUTF7(PyObject *str,
                      int base64SetO,
                      int base64WhiteSpace,
                      const char *errors)
{
    int kind;
    void *data;
    Py_ssize_t len;
    PyObject *v;
    int inShift = 0;
    Py_ssize_t i;
    unsigned int base64bits = 0;
    unsigned long base64buffer = 0;
    char * out;
    char * start;

    if (PyUnicode_READY(str) == -1)
        return NULL;
    kind = PyUnicode_KIND(str);
    data = PyUnicode_DATA(str);
    len = PyUnicode_GET_LENGTH(str);

    if (len == 0)
        return PyBytes_FromStringAndSize(NULL, 0);

    /* It might be possible to tighten this worst case */
    if (len > PY_SSIZE_T_MAX / 8)
        return PyErr_NoMemory();
    v = PyBytes_FromStringAndSize(NULL, len * 8);
    if (v == NULL)
        return NULL;

    start = out = PyBytes_AS_STRING(v);
    for (i = 0; i < len; ++i) {
        Py_UCS4 ch = PyUnicode_READ(kind, data, i);

        if (inShift) {
            if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
                /* shifting out */
                if (base64bits) { /* output remaining bits */
                    *out++ = TO_BASE64(base64buffer << (6-base64bits));
                    base64buffer = 0;
                    base64bits = 0;
                }
                inShift = 0;
                /* Characters not in the BASE64 set implicitly unshift the sequence
                   so no '-' is required, except if the character is itself a '-' */
                if (IS_BASE64(ch) || ch == '-') {
                    *out++ = '-';
                }
                *out++ = (char) ch;
            }
            else {
                goto encode_char;
            }
        }
        else { /* not in a shift sequence */
            if (ch == '+') {
                *out++ = '+';
                        *out++ = '-';
            }
            else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
                *out++ = (char) ch;
            }
            else {
                *out++ = '+';
                inShift = 1;
                goto encode_char;
            }
        }
        continue;
encode_char:
        /* Feed ch into the bit buffer as one or two 16-bit units and emit
           every complete group of 6 bits. */
        if (ch >= 0x10000) {
            assert(ch <= MAX_UNICODE);

            /* code first surrogate */
            base64bits += 16;
            base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
            while (base64bits >= 6) {
                *out++ = TO_BASE64(base64buffer >> (base64bits-6));
                base64bits -= 6;
            }
            /* prepare second surrogate */
            ch = Py_UNICODE_LOW_SURROGATE(ch);
        }
        base64bits += 16;
        base64buffer = (base64buffer << 16) | ch;
        while (base64bits >= 6) {
            *out++ = TO_BASE64(base64buffer >> (base64bits-6));
            base64bits -= 6;
        }
    }
    /* flush leftover bits, then close a still-open shift sequence */
    if (base64bits)
        *out++= TO_BASE64(base64buffer << (6-base64bits) );
    if (inShift)
        *out++ = '-';
    /* shrink the worst-case buffer to the bytes actually written */
    if (_PyBytes_Resize(&v, out - start) < 0)
        return NULL;
    return v;
}
/* Py_UNICODE-buffer variant: wraps `s` in a temporary str object and
   calls _PyUnicode_EncodeUTF7(). */
PyObject *
PyUnicode_EncodeUTF7(const Py_UNICODE *s,
                     Py_ssize_t size,
                     int base64SetO,
                     int base64WhiteSpace,
                     const char *errors)
{
    PyObject *result;
    PyObject *tmp = PyUnicode_FromUnicode(s, size);
    if (tmp == NULL)
        return NULL;
    result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
                                   base64WhiteSpace, errors);
    Py_DECREF(tmp);
    return result;
}

#undef IS_BASE64
#undef FROM_BASE64
#undef TO_BASE64
#undef DECODE_DIRECT
#undef ENCODE_DIRECT

/* --- UTF-8 Codec -------------------------------------------------------- */

/* Stateless UTF-8 decode: PyUnicode_DecodeUTF8Stateful() with
   consumed == NULL. */
PyObject *
PyUnicode_DecodeUTF8(const char *s,
                     Py_ssize_t size,
                     const char *errors)
{
    return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
}

/* stringlib/codecs.h is included once per character width; each *lib.h
   header selects the width and undef.h resets it for the next round. */
#include "stringlib/asciilib.h"
#include "stringlib/codecs.h"
#include "stringlib/undef.h"

#include "stringlib/ucs1lib.h"
#include "stringlib/codecs.h"
#include "stringlib/undef.h"

#include "stringlib/ucs2lib.h"
#include "stringlib/codecs.h"
#include "stringlib/undef.h"

#include "stringlib/ucs4lib.h"
#include "stringlib/codecs.h"
#include "stringlib/undef.h"

/* Mask to quickly check whether a C 'long' contains a
   non-ASCII, UTF8-encoded char. */
#if (SIZEOF_LONG == 8)
# define ASCII_CHAR_MASK 0x8080808080808080UL
#elif (SIZEOF_LONG == 4)
# define ASCII_CHAR_MASK 0x80808080UL
#else
# error C 'long' size should be either 4 or 8!
#endif

/* Copy the leading run of ASCII bytes (high bit clear) from [start, end)
   into dest and return its length.  Where alignment allows, bytes are
   tested a whole 'long' at a time against ASCII_CHAR_MASK. */
static Py_ssize_t
ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
{
    const char *p = start;
    const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);

    /*
     * Issue #17237: m68k is a bit different from most architectures in
     * that objects do not use "natural alignment" - for example, int and
     * long are only aligned at 2-byte boundaries.  Therefore the assert()
     * won't work; also, tests have shown that skipping the "optimised
     * version" will even speed up m68k.
     */
#if !defined(__m68k__)
#if SIZEOF_LONG <= SIZEOF_VOID_P
    assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));
    if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
        /* Fast path, see in STRINGLIB(utf8_decode) for
           an explanation. */
        /* Help allocation */
        const char *_p = p;
        Py_UCS1 * q = dest;
        /* source and destination are both aligned: test and copy one
           'long' per iteration */
        while (_p < aligned_end) {
            unsigned long value = *(const unsigned long *) _p;
            if (value & ASCII_CHAR_MASK)
                break;
            *((unsigned long *)q) = value;
            _p += SIZEOF_LONG;
            q += SIZEOF_LONG;
        }
        p = _p;
        /* finish byte by byte up to the first non-ASCII byte */
        while (p < end) {
            if ((unsigned char)*p & 0x80)
                break;
            *q++ = *p++;
        }
        return p - start;
    }
#endif
#endif
    /* General path: only scan here; the copy is one memcpy() at the end. */
    while (p < end) {
        /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
           for an explanation. */
        if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
            /* Help allocation */
            const char *_p = p;
            while (_p < aligned_end) {
                unsigned long value = *(unsigned long *) _p;
                if (value & ASCII_CHAR_MASK)
                    break;
                _p += SIZEOF_LONG;
            }
            p = _p;
            if (_p == end)
                break;
        }
        if ((unsigned char)*p & 0x80)
            break;
        ++p;
    }
    memcpy(dest, start, p - start);
    return p - start;
}

PyObject *
PyUnicode_DecodeUTF8Stateful(const char *s,
                             Py_ssize_t size,
                             const char *errors,
                             Py_ssize_t *consumed)
{
    _PyUnicodeWriter writer;
    const char *starts = s;
    const char *end = s + size;

    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    const char *errmsg = "";
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;

    if (size == 0) {
        if (consumed)
            *consumed = 0;
        _Py_RETURN_UNICODE_EMPTY();
    }

    /* ASCII is equivalent to the first 128 ordinals in Unicode. */
    if (size == 1 && (unsigned char)s[0] < 128) {
        if (consumed)
            *consumed = 1;
        return get_latin1_char((unsigned char)s[0]);
    }

    _PyUnicodeWriter_Init(&writer);
    writer.min_length = size;
    if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
        goto onError;

    writer.pos = ascii_decode(s, end, writer.data);
    s += writer.pos;
    while (s < end) {
        Py_UCS4 ch;
        int kind = writer.kind;
        if (kind == PyUnicode_1BYTE_KIND) {
            if (PyUnicode_IS_ASCII(writer.buffer))
                ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
            else
                ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
        } else if (kind == PyUnicode_2BYTE_KIND) {
            ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
        } else {
            assert(kind == PyUnicode_4BYTE_KIND);
            ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
        }

        switch (ch) {
        case 0:
            if (s == end || consumed)
                goto End;
            errmsg = "unexpected end of data";
            startinpos = s - starts;
            endinpos = end - starts;
            break;
        case 1:
            errmsg = "invalid start byte";
            startinpos = s - starts;
            endinpos = startinpos + 1;
            break;
        case 2:
        case 3:
        case 4:
            errmsg = "invalid continuation byte";
            startinpos = s - starts;
            endinpos = startinpos + ch - 1;
            break;
        default:
            if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
                goto onError;
            continue;
        }

        if (unicode_decode_call_errorhandler_writer(
                errors, &errorHandler,
                "utf-8", errmsg,
                &starts, &end, &startinpos, &endinpos, &exc, &s,
                &writer))
            goto onError;
    }

End:
    if (consumed)
        *consumed = s - starts;

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return _PyUnicodeWriter_Finish(&writer);

onError:
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
_PyUnicodeWriter_Dealloc(&writer); 4796 return NULL; 4797} 4798 4799#ifdef __APPLE__ 4800 4801/* Simplified UTF-8 decoder using surrogateescape error handler, 4802 used to decode the command line arguments on Mac OS X. 4803 4804 Return a pointer to a newly allocated wide character string (use 4805 PyMem_RawFree() to free the memory), or NULL on memory allocation error. */ 4806 4807wchar_t* 4808_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size) 4809{ 4810 const char *e; 4811 wchar_t *unicode; 4812 Py_ssize_t outpos; 4813 4814 /* Note: size will always be longer than the resulting Unicode 4815 character count */ 4816 if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) < (size + 1)) 4817 return NULL; 4818 unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t)); 4819 if (!unicode) 4820 return NULL; 4821 4822 /* Unpack UTF-8 encoded data */ 4823 e = s + size; 4824 outpos = 0; 4825 while (s < e) { 4826 Py_UCS4 ch; 4827#if SIZEOF_WCHAR_T == 4 4828 ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos); 4829#else 4830 ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos); 4831#endif 4832 if (ch > 0xFF) { 4833#if SIZEOF_WCHAR_T == 4 4834 assert(0); 4835#else 4836 assert(Py_UNICODE_IS_SURROGATE(ch)); 4837 /* compute and append the two surrogates: */ 4838 unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch); 4839 unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch); 4840#endif 4841 } 4842 else { 4843 if (!ch && s == e) 4844 break; 4845 /* surrogateescape */ 4846 unicode[outpos++] = 0xDC00 + (unsigned char)*s++; 4847 } 4848 } 4849 unicode[outpos] = L'\0'; 4850 return unicode; 4851} 4852 4853#endif /* __APPLE__ */ 4854 4855/* Primary internal function which creates utf8 encoded bytes objects. 4856 4857 Allocation strategy: if the string is short, convert into a stack buffer 4858 and allocate exactly as much space needed at the end. 
Else allocate the 4859 maximum possible needed (4 result bytes per Unicode character), and return 4860 the excess memory at the end. 4861*/ 4862PyObject * 4863_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors) 4864{ 4865 enum PyUnicode_Kind kind; 4866 void *data; 4867 Py_ssize_t size; 4868 4869 if (!PyUnicode_Check(unicode)) { 4870 PyErr_BadArgument(); 4871 return NULL; 4872 } 4873 4874 if (PyUnicode_READY(unicode) == -1) 4875 return NULL; 4876 4877 if (PyUnicode_UTF8(unicode)) 4878 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode), 4879 PyUnicode_UTF8_LENGTH(unicode)); 4880 4881 kind = PyUnicode_KIND(unicode); 4882 data = PyUnicode_DATA(unicode); 4883 size = PyUnicode_GET_LENGTH(unicode); 4884 4885 switch (kind) { 4886 default: 4887 assert(0); 4888 case PyUnicode_1BYTE_KIND: 4889 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */ 4890 assert(!PyUnicode_IS_ASCII(unicode)); 4891 return ucs1lib_utf8_encoder(unicode, data, size, errors); 4892 case PyUnicode_2BYTE_KIND: 4893 return ucs2lib_utf8_encoder(unicode, data, size, errors); 4894 case PyUnicode_4BYTE_KIND: 4895 return ucs4lib_utf8_encoder(unicode, data, size, errors); 4896 } 4897} 4898 4899PyObject * 4900PyUnicode_EncodeUTF8(const Py_UNICODE *s, 4901 Py_ssize_t size, 4902 const char *errors) 4903{ 4904 PyObject *v, *unicode; 4905 4906 unicode = PyUnicode_FromUnicode(s, size); 4907 if (unicode == NULL) 4908 return NULL; 4909 v = _PyUnicode_AsUTF8String(unicode, errors); 4910 Py_DECREF(unicode); 4911 return v; 4912} 4913 4914PyObject * 4915PyUnicode_AsUTF8String(PyObject *unicode) 4916{ 4917 return _PyUnicode_AsUTF8String(unicode, NULL); 4918} 4919 4920/* --- UTF-32 Codec ------------------------------------------------------- */ 4921 4922PyObject * 4923PyUnicode_DecodeUTF32(const char *s, 4924 Py_ssize_t size, 4925 const char *errors, 4926 int *byteorder) 4927{ 4928 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL); 4929} 4930 4931PyObject * 
4932PyUnicode_DecodeUTF32Stateful(const char *s, 4933 Py_ssize_t size, 4934 const char *errors, 4935 int *byteorder, 4936 Py_ssize_t *consumed) 4937{ 4938 const char *starts = s; 4939 Py_ssize_t startinpos; 4940 Py_ssize_t endinpos; 4941 _PyUnicodeWriter writer; 4942 const unsigned char *q, *e; 4943 int le, bo = 0; /* assume native ordering by default */ 4944 const char *encoding; 4945 const char *errmsg = ""; 4946 PyObject *errorHandler = NULL; 4947 PyObject *exc = NULL; 4948 4949 q = (unsigned char *)s; 4950 e = q + size; 4951 4952 if (byteorder) 4953 bo = *byteorder; 4954 4955 /* Check for BOM marks (U+FEFF) in the input and adjust current 4956 byte order setting accordingly. In native mode, the leading BOM 4957 mark is skipped, in all other modes, it is copied to the output 4958 stream as-is (giving a ZWNBSP character). */ 4959 if (bo == 0 && size >= 4) { 4960 Py_UCS4 bom = (q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0]; 4961 if (bom == 0x0000FEFF) { 4962 bo = -1; 4963 q += 4; 4964 } 4965 else if (bom == 0xFFFE0000) { 4966 bo = 1; 4967 q += 4; 4968 } 4969 if (byteorder) 4970 *byteorder = bo; 4971 } 4972 4973 if (q == e) { 4974 if (consumed) 4975 *consumed = size; 4976 _Py_RETURN_UNICODE_EMPTY(); 4977 } 4978 4979#ifdef WORDS_BIGENDIAN 4980 le = bo < 0; 4981#else 4982 le = bo <= 0; 4983#endif 4984 encoding = le ? 
"utf-32-le" : "utf-32-be"; 4985 4986 _PyUnicodeWriter_Init(&writer); 4987 writer.min_length = (e - q + 3) / 4; 4988 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1) 4989 goto onError; 4990 4991 while (1) { 4992 Py_UCS4 ch = 0; 4993 Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer); 4994 4995 if (e - q >= 4) { 4996 enum PyUnicode_Kind kind = writer.kind; 4997 void *data = writer.data; 4998 const unsigned char *last = e - 4; 4999 Py_ssize_t pos = writer.pos; 5000 if (le) { 5001 do { 5002 ch = (q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0]; 5003 if (ch > maxch) 5004 break; 5005 if (kind != PyUnicode_1BYTE_KIND && 5006 Py_UNICODE_IS_SURROGATE(ch)) 5007 break; 5008 PyUnicode_WRITE(kind, data, pos++, ch); 5009 q += 4; 5010 } while (q <= last); 5011 } 5012 else { 5013 do { 5014 ch = (q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3]; 5015 if (ch > maxch) 5016 break; 5017 if (kind != PyUnicode_1BYTE_KIND && 5018 Py_UNICODE_IS_SURROGATE(ch)) 5019 break; 5020 PyUnicode_WRITE(kind, data, pos++, ch); 5021 q += 4; 5022 } while (q <= last); 5023 } 5024 writer.pos = pos; 5025 } 5026 5027 if (Py_UNICODE_IS_SURROGATE(ch)) { 5028 errmsg = "code point in surrogate code point range(0xd800, 0xe000)"; 5029 startinpos = ((const char *)q) - starts; 5030 endinpos = startinpos + 4; 5031 } 5032 else if (ch <= maxch) { 5033 if (q == e || consumed) 5034 break; 5035 /* remaining bytes at the end? 
(size should be divisible by 4) */ 5036 errmsg = "truncated data"; 5037 startinpos = ((const char *)q) - starts; 5038 endinpos = ((const char *)e) - starts; 5039 } 5040 else { 5041 if (ch < 0x110000) { 5042 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) 5043 goto onError; 5044 q += 4; 5045 continue; 5046 } 5047 errmsg = "code point not in range(0x110000)"; 5048 startinpos = ((const char *)q) - starts; 5049 endinpos = startinpos + 4; 5050 } 5051 5052 /* The remaining input chars are ignored if the callback 5053 chooses to skip the input */ 5054 if (unicode_decode_call_errorhandler_writer( 5055 errors, &errorHandler, 5056 encoding, errmsg, 5057 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q, 5058 &writer)) 5059 goto onError; 5060 } 5061 5062 if (consumed) 5063 *consumed = (const char *)q-starts; 5064 5065 Py_XDECREF(errorHandler); 5066 Py_XDECREF(exc); 5067 return _PyUnicodeWriter_Finish(&writer); 5068 5069 onError: 5070 _PyUnicodeWriter_Dealloc(&writer); 5071 Py_XDECREF(errorHandler); 5072 Py_XDECREF(exc); 5073 return NULL; 5074} 5075 5076PyObject * 5077_PyUnicode_EncodeUTF32(PyObject *str, 5078 const char *errors, 5079 int byteorder) 5080{ 5081 enum PyUnicode_Kind kind; 5082 const void *data; 5083 Py_ssize_t len; 5084 PyObject *v; 5085 PY_UINT32_T *out; 5086#if PY_LITTLE_ENDIAN 5087 int native_ordering = byteorder <= 0; 5088#else 5089 int native_ordering = byteorder >= 0; 5090#endif 5091 const char *encoding; 5092 Py_ssize_t nsize, pos; 5093 PyObject *errorHandler = NULL; 5094 PyObject *exc = NULL; 5095 PyObject *rep = NULL; 5096 5097 if (!PyUnicode_Check(str)) { 5098 PyErr_BadArgument(); 5099 return NULL; 5100 } 5101 if (PyUnicode_READY(str) == -1) 5102 return NULL; 5103 kind = PyUnicode_KIND(str); 5104 data = PyUnicode_DATA(str); 5105 len = PyUnicode_GET_LENGTH(str); 5106 5107 if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0)) 5108 return PyErr_NoMemory(); 5109 nsize = len + (byteorder == 0); 5110 v = 
PyBytes_FromStringAndSize(NULL, nsize * 4); 5111 if (v == NULL) 5112 return NULL; 5113 5114 /* output buffer is 4-bytes aligned */ 5115 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4)); 5116 out = (PY_UINT32_T *)PyBytes_AS_STRING(v); 5117 if (byteorder == 0) 5118 *out++ = 0xFEFF; 5119 if (len == 0) 5120 goto done; 5121 5122 if (byteorder == -1) 5123 encoding = "utf-32-le"; 5124 else if (byteorder == 1) 5125 encoding = "utf-32-be"; 5126 else 5127 encoding = "utf-32"; 5128 5129 if (kind == PyUnicode_1BYTE_KIND) { 5130 ucs1lib_utf32_encode((const Py_UCS1 *)data, len, &out, native_ordering); 5131 goto done; 5132 } 5133 5134 pos = 0; 5135 while (pos < len) { 5136 Py_ssize_t repsize, moreunits; 5137 5138 if (kind == PyUnicode_2BYTE_KIND) { 5139 pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos, 5140 &out, native_ordering); 5141 } 5142 else { 5143 assert(kind == PyUnicode_4BYTE_KIND); 5144 pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos, 5145 &out, native_ordering); 5146 } 5147 if (pos == len) 5148 break; 5149 5150 rep = unicode_encode_call_errorhandler( 5151 errors, &errorHandler, 5152 encoding, "surrogates not allowed", 5153 str, &exc, pos, pos + 1, &pos); 5154 if (!rep) 5155 goto error; 5156 5157 if (PyBytes_Check(rep)) { 5158 repsize = PyBytes_GET_SIZE(rep); 5159 if (repsize & 3) { 5160 raise_encode_exception(&exc, encoding, 5161 str, pos - 1, pos, 5162 "surrogates not allowed"); 5163 goto error; 5164 } 5165 moreunits = repsize / 4; 5166 } 5167 else { 5168 assert(PyUnicode_Check(rep)); 5169 if (PyUnicode_READY(rep) < 0) 5170 goto error; 5171 moreunits = repsize = PyUnicode_GET_LENGTH(rep); 5172 if (!PyUnicode_IS_ASCII(rep)) { 5173 raise_encode_exception(&exc, encoding, 5174 str, pos - 1, pos, 5175 "surrogates not allowed"); 5176 goto error; 5177 } 5178 } 5179 5180 /* four bytes are reserved for each surrogate */ 5181 if (moreunits > 1) { 5182 Py_ssize_t outpos = out - (PY_UINT32_T*) PyBytes_AS_STRING(v); 5183 Py_ssize_t morebytes = 4 
* (moreunits - 1); 5184 if (PyBytes_GET_SIZE(v) > PY_SSIZE_T_MAX - morebytes) { 5185 /* integer overflow */ 5186 PyErr_NoMemory(); 5187 goto error; 5188 } 5189 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + morebytes) < 0) 5190 goto error; 5191 out = (PY_UINT32_T*) PyBytes_AS_STRING(v) + outpos; 5192 } 5193 5194 if (PyBytes_Check(rep)) { 5195 Py_MEMCPY(out, PyBytes_AS_STRING(rep), repsize); 5196 out += moreunits; 5197 } else /* rep is unicode */ { 5198 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND); 5199 ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize, 5200 &out, native_ordering); 5201 } 5202 5203 Py_CLEAR(rep); 5204 } 5205 5206 /* Cut back to size actually needed. This is necessary for, for example, 5207 encoding of a string containing isolated surrogates and the 'ignore' 5208 handler is used. */ 5209 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v); 5210 if (nsize != PyBytes_GET_SIZE(v)) 5211 _PyBytes_Resize(&v, nsize); 5212 Py_XDECREF(errorHandler); 5213 Py_XDECREF(exc); 5214 done: 5215 return v; 5216 error: 5217 Py_XDECREF(rep); 5218 Py_XDECREF(errorHandler); 5219 Py_XDECREF(exc); 5220 Py_XDECREF(v); 5221 return NULL; 5222} 5223 5224PyObject * 5225PyUnicode_EncodeUTF32(const Py_UNICODE *s, 5226 Py_ssize_t size, 5227 const char *errors, 5228 int byteorder) 5229{ 5230 PyObject *result; 5231 PyObject *tmp = PyUnicode_FromUnicode(s, size); 5232 if (tmp == NULL) 5233 return NULL; 5234 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder); 5235 Py_DECREF(tmp); 5236 return result; 5237} 5238 5239PyObject * 5240PyUnicode_AsUTF32String(PyObject *unicode) 5241{ 5242 return _PyUnicode_EncodeUTF32(unicode, NULL, 0); 5243} 5244 5245/* --- UTF-16 Codec ------------------------------------------------------- */ 5246 5247PyObject * 5248PyUnicode_DecodeUTF16(const char *s, 5249 Py_ssize_t size, 5250 const char *errors, 5251 int *byteorder) 5252{ 5253 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL); 5254} 5255 5256PyObject 
* 5257PyUnicode_DecodeUTF16Stateful(const char *s, 5258 Py_ssize_t size, 5259 const char *errors, 5260 int *byteorder, 5261 Py_ssize_t *consumed) 5262{ 5263 const char *starts = s; 5264 Py_ssize_t startinpos; 5265 Py_ssize_t endinpos; 5266 _PyUnicodeWriter writer; 5267 const unsigned char *q, *e; 5268 int bo = 0; /* assume native ordering by default */ 5269 int native_ordering; 5270 const char *errmsg = ""; 5271 PyObject *errorHandler = NULL; 5272 PyObject *exc = NULL; 5273 const char *encoding; 5274 5275 q = (unsigned char *)s; 5276 e = q + size; 5277 5278 if (byteorder) 5279 bo = *byteorder; 5280 5281 /* Check for BOM marks (U+FEFF) in the input and adjust current 5282 byte order setting accordingly. In native mode, the leading BOM 5283 mark is skipped, in all other modes, it is copied to the output 5284 stream as-is (giving a ZWNBSP character). */ 5285 if (bo == 0 && size >= 2) { 5286 const Py_UCS4 bom = (q[1] << 8) | q[0]; 5287 if (bom == 0xFEFF) { 5288 q += 2; 5289 bo = -1; 5290 } 5291 else if (bom == 0xFFFE) { 5292 q += 2; 5293 bo = 1; 5294 } 5295 if (byteorder) 5296 *byteorder = bo; 5297 } 5298 5299 if (q == e) { 5300 if (consumed) 5301 *consumed = size; 5302 _Py_RETURN_UNICODE_EMPTY(); 5303 } 5304 5305#if PY_LITTLE_ENDIAN 5306 native_ordering = bo <= 0; 5307 encoding = bo <= 0 ? "utf-16-le" : "utf-16-be"; 5308#else 5309 native_ordering = bo >= 0; 5310 encoding = bo >= 0 ? 
"utf-16-be" : "utf-16-le"; 5311#endif 5312 5313 /* Note: size will always be longer than the resulting Unicode 5314 character count */ 5315 _PyUnicodeWriter_Init(&writer); 5316 writer.min_length = (e - q + 1) / 2; 5317 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1) 5318 goto onError; 5319 5320 while (1) { 5321 Py_UCS4 ch = 0; 5322 if (e - q >= 2) { 5323 int kind = writer.kind; 5324 if (kind == PyUnicode_1BYTE_KIND) { 5325 if (PyUnicode_IS_ASCII(writer.buffer)) 5326 ch = asciilib_utf16_decode(&q, e, 5327 (Py_UCS1*)writer.data, &writer.pos, 5328 native_ordering); 5329 else 5330 ch = ucs1lib_utf16_decode(&q, e, 5331 (Py_UCS1*)writer.data, &writer.pos, 5332 native_ordering); 5333 } else if (kind == PyUnicode_2BYTE_KIND) { 5334 ch = ucs2lib_utf16_decode(&q, e, 5335 (Py_UCS2*)writer.data, &writer.pos, 5336 native_ordering); 5337 } else { 5338 assert(kind == PyUnicode_4BYTE_KIND); 5339 ch = ucs4lib_utf16_decode(&q, e, 5340 (Py_UCS4*)writer.data, &writer.pos, 5341 native_ordering); 5342 } 5343 } 5344 5345 switch (ch) 5346 { 5347 case 0: 5348 /* remaining byte at the end? 
(size should be even) */ 5349 if (q == e || consumed) 5350 goto End; 5351 errmsg = "truncated data"; 5352 startinpos = ((const char *)q) - starts; 5353 endinpos = ((const char *)e) - starts; 5354 break; 5355 /* The remaining input chars are ignored if the callback 5356 chooses to skip the input */ 5357 case 1: 5358 q -= 2; 5359 if (consumed) 5360 goto End; 5361 errmsg = "unexpected end of data"; 5362 startinpos = ((const char *)q) - starts; 5363 endinpos = ((const char *)e) - starts; 5364 break; 5365 case 2: 5366 errmsg = "illegal encoding"; 5367 startinpos = ((const char *)q) - 2 - starts; 5368 endinpos = startinpos + 2; 5369 break; 5370 case 3: 5371 errmsg = "illegal UTF-16 surrogate"; 5372 startinpos = ((const char *)q) - 4 - starts; 5373 endinpos = startinpos + 2; 5374 break; 5375 default: 5376 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) 5377 goto onError; 5378 continue; 5379 } 5380 5381 if (unicode_decode_call_errorhandler_writer( 5382 errors, 5383 &errorHandler, 5384 encoding, errmsg, 5385 &starts, 5386 (const char **)&e, 5387 &startinpos, 5388 &endinpos, 5389 &exc, 5390 (const char **)&q, 5391 &writer)) 5392 goto onError; 5393 } 5394 5395End: 5396 if (consumed) 5397 *consumed = (const char *)q-starts; 5398 5399 Py_XDECREF(errorHandler); 5400 Py_XDECREF(exc); 5401 return _PyUnicodeWriter_Finish(&writer); 5402 5403 onError: 5404 _PyUnicodeWriter_Dealloc(&writer); 5405 Py_XDECREF(errorHandler); 5406 Py_XDECREF(exc); 5407 return NULL; 5408} 5409 5410PyObject * 5411_PyUnicode_EncodeUTF16(PyObject *str, 5412 const char *errors, 5413 int byteorder) 5414{ 5415 enum PyUnicode_Kind kind; 5416 const void *data; 5417 Py_ssize_t len; 5418 PyObject *v; 5419 unsigned short *out; 5420 Py_ssize_t pairs; 5421#if PY_BIG_ENDIAN 5422 int native_ordering = byteorder >= 0; 5423#else 5424 int native_ordering = byteorder <= 0; 5425#endif 5426 const char *encoding; 5427 Py_ssize_t nsize, pos; 5428 PyObject *errorHandler = NULL; 5429 PyObject *exc = NULL; 5430 PyObject *rep 
= NULL; 5431 5432 if (!PyUnicode_Check(str)) { 5433 PyErr_BadArgument(); 5434 return NULL; 5435 } 5436 if (PyUnicode_READY(str) == -1) 5437 return NULL; 5438 kind = PyUnicode_KIND(str); 5439 data = PyUnicode_DATA(str); 5440 len = PyUnicode_GET_LENGTH(str); 5441 5442 pairs = 0; 5443 if (kind == PyUnicode_4BYTE_KIND) { 5444 const Py_UCS4 *in = (const Py_UCS4 *)data; 5445 const Py_UCS4 *end = in + len; 5446 while (in < end) 5447 if (*in++ >= 0x10000) 5448 pairs++; 5449 } 5450 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0)) 5451 return PyErr_NoMemory(); 5452 nsize = len + pairs + (byteorder == 0); 5453 v = PyBytes_FromStringAndSize(NULL, nsize * 2); 5454 if (v == NULL) 5455 return NULL; 5456 5457 /* output buffer is 2-bytes aligned */ 5458 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2)); 5459 out = (unsigned short *)PyBytes_AS_STRING(v); 5460 if (byteorder == 0) 5461 *out++ = 0xFEFF; 5462 if (len == 0) 5463 goto done; 5464 5465 if (kind == PyUnicode_1BYTE_KIND) { 5466 ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering); 5467 goto done; 5468 } 5469 5470 if (byteorder < 0) 5471 encoding = "utf-16-le"; 5472 else if (byteorder > 0) 5473 encoding = "utf-16-be"; 5474 else 5475 encoding = "utf-16"; 5476 5477 pos = 0; 5478 while (pos < len) { 5479 Py_ssize_t repsize, moreunits; 5480 5481 if (kind == PyUnicode_2BYTE_KIND) { 5482 pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos, 5483 &out, native_ordering); 5484 } 5485 else { 5486 assert(kind == PyUnicode_4BYTE_KIND); 5487 pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos, 5488 &out, native_ordering); 5489 } 5490 if (pos == len) 5491 break; 5492 5493 rep = unicode_encode_call_errorhandler( 5494 errors, &errorHandler, 5495 encoding, "surrogates not allowed", 5496 str, &exc, pos, pos + 1, &pos); 5497 if (!rep) 5498 goto error; 5499 5500 if (PyBytes_Check(rep)) { 5501 repsize = PyBytes_GET_SIZE(rep); 5502 if (repsize & 1) { 5503 raise_encode_exception(&exc, encoding, 
5504 str, pos - 1, pos, 5505 "surrogates not allowed"); 5506 goto error; 5507 } 5508 moreunits = repsize / 2; 5509 } 5510 else { 5511 assert(PyUnicode_Check(rep)); 5512 if (PyUnicode_READY(rep) < 0) 5513 goto error; 5514 moreunits = repsize = PyUnicode_GET_LENGTH(rep); 5515 if (!PyUnicode_IS_ASCII(rep)) { 5516 raise_encode_exception(&exc, encoding, 5517 str, pos - 1, pos, 5518 "surrogates not allowed"); 5519 goto error; 5520 } 5521 } 5522 5523 /* two bytes are reserved for each surrogate */ 5524 if (moreunits > 1) { 5525 Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v); 5526 Py_ssize_t morebytes = 2 * (moreunits - 1); 5527 if (PyBytes_GET_SIZE(v) > PY_SSIZE_T_MAX - morebytes) { 5528 /* integer overflow */ 5529 PyErr_NoMemory(); 5530 goto error; 5531 } 5532 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + morebytes) < 0) 5533 goto error; 5534 out = (unsigned short*) PyBytes_AS_STRING(v) + outpos; 5535 } 5536 5537 if (PyBytes_Check(rep)) { 5538 Py_MEMCPY(out, PyBytes_AS_STRING(rep), repsize); 5539 out += moreunits; 5540 } else /* rep is unicode */ { 5541 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND); 5542 ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize, 5543 &out, native_ordering); 5544 } 5545 5546 Py_CLEAR(rep); 5547 } 5548 5549 /* Cut back to size actually needed. This is necessary for, for example, 5550 encoding of a string containing isolated surrogates and the 'ignore' handler 5551 is used. 
*/ 5552 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v); 5553 if (nsize != PyBytes_GET_SIZE(v)) 5554 _PyBytes_Resize(&v, nsize); 5555 Py_XDECREF(errorHandler); 5556 Py_XDECREF(exc); 5557 done: 5558 return v; 5559 error: 5560 Py_XDECREF(rep); 5561 Py_XDECREF(errorHandler); 5562 Py_XDECREF(exc); 5563 Py_XDECREF(v); 5564 return NULL; 5565#undef STORECHAR 5566} 5567 5568PyObject * 5569PyUnicode_EncodeUTF16(const Py_UNICODE *s, 5570 Py_ssize_t size, 5571 const char *errors, 5572 int byteorder) 5573{ 5574 PyObject *result; 5575 PyObject *tmp = PyUnicode_FromUnicode(s, size); 5576 if (tmp == NULL) 5577 return NULL; 5578 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder); 5579 Py_DECREF(tmp); 5580 return result; 5581} 5582 5583PyObject * 5584PyUnicode_AsUTF16String(PyObject *unicode) 5585{ 5586 return _PyUnicode_EncodeUTF16(unicode, NULL, 0); 5587} 5588 5589/* --- Unicode Escape Codec ----------------------------------------------- */ 5590 5591/* Helper function for PyUnicode_DecodeUnicodeEscape, determines 5592 if all the escapes in the string make it still a valid ASCII string. 5593 Returns -1 if any escapes were found which cause the string to 5594 pop out of ASCII range. Otherwise returns the length of the 5595 required buffer to hold the string. 5596 */ 5597static Py_ssize_t 5598length_of_escaped_ascii_string(const char *s, Py_ssize_t size) 5599{ 5600 const unsigned char *p = (const unsigned char *)s; 5601 const unsigned char *end = p + size; 5602 Py_ssize_t length = 0; 5603 5604 if (size < 0) 5605 return -1; 5606 5607 for (; p < end; ++p) { 5608 if (*p > 127) { 5609 /* Non-ASCII */ 5610 return -1; 5611 } 5612 else if (*p != '\\') { 5613 /* Normal character */ 5614 ++length; 5615 } 5616 else { 5617 /* Backslash-escape, check next char */ 5618 ++p; 5619 /* Escape sequence reaches till end of string or 5620 non-ASCII follow-up. 
*/ 5621 if (p >= end || *p > 127) 5622 return -1; 5623 switch (*p) { 5624 case '\n': 5625 /* backslash + \n result in zero characters */ 5626 break; 5627 case '\\': case '\'': case '\"': 5628 case 'b': case 'f': case 't': 5629 case 'n': case 'r': case 'v': case 'a': 5630 ++length; 5631 break; 5632 case '0': case '1': case '2': case '3': 5633 case '4': case '5': case '6': case '7': 5634 case 'x': case 'u': case 'U': case 'N': 5635 /* these do not guarantee ASCII characters */ 5636 return -1; 5637 default: 5638 /* count the backslash + the other character */ 5639 length += 2; 5640 } 5641 } 5642 } 5643 return length; 5644} 5645 5646static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL; 5647 5648PyObject * 5649PyUnicode_DecodeUnicodeEscape(const char *s, 5650 Py_ssize_t size, 5651 const char *errors) 5652{ 5653 const char *starts = s; 5654 Py_ssize_t startinpos; 5655 Py_ssize_t endinpos; 5656 _PyUnicodeWriter writer; 5657 const char *end; 5658 char* message; 5659 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */ 5660 PyObject *errorHandler = NULL; 5661 PyObject *exc = NULL; 5662 Py_ssize_t len; 5663 5664 len = length_of_escaped_ascii_string(s, size); 5665 if (len == 0) 5666 _Py_RETURN_UNICODE_EMPTY(); 5667 5668 /* After length_of_escaped_ascii_string() there are two alternatives, 5669 either the string is pure ASCII with named escapes like \n, etc. 5670 and we determined it's exact size (common case) 5671 or it contains \x, \u, ... escape sequences. then we create a 5672 legacy wchar string and resize it at the end of this function. */ 5673 _PyUnicodeWriter_Init(&writer); 5674 if (len > 0) { 5675 writer.min_length = len; 5676 } 5677 else { 5678 /* Escaped strings will always be longer than the resulting 5679 Unicode string, so we start with size here and then reduce the 5680 length after conversion to the true value. 
5681 (but if the error callback returns a long replacement string 5682 we'll have to allocate more space) */ 5683 writer.min_length = size; 5684 } 5685 5686 if (size == 0) 5687 return _PyUnicodeWriter_Finish(&writer); 5688 end = s + size; 5689 5690 while (s < end) { 5691 unsigned char c; 5692 Py_UCS4 x; 5693 int digits; 5694 5695 /* Non-escape characters are interpreted as Unicode ordinals */ 5696 if (*s != '\\') { 5697 x = (unsigned char)*s; 5698 s++; 5699 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0) 5700 goto onError; 5701 continue; 5702 } 5703 5704 startinpos = s-starts; 5705 /* \ - Escapes */ 5706 s++; 5707 c = *s++; 5708 if (s > end) 5709 c = '\0'; /* Invalid after \ */ 5710 5711 switch (c) { 5712 5713 /* \x escapes */ 5714#define WRITECHAR(ch) \ 5715 do { \ 5716 if (_PyUnicodeWriter_WriteCharInline(&writer, (ch)) < 0) \ 5717 goto onError; \ 5718 } while(0) 5719 5720 case '\n': break; 5721 case '\\': WRITECHAR('\\'); break; 5722 case '\'': WRITECHAR('\''); break; 5723 case '\"': WRITECHAR('\"'); break; 5724 case 'b': WRITECHAR('\b'); break; 5725 /* FF */ 5726 case 'f': WRITECHAR('\014'); break; 5727 case 't': WRITECHAR('\t'); break; 5728 case 'n': WRITECHAR('\n'); break; 5729 case 'r': WRITECHAR('\r'); break; 5730 /* VT */ 5731 case 'v': WRITECHAR('\013'); break; 5732 /* BEL, not classic C */ 5733 case 'a': WRITECHAR('\007'); break; 5734 5735 /* \OOO (octal) escapes */ 5736 case '0': case '1': case '2': case '3': 5737 case '4': case '5': case '6': case '7': 5738 x = s[-1] - '0'; 5739 if (s < end && '0' <= *s && *s <= '7') { 5740 x = (x<<3) + *s++ - '0'; 5741 if (s < end && '0' <= *s && *s <= '7') 5742 x = (x<<3) + *s++ - '0'; 5743 } 5744 WRITECHAR(x); 5745 break; 5746 5747 /* hex escapes */ 5748 /* \xXX */ 5749 case 'x': 5750 digits = 2; 5751 message = "truncated \\xXX escape"; 5752 goto hexescape; 5753 5754 /* \uXXXX */ 5755 case 'u': 5756 digits = 4; 5757 message = "truncated \\uXXXX escape"; 5758 goto hexescape; 5759 5760 /* \UXXXXXXXX */ 5761 case 
'U': 5762 digits = 8; 5763 message = "truncated \\UXXXXXXXX escape"; 5764 hexescape: 5765 chr = 0; 5766 if (end - s < digits) { 5767 /* count only hex digits */ 5768 for (; s < end; ++s) { 5769 c = (unsigned char)*s; 5770 if (!Py_ISXDIGIT(c)) 5771 goto error; 5772 } 5773 goto error; 5774 } 5775 for (; digits--; ++s) { 5776 c = (unsigned char)*s; 5777 if (!Py_ISXDIGIT(c)) 5778 goto error; 5779 chr = (chr<<4) & ~0xF; 5780 if (c >= '0' && c <= '9') 5781 chr += c - '0'; 5782 else if (c >= 'a' && c <= 'f') 5783 chr += 10 + c - 'a'; 5784 else 5785 chr += 10 + c - 'A'; 5786 } 5787 if (chr == 0xffffffff && PyErr_Occurred()) 5788 /* _decoding_error will have already written into the 5789 target buffer. */ 5790 break; 5791 store: 5792 /* when we get here, chr is a 32-bit unicode character */ 5793 message = "illegal Unicode character"; 5794 if (chr > MAX_UNICODE) 5795 goto error; 5796 WRITECHAR(chr); 5797 break; 5798 5799 /* \N{name} */ 5800 case 'N': 5801 message = "malformed \\N character escape"; 5802 if (ucnhash_CAPI == NULL) { 5803 /* load the unicode data module */ 5804 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import( 5805 PyUnicodeData_CAPSULE_NAME, 1); 5806 if (ucnhash_CAPI == NULL) 5807 goto ucnhashError; 5808 } 5809 if (*s == '{') { 5810 const char *start = s+1; 5811 /* look for the closing brace */ 5812 while (*s != '}' && s < end) 5813 s++; 5814 if (s > start && s < end && *s == '}') { 5815 /* found a name. 
look it up in the unicode database */ 5816 message = "unknown Unicode character name"; 5817 s++; 5818 if (s - start - 1 <= INT_MAX && 5819 ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), 5820 &chr, 0)) 5821 goto store; 5822 } 5823 } 5824 goto error; 5825 5826 default: 5827 if (s > end) { 5828 message = "\\ at end of string"; 5829 s--; 5830 goto error; 5831 } 5832 else { 5833 WRITECHAR('\\'); 5834 WRITECHAR((unsigned char)s[-1]); 5835 } 5836 break; 5837 } 5838 continue; 5839 5840 error: 5841 endinpos = s-starts; 5842 if (unicode_decode_call_errorhandler_writer( 5843 errors, &errorHandler, 5844 "unicodeescape", message, 5845 &starts, &end, &startinpos, &endinpos, &exc, &s, 5846 &writer)) 5847 goto onError; 5848 continue; 5849 } 5850#undef WRITECHAR 5851 5852 Py_XDECREF(errorHandler); 5853 Py_XDECREF(exc); 5854 return _PyUnicodeWriter_Finish(&writer); 5855 5856 ucnhashError: 5857 PyErr_SetString( 5858 PyExc_UnicodeError, 5859 "\\N escapes not supported (can't load unicodedata module)" 5860 ); 5861 _PyUnicodeWriter_Dealloc(&writer); 5862 Py_XDECREF(errorHandler); 5863 Py_XDECREF(exc); 5864 return NULL; 5865 5866 onError: 5867 _PyUnicodeWriter_Dealloc(&writer); 5868 Py_XDECREF(errorHandler); 5869 Py_XDECREF(exc); 5870 return NULL; 5871} 5872 5873/* Return a Unicode-Escape string version of the Unicode object. 5874 5875 If quotes is true, the string is enclosed in u"" or u'' quotes as 5876 appropriate. 5877 5878*/ 5879 5880PyObject * 5881PyUnicode_AsUnicodeEscapeString(PyObject *unicode) 5882{ 5883 Py_ssize_t i, len; 5884 PyObject *repr; 5885 char *p; 5886 int kind; 5887 void *data; 5888 Py_ssize_t expandsize = 0; 5889 5890 /* Initial allocation is based on the longest-possible character 5891 escape. 5892 5893 For UCS1 strings it's '\xxx', 4 bytes per source character. 5894 For UCS2 strings it's '\uxxxx', 6 bytes per source character. 5895 For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character. 
5896 */ 5897 5898 if (!PyUnicode_Check(unicode)) { 5899 PyErr_BadArgument(); 5900 return NULL; 5901 } 5902 if (PyUnicode_READY(unicode) == -1) 5903 return NULL; 5904 len = PyUnicode_GET_LENGTH(unicode); 5905 kind = PyUnicode_KIND(unicode); 5906 data = PyUnicode_DATA(unicode); 5907 switch (kind) { 5908 case PyUnicode_1BYTE_KIND: expandsize = 4; break; 5909 case PyUnicode_2BYTE_KIND: expandsize = 6; break; 5910 case PyUnicode_4BYTE_KIND: expandsize = 10; break; 5911 } 5912 5913 if (len == 0) 5914 return PyBytes_FromStringAndSize(NULL, 0); 5915 5916 if (len > (PY_SSIZE_T_MAX - 2 - 1) / expandsize) 5917 return PyErr_NoMemory(); 5918 5919 repr = PyBytes_FromStringAndSize(NULL, 5920 2 5921 + expandsize*len 5922 + 1); 5923 if (repr == NULL) 5924 return NULL; 5925 5926 p = PyBytes_AS_STRING(repr); 5927 5928 for (i = 0; i < len; i++) { 5929 Py_UCS4 ch = PyUnicode_READ(kind, data, i); 5930 5931 /* Escape backslashes */ 5932 if (ch == '\\') { 5933 *p++ = '\\'; 5934 *p++ = (char) ch; 5935 continue; 5936 } 5937 5938 /* Map 21-bit characters to '\U00xxxxxx' */ 5939 else if (ch >= 0x10000) { 5940 assert(ch <= MAX_UNICODE); 5941 *p++ = '\\'; 5942 *p++ = 'U'; 5943 *p++ = Py_hexdigits[(ch >> 28) & 0x0000000F]; 5944 *p++ = Py_hexdigits[(ch >> 24) & 0x0000000F]; 5945 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F]; 5946 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F]; 5947 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F]; 5948 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F]; 5949 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F]; 5950 *p++ = Py_hexdigits[ch & 0x0000000F]; 5951 continue; 5952 } 5953 5954 /* Map 16-bit characters to '\uxxxx' */ 5955 if (ch >= 256) { 5956 *p++ = '\\'; 5957 *p++ = 'u'; 5958 *p++ = Py_hexdigits[(ch >> 12) & 0x000F]; 5959 *p++ = Py_hexdigits[(ch >> 8) & 0x000F]; 5960 *p++ = Py_hexdigits[(ch >> 4) & 0x000F]; 5961 *p++ = Py_hexdigits[ch & 0x000F]; 5962 } 5963 5964 /* Map special whitespace to '\t', \n', '\r' */ 5965 else if (ch == '\t') { 5966 *p++ = '\\'; 5967 *p++ = 't'; 5968 
} 5969 else if (ch == '\n') { 5970 *p++ = '\\'; 5971 *p++ = 'n'; 5972 } 5973 else if (ch == '\r') { 5974 *p++ = '\\'; 5975 *p++ = 'r'; 5976 } 5977 5978 /* Map non-printable US ASCII to '\xhh' */ 5979 else if (ch < ' ' || ch >= 0x7F) { 5980 *p++ = '\\'; 5981 *p++ = 'x'; 5982 *p++ = Py_hexdigits[(ch >> 4) & 0x000F]; 5983 *p++ = Py_hexdigits[ch & 0x000F]; 5984 } 5985 5986 /* Copy everything else as-is */ 5987 else 5988 *p++ = (char) ch; 5989 } 5990 5991 assert(p - PyBytes_AS_STRING(repr) > 0); 5992 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) 5993 return NULL; 5994 return repr; 5995} 5996 5997PyObject * 5998PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s, 5999 Py_ssize_t size) 6000{ 6001 PyObject *result; 6002 PyObject *tmp = PyUnicode_FromUnicode(s, size); 6003 if (tmp == NULL) 6004 return NULL; 6005 result = PyUnicode_AsUnicodeEscapeString(tmp); 6006 Py_DECREF(tmp); 6007 return result; 6008} 6009 6010/* --- Raw Unicode Escape Codec ------------------------------------------- */ 6011 6012PyObject * 6013PyUnicode_DecodeRawUnicodeEscape(const char *s, 6014 Py_ssize_t size, 6015 const char *errors) 6016{ 6017 const char *starts = s; 6018 Py_ssize_t startinpos; 6019 Py_ssize_t endinpos; 6020 _PyUnicodeWriter writer; 6021 const char *end; 6022 const char *bs; 6023 PyObject *errorHandler = NULL; 6024 PyObject *exc = NULL; 6025 6026 if (size == 0) 6027 _Py_RETURN_UNICODE_EMPTY(); 6028 6029 /* Escaped strings will always be longer than the resulting 6030 Unicode string, so we start with size here and then reduce the 6031 length after conversion to the true value. 
(But decoding error 6032 handler might have to resize the string) */ 6033 _PyUnicodeWriter_Init(&writer); 6034 writer.min_length = size; 6035 6036 end = s + size; 6037 while (s < end) { 6038 unsigned char c; 6039 Py_UCS4 x; 6040 int i; 6041 int count; 6042 6043 /* Non-escape characters are interpreted as Unicode ordinals */ 6044 if (*s != '\\') { 6045 x = (unsigned char)*s++; 6046 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0) 6047 goto onError; 6048 continue; 6049 } 6050 startinpos = s-starts; 6051 6052 /* \u-escapes are only interpreted iff the number of leading 6053 backslashes if odd */ 6054 bs = s; 6055 for (;s < end;) { 6056 if (*s != '\\') 6057 break; 6058 x = (unsigned char)*s++; 6059 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0) 6060 goto onError; 6061 } 6062 if (((s - bs) & 1) == 0 || 6063 s >= end || 6064 (*s != 'u' && *s != 'U')) { 6065 continue; 6066 } 6067 writer.pos--; 6068 count = *s=='u' ? 4 : 8; 6069 s++; 6070 6071 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */ 6072 for (x = 0, i = 0; i < count; ++i, ++s) { 6073 c = (unsigned char)*s; 6074 if (!Py_ISXDIGIT(c)) { 6075 endinpos = s-starts; 6076 if (unicode_decode_call_errorhandler_writer( 6077 errors, &errorHandler, 6078 "rawunicodeescape", "truncated \\uXXXX", 6079 &starts, &end, &startinpos, &endinpos, &exc, &s, 6080 &writer)) 6081 goto onError; 6082 goto nextByte; 6083 } 6084 x = (x<<4) & ~0xF; 6085 if (c >= '0' && c <= '9') 6086 x += c - '0'; 6087 else if (c >= 'a' && c <= 'f') 6088 x += 10 + c - 'a'; 6089 else 6090 x += 10 + c - 'A'; 6091 } 6092 if (x <= MAX_UNICODE) { 6093 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0) 6094 goto onError; 6095 } 6096 else { 6097 endinpos = s-starts; 6098 if (unicode_decode_call_errorhandler_writer( 6099 errors, &errorHandler, 6100 "rawunicodeescape", "\\Uxxxxxxxx out of range", 6101 &starts, &end, &startinpos, &endinpos, &exc, &s, 6102 &writer)) 6103 goto onError; 6104 } 6105 nextByte: 6106 ; 6107 } 6108 Py_XDECREF(errorHandler); 6109 
Py_XDECREF(exc); 6110 return _PyUnicodeWriter_Finish(&writer); 6111 6112 onError: 6113 _PyUnicodeWriter_Dealloc(&writer); 6114 Py_XDECREF(errorHandler); 6115 Py_XDECREF(exc); 6116 return NULL; 6117} 6118 6119 6120PyObject * 6121PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode) 6122{ 6123 PyObject *repr; 6124 char *p; 6125 char *q; 6126 Py_ssize_t expandsize, pos; 6127 int kind; 6128 void *data; 6129 Py_ssize_t len; 6130 6131 if (!PyUnicode_Check(unicode)) { 6132 PyErr_BadArgument(); 6133 return NULL; 6134 } 6135 if (PyUnicode_READY(unicode) == -1) 6136 return NULL; 6137 kind = PyUnicode_KIND(unicode); 6138 data = PyUnicode_DATA(unicode); 6139 len = PyUnicode_GET_LENGTH(unicode); 6140 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6 6141 bytes, and 1 byte characters 4. */ 6142 expandsize = kind * 2 + 2; 6143 6144 if (len > PY_SSIZE_T_MAX / expandsize) 6145 return PyErr_NoMemory(); 6146 6147 repr = PyBytes_FromStringAndSize(NULL, expandsize * len); 6148 if (repr == NULL) 6149 return NULL; 6150 if (len == 0) 6151 return repr; 6152 6153 p = q = PyBytes_AS_STRING(repr); 6154 for (pos = 0; pos < len; pos++) { 6155 Py_UCS4 ch = PyUnicode_READ(kind, data, pos); 6156 /* Map 32-bit characters to '\Uxxxxxxxx' */ 6157 if (ch >= 0x10000) { 6158 assert(ch <= MAX_UNICODE); 6159 *p++ = '\\'; 6160 *p++ = 'U'; 6161 *p++ = Py_hexdigits[(ch >> 28) & 0xf]; 6162 *p++ = Py_hexdigits[(ch >> 24) & 0xf]; 6163 *p++ = Py_hexdigits[(ch >> 20) & 0xf]; 6164 *p++ = Py_hexdigits[(ch >> 16) & 0xf]; 6165 *p++ = Py_hexdigits[(ch >> 12) & 0xf]; 6166 *p++ = Py_hexdigits[(ch >> 8) & 0xf]; 6167 *p++ = Py_hexdigits[(ch >> 4) & 0xf]; 6168 *p++ = Py_hexdigits[ch & 15]; 6169 } 6170 /* Map 16-bit characters to '\uxxxx' */ 6171 else if (ch >= 256) { 6172 *p++ = '\\'; 6173 *p++ = 'u'; 6174 *p++ = Py_hexdigits[(ch >> 12) & 0xf]; 6175 *p++ = Py_hexdigits[(ch >> 8) & 0xf]; 6176 *p++ = Py_hexdigits[(ch >> 4) & 0xf]; 6177 *p++ = Py_hexdigits[ch & 15]; 6178 } 6179 /* Copy everything else 
as-is */ 6180 else 6181 *p++ = (char) ch; 6182 } 6183 6184 assert(p > q); 6185 if (_PyBytes_Resize(&repr, p - q) < 0) 6186 return NULL; 6187 return repr; 6188} 6189 6190PyObject * 6191PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s, 6192 Py_ssize_t size) 6193{ 6194 PyObject *result; 6195 PyObject *tmp = PyUnicode_FromUnicode(s, size); 6196 if (tmp == NULL) 6197 return NULL; 6198 result = PyUnicode_AsRawUnicodeEscapeString(tmp); 6199 Py_DECREF(tmp); 6200 return result; 6201} 6202 6203/* --- Unicode Internal Codec ------------------------------------------- */ 6204 6205PyObject * 6206_PyUnicode_DecodeUnicodeInternal(const char *s, 6207 Py_ssize_t size, 6208 const char *errors) 6209{ 6210 const char *starts = s; 6211 Py_ssize_t startinpos; 6212 Py_ssize_t endinpos; 6213 _PyUnicodeWriter writer; 6214 const char *end; 6215 const char *reason; 6216 PyObject *errorHandler = NULL; 6217 PyObject *exc = NULL; 6218 6219 if (PyErr_WarnEx(PyExc_DeprecationWarning, 6220 "unicode_internal codec has been deprecated", 6221 1)) 6222 return NULL; 6223 6224 if (size == 0) 6225 _Py_RETURN_UNICODE_EMPTY(); 6226 6227 _PyUnicodeWriter_Init(&writer); 6228 if (size / Py_UNICODE_SIZE > PY_SSIZE_T_MAX - 1) { 6229 PyErr_NoMemory(); 6230 goto onError; 6231 } 6232 writer.min_length = (size + (Py_UNICODE_SIZE - 1)) / Py_UNICODE_SIZE; 6233 6234 end = s + size; 6235 while (s < end) { 6236 Py_UNICODE uch; 6237 Py_UCS4 ch; 6238 if (end - s < Py_UNICODE_SIZE) { 6239 endinpos = end-starts; 6240 reason = "truncated input"; 6241 goto error; 6242 } 6243 /* We copy the raw representation one byte at a time because the 6244 pointer may be unaligned (see test_codeccallbacks). */ 6245 ((char *) &uch)[0] = s[0]; 6246 ((char *) &uch)[1] = s[1]; 6247#ifdef Py_UNICODE_WIDE 6248 ((char *) &uch)[2] = s[2]; 6249 ((char *) &uch)[3] = s[3]; 6250#endif 6251 ch = uch; 6252#ifdef Py_UNICODE_WIDE 6253 /* We have to sanity check the raw data, otherwise doom looms for 6254 some malformed UCS-4 data. 
*/ 6255 if (ch > 0x10ffff) { 6256 endinpos = s - starts + Py_UNICODE_SIZE; 6257 reason = "illegal code point (> 0x10FFFF)"; 6258 goto error; 6259 } 6260#endif 6261 s += Py_UNICODE_SIZE; 6262#ifndef Py_UNICODE_WIDE 6263 if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && end - s >= Py_UNICODE_SIZE) 6264 { 6265 Py_UNICODE uch2; 6266 ((char *) &uch2)[0] = s[0]; 6267 ((char *) &uch2)[1] = s[1]; 6268 if (Py_UNICODE_IS_LOW_SURROGATE(uch2)) 6269 { 6270 ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2); 6271 s += Py_UNICODE_SIZE; 6272 } 6273 } 6274#endif 6275 6276 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) 6277 goto onError; 6278 continue; 6279 6280 error: 6281 startinpos = s - starts; 6282 if (unicode_decode_call_errorhandler_writer( 6283 errors, &errorHandler, 6284 "unicode_internal", reason, 6285 &starts, &end, &startinpos, &endinpos, &exc, &s, 6286 &writer)) 6287 goto onError; 6288 } 6289 6290 Py_XDECREF(errorHandler); 6291 Py_XDECREF(exc); 6292 return _PyUnicodeWriter_Finish(&writer); 6293 6294 onError: 6295 _PyUnicodeWriter_Dealloc(&writer); 6296 Py_XDECREF(errorHandler); 6297 Py_XDECREF(exc); 6298 return NULL; 6299} 6300 6301/* --- Latin-1 Codec ------------------------------------------------------ */ 6302 6303PyObject * 6304PyUnicode_DecodeLatin1(const char *s, 6305 Py_ssize_t size, 6306 const char *errors) 6307{ 6308 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. 
*/ 6309 return _PyUnicode_FromUCS1((unsigned char*)s, size); 6310} 6311 6312/* create or adjust a UnicodeEncodeError */ 6313static void 6314make_encode_exception(PyObject **exceptionObject, 6315 const char *encoding, 6316 PyObject *unicode, 6317 Py_ssize_t startpos, Py_ssize_t endpos, 6318 const char *reason) 6319{ 6320 if (*exceptionObject == NULL) { 6321 *exceptionObject = PyObject_CallFunction( 6322 PyExc_UnicodeEncodeError, "sOnns", 6323 encoding, unicode, startpos, endpos, reason); 6324 } 6325 else { 6326 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos)) 6327 goto onError; 6328 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos)) 6329 goto onError; 6330 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason)) 6331 goto onError; 6332 return; 6333 onError: 6334 Py_CLEAR(*exceptionObject); 6335 } 6336} 6337 6338/* raises a UnicodeEncodeError */ 6339static void 6340raise_encode_exception(PyObject **exceptionObject, 6341 const char *encoding, 6342 PyObject *unicode, 6343 Py_ssize_t startpos, Py_ssize_t endpos, 6344 const char *reason) 6345{ 6346 make_encode_exception(exceptionObject, 6347 encoding, unicode, startpos, endpos, reason); 6348 if (*exceptionObject != NULL) 6349 PyCodec_StrictErrors(*exceptionObject); 6350} 6351 6352/* error handling callback helper: 6353 build arguments, call the callback and check the arguments, 6354 put the result into newpos and return the replacement string, which 6355 has to be freed by the caller */ 6356static PyObject * 6357unicode_encode_call_errorhandler(const char *errors, 6358 PyObject **errorHandler, 6359 const char *encoding, const char *reason, 6360 PyObject *unicode, PyObject **exceptionObject, 6361 Py_ssize_t startpos, Py_ssize_t endpos, 6362 Py_ssize_t *newpos) 6363{ 6364 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple"; 6365 Py_ssize_t len; 6366 PyObject *restuple; 6367 PyObject *resunicode; 6368 6369 if (*errorHandler == NULL) { 6370 *errorHandler = 
PyCodec_LookupError(errors); 6371 if (*errorHandler == NULL) 6372 return NULL; 6373 } 6374 6375 if (PyUnicode_READY(unicode) == -1) 6376 return NULL; 6377 len = PyUnicode_GET_LENGTH(unicode); 6378 6379 make_encode_exception(exceptionObject, 6380 encoding, unicode, startpos, endpos, reason); 6381 if (*exceptionObject == NULL) 6382 return NULL; 6383 6384 restuple = PyObject_CallFunctionObjArgs( 6385 *errorHandler, *exceptionObject, NULL); 6386 if (restuple == NULL) 6387 return NULL; 6388 if (!PyTuple_Check(restuple)) { 6389 PyErr_SetString(PyExc_TypeError, &argparse[3]); 6390 Py_DECREF(restuple); 6391 return NULL; 6392 } 6393 if (!PyArg_ParseTuple(restuple, argparse, 6394 &resunicode, newpos)) { 6395 Py_DECREF(restuple); 6396 return NULL; 6397 } 6398 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) { 6399 PyErr_SetString(PyExc_TypeError, &argparse[3]); 6400 Py_DECREF(restuple); 6401 return NULL; 6402 } 6403 if (*newpos<0) 6404 *newpos = len + *newpos; 6405 if (*newpos<0 || *newpos>len) { 6406 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos); 6407 Py_DECREF(restuple); 6408 return NULL; 6409 } 6410 Py_INCREF(resunicode); 6411 Py_DECREF(restuple); 6412 return resunicode; 6413} 6414 6415static PyObject * 6416unicode_encode_ucs1(PyObject *unicode, 6417 const char *errors, 6418 const Py_UCS4 limit) 6419{ 6420 /* input state */ 6421 Py_ssize_t pos=0, size; 6422 int kind; 6423 void *data; 6424 /* output object */ 6425 PyObject *res; 6426 /* pointer into the output */ 6427 char *str; 6428 /* current output position */ 6429 Py_ssize_t ressize; 6430 const char *encoding = (limit == 256) ? "latin-1" : "ascii"; 6431 const char *reason = (limit == 256) ? 
"ordinal not in range(256)" : "ordinal not in range(128)"; 6432 PyObject *error_handler_obj = NULL; 6433 PyObject *exc = NULL; 6434 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN; 6435 6436 if (PyUnicode_READY(unicode) == -1) 6437 return NULL; 6438 size = PyUnicode_GET_LENGTH(unicode); 6439 kind = PyUnicode_KIND(unicode); 6440 data = PyUnicode_DATA(unicode); 6441 /* allocate enough for a simple encoding without 6442 replacements, if we need more, we'll resize */ 6443 if (size == 0) 6444 return PyBytes_FromStringAndSize(NULL, 0); 6445 res = PyBytes_FromStringAndSize(NULL, size); 6446 if (res == NULL) 6447 return NULL; 6448 str = PyBytes_AS_STRING(res); 6449 ressize = size; 6450 6451 while (pos < size) { 6452 Py_UCS4 ch = PyUnicode_READ(kind, data, pos); 6453 6454 /* can we encode this? */ 6455 if (ch < limit) { 6456 /* no overflow check, because we know that the space is enough */ 6457 *str++ = (char)ch; 6458 ++pos; 6459 } 6460 else { 6461 Py_ssize_t requiredsize; 6462 PyObject *repunicode; 6463 Py_ssize_t repsize, newpos, respos, i; 6464 /* startpos for collecting unencodable chars */ 6465 Py_ssize_t collstart = pos; 6466 Py_ssize_t collend = pos; 6467 /* find all unecodable characters */ 6468 6469 while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit)) 6470 ++collend; 6471 6472 /* cache callback name lookup (if not done yet, i.e. 
it's the first error) */ 6473 if (error_handler == _Py_ERROR_UNKNOWN) 6474 error_handler = get_error_handler(errors); 6475 6476 switch (error_handler) { 6477 case _Py_ERROR_STRICT: 6478 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason); 6479 goto onError; 6480 6481 case _Py_ERROR_REPLACE: 6482 while (collstart++ < collend) 6483 *str++ = '?'; 6484 /* fall through ignore error handler */ 6485 case _Py_ERROR_IGNORE: 6486 pos = collend; 6487 break; 6488 6489 case _Py_ERROR_XMLCHARREFREPLACE: 6490 respos = str - PyBytes_AS_STRING(res); 6491 requiredsize = respos; 6492 /* determine replacement size */ 6493 for (i = collstart; i < collend; ++i) { 6494 Py_ssize_t incr; 6495 6496 ch = PyUnicode_READ(kind, data, i); 6497 if (ch < 10) 6498 incr = 2+1+1; 6499 else if (ch < 100) 6500 incr = 2+2+1; 6501 else if (ch < 1000) 6502 incr = 2+3+1; 6503 else if (ch < 10000) 6504 incr = 2+4+1; 6505 else if (ch < 100000) 6506 incr = 2+5+1; 6507 else if (ch < 1000000) 6508 incr = 2+6+1; 6509 else { 6510 assert(ch <= MAX_UNICODE); 6511 incr = 2+7+1; 6512 } 6513 if (requiredsize > PY_SSIZE_T_MAX - incr) 6514 goto overflow; 6515 requiredsize += incr; 6516 } 6517 if (requiredsize > PY_SSIZE_T_MAX - (size - collend)) 6518 goto overflow; 6519 requiredsize += size - collend; 6520 if (requiredsize > ressize) { 6521 if (ressize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*ressize) 6522 requiredsize = 2*ressize; 6523 if (_PyBytes_Resize(&res, requiredsize)) 6524 goto onError; 6525 str = PyBytes_AS_STRING(res) + respos; 6526 ressize = requiredsize; 6527 } 6528 /* generate replacement */ 6529 for (i = collstart; i < collend; ++i) { 6530 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i)); 6531 } 6532 pos = collend; 6533 break; 6534 6535 case _Py_ERROR_SURROGATEESCAPE: 6536 for (i = collstart; i < collend; ++i) { 6537 ch = PyUnicode_READ(kind, data, i); 6538 if (ch < 0xdc80 || 0xdcff < ch) { 6539 /* Not a UTF-8b surrogate */ 6540 break; 6541 } 6542 *str++ = (char)(ch - 
0xdc00); 6543 ++pos; 6544 } 6545 if (i >= collend) 6546 break; 6547 collstart = pos; 6548 assert(collstart != collend); 6549 /* fallback to general error handling */ 6550 6551 default: 6552 repunicode = unicode_encode_call_errorhandler(errors, &error_handler_obj, 6553 encoding, reason, unicode, &exc, 6554 collstart, collend, &newpos); 6555 if (repunicode == NULL || (PyUnicode_Check(repunicode) && 6556 PyUnicode_READY(repunicode) == -1)) 6557 goto onError; 6558 6559 if (PyBytes_Check(repunicode)) { 6560 /* Directly copy bytes result to output. */ 6561 repsize = PyBytes_Size(repunicode); 6562 if (repsize > 1) { 6563 /* Make room for all additional bytes. */ 6564 respos = str - PyBytes_AS_STRING(res); 6565 if (ressize > PY_SSIZE_T_MAX - repsize - 1) { 6566 Py_DECREF(repunicode); 6567 goto overflow; 6568 } 6569 if (_PyBytes_Resize(&res, ressize+repsize-1)) { 6570 Py_DECREF(repunicode); 6571 goto onError; 6572 } 6573 str = PyBytes_AS_STRING(res) + respos; 6574 ressize += repsize-1; 6575 } 6576 memcpy(str, PyBytes_AsString(repunicode), repsize); 6577 str += repsize; 6578 pos = newpos; 6579 Py_DECREF(repunicode); 6580 break; 6581 } 6582 6583 /* need more space? 
(at least enough for what we 6584 have+the replacement+the rest of the string, so 6585 we won't have to check space for encodable characters) */ 6586 respos = str - PyBytes_AS_STRING(res); 6587 repsize = PyUnicode_GET_LENGTH(repunicode); 6588 requiredsize = respos; 6589 if (requiredsize > PY_SSIZE_T_MAX - repsize) 6590 goto overflow; 6591 requiredsize += repsize; 6592 if (requiredsize > PY_SSIZE_T_MAX - (size - collend)) 6593 goto overflow; 6594 requiredsize += size - collend; 6595 if (requiredsize > ressize) { 6596 if (ressize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*ressize) 6597 requiredsize = 2*ressize; 6598 if (_PyBytes_Resize(&res, requiredsize)) { 6599 Py_DECREF(repunicode); 6600 goto onError; 6601 } 6602 str = PyBytes_AS_STRING(res) + respos; 6603 ressize = requiredsize; 6604 } 6605 6606 /* check if there is anything unencodable in the replacement 6607 and copy it to the output */ 6608 for (i = 0; repsize-->0; ++i, ++str) { 6609 ch = PyUnicode_READ_CHAR(repunicode, i); 6610 if (ch >= limit) { 6611 raise_encode_exception(&exc, encoding, unicode, 6612 pos, pos+1, reason); 6613 Py_DECREF(repunicode); 6614 goto onError; 6615 } 6616 *str = (char)ch; 6617 } 6618 pos = newpos; 6619 Py_DECREF(repunicode); 6620 } 6621 } 6622 } 6623 /* Resize if we allocated to much */ 6624 size = str - PyBytes_AS_STRING(res); 6625 if (size < ressize) { /* If this falls res will be NULL */ 6626 assert(size >= 0); 6627 if (_PyBytes_Resize(&res, size) < 0) 6628 goto onError; 6629 } 6630 6631 Py_XDECREF(error_handler_obj); 6632 Py_XDECREF(exc); 6633 return res; 6634 6635 overflow: 6636 PyErr_SetString(PyExc_OverflowError, 6637 "encoded result is too long for a Python string"); 6638 6639 onError: 6640 Py_XDECREF(res); 6641 Py_XDECREF(error_handler_obj); 6642 Py_XDECREF(exc); 6643 return NULL; 6644} 6645 6646/* Deprecated */ 6647PyObject * 6648PyUnicode_EncodeLatin1(const Py_UNICODE *p, 6649 Py_ssize_t size, 6650 const char *errors) 6651{ 6652 PyObject *result; 6653 PyObject *unicode = 
PyUnicode_FromUnicode(p, size); 6654 if (unicode == NULL) 6655 return NULL; 6656 result = unicode_encode_ucs1(unicode, errors, 256); 6657 Py_DECREF(unicode); 6658 return result; 6659} 6660 6661PyObject * 6662_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors) 6663{ 6664 if (!PyUnicode_Check(unicode)) { 6665 PyErr_BadArgument(); 6666 return NULL; 6667 } 6668 if (PyUnicode_READY(unicode) == -1) 6669 return NULL; 6670 /* Fast path: if it is a one-byte string, construct 6671 bytes object directly. */ 6672 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) 6673 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode), 6674 PyUnicode_GET_LENGTH(unicode)); 6675 /* Non-Latin-1 characters present. Defer to above function to 6676 raise the exception. */ 6677 return unicode_encode_ucs1(unicode, errors, 256); 6678} 6679 6680PyObject* 6681PyUnicode_AsLatin1String(PyObject *unicode) 6682{ 6683 return _PyUnicode_AsLatin1String(unicode, NULL); 6684} 6685 6686/* --- 7-bit ASCII Codec -------------------------------------------------- */ 6687 6688PyObject * 6689PyUnicode_DecodeASCII(const char *s, 6690 Py_ssize_t size, 6691 const char *errors) 6692{ 6693 const char *starts = s; 6694 _PyUnicodeWriter writer; 6695 int kind; 6696 void *data; 6697 Py_ssize_t startinpos; 6698 Py_ssize_t endinpos; 6699 Py_ssize_t outpos; 6700 const char *e; 6701 PyObject *error_handler_obj = NULL; 6702 PyObject *exc = NULL; 6703 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN; 6704 6705 if (size == 0) 6706 _Py_RETURN_UNICODE_EMPTY(); 6707 6708 /* ASCII is equivalent to the first 128 ordinals in Unicode. 
*/ 6709 if (size == 1 && (unsigned char)s[0] < 128) 6710 return get_latin1_char((unsigned char)s[0]); 6711 6712 _PyUnicodeWriter_Init(&writer); 6713 writer.min_length = size; 6714 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0) 6715 return NULL; 6716 6717 e = s + size; 6718 data = writer.data; 6719 outpos = ascii_decode(s, e, (Py_UCS1 *)data); 6720 writer.pos = outpos; 6721 if (writer.pos == size) 6722 return _PyUnicodeWriter_Finish(&writer); 6723 6724 s += writer.pos; 6725 kind = writer.kind; 6726 while (s < e) { 6727 unsigned char c = (unsigned char)*s; 6728 if (c < 128) { 6729 PyUnicode_WRITE(kind, data, writer.pos, c); 6730 writer.pos++; 6731 ++s; 6732 continue; 6733 } 6734 6735 /* byte outsize range 0x00..0x7f: call the error handler */ 6736 6737 if (error_handler == _Py_ERROR_UNKNOWN) 6738 error_handler = get_error_handler(errors); 6739 6740 switch (error_handler) 6741 { 6742 case _Py_ERROR_REPLACE: 6743 case _Py_ERROR_SURROGATEESCAPE: 6744 /* Fast-path: the error handler only writes one character, 6745 but we may switch to UCS2 at the first write */ 6746 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0) 6747 goto onError; 6748 kind = writer.kind; 6749 data = writer.data; 6750 6751 if (error_handler == _Py_ERROR_REPLACE) 6752 PyUnicode_WRITE(kind, data, writer.pos, 0xfffd); 6753 else 6754 PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00); 6755 writer.pos++; 6756 ++s; 6757 break; 6758 6759 case _Py_ERROR_IGNORE: 6760 ++s; 6761 break; 6762 6763 default: 6764 startinpos = s-starts; 6765 endinpos = startinpos + 1; 6766 if (unicode_decode_call_errorhandler_writer( 6767 errors, &error_handler_obj, 6768 "ascii", "ordinal not in range(128)", 6769 &starts, &e, &startinpos, &endinpos, &exc, &s, 6770 &writer)) 6771 goto onError; 6772 kind = writer.kind; 6773 data = writer.data; 6774 } 6775 } 6776 Py_XDECREF(error_handler_obj); 6777 Py_XDECREF(exc); 6778 return _PyUnicodeWriter_Finish(&writer); 6779 6780 onError: 6781 
_PyUnicodeWriter_Dealloc(&writer); 6782 Py_XDECREF(error_handler_obj); 6783 Py_XDECREF(exc); 6784 return NULL; 6785} 6786 6787/* Deprecated */ 6788PyObject * 6789PyUnicode_EncodeASCII(const Py_UNICODE *p, 6790 Py_ssize_t size, 6791 const char *errors) 6792{ 6793 PyObject *result; 6794 PyObject *unicode = PyUnicode_FromUnicode(p, size); 6795 if (unicode == NULL) 6796 return NULL; 6797 result = unicode_encode_ucs1(unicode, errors, 128); 6798 Py_DECREF(unicode); 6799 return result; 6800} 6801 6802PyObject * 6803_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors) 6804{ 6805 if (!PyUnicode_Check(unicode)) { 6806 PyErr_BadArgument(); 6807 return NULL; 6808 } 6809 if (PyUnicode_READY(unicode) == -1) 6810 return NULL; 6811 /* Fast path: if it is an ASCII-only string, construct bytes object 6812 directly. Else defer to above function to raise the exception. */ 6813 if (PyUnicode_IS_ASCII(unicode)) 6814 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode), 6815 PyUnicode_GET_LENGTH(unicode)); 6816 return unicode_encode_ucs1(unicode, errors, 128); 6817} 6818 6819PyObject * 6820PyUnicode_AsASCIIString(PyObject *unicode) 6821{ 6822 return _PyUnicode_AsASCIIString(unicode, NULL); 6823} 6824 6825#ifdef HAVE_MBCS 6826 6827/* --- MBCS codecs for Windows -------------------------------------------- */ 6828 6829#if SIZEOF_INT < SIZEOF_SIZE_T 6830#define NEED_RETRY 6831#endif 6832 6833#ifndef WC_ERR_INVALID_CHARS 6834# define WC_ERR_INVALID_CHARS 0x0080 6835#endif 6836 6837static char* 6838code_page_name(UINT code_page, PyObject **obj) 6839{ 6840 *obj = NULL; 6841 if (code_page == CP_ACP) 6842 return "mbcs"; 6843 if (code_page == CP_UTF7) 6844 return "CP_UTF7"; 6845 if (code_page == CP_UTF8) 6846 return "CP_UTF8"; 6847 6848 *obj = PyBytes_FromFormat("cp%u", code_page); 6849 if (*obj == NULL) 6850 return NULL; 6851 return PyBytes_AS_STRING(*obj); 6852} 6853 6854static DWORD 6855decode_code_page_flags(UINT code_page) 6856{ 6857 if (code_page == CP_UTF7) { 6858 /* The 
CP_UTF7 decoder only supports flags=0 */ 6859 return 0; 6860 } 6861 else 6862 return MB_ERR_INVALID_CHARS; 6863} 6864 6865/* 6866 * Decode a byte string from a Windows code page into unicode object in strict 6867 * mode. 6868 * 6869 * Returns consumed size if succeed, returns -2 on decode error, or raise an 6870 * OSError and returns -1 on other error. 6871 */ 6872static int 6873decode_code_page_strict(UINT code_page, 6874 PyObject **v, 6875 const char *in, 6876 int insize) 6877{ 6878 const DWORD flags = decode_code_page_flags(code_page); 6879 wchar_t *out; 6880 DWORD outsize; 6881 6882 /* First get the size of the result */ 6883 assert(insize > 0); 6884 outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0); 6885 if (outsize <= 0) 6886 goto error; 6887 6888 if (*v == NULL) { 6889 /* Create unicode object */ 6890 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */ 6891 *v = (PyObject*)_PyUnicode_New(outsize); 6892 if (*v == NULL) 6893 return -1; 6894 out = PyUnicode_AS_UNICODE(*v); 6895 } 6896 else { 6897 /* Extend unicode object */ 6898 Py_ssize_t n = PyUnicode_GET_SIZE(*v); 6899 if (unicode_resize(v, n + outsize) < 0) 6900 return -1; 6901 out = PyUnicode_AS_UNICODE(*v) + n; 6902 } 6903 6904 /* Do the conversion */ 6905 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize); 6906 if (outsize <= 0) 6907 goto error; 6908 return insize; 6909 6910error: 6911 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION) 6912 return -2; 6913 PyErr_SetFromWindowsErr(0); 6914 return -1; 6915} 6916 6917/* 6918 * Decode a byte string from a code page into unicode object with an error 6919 * handler. 6920 * 6921 * Returns consumed size if succeed, or raise an OSError or 6922 * UnicodeDecodeError exception and returns -1 on error. 
6923 */ 6924static int 6925decode_code_page_errors(UINT code_page, 6926 PyObject **v, 6927 const char *in, const int size, 6928 const char *errors, int final) 6929{ 6930 const char *startin = in; 6931 const char *endin = in + size; 6932 const DWORD flags = decode_code_page_flags(code_page); 6933 /* Ideally, we should get reason from FormatMessage. This is the Windows 6934 2000 English version of the message. */ 6935 const char *reason = "No mapping for the Unicode character exists " 6936 "in the target code page."; 6937 /* each step cannot decode more than 1 character, but a character can be 6938 represented as a surrogate pair */ 6939 wchar_t buffer[2], *startout, *out; 6940 int insize; 6941 Py_ssize_t outsize; 6942 PyObject *errorHandler = NULL; 6943 PyObject *exc = NULL; 6944 PyObject *encoding_obj = NULL; 6945 char *encoding; 6946 DWORD err; 6947 int ret = -1; 6948 6949 assert(size > 0); 6950 6951 encoding = code_page_name(code_page, &encoding_obj); 6952 if (encoding == NULL) 6953 return -1; 6954 6955 if ((errors == NULL || strcmp(errors, "strict") == 0) && final) { 6956 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a 6957 UnicodeDecodeError. 
*/ 6958 make_decode_exception(&exc, encoding, in, size, 0, 0, reason); 6959 if (exc != NULL) { 6960 PyCodec_StrictErrors(exc); 6961 Py_CLEAR(exc); 6962 } 6963 goto error; 6964 } 6965 6966 if (*v == NULL) { 6967 /* Create unicode object */ 6968 if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) { 6969 PyErr_NoMemory(); 6970 goto error; 6971 } 6972 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */ 6973 *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer)); 6974 if (*v == NULL) 6975 goto error; 6976 startout = PyUnicode_AS_UNICODE(*v); 6977 } 6978 else { 6979 /* Extend unicode object */ 6980 Py_ssize_t n = PyUnicode_GET_SIZE(*v); 6981 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) { 6982 PyErr_NoMemory(); 6983 goto error; 6984 } 6985 if (unicode_resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0) 6986 goto error; 6987 startout = PyUnicode_AS_UNICODE(*v) + n; 6988 } 6989 6990 /* Decode the byte string character per character */ 6991 out = startout; 6992 while (in < endin) 6993 { 6994 /* Decode a character */ 6995 insize = 1; 6996 do 6997 { 6998 outsize = MultiByteToWideChar(code_page, flags, 6999 in, insize, 7000 buffer, Py_ARRAY_LENGTH(buffer)); 7001 if (outsize > 0) 7002 break; 7003 err = GetLastError(); 7004 if (err != ERROR_NO_UNICODE_TRANSLATION 7005 && err != ERROR_INSUFFICIENT_BUFFER) 7006 { 7007 PyErr_SetFromWindowsErr(0); 7008 goto error; 7009 } 7010 insize++; 7011 } 7012 /* 4=maximum length of a UTF-8 sequence */ 7013 while (insize <= 4 && (in + insize) <= endin); 7014 7015 if (outsize <= 0) { 7016 Py_ssize_t startinpos, endinpos, outpos; 7017 7018 /* last character in partial decode? 
*/ 7019 if (in + insize >= endin && !final) 7020 break; 7021 7022 startinpos = in - startin; 7023 endinpos = startinpos + 1; 7024 outpos = out - PyUnicode_AS_UNICODE(*v); 7025 if (unicode_decode_call_errorhandler_wchar( 7026 errors, &errorHandler, 7027 encoding, reason, 7028 &startin, &endin, &startinpos, &endinpos, &exc, &in, 7029 v, &outpos)) 7030 { 7031 goto error; 7032 } 7033 out = PyUnicode_AS_UNICODE(*v) + outpos; 7034 } 7035 else { 7036 in += insize; 7037 memcpy(out, buffer, outsize * sizeof(wchar_t)); 7038 out += outsize; 7039 } 7040 } 7041 7042 /* write a NUL character at the end */ 7043 *out = 0; 7044 7045 /* Extend unicode object */ 7046 outsize = out - startout; 7047 assert(outsize <= PyUnicode_WSTR_LENGTH(*v)); 7048 if (unicode_resize(v, outsize) < 0) 7049 goto error; 7050 /* (in - startin) <= size and size is an int */ 7051 ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int); 7052 7053error: 7054 Py_XDECREF(encoding_obj); 7055 Py_XDECREF(errorHandler); 7056 Py_XDECREF(exc); 7057 return ret; 7058} 7059 7060static PyObject * 7061decode_code_page_stateful(int code_page, 7062 const char *s, Py_ssize_t size, 7063 const char *errors, Py_ssize_t *consumed) 7064{ 7065 PyObject *v = NULL; 7066 int chunk_size, final, converted, done; 7067 7068 if (code_page < 0) { 7069 PyErr_SetString(PyExc_ValueError, "invalid code page number"); 7070 return NULL; 7071 } 7072 7073 if (consumed) 7074 *consumed = 0; 7075 7076 do 7077 { 7078#ifdef NEED_RETRY 7079 if (size > INT_MAX) { 7080 chunk_size = INT_MAX; 7081 final = 0; 7082 done = 0; 7083 } 7084 else 7085#endif 7086 { 7087 chunk_size = (int)size; 7088 final = (consumed == NULL); 7089 done = 1; 7090 } 7091 7092 if (chunk_size == 0 && done) { 7093 if (v != NULL) 7094 break; 7095 _Py_RETURN_UNICODE_EMPTY(); 7096 } 7097 7098 converted = decode_code_page_strict(code_page, &v, 7099 s, chunk_size); 7100 if (converted == -2) 7101 converted = decode_code_page_errors(code_page, &v, 7102 s, chunk_size, 7103 errors, final); 7104 
assert(converted != 0 || done); 7105 7106 if (converted < 0) { 7107 Py_XDECREF(v); 7108 return NULL; 7109 } 7110 7111 if (consumed) 7112 *consumed += converted; 7113 7114 s += converted; 7115 size -= converted; 7116 } while (!done); 7117 7118 return unicode_result(v); 7119} 7120 7121PyObject * 7122PyUnicode_DecodeCodePageStateful(int code_page, 7123 const char *s, 7124 Py_ssize_t size, 7125 const char *errors, 7126 Py_ssize_t *consumed) 7127{ 7128 return decode_code_page_stateful(code_page, s, size, errors, consumed); 7129} 7130 7131PyObject * 7132PyUnicode_DecodeMBCSStateful(const char *s, 7133 Py_ssize_t size, 7134 const char *errors, 7135 Py_ssize_t *consumed) 7136{ 7137 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed); 7138} 7139 7140PyObject * 7141PyUnicode_DecodeMBCS(const char *s, 7142 Py_ssize_t size, 7143 const char *errors) 7144{ 7145 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL); 7146} 7147 7148static DWORD 7149encode_code_page_flags(UINT code_page, const char *errors) 7150{ 7151 if (code_page == CP_UTF8) { 7152 return WC_ERR_INVALID_CHARS; 7153 } 7154 else if (code_page == CP_UTF7) { 7155 /* CP_UTF7 only supports flags=0 */ 7156 return 0; 7157 } 7158 else { 7159 if (errors != NULL && strcmp(errors, "replace") == 0) 7160 return 0; 7161 else 7162 return WC_NO_BEST_FIT_CHARS; 7163 } 7164} 7165 7166/* 7167 * Encode a Unicode string to a Windows code page into a byte string in strict 7168 * mode. 7169 * 7170 * Returns consumed characters if succeed, returns -2 on encode error, or raise 7171 * an OSError and returns -1 on other error. 
7172 */ 7173static int 7174encode_code_page_strict(UINT code_page, PyObject **outbytes, 7175 PyObject *unicode, Py_ssize_t offset, int len, 7176 const char* errors) 7177{ 7178 BOOL usedDefaultChar = FALSE; 7179 BOOL *pusedDefaultChar = &usedDefaultChar; 7180 int outsize; 7181 PyObject *exc = NULL; 7182 wchar_t *p; 7183 Py_ssize_t size; 7184 const DWORD flags = encode_code_page_flags(code_page, NULL); 7185 char *out; 7186 /* Create a substring so that we can get the UTF-16 representation 7187 of just the slice under consideration. */ 7188 PyObject *substring; 7189 7190 assert(len > 0); 7191 7192 if (code_page != CP_UTF8 && code_page != CP_UTF7) 7193 pusedDefaultChar = &usedDefaultChar; 7194 else 7195 pusedDefaultChar = NULL; 7196 7197 substring = PyUnicode_Substring(unicode, offset, offset+len); 7198 if (substring == NULL) 7199 return -1; 7200 p = PyUnicode_AsUnicodeAndSize(substring, &size); 7201 if (p == NULL) { 7202 Py_DECREF(substring); 7203 return -1; 7204 } 7205 assert(size <= INT_MAX); 7206 7207 /* First get the size of the result */ 7208 outsize = WideCharToMultiByte(code_page, flags, 7209 p, (int)size, 7210 NULL, 0, 7211 NULL, pusedDefaultChar); 7212 if (outsize <= 0) 7213 goto error; 7214 /* If we used a default char, then we failed! 
*/ 7215 if (pusedDefaultChar && *pusedDefaultChar) { 7216 Py_DECREF(substring); 7217 return -2; 7218 } 7219 7220 if (*outbytes == NULL) { 7221 /* Create string object */ 7222 *outbytes = PyBytes_FromStringAndSize(NULL, outsize); 7223 if (*outbytes == NULL) { 7224 Py_DECREF(substring); 7225 return -1; 7226 } 7227 out = PyBytes_AS_STRING(*outbytes); 7228 } 7229 else { 7230 /* Extend string object */ 7231 const Py_ssize_t n = PyBytes_Size(*outbytes); 7232 if (outsize > PY_SSIZE_T_MAX - n) { 7233 PyErr_NoMemory(); 7234 Py_DECREF(substring); 7235 return -1; 7236 } 7237 if (_PyBytes_Resize(outbytes, n + outsize) < 0) { 7238 Py_DECREF(substring); 7239 return -1; 7240 } 7241 out = PyBytes_AS_STRING(*outbytes) + n; 7242 } 7243 7244 /* Do the conversion */ 7245 outsize = WideCharToMultiByte(code_page, flags, 7246 p, (int)size, 7247 out, outsize, 7248 NULL, pusedDefaultChar); 7249 Py_CLEAR(substring); 7250 if (outsize <= 0) 7251 goto error; 7252 if (pusedDefaultChar && *pusedDefaultChar) 7253 return -2; 7254 return 0; 7255 7256error: 7257 Py_XDECREF(substring); 7258 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION) 7259 return -2; 7260 PyErr_SetFromWindowsErr(0); 7261 return -1; 7262} 7263 7264/* 7265 * Encode a Unicode string to a Windows code page into a byte string using a 7266 * error handler. 7267 * 7268 * Returns consumed characters if succeed, or raise an OSError and returns 7269 * -1 on other error. 7270 */ 7271static int 7272encode_code_page_errors(UINT code_page, PyObject **outbytes, 7273 PyObject *unicode, Py_ssize_t unicode_offset, 7274 Py_ssize_t insize, const char* errors) 7275{ 7276 const DWORD flags = encode_code_page_flags(code_page, errors); 7277 Py_ssize_t pos = unicode_offset; 7278 Py_ssize_t endin = unicode_offset + insize; 7279 /* Ideally, we should get reason from FormatMessage. This is the Windows 7280 2000 English version of the message. 
*/ 7281 const char *reason = "invalid character"; 7282 /* 4=maximum length of a UTF-8 sequence */ 7283 char buffer[4]; 7284 BOOL usedDefaultChar = FALSE, *pusedDefaultChar; 7285 Py_ssize_t outsize; 7286 char *out; 7287 PyObject *errorHandler = NULL; 7288 PyObject *exc = NULL; 7289 PyObject *encoding_obj = NULL; 7290 char *encoding; 7291 Py_ssize_t newpos, newoutsize; 7292 PyObject *rep; 7293 int ret = -1; 7294 7295 assert(insize > 0); 7296 7297 encoding = code_page_name(code_page, &encoding_obj); 7298 if (encoding == NULL) 7299 return -1; 7300 7301 if (errors == NULL || strcmp(errors, "strict") == 0) { 7302 /* The last error was ERROR_NO_UNICODE_TRANSLATION, 7303 then we raise a UnicodeEncodeError. */ 7304 make_encode_exception(&exc, encoding, unicode, 0, 0, reason); 7305 if (exc != NULL) { 7306 PyCodec_StrictErrors(exc); 7307 Py_DECREF(exc); 7308 } 7309 Py_XDECREF(encoding_obj); 7310 return -1; 7311 } 7312 7313 if (code_page != CP_UTF8 && code_page != CP_UTF7) 7314 pusedDefaultChar = &usedDefaultChar; 7315 else 7316 pusedDefaultChar = NULL; 7317 7318 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) { 7319 PyErr_NoMemory(); 7320 goto error; 7321 } 7322 outsize = insize * Py_ARRAY_LENGTH(buffer); 7323 7324 if (*outbytes == NULL) { 7325 /* Create string object */ 7326 *outbytes = PyBytes_FromStringAndSize(NULL, outsize); 7327 if (*outbytes == NULL) 7328 goto error; 7329 out = PyBytes_AS_STRING(*outbytes); 7330 } 7331 else { 7332 /* Extend string object */ 7333 Py_ssize_t n = PyBytes_Size(*outbytes); 7334 if (n > PY_SSIZE_T_MAX - outsize) { 7335 PyErr_NoMemory(); 7336 goto error; 7337 } 7338 if (_PyBytes_Resize(outbytes, n + outsize) < 0) 7339 goto error; 7340 out = PyBytes_AS_STRING(*outbytes) + n; 7341 } 7342 7343 /* Encode the string character per character */ 7344 while (pos < endin) 7345 { 7346 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos); 7347 wchar_t chars[2]; 7348 int charsize; 7349 if (ch < 0x10000) { 7350 chars[0] = (wchar_t)ch; 7351 charsize = 1; 
7352 } 7353 else { 7354 chars[0] = Py_UNICODE_HIGH_SURROGATE(ch); 7355 chars[1] = Py_UNICODE_LOW_SURROGATE(ch); 7356 charsize = 2; 7357 } 7358 7359 outsize = WideCharToMultiByte(code_page, flags, 7360 chars, charsize, 7361 buffer, Py_ARRAY_LENGTH(buffer), 7362 NULL, pusedDefaultChar); 7363 if (outsize > 0) { 7364 if (pusedDefaultChar == NULL || !(*pusedDefaultChar)) 7365 { 7366 pos++; 7367 memcpy(out, buffer, outsize); 7368 out += outsize; 7369 continue; 7370 } 7371 } 7372 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) { 7373 PyErr_SetFromWindowsErr(0); 7374 goto error; 7375 } 7376 7377 rep = unicode_encode_call_errorhandler( 7378 errors, &errorHandler, encoding, reason, 7379 unicode, &exc, 7380 pos, pos + 1, &newpos); 7381 if (rep == NULL) 7382 goto error; 7383 pos = newpos; 7384 7385 if (PyBytes_Check(rep)) { 7386 outsize = PyBytes_GET_SIZE(rep); 7387 if (outsize != 1) { 7388 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes); 7389 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1); 7390 if (_PyBytes_Resize(outbytes, newoutsize) < 0) { 7391 Py_DECREF(rep); 7392 goto error; 7393 } 7394 out = PyBytes_AS_STRING(*outbytes) + offset; 7395 } 7396 memcpy(out, PyBytes_AS_STRING(rep), outsize); 7397 out += outsize; 7398 } 7399 else { 7400 Py_ssize_t i; 7401 enum PyUnicode_Kind kind; 7402 void *data; 7403 7404 if (PyUnicode_READY(rep) == -1) { 7405 Py_DECREF(rep); 7406 goto error; 7407 } 7408 7409 outsize = PyUnicode_GET_LENGTH(rep); 7410 if (outsize != 1) { 7411 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes); 7412 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1); 7413 if (_PyBytes_Resize(outbytes, newoutsize) < 0) { 7414 Py_DECREF(rep); 7415 goto error; 7416 } 7417 out = PyBytes_AS_STRING(*outbytes) + offset; 7418 } 7419 kind = PyUnicode_KIND(rep); 7420 data = PyUnicode_DATA(rep); 7421 for (i=0; i < outsize; i++) { 7422 Py_UCS4 ch = PyUnicode_READ(kind, data, i); 7423 if (ch > 127) { 7424 raise_encode_exception(&exc, 7425 encoding, 
unicode, 7426 pos, pos + 1, 7427 "unable to encode error handler result to ASCII"); 7428 Py_DECREF(rep); 7429 goto error; 7430 } 7431 *out = (unsigned char)ch; 7432 out++; 7433 } 7434 } 7435 Py_DECREF(rep); 7436 } 7437 /* write a NUL byte */ 7438 *out = 0; 7439 outsize = out - PyBytes_AS_STRING(*outbytes); 7440 assert(outsize <= PyBytes_GET_SIZE(*outbytes)); 7441 if (_PyBytes_Resize(outbytes, outsize) < 0) 7442 goto error; 7443 ret = 0; 7444 7445error: 7446 Py_XDECREF(encoding_obj); 7447 Py_XDECREF(errorHandler); 7448 Py_XDECREF(exc); 7449 return ret; 7450} 7451 7452static PyObject * 7453encode_code_page(int code_page, 7454 PyObject *unicode, 7455 const char *errors) 7456{ 7457 Py_ssize_t len; 7458 PyObject *outbytes = NULL; 7459 Py_ssize_t offset; 7460 int chunk_len, ret, done; 7461 7462 if (!PyUnicode_Check(unicode)) { 7463 PyErr_BadArgument(); 7464 return NULL; 7465 } 7466 7467 if (PyUnicode_READY(unicode) == -1) 7468 return NULL; 7469 len = PyUnicode_GET_LENGTH(unicode); 7470 7471 if (code_page < 0) { 7472 PyErr_SetString(PyExc_ValueError, "invalid code page number"); 7473 return NULL; 7474 } 7475 7476 if (len == 0) 7477 return PyBytes_FromStringAndSize(NULL, 0); 7478 7479 offset = 0; 7480 do 7481 { 7482#ifdef NEED_RETRY 7483 /* UTF-16 encoding may double the size, so use only INT_MAX/2 7484 chunks. 
*/ 7485 if (len > INT_MAX/2) { 7486 chunk_len = INT_MAX/2; 7487 done = 0; 7488 } 7489 else 7490#endif 7491 { 7492 chunk_len = (int)len; 7493 done = 1; 7494 } 7495 7496 ret = encode_code_page_strict(code_page, &outbytes, 7497 unicode, offset, chunk_len, 7498 errors); 7499 if (ret == -2) 7500 ret = encode_code_page_errors(code_page, &outbytes, 7501 unicode, offset, 7502 chunk_len, errors); 7503 if (ret < 0) { 7504 Py_XDECREF(outbytes); 7505 return NULL; 7506 } 7507 7508 offset += chunk_len; 7509 len -= chunk_len; 7510 } while (!done); 7511 7512 return outbytes; 7513} 7514 7515PyObject * 7516PyUnicode_EncodeMBCS(const Py_UNICODE *p, 7517 Py_ssize_t size, 7518 const char *errors) 7519{ 7520 PyObject *unicode, *res; 7521 unicode = PyUnicode_FromUnicode(p, size); 7522 if (unicode == NULL) 7523 return NULL; 7524 res = encode_code_page(CP_ACP, unicode, errors); 7525 Py_DECREF(unicode); 7526 return res; 7527} 7528 7529PyObject * 7530PyUnicode_EncodeCodePage(int code_page, 7531 PyObject *unicode, 7532 const char *errors) 7533{ 7534 return encode_code_page(code_page, unicode, errors); 7535} 7536 7537PyObject * 7538PyUnicode_AsMBCSString(PyObject *unicode) 7539{ 7540 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL); 7541} 7542 7543#undef NEED_RETRY 7544 7545#endif /* HAVE_MBCS */ 7546 7547/* --- Character Mapping Codec -------------------------------------------- */ 7548 7549static int 7550charmap_decode_string(const char *s, 7551 Py_ssize_t size, 7552 PyObject *mapping, 7553 const char *errors, 7554 _PyUnicodeWriter *writer) 7555{ 7556 const char *starts = s; 7557 const char *e; 7558 Py_ssize_t startinpos, endinpos; 7559 PyObject *errorHandler = NULL, *exc = NULL; 7560 Py_ssize_t maplen; 7561 enum PyUnicode_Kind mapkind; 7562 void *mapdata; 7563 Py_UCS4 x; 7564 unsigned char ch; 7565 7566 if (PyUnicode_READY(mapping) == -1) 7567 return -1; 7568 7569 maplen = PyUnicode_GET_LENGTH(mapping); 7570 mapdata = PyUnicode_DATA(mapping); 7571 mapkind = PyUnicode_KIND(mapping); 
7572 7573 e = s + size; 7574 7575 if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) { 7576 /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1 7577 * is disabled in encoding aliases, latin1 is preferred because 7578 * its implementation is faster. */ 7579 Py_UCS1 *mapdata_ucs1 = (Py_UCS1 *)mapdata; 7580 Py_UCS1 *outdata = (Py_UCS1 *)writer->data; 7581 Py_UCS4 maxchar = writer->maxchar; 7582 7583 assert (writer->kind == PyUnicode_1BYTE_KIND); 7584 while (s < e) { 7585 ch = *s; 7586 x = mapdata_ucs1[ch]; 7587 if (x > maxchar) { 7588 if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1) 7589 goto onError; 7590 maxchar = writer->maxchar; 7591 outdata = (Py_UCS1 *)writer->data; 7592 } 7593 outdata[writer->pos] = x; 7594 writer->pos++; 7595 ++s; 7596 } 7597 return 0; 7598 } 7599 7600 while (s < e) { 7601 if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) { 7602 enum PyUnicode_Kind outkind = writer->kind; 7603 Py_UCS2 *mapdata_ucs2 = (Py_UCS2 *)mapdata; 7604 if (outkind == PyUnicode_1BYTE_KIND) { 7605 Py_UCS1 *outdata = (Py_UCS1 *)writer->data; 7606 Py_UCS4 maxchar = writer->maxchar; 7607 while (s < e) { 7608 ch = *s; 7609 x = mapdata_ucs2[ch]; 7610 if (x > maxchar) 7611 goto Error; 7612 outdata[writer->pos] = x; 7613 writer->pos++; 7614 ++s; 7615 } 7616 break; 7617 } 7618 else if (outkind == PyUnicode_2BYTE_KIND) { 7619 Py_UCS2 *outdata = (Py_UCS2 *)writer->data; 7620 while (s < e) { 7621 ch = *s; 7622 x = mapdata_ucs2[ch]; 7623 if (x == 0xFFFE) 7624 goto Error; 7625 outdata[writer->pos] = x; 7626 writer->pos++; 7627 ++s; 7628 } 7629 break; 7630 } 7631 } 7632 ch = *s; 7633 7634 if (ch < maplen) 7635 x = PyUnicode_READ(mapkind, mapdata, ch); 7636 else 7637 x = 0xfffe; /* invalid value */ 7638Error: 7639 if (x == 0xfffe) 7640 { 7641 /* undefined mapping */ 7642 startinpos = s-starts; 7643 endinpos = startinpos+1; 7644 if (unicode_decode_call_errorhandler_writer( 7645 errors, &errorHandler, 7646 "charmap", "character maps to <undefined>", 7647 &starts, 
&e, &startinpos, &endinpos, &exc, &s, 7648 writer)) { 7649 goto onError; 7650 } 7651 continue; 7652 } 7653 7654 if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0) 7655 goto onError; 7656 ++s; 7657 } 7658 Py_XDECREF(errorHandler); 7659 Py_XDECREF(exc); 7660 return 0; 7661 7662onError: 7663 Py_XDECREF(errorHandler); 7664 Py_XDECREF(exc); 7665 return -1; 7666} 7667 7668static int 7669charmap_decode_mapping(const char *s, 7670 Py_ssize_t size, 7671 PyObject *mapping, 7672 const char *errors, 7673 _PyUnicodeWriter *writer) 7674{ 7675 const char *starts = s; 7676 const char *e; 7677 Py_ssize_t startinpos, endinpos; 7678 PyObject *errorHandler = NULL, *exc = NULL; 7679 unsigned char ch; 7680 PyObject *key, *item = NULL; 7681 7682 e = s + size; 7683 7684 while (s < e) { 7685 ch = *s; 7686 7687 /* Get mapping (char ordinal -> integer, Unicode char or None) */ 7688 key = PyLong_FromLong((long)ch); 7689 if (key == NULL) 7690 goto onError; 7691 7692 item = PyObject_GetItem(mapping, key); 7693 Py_DECREF(key); 7694 if (item == NULL) { 7695 if (PyErr_ExceptionMatches(PyExc_LookupError)) { 7696 /* No mapping found means: mapping is undefined. 
*/ 7697 PyErr_Clear(); 7698 goto Undefined; 7699 } else 7700 goto onError; 7701 } 7702 7703 /* Apply mapping */ 7704 if (item == Py_None) 7705 goto Undefined; 7706 if (PyLong_Check(item)) { 7707 long value = PyLong_AS_LONG(item); 7708 if (value == 0xFFFE) 7709 goto Undefined; 7710 if (value < 0 || value > MAX_UNICODE) { 7711 PyErr_Format(PyExc_TypeError, 7712 "character mapping must be in range(0x%lx)", 7713 (unsigned long)MAX_UNICODE + 1); 7714 goto onError; 7715 } 7716 7717 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0) 7718 goto onError; 7719 } 7720 else if (PyUnicode_Check(item)) { 7721 if (PyUnicode_READY(item) == -1) 7722 goto onError; 7723 if (PyUnicode_GET_LENGTH(item) == 1) { 7724 Py_UCS4 value = PyUnicode_READ_CHAR(item, 0); 7725 if (value == 0xFFFE) 7726 goto Undefined; 7727 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0) 7728 goto onError; 7729 } 7730 else { 7731 writer->overallocate = 1; 7732 if (_PyUnicodeWriter_WriteStr(writer, item) == -1) 7733 goto onError; 7734 } 7735 } 7736 else { 7737 /* wrong return value */ 7738 PyErr_SetString(PyExc_TypeError, 7739 "character mapping must return integer, None or str"); 7740 goto onError; 7741 } 7742 Py_CLEAR(item); 7743 ++s; 7744 continue; 7745 7746Undefined: 7747 /* undefined mapping */ 7748 Py_CLEAR(item); 7749 startinpos = s-starts; 7750 endinpos = startinpos+1; 7751 if (unicode_decode_call_errorhandler_writer( 7752 errors, &errorHandler, 7753 "charmap", "character maps to <undefined>", 7754 &starts, &e, &startinpos, &endinpos, &exc, &s, 7755 writer)) { 7756 goto onError; 7757 } 7758 } 7759 Py_XDECREF(errorHandler); 7760 Py_XDECREF(exc); 7761 return 0; 7762 7763onError: 7764 Py_XDECREF(item); 7765 Py_XDECREF(errorHandler); 7766 Py_XDECREF(exc); 7767 return -1; 7768} 7769 7770PyObject * 7771PyUnicode_DecodeCharmap(const char *s, 7772 Py_ssize_t size, 7773 PyObject *mapping, 7774 const char *errors) 7775{ 7776 _PyUnicodeWriter writer; 7777 7778 /* Default to Latin-1 */ 7779 if (mapping == 
NULL) 7780 return PyUnicode_DecodeLatin1(s, size, errors); 7781 7782 if (size == 0) 7783 _Py_RETURN_UNICODE_EMPTY(); 7784 _PyUnicodeWriter_Init(&writer); 7785 writer.min_length = size; 7786 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1) 7787 goto onError; 7788 7789 if (PyUnicode_CheckExact(mapping)) { 7790 if (charmap_decode_string(s, size, mapping, errors, &writer) < 0) 7791 goto onError; 7792 } 7793 else { 7794 if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0) 7795 goto onError; 7796 } 7797 return _PyUnicodeWriter_Finish(&writer); 7798 7799 onError: 7800 _PyUnicodeWriter_Dealloc(&writer); 7801 return NULL; 7802} 7803 7804/* Charmap encoding: the lookup table */ 7805 7806struct encoding_map { 7807 PyObject_HEAD 7808 unsigned char level1[32]; 7809 int count2, count3; 7810 unsigned char level23[1]; 7811}; 7812 7813static PyObject* 7814encoding_map_size(PyObject *obj, PyObject* args) 7815{ 7816 struct encoding_map *map = (struct encoding_map*)obj; 7817 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 + 7818 128*map->count3); 7819} 7820 7821static PyMethodDef encoding_map_methods[] = { 7822 {"size", encoding_map_size, METH_NOARGS, 7823 PyDoc_STR("Return the size (in bytes) of this object") }, 7824 { 0 } 7825}; 7826 7827static void 7828encoding_map_dealloc(PyObject* o) 7829{ 7830 PyObject_FREE(o); 7831} 7832 7833static PyTypeObject EncodingMapType = { 7834 PyVarObject_HEAD_INIT(NULL, 0) 7835 "EncodingMap", /*tp_name*/ 7836 sizeof(struct encoding_map), /*tp_basicsize*/ 7837 0, /*tp_itemsize*/ 7838 /* methods */ 7839 encoding_map_dealloc, /*tp_dealloc*/ 7840 0, /*tp_print*/ 7841 0, /*tp_getattr*/ 7842 0, /*tp_setattr*/ 7843 0, /*tp_reserved*/ 7844 0, /*tp_repr*/ 7845 0, /*tp_as_number*/ 7846 0, /*tp_as_sequence*/ 7847 0, /*tp_as_mapping*/ 7848 0, /*tp_hash*/ 7849 0, /*tp_call*/ 7850 0, /*tp_str*/ 7851 0, /*tp_getattro*/ 7852 0, /*tp_setattro*/ 7853 0, /*tp_as_buffer*/ 7854 Py_TPFLAGS_DEFAULT, /*tp_flags*/ 7855 0, /*tp_doc*/ 
7856 0, /*tp_traverse*/ 7857 0, /*tp_clear*/ 7858 0, /*tp_richcompare*/ 7859 0, /*tp_weaklistoffset*/ 7860 0, /*tp_iter*/ 7861 0, /*tp_iternext*/ 7862 encoding_map_methods, /*tp_methods*/ 7863 0, /*tp_members*/ 7864 0, /*tp_getset*/ 7865 0, /*tp_base*/ 7866 0, /*tp_dict*/ 7867 0, /*tp_descr_get*/ 7868 0, /*tp_descr_set*/ 7869 0, /*tp_dictoffset*/ 7870 0, /*tp_init*/ 7871 0, /*tp_alloc*/ 7872 0, /*tp_new*/ 7873 0, /*tp_free*/ 7874 0, /*tp_is_gc*/ 7875}; 7876 7877PyObject* 7878PyUnicode_BuildEncodingMap(PyObject* string) 7879{ 7880 PyObject *result; 7881 struct encoding_map *mresult; 7882 int i; 7883 int need_dict = 0; 7884 unsigned char level1[32]; 7885 unsigned char level2[512]; 7886 unsigned char *mlevel1, *mlevel2, *mlevel3; 7887 int count2 = 0, count3 = 0; 7888 int kind; 7889 void *data; 7890 Py_ssize_t length; 7891 Py_UCS4 ch; 7892 7893 if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) { 7894 PyErr_BadArgument(); 7895 return NULL; 7896 } 7897 kind = PyUnicode_KIND(string); 7898 data = PyUnicode_DATA(string); 7899 length = PyUnicode_GET_LENGTH(string); 7900 length = Py_MIN(length, 256); 7901 memset(level1, 0xFF, sizeof level1); 7902 memset(level2, 0xFF, sizeof level2); 7903 7904 /* If there isn't a one-to-one mapping of NULL to \0, 7905 or if there are non-BMP characters, we need to use 7906 a mapping dictionary. 
*/ 7907 if (PyUnicode_READ(kind, data, 0) != 0) 7908 need_dict = 1; 7909 for (i = 1; i < length; i++) { 7910 int l1, l2; 7911 ch = PyUnicode_READ(kind, data, i); 7912 if (ch == 0 || ch > 0xFFFF) { 7913 need_dict = 1; 7914 break; 7915 } 7916 if (ch == 0xFFFE) 7917 /* unmapped character */ 7918 continue; 7919 l1 = ch >> 11; 7920 l2 = ch >> 7; 7921 if (level1[l1] == 0xFF) 7922 level1[l1] = count2++; 7923 if (level2[l2] == 0xFF) 7924 level2[l2] = count3++; 7925 } 7926 7927 if (count2 >= 0xFF || count3 >= 0xFF) 7928 need_dict = 1; 7929 7930 if (need_dict) { 7931 PyObject *result = PyDict_New(); 7932 PyObject *key, *value; 7933 if (!result) 7934 return NULL; 7935 for (i = 0; i < length; i++) { 7936 key = PyLong_FromLong(PyUnicode_READ(kind, data, i)); 7937 value = PyLong_FromLong(i); 7938 if (!key || !value) 7939 goto failed1; 7940 if (PyDict_SetItem(result, key, value) == -1) 7941 goto failed1; 7942 Py_DECREF(key); 7943 Py_DECREF(value); 7944 } 7945 return result; 7946 failed1: 7947 Py_XDECREF(key); 7948 Py_XDECREF(value); 7949 Py_DECREF(result); 7950 return NULL; 7951 } 7952 7953 /* Create a three-level trie */ 7954 result = PyObject_MALLOC(sizeof(struct encoding_map) + 7955 16*count2 + 128*count3 - 1); 7956 if (!result) 7957 return PyErr_NoMemory(); 7958 PyObject_Init(result, &EncodingMapType); 7959 mresult = (struct encoding_map*)result; 7960 mresult->count2 = count2; 7961 mresult->count3 = count3; 7962 mlevel1 = mresult->level1; 7963 mlevel2 = mresult->level23; 7964 mlevel3 = mresult->level23 + 16*count2; 7965 memcpy(mlevel1, level1, 32); 7966 memset(mlevel2, 0xFF, 16*count2); 7967 memset(mlevel3, 0, 128*count3); 7968 count3 = 0; 7969 for (i = 1; i < length; i++) { 7970 int o1, o2, o3, i2, i3; 7971 Py_UCS4 ch = PyUnicode_READ(kind, data, i); 7972 if (ch == 0xFFFE) 7973 /* unmapped character */ 7974 continue; 7975 o1 = ch>>11; 7976 o2 = (ch>>7) & 0xF; 7977 i2 = 16*mlevel1[o1] + o2; 7978 if (mlevel2[i2] == 0xFF) 7979 mlevel2[i2] = count3++; 7980 o3 = ch & 0x7F; 7981 
i3 = 128*mlevel2[i2] + o3; 7982 mlevel3[i3] = i; 7983 } 7984 return result; 7985} 7986 7987static int 7988encoding_map_lookup(Py_UCS4 c, PyObject *mapping) 7989{ 7990 struct encoding_map *map = (struct encoding_map*)mapping; 7991 int l1 = c>>11; 7992 int l2 = (c>>7) & 0xF; 7993 int l3 = c & 0x7F; 7994 int i; 7995 7996 if (c > 0xFFFF) 7997 return -1; 7998 if (c == 0) 7999 return 0; 8000 /* level 1*/ 8001 i = map->level1[l1]; 8002 if (i == 0xFF) { 8003 return -1; 8004 } 8005 /* level 2*/ 8006 i = map->level23[16*i+l2]; 8007 if (i == 0xFF) { 8008 return -1; 8009 } 8010 /* level 3 */ 8011 i = map->level23[16*map->count2 + 128*i + l3]; 8012 if (i == 0) { 8013 return -1; 8014 } 8015 return i; 8016} 8017 8018/* Lookup the character ch in the mapping. If the character 8019 can't be found, Py_None is returned (or NULL, if another 8020 error occurred). */ 8021static PyObject * 8022charmapencode_lookup(Py_UCS4 c, PyObject *mapping) 8023{ 8024 PyObject *w = PyLong_FromLong((long)c); 8025 PyObject *x; 8026 8027 if (w == NULL) 8028 return NULL; 8029 x = PyObject_GetItem(mapping, w); 8030 Py_DECREF(w); 8031 if (x == NULL) { 8032 if (PyErr_ExceptionMatches(PyExc_LookupError)) { 8033 /* No mapping found means: mapping is undefined. 
*/ 8034 PyErr_Clear(); 8035 x = Py_None; 8036 Py_INCREF(x); 8037 return x; 8038 } else 8039 return NULL; 8040 } 8041 else if (x == Py_None) 8042 return x; 8043 else if (PyLong_Check(x)) { 8044 long value = PyLong_AS_LONG(x); 8045 if (value < 0 || value > 255) { 8046 PyErr_SetString(PyExc_TypeError, 8047 "character mapping must be in range(256)"); 8048 Py_DECREF(x); 8049 return NULL; 8050 } 8051 return x; 8052 } 8053 else if (PyBytes_Check(x)) 8054 return x; 8055 else { 8056 /* wrong return value */ 8057 PyErr_Format(PyExc_TypeError, 8058 "character mapping must return integer, bytes or None, not %.400s", 8059 x->ob_type->tp_name); 8060 Py_DECREF(x); 8061 return NULL; 8062 } 8063} 8064 8065static int 8066charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize) 8067{ 8068 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj); 8069 /* exponentially overallocate to minimize reallocations */ 8070 if (requiredsize < 2*outsize) 8071 requiredsize = 2*outsize; 8072 if (_PyBytes_Resize(outobj, requiredsize)) 8073 return -1; 8074 return 0; 8075} 8076 8077typedef enum charmapencode_result { 8078 enc_SUCCESS, enc_FAILED, enc_EXCEPTION 8079} charmapencode_result; 8080/* lookup the character, put the result in the output string and adjust 8081 various state variables. Resize the output bytes object if not enough 8082 space is available. Return a new reference to the object that 8083 was put in the output buffer, or Py_None, if the mapping was undefined 8084 (in which case no character was written) or NULL, if a 8085 reallocation error occurred. 
The caller must decref the result */ 8086static charmapencode_result 8087charmapencode_output(Py_UCS4 c, PyObject *mapping, 8088 PyObject **outobj, Py_ssize_t *outpos) 8089{ 8090 PyObject *rep; 8091 char *outstart; 8092 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj); 8093 8094 if (Py_TYPE(mapping) == &EncodingMapType) { 8095 int res = encoding_map_lookup(c, mapping); 8096 Py_ssize_t requiredsize = *outpos+1; 8097 if (res == -1) 8098 return enc_FAILED; 8099 if (outsize<requiredsize) 8100 if (charmapencode_resize(outobj, outpos, requiredsize)) 8101 return enc_EXCEPTION; 8102 outstart = PyBytes_AS_STRING(*outobj); 8103 outstart[(*outpos)++] = (char)res; 8104 return enc_SUCCESS; 8105 } 8106 8107 rep = charmapencode_lookup(c, mapping); 8108 if (rep==NULL) 8109 return enc_EXCEPTION; 8110 else if (rep==Py_None) { 8111 Py_DECREF(rep); 8112 return enc_FAILED; 8113 } else { 8114 if (PyLong_Check(rep)) { 8115 Py_ssize_t requiredsize = *outpos+1; 8116 if (outsize<requiredsize) 8117 if (charmapencode_resize(outobj, outpos, requiredsize)) { 8118 Py_DECREF(rep); 8119 return enc_EXCEPTION; 8120 } 8121 outstart = PyBytes_AS_STRING(*outobj); 8122 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep); 8123 } 8124 else { 8125 const char *repchars = PyBytes_AS_STRING(rep); 8126 Py_ssize_t repsize = PyBytes_GET_SIZE(rep); 8127 Py_ssize_t requiredsize = *outpos+repsize; 8128 if (outsize<requiredsize) 8129 if (charmapencode_resize(outobj, outpos, requiredsize)) { 8130 Py_DECREF(rep); 8131 return enc_EXCEPTION; 8132 } 8133 outstart = PyBytes_AS_STRING(*outobj); 8134 memcpy(outstart + *outpos, repchars, repsize); 8135 *outpos += repsize; 8136 } 8137 } 8138 Py_DECREF(rep); 8139 return enc_SUCCESS; 8140} 8141 8142/* handle an error in PyUnicode_EncodeCharmap 8143 Return 0 on success, -1 on error */ 8144static int 8145charmap_encoding_error( 8146 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping, 8147 PyObject **exceptionObject, 8148 _Py_error_handler *error_handler, PyObject 
**error_handler_obj, const char *errors, 8149 PyObject **res, Py_ssize_t *respos) 8150{ 8151 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */ 8152 Py_ssize_t size, repsize; 8153 Py_ssize_t newpos; 8154 enum PyUnicode_Kind kind; 8155 void *data; 8156 Py_ssize_t index; 8157 /* startpos for collecting unencodable chars */ 8158 Py_ssize_t collstartpos = *inpos; 8159 Py_ssize_t collendpos = *inpos+1; 8160 Py_ssize_t collpos; 8161 char *encoding = "charmap"; 8162 char *reason = "character maps to <undefined>"; 8163 charmapencode_result x; 8164 Py_UCS4 ch; 8165 int val; 8166 8167 if (PyUnicode_READY(unicode) == -1) 8168 return -1; 8169 size = PyUnicode_GET_LENGTH(unicode); 8170 /* find all unencodable characters */ 8171 while (collendpos < size) { 8172 PyObject *rep; 8173 if (Py_TYPE(mapping) == &EncodingMapType) { 8174 ch = PyUnicode_READ_CHAR(unicode, collendpos); 8175 val = encoding_map_lookup(ch, mapping); 8176 if (val != -1) 8177 break; 8178 ++collendpos; 8179 continue; 8180 } 8181 8182 ch = PyUnicode_READ_CHAR(unicode, collendpos); 8183 rep = charmapencode_lookup(ch, mapping); 8184 if (rep==NULL) 8185 return -1; 8186 else if (rep!=Py_None) { 8187 Py_DECREF(rep); 8188 break; 8189 } 8190 Py_DECREF(rep); 8191 ++collendpos; 8192 } 8193 /* cache callback name lookup 8194 * (if not done yet, i.e. 
it's the first error) */ 8195 if (*error_handler == _Py_ERROR_UNKNOWN) 8196 *error_handler = get_error_handler(errors); 8197 8198 switch (*error_handler) { 8199 case _Py_ERROR_STRICT: 8200 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason); 8201 return -1; 8202 8203 case _Py_ERROR_REPLACE: 8204 for (collpos = collstartpos; collpos<collendpos; ++collpos) { 8205 x = charmapencode_output('?', mapping, res, respos); 8206 if (x==enc_EXCEPTION) { 8207 return -1; 8208 } 8209 else if (x==enc_FAILED) { 8210 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason); 8211 return -1; 8212 } 8213 } 8214 /* fall through */ 8215 case _Py_ERROR_IGNORE: 8216 *inpos = collendpos; 8217 break; 8218 8219 case _Py_ERROR_XMLCHARREFREPLACE: 8220 /* generate replacement (temporarily (mis)uses p) */ 8221 for (collpos = collstartpos; collpos < collendpos; ++collpos) { 8222 char buffer[2+29+1+1]; 8223 char *cp; 8224 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos)); 8225 for (cp = buffer; *cp; ++cp) { 8226 x = charmapencode_output(*cp, mapping, res, respos); 8227 if (x==enc_EXCEPTION) 8228 return -1; 8229 else if (x==enc_FAILED) { 8230 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason); 8231 return -1; 8232 } 8233 } 8234 } 8235 *inpos = collendpos; 8236 break; 8237 8238 default: 8239 repunicode = unicode_encode_call_errorhandler(errors, error_handler_obj, 8240 encoding, reason, unicode, exceptionObject, 8241 collstartpos, collendpos, &newpos); 8242 if (repunicode == NULL) 8243 return -1; 8244 if (PyBytes_Check(repunicode)) { 8245 /* Directly copy bytes result to output. */ 8246 Py_ssize_t outsize = PyBytes_Size(*res); 8247 Py_ssize_t requiredsize; 8248 repsize = PyBytes_Size(repunicode); 8249 requiredsize = *respos + repsize; 8250 if (requiredsize > outsize) 8251 /* Make room for all additional bytes. 
*/ 8252 if (charmapencode_resize(res, respos, requiredsize)) { 8253 Py_DECREF(repunicode); 8254 return -1; 8255 } 8256 memcpy(PyBytes_AsString(*res) + *respos, 8257 PyBytes_AsString(repunicode), repsize); 8258 *respos += repsize; 8259 *inpos = newpos; 8260 Py_DECREF(repunicode); 8261 break; 8262 } 8263 /* generate replacement */ 8264 if (PyUnicode_READY(repunicode) == -1) { 8265 Py_DECREF(repunicode); 8266 return -1; 8267 } 8268 repsize = PyUnicode_GET_LENGTH(repunicode); 8269 data = PyUnicode_DATA(repunicode); 8270 kind = PyUnicode_KIND(repunicode); 8271 for (index = 0; index < repsize; index++) { 8272 Py_UCS4 repch = PyUnicode_READ(kind, data, index); 8273 x = charmapencode_output(repch, mapping, res, respos); 8274 if (x==enc_EXCEPTION) { 8275 Py_DECREF(repunicode); 8276 return -1; 8277 } 8278 else if (x==enc_FAILED) { 8279 Py_DECREF(repunicode); 8280 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason); 8281 return -1; 8282 } 8283 } 8284 *inpos = newpos; 8285 Py_DECREF(repunicode); 8286 } 8287 return 0; 8288} 8289 8290PyObject * 8291_PyUnicode_EncodeCharmap(PyObject *unicode, 8292 PyObject *mapping, 8293 const char *errors) 8294{ 8295 /* output object */ 8296 PyObject *res = NULL; 8297 /* current input position */ 8298 Py_ssize_t inpos = 0; 8299 Py_ssize_t size; 8300 /* current output position */ 8301 Py_ssize_t respos = 0; 8302 PyObject *error_handler_obj = NULL; 8303 PyObject *exc = NULL; 8304 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN; 8305 void *data; 8306 int kind; 8307 8308 if (PyUnicode_READY(unicode) == -1) 8309 return NULL; 8310 size = PyUnicode_GET_LENGTH(unicode); 8311 data = PyUnicode_DATA(unicode); 8312 kind = PyUnicode_KIND(unicode); 8313 8314 /* Default to Latin-1 */ 8315 if (mapping == NULL) 8316 return unicode_encode_ucs1(unicode, errors, 256); 8317 8318 /* allocate enough for a simple encoding without 8319 replacements, if we need more, we'll resize */ 8320 res = PyBytes_FromStringAndSize(NULL, 
size); 8321 if (res == NULL) 8322 goto onError; 8323 if (size == 0) 8324 return res; 8325 8326 while (inpos<size) { 8327 Py_UCS4 ch = PyUnicode_READ(kind, data, inpos); 8328 /* try to encode it */ 8329 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos); 8330 if (x==enc_EXCEPTION) /* error */ 8331 goto onError; 8332 if (x==enc_FAILED) { /* unencodable character */ 8333 if (charmap_encoding_error(unicode, &inpos, mapping, 8334 &exc, 8335 &error_handler, &error_handler_obj, errors, 8336 &res, &respos)) { 8337 goto onError; 8338 } 8339 } 8340 else 8341 /* done with this character => adjust input position */ 8342 ++inpos; 8343 } 8344 8345 /* Resize if we allocated to much */ 8346 if (respos<PyBytes_GET_SIZE(res)) 8347 if (_PyBytes_Resize(&res, respos) < 0) 8348 goto onError; 8349 8350 Py_XDECREF(exc); 8351 Py_XDECREF(error_handler_obj); 8352 return res; 8353 8354 onError: 8355 Py_XDECREF(res); 8356 Py_XDECREF(exc); 8357 Py_XDECREF(error_handler_obj); 8358 return NULL; 8359} 8360 8361/* Deprecated */ 8362PyObject * 8363PyUnicode_EncodeCharmap(const Py_UNICODE *p, 8364 Py_ssize_t size, 8365 PyObject *mapping, 8366 const char *errors) 8367{ 8368 PyObject *result; 8369 PyObject *unicode = PyUnicode_FromUnicode(p, size); 8370 if (unicode == NULL) 8371 return NULL; 8372 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors); 8373 Py_DECREF(unicode); 8374 return result; 8375} 8376 8377PyObject * 8378PyUnicode_AsCharmapString(PyObject *unicode, 8379 PyObject *mapping) 8380{ 8381 if (!PyUnicode_Check(unicode) || mapping == NULL) { 8382 PyErr_BadArgument(); 8383 return NULL; 8384 } 8385 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL); 8386} 8387 8388/* create or adjust a UnicodeTranslateError */ 8389static void 8390make_translate_exception(PyObject **exceptionObject, 8391 PyObject *unicode, 8392 Py_ssize_t startpos, Py_ssize_t endpos, 8393 const char *reason) 8394{ 8395 if (*exceptionObject == NULL) { 8396 *exceptionObject = 
_PyUnicodeTranslateError_Create( 8397 unicode, startpos, endpos, reason); 8398 } 8399 else { 8400 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos)) 8401 goto onError; 8402 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos)) 8403 goto onError; 8404 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason)) 8405 goto onError; 8406 return; 8407 onError: 8408 Py_CLEAR(*exceptionObject); 8409 } 8410} 8411 8412/* error handling callback helper: 8413 build arguments, call the callback and check the arguments, 8414 put the result into newpos and return the replacement string, which 8415 has to be freed by the caller */ 8416static PyObject * 8417unicode_translate_call_errorhandler(const char *errors, 8418 PyObject **errorHandler, 8419 const char *reason, 8420 PyObject *unicode, PyObject **exceptionObject, 8421 Py_ssize_t startpos, Py_ssize_t endpos, 8422 Py_ssize_t *newpos) 8423{ 8424 static char *argparse = "O!n;translating error handler must return (str, int) tuple"; 8425 8426 Py_ssize_t i_newpos; 8427 PyObject *restuple; 8428 PyObject *resunicode; 8429 8430 if (*errorHandler == NULL) { 8431 *errorHandler = PyCodec_LookupError(errors); 8432 if (*errorHandler == NULL) 8433 return NULL; 8434 } 8435 8436 make_translate_exception(exceptionObject, 8437 unicode, startpos, endpos, reason); 8438 if (*exceptionObject == NULL) 8439 return NULL; 8440 8441 restuple = PyObject_CallFunctionObjArgs( 8442 *errorHandler, *exceptionObject, NULL); 8443 if (restuple == NULL) 8444 return NULL; 8445 if (!PyTuple_Check(restuple)) { 8446 PyErr_SetString(PyExc_TypeError, &argparse[4]); 8447 Py_DECREF(restuple); 8448 return NULL; 8449 } 8450 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, 8451 &resunicode, &i_newpos)) { 8452 Py_DECREF(restuple); 8453 return NULL; 8454 } 8455 if (i_newpos<0) 8456 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos; 8457 else 8458 *newpos = i_newpos; 8459 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) { 8460 
PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos); 8461 Py_DECREF(restuple); 8462 return NULL; 8463 } 8464 Py_INCREF(resunicode); 8465 Py_DECREF(restuple); 8466 return resunicode; 8467} 8468 8469/* Lookup the character ch in the mapping and put the result in result, 8470 which must be decrefed by the caller. 8471 Return 0 on success, -1 on error */ 8472static int 8473charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result) 8474{ 8475 PyObject *w = PyLong_FromLong((long)c); 8476 PyObject *x; 8477 8478 if (w == NULL) 8479 return -1; 8480 x = PyObject_GetItem(mapping, w); 8481 Py_DECREF(w); 8482 if (x == NULL) { 8483 if (PyErr_ExceptionMatches(PyExc_LookupError)) { 8484 /* No mapping found means: use 1:1 mapping. */ 8485 PyErr_Clear(); 8486 *result = NULL; 8487 return 0; 8488 } else 8489 return -1; 8490 } 8491 else if (x == Py_None) { 8492 *result = x; 8493 return 0; 8494 } 8495 else if (PyLong_Check(x)) { 8496 long value = PyLong_AS_LONG(x); 8497 if (value < 0 || value > MAX_UNICODE) { 8498 PyErr_Format(PyExc_ValueError, 8499 "character mapping must be in range(0x%x)", 8500 MAX_UNICODE+1); 8501 Py_DECREF(x); 8502 return -1; 8503 } 8504 *result = x; 8505 return 0; 8506 } 8507 else if (PyUnicode_Check(x)) { 8508 *result = x; 8509 return 0; 8510 } 8511 else { 8512 /* wrong return value */ 8513 PyErr_SetString(PyExc_TypeError, 8514 "character mapping must return integer, None or str"); 8515 Py_DECREF(x); 8516 return -1; 8517 } 8518} 8519 8520/* lookup the character, write the result into the writer. 8521 Return 1 if the result was written into the writer, return 0 if the mapping 8522 was undefined, raise an exception return -1 on error. 
*/ 8523static int 8524charmaptranslate_output(Py_UCS4 ch, PyObject *mapping, 8525 _PyUnicodeWriter *writer) 8526{ 8527 PyObject *item; 8528 8529 if (charmaptranslate_lookup(ch, mapping, &item)) 8530 return -1; 8531 8532 if (item == NULL) { 8533 /* not found => default to 1:1 mapping */ 8534 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) { 8535 return -1; 8536 } 8537 return 1; 8538 } 8539 8540 if (item == Py_None) { 8541 Py_DECREF(item); 8542 return 0; 8543 } 8544 8545 if (PyLong_Check(item)) { 8546 long ch = (Py_UCS4)PyLong_AS_LONG(item); 8547 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already 8548 used it */ 8549 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) { 8550 Py_DECREF(item); 8551 return -1; 8552 } 8553 Py_DECREF(item); 8554 return 1; 8555 } 8556 8557 if (!PyUnicode_Check(item)) { 8558 Py_DECREF(item); 8559 return -1; 8560 } 8561 8562 if (_PyUnicodeWriter_WriteStr(writer, item) < 0) { 8563 Py_DECREF(item); 8564 return -1; 8565 } 8566 8567 Py_DECREF(item); 8568 return 1; 8569} 8570 8571static int 8572unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch, 8573 Py_UCS1 *translate) 8574{ 8575 PyObject *item = NULL; 8576 int ret = 0; 8577 8578 if (charmaptranslate_lookup(ch, mapping, &item)) { 8579 return -1; 8580 } 8581 8582 if (item == Py_None) { 8583 /* deletion */ 8584 translate[ch] = 0xfe; 8585 } 8586 else if (item == NULL) { 8587 /* not found => default to 1:1 mapping */ 8588 translate[ch] = ch; 8589 return 1; 8590 } 8591 else if (PyLong_Check(item)) { 8592 long replace = PyLong_AS_LONG(item); 8593 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already 8594 used it */ 8595 if (127 < replace) { 8596 /* invalid character or character outside ASCII: 8597 skip the fast translate */ 8598 goto exit; 8599 } 8600 translate[ch] = (Py_UCS1)replace; 8601 } 8602 else if (PyUnicode_Check(item)) { 8603 Py_UCS4 replace; 8604 8605 if (PyUnicode_READY(item) == -1) { 8606 Py_DECREF(item); 8607 return -1; 8608 } 8609 if 
(PyUnicode_GET_LENGTH(item) != 1) 8610 goto exit; 8611 8612 replace = PyUnicode_READ_CHAR(item, 0); 8613 if (replace > 127) 8614 goto exit; 8615 translate[ch] = (Py_UCS1)replace; 8616 } 8617 else { 8618 /* not None, NULL, long or unicode */ 8619 goto exit; 8620 } 8621 ret = 1; 8622 8623 exit: 8624 Py_DECREF(item); 8625 return ret; 8626} 8627 8628/* Fast path for ascii => ascii translation. Return 1 if the whole string 8629 was translated into writer, return 0 if the input string was partially 8630 translated into writer, raise an exception and return -1 on error. */ 8631static int 8632unicode_fast_translate(PyObject *input, PyObject *mapping, 8633 _PyUnicodeWriter *writer, int ignore) 8634{ 8635 Py_UCS1 ascii_table[128], ch, ch2; 8636 Py_ssize_t len; 8637 Py_UCS1 *in, *end, *out; 8638 int res = 0; 8639 8640 if (PyUnicode_READY(input) == -1) 8641 return -1; 8642 if (!PyUnicode_IS_ASCII(input)) 8643 return 0; 8644 len = PyUnicode_GET_LENGTH(input); 8645 8646 memset(ascii_table, 0xff, 128); 8647 8648 in = PyUnicode_1BYTE_DATA(input); 8649 end = in + len; 8650 8651 assert(PyUnicode_IS_ASCII(writer->buffer)); 8652 assert(PyUnicode_GET_LENGTH(writer->buffer) == len); 8653 out = PyUnicode_1BYTE_DATA(writer->buffer); 8654 8655 for (; in < end; in++) { 8656 ch = *in; 8657 ch2 = ascii_table[ch]; 8658 if (ch2 == 0xff) { 8659 int translate = unicode_fast_translate_lookup(mapping, ch, 8660 ascii_table); 8661 if (translate < 0) 8662 return -1; 8663 if (translate == 0) 8664 goto exit; 8665 ch2 = ascii_table[ch]; 8666 } 8667 if (ch2 == 0xfe) { 8668 if (ignore) 8669 continue; 8670 goto exit; 8671 } 8672 assert(ch2 < 128); 8673 *out = ch2; 8674 out++; 8675 } 8676 res = 1; 8677 8678exit: 8679 writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer); 8680 return res; 8681} 8682 8683PyObject * 8684_PyUnicode_TranslateCharmap(PyObject *input, 8685 PyObject *mapping, 8686 const char *errors) 8687{ 8688 /* input object */ 8689 char *data; 8690 Py_ssize_t size, i; 8691 int kind; 8692 /* 
output buffer */ 8693 _PyUnicodeWriter writer; 8694 /* error handler */ 8695 char *reason = "character maps to <undefined>"; 8696 PyObject *errorHandler = NULL; 8697 PyObject *exc = NULL; 8698 int ignore; 8699 int res; 8700 8701 if (mapping == NULL) { 8702 PyErr_BadArgument(); 8703 return NULL; 8704 } 8705 8706 if (PyUnicode_READY(input) == -1) 8707 return NULL; 8708 data = (char*)PyUnicode_DATA(input); 8709 kind = PyUnicode_KIND(input); 8710 size = PyUnicode_GET_LENGTH(input); 8711 8712 if (size == 0) { 8713 Py_INCREF(input); 8714 return input; 8715 } 8716 8717 /* allocate enough for a simple 1:1 translation without 8718 replacements, if we need more, we'll resize */ 8719 _PyUnicodeWriter_Init(&writer); 8720 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1) 8721 goto onError; 8722 8723 ignore = (errors != NULL && strcmp(errors, "ignore") == 0); 8724 8725 res = unicode_fast_translate(input, mapping, &writer, ignore); 8726 if (res < 0) { 8727 _PyUnicodeWriter_Dealloc(&writer); 8728 return NULL; 8729 } 8730 if (res == 1) 8731 return _PyUnicodeWriter_Finish(&writer); 8732 8733 i = writer.pos; 8734 while (i<size) { 8735 /* try to encode it */ 8736 int translate; 8737 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */ 8738 Py_ssize_t newpos; 8739 /* startpos for collecting untranslatable chars */ 8740 Py_ssize_t collstart; 8741 Py_ssize_t collend; 8742 Py_UCS4 ch; 8743 8744 ch = PyUnicode_READ(kind, data, i); 8745 translate = charmaptranslate_output(ch, mapping, &writer); 8746 if (translate < 0) 8747 goto onError; 8748 8749 if (translate != 0) { 8750 /* it worked => adjust input pointer */ 8751 ++i; 8752 continue; 8753 } 8754 8755 /* untranslatable character */ 8756 collstart = i; 8757 collend = i+1; 8758 8759 /* find all untranslatable characters */ 8760 while (collend < size) { 8761 PyObject *x; 8762 ch = PyUnicode_READ(kind, data, collend); 8763 if (charmaptranslate_lookup(ch, mapping, &x)) 8764 goto onError; 8765 Py_XDECREF(x); 8766 if (x != 
Py_None) 8767 break; 8768 ++collend; 8769 } 8770 8771 if (ignore) { 8772 i = collend; 8773 } 8774 else { 8775 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler, 8776 reason, input, &exc, 8777 collstart, collend, &newpos); 8778 if (repunicode == NULL) 8779 goto onError; 8780 if (_PyUnicodeWriter_WriteStr(&writer, repunicode) < 0) { 8781 Py_DECREF(repunicode); 8782 goto onError; 8783 } 8784 Py_DECREF(repunicode); 8785 i = newpos; 8786 } 8787 } 8788 Py_XDECREF(exc); 8789 Py_XDECREF(errorHandler); 8790 return _PyUnicodeWriter_Finish(&writer); 8791 8792 onError: 8793 _PyUnicodeWriter_Dealloc(&writer); 8794 Py_XDECREF(exc); 8795 Py_XDECREF(errorHandler); 8796 return NULL; 8797} 8798 8799/* Deprecated. Use PyUnicode_Translate instead. */ 8800PyObject * 8801PyUnicode_TranslateCharmap(const Py_UNICODE *p, 8802 Py_ssize_t size, 8803 PyObject *mapping, 8804 const char *errors) 8805{ 8806 PyObject *result; 8807 PyObject *unicode = PyUnicode_FromUnicode(p, size); 8808 if (!unicode) 8809 return NULL; 8810 result = _PyUnicode_TranslateCharmap(unicode, mapping, errors); 8811 Py_DECREF(unicode); 8812 return result; 8813} 8814 8815PyObject * 8816PyUnicode_Translate(PyObject *str, 8817 PyObject *mapping, 8818 const char *errors) 8819{ 8820 PyObject *result; 8821 8822 str = PyUnicode_FromObject(str); 8823 if (str == NULL) 8824 return NULL; 8825 result = _PyUnicode_TranslateCharmap(str, mapping, errors); 8826 Py_DECREF(str); 8827 return result; 8828} 8829 8830static Py_UCS4 8831fix_decimal_and_space_to_ascii(PyObject *self) 8832{ 8833 /* No need to call PyUnicode_READY(self) because this function is only 8834 called as a callback from fixup() which does it already. 
*/ 8835 const Py_ssize_t len = PyUnicode_GET_LENGTH(self); 8836 const int kind = PyUnicode_KIND(self); 8837 void *data = PyUnicode_DATA(self); 8838 Py_UCS4 maxchar = 127, ch, fixed; 8839 int modified = 0; 8840 Py_ssize_t i; 8841 8842 for (i = 0; i < len; ++i) { 8843 ch = PyUnicode_READ(kind, data, i); 8844 fixed = 0; 8845 if (ch > 127) { 8846 if (Py_UNICODE_ISSPACE(ch)) 8847 fixed = ' '; 8848 else { 8849 const int decimal = Py_UNICODE_TODECIMAL(ch); 8850 if (decimal >= 0) 8851 fixed = '0' + decimal; 8852 } 8853 if (fixed != 0) { 8854 modified = 1; 8855 maxchar = Py_MAX(maxchar, fixed); 8856 PyUnicode_WRITE(kind, data, i, fixed); 8857 } 8858 else 8859 maxchar = Py_MAX(maxchar, ch); 8860 } 8861 } 8862 8863 return (modified) ? maxchar : 0; 8864} 8865 8866PyObject * 8867_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode) 8868{ 8869 if (!PyUnicode_Check(unicode)) { 8870 PyErr_BadInternalCall(); 8871 return NULL; 8872 } 8873 if (PyUnicode_READY(unicode) == -1) 8874 return NULL; 8875 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) { 8876 /* If the string is already ASCII, just return the same string */ 8877 Py_INCREF(unicode); 8878 return unicode; 8879 } 8880 return fixup(unicode, fix_decimal_and_space_to_ascii); 8881} 8882 8883PyObject * 8884PyUnicode_TransformDecimalToASCII(Py_UNICODE *s, 8885 Py_ssize_t length) 8886{ 8887 PyObject *decimal; 8888 Py_ssize_t i; 8889 Py_UCS4 maxchar; 8890 enum PyUnicode_Kind kind; 8891 void *data; 8892 8893 maxchar = 127; 8894 for (i = 0; i < length; i++) { 8895 Py_UCS4 ch = s[i]; 8896 if (ch > 127) { 8897 int decimal = Py_UNICODE_TODECIMAL(ch); 8898 if (decimal >= 0) 8899 ch = '0' + decimal; 8900 maxchar = Py_MAX(maxchar, ch); 8901 } 8902 } 8903 8904 /* Copy to a new string */ 8905 decimal = PyUnicode_New(length, maxchar); 8906 if (decimal == NULL) 8907 return decimal; 8908 kind = PyUnicode_KIND(decimal); 8909 data = PyUnicode_DATA(decimal); 8910 /* Iterate over code points */ 8911 for (i = 0; i < length; i++) { 8912 Py_UCS4 ch = 
s[i]; 8913 if (ch > 127) { 8914 int decimal = Py_UNICODE_TODECIMAL(ch); 8915 if (decimal >= 0) 8916 ch = '0' + decimal; 8917 } 8918 PyUnicode_WRITE(kind, data, i, ch); 8919 } 8920 return unicode_result(decimal); 8921} 8922/* --- Decimal Encoder ---------------------------------------------------- */ 8923 8924int 8925PyUnicode_EncodeDecimal(Py_UNICODE *s, 8926 Py_ssize_t length, 8927 char *output, 8928 const char *errors) 8929{ 8930 PyObject *unicode; 8931 Py_ssize_t i; 8932 enum PyUnicode_Kind kind; 8933 void *data; 8934 8935 if (output == NULL) { 8936 PyErr_BadArgument(); 8937 return -1; 8938 } 8939 8940 unicode = PyUnicode_FromUnicode(s, length); 8941 if (unicode == NULL) 8942 return -1; 8943 8944 if (PyUnicode_READY(unicode) == -1) { 8945 Py_DECREF(unicode); 8946 return -1; 8947 } 8948 kind = PyUnicode_KIND(unicode); 8949 data = PyUnicode_DATA(unicode); 8950 8951 for (i=0; i < length; ) { 8952 PyObject *exc; 8953 Py_UCS4 ch; 8954 int decimal; 8955 Py_ssize_t startpos; 8956 8957 ch = PyUnicode_READ(kind, data, i); 8958 8959 if (Py_UNICODE_ISSPACE(ch)) { 8960 *output++ = ' '; 8961 i++; 8962 continue; 8963 } 8964 decimal = Py_UNICODE_TODECIMAL(ch); 8965 if (decimal >= 0) { 8966 *output++ = '0' + decimal; 8967 i++; 8968 continue; 8969 } 8970 if (0 < ch && ch < 256) { 8971 *output++ = (char)ch; 8972 i++; 8973 continue; 8974 } 8975 8976 startpos = i; 8977 exc = NULL; 8978 raise_encode_exception(&exc, "decimal", unicode, 8979 startpos, startpos+1, 8980 "invalid decimal Unicode string"); 8981 Py_XDECREF(exc); 8982 Py_DECREF(unicode); 8983 return -1; 8984 } 8985 /* 0-terminate the output string */ 8986 *output++ = '\0'; 8987 Py_DECREF(unicode); 8988 return 0; 8989} 8990 8991/* --- Helpers ------------------------------------------------------------ */ 8992 8993/* helper macro to fixup start/end slice values */ 8994#define ADJUST_INDICES(start, end, len) \ 8995 if (end > len) \ 8996 end = len; \ 8997 else if (end < 0) { \ 8998 end += len; \ 8999 if (end < 0) \ 9000 end = 
/* Search for `s2` inside the slice [start, end) of `s1`.

   direction: > 0 searches forward (the *_find_slice variants), anything
              else searches backward (the *_rfind_slice variants).
   start/end: slice bounds, normalized against len(s1) by ADJUST_INDICES.

   Both strings are used through PyUnicode_KIND/PyUnicode_DATA without a
   readiness check here -- presumably callers make them ready first (the
   visible caller PyUnicode_Find does).

   Returns the index of the match in `s1`, -1 if there is no match, or -2
   if converting `s2` to the kind of `s1` failed. */
static Py_ssize_t
any_find_slice(int direction, PyObject* s1, PyObject* s2,
               Py_ssize_t start,
               Py_ssize_t end)
{
    int kind1, kind2;
    void *buf1, *buf2;
    Py_ssize_t len1, len2, result;

    kind1 = PyUnicode_KIND(s1);
    kind2 = PyUnicode_KIND(s2);
    /* s2 is stored with wider characters than s1: reported as "not found"
       without searching */
    if (kind1 < kind2)
        return -1;

    len1 = PyUnicode_GET_LENGTH(s1);
    len2 = PyUnicode_GET_LENGTH(s2);
    ADJUST_INDICES(start, end, len1);
    /* the slice is shorter than the needle */
    if (end - start < len2)
        return -1;

    buf1 = PyUnicode_DATA(s1);
    buf2 = PyUnicode_DATA(s2);
    if (len2 == 1) {
        /* single-character needle: use findchar() on the slice; kind1 is
           used as the per-character byte width for the start offset, and
           the result is relative to `start` */
        Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
        result = findchar((const char *)buf1 + kind1*start,
                          kind1, end - start, ch, direction);
        if (result == -1)
            return -1;
        else
            return start + result;
    }

    if (kind2 != kind1) {
        /* widen the needle into a temporary buffer of s1's kind; it is
           released with PyMem_Free() at the end of the function */
        buf2 = _PyUnicode_AsKind(s2, kind1);
        if (!buf2)
            return -2;
    }

    if (direction > 0) {
        switch (kind1) {
        case PyUnicode_1BYTE_KIND:
            if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
                result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
            else
                result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
            break;
        case PyUnicode_2BYTE_KIND:
            result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
            break;
        case PyUnicode_4BYTE_KIND:
            result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
            break;
        default:
            assert(0); result = -2;
        }
    }
    else {
        switch (kind1) {
        case PyUnicode_1BYTE_KIND:
            if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
                result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
            else
                result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
            break;
        case PyUnicode_2BYTE_KIND:
            result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
            break;
        case PyUnicode_4BYTE_KIND:
            result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
            break;
        default:
            assert(0); result = -2;
        }
    }

    /* buf2 is only owned by this function when it was converted above */
    if (kind2 != kind1)
        PyMem_Free(buf2);

    return result;
}
ucs2lib_InsertThousandsGrouping( 9142 (Py_UCS2 *) data, n_buffer, (Py_UCS2 *) digits, n_digits, 9143 min_width, grouping, 9144 (Py_UCS2 *) thousands_sep_data, thousands_sep_len); 9145 break; 9146 case PyUnicode_4BYTE_KIND: 9147 len = ucs4lib_InsertThousandsGrouping( 9148 (Py_UCS4 *) data, n_buffer, (Py_UCS4 *) digits, n_digits, 9149 min_width, grouping, 9150 (Py_UCS4 *) thousands_sep_data, thousands_sep_len); 9151 break; 9152 default: 9153 assert(0); 9154 return -1; 9155 } 9156 if (unicode != NULL && thousands_sep_kind != kind) { 9157 if (thousands_sep_kind < kind) 9158 PyMem_Free(thousands_sep_data); 9159 else 9160 PyMem_Free(data); 9161 } 9162 if (unicode == NULL) { 9163 *maxchar = 127; 9164 if (len != n_digits) { 9165 *maxchar = Py_MAX(*maxchar, 9166 PyUnicode_MAX_CHAR_VALUE(thousands_sep)); 9167 } 9168 } 9169 return len; 9170} 9171 9172 9173Py_ssize_t 9174PyUnicode_Count(PyObject *str, 9175 PyObject *substr, 9176 Py_ssize_t start, 9177 Py_ssize_t end) 9178{ 9179 Py_ssize_t result; 9180 PyObject* str_obj; 9181 PyObject* sub_obj; 9182 int kind1, kind2; 9183 void *buf1 = NULL, *buf2 = NULL; 9184 Py_ssize_t len1, len2; 9185 9186 str_obj = PyUnicode_FromObject(str); 9187 if (!str_obj) 9188 return -1; 9189 sub_obj = PyUnicode_FromObject(substr); 9190 if (!sub_obj) { 9191 Py_DECREF(str_obj); 9192 return -1; 9193 } 9194 if (PyUnicode_READY(sub_obj) == -1 || PyUnicode_READY(str_obj) == -1) { 9195 Py_DECREF(sub_obj); 9196 Py_DECREF(str_obj); 9197 return -1; 9198 } 9199 9200 kind1 = PyUnicode_KIND(str_obj); 9201 kind2 = PyUnicode_KIND(sub_obj); 9202 if (kind1 < kind2) { 9203 Py_DECREF(sub_obj); 9204 Py_DECREF(str_obj); 9205 return 0; 9206 } 9207 9208 len1 = PyUnicode_GET_LENGTH(str_obj); 9209 len2 = PyUnicode_GET_LENGTH(sub_obj); 9210 ADJUST_INDICES(start, end, len1); 9211 if (end - start < len2) { 9212 Py_DECREF(sub_obj); 9213 Py_DECREF(str_obj); 9214 return 0; 9215 } 9216 9217 buf1 = PyUnicode_DATA(str_obj); 9218 buf2 = PyUnicode_DATA(sub_obj); 9219 if (kind2 != kind1) 
{ 9220 buf2 = _PyUnicode_AsKind(sub_obj, kind1); 9221 if (!buf2) 9222 goto onError; 9223 } 9224 9225 switch (kind1) { 9226 case PyUnicode_1BYTE_KIND: 9227 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sub_obj)) 9228 result = asciilib_count( 9229 ((Py_UCS1*)buf1) + start, end - start, 9230 buf2, len2, PY_SSIZE_T_MAX 9231 ); 9232 else 9233 result = ucs1lib_count( 9234 ((Py_UCS1*)buf1) + start, end - start, 9235 buf2, len2, PY_SSIZE_T_MAX 9236 ); 9237 break; 9238 case PyUnicode_2BYTE_KIND: 9239 result = ucs2lib_count( 9240 ((Py_UCS2*)buf1) + start, end - start, 9241 buf2, len2, PY_SSIZE_T_MAX 9242 ); 9243 break; 9244 case PyUnicode_4BYTE_KIND: 9245 result = ucs4lib_count( 9246 ((Py_UCS4*)buf1) + start, end - start, 9247 buf2, len2, PY_SSIZE_T_MAX 9248 ); 9249 break; 9250 default: 9251 assert(0); result = 0; 9252 } 9253 9254 Py_DECREF(sub_obj); 9255 Py_DECREF(str_obj); 9256 9257 if (kind2 != kind1) 9258 PyMem_Free(buf2); 9259 9260 return result; 9261 onError: 9262 Py_DECREF(sub_obj); 9263 Py_DECREF(str_obj); 9264 if (kind2 != kind1 && buf2) 9265 PyMem_Free(buf2); 9266 return -1; 9267} 9268 9269Py_ssize_t 9270PyUnicode_Find(PyObject *str, 9271 PyObject *sub, 9272 Py_ssize_t start, 9273 Py_ssize_t end, 9274 int direction) 9275{ 9276 Py_ssize_t result; 9277 9278 str = PyUnicode_FromObject(str); 9279 if (!str) 9280 return -2; 9281 sub = PyUnicode_FromObject(sub); 9282 if (!sub) { 9283 Py_DECREF(str); 9284 return -2; 9285 } 9286 if (PyUnicode_READY(sub) == -1 || PyUnicode_READY(str) == -1) { 9287 Py_DECREF(sub); 9288 Py_DECREF(str); 9289 return -2; 9290 } 9291 9292 result = any_find_slice(direction, 9293 str, sub, start, end 9294 ); 9295 9296 Py_DECREF(str); 9297 Py_DECREF(sub); 9298 9299 return result; 9300} 9301 9302Py_ssize_t 9303PyUnicode_FindChar(PyObject *str, Py_UCS4 ch, 9304 Py_ssize_t start, Py_ssize_t end, 9305 int direction) 9306{ 9307 int kind; 9308 Py_ssize_t result; 9309 if (PyUnicode_READY(str) == -1) 9310 return -2; 9311 if (start < 0 || end < 0) { 
9312 PyErr_SetString(PyExc_IndexError, "string index out of range"); 9313 return -2; 9314 } 9315 if (end > PyUnicode_GET_LENGTH(str)) 9316 end = PyUnicode_GET_LENGTH(str); 9317 if (start >= end) 9318 return -1; 9319 kind = PyUnicode_KIND(str); 9320 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start, 9321 kind, end-start, ch, direction); 9322 if (result == -1) 9323 return -1; 9324 else 9325 return start + result; 9326} 9327 9328static int 9329tailmatch(PyObject *self, 9330 PyObject *substring, 9331 Py_ssize_t start, 9332 Py_ssize_t end, 9333 int direction) 9334{ 9335 int kind_self; 9336 int kind_sub; 9337 void *data_self; 9338 void *data_sub; 9339 Py_ssize_t offset; 9340 Py_ssize_t i; 9341 Py_ssize_t end_sub; 9342 9343 if (PyUnicode_READY(self) == -1 || 9344 PyUnicode_READY(substring) == -1) 9345 return -1; 9346 9347 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self)); 9348 end -= PyUnicode_GET_LENGTH(substring); 9349 if (end < start) 9350 return 0; 9351 9352 if (PyUnicode_GET_LENGTH(substring) == 0) 9353 return 1; 9354 9355 kind_self = PyUnicode_KIND(self); 9356 data_self = PyUnicode_DATA(self); 9357 kind_sub = PyUnicode_KIND(substring); 9358 data_sub = PyUnicode_DATA(substring); 9359 end_sub = PyUnicode_GET_LENGTH(substring) - 1; 9360 9361 if (direction > 0) 9362 offset = end; 9363 else 9364 offset = start; 9365 9366 if (PyUnicode_READ(kind_self, data_self, offset) == 9367 PyUnicode_READ(kind_sub, data_sub, 0) && 9368 PyUnicode_READ(kind_self, data_self, offset + end_sub) == 9369 PyUnicode_READ(kind_sub, data_sub, end_sub)) { 9370 /* If both are of the same kind, memcmp is sufficient */ 9371 if (kind_self == kind_sub) { 9372 return ! 
memcmp((char *)data_self + 9373 (offset * PyUnicode_KIND(substring)), 9374 data_sub, 9375 PyUnicode_GET_LENGTH(substring) * 9376 PyUnicode_KIND(substring)); 9377 } 9378 /* otherwise we have to compare each character by first accesing it */ 9379 else { 9380 /* We do not need to compare 0 and len(substring)-1 because 9381 the if statement above ensured already that they are equal 9382 when we end up here. */ 9383 for (i = 1; i < end_sub; ++i) { 9384 if (PyUnicode_READ(kind_self, data_self, offset + i) != 9385 PyUnicode_READ(kind_sub, data_sub, i)) 9386 return 0; 9387 } 9388 return 1; 9389 } 9390 } 9391 9392 return 0; 9393} 9394 9395Py_ssize_t 9396PyUnicode_Tailmatch(PyObject *str, 9397 PyObject *substr, 9398 Py_ssize_t start, 9399 Py_ssize_t end, 9400 int direction) 9401{ 9402 Py_ssize_t result; 9403 9404 str = PyUnicode_FromObject(str); 9405 if (str == NULL) 9406 return -1; 9407 substr = PyUnicode_FromObject(substr); 9408 if (substr == NULL) { 9409 Py_DECREF(str); 9410 return -1; 9411 } 9412 9413 result = tailmatch(str, substr, 9414 start, end, direction); 9415 Py_DECREF(str); 9416 Py_DECREF(substr); 9417 return result; 9418} 9419 9420/* Apply fixfct filter to the Unicode object self and return a 9421 reference to the modified object */ 9422 9423static PyObject * 9424fixup(PyObject *self, 9425 Py_UCS4 (*fixfct)(PyObject *s)) 9426{ 9427 PyObject *u; 9428 Py_UCS4 maxchar_old, maxchar_new = 0; 9429 PyObject *v; 9430 9431 u = _PyUnicode_Copy(self); 9432 if (u == NULL) 9433 return NULL; 9434 maxchar_old = PyUnicode_MAX_CHAR_VALUE(u); 9435 9436 /* fix functions return the new maximum character in a string, 9437 if the kind of the resulting unicode object does not change, 9438 everything is fine. Otherwise we need to change the string kind 9439 and re-run the fix function. 
*/ 9440 maxchar_new = fixfct(u); 9441 9442 if (maxchar_new == 0) { 9443 /* no changes */; 9444 if (PyUnicode_CheckExact(self)) { 9445 Py_DECREF(u); 9446 Py_INCREF(self); 9447 return self; 9448 } 9449 else 9450 return u; 9451 } 9452 9453 maxchar_new = align_maxchar(maxchar_new); 9454 9455 if (maxchar_new == maxchar_old) 9456 return u; 9457 9458 /* In case the maximum character changed, we need to 9459 convert the string to the new category. */ 9460 v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new); 9461 if (v == NULL) { 9462 Py_DECREF(u); 9463 return NULL; 9464 } 9465 if (maxchar_new > maxchar_old) { 9466 /* If the maxchar increased so that the kind changed, not all 9467 characters are representable anymore and we need to fix the 9468 string again. This only happens in very few cases. */ 9469 _PyUnicode_FastCopyCharacters(v, 0, 9470 self, 0, PyUnicode_GET_LENGTH(self)); 9471 maxchar_old = fixfct(v); 9472 assert(maxchar_old > 0 && maxchar_old <= maxchar_new); 9473 } 9474 else { 9475 _PyUnicode_FastCopyCharacters(v, 0, 9476 u, 0, PyUnicode_GET_LENGTH(self)); 9477 } 9478 Py_DECREF(u); 9479 assert(_PyUnicode_CheckConsistency(v, 1)); 9480 return v; 9481} 9482 9483static PyObject * 9484ascii_upper_or_lower(PyObject *self, int lower) 9485{ 9486 Py_ssize_t len = PyUnicode_GET_LENGTH(self); 9487 char *resdata, *data = PyUnicode_DATA(self); 9488 PyObject *res; 9489 9490 res = PyUnicode_New(len, 127); 9491 if (res == NULL) 9492 return NULL; 9493 resdata = PyUnicode_DATA(res); 9494 if (lower) 9495 _Py_bytes_lower(resdata, data, len); 9496 else 9497 _Py_bytes_upper(resdata, data, len); 9498 return res; 9499} 9500 9501static Py_UCS4 9502handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i) 9503{ 9504 Py_ssize_t j; 9505 int final_sigma; 9506 Py_UCS4 c = 0; /* initialize to prevent gcc warning */ 9507 /* U+03A3 is in the Final_Sigma context when, it is found like this: 9508 9509 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased}) 9510 
9511 where ! is a negation and \p{xxx} is a character with property xxx. 9512 */ 9513 for (j = i - 1; j >= 0; j--) { 9514 c = PyUnicode_READ(kind, data, j); 9515 if (!_PyUnicode_IsCaseIgnorable(c)) 9516 break; 9517 } 9518 final_sigma = j >= 0 && _PyUnicode_IsCased(c); 9519 if (final_sigma) { 9520 for (j = i + 1; j < length; j++) { 9521 c = PyUnicode_READ(kind, data, j); 9522 if (!_PyUnicode_IsCaseIgnorable(c)) 9523 break; 9524 } 9525 final_sigma = j == length || !_PyUnicode_IsCased(c); 9526 } 9527 return (final_sigma) ? 0x3C2 : 0x3C3; 9528} 9529 9530static int 9531lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i, 9532 Py_UCS4 c, Py_UCS4 *mapped) 9533{ 9534 /* Obscure special case. */ 9535 if (c == 0x3A3) { 9536 mapped[0] = handle_capital_sigma(kind, data, length, i); 9537 return 1; 9538 } 9539 return _PyUnicode_ToLowerFull(c, mapped); 9540} 9541 9542static Py_ssize_t 9543do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) 9544{ 9545 Py_ssize_t i, k = 0; 9546 int n_res, j; 9547 Py_UCS4 c, mapped[3]; 9548 9549 c = PyUnicode_READ(kind, data, 0); 9550 n_res = _PyUnicode_ToUpperFull(c, mapped); 9551 for (j = 0; j < n_res; j++) { 9552 *maxchar = Py_MAX(*maxchar, mapped[j]); 9553 res[k++] = mapped[j]; 9554 } 9555 for (i = 1; i < length; i++) { 9556 c = PyUnicode_READ(kind, data, i); 9557 n_res = lower_ucs4(kind, data, length, i, c, mapped); 9558 for (j = 0; j < n_res; j++) { 9559 *maxchar = Py_MAX(*maxchar, mapped[j]); 9560 res[k++] = mapped[j]; 9561 } 9562 } 9563 return k; 9564} 9565 9566static Py_ssize_t 9567do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) { 9568 Py_ssize_t i, k = 0; 9569 9570 for (i = 0; i < length; i++) { 9571 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3]; 9572 int n_res, j; 9573 if (Py_UNICODE_ISUPPER(c)) { 9574 n_res = lower_ucs4(kind, data, length, i, c, mapped); 9575 } 9576 else if (Py_UNICODE_ISLOWER(c)) { 9577 n_res = _PyUnicode_ToUpperFull(c, mapped); 
9578 } 9579 else { 9580 n_res = 1; 9581 mapped[0] = c; 9582 } 9583 for (j = 0; j < n_res; j++) { 9584 *maxchar = Py_MAX(*maxchar, mapped[j]); 9585 res[k++] = mapped[j]; 9586 } 9587 } 9588 return k; 9589} 9590 9591static Py_ssize_t 9592do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, 9593 Py_UCS4 *maxchar, int lower) 9594{ 9595 Py_ssize_t i, k = 0; 9596 9597 for (i = 0; i < length; i++) { 9598 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3]; 9599 int n_res, j; 9600 if (lower) 9601 n_res = lower_ucs4(kind, data, length, i, c, mapped); 9602 else 9603 n_res = _PyUnicode_ToUpperFull(c, mapped); 9604 for (j = 0; j < n_res; j++) { 9605 *maxchar = Py_MAX(*maxchar, mapped[j]); 9606 res[k++] = mapped[j]; 9607 } 9608 } 9609 return k; 9610} 9611 9612static Py_ssize_t 9613do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) 9614{ 9615 return do_upper_or_lower(kind, data, length, res, maxchar, 0); 9616} 9617 9618static Py_ssize_t 9619do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) 9620{ 9621 return do_upper_or_lower(kind, data, length, res, maxchar, 1); 9622} 9623 9624static Py_ssize_t 9625do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) 9626{ 9627 Py_ssize_t i, k = 0; 9628 9629 for (i = 0; i < length; i++) { 9630 Py_UCS4 c = PyUnicode_READ(kind, data, i); 9631 Py_UCS4 mapped[3]; 9632 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped); 9633 for (j = 0; j < n_res; j++) { 9634 *maxchar = Py_MAX(*maxchar, mapped[j]); 9635 res[k++] = mapped[j]; 9636 } 9637 } 9638 return k; 9639} 9640 9641static Py_ssize_t 9642do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) 9643{ 9644 Py_ssize_t i, k = 0; 9645 int previous_is_cased; 9646 9647 previous_is_cased = 0; 9648 for (i = 0; i < length; i++) { 9649 const Py_UCS4 c = PyUnicode_READ(kind, data, i); 9650 Py_UCS4 mapped[3]; 9651 int n_res, j; 9652 9653 if (previous_is_cased) 9654 n_res = 
lower_ucs4(kind, data, length, i, c, mapped); 9655 else 9656 n_res = _PyUnicode_ToTitleFull(c, mapped); 9657 9658 for (j = 0; j < n_res; j++) { 9659 *maxchar = Py_MAX(*maxchar, mapped[j]); 9660 res[k++] = mapped[j]; 9661 } 9662 9663 previous_is_cased = _PyUnicode_IsCased(c); 9664 } 9665 return k; 9666} 9667 9668static PyObject * 9669case_operation(PyObject *self, 9670 Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *)) 9671{ 9672 PyObject *res = NULL; 9673 Py_ssize_t length, newlength = 0; 9674 int kind, outkind; 9675 void *data, *outdata; 9676 Py_UCS4 maxchar = 0, *tmp, *tmpend; 9677 9678 assert(PyUnicode_IS_READY(self)); 9679 9680 kind = PyUnicode_KIND(self); 9681 data = PyUnicode_DATA(self); 9682 length = PyUnicode_GET_LENGTH(self); 9683 if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) { 9684 PyErr_SetString(PyExc_OverflowError, "string is too long"); 9685 return NULL; 9686 } 9687 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length); 9688 if (tmp == NULL) 9689 return PyErr_NoMemory(); 9690 newlength = perform(kind, data, length, tmp, &maxchar); 9691 res = PyUnicode_New(newlength, maxchar); 9692 if (res == NULL) 9693 goto leave; 9694 tmpend = tmp + newlength; 9695 outdata = PyUnicode_DATA(res); 9696 outkind = PyUnicode_KIND(res); 9697 switch (outkind) { 9698 case PyUnicode_1BYTE_KIND: 9699 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata); 9700 break; 9701 case PyUnicode_2BYTE_KIND: 9702 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata); 9703 break; 9704 case PyUnicode_4BYTE_KIND: 9705 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength); 9706 break; 9707 default: 9708 assert(0); 9709 break; 9710 } 9711 leave: 9712 PyMem_FREE(tmp); 9713 return res; 9714} 9715 9716PyObject * 9717PyUnicode_Join(PyObject *separator, PyObject *seq) 9718{ 9719 PyObject *sep = NULL; 9720 Py_ssize_t seplen; 9721 PyObject *res = NULL; /* the result */ 9722 PyObject *fseq; /* PySequence_Fast(seq) */ 9723 Py_ssize_t seqlen; /* len(fseq) 
-- number of items in sequence */ 9724 PyObject **items; 9725 PyObject *item; 9726 Py_ssize_t sz, i, res_offset; 9727 Py_UCS4 maxchar; 9728 Py_UCS4 item_maxchar; 9729 int use_memcpy; 9730 unsigned char *res_data = NULL, *sep_data = NULL; 9731 PyObject *last_obj; 9732 unsigned int kind = 0; 9733 9734 fseq = PySequence_Fast(seq, "can only join an iterable"); 9735 if (fseq == NULL) { 9736 return NULL; 9737 } 9738 9739 /* NOTE: the following code can't call back into Python code, 9740 * so we are sure that fseq won't be mutated. 9741 */ 9742 9743 seqlen = PySequence_Fast_GET_SIZE(fseq); 9744 /* If empty sequence, return u"". */ 9745 if (seqlen == 0) { 9746 Py_DECREF(fseq); 9747 _Py_RETURN_UNICODE_EMPTY(); 9748 } 9749 9750 /* If singleton sequence with an exact Unicode, return that. */ 9751 last_obj = NULL; 9752 items = PySequence_Fast_ITEMS(fseq); 9753 if (seqlen == 1) { 9754 if (PyUnicode_CheckExact(items[0])) { 9755 res = items[0]; 9756 Py_INCREF(res); 9757 Py_DECREF(fseq); 9758 return res; 9759 } 9760 seplen = 0; 9761 maxchar = 0; 9762 } 9763 else { 9764 /* Set up sep and seplen */ 9765 if (separator == NULL) { 9766 /* fall back to a blank space separator */ 9767 sep = PyUnicode_FromOrdinal(' '); 9768 if (!sep) 9769 goto onError; 9770 seplen = 1; 9771 maxchar = 32; 9772 } 9773 else { 9774 if (!PyUnicode_Check(separator)) { 9775 PyErr_Format(PyExc_TypeError, 9776 "separator: expected str instance," 9777 " %.80s found", 9778 Py_TYPE(separator)->tp_name); 9779 goto onError; 9780 } 9781 if (PyUnicode_READY(separator)) 9782 goto onError; 9783 sep = separator; 9784 seplen = PyUnicode_GET_LENGTH(separator); 9785 maxchar = PyUnicode_MAX_CHAR_VALUE(separator); 9786 /* inc refcount to keep this code path symmetric with the 9787 above case of a blank separator */ 9788 Py_INCREF(sep); 9789 } 9790 last_obj = sep; 9791 } 9792 9793 /* There are at least two things to join, or else we have a subclass 9794 * of str in the sequence. 
9795 * Do a pre-pass to figure out the total amount of space we'll 9796 * need (sz), and see whether all argument are strings. 9797 */ 9798 sz = 0; 9799#ifdef Py_DEBUG 9800 use_memcpy = 0; 9801#else 9802 use_memcpy = 1; 9803#endif 9804 for (i = 0; i < seqlen; i++) { 9805 const Py_ssize_t old_sz = sz; 9806 item = items[i]; 9807 if (!PyUnicode_Check(item)) { 9808 PyErr_Format(PyExc_TypeError, 9809 "sequence item %zd: expected str instance," 9810 " %.80s found", 9811 i, Py_TYPE(item)->tp_name); 9812 goto onError; 9813 } 9814 if (PyUnicode_READY(item) == -1) 9815 goto onError; 9816 sz += PyUnicode_GET_LENGTH(item); 9817 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item); 9818 maxchar = Py_MAX(maxchar, item_maxchar); 9819 if (i != 0) 9820 sz += seplen; 9821 if (sz < old_sz || sz > PY_SSIZE_T_MAX) { 9822 PyErr_SetString(PyExc_OverflowError, 9823 "join() result is too long for a Python string"); 9824 goto onError; 9825 } 9826 if (use_memcpy && last_obj != NULL) { 9827 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item)) 9828 use_memcpy = 0; 9829 } 9830 last_obj = item; 9831 } 9832 9833 res = PyUnicode_New(sz, maxchar); 9834 if (res == NULL) 9835 goto onError; 9836 9837 /* Catenate everything. */ 9838#ifdef Py_DEBUG 9839 use_memcpy = 0; 9840#else 9841 if (use_memcpy) { 9842 res_data = PyUnicode_1BYTE_DATA(res); 9843 kind = PyUnicode_KIND(res); 9844 if (seplen != 0) 9845 sep_data = PyUnicode_1BYTE_DATA(sep); 9846 } 9847#endif 9848 if (use_memcpy) { 9849 for (i = 0; i < seqlen; ++i) { 9850 Py_ssize_t itemlen; 9851 item = items[i]; 9852 9853 /* Copy item, and maybe the separator. 
*/ 9854 if (i && seplen != 0) { 9855 Py_MEMCPY(res_data, 9856 sep_data, 9857 kind * seplen); 9858 res_data += kind * seplen; 9859 } 9860 9861 itemlen = PyUnicode_GET_LENGTH(item); 9862 if (itemlen != 0) { 9863 Py_MEMCPY(res_data, 9864 PyUnicode_DATA(item), 9865 kind * itemlen); 9866 res_data += kind * itemlen; 9867 } 9868 } 9869 assert(res_data == PyUnicode_1BYTE_DATA(res) 9870 + kind * PyUnicode_GET_LENGTH(res)); 9871 } 9872 else { 9873 for (i = 0, res_offset = 0; i < seqlen; ++i) { 9874 Py_ssize_t itemlen; 9875 item = items[i]; 9876 9877 /* Copy item, and maybe the separator. */ 9878 if (i && seplen != 0) { 9879 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen); 9880 res_offset += seplen; 9881 } 9882 9883 itemlen = PyUnicode_GET_LENGTH(item); 9884 if (itemlen != 0) { 9885 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen); 9886 res_offset += itemlen; 9887 } 9888 } 9889 assert(res_offset == PyUnicode_GET_LENGTH(res)); 9890 } 9891 9892 Py_DECREF(fseq); 9893 Py_XDECREF(sep); 9894 assert(_PyUnicode_CheckConsistency(res, 1)); 9895 return res; 9896 9897 onError: 9898 Py_DECREF(fseq); 9899 Py_XDECREF(sep); 9900 Py_XDECREF(res); 9901 return NULL; 9902} 9903 9904#define FILL(kind, data, value, start, length) \ 9905 do { \ 9906 Py_ssize_t i_ = 0; \ 9907 assert(kind != PyUnicode_WCHAR_KIND); \ 9908 switch ((kind)) { \ 9909 case PyUnicode_1BYTE_KIND: { \ 9910 unsigned char * to_ = (unsigned char *)((data)) + (start); \ 9911 memset(to_, (unsigned char)value, (length)); \ 9912 break; \ 9913 } \ 9914 case PyUnicode_2BYTE_KIND: { \ 9915 Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \ 9916 for (; i_ < (length); ++i_, ++to_) *to_ = (value); \ 9917 break; \ 9918 } \ 9919 case PyUnicode_4BYTE_KIND: { \ 9920 Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \ 9921 for (; i_ < (length); ++i_, ++to_) *to_ = (value); \ 9922 break; \ 9923 } \ 9924 default: assert(0); \ 9925 } \ 9926 } while (0) 9927 9928void 9929_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t 
start, Py_ssize_t length, 9930 Py_UCS4 fill_char) 9931{ 9932 const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode); 9933 const void *data = PyUnicode_DATA(unicode); 9934 assert(PyUnicode_IS_READY(unicode)); 9935 assert(unicode_modifiable(unicode)); 9936 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode)); 9937 assert(start >= 0); 9938 assert(start + length <= PyUnicode_GET_LENGTH(unicode)); 9939 FILL(kind, data, fill_char, start, length); 9940} 9941 9942Py_ssize_t 9943PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length, 9944 Py_UCS4 fill_char) 9945{ 9946 Py_ssize_t maxlen; 9947 9948 if (!PyUnicode_Check(unicode)) { 9949 PyErr_BadInternalCall(); 9950 return -1; 9951 } 9952 if (PyUnicode_READY(unicode) == -1) 9953 return -1; 9954 if (unicode_check_modifiable(unicode)) 9955 return -1; 9956 9957 if (start < 0) { 9958 PyErr_SetString(PyExc_IndexError, "string index out of range"); 9959 return -1; 9960 } 9961 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) { 9962 PyErr_SetString(PyExc_ValueError, 9963 "fill character is bigger than " 9964 "the string maximum character"); 9965 return -1; 9966 } 9967 9968 maxlen = PyUnicode_GET_LENGTH(unicode) - start; 9969 length = Py_MIN(maxlen, length); 9970 if (length <= 0) 9971 return 0; 9972 9973 _PyUnicode_FastFill(unicode, start, length, fill_char); 9974 return length; 9975} 9976 9977static PyObject * 9978pad(PyObject *self, 9979 Py_ssize_t left, 9980 Py_ssize_t right, 9981 Py_UCS4 fill) 9982{ 9983 PyObject *u; 9984 Py_UCS4 maxchar; 9985 int kind; 9986 void *data; 9987 9988 if (left < 0) 9989 left = 0; 9990 if (right < 0) 9991 right = 0; 9992 9993 if (left == 0 && right == 0) 9994 return unicode_result_unchanged(self); 9995 9996 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) || 9997 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) { 9998 PyErr_SetString(PyExc_OverflowError, "padded string is too long"); 9999 return NULL; 10000 } 10001 maxchar = PyUnicode_MAX_CHAR_VALUE(self); 10002 maxchar 
= Py_MAX(maxchar, fill); 10003 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar); 10004 if (!u) 10005 return NULL; 10006 10007 kind = PyUnicode_KIND(u); 10008 data = PyUnicode_DATA(u); 10009 if (left) 10010 FILL(kind, data, fill, 0, left); 10011 if (right) 10012 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right); 10013 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self)); 10014 assert(_PyUnicode_CheckConsistency(u, 1)); 10015 return u; 10016} 10017 10018PyObject * 10019PyUnicode_Splitlines(PyObject *string, int keepends) 10020{ 10021 PyObject *list; 10022 10023 string = PyUnicode_FromObject(string); 10024 if (string == NULL) 10025 return NULL; 10026 if (PyUnicode_READY(string) == -1) { 10027 Py_DECREF(string); 10028 return NULL; 10029 } 10030 10031 switch (PyUnicode_KIND(string)) { 10032 case PyUnicode_1BYTE_KIND: 10033 if (PyUnicode_IS_ASCII(string)) 10034 list = asciilib_splitlines( 10035 string, PyUnicode_1BYTE_DATA(string), 10036 PyUnicode_GET_LENGTH(string), keepends); 10037 else 10038 list = ucs1lib_splitlines( 10039 string, PyUnicode_1BYTE_DATA(string), 10040 PyUnicode_GET_LENGTH(string), keepends); 10041 break; 10042 case PyUnicode_2BYTE_KIND: 10043 list = ucs2lib_splitlines( 10044 string, PyUnicode_2BYTE_DATA(string), 10045 PyUnicode_GET_LENGTH(string), keepends); 10046 break; 10047 case PyUnicode_4BYTE_KIND: 10048 list = ucs4lib_splitlines( 10049 string, PyUnicode_4BYTE_DATA(string), 10050 PyUnicode_GET_LENGTH(string), keepends); 10051 break; 10052 default: 10053 assert(0); 10054 list = 0; 10055 } 10056 Py_DECREF(string); 10057 return list; 10058} 10059 10060static PyObject * 10061split(PyObject *self, 10062 PyObject *substring, 10063 Py_ssize_t maxcount) 10064{ 10065 int kind1, kind2; 10066 void *buf1, *buf2; 10067 Py_ssize_t len1, len2; 10068 PyObject* out; 10069 10070 if (maxcount < 0) 10071 maxcount = PY_SSIZE_T_MAX; 10072 10073 if (PyUnicode_READY(self) == -1) 10074 return NULL; 10075 10076 if 
(substring == NULL) 10077 switch (PyUnicode_KIND(self)) { 10078 case PyUnicode_1BYTE_KIND: 10079 if (PyUnicode_IS_ASCII(self)) 10080 return asciilib_split_whitespace( 10081 self, PyUnicode_1BYTE_DATA(self), 10082 PyUnicode_GET_LENGTH(self), maxcount 10083 ); 10084 else 10085 return ucs1lib_split_whitespace( 10086 self, PyUnicode_1BYTE_DATA(self), 10087 PyUnicode_GET_LENGTH(self), maxcount 10088 ); 10089 case PyUnicode_2BYTE_KIND: 10090 return ucs2lib_split_whitespace( 10091 self, PyUnicode_2BYTE_DATA(self), 10092 PyUnicode_GET_LENGTH(self), maxcount 10093 ); 10094 case PyUnicode_4BYTE_KIND: 10095 return ucs4lib_split_whitespace( 10096 self, PyUnicode_4BYTE_DATA(self), 10097 PyUnicode_GET_LENGTH(self), maxcount 10098 ); 10099 default: 10100 assert(0); 10101 return NULL; 10102 } 10103 10104 if (PyUnicode_READY(substring) == -1) 10105 return NULL; 10106 10107 kind1 = PyUnicode_KIND(self); 10108 kind2 = PyUnicode_KIND(substring); 10109 len1 = PyUnicode_GET_LENGTH(self); 10110 len2 = PyUnicode_GET_LENGTH(substring); 10111 if (kind1 < kind2 || len1 < len2) { 10112 out = PyList_New(1); 10113 if (out == NULL) 10114 return NULL; 10115 Py_INCREF(self); 10116 PyList_SET_ITEM(out, 0, self); 10117 return out; 10118 } 10119 buf1 = PyUnicode_DATA(self); 10120 buf2 = PyUnicode_DATA(substring); 10121 if (kind2 != kind1) { 10122 buf2 = _PyUnicode_AsKind(substring, kind1); 10123 if (!buf2) 10124 return NULL; 10125 } 10126 10127 switch (kind1) { 10128 case PyUnicode_1BYTE_KIND: 10129 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring)) 10130 out = asciilib_split( 10131 self, buf1, len1, buf2, len2, maxcount); 10132 else 10133 out = ucs1lib_split( 10134 self, buf1, len1, buf2, len2, maxcount); 10135 break; 10136 case PyUnicode_2BYTE_KIND: 10137 out = ucs2lib_split( 10138 self, buf1, len1, buf2, len2, maxcount); 10139 break; 10140 case PyUnicode_4BYTE_KIND: 10141 out = ucs4lib_split( 10142 self, buf1, len1, buf2, len2, maxcount); 10143 break; 10144 default: 10145 out = NULL; 
10146 } 10147 if (kind2 != kind1) 10148 PyMem_Free(buf2); 10149 return out; 10150} 10151 10152static PyObject * 10153rsplit(PyObject *self, 10154 PyObject *substring, 10155 Py_ssize_t maxcount) 10156{ 10157 int kind1, kind2; 10158 void *buf1, *buf2; 10159 Py_ssize_t len1, len2; 10160 PyObject* out; 10161 10162 if (maxcount < 0) 10163 maxcount = PY_SSIZE_T_MAX; 10164 10165 if (PyUnicode_READY(self) == -1) 10166 return NULL; 10167 10168 if (substring == NULL) 10169 switch (PyUnicode_KIND(self)) { 10170 case PyUnicode_1BYTE_KIND: 10171 if (PyUnicode_IS_ASCII(self)) 10172 return asciilib_rsplit_whitespace( 10173 self, PyUnicode_1BYTE_DATA(self), 10174 PyUnicode_GET_LENGTH(self), maxcount 10175 ); 10176 else 10177 return ucs1lib_rsplit_whitespace( 10178 self, PyUnicode_1BYTE_DATA(self), 10179 PyUnicode_GET_LENGTH(self), maxcount 10180 ); 10181 case PyUnicode_2BYTE_KIND: 10182 return ucs2lib_rsplit_whitespace( 10183 self, PyUnicode_2BYTE_DATA(self), 10184 PyUnicode_GET_LENGTH(self), maxcount 10185 ); 10186 case PyUnicode_4BYTE_KIND: 10187 return ucs4lib_rsplit_whitespace( 10188 self, PyUnicode_4BYTE_DATA(self), 10189 PyUnicode_GET_LENGTH(self), maxcount 10190 ); 10191 default: 10192 assert(0); 10193 return NULL; 10194 } 10195 10196 if (PyUnicode_READY(substring) == -1) 10197 return NULL; 10198 10199 kind1 = PyUnicode_KIND(self); 10200 kind2 = PyUnicode_KIND(substring); 10201 len1 = PyUnicode_GET_LENGTH(self); 10202 len2 = PyUnicode_GET_LENGTH(substring); 10203 if (kind1 < kind2 || len1 < len2) { 10204 out = PyList_New(1); 10205 if (out == NULL) 10206 return NULL; 10207 Py_INCREF(self); 10208 PyList_SET_ITEM(out, 0, self); 10209 return out; 10210 } 10211 buf1 = PyUnicode_DATA(self); 10212 buf2 = PyUnicode_DATA(substring); 10213 if (kind2 != kind1) { 10214 buf2 = _PyUnicode_AsKind(substring, kind1); 10215 if (!buf2) 10216 return NULL; 10217 } 10218 10219 switch (kind1) { 10220 case PyUnicode_1BYTE_KIND: 10221 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring)) 
10222 out = asciilib_rsplit( 10223 self, buf1, len1, buf2, len2, maxcount); 10224 else 10225 out = ucs1lib_rsplit( 10226 self, buf1, len1, buf2, len2, maxcount); 10227 break; 10228 case PyUnicode_2BYTE_KIND: 10229 out = ucs2lib_rsplit( 10230 self, buf1, len1, buf2, len2, maxcount); 10231 break; 10232 case PyUnicode_4BYTE_KIND: 10233 out = ucs4lib_rsplit( 10234 self, buf1, len1, buf2, len2, maxcount); 10235 break; 10236 default: 10237 out = NULL; 10238 } 10239 if (kind2 != kind1) 10240 PyMem_Free(buf2); 10241 return out; 10242} 10243 10244static Py_ssize_t 10245anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1, 10246 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset) 10247{ 10248 switch (kind) { 10249 case PyUnicode_1BYTE_KIND: 10250 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2)) 10251 return asciilib_find(buf1, len1, buf2, len2, offset); 10252 else 10253 return ucs1lib_find(buf1, len1, buf2, len2, offset); 10254 case PyUnicode_2BYTE_KIND: 10255 return ucs2lib_find(buf1, len1, buf2, len2, offset); 10256 case PyUnicode_4BYTE_KIND: 10257 return ucs4lib_find(buf1, len1, buf2, len2, offset); 10258 } 10259 assert(0); 10260 return -1; 10261} 10262 10263static Py_ssize_t 10264anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen, 10265 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount) 10266{ 10267 switch (kind) { 10268 case PyUnicode_1BYTE_KIND: 10269 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1)) 10270 return asciilib_count(sbuf, slen, buf1, len1, maxcount); 10271 else 10272 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount); 10273 case PyUnicode_2BYTE_KIND: 10274 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount); 10275 case PyUnicode_4BYTE_KIND: 10276 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount); 10277 } 10278 assert(0); 10279 return 0; 10280} 10281 10282static void 10283replace_1char_inplace(PyObject *u, Py_ssize_t pos, 10284 Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount) 
10285{ 10286 int kind = PyUnicode_KIND(u); 10287 void *data = PyUnicode_DATA(u); 10288 Py_ssize_t len = PyUnicode_GET_LENGTH(u); 10289 if (kind == PyUnicode_1BYTE_KIND) { 10290 ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos, 10291 (Py_UCS1 *)data + len, 10292 u1, u2, maxcount); 10293 } 10294 else if (kind == PyUnicode_2BYTE_KIND) { 10295 ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos, 10296 (Py_UCS2 *)data + len, 10297 u1, u2, maxcount); 10298 } 10299 else { 10300 assert(kind == PyUnicode_4BYTE_KIND); 10301 ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos, 10302 (Py_UCS4 *)data + len, 10303 u1, u2, maxcount); 10304 } 10305} 10306 10307static PyObject * 10308replace(PyObject *self, PyObject *str1, 10309 PyObject *str2, Py_ssize_t maxcount) 10310{ 10311 PyObject *u; 10312 char *sbuf = PyUnicode_DATA(self); 10313 char *buf1 = PyUnicode_DATA(str1); 10314 char *buf2 = PyUnicode_DATA(str2); 10315 int srelease = 0, release1 = 0, release2 = 0; 10316 int skind = PyUnicode_KIND(self); 10317 int kind1 = PyUnicode_KIND(str1); 10318 int kind2 = PyUnicode_KIND(str2); 10319 Py_ssize_t slen = PyUnicode_GET_LENGTH(self); 10320 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1); 10321 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2); 10322 int mayshrink; 10323 Py_UCS4 maxchar, maxchar_str1, maxchar_str2; 10324 10325 if (maxcount < 0) 10326 maxcount = PY_SSIZE_T_MAX; 10327 else if (maxcount == 0 || slen == 0) 10328 goto nothing; 10329 10330 if (str1 == str2) 10331 goto nothing; 10332 10333 maxchar = PyUnicode_MAX_CHAR_VALUE(self); 10334 maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1); 10335 if (maxchar < maxchar_str1) 10336 /* substring too wide to be present */ 10337 goto nothing; 10338 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2); 10339 /* Replacing str1 with str2 may cause a maxchar reduction in the 10340 result string. 
*/ 10341 mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1); 10342 maxchar = Py_MAX(maxchar, maxchar_str2); 10343 10344 if (len1 == len2) { 10345 /* same length */ 10346 if (len1 == 0) 10347 goto nothing; 10348 if (len1 == 1) { 10349 /* replace characters */ 10350 Py_UCS4 u1, u2; 10351 Py_ssize_t pos; 10352 10353 u1 = PyUnicode_READ(kind1, buf1, 0); 10354 pos = findchar(sbuf, skind, slen, u1, 1); 10355 if (pos < 0) 10356 goto nothing; 10357 u2 = PyUnicode_READ(kind2, buf2, 0); 10358 u = PyUnicode_New(slen, maxchar); 10359 if (!u) 10360 goto error; 10361 10362 _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen); 10363 replace_1char_inplace(u, pos, u1, u2, maxcount); 10364 } 10365 else { 10366 int rkind = skind; 10367 char *res; 10368 Py_ssize_t i; 10369 10370 if (kind1 < rkind) { 10371 /* widen substring */ 10372 buf1 = _PyUnicode_AsKind(str1, rkind); 10373 if (!buf1) goto error; 10374 release1 = 1; 10375 } 10376 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0); 10377 if (i < 0) 10378 goto nothing; 10379 if (rkind > kind2) { 10380 /* widen replacement */ 10381 buf2 = _PyUnicode_AsKind(str2, rkind); 10382 if (!buf2) goto error; 10383 release2 = 1; 10384 } 10385 else if (rkind < kind2) { 10386 /* widen self and buf1 */ 10387 rkind = kind2; 10388 if (release1) PyMem_Free(buf1); 10389 release1 = 0; 10390 sbuf = _PyUnicode_AsKind(self, rkind); 10391 if (!sbuf) goto error; 10392 srelease = 1; 10393 buf1 = _PyUnicode_AsKind(str1, rkind); 10394 if (!buf1) goto error; 10395 release1 = 1; 10396 } 10397 u = PyUnicode_New(slen, maxchar); 10398 if (!u) 10399 goto error; 10400 assert(PyUnicode_KIND(u) == rkind); 10401 res = PyUnicode_DATA(u); 10402 10403 memcpy(res, sbuf, rkind * slen); 10404 /* change everything in-place, starting with this one */ 10405 memcpy(res + rkind * i, 10406 buf2, 10407 rkind * len2); 10408 i += len1; 10409 10410 while ( --maxcount > 0) { 10411 i = anylib_find(rkind, self, 10412 sbuf+rkind*i, slen-i, 10413 str1, buf1, 
len1, i); 10414 if (i == -1) 10415 break; 10416 memcpy(res + rkind * i, 10417 buf2, 10418 rkind * len2); 10419 i += len1; 10420 } 10421 } 10422 } 10423 else { 10424 Py_ssize_t n, i, j, ires; 10425 Py_ssize_t new_size; 10426 int rkind = skind; 10427 char *res; 10428 10429 if (kind1 < rkind) { 10430 /* widen substring */ 10431 buf1 = _PyUnicode_AsKind(str1, rkind); 10432 if (!buf1) goto error; 10433 release1 = 1; 10434 } 10435 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount); 10436 if (n == 0) 10437 goto nothing; 10438 if (kind2 < rkind) { 10439 /* widen replacement */ 10440 buf2 = _PyUnicode_AsKind(str2, rkind); 10441 if (!buf2) goto error; 10442 release2 = 1; 10443 } 10444 else if (kind2 > rkind) { 10445 /* widen self and buf1 */ 10446 rkind = kind2; 10447 sbuf = _PyUnicode_AsKind(self, rkind); 10448 if (!sbuf) goto error; 10449 srelease = 1; 10450 if (release1) PyMem_Free(buf1); 10451 release1 = 0; 10452 buf1 = _PyUnicode_AsKind(str1, rkind); 10453 if (!buf1) goto error; 10454 release1 = 1; 10455 } 10456 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) - 10457 PyUnicode_GET_LENGTH(str1))); */ 10458 if (len1 < len2 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) { 10459 PyErr_SetString(PyExc_OverflowError, 10460 "replace string is too long"); 10461 goto error; 10462 } 10463 new_size = slen + n * (len2 - len1); 10464 if (new_size == 0) { 10465 _Py_INCREF_UNICODE_EMPTY(); 10466 if (!unicode_empty) 10467 goto error; 10468 u = unicode_empty; 10469 goto done; 10470 } 10471 if (new_size > (PY_SSIZE_T_MAX >> (rkind-1))) { 10472 PyErr_SetString(PyExc_OverflowError, 10473 "replace string is too long"); 10474 goto error; 10475 } 10476 u = PyUnicode_New(new_size, maxchar); 10477 if (!u) 10478 goto error; 10479 assert(PyUnicode_KIND(u) == rkind); 10480 res = PyUnicode_DATA(u); 10481 ires = i = 0; 10482 if (len1 > 0) { 10483 while (n-- > 0) { 10484 /* look for next match */ 10485 j = anylib_find(rkind, self, 10486 sbuf + rkind * i, 
slen-i, 10487 str1, buf1, len1, i); 10488 if (j == -1) 10489 break; 10490 else if (j > i) { 10491 /* copy unchanged part [i:j] */ 10492 memcpy(res + rkind * ires, 10493 sbuf + rkind * i, 10494 rkind * (j-i)); 10495 ires += j - i; 10496 } 10497 /* copy substitution string */ 10498 if (len2 > 0) { 10499 memcpy(res + rkind * ires, 10500 buf2, 10501 rkind * len2); 10502 ires += len2; 10503 } 10504 i = j + len1; 10505 } 10506 if (i < slen) 10507 /* copy tail [i:] */ 10508 memcpy(res + rkind * ires, 10509 sbuf + rkind * i, 10510 rkind * (slen-i)); 10511 } 10512 else { 10513 /* interleave */ 10514 while (n > 0) { 10515 memcpy(res + rkind * ires, 10516 buf2, 10517 rkind * len2); 10518 ires += len2; 10519 if (--n <= 0) 10520 break; 10521 memcpy(res + rkind * ires, 10522 sbuf + rkind * i, 10523 rkind); 10524 ires++; 10525 i++; 10526 } 10527 memcpy(res + rkind * ires, 10528 sbuf + rkind * i, 10529 rkind * (slen-i)); 10530 } 10531 } 10532 10533 if (mayshrink) { 10534 unicode_adjust_maxchar(&u); 10535 if (u == NULL) 10536 goto error; 10537 } 10538 10539 done: 10540 if (srelease) 10541 PyMem_FREE(sbuf); 10542 if (release1) 10543 PyMem_FREE(buf1); 10544 if (release2) 10545 PyMem_FREE(buf2); 10546 assert(_PyUnicode_CheckConsistency(u, 1)); 10547 return u; 10548 10549 nothing: 10550 /* nothing to replace; return original string (when possible) */ 10551 if (srelease) 10552 PyMem_FREE(sbuf); 10553 if (release1) 10554 PyMem_FREE(buf1); 10555 if (release2) 10556 PyMem_FREE(buf2); 10557 return unicode_result_unchanged(self); 10558 10559 error: 10560 if (srelease && sbuf) 10561 PyMem_FREE(sbuf); 10562 if (release1 && buf1) 10563 PyMem_FREE(buf1); 10564 if (release2 && buf2) 10565 PyMem_FREE(buf2); 10566 return NULL; 10567} 10568 10569/* --- Unicode Object Methods --------------------------------------------- */ 10570 10571PyDoc_STRVAR(title__doc__, 10572 "S.title() -> str\n\ 10573\n\ 10574Return a titlecased version of S, i.e. 
words start with title case\n\ 10575characters, all remaining cased characters have lower case."); 10576 10577static PyObject* 10578unicode_title(PyObject *self) 10579{ 10580 if (PyUnicode_READY(self) == -1) 10581 return NULL; 10582 return case_operation(self, do_title); 10583} 10584 10585PyDoc_STRVAR(capitalize__doc__, 10586 "S.capitalize() -> str\n\ 10587\n\ 10588Return a capitalized version of S, i.e. make the first character\n\ 10589have upper case and the rest lower case."); 10590 10591static PyObject* 10592unicode_capitalize(PyObject *self) 10593{ 10594 if (PyUnicode_READY(self) == -1) 10595 return NULL; 10596 if (PyUnicode_GET_LENGTH(self) == 0) 10597 return unicode_result_unchanged(self); 10598 return case_operation(self, do_capitalize); 10599} 10600 10601PyDoc_STRVAR(casefold__doc__, 10602 "S.casefold() -> str\n\ 10603\n\ 10604Return a version of S suitable for caseless comparisons."); 10605 10606static PyObject * 10607unicode_casefold(PyObject *self) 10608{ 10609 if (PyUnicode_READY(self) == -1) 10610 return NULL; 10611 if (PyUnicode_IS_ASCII(self)) 10612 return ascii_upper_or_lower(self, 1); 10613 return case_operation(self, do_casefold); 10614} 10615 10616 10617/* Argument converter. 
Coerces to a single unicode character */ 10618 10619static int 10620convert_uc(PyObject *obj, void *addr) 10621{ 10622 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr; 10623 PyObject *uniobj; 10624 10625 uniobj = PyUnicode_FromObject(obj); 10626 if (uniobj == NULL) { 10627 PyErr_SetString(PyExc_TypeError, 10628 "The fill character cannot be converted to Unicode"); 10629 return 0; 10630 } 10631 if (PyUnicode_GET_LENGTH(uniobj) != 1) { 10632 PyErr_SetString(PyExc_TypeError, 10633 "The fill character must be exactly one character long"); 10634 Py_DECREF(uniobj); 10635 return 0; 10636 } 10637 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0); 10638 Py_DECREF(uniobj); 10639 return 1; 10640} 10641 10642PyDoc_STRVAR(center__doc__, 10643 "S.center(width[, fillchar]) -> str\n\ 10644\n\ 10645Return S centered in a string of length width. Padding is\n\ 10646done using the specified fill character (default is a space)"); 10647 10648static PyObject * 10649unicode_center(PyObject *self, PyObject *args) 10650{ 10651 Py_ssize_t marg, left; 10652 Py_ssize_t width; 10653 Py_UCS4 fillchar = ' '; 10654 10655 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar)) 10656 return NULL; 10657 10658 if (PyUnicode_READY(self) == -1) 10659 return NULL; 10660 10661 if (PyUnicode_GET_LENGTH(self) >= width) 10662 return unicode_result_unchanged(self); 10663 10664 marg = width - PyUnicode_GET_LENGTH(self); 10665 left = marg / 2 + (marg & width & 1); 10666 10667 return pad(self, left, marg - left, fillchar); 10668} 10669 10670/* This function assumes that str1 and str2 are readied by the caller. */ 10671 10672static int 10673unicode_compare(PyObject *str1, PyObject *str2) 10674{ 10675#define COMPARE(TYPE1, TYPE2) \ 10676 do { \ 10677 TYPE1* p1 = (TYPE1 *)data1; \ 10678 TYPE2* p2 = (TYPE2 *)data2; \ 10679 TYPE1* end = p1 + len; \ 10680 Py_UCS4 c1, c2; \ 10681 for (; p1 != end; p1++, p2++) { \ 10682 c1 = *p1; \ 10683 c2 = *p2; \ 10684 if (c1 != c2) \ 10685 return (c1 < c2) ? 
-1 : 1; \ 10686 } \ 10687 } \ 10688 while (0) 10689 10690 int kind1, kind2; 10691 void *data1, *data2; 10692 Py_ssize_t len1, len2, len; 10693 10694 kind1 = PyUnicode_KIND(str1); 10695 kind2 = PyUnicode_KIND(str2); 10696 data1 = PyUnicode_DATA(str1); 10697 data2 = PyUnicode_DATA(str2); 10698 len1 = PyUnicode_GET_LENGTH(str1); 10699 len2 = PyUnicode_GET_LENGTH(str2); 10700 len = Py_MIN(len1, len2); 10701 10702 switch(kind1) { 10703 case PyUnicode_1BYTE_KIND: 10704 { 10705 switch(kind2) { 10706 case PyUnicode_1BYTE_KIND: 10707 { 10708 int cmp = memcmp(data1, data2, len); 10709 /* normalize result of memcmp() into the range [-1; 1] */ 10710 if (cmp < 0) 10711 return -1; 10712 if (cmp > 0) 10713 return 1; 10714 break; 10715 } 10716 case PyUnicode_2BYTE_KIND: 10717 COMPARE(Py_UCS1, Py_UCS2); 10718 break; 10719 case PyUnicode_4BYTE_KIND: 10720 COMPARE(Py_UCS1, Py_UCS4); 10721 break; 10722 default: 10723 assert(0); 10724 } 10725 break; 10726 } 10727 case PyUnicode_2BYTE_KIND: 10728 { 10729 switch(kind2) { 10730 case PyUnicode_1BYTE_KIND: 10731 COMPARE(Py_UCS2, Py_UCS1); 10732 break; 10733 case PyUnicode_2BYTE_KIND: 10734 { 10735 COMPARE(Py_UCS2, Py_UCS2); 10736 break; 10737 } 10738 case PyUnicode_4BYTE_KIND: 10739 COMPARE(Py_UCS2, Py_UCS4); 10740 break; 10741 default: 10742 assert(0); 10743 } 10744 break; 10745 } 10746 case PyUnicode_4BYTE_KIND: 10747 { 10748 switch(kind2) { 10749 case PyUnicode_1BYTE_KIND: 10750 COMPARE(Py_UCS4, Py_UCS1); 10751 break; 10752 case PyUnicode_2BYTE_KIND: 10753 COMPARE(Py_UCS4, Py_UCS2); 10754 break; 10755 case PyUnicode_4BYTE_KIND: 10756 { 10757#if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4 10758 int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len); 10759 /* normalize result of wmemcmp() into the range [-1; 1] */ 10760 if (cmp < 0) 10761 return -1; 10762 if (cmp > 0) 10763 return 1; 10764#else 10765 COMPARE(Py_UCS4, Py_UCS4); 10766#endif 10767 break; 10768 } 10769 default: 10770 assert(0); 10771 } 10772 break; 10773 } 10774 default: 
10775 assert(0); 10776 } 10777 10778 if (len1 == len2) 10779 return 0; 10780 if (len1 < len2) 10781 return -1; 10782 else 10783 return 1; 10784 10785#undef COMPARE 10786} 10787 10788Py_LOCAL(int) 10789unicode_compare_eq(PyObject *str1, PyObject *str2) 10790{ 10791 int kind; 10792 void *data1, *data2; 10793 Py_ssize_t len; 10794 int cmp; 10795 10796 len = PyUnicode_GET_LENGTH(str1); 10797 if (PyUnicode_GET_LENGTH(str2) != len) 10798 return 0; 10799 kind = PyUnicode_KIND(str1); 10800 if (PyUnicode_KIND(str2) != kind) 10801 return 0; 10802 data1 = PyUnicode_DATA(str1); 10803 data2 = PyUnicode_DATA(str2); 10804 10805 cmp = memcmp(data1, data2, len * kind); 10806 return (cmp == 0); 10807} 10808 10809 10810int 10811PyUnicode_Compare(PyObject *left, PyObject *right) 10812{ 10813 if (PyUnicode_Check(left) && PyUnicode_Check(right)) { 10814 if (PyUnicode_READY(left) == -1 || 10815 PyUnicode_READY(right) == -1) 10816 return -1; 10817 10818 /* a string is equal to itself */ 10819 if (left == right) 10820 return 0; 10821 10822 return unicode_compare(left, right); 10823 } 10824 PyErr_Format(PyExc_TypeError, 10825 "Can't compare %.100s and %.100s", 10826 left->ob_type->tp_name, 10827 right->ob_type->tp_name); 10828 return -1; 10829} 10830 10831int 10832_PyUnicode_CompareWithId(PyObject *left, _Py_Identifier *right) 10833{ 10834 PyObject *right_str = _PyUnicode_FromId(right); /* borrowed */ 10835 if (right_str == NULL) 10836 return -1; 10837 return PyUnicode_Compare(left, right_str); 10838} 10839 10840int 10841PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str) 10842{ 10843 Py_ssize_t i; 10844 int kind; 10845 Py_UCS4 chr; 10846 10847 assert(_PyUnicode_CHECK(uni)); 10848 if (PyUnicode_READY(uni) == -1) 10849 return -1; 10850 kind = PyUnicode_KIND(uni); 10851 if (kind == PyUnicode_1BYTE_KIND) { 10852 const void *data = PyUnicode_1BYTE_DATA(uni); 10853 size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni); 10854 size_t len, len2 = strlen(str); 10855 int cmp; 10856 10857 len = 
Py_MIN(len1, len2); 10858 cmp = memcmp(data, str, len); 10859 if (cmp != 0) { 10860 if (cmp < 0) 10861 return -1; 10862 else 10863 return 1; 10864 } 10865 if (len1 > len2) 10866 return 1; /* uni is longer */ 10867 if (len1 < len2) 10868 return -1; /* str is longer */ 10869 return 0; 10870 } 10871 else { 10872 void *data = PyUnicode_DATA(uni); 10873 /* Compare Unicode string and source character set string */ 10874 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++) 10875 if (chr != (unsigned char)str[i]) 10876 return (chr < (unsigned char)(str[i])) ? -1 : 1; 10877 /* This check keeps Python strings that end in '\0' from comparing equal 10878 to C strings identical up to that point. */ 10879 if (PyUnicode_GET_LENGTH(uni) != i || chr) 10880 return 1; /* uni is longer */ 10881 if (str[i]) 10882 return -1; /* str is longer */ 10883 return 0; 10884 } 10885} 10886 10887 10888#define TEST_COND(cond) \ 10889 ((cond) ? Py_True : Py_False) 10890 10891PyObject * 10892PyUnicode_RichCompare(PyObject *left, PyObject *right, int op) 10893{ 10894 int result; 10895 PyObject *v; 10896 10897 if (!PyUnicode_Check(left) || !PyUnicode_Check(right)) 10898 Py_RETURN_NOTIMPLEMENTED; 10899 10900 if (PyUnicode_READY(left) == -1 || 10901 PyUnicode_READY(right) == -1) 10902 return NULL; 10903 10904 if (left == right) { 10905 switch (op) { 10906 case Py_EQ: 10907 case Py_LE: 10908 case Py_GE: 10909 /* a string is equal to itself */ 10910 v = Py_True; 10911 break; 10912 case Py_NE: 10913 case Py_LT: 10914 case Py_GT: 10915 v = Py_False; 10916 break; 10917 default: 10918 PyErr_BadArgument(); 10919 return NULL; 10920 } 10921 } 10922 else if (op == Py_EQ || op == Py_NE) { 10923 result = unicode_compare_eq(left, right); 10924 result ^= (op == Py_NE); 10925 v = TEST_COND(result); 10926 } 10927 else { 10928 result = unicode_compare(left, right); 10929 10930 /* Convert the return value to a Boolean */ 10931 switch (op) { 10932 case Py_LE: 10933 v = TEST_COND(result <= 0); 10934 break; 
10935 case Py_GE: 10936 v = TEST_COND(result >= 0); 10937 break; 10938 case Py_LT: 10939 v = TEST_COND(result == -1); 10940 break; 10941 case Py_GT: 10942 v = TEST_COND(result == 1); 10943 break; 10944 default: 10945 PyErr_BadArgument(); 10946 return NULL; 10947 } 10948 } 10949 Py_INCREF(v); 10950 return v; 10951} 10952 10953int 10954_PyUnicode_EQ(PyObject *aa, PyObject *bb) 10955{ 10956 return unicode_eq(aa, bb); 10957} 10958 10959int 10960PyUnicode_Contains(PyObject *container, PyObject *element) 10961{ 10962 PyObject *str, *sub; 10963 int kind1, kind2; 10964 void *buf1, *buf2; 10965 Py_ssize_t len1, len2; 10966 int result; 10967 10968 /* Coerce the two arguments */ 10969 sub = PyUnicode_FromObject(element); 10970 if (!sub) { 10971 PyErr_Format(PyExc_TypeError, 10972 "'in <string>' requires string as left operand, not %s", 10973 element->ob_type->tp_name); 10974 return -1; 10975 } 10976 10977 str = PyUnicode_FromObject(container); 10978 if (!str) { 10979 Py_DECREF(sub); 10980 return -1; 10981 } 10982 10983 kind1 = PyUnicode_KIND(str); 10984 kind2 = PyUnicode_KIND(sub); 10985 if (kind1 < kind2) { 10986 Py_DECREF(sub); 10987 Py_DECREF(str); 10988 return 0; 10989 } 10990 len1 = PyUnicode_GET_LENGTH(str); 10991 len2 = PyUnicode_GET_LENGTH(sub); 10992 if (len1 < len2) { 10993 Py_DECREF(sub); 10994 Py_DECREF(str); 10995 return 0; 10996 } 10997 buf1 = PyUnicode_DATA(str); 10998 buf2 = PyUnicode_DATA(sub); 10999 if (len2 == 1) { 11000 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0); 11001 result = findchar((const char *)buf1, kind1, len1, ch, 1) != -1; 11002 Py_DECREF(sub); 11003 Py_DECREF(str); 11004 return result; 11005 } 11006 if (kind2 != kind1) { 11007 buf2 = _PyUnicode_AsKind(sub, kind1); 11008 if (!buf2) { 11009 Py_DECREF(sub); 11010 Py_DECREF(str); 11011 return -1; 11012 } 11013 } 11014 11015 switch (kind1) { 11016 case PyUnicode_1BYTE_KIND: 11017 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1; 11018 break; 11019 case PyUnicode_2BYTE_KIND: 11020 result = 
ucs2lib_find(buf1, len1, buf2, len2, 0) != -1; 11021 break; 11022 case PyUnicode_4BYTE_KIND: 11023 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1; 11024 break; 11025 default: 11026 result = -1; 11027 assert(0); 11028 } 11029 11030 Py_DECREF(str); 11031 Py_DECREF(sub); 11032 11033 if (kind2 != kind1) 11034 PyMem_Free(buf2); 11035 11036 return result; 11037} 11038 11039/* Concat to string or Unicode object giving a new Unicode object. */ 11040 11041PyObject * 11042PyUnicode_Concat(PyObject *left, PyObject *right) 11043{ 11044 PyObject *u = NULL, *v = NULL, *w; 11045 Py_UCS4 maxchar, maxchar2; 11046 Py_ssize_t u_len, v_len, new_len; 11047 11048 /* Coerce the two arguments */ 11049 u = PyUnicode_FromObject(left); 11050 if (u == NULL) 11051 goto onError; 11052 v = PyUnicode_FromObject(right); 11053 if (v == NULL) 11054 goto onError; 11055 11056 /* Shortcuts */ 11057 if (v == unicode_empty) { 11058 Py_DECREF(v); 11059 return u; 11060 } 11061 if (u == unicode_empty) { 11062 Py_DECREF(u); 11063 return v; 11064 } 11065 11066 u_len = PyUnicode_GET_LENGTH(u); 11067 v_len = PyUnicode_GET_LENGTH(v); 11068 if (u_len > PY_SSIZE_T_MAX - v_len) { 11069 PyErr_SetString(PyExc_OverflowError, 11070 "strings are too large to concat"); 11071 goto onError; 11072 } 11073 new_len = u_len + v_len; 11074 11075 maxchar = PyUnicode_MAX_CHAR_VALUE(u); 11076 maxchar2 = PyUnicode_MAX_CHAR_VALUE(v); 11077 maxchar = Py_MAX(maxchar, maxchar2); 11078 11079 /* Concat the two Unicode strings */ 11080 w = PyUnicode_New(new_len, maxchar); 11081 if (w == NULL) 11082 goto onError; 11083 _PyUnicode_FastCopyCharacters(w, 0, u, 0, u_len); 11084 _PyUnicode_FastCopyCharacters(w, u_len, v, 0, v_len); 11085 Py_DECREF(u); 11086 Py_DECREF(v); 11087 assert(_PyUnicode_CheckConsistency(w, 1)); 11088 return w; 11089 11090 onError: 11091 Py_XDECREF(u); 11092 Py_XDECREF(v); 11093 return NULL; 11094} 11095 11096void 11097PyUnicode_Append(PyObject **p_left, PyObject *right) 11098{ 11099 PyObject *left, *res; 11100 
Py_UCS4 maxchar, maxchar2; 11101 Py_ssize_t left_len, right_len, new_len; 11102 11103 if (p_left == NULL) { 11104 if (!PyErr_Occurred()) 11105 PyErr_BadInternalCall(); 11106 return; 11107 } 11108 left = *p_left; 11109 if (right == NULL || left == NULL 11110 || !PyUnicode_Check(left) || !PyUnicode_Check(right)) { 11111 if (!PyErr_Occurred()) 11112 PyErr_BadInternalCall(); 11113 goto error; 11114 } 11115 11116 if (PyUnicode_READY(left) == -1) 11117 goto error; 11118 if (PyUnicode_READY(right) == -1) 11119 goto error; 11120 11121 /* Shortcuts */ 11122 if (left == unicode_empty) { 11123 Py_DECREF(left); 11124 Py_INCREF(right); 11125 *p_left = right; 11126 return; 11127 } 11128 if (right == unicode_empty) 11129 return; 11130 11131 left_len = PyUnicode_GET_LENGTH(left); 11132 right_len = PyUnicode_GET_LENGTH(right); 11133 if (left_len > PY_SSIZE_T_MAX - right_len) { 11134 PyErr_SetString(PyExc_OverflowError, 11135 "strings are too large to concat"); 11136 goto error; 11137 } 11138 new_len = left_len + right_len; 11139 11140 if (unicode_modifiable(left) 11141 && PyUnicode_CheckExact(right) 11142 && PyUnicode_KIND(right) <= PyUnicode_KIND(left) 11143 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires 11144 to change the structure size, but characters are stored just after 11145 the structure, and so it requires to move all characters which is 11146 not so different than duplicating the string. 
*/ 11147 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right))) 11148 { 11149 /* append inplace */ 11150 if (unicode_resize(p_left, new_len) != 0) 11151 goto error; 11152 11153 /* copy 'right' into the newly allocated area of 'left' */ 11154 _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len); 11155 } 11156 else { 11157 maxchar = PyUnicode_MAX_CHAR_VALUE(left); 11158 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right); 11159 maxchar = Py_MAX(maxchar, maxchar2); 11160 11161 /* Concat the two Unicode strings */ 11162 res = PyUnicode_New(new_len, maxchar); 11163 if (res == NULL) 11164 goto error; 11165 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len); 11166 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len); 11167 Py_DECREF(left); 11168 *p_left = res; 11169 } 11170 assert(_PyUnicode_CheckConsistency(*p_left, 1)); 11171 return; 11172 11173error: 11174 Py_CLEAR(*p_left); 11175} 11176 11177void 11178PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right) 11179{ 11180 PyUnicode_Append(pleft, right); 11181 Py_XDECREF(right); 11182} 11183 11184PyDoc_STRVAR(count__doc__, 11185 "S.count(sub[, start[, end]]) -> int\n\ 11186\n\ 11187Return the number of non-overlapping occurrences of substring sub in\n\ 11188string S[start:end]. 
Optional arguments start and end are\n\ 11189interpreted as in slice notation."); 11190 11191static PyObject * 11192unicode_count(PyObject *self, PyObject *args) 11193{ 11194 PyObject *substring = NULL; /* initialize to fix a compiler warning */ 11195 Py_ssize_t start = 0; 11196 Py_ssize_t end = PY_SSIZE_T_MAX; 11197 PyObject *result; 11198 int kind1, kind2; 11199 void *buf1, *buf2; 11200 Py_ssize_t len1, len2, iresult; 11201 11202 if (!stringlib_parse_args_finds_unicode("count", args, &substring, 11203 &start, &end)) 11204 return NULL; 11205 11206 kind1 = PyUnicode_KIND(self); 11207 kind2 = PyUnicode_KIND(substring); 11208 if (kind1 < kind2) { 11209 Py_DECREF(substring); 11210 return PyLong_FromLong(0); 11211 } 11212 len1 = PyUnicode_GET_LENGTH(self); 11213 len2 = PyUnicode_GET_LENGTH(substring); 11214 ADJUST_INDICES(start, end, len1); 11215 if (end - start < len2) { 11216 Py_DECREF(substring); 11217 return PyLong_FromLong(0); 11218 } 11219 buf1 = PyUnicode_DATA(self); 11220 buf2 = PyUnicode_DATA(substring); 11221 if (kind2 != kind1) { 11222 buf2 = _PyUnicode_AsKind(substring, kind1); 11223 if (!buf2) { 11224 Py_DECREF(substring); 11225 return NULL; 11226 } 11227 } 11228 switch (kind1) { 11229 case PyUnicode_1BYTE_KIND: 11230 iresult = ucs1lib_count( 11231 ((Py_UCS1*)buf1) + start, end - start, 11232 buf2, len2, PY_SSIZE_T_MAX 11233 ); 11234 break; 11235 case PyUnicode_2BYTE_KIND: 11236 iresult = ucs2lib_count( 11237 ((Py_UCS2*)buf1) + start, end - start, 11238 buf2, len2, PY_SSIZE_T_MAX 11239 ); 11240 break; 11241 case PyUnicode_4BYTE_KIND: 11242 iresult = ucs4lib_count( 11243 ((Py_UCS4*)buf1) + start, end - start, 11244 buf2, len2, PY_SSIZE_T_MAX 11245 ); 11246 break; 11247 default: 11248 assert(0); iresult = 0; 11249 } 11250 11251 result = PyLong_FromSsize_t(iresult); 11252 11253 if (kind2 != kind1) 11254 PyMem_Free(buf2); 11255 11256 Py_DECREF(substring); 11257 11258 return result; 11259} 11260 11261PyDoc_STRVAR(encode__doc__, 11262 "S.encode(encoding='utf-8', 
errors='strict') -> bytes\n\ 11263\n\ 11264Encode S using the codec registered for encoding. Default encoding\n\ 11265is 'utf-8'. errors may be given to set a different error\n\ 11266handling scheme. Default is 'strict' meaning that encoding errors raise\n\ 11267a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\ 11268'xmlcharrefreplace' as well as any other name registered with\n\ 11269codecs.register_error that can handle UnicodeEncodeErrors."); 11270 11271static PyObject * 11272unicode_encode(PyObject *self, PyObject *args, PyObject *kwargs) 11273{ 11274 static char *kwlist[] = {"encoding", "errors", 0}; 11275 char *encoding = NULL; 11276 char *errors = NULL; 11277 11278 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode", 11279 kwlist, &encoding, &errors)) 11280 return NULL; 11281 return PyUnicode_AsEncodedString(self, encoding, errors); 11282} 11283 11284PyDoc_STRVAR(expandtabs__doc__, 11285 "S.expandtabs(tabsize=8) -> str\n\ 11286\n\ 11287Return a copy of S where all tab characters are expanded using spaces.\n\ 11288If tabsize is not given, a tab size of 8 characters is assumed."); 11289 11290static PyObject* 11291unicode_expandtabs(PyObject *self, PyObject *args, PyObject *kwds) 11292{ 11293 Py_ssize_t i, j, line_pos, src_len, incr; 11294 Py_UCS4 ch; 11295 PyObject *u; 11296 void *src_data, *dest_data; 11297 static char *kwlist[] = {"tabsize", 0}; 11298 int tabsize = 8; 11299 int kind; 11300 int found; 11301 11302 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:expandtabs", 11303 kwlist, &tabsize)) 11304 return NULL; 11305 11306 if (PyUnicode_READY(self) == -1) 11307 return NULL; 11308 11309 /* First pass: determine size of output string */ 11310 src_len = PyUnicode_GET_LENGTH(self); 11311 i = j = line_pos = 0; 11312 kind = PyUnicode_KIND(self); 11313 src_data = PyUnicode_DATA(self); 11314 found = 0; 11315 for (; i < src_len; i++) { 11316 ch = PyUnicode_READ(kind, src_data, i); 11317 if (ch == '\t') { 11318 found = 1; 11319 if 
(tabsize > 0) { 11320 incr = tabsize - (line_pos % tabsize); /* cannot overflow */ 11321 if (j > PY_SSIZE_T_MAX - incr) 11322 goto overflow; 11323 line_pos += incr; 11324 j += incr; 11325 } 11326 } 11327 else { 11328 if (j > PY_SSIZE_T_MAX - 1) 11329 goto overflow; 11330 line_pos++; 11331 j++; 11332 if (ch == '\n' || ch == '\r') 11333 line_pos = 0; 11334 } 11335 } 11336 if (!found) 11337 return unicode_result_unchanged(self); 11338 11339 /* Second pass: create output string and fill it */ 11340 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self)); 11341 if (!u) 11342 return NULL; 11343 dest_data = PyUnicode_DATA(u); 11344 11345 i = j = line_pos = 0; 11346 11347 for (; i < src_len; i++) { 11348 ch = PyUnicode_READ(kind, src_data, i); 11349 if (ch == '\t') { 11350 if (tabsize > 0) { 11351 incr = tabsize - (line_pos % tabsize); 11352 line_pos += incr; 11353 FILL(kind, dest_data, ' ', j, incr); 11354 j += incr; 11355 } 11356 } 11357 else { 11358 line_pos++; 11359 PyUnicode_WRITE(kind, dest_data, j, ch); 11360 j++; 11361 if (ch == '\n' || ch == '\r') 11362 line_pos = 0; 11363 } 11364 } 11365 assert (j == PyUnicode_GET_LENGTH(u)); 11366 return unicode_result(u); 11367 11368 overflow: 11369 PyErr_SetString(PyExc_OverflowError, "new string is too long"); 11370 return NULL; 11371} 11372 11373PyDoc_STRVAR(find__doc__, 11374 "S.find(sub[, start[, end]]) -> int\n\ 11375\n\ 11376Return the lowest index in S where substring sub is found,\n\ 11377such that sub is contained within S[start:end]. 
Optional\n\ 11378arguments start and end are interpreted as in slice notation.\n\ 11379\n\ 11380Return -1 on failure."); 11381 11382static PyObject * 11383unicode_find(PyObject *self, PyObject *args) 11384{ 11385 /* initialize variables to prevent gcc warning */ 11386 PyObject *substring = NULL; 11387 Py_ssize_t start = 0; 11388 Py_ssize_t end = 0; 11389 Py_ssize_t result; 11390 11391 if (!stringlib_parse_args_finds_unicode("find", args, &substring, 11392 &start, &end)) 11393 return NULL; 11394 11395 if (PyUnicode_READY(self) == -1) { 11396 Py_DECREF(substring); 11397 return NULL; 11398 } 11399 if (PyUnicode_READY(substring) == -1) { 11400 Py_DECREF(substring); 11401 return NULL; 11402 } 11403 11404 result = any_find_slice(1, self, substring, start, end); 11405 11406 Py_DECREF(substring); 11407 11408 if (result == -2) 11409 return NULL; 11410 11411 return PyLong_FromSsize_t(result); 11412} 11413 11414static PyObject * 11415unicode_getitem(PyObject *self, Py_ssize_t index) 11416{ 11417 void *data; 11418 enum PyUnicode_Kind kind; 11419 Py_UCS4 ch; 11420 11421 if (!PyUnicode_Check(self) || PyUnicode_READY(self) == -1) { 11422 PyErr_BadArgument(); 11423 return NULL; 11424 } 11425 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) { 11426 PyErr_SetString(PyExc_IndexError, "string index out of range"); 11427 return NULL; 11428 } 11429 kind = PyUnicode_KIND(self); 11430 data = PyUnicode_DATA(self); 11431 ch = PyUnicode_READ(kind, data, index); 11432 return unicode_char(ch); 11433} 11434 11435/* Believe it or not, this produces the same value for ASCII strings 11436 as bytes_hash(). */ 11437static Py_hash_t 11438unicode_hash(PyObject *self) 11439{ 11440 Py_ssize_t len; 11441 Py_uhash_t x; /* Unsigned for defined overflow behavior. 
*/ 11442 11443#ifdef Py_DEBUG 11444 assert(_Py_HashSecret_Initialized); 11445#endif 11446 if (_PyUnicode_HASH(self) != -1) 11447 return _PyUnicode_HASH(self); 11448 if (PyUnicode_READY(self) == -1) 11449 return -1; 11450 len = PyUnicode_GET_LENGTH(self); 11451 /* 11452 We make the hash of the empty string be 0, rather than using 11453 (prefix ^ suffix), since this slightly obfuscates the hash secret 11454 */ 11455 if (len == 0) { 11456 _PyUnicode_HASH(self) = 0; 11457 return 0; 11458 } 11459 x = _Py_HashBytes(PyUnicode_DATA(self), 11460 PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self)); 11461 _PyUnicode_HASH(self) = x; 11462 return x; 11463} 11464 11465PyDoc_STRVAR(index__doc__, 11466 "S.index(sub[, start[, end]]) -> int\n\ 11467\n\ 11468Like S.find() but raise ValueError when the substring is not found."); 11469 11470static PyObject * 11471unicode_index(PyObject *self, PyObject *args) 11472{ 11473 /* initialize variables to prevent gcc warning */ 11474 Py_ssize_t result; 11475 PyObject *substring = NULL; 11476 Py_ssize_t start = 0; 11477 Py_ssize_t end = 0; 11478 11479 if (!stringlib_parse_args_finds_unicode("index", args, &substring, 11480 &start, &end)) 11481 return NULL; 11482 11483 if (PyUnicode_READY(self) == -1) { 11484 Py_DECREF(substring); 11485 return NULL; 11486 } 11487 if (PyUnicode_READY(substring) == -1) { 11488 Py_DECREF(substring); 11489 return NULL; 11490 } 11491 11492 result = any_find_slice(1, self, substring, start, end); 11493 11494 Py_DECREF(substring); 11495 11496 if (result == -2) 11497 return NULL; 11498 11499 if (result < 0) { 11500 PyErr_SetString(PyExc_ValueError, "substring not found"); 11501 return NULL; 11502 } 11503 11504 return PyLong_FromSsize_t(result); 11505} 11506 11507PyDoc_STRVAR(islower__doc__, 11508 "S.islower() -> bool\n\ 11509\n\ 11510Return True if all cased characters in S are lowercase and there is\n\ 11511at least one cased character in S, False otherwise."); 11512 11513static PyObject* 11514unicode_islower(PyObject 
*self) 11515{ 11516 Py_ssize_t i, length; 11517 int kind; 11518 void *data; 11519 int cased; 11520 11521 if (PyUnicode_READY(self) == -1) 11522 return NULL; 11523 length = PyUnicode_GET_LENGTH(self); 11524 kind = PyUnicode_KIND(self); 11525 data = PyUnicode_DATA(self); 11526 11527 /* Shortcut for single character strings */ 11528 if (length == 1) 11529 return PyBool_FromLong( 11530 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0))); 11531 11532 /* Special case for empty strings */ 11533 if (length == 0) 11534 return PyBool_FromLong(0); 11535 11536 cased = 0; 11537 for (i = 0; i < length; i++) { 11538 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); 11539 11540 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) 11541 return PyBool_FromLong(0); 11542 else if (!cased && Py_UNICODE_ISLOWER(ch)) 11543 cased = 1; 11544 } 11545 return PyBool_FromLong(cased); 11546} 11547 11548PyDoc_STRVAR(isupper__doc__, 11549 "S.isupper() -> bool\n\ 11550\n\ 11551Return True if all cased characters in S are uppercase and there is\n\ 11552at least one cased character in S, False otherwise."); 11553 11554static PyObject* 11555unicode_isupper(PyObject *self) 11556{ 11557 Py_ssize_t i, length; 11558 int kind; 11559 void *data; 11560 int cased; 11561 11562 if (PyUnicode_READY(self) == -1) 11563 return NULL; 11564 length = PyUnicode_GET_LENGTH(self); 11565 kind = PyUnicode_KIND(self); 11566 data = PyUnicode_DATA(self); 11567 11568 /* Shortcut for single character strings */ 11569 if (length == 1) 11570 return PyBool_FromLong( 11571 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0); 11572 11573 /* Special case for empty strings */ 11574 if (length == 0) 11575 return PyBool_FromLong(0); 11576 11577 cased = 0; 11578 for (i = 0; i < length; i++) { 11579 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); 11580 11581 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch)) 11582 return PyBool_FromLong(0); 11583 else if (!cased && Py_UNICODE_ISUPPER(ch)) 11584 cased = 1; 11585 } 11586 return 
PyBool_FromLong(cased); 11587} 11588 11589PyDoc_STRVAR(istitle__doc__, 11590 "S.istitle() -> bool\n\ 11591\n\ 11592Return True if S is a titlecased string and there is at least one\n\ 11593character in S, i.e. upper- and titlecase characters may only\n\ 11594follow uncased characters and lowercase characters only cased ones.\n\ 11595Return False otherwise."); 11596 11597static PyObject* 11598unicode_istitle(PyObject *self) 11599{ 11600 Py_ssize_t i, length; 11601 int kind; 11602 void *data; 11603 int cased, previous_is_cased; 11604 11605 if (PyUnicode_READY(self) == -1) 11606 return NULL; 11607 length = PyUnicode_GET_LENGTH(self); 11608 kind = PyUnicode_KIND(self); 11609 data = PyUnicode_DATA(self); 11610 11611 /* Shortcut for single character strings */ 11612 if (length == 1) { 11613 Py_UCS4 ch = PyUnicode_READ(kind, data, 0); 11614 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) || 11615 (Py_UNICODE_ISUPPER(ch) != 0)); 11616 } 11617 11618 /* Special case for empty strings */ 11619 if (length == 0) 11620 return PyBool_FromLong(0); 11621 11622 cased = 0; 11623 previous_is_cased = 0; 11624 for (i = 0; i < length; i++) { 11625 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); 11626 11627 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) { 11628 if (previous_is_cased) 11629 return PyBool_FromLong(0); 11630 previous_is_cased = 1; 11631 cased = 1; 11632 } 11633 else if (Py_UNICODE_ISLOWER(ch)) { 11634 if (!previous_is_cased) 11635 return PyBool_FromLong(0); 11636 previous_is_cased = 1; 11637 cased = 1; 11638 } 11639 else 11640 previous_is_cased = 0; 11641 } 11642 return PyBool_FromLong(cased); 11643} 11644 11645PyDoc_STRVAR(isspace__doc__, 11646 "S.isspace() -> bool\n\ 11647\n\ 11648Return True if all characters in S are whitespace\n\ 11649and there is at least one character in S, False otherwise."); 11650 11651static PyObject* 11652unicode_isspace(PyObject *self) 11653{ 11654 Py_ssize_t i, length; 11655 int kind; 11656 void *data; 11657 11658 if 
(PyUnicode_READY(self) == -1) 11659 return NULL; 11660 length = PyUnicode_GET_LENGTH(self); 11661 kind = PyUnicode_KIND(self); 11662 data = PyUnicode_DATA(self); 11663 11664 /* Shortcut for single character strings */ 11665 if (length == 1) 11666 return PyBool_FromLong( 11667 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0))); 11668 11669 /* Special case for empty strings */ 11670 if (length == 0) 11671 return PyBool_FromLong(0); 11672 11673 for (i = 0; i < length; i++) { 11674 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); 11675 if (!Py_UNICODE_ISSPACE(ch)) 11676 return PyBool_FromLong(0); 11677 } 11678 return PyBool_FromLong(1); 11679} 11680 11681PyDoc_STRVAR(isalpha__doc__, 11682 "S.isalpha() -> bool\n\ 11683\n\ 11684Return True if all characters in S are alphabetic\n\ 11685and there is at least one character in S, False otherwise."); 11686 11687static PyObject* 11688unicode_isalpha(PyObject *self) 11689{ 11690 Py_ssize_t i, length; 11691 int kind; 11692 void *data; 11693 11694 if (PyUnicode_READY(self) == -1) 11695 return NULL; 11696 length = PyUnicode_GET_LENGTH(self); 11697 kind = PyUnicode_KIND(self); 11698 data = PyUnicode_DATA(self); 11699 11700 /* Shortcut for single character strings */ 11701 if (length == 1) 11702 return PyBool_FromLong( 11703 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0))); 11704 11705 /* Special case for empty strings */ 11706 if (length == 0) 11707 return PyBool_FromLong(0); 11708 11709 for (i = 0; i < length; i++) { 11710 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i))) 11711 return PyBool_FromLong(0); 11712 } 11713 return PyBool_FromLong(1); 11714} 11715 11716PyDoc_STRVAR(isalnum__doc__, 11717 "S.isalnum() -> bool\n\ 11718\n\ 11719Return True if all characters in S are alphanumeric\n\ 11720and there is at least one character in S, False otherwise."); 11721 11722static PyObject* 11723unicode_isalnum(PyObject *self) 11724{ 11725 int kind; 11726 void *data; 11727 Py_ssize_t len, i; 11728 11729 if (PyUnicode_READY(self) == 
-1) 11730 return NULL; 11731 11732 kind = PyUnicode_KIND(self); 11733 data = PyUnicode_DATA(self); 11734 len = PyUnicode_GET_LENGTH(self); 11735 11736 /* Shortcut for single character strings */ 11737 if (len == 1) { 11738 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0); 11739 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch)); 11740 } 11741 11742 /* Special case for empty strings */ 11743 if (len == 0) 11744 return PyBool_FromLong(0); 11745 11746 for (i = 0; i < len; i++) { 11747 const Py_UCS4 ch = PyUnicode_READ(kind, data, i); 11748 if (!Py_UNICODE_ISALNUM(ch)) 11749 return PyBool_FromLong(0); 11750 } 11751 return PyBool_FromLong(1); 11752} 11753 11754PyDoc_STRVAR(isdecimal__doc__, 11755 "S.isdecimal() -> bool\n\ 11756\n\ 11757Return True if there are only decimal characters in S,\n\ 11758False otherwise."); 11759 11760static PyObject* 11761unicode_isdecimal(PyObject *self) 11762{ 11763 Py_ssize_t i, length; 11764 int kind; 11765 void *data; 11766 11767 if (PyUnicode_READY(self) == -1) 11768 return NULL; 11769 length = PyUnicode_GET_LENGTH(self); 11770 kind = PyUnicode_KIND(self); 11771 data = PyUnicode_DATA(self); 11772 11773 /* Shortcut for single character strings */ 11774 if (length == 1) 11775 return PyBool_FromLong( 11776 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0))); 11777 11778 /* Special case for empty strings */ 11779 if (length == 0) 11780 return PyBool_FromLong(0); 11781 11782 for (i = 0; i < length; i++) { 11783 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i))) 11784 return PyBool_FromLong(0); 11785 } 11786 return PyBool_FromLong(1); 11787} 11788 11789PyDoc_STRVAR(isdigit__doc__, 11790 "S.isdigit() -> bool\n\ 11791\n\ 11792Return True if all characters in S are digits\n\ 11793and there is at least one character in S, False otherwise."); 11794 11795static PyObject* 11796unicode_isdigit(PyObject *self) 11797{ 11798 Py_ssize_t i, length; 11799 int kind; 11800 void *data; 11801 11802 if (PyUnicode_READY(self) == -1) 11803 return NULL; 11804 
length = PyUnicode_GET_LENGTH(self); 11805 kind = PyUnicode_KIND(self); 11806 data = PyUnicode_DATA(self); 11807 11808 /* Shortcut for single character strings */ 11809 if (length == 1) { 11810 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0); 11811 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch)); 11812 } 11813 11814 /* Special case for empty strings */ 11815 if (length == 0) 11816 return PyBool_FromLong(0); 11817 11818 for (i = 0; i < length; i++) { 11819 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i))) 11820 return PyBool_FromLong(0); 11821 } 11822 return PyBool_FromLong(1); 11823} 11824 11825PyDoc_STRVAR(isnumeric__doc__, 11826 "S.isnumeric() -> bool\n\ 11827\n\ 11828Return True if there are only numeric characters in S,\n\ 11829False otherwise."); 11830 11831static PyObject* 11832unicode_isnumeric(PyObject *self) 11833{ 11834 Py_ssize_t i, length; 11835 int kind; 11836 void *data; 11837 11838 if (PyUnicode_READY(self) == -1) 11839 return NULL; 11840 length = PyUnicode_GET_LENGTH(self); 11841 kind = PyUnicode_KIND(self); 11842 data = PyUnicode_DATA(self); 11843 11844 /* Shortcut for single character strings */ 11845 if (length == 1) 11846 return PyBool_FromLong( 11847 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0))); 11848 11849 /* Special case for empty strings */ 11850 if (length == 0) 11851 return PyBool_FromLong(0); 11852 11853 for (i = 0; i < length; i++) { 11854 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i))) 11855 return PyBool_FromLong(0); 11856 } 11857 return PyBool_FromLong(1); 11858} 11859 11860int 11861PyUnicode_IsIdentifier(PyObject *self) 11862{ 11863 int kind; 11864 void *data; 11865 Py_ssize_t i; 11866 Py_UCS4 first; 11867 11868 if (PyUnicode_READY(self) == -1) { 11869 Py_FatalError("identifier not ready"); 11870 return 0; 11871 } 11872 11873 /* Special case for empty strings */ 11874 if (PyUnicode_GET_LENGTH(self) == 0) 11875 return 0; 11876 kind = PyUnicode_KIND(self); 11877 data = PyUnicode_DATA(self); 11878 11879 /* PEP 
3131 says that the first character must be in 11880 XID_Start and subsequent characters in XID_Continue, 11881 and for the ASCII range, the 2.x rules apply (i.e 11882 start with letters and underscore, continue with 11883 letters, digits, underscore). However, given the current 11884 definition of XID_Start and XID_Continue, it is sufficient 11885 to check just for these, except that _ must be allowed 11886 as starting an identifier. */ 11887 first = PyUnicode_READ(kind, data, 0); 11888 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */) 11889 return 0; 11890 11891 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++) 11892 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i))) 11893 return 0; 11894 return 1; 11895} 11896 11897PyDoc_STRVAR(isidentifier__doc__, 11898 "S.isidentifier() -> bool\n\ 11899\n\ 11900Return True if S is a valid identifier according\n\ 11901to the language definition.\n\ 11902\n\ 11903Use keyword.iskeyword() to test for reserved identifiers\n\ 11904such as \"def\" and \"class\".\n"); 11905 11906static PyObject* 11907unicode_isidentifier(PyObject *self) 11908{ 11909 return PyBool_FromLong(PyUnicode_IsIdentifier(self)); 11910} 11911 11912PyDoc_STRVAR(isprintable__doc__, 11913 "S.isprintable() -> bool\n\ 11914\n\ 11915Return True if all characters in S are considered\n\ 11916printable in repr() or S is empty, False otherwise."); 11917 11918static PyObject* 11919unicode_isprintable(PyObject *self) 11920{ 11921 Py_ssize_t i, length; 11922 int kind; 11923 void *data; 11924 11925 if (PyUnicode_READY(self) == -1) 11926 return NULL; 11927 length = PyUnicode_GET_LENGTH(self); 11928 kind = PyUnicode_KIND(self); 11929 data = PyUnicode_DATA(self); 11930 11931 /* Shortcut for single character strings */ 11932 if (length == 1) 11933 return PyBool_FromLong( 11934 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0))); 11935 11936 for (i = 0; i < length; i++) { 11937 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) { 11938 
Py_RETURN_FALSE; 11939 } 11940 } 11941 Py_RETURN_TRUE; 11942} 11943 11944PyDoc_STRVAR(join__doc__, 11945 "S.join(iterable) -> str\n\ 11946\n\ 11947Return a string which is the concatenation of the strings in the\n\ 11948iterable. The separator between elements is S."); 11949 11950static PyObject* 11951unicode_join(PyObject *self, PyObject *data) 11952{ 11953 return PyUnicode_Join(self, data); 11954} 11955 11956static Py_ssize_t 11957unicode_length(PyObject *self) 11958{ 11959 if (PyUnicode_READY(self) == -1) 11960 return -1; 11961 return PyUnicode_GET_LENGTH(self); 11962} 11963 11964PyDoc_STRVAR(ljust__doc__, 11965 "S.ljust(width[, fillchar]) -> str\n\ 11966\n\ 11967Return S left-justified in a Unicode string of length width. Padding is\n\ 11968done using the specified fill character (default is a space)."); 11969 11970static PyObject * 11971unicode_ljust(PyObject *self, PyObject *args) 11972{ 11973 Py_ssize_t width; 11974 Py_UCS4 fillchar = ' '; 11975 11976 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar)) 11977 return NULL; 11978 11979 if (PyUnicode_READY(self) == -1) 11980 return NULL; 11981 11982 if (PyUnicode_GET_LENGTH(self) >= width) 11983 return unicode_result_unchanged(self); 11984 11985 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar); 11986} 11987 11988PyDoc_STRVAR(lower__doc__, 11989 "S.lower() -> str\n\ 11990\n\ 11991Return a copy of the string S converted to lowercase."); 11992 11993static PyObject* 11994unicode_lower(PyObject *self) 11995{ 11996 if (PyUnicode_READY(self) == -1) 11997 return NULL; 11998 if (PyUnicode_IS_ASCII(self)) 11999 return ascii_upper_or_lower(self, 1); 12000 return case_operation(self, do_lower); 12001} 12002 12003#define LEFTSTRIP 0 12004#define RIGHTSTRIP 1 12005#define BOTHSTRIP 2 12006 12007/* Arrays indexed by above */ 12008static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"}; 12009 12010#define STRIPNAME(i) (stripformat[i]+3) 12011 12012/* externally visible 
for str.strip(unicode) */ 12013PyObject * 12014_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj) 12015{ 12016 void *data; 12017 int kind; 12018 Py_ssize_t i, j, len; 12019 BLOOM_MASK sepmask; 12020 Py_ssize_t seplen; 12021 12022 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1) 12023 return NULL; 12024 12025 kind = PyUnicode_KIND(self); 12026 data = PyUnicode_DATA(self); 12027 len = PyUnicode_GET_LENGTH(self); 12028 seplen = PyUnicode_GET_LENGTH(sepobj); 12029 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj), 12030 PyUnicode_DATA(sepobj), 12031 seplen); 12032 12033 i = 0; 12034 if (striptype != RIGHTSTRIP) { 12035 while (i < len) { 12036 Py_UCS4 ch = PyUnicode_READ(kind, data, i); 12037 if (!BLOOM(sepmask, ch)) 12038 break; 12039 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0) 12040 break; 12041 i++; 12042 } 12043 } 12044 12045 j = len; 12046 if (striptype != LEFTSTRIP) { 12047 j--; 12048 while (j >= i) { 12049 Py_UCS4 ch = PyUnicode_READ(kind, data, j); 12050 if (!BLOOM(sepmask, ch)) 12051 break; 12052 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0) 12053 break; 12054 j--; 12055 } 12056 12057 j++; 12058 } 12059 12060 return PyUnicode_Substring(self, i, j); 12061} 12062 12063PyObject* 12064PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end) 12065{ 12066 unsigned char *data; 12067 int kind; 12068 Py_ssize_t length; 12069 12070 if (PyUnicode_READY(self) == -1) 12071 return NULL; 12072 12073 length = PyUnicode_GET_LENGTH(self); 12074 end = Py_MIN(end, length); 12075 12076 if (start == 0 && end == length) 12077 return unicode_result_unchanged(self); 12078 12079 if (start < 0 || end < 0) { 12080 PyErr_SetString(PyExc_IndexError, "string index out of range"); 12081 return NULL; 12082 } 12083 if (start >= length || end < start) 12084 _Py_RETURN_UNICODE_EMPTY(); 12085 12086 length = end - start; 12087 if (PyUnicode_IS_ASCII(self)) { 12088 data = PyUnicode_1BYTE_DATA(self); 12089 return 
_PyUnicode_FromASCII((char*)(data + start), length); 12090 } 12091 else { 12092 kind = PyUnicode_KIND(self); 12093 data = PyUnicode_1BYTE_DATA(self); 12094 return PyUnicode_FromKindAndData(kind, 12095 data + kind * start, 12096 length); 12097 } 12098} 12099 12100static PyObject * 12101do_strip(PyObject *self, int striptype) 12102{ 12103 Py_ssize_t len, i, j; 12104 12105 if (PyUnicode_READY(self) == -1) 12106 return NULL; 12107 12108 len = PyUnicode_GET_LENGTH(self); 12109 12110 if (PyUnicode_IS_ASCII(self)) { 12111 Py_UCS1 *data = PyUnicode_1BYTE_DATA(self); 12112 12113 i = 0; 12114 if (striptype != RIGHTSTRIP) { 12115 while (i < len) { 12116 Py_UCS1 ch = data[i]; 12117 if (!_Py_ascii_whitespace[ch]) 12118 break; 12119 i++; 12120 } 12121 } 12122 12123 j = len; 12124 if (striptype != LEFTSTRIP) { 12125 j--; 12126 while (j >= i) { 12127 Py_UCS1 ch = data[j]; 12128 if (!_Py_ascii_whitespace[ch]) 12129 break; 12130 j--; 12131 } 12132 j++; 12133 } 12134 } 12135 else { 12136 int kind = PyUnicode_KIND(self); 12137 void *data = PyUnicode_DATA(self); 12138 12139 i = 0; 12140 if (striptype != RIGHTSTRIP) { 12141 while (i < len) { 12142 Py_UCS4 ch = PyUnicode_READ(kind, data, i); 12143 if (!Py_UNICODE_ISSPACE(ch)) 12144 break; 12145 i++; 12146 } 12147 } 12148 12149 j = len; 12150 if (striptype != LEFTSTRIP) { 12151 j--; 12152 while (j >= i) { 12153 Py_UCS4 ch = PyUnicode_READ(kind, data, j); 12154 if (!Py_UNICODE_ISSPACE(ch)) 12155 break; 12156 j--; 12157 } 12158 j++; 12159 } 12160 } 12161 12162 return PyUnicode_Substring(self, i, j); 12163} 12164 12165 12166static PyObject * 12167do_argstrip(PyObject *self, int striptype, PyObject *args) 12168{ 12169 PyObject *sep = NULL; 12170 12171 if (!PyArg_ParseTuple(args, stripformat[striptype], &sep)) 12172 return NULL; 12173 12174 if (sep != NULL && sep != Py_None) { 12175 if (PyUnicode_Check(sep)) 12176 return _PyUnicode_XStrip(self, striptype, sep); 12177 else { 12178 PyErr_Format(PyExc_TypeError, 12179 "%s arg must be None or 
str", 12180 STRIPNAME(striptype)); 12181 return NULL; 12182 } 12183 } 12184 12185 return do_strip(self, striptype); 12186} 12187 12188 12189PyDoc_STRVAR(strip__doc__, 12190 "S.strip([chars]) -> str\n\ 12191\n\ 12192Return a copy of the string S with leading and trailing\n\ 12193whitespace removed.\n\ 12194If chars is given and not None, remove characters in chars instead."); 12195 12196static PyObject * 12197unicode_strip(PyObject *self, PyObject *args) 12198{ 12199 if (PyTuple_GET_SIZE(args) == 0) 12200 return do_strip(self, BOTHSTRIP); /* Common case */ 12201 else 12202 return do_argstrip(self, BOTHSTRIP, args); 12203} 12204 12205 12206PyDoc_STRVAR(lstrip__doc__, 12207 "S.lstrip([chars]) -> str\n\ 12208\n\ 12209Return a copy of the string S with leading whitespace removed.\n\ 12210If chars is given and not None, remove characters in chars instead."); 12211 12212static PyObject * 12213unicode_lstrip(PyObject *self, PyObject *args) 12214{ 12215 if (PyTuple_GET_SIZE(args) == 0) 12216 return do_strip(self, LEFTSTRIP); /* Common case */ 12217 else 12218 return do_argstrip(self, LEFTSTRIP, args); 12219} 12220 12221 12222PyDoc_STRVAR(rstrip__doc__, 12223 "S.rstrip([chars]) -> str\n\ 12224\n\ 12225Return a copy of the string S with trailing whitespace removed.\n\ 12226If chars is given and not None, remove characters in chars instead."); 12227 12228static PyObject * 12229unicode_rstrip(PyObject *self, PyObject *args) 12230{ 12231 if (PyTuple_GET_SIZE(args) == 0) 12232 return do_strip(self, RIGHTSTRIP); /* Common case */ 12233 else 12234 return do_argstrip(self, RIGHTSTRIP, args); 12235} 12236 12237 12238static PyObject* 12239unicode_repeat(PyObject *str, Py_ssize_t len) 12240{ 12241 PyObject *u; 12242 Py_ssize_t nchars, n; 12243 12244 if (len < 1) 12245 _Py_RETURN_UNICODE_EMPTY(); 12246 12247 /* no repeat, return original string */ 12248 if (len == 1) 12249 return unicode_result_unchanged(str); 12250 12251 if (PyUnicode_READY(str) == -1) 12252 return NULL; 12253 12254 if 
(PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) { 12255 PyErr_SetString(PyExc_OverflowError, 12256 "repeated string is too long"); 12257 return NULL; 12258 } 12259 nchars = len * PyUnicode_GET_LENGTH(str); 12260 12261 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str)); 12262 if (!u) 12263 return NULL; 12264 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str)); 12265 12266 if (PyUnicode_GET_LENGTH(str) == 1) { 12267 const int kind = PyUnicode_KIND(str); 12268 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0); 12269 if (kind == PyUnicode_1BYTE_KIND) { 12270 void *to = PyUnicode_DATA(u); 12271 memset(to, (unsigned char)fill_char, len); 12272 } 12273 else if (kind == PyUnicode_2BYTE_KIND) { 12274 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u); 12275 for (n = 0; n < len; ++n) 12276 ucs2[n] = fill_char; 12277 } else { 12278 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u); 12279 assert(kind == PyUnicode_4BYTE_KIND); 12280 for (n = 0; n < len; ++n) 12281 ucs4[n] = fill_char; 12282 } 12283 } 12284 else { 12285 /* number of characters copied this far */ 12286 Py_ssize_t done = PyUnicode_GET_LENGTH(str); 12287 const Py_ssize_t char_size = PyUnicode_KIND(str); 12288 char *to = (char *) PyUnicode_DATA(u); 12289 Py_MEMCPY(to, PyUnicode_DATA(str), 12290 PyUnicode_GET_LENGTH(str) * char_size); 12291 while (done < nchars) { 12292 n = (done <= nchars-done) ? 
done : nchars-done; 12293 Py_MEMCPY(to + (done * char_size), to, n * char_size); 12294 done += n; 12295 } 12296 } 12297 12298 assert(_PyUnicode_CheckConsistency(u, 1)); 12299 return u; 12300} 12301 12302PyObject * 12303PyUnicode_Replace(PyObject *obj, 12304 PyObject *subobj, 12305 PyObject *replobj, 12306 Py_ssize_t maxcount) 12307{ 12308 PyObject *self; 12309 PyObject *str1; 12310 PyObject *str2; 12311 PyObject *result; 12312 12313 self = PyUnicode_FromObject(obj); 12314 if (self == NULL) 12315 return NULL; 12316 str1 = PyUnicode_FromObject(subobj); 12317 if (str1 == NULL) { 12318 Py_DECREF(self); 12319 return NULL; 12320 } 12321 str2 = PyUnicode_FromObject(replobj); 12322 if (str2 == NULL) { 12323 Py_DECREF(self); 12324 Py_DECREF(str1); 12325 return NULL; 12326 } 12327 if (PyUnicode_READY(self) == -1 || 12328 PyUnicode_READY(str1) == -1 || 12329 PyUnicode_READY(str2) == -1) 12330 result = NULL; 12331 else 12332 result = replace(self, str1, str2, maxcount); 12333 Py_DECREF(self); 12334 Py_DECREF(str1); 12335 Py_DECREF(str2); 12336 return result; 12337} 12338 12339PyDoc_STRVAR(replace__doc__, 12340 "S.replace(old, new[, count]) -> str\n\ 12341\n\ 12342Return a copy of S with all occurrences of substring\n\ 12343old replaced by new. 
If the optional argument count is\n\ 12344given, only the first count occurrences are replaced."); 12345 12346static PyObject* 12347unicode_replace(PyObject *self, PyObject *args) 12348{ 12349 PyObject *str1; 12350 PyObject *str2; 12351 Py_ssize_t maxcount = -1; 12352 PyObject *result; 12353 12354 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount)) 12355 return NULL; 12356 if (PyUnicode_READY(self) == -1) 12357 return NULL; 12358 str1 = PyUnicode_FromObject(str1); 12359 if (str1 == NULL) 12360 return NULL; 12361 str2 = PyUnicode_FromObject(str2); 12362 if (str2 == NULL) { 12363 Py_DECREF(str1); 12364 return NULL; 12365 } 12366 if (PyUnicode_READY(str1) == -1 || PyUnicode_READY(str2) == -1) 12367 result = NULL; 12368 else 12369 result = replace(self, str1, str2, maxcount); 12370 12371 Py_DECREF(str1); 12372 Py_DECREF(str2); 12373 return result; 12374} 12375 12376static PyObject * 12377unicode_repr(PyObject *unicode) 12378{ 12379 PyObject *repr; 12380 Py_ssize_t isize; 12381 Py_ssize_t osize, squote, dquote, i, o; 12382 Py_UCS4 max, quote; 12383 int ikind, okind, unchanged; 12384 void *idata, *odata; 12385 12386 if (PyUnicode_READY(unicode) == -1) 12387 return NULL; 12388 12389 isize = PyUnicode_GET_LENGTH(unicode); 12390 idata = PyUnicode_DATA(unicode); 12391 12392 /* Compute length of output, quote characters, and 12393 maximum character */ 12394 osize = 0; 12395 max = 127; 12396 squote = dquote = 0; 12397 ikind = PyUnicode_KIND(unicode); 12398 for (i = 0; i < isize; i++) { 12399 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i); 12400 Py_ssize_t incr = 1; 12401 switch (ch) { 12402 case '\'': squote++; break; 12403 case '"': dquote++; break; 12404 case '\\': case '\t': case '\r': case '\n': 12405 incr = 2; 12406 break; 12407 default: 12408 /* Fast-path ASCII */ 12409 if (ch < ' ' || ch == 0x7f) 12410 incr = 4; /* \xHH */ 12411 else if (ch < 0x7f) 12412 ; 12413 else if (Py_UNICODE_ISPRINTABLE(ch)) 12414 max = ch > max ? 
ch : max; 12415 else if (ch < 0x100) 12416 incr = 4; /* \xHH */ 12417 else if (ch < 0x10000) 12418 incr = 6; /* \uHHHH */ 12419 else 12420 incr = 10; /* \uHHHHHHHH */ 12421 } 12422 if (osize > PY_SSIZE_T_MAX - incr) { 12423 PyErr_SetString(PyExc_OverflowError, 12424 "string is too long to generate repr"); 12425 return NULL; 12426 } 12427 osize += incr; 12428 } 12429 12430 quote = '\''; 12431 unchanged = (osize == isize); 12432 if (squote) { 12433 unchanged = 0; 12434 if (dquote) 12435 /* Both squote and dquote present. Use squote, 12436 and escape them */ 12437 osize += squote; 12438 else 12439 quote = '"'; 12440 } 12441 osize += 2; /* quotes */ 12442 12443 repr = PyUnicode_New(osize, max); 12444 if (repr == NULL) 12445 return NULL; 12446 okind = PyUnicode_KIND(repr); 12447 odata = PyUnicode_DATA(repr); 12448 12449 PyUnicode_WRITE(okind, odata, 0, quote); 12450 PyUnicode_WRITE(okind, odata, osize-1, quote); 12451 if (unchanged) { 12452 _PyUnicode_FastCopyCharacters(repr, 1, 12453 unicode, 0, 12454 isize); 12455 } 12456 else { 12457 for (i = 0, o = 1; i < isize; i++) { 12458 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i); 12459 12460 /* Escape quotes and backslashes */ 12461 if ((ch == quote) || (ch == '\\')) { 12462 PyUnicode_WRITE(okind, odata, o++, '\\'); 12463 PyUnicode_WRITE(okind, odata, o++, ch); 12464 continue; 12465 } 12466 12467 /* Map special whitespace to '\t', \n', '\r' */ 12468 if (ch == '\t') { 12469 PyUnicode_WRITE(okind, odata, o++, '\\'); 12470 PyUnicode_WRITE(okind, odata, o++, 't'); 12471 } 12472 else if (ch == '\n') { 12473 PyUnicode_WRITE(okind, odata, o++, '\\'); 12474 PyUnicode_WRITE(okind, odata, o++, 'n'); 12475 } 12476 else if (ch == '\r') { 12477 PyUnicode_WRITE(okind, odata, o++, '\\'); 12478 PyUnicode_WRITE(okind, odata, o++, 'r'); 12479 } 12480 12481 /* Map non-printable US ASCII to '\xhh' */ 12482 else if (ch < ' ' || ch == 0x7F) { 12483 PyUnicode_WRITE(okind, odata, o++, '\\'); 12484 PyUnicode_WRITE(okind, odata, o++, 'x'); 12485 
PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]); 12486 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]); 12487 } 12488 12489 /* Copy ASCII characters as-is */ 12490 else if (ch < 0x7F) { 12491 PyUnicode_WRITE(okind, odata, o++, ch); 12492 } 12493 12494 /* Non-ASCII characters */ 12495 else { 12496 /* Map Unicode whitespace and control characters 12497 (categories Z* and C* except ASCII space) 12498 */ 12499 if (!Py_UNICODE_ISPRINTABLE(ch)) { 12500 PyUnicode_WRITE(okind, odata, o++, '\\'); 12501 /* Map 8-bit characters to '\xhh' */ 12502 if (ch <= 0xff) { 12503 PyUnicode_WRITE(okind, odata, o++, 'x'); 12504 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]); 12505 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]); 12506 } 12507 /* Map 16-bit characters to '\uxxxx' */ 12508 else if (ch <= 0xffff) { 12509 PyUnicode_WRITE(okind, odata, o++, 'u'); 12510 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]); 12511 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]); 12512 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]); 12513 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]); 12514 } 12515 /* Map 21-bit characters to '\U00xxxxxx' */ 12516 else { 12517 PyUnicode_WRITE(okind, odata, o++, 'U'); 12518 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]); 12519 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]); 12520 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]); 12521 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]); 12522 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]); 12523 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]); 12524 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]); 12525 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]); 12526 } 12527 } 12528 /* Copy characters as-is */ 12529 else { 12530 PyUnicode_WRITE(okind, 
odata, o++, ch); 12531 } 12532 } 12533 } 12534 } 12535 /* Closing quote already added at the beginning */ 12536 assert(_PyUnicode_CheckConsistency(repr, 1)); 12537 return repr; 12538} 12539 12540PyDoc_STRVAR(rfind__doc__, 12541 "S.rfind(sub[, start[, end]]) -> int\n\ 12542\n\ 12543Return the highest index in S where substring sub is found,\n\ 12544such that sub is contained within S[start:end]. Optional\n\ 12545arguments start and end are interpreted as in slice notation.\n\ 12546\n\ 12547Return -1 on failure."); 12548 12549static PyObject * 12550unicode_rfind(PyObject *self, PyObject *args) 12551{ 12552 /* initialize variables to prevent gcc warning */ 12553 PyObject *substring = NULL; 12554 Py_ssize_t start = 0; 12555 Py_ssize_t end = 0; 12556 Py_ssize_t result; 12557 12558 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring, 12559 &start, &end)) 12560 return NULL; 12561 12562 if (PyUnicode_READY(self) == -1) { 12563 Py_DECREF(substring); 12564 return NULL; 12565 } 12566 if (PyUnicode_READY(substring) == -1) { 12567 Py_DECREF(substring); 12568 return NULL; 12569 } 12570 12571 result = any_find_slice(-1, self, substring, start, end); 12572 12573 Py_DECREF(substring); 12574 12575 if (result == -2) 12576 return NULL; 12577 12578 return PyLong_FromSsize_t(result); 12579} 12580 12581PyDoc_STRVAR(rindex__doc__, 12582 "S.rindex(sub[, start[, end]]) -> int\n\ 12583\n\ 12584Like S.rfind() but raise ValueError when the substring is not found."); 12585 12586static PyObject * 12587unicode_rindex(PyObject *self, PyObject *args) 12588{ 12589 /* initialize variables to prevent gcc warning */ 12590 PyObject *substring = NULL; 12591 Py_ssize_t start = 0; 12592 Py_ssize_t end = 0; 12593 Py_ssize_t result; 12594 12595 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring, 12596 &start, &end)) 12597 return NULL; 12598 12599 if (PyUnicode_READY(self) == -1) { 12600 Py_DECREF(substring); 12601 return NULL; 12602 } 12603 if (PyUnicode_READY(substring) == -1) 
{ 12604 Py_DECREF(substring); 12605 return NULL; 12606 } 12607 12608 result = any_find_slice(-1, self, substring, start, end); 12609 12610 Py_DECREF(substring); 12611 12612 if (result == -2) 12613 return NULL; 12614 12615 if (result < 0) { 12616 PyErr_SetString(PyExc_ValueError, "substring not found"); 12617 return NULL; 12618 } 12619 12620 return PyLong_FromSsize_t(result); 12621} 12622 12623PyDoc_STRVAR(rjust__doc__, 12624 "S.rjust(width[, fillchar]) -> str\n\ 12625\n\ 12626Return S right-justified in a string of length width. Padding is\n\ 12627done using the specified fill character (default is a space)."); 12628 12629static PyObject * 12630unicode_rjust(PyObject *self, PyObject *args) 12631{ 12632 Py_ssize_t width; 12633 Py_UCS4 fillchar = ' '; 12634 12635 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar)) 12636 return NULL; 12637 12638 if (PyUnicode_READY(self) == -1) 12639 return NULL; 12640 12641 if (PyUnicode_GET_LENGTH(self) >= width) 12642 return unicode_result_unchanged(self); 12643 12644 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar); 12645} 12646 12647PyObject * 12648PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit) 12649{ 12650 PyObject *result; 12651 12652 s = PyUnicode_FromObject(s); 12653 if (s == NULL) 12654 return NULL; 12655 if (sep != NULL) { 12656 sep = PyUnicode_FromObject(sep); 12657 if (sep == NULL) { 12658 Py_DECREF(s); 12659 return NULL; 12660 } 12661 } 12662 12663 result = split(s, sep, maxsplit); 12664 12665 Py_DECREF(s); 12666 Py_XDECREF(sep); 12667 return result; 12668} 12669 12670PyDoc_STRVAR(split__doc__, 12671 "S.split(sep=None, maxsplit=-1) -> list of strings\n\ 12672\n\ 12673Return a list of the words in S, using sep as the\n\ 12674delimiter string. If maxsplit is given, at most maxsplit\n\ 12675splits are done. 
If sep is not specified or is None, any\n\ 12676whitespace string is a separator and empty strings are\n\ 12677removed from the result."); 12678 12679static PyObject* 12680unicode_split(PyObject *self, PyObject *args, PyObject *kwds) 12681{ 12682 static char *kwlist[] = {"sep", "maxsplit", 0}; 12683 PyObject *substring = Py_None; 12684 Py_ssize_t maxcount = -1; 12685 12686 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:split", 12687 kwlist, &substring, &maxcount)) 12688 return NULL; 12689 12690 if (substring == Py_None) 12691 return split(self, NULL, maxcount); 12692 else if (PyUnicode_Check(substring)) 12693 return split(self, substring, maxcount); 12694 else 12695 return PyUnicode_Split(self, substring, maxcount); 12696} 12697 12698PyObject * 12699PyUnicode_Partition(PyObject *str_in, PyObject *sep_in) 12700{ 12701 PyObject* str_obj; 12702 PyObject* sep_obj; 12703 PyObject* out; 12704 int kind1, kind2; 12705 void *buf1, *buf2; 12706 Py_ssize_t len1, len2; 12707 12708 str_obj = PyUnicode_FromObject(str_in); 12709 if (!str_obj) 12710 return NULL; 12711 sep_obj = PyUnicode_FromObject(sep_in); 12712 if (!sep_obj) { 12713 Py_DECREF(str_obj); 12714 return NULL; 12715 } 12716 if (PyUnicode_READY(sep_obj) == -1 || PyUnicode_READY(str_obj) == -1) { 12717 Py_DECREF(sep_obj); 12718 Py_DECREF(str_obj); 12719 return NULL; 12720 } 12721 12722 kind1 = PyUnicode_KIND(str_obj); 12723 kind2 = PyUnicode_KIND(sep_obj); 12724 len1 = PyUnicode_GET_LENGTH(str_obj); 12725 len2 = PyUnicode_GET_LENGTH(sep_obj); 12726 if (kind1 < kind2 || len1 < len2) { 12727 _Py_INCREF_UNICODE_EMPTY(); 12728 if (!unicode_empty) 12729 out = NULL; 12730 else { 12731 out = PyTuple_Pack(3, str_obj, unicode_empty, unicode_empty); 12732 Py_DECREF(unicode_empty); 12733 } 12734 Py_DECREF(sep_obj); 12735 Py_DECREF(str_obj); 12736 return out; 12737 } 12738 buf1 = PyUnicode_DATA(str_obj); 12739 buf2 = PyUnicode_DATA(sep_obj); 12740 if (kind2 != kind1) { 12741 buf2 = _PyUnicode_AsKind(sep_obj, kind1); 12742 if 
(!buf2) 12743 goto onError; 12744 } 12745 12746 switch (kind1) { 12747 case PyUnicode_1BYTE_KIND: 12748 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj)) 12749 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2); 12750 else 12751 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2); 12752 break; 12753 case PyUnicode_2BYTE_KIND: 12754 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2); 12755 break; 12756 case PyUnicode_4BYTE_KIND: 12757 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2); 12758 break; 12759 default: 12760 assert(0); 12761 out = 0; 12762 } 12763 12764 Py_DECREF(sep_obj); 12765 Py_DECREF(str_obj); 12766 if (kind2 != kind1) 12767 PyMem_Free(buf2); 12768 12769 return out; 12770 onError: 12771 Py_DECREF(sep_obj); 12772 Py_DECREF(str_obj); 12773 if (kind2 != kind1 && buf2) 12774 PyMem_Free(buf2); 12775 return NULL; 12776} 12777 12778 12779PyObject * 12780PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in) 12781{ 12782 PyObject* str_obj; 12783 PyObject* sep_obj; 12784 PyObject* out; 12785 int kind1, kind2; 12786 void *buf1, *buf2; 12787 Py_ssize_t len1, len2; 12788 12789 str_obj = PyUnicode_FromObject(str_in); 12790 if (!str_obj) 12791 return NULL; 12792 sep_obj = PyUnicode_FromObject(sep_in); 12793 if (!sep_obj) { 12794 Py_DECREF(str_obj); 12795 return NULL; 12796 } 12797 12798 kind1 = PyUnicode_KIND(str_obj); 12799 kind2 = PyUnicode_KIND(sep_obj); 12800 len1 = PyUnicode_GET_LENGTH(str_obj); 12801 len2 = PyUnicode_GET_LENGTH(sep_obj); 12802 if (kind1 < kind2 || len1 < len2) { 12803 _Py_INCREF_UNICODE_EMPTY(); 12804 if (!unicode_empty) 12805 out = NULL; 12806 else { 12807 out = PyTuple_Pack(3, unicode_empty, unicode_empty, str_obj); 12808 Py_DECREF(unicode_empty); 12809 } 12810 Py_DECREF(sep_obj); 12811 Py_DECREF(str_obj); 12812 return out; 12813 } 12814 buf1 = PyUnicode_DATA(str_obj); 12815 buf2 = PyUnicode_DATA(sep_obj); 12816 if (kind2 != kind1) { 12817 buf2 = 
_PyUnicode_AsKind(sep_obj, kind1); 12818 if (!buf2) 12819 goto onError; 12820 } 12821 12822 switch (kind1) { 12823 case PyUnicode_1BYTE_KIND: 12824 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj)) 12825 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2); 12826 else 12827 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2); 12828 break; 12829 case PyUnicode_2BYTE_KIND: 12830 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2); 12831 break; 12832 case PyUnicode_4BYTE_KIND: 12833 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2); 12834 break; 12835 default: 12836 assert(0); 12837 out = 0; 12838 } 12839 12840 Py_DECREF(sep_obj); 12841 Py_DECREF(str_obj); 12842 if (kind2 != kind1) 12843 PyMem_Free(buf2); 12844 12845 return out; 12846 onError: 12847 Py_DECREF(sep_obj); 12848 Py_DECREF(str_obj); 12849 if (kind2 != kind1 && buf2) 12850 PyMem_Free(buf2); 12851 return NULL; 12852} 12853 12854PyDoc_STRVAR(partition__doc__, 12855 "S.partition(sep) -> (head, sep, tail)\n\ 12856\n\ 12857Search for the separator sep in S, and return the part before it,\n\ 12858the separator itself, and the part after it. If the separator is not\n\ 12859found, return S and two empty strings."); 12860 12861static PyObject* 12862unicode_partition(PyObject *self, PyObject *separator) 12863{ 12864 return PyUnicode_Partition(self, separator); 12865} 12866 12867PyDoc_STRVAR(rpartition__doc__, 12868 "S.rpartition(sep) -> (head, sep, tail)\n\ 12869\n\ 12870Search for the separator sep in S, starting at the end of S, and return\n\ 12871the part before it, the separator itself, and the part after it. 
If the\n\ 12872separator is not found, return two empty strings and S."); 12873 12874static PyObject* 12875unicode_rpartition(PyObject *self, PyObject *separator) 12876{ 12877 return PyUnicode_RPartition(self, separator); 12878} 12879 12880PyObject * 12881PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit) 12882{ 12883 PyObject *result; 12884 12885 s = PyUnicode_FromObject(s); 12886 if (s == NULL) 12887 return NULL; 12888 if (sep != NULL) { 12889 sep = PyUnicode_FromObject(sep); 12890 if (sep == NULL) { 12891 Py_DECREF(s); 12892 return NULL; 12893 } 12894 } 12895 12896 result = rsplit(s, sep, maxsplit); 12897 12898 Py_DECREF(s); 12899 Py_XDECREF(sep); 12900 return result; 12901} 12902 12903PyDoc_STRVAR(rsplit__doc__, 12904 "S.rsplit(sep=None, maxsplit=-1) -> list of strings\n\ 12905\n\ 12906Return a list of the words in S, using sep as the\n\ 12907delimiter string, starting at the end of the string and\n\ 12908working to the front. If maxsplit is given, at most maxsplit\n\ 12909splits are done. 
If sep is not specified, any whitespace string\n\ 12910is a separator."); 12911 12912static PyObject* 12913unicode_rsplit(PyObject *self, PyObject *args, PyObject *kwds) 12914{ 12915 static char *kwlist[] = {"sep", "maxsplit", 0}; 12916 PyObject *substring = Py_None; 12917 Py_ssize_t maxcount = -1; 12918 12919 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:rsplit", 12920 kwlist, &substring, &maxcount)) 12921 return NULL; 12922 12923 if (substring == Py_None) 12924 return rsplit(self, NULL, maxcount); 12925 else if (PyUnicode_Check(substring)) 12926 return rsplit(self, substring, maxcount); 12927 else 12928 return PyUnicode_RSplit(self, substring, maxcount); 12929} 12930 12931PyDoc_STRVAR(splitlines__doc__, 12932 "S.splitlines([keepends]) -> list of strings\n\ 12933\n\ 12934Return a list of the lines in S, breaking at line boundaries.\n\ 12935Line breaks are not included in the resulting list unless keepends\n\ 12936is given and true."); 12937 12938static PyObject* 12939unicode_splitlines(PyObject *self, PyObject *args, PyObject *kwds) 12940{ 12941 static char *kwlist[] = {"keepends", 0}; 12942 int keepends = 0; 12943 12944 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines", 12945 kwlist, &keepends)) 12946 return NULL; 12947 12948 return PyUnicode_Splitlines(self, keepends); 12949} 12950 12951static 12952PyObject *unicode_str(PyObject *self) 12953{ 12954 return unicode_result_unchanged(self); 12955} 12956 12957PyDoc_STRVAR(swapcase__doc__, 12958 "S.swapcase() -> str\n\ 12959\n\ 12960Return a copy of S with uppercase characters converted to lowercase\n\ 12961and vice versa."); 12962 12963static PyObject* 12964unicode_swapcase(PyObject *self) 12965{ 12966 if (PyUnicode_READY(self) == -1) 12967 return NULL; 12968 return case_operation(self, do_swapcase); 12969} 12970 12971/*[clinic input] 12972 12973@staticmethod 12974str.maketrans as unicode_maketrans 12975 12976 x: object 12977 12978 y: unicode=NULL 12979 12980 z: unicode=NULL 12981 12982 / 12983 
12984Return a translation table usable for str.translate(). 12985 12986If there is only one argument, it must be a dictionary mapping Unicode 12987ordinals (integers) or characters to Unicode ordinals, strings or None. 12988Character keys will be then converted to ordinals. 12989If there are two arguments, they must be strings of equal length, and 12990in the resulting dictionary, each character in x will be mapped to the 12991character at the same position in y. If there is a third argument, it 12992must be a string, whose characters will be mapped to None in the result. 12993[clinic start generated code]*/ 12994 12995static PyObject * 12996unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z) 12997/*[clinic end generated code: output=a925c89452bd5881 input=7bfbf529a293c6c5]*/ 12998{ 12999 PyObject *new = NULL, *key, *value; 13000 Py_ssize_t i = 0; 13001 int res; 13002 13003 new = PyDict_New(); 13004 if (!new) 13005 return NULL; 13006 if (y != NULL) { 13007 int x_kind, y_kind, z_kind; 13008 void *x_data, *y_data, *z_data; 13009 13010 /* x must be a string too, of equal length */ 13011 if (!PyUnicode_Check(x)) { 13012 PyErr_SetString(PyExc_TypeError, "first maketrans argument must " 13013 "be a string if there is a second argument"); 13014 goto err; 13015 } 13016 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) { 13017 PyErr_SetString(PyExc_ValueError, "the first two maketrans " 13018 "arguments must have equal length"); 13019 goto err; 13020 } 13021 /* create entries for translating chars in x to those in y */ 13022 x_kind = PyUnicode_KIND(x); 13023 y_kind = PyUnicode_KIND(y); 13024 x_data = PyUnicode_DATA(x); 13025 y_data = PyUnicode_DATA(y); 13026 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) { 13027 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i)); 13028 if (!key) 13029 goto err; 13030 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i)); 13031 if (!value) { 13032 Py_DECREF(key); 13033 goto err; 13034 } 13035 res = 
PyDict_SetItem(new, key, value); 13036 Py_DECREF(key); 13037 Py_DECREF(value); 13038 if (res < 0) 13039 goto err; 13040 } 13041 /* create entries for deleting chars in z */ 13042 if (z != NULL) { 13043 z_kind = PyUnicode_KIND(z); 13044 z_data = PyUnicode_DATA(z); 13045 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) { 13046 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i)); 13047 if (!key) 13048 goto err; 13049 res = PyDict_SetItem(new, key, Py_None); 13050 Py_DECREF(key); 13051 if (res < 0) 13052 goto err; 13053 } 13054 } 13055 } else { 13056 int kind; 13057 void *data; 13058 13059 /* x must be a dict */ 13060 if (!PyDict_CheckExact(x)) { 13061 PyErr_SetString(PyExc_TypeError, "if you give only one argument " 13062 "to maketrans it must be a dict"); 13063 goto err; 13064 } 13065 /* copy entries into the new dict, converting string keys to int keys */ 13066 while (PyDict_Next(x, &i, &key, &value)) { 13067 if (PyUnicode_Check(key)) { 13068 /* convert string keys to integer keys */ 13069 PyObject *newkey; 13070 if (PyUnicode_GET_LENGTH(key) != 1) { 13071 PyErr_SetString(PyExc_ValueError, "string keys in translate " 13072 "table must be of length 1"); 13073 goto err; 13074 } 13075 kind = PyUnicode_KIND(key); 13076 data = PyUnicode_DATA(key); 13077 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0)); 13078 if (!newkey) 13079 goto err; 13080 res = PyDict_SetItem(new, newkey, value); 13081 Py_DECREF(newkey); 13082 if (res < 0) 13083 goto err; 13084 } else if (PyLong_Check(key)) { 13085 /* just keep integer keys */ 13086 if (PyDict_SetItem(new, key, value) < 0) 13087 goto err; 13088 } else { 13089 PyErr_SetString(PyExc_TypeError, "keys in translate table must " 13090 "be strings or integers"); 13091 goto err; 13092 } 13093 } 13094 } 13095 return new; 13096 err: 13097 Py_DECREF(new); 13098 return NULL; 13099} 13100 13101PyDoc_STRVAR(translate__doc__, 13102 "S.translate(table) -> str\n\ 13103\n\ 13104Return a copy of the string S in which each character has been 
mapped\n\ 13105through the given translation table. The table must implement\n\ 13106lookup/indexing via __getitem__, for instance a dictionary or list,\n\ 13107mapping Unicode ordinals to Unicode ordinals, strings, or None. If\n\ 13108this operation raises LookupError, the character is left untouched.\n\ 13109Characters mapped to None are deleted."); 13110 13111static PyObject* 13112unicode_translate(PyObject *self, PyObject *table) 13113{ 13114 return _PyUnicode_TranslateCharmap(self, table, "ignore"); 13115} 13116 13117PyDoc_STRVAR(upper__doc__, 13118 "S.upper() -> str\n\ 13119\n\ 13120Return a copy of S converted to uppercase."); 13121 13122static PyObject* 13123unicode_upper(PyObject *self) 13124{ 13125 if (PyUnicode_READY(self) == -1) 13126 return NULL; 13127 if (PyUnicode_IS_ASCII(self)) 13128 return ascii_upper_or_lower(self, 0); 13129 return case_operation(self, do_upper); 13130} 13131 13132PyDoc_STRVAR(zfill__doc__, 13133 "S.zfill(width) -> str\n\ 13134\n\ 13135Pad a numeric string S with zeros on the left, to fill a field\n\ 13136of the specified width. 
The string S is never truncated."); 13137 13138static PyObject * 13139unicode_zfill(PyObject *self, PyObject *args) 13140{ 13141 Py_ssize_t fill; 13142 PyObject *u; 13143 Py_ssize_t width; 13144 int kind; 13145 void *data; 13146 Py_UCS4 chr; 13147 13148 if (!PyArg_ParseTuple(args, "n:zfill", &width)) 13149 return NULL; 13150 13151 if (PyUnicode_READY(self) == -1) 13152 return NULL; 13153 13154 if (PyUnicode_GET_LENGTH(self) >= width) 13155 return unicode_result_unchanged(self); 13156 13157 fill = width - PyUnicode_GET_LENGTH(self); 13158 13159 u = pad(self, fill, 0, '0'); 13160 13161 if (u == NULL) 13162 return NULL; 13163 13164 kind = PyUnicode_KIND(u); 13165 data = PyUnicode_DATA(u); 13166 chr = PyUnicode_READ(kind, data, fill); 13167 13168 if (chr == '+' || chr == '-') { 13169 /* move sign to beginning of string */ 13170 PyUnicode_WRITE(kind, data, 0, chr); 13171 PyUnicode_WRITE(kind, data, fill, '0'); 13172 } 13173 13174 assert(_PyUnicode_CheckConsistency(u, 1)); 13175 return u; 13176} 13177 13178#if 0 13179static PyObject * 13180unicode__decimal2ascii(PyObject *self) 13181{ 13182 return PyUnicode_TransformDecimalAndSpaceToASCII(self); 13183} 13184#endif 13185 13186PyDoc_STRVAR(startswith__doc__, 13187 "S.startswith(prefix[, start[, end]]) -> bool\n\ 13188\n\ 13189Return True if S starts with the specified prefix, False otherwise.\n\ 13190With optional start, test S beginning at that position.\n\ 13191With optional end, stop comparing S at that position.\n\ 13192prefix can also be a tuple of strings to try."); 13193 13194static PyObject * 13195unicode_startswith(PyObject *self, 13196 PyObject *args) 13197{ 13198 PyObject *subobj; 13199 PyObject *substring; 13200 Py_ssize_t start = 0; 13201 Py_ssize_t end = PY_SSIZE_T_MAX; 13202 int result; 13203 13204 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end)) 13205 return NULL; 13206 if (PyTuple_Check(subobj)) { 13207 Py_ssize_t i; 13208 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) { 13209 
substring = PyUnicode_FromObject(PyTuple_GET_ITEM(subobj, i)); 13210 if (substring == NULL) 13211 return NULL; 13212 result = tailmatch(self, substring, start, end, -1); 13213 Py_DECREF(substring); 13214 if (result == -1) 13215 return NULL; 13216 if (result) { 13217 Py_RETURN_TRUE; 13218 } 13219 } 13220 /* nothing matched */ 13221 Py_RETURN_FALSE; 13222 } 13223 substring = PyUnicode_FromObject(subobj); 13224 if (substring == NULL) { 13225 if (PyErr_ExceptionMatches(PyExc_TypeError)) 13226 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or " 13227 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name); 13228 return NULL; 13229 } 13230 result = tailmatch(self, substring, start, end, -1); 13231 Py_DECREF(substring); 13232 if (result == -1) 13233 return NULL; 13234 return PyBool_FromLong(result); 13235} 13236 13237 13238PyDoc_STRVAR(endswith__doc__, 13239 "S.endswith(suffix[, start[, end]]) -> bool\n\ 13240\n\ 13241Return True if S ends with the specified suffix, False otherwise.\n\ 13242With optional start, test S beginning at that position.\n\ 13243With optional end, stop comparing S at that position.\n\ 13244suffix can also be a tuple of strings to try."); 13245 13246static PyObject * 13247unicode_endswith(PyObject *self, 13248 PyObject *args) 13249{ 13250 PyObject *subobj; 13251 PyObject *substring; 13252 Py_ssize_t start = 0; 13253 Py_ssize_t end = PY_SSIZE_T_MAX; 13254 int result; 13255 13256 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end)) 13257 return NULL; 13258 if (PyTuple_Check(subobj)) { 13259 Py_ssize_t i; 13260 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) { 13261 substring = PyUnicode_FromObject( 13262 PyTuple_GET_ITEM(subobj, i)); 13263 if (substring == NULL) 13264 return NULL; 13265 result = tailmatch(self, substring, start, end, +1); 13266 Py_DECREF(substring); 13267 if (result == -1) 13268 return NULL; 13269 if (result) { 13270 Py_RETURN_TRUE; 13271 } 13272 } 13273 Py_RETURN_FALSE; 13274 } 13275 substring = 
PyUnicode_FromObject(subobj); 13276 if (substring == NULL) { 13277 if (PyErr_ExceptionMatches(PyExc_TypeError)) 13278 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or " 13279 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name); 13280 return NULL; 13281 } 13282 result = tailmatch(self, substring, start, end, +1); 13283 Py_DECREF(substring); 13284 if (result == -1) 13285 return NULL; 13286 return PyBool_FromLong(result); 13287} 13288 13289Py_LOCAL_INLINE(void) 13290_PyUnicodeWriter_Update(_PyUnicodeWriter *writer) 13291{ 13292 if (!writer->readonly) 13293 writer->size = PyUnicode_GET_LENGTH(writer->buffer); 13294 else { 13295 /* Copy-on-write mode: set buffer size to 0 so 13296 * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on 13297 * next write. */ 13298 writer->size = 0; 13299 } 13300 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer); 13301 writer->data = PyUnicode_DATA(writer->buffer); 13302 writer->kind = PyUnicode_KIND(writer->buffer); 13303} 13304 13305void 13306_PyUnicodeWriter_Init(_PyUnicodeWriter *writer) 13307{ 13308 memset(writer, 0, sizeof(*writer)); 13309#ifdef Py_DEBUG 13310 writer->kind = 5; /* invalid kind */ 13311#endif 13312 writer->min_char = 127; 13313} 13314 13315int 13316_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer, 13317 Py_ssize_t length, Py_UCS4 maxchar) 13318{ 13319#ifdef MS_WINDOWS 13320 /* On Windows, overallocate by 50% is the best factor */ 13321# define OVERALLOCATE_FACTOR 2 13322#else 13323 /* On Linux, overallocate by 25% is the best factor */ 13324# define OVERALLOCATE_FACTOR 4 13325#endif 13326 Py_ssize_t newlen; 13327 PyObject *newbuffer; 13328 13329 /* ensure that the _PyUnicodeWriter_Prepare macro was used */ 13330 assert((maxchar > writer->maxchar && length >= 0) 13331 || length > 0); 13332 13333 if (length > PY_SSIZE_T_MAX - writer->pos) { 13334 PyErr_NoMemory(); 13335 return -1; 13336 } 13337 newlen = writer->pos + length; 13338 13339 maxchar = Py_MAX(maxchar, 
writer->min_char); 13340 13341 if (writer->buffer == NULL) { 13342 assert(!writer->readonly); 13343 if (writer->overallocate 13344 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) { 13345 /* overallocate to limit the number of realloc() */ 13346 newlen += newlen / OVERALLOCATE_FACTOR; 13347 } 13348 if (newlen < writer->min_length) 13349 newlen = writer->min_length; 13350 13351 writer->buffer = PyUnicode_New(newlen, maxchar); 13352 if (writer->buffer == NULL) 13353 return -1; 13354 } 13355 else if (newlen > writer->size) { 13356 if (writer->overallocate 13357 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) { 13358 /* overallocate to limit the number of realloc() */ 13359 newlen += newlen / OVERALLOCATE_FACTOR; 13360 } 13361 if (newlen < writer->min_length) 13362 newlen = writer->min_length; 13363 13364 if (maxchar > writer->maxchar || writer->readonly) { 13365 /* resize + widen */ 13366 newbuffer = PyUnicode_New(newlen, maxchar); 13367 if (newbuffer == NULL) 13368 return -1; 13369 _PyUnicode_FastCopyCharacters(newbuffer, 0, 13370 writer->buffer, 0, writer->pos); 13371 Py_DECREF(writer->buffer); 13372 writer->readonly = 0; 13373 } 13374 else { 13375 newbuffer = resize_compact(writer->buffer, newlen); 13376 if (newbuffer == NULL) 13377 return -1; 13378 } 13379 writer->buffer = newbuffer; 13380 } 13381 else if (maxchar > writer->maxchar) { 13382 assert(!writer->readonly); 13383 newbuffer = PyUnicode_New(writer->size, maxchar); 13384 if (newbuffer == NULL) 13385 return -1; 13386 _PyUnicode_FastCopyCharacters(newbuffer, 0, 13387 writer->buffer, 0, writer->pos); 13388 Py_DECREF(writer->buffer); 13389 writer->buffer = newbuffer; 13390 } 13391 _PyUnicodeWriter_Update(writer); 13392 return 0; 13393 13394#undef OVERALLOCATE_FACTOR 13395} 13396 13397int 13398_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer, 13399 enum PyUnicode_Kind kind) 13400{ 13401 Py_UCS4 maxchar; 13402 13403 /* ensure that the _PyUnicodeWriter_PrepareKind macro was 
used */ 13404 assert(writer->kind < kind); 13405 13406 switch (kind) 13407 { 13408 case PyUnicode_1BYTE_KIND: maxchar = 0xff; break; 13409 case PyUnicode_2BYTE_KIND: maxchar = 0xffff; break; 13410 case PyUnicode_4BYTE_KIND: maxchar = 0x10ffff; break; 13411 default: 13412 assert(0 && "invalid kind"); 13413 return -1; 13414 } 13415 13416 return _PyUnicodeWriter_PrepareInternal(writer, 0, maxchar); 13417} 13418 13419Py_LOCAL_INLINE(int) 13420_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch) 13421{ 13422 if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0) 13423 return -1; 13424 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch); 13425 writer->pos++; 13426 return 0; 13427} 13428 13429int 13430_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch) 13431{ 13432 return _PyUnicodeWriter_WriteCharInline(writer, ch); 13433} 13434 13435int 13436_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str) 13437{ 13438 Py_UCS4 maxchar; 13439 Py_ssize_t len; 13440 13441 if (PyUnicode_READY(str) == -1) 13442 return -1; 13443 len = PyUnicode_GET_LENGTH(str); 13444 if (len == 0) 13445 return 0; 13446 maxchar = PyUnicode_MAX_CHAR_VALUE(str); 13447 if (maxchar > writer->maxchar || len > writer->size - writer->pos) { 13448 if (writer->buffer == NULL && !writer->overallocate) { 13449 assert(_PyUnicode_CheckConsistency(str, 1)); 13450 writer->readonly = 1; 13451 Py_INCREF(str); 13452 writer->buffer = str; 13453 _PyUnicodeWriter_Update(writer); 13454 writer->pos += len; 13455 return 0; 13456 } 13457 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1) 13458 return -1; 13459 } 13460 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos, 13461 str, 0, len); 13462 writer->pos += len; 13463 return 0; 13464} 13465 13466int 13467_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str, 13468 Py_ssize_t start, Py_ssize_t end) 13469{ 13470 Py_UCS4 maxchar; 13471 Py_ssize_t len; 13472 13473 if (PyUnicode_READY(str) == -1) 
13474 return -1; 13475 13476 assert(0 <= start); 13477 assert(end <= PyUnicode_GET_LENGTH(str)); 13478 assert(start <= end); 13479 13480 if (end == 0) 13481 return 0; 13482 13483 if (start == 0 && end == PyUnicode_GET_LENGTH(str)) 13484 return _PyUnicodeWriter_WriteStr(writer, str); 13485 13486 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar) 13487 maxchar = _PyUnicode_FindMaxChar(str, start, end); 13488 else 13489 maxchar = writer->maxchar; 13490 len = end - start; 13491 13492 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0) 13493 return -1; 13494 13495 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos, 13496 str, start, len); 13497 writer->pos += len; 13498 return 0; 13499} 13500 13501int 13502_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer, 13503 const char *ascii, Py_ssize_t len) 13504{ 13505 if (len == -1) 13506 len = strlen(ascii); 13507 13508 assert(ucs1lib_find_max_char((Py_UCS1*)ascii, (Py_UCS1*)ascii + len) < 128); 13509 13510 if (writer->buffer == NULL && !writer->overallocate) { 13511 PyObject *str; 13512 13513 str = _PyUnicode_FromASCII(ascii, len); 13514 if (str == NULL) 13515 return -1; 13516 13517 writer->readonly = 1; 13518 writer->buffer = str; 13519 _PyUnicodeWriter_Update(writer); 13520 writer->pos += len; 13521 return 0; 13522 } 13523 13524 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1) 13525 return -1; 13526 13527 switch (writer->kind) 13528 { 13529 case PyUnicode_1BYTE_KIND: 13530 { 13531 const Py_UCS1 *str = (const Py_UCS1 *)ascii; 13532 Py_UCS1 *data = writer->data; 13533 13534 Py_MEMCPY(data + writer->pos, str, len); 13535 break; 13536 } 13537 case PyUnicode_2BYTE_KIND: 13538 { 13539 _PyUnicode_CONVERT_BYTES( 13540 Py_UCS1, Py_UCS2, 13541 ascii, ascii + len, 13542 (Py_UCS2 *)writer->data + writer->pos); 13543 break; 13544 } 13545 case PyUnicode_4BYTE_KIND: 13546 { 13547 _PyUnicode_CONVERT_BYTES( 13548 Py_UCS1, Py_UCS4, 13549 ascii, ascii + len, 13550 (Py_UCS4 *)writer->data + writer->pos); 13551 break; 
13552 } 13553 default: 13554 assert(0); 13555 } 13556 13557 writer->pos += len; 13558 return 0; 13559} 13560 13561int 13562_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer, 13563 const char *str, Py_ssize_t len) 13564{ 13565 Py_UCS4 maxchar; 13566 13567 maxchar = ucs1lib_find_max_char((Py_UCS1*)str, (Py_UCS1*)str + len); 13568 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1) 13569 return -1; 13570 unicode_write_cstr(writer->buffer, writer->pos, str, len); 13571 writer->pos += len; 13572 return 0; 13573} 13574 13575PyObject * 13576_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer) 13577{ 13578 PyObject *str; 13579 if (writer->pos == 0) { 13580 Py_CLEAR(writer->buffer); 13581 _Py_RETURN_UNICODE_EMPTY(); 13582 } 13583 if (writer->readonly) { 13584 str = writer->buffer; 13585 writer->buffer = NULL; 13586 assert(PyUnicode_GET_LENGTH(str) == writer->pos); 13587 return str; 13588 } 13589 if (PyUnicode_GET_LENGTH(writer->buffer) != writer->pos) { 13590 PyObject *newbuffer; 13591 newbuffer = resize_compact(writer->buffer, writer->pos); 13592 if (newbuffer == NULL) { 13593 Py_CLEAR(writer->buffer); 13594 return NULL; 13595 } 13596 writer->buffer = newbuffer; 13597 } 13598 str = writer->buffer; 13599 writer->buffer = NULL; 13600 assert(_PyUnicode_CheckConsistency(str, 1)); 13601 return unicode_result_ready(str); 13602} 13603 13604void 13605_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer) 13606{ 13607 Py_CLEAR(writer->buffer); 13608} 13609 13610#include "stringlib/unicode_format.h" 13611 13612PyDoc_STRVAR(format__doc__, 13613 "S.format(*args, **kwargs) -> str\n\ 13614\n\ 13615Return a formatted version of S, using substitutions from args and kwargs.\n\ 13616The substitutions are identified by braces ('{' and '}')."); 13617 13618PyDoc_STRVAR(format_map__doc__, 13619 "S.format_map(mapping) -> str\n\ 13620\n\ 13621Return a formatted version of S, using substitutions from mapping.\n\ 13622The substitutions are identified by braces ('{' and '}')."); 13623 
13624static PyObject * 13625unicode__format__(PyObject* self, PyObject* args) 13626{ 13627 PyObject *format_spec; 13628 _PyUnicodeWriter writer; 13629 int ret; 13630 13631 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec)) 13632 return NULL; 13633 13634 if (PyUnicode_READY(self) == -1) 13635 return NULL; 13636 _PyUnicodeWriter_Init(&writer); 13637 ret = _PyUnicode_FormatAdvancedWriter(&writer, 13638 self, format_spec, 0, 13639 PyUnicode_GET_LENGTH(format_spec)); 13640 if (ret == -1) { 13641 _PyUnicodeWriter_Dealloc(&writer); 13642 return NULL; 13643 } 13644 return _PyUnicodeWriter_Finish(&writer); 13645} 13646 13647PyDoc_STRVAR(p_format__doc__, 13648 "S.__format__(format_spec) -> str\n\ 13649\n\ 13650Return a formatted version of S as described by format_spec."); 13651 13652static PyObject * 13653unicode__sizeof__(PyObject *v) 13654{ 13655 Py_ssize_t size; 13656 13657 /* If it's a compact object, account for base structure + 13658 character data. */ 13659 if (PyUnicode_IS_COMPACT_ASCII(v)) 13660 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1; 13661 else if (PyUnicode_IS_COMPACT(v)) 13662 size = sizeof(PyCompactUnicodeObject) + 13663 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_KIND(v); 13664 else { 13665 /* If it is a two-block object, account for base object, and 13666 for character block if present. */ 13667 size = sizeof(PyUnicodeObject); 13668 if (_PyUnicode_DATA_ANY(v)) 13669 size += (PyUnicode_GET_LENGTH(v) + 1) * 13670 PyUnicode_KIND(v); 13671 } 13672 /* If the wstr pointer is present, account for it unless it is shared 13673 with the data pointer. Check if the data is not shared. 
*/ 13674 if (_PyUnicode_HAS_WSTR_MEMORY(v)) 13675 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t); 13676 if (_PyUnicode_HAS_UTF8_MEMORY(v)) 13677 size += PyUnicode_UTF8_LENGTH(v) + 1; 13678 13679 return PyLong_FromSsize_t(size); 13680} 13681 13682PyDoc_STRVAR(sizeof__doc__, 13683 "S.__sizeof__() -> size of S in memory, in bytes"); 13684 13685static PyObject * 13686unicode_getnewargs(PyObject *v) 13687{ 13688 PyObject *copy = _PyUnicode_Copy(v); 13689 if (!copy) 13690 return NULL; 13691 return Py_BuildValue("(N)", copy); 13692} 13693 13694static PyMethodDef unicode_methods[] = { 13695 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__}, 13696 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__}, 13697 {"split", (PyCFunction) unicode_split, METH_VARARGS | METH_KEYWORDS, split__doc__}, 13698 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS | METH_KEYWORDS, rsplit__doc__}, 13699 {"join", (PyCFunction) unicode_join, METH_O, join__doc__}, 13700 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__}, 13701 {"casefold", (PyCFunction) unicode_casefold, METH_NOARGS, casefold__doc__}, 13702 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__}, 13703 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__}, 13704 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__}, 13705 {"expandtabs", (PyCFunction) unicode_expandtabs, 13706 METH_VARARGS | METH_KEYWORDS, expandtabs__doc__}, 13707 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__}, 13708 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__}, 13709 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__}, 13710 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__}, 13711 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__}, 13712 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__}, 13713 {"rfind", (PyCFunction) 
unicode_rfind, METH_VARARGS, rfind__doc__}, 13714 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__}, 13715 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__}, 13716 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__}, 13717 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__}, 13718 {"splitlines", (PyCFunction) unicode_splitlines, 13719 METH_VARARGS | METH_KEYWORDS, splitlines__doc__}, 13720 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__}, 13721 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__}, 13722 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__}, 13723 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__}, 13724 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__}, 13725 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__}, 13726 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__}, 13727 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__}, 13728 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__}, 13729 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__}, 13730 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__}, 13731 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__}, 13732 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__}, 13733 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__}, 13734 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__}, 13735 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__}, 13736 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__}, 13737 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__}, 13738 {"format", (PyCFunction) do_string_format, METH_VARARGS | 
METH_KEYWORDS, format__doc__}, 13739 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__}, 13740 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__}, 13741 UNICODE_MAKETRANS_METHODDEF 13742 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__}, 13743#if 0 13744 /* These methods are just used for debugging the implementation. */ 13745 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS}, 13746#endif 13747 13748 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS}, 13749 {NULL, NULL} 13750}; 13751 13752static PyObject * 13753unicode_mod(PyObject *v, PyObject *w) 13754{ 13755 if (!PyUnicode_Check(v)) 13756 Py_RETURN_NOTIMPLEMENTED; 13757 return PyUnicode_Format(v, w); 13758} 13759 13760static PyNumberMethods unicode_as_number = { 13761 0, /*nb_add*/ 13762 0, /*nb_subtract*/ 13763 0, /*nb_multiply*/ 13764 unicode_mod, /*nb_remainder*/ 13765}; 13766 13767static PySequenceMethods unicode_as_sequence = { 13768 (lenfunc) unicode_length, /* sq_length */ 13769 PyUnicode_Concat, /* sq_concat */ 13770 (ssizeargfunc) unicode_repeat, /* sq_repeat */ 13771 (ssizeargfunc) unicode_getitem, /* sq_item */ 13772 0, /* sq_slice */ 13773 0, /* sq_ass_item */ 13774 0, /* sq_ass_slice */ 13775 PyUnicode_Contains, /* sq_contains */ 13776}; 13777 13778static PyObject* 13779unicode_subscript(PyObject* self, PyObject* item) 13780{ 13781 if (PyUnicode_READY(self) == -1) 13782 return NULL; 13783 13784 if (PyIndex_Check(item)) { 13785 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError); 13786 if (i == -1 && PyErr_Occurred()) 13787 return NULL; 13788 if (i < 0) 13789 i += PyUnicode_GET_LENGTH(self); 13790 return unicode_getitem(self, i); 13791 } else if (PySlice_Check(item)) { 13792 Py_ssize_t start, stop, step, slicelength, cur, i; 13793 PyObject *result; 13794 void *src_data, *dest_data; 13795 int src_kind, dest_kind; 13796 Py_UCS4 ch, max_char, kind_limit; 13797 13798 if 
(PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self), 13799 &start, &stop, &step, &slicelength) < 0) { 13800 return NULL; 13801 } 13802 13803 if (slicelength <= 0) { 13804 _Py_RETURN_UNICODE_EMPTY(); 13805 } else if (start == 0 && step == 1 && 13806 slicelength == PyUnicode_GET_LENGTH(self)) { 13807 return unicode_result_unchanged(self); 13808 } else if (step == 1) { 13809 return PyUnicode_Substring(self, 13810 start, start + slicelength); 13811 } 13812 /* General case */ 13813 src_kind = PyUnicode_KIND(self); 13814 src_data = PyUnicode_DATA(self); 13815 if (!PyUnicode_IS_ASCII(self)) { 13816 kind_limit = kind_maxchar_limit(src_kind); 13817 max_char = 0; 13818 for (cur = start, i = 0; i < slicelength; cur += step, i++) { 13819 ch = PyUnicode_READ(src_kind, src_data, cur); 13820 if (ch > max_char) { 13821 max_char = ch; 13822 if (max_char >= kind_limit) 13823 break; 13824 } 13825 } 13826 } 13827 else 13828 max_char = 127; 13829 result = PyUnicode_New(slicelength, max_char); 13830 if (result == NULL) 13831 return NULL; 13832 dest_kind = PyUnicode_KIND(result); 13833 dest_data = PyUnicode_DATA(result); 13834 13835 for (cur = start, i = 0; i < slicelength; cur += step, i++) { 13836 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur); 13837 PyUnicode_WRITE(dest_kind, dest_data, i, ch); 13838 } 13839 assert(_PyUnicode_CheckConsistency(result, 1)); 13840 return result; 13841 } else { 13842 PyErr_SetString(PyExc_TypeError, "string indices must be integers"); 13843 return NULL; 13844 } 13845} 13846 13847static PyMappingMethods unicode_as_mapping = { 13848 (lenfunc)unicode_length, /* mp_length */ 13849 (binaryfunc)unicode_subscript, /* mp_subscript */ 13850 (objobjargproc)0, /* mp_ass_subscript */ 13851}; 13852 13853 13854/* Helpers for PyUnicode_Format() */ 13855 13856struct unicode_formatter_t { 13857 PyObject *args; 13858 int args_owned; 13859 Py_ssize_t arglen, argidx; 13860 PyObject *dict; 13861 13862 enum PyUnicode_Kind fmtkind; 13863 Py_ssize_t fmtcnt, fmtpos; 
13864 void *fmtdata; 13865 PyObject *fmtstr; 13866 13867 _PyUnicodeWriter writer; 13868}; 13869 13870struct unicode_format_arg_t { 13871 Py_UCS4 ch; 13872 int flags; 13873 Py_ssize_t width; 13874 int prec; 13875 int sign; 13876}; 13877 13878static PyObject * 13879unicode_format_getnextarg(struct unicode_formatter_t *ctx) 13880{ 13881 Py_ssize_t argidx = ctx->argidx; 13882 13883 if (argidx < ctx->arglen) { 13884 ctx->argidx++; 13885 if (ctx->arglen < 0) 13886 return ctx->args; 13887 else 13888 return PyTuple_GetItem(ctx->args, argidx); 13889 } 13890 PyErr_SetString(PyExc_TypeError, 13891 "not enough arguments for format string"); 13892 return NULL; 13893} 13894 13895/* Returns a new reference to a PyUnicode object, or NULL on failure. */ 13896 13897/* Format a float into the writer if the writer is not NULL, or into *p_output 13898 otherwise. 13899 13900 Return 0 on success, raise an exception and return -1 on error. */ 13901static int 13902formatfloat(PyObject *v, struct unicode_format_arg_t *arg, 13903 PyObject **p_output, 13904 _PyUnicodeWriter *writer) 13905{ 13906 char *p; 13907 double x; 13908 Py_ssize_t len; 13909 int prec; 13910 int dtoa_flags; 13911 13912 x = PyFloat_AsDouble(v); 13913 if (x == -1.0 && PyErr_Occurred()) 13914 return -1; 13915 13916 prec = arg->prec; 13917 if (prec < 0) 13918 prec = 6; 13919 13920 if (arg->flags & F_ALT) 13921 dtoa_flags = Py_DTSF_ALT; 13922 else 13923 dtoa_flags = 0; 13924 p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL); 13925 if (p == NULL) 13926 return -1; 13927 len = strlen(p); 13928 if (writer) { 13929 if (_PyUnicodeWriter_WriteASCIIString(writer, p, len) < 0) { 13930 PyMem_Free(p); 13931 return -1; 13932 } 13933 } 13934 else 13935 *p_output = _PyUnicode_FromASCII(p, len); 13936 PyMem_Free(p); 13937 return 0; 13938} 13939 13940/* formatlong() emulates the format codes d, u, o, x and X, and 13941 * the F_ALT flag, for Python's long (unbounded) ints. It's not used for 13942 * Python's regular ints. 
13943 * Return value: a new PyUnicodeObject*, or NULL if error. 13944 * The output string is of the form 13945 * "-"? ("0x" | "0X")? digit+ 13946 * "0x"/"0X" are present only for x and X conversions, with F_ALT 13947 * set in flags. The case of hex digits will be correct, 13948 * There will be at least prec digits, zero-filled on the left if 13949 * necessary to get that many. 13950 * val object to be converted 13951 * flags bitmask of format flags; only F_ALT is looked at 13952 * prec minimum number of digits; 0-fill on left if needed 13953 * type a character in [duoxX]; u acts the same as d 13954 * 13955 * CAUTION: o, x and X conversions on regular ints can never 13956 * produce a '-' sign, but can for Python's unbounded ints. 13957 */ 13958PyObject * 13959_PyUnicode_FormatLong(PyObject *val, int alt, int prec, int type) 13960{ 13961 PyObject *result = NULL; 13962 char *buf; 13963 Py_ssize_t i; 13964 int sign; /* 1 if '-', else 0 */ 13965 int len; /* number of characters */ 13966 Py_ssize_t llen; 13967 int numdigits; /* len == numnondigits + numdigits */ 13968 int numnondigits = 0; 13969 13970 /* Avoid exceeding SSIZE_T_MAX */ 13971 if (prec > INT_MAX-3) { 13972 PyErr_SetString(PyExc_OverflowError, 13973 "precision too large"); 13974 return NULL; 13975 } 13976 13977 assert(PyLong_Check(val)); 13978 13979 switch (type) { 13980 default: 13981 assert(!"'type' not in [diuoxX]"); 13982 case 'd': 13983 case 'i': 13984 case 'u': 13985 /* int and int subclasses should print numerically when a numeric */ 13986 /* format code is used (see issue18780) */ 13987 result = PyNumber_ToBase(val, 10); 13988 break; 13989 case 'o': 13990 numnondigits = 2; 13991 result = PyNumber_ToBase(val, 8); 13992 break; 13993 case 'x': 13994 case 'X': 13995 numnondigits = 2; 13996 result = PyNumber_ToBase(val, 16); 13997 break; 13998 } 13999 if (!result) 14000 return NULL; 14001 14002 assert(unicode_modifiable(result)); 14003 assert(PyUnicode_IS_READY(result)); 14004 
assert(PyUnicode_IS_ASCII(result)); 14005 14006 /* To modify the string in-place, there can only be one reference. */ 14007 if (Py_REFCNT(result) != 1) { 14008 Py_DECREF(result); 14009 PyErr_BadInternalCall(); 14010 return NULL; 14011 } 14012 buf = PyUnicode_DATA(result); 14013 llen = PyUnicode_GET_LENGTH(result); 14014 if (llen > INT_MAX) { 14015 Py_DECREF(result); 14016 PyErr_SetString(PyExc_ValueError, 14017 "string too large in _PyUnicode_FormatLong"); 14018 return NULL; 14019 } 14020 len = (int)llen; 14021 sign = buf[0] == '-'; 14022 numnondigits += sign; 14023 numdigits = len - numnondigits; 14024 assert(numdigits > 0); 14025 14026 /* Get rid of base marker unless F_ALT */ 14027 if (((alt) == 0 && 14028 (type == 'o' || type == 'x' || type == 'X'))) { 14029 assert(buf[sign] == '0'); 14030 assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' || 14031 buf[sign+1] == 'o'); 14032 numnondigits -= 2; 14033 buf += 2; 14034 len -= 2; 14035 if (sign) 14036 buf[0] = '-'; 14037 assert(len == numnondigits + numdigits); 14038 assert(numdigits > 0); 14039 } 14040 14041 /* Fill with leading zeroes to meet minimum width. */ 14042 if (prec > numdigits) { 14043 PyObject *r1 = PyBytes_FromStringAndSize(NULL, 14044 numnondigits + prec); 14045 char *b1; 14046 if (!r1) { 14047 Py_DECREF(result); 14048 return NULL; 14049 } 14050 b1 = PyBytes_AS_STRING(r1); 14051 for (i = 0; i < numnondigits; ++i) 14052 *b1++ = *buf++; 14053 for (i = 0; i < prec - numdigits; i++) 14054 *b1++ = '0'; 14055 for (i = 0; i < numdigits; i++) 14056 *b1++ = *buf++; 14057 *b1 = '\0'; 14058 Py_DECREF(result); 14059 result = r1; 14060 buf = PyBytes_AS_STRING(result); 14061 len = numnondigits + prec; 14062 } 14063 14064 /* Fix up case for hex conversions. */ 14065 if (type == 'X') { 14066 /* Need to convert all lower case letters to upper case. 14067 and need to convert 0x to 0X (and -0x to -0X). 
*/ 14068 for (i = 0; i < len; i++) 14069 if (buf[i] >= 'a' && buf[i] <= 'x') 14070 buf[i] -= 'a'-'A'; 14071 } 14072 if (!PyUnicode_Check(result) 14073 || buf != PyUnicode_DATA(result)) { 14074 PyObject *unicode; 14075 unicode = _PyUnicode_FromASCII(buf, len); 14076 Py_DECREF(result); 14077 result = unicode; 14078 } 14079 else if (len != PyUnicode_GET_LENGTH(result)) { 14080 if (PyUnicode_Resize(&result, len) < 0) 14081 Py_CLEAR(result); 14082 } 14083 return result; 14084} 14085 14086/* Format an integer or a float as an integer. 14087 * Return 1 if the number has been formatted into the writer, 14088 * 0 if the number has been formatted into *p_output 14089 * -1 and raise an exception on error */ 14090static int 14091mainformatlong(PyObject *v, 14092 struct unicode_format_arg_t *arg, 14093 PyObject **p_output, 14094 _PyUnicodeWriter *writer) 14095{ 14096 PyObject *iobj, *res; 14097 char type = (char)arg->ch; 14098 14099 if (!PyNumber_Check(v)) 14100 goto wrongtype; 14101 14102 /* make sure number is a type of integer for o, x, and X */ 14103 if (!PyLong_Check(v)) { 14104 if (type == 'o' || type == 'x' || type == 'X') { 14105 iobj = PyNumber_Index(v); 14106 if (iobj == NULL) { 14107 if (PyErr_ExceptionMatches(PyExc_TypeError)) 14108 goto wrongtype; 14109 return -1; 14110 } 14111 } 14112 else { 14113 iobj = PyNumber_Long(v); 14114 if (iobj == NULL ) { 14115 if (PyErr_ExceptionMatches(PyExc_TypeError)) 14116 goto wrongtype; 14117 return -1; 14118 } 14119 } 14120 assert(PyLong_Check(iobj)); 14121 } 14122 else { 14123 iobj = v; 14124 Py_INCREF(iobj); 14125 } 14126 14127 if (PyLong_CheckExact(v) 14128 && arg->width == -1 && arg->prec == -1 14129 && !(arg->flags & (F_SIGN | F_BLANK)) 14130 && type != 'X') 14131 { 14132 /* Fast path */ 14133 int alternate = arg->flags & F_ALT; 14134 int base; 14135 14136 switch(type) 14137 { 14138 default: 14139 assert(0 && "'type' not in [diuoxX]"); 14140 case 'd': 14141 case 'i': 14142 case 'u': 14143 base = 10; 14144 break; 14145 case 
'o': 14146 base = 8; 14147 break; 14148 case 'x': 14149 case 'X': 14150 base = 16; 14151 break; 14152 } 14153 14154 if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) { 14155 Py_DECREF(iobj); 14156 return -1; 14157 } 14158 Py_DECREF(iobj); 14159 return 1; 14160 } 14161 14162 res = _PyUnicode_FormatLong(iobj, arg->flags & F_ALT, arg->prec, type); 14163 Py_DECREF(iobj); 14164 if (res == NULL) 14165 return -1; 14166 *p_output = res; 14167 return 0; 14168 14169wrongtype: 14170 switch(type) 14171 { 14172 case 'o': 14173 case 'x': 14174 case 'X': 14175 PyErr_Format(PyExc_TypeError, 14176 "%%%c format: an integer is required, " 14177 "not %.200s", 14178 type, Py_TYPE(v)->tp_name); 14179 break; 14180 default: 14181 PyErr_Format(PyExc_TypeError, 14182 "%%%c format: a number is required, " 14183 "not %.200s", 14184 type, Py_TYPE(v)->tp_name); 14185 break; 14186 } 14187 return -1; 14188} 14189 14190static Py_UCS4 14191formatchar(PyObject *v) 14192{ 14193 /* presume that the buffer is at least 3 characters long */ 14194 if (PyUnicode_Check(v)) { 14195 if (PyUnicode_GET_LENGTH(v) == 1) { 14196 return PyUnicode_READ_CHAR(v, 0); 14197 } 14198 goto onError; 14199 } 14200 else { 14201 PyObject *iobj; 14202 long x; 14203 /* make sure number is a type of integer */ 14204 if (!PyLong_Check(v)) { 14205 iobj = PyNumber_Index(v); 14206 if (iobj == NULL) { 14207 goto onError; 14208 } 14209 v = iobj; 14210 Py_DECREF(iobj); 14211 } 14212 /* Integer input truncated to a character */ 14213 x = PyLong_AsLong(v); 14214 if (x == -1 && PyErr_Occurred()) 14215 goto onError; 14216 14217 if (x < 0 || x > MAX_UNICODE) { 14218 PyErr_SetString(PyExc_OverflowError, 14219 "%c arg not in range(0x110000)"); 14220 return (Py_UCS4) -1; 14221 } 14222 14223 return (Py_UCS4) x; 14224 } 14225 14226 onError: 14227 PyErr_SetString(PyExc_TypeError, 14228 "%c requires int or char"); 14229 return (Py_UCS4) -1; 14230} 14231 14232/* Parse options of an argument: flags, width, precision. 
   Handle also "%(name)" syntax.

   On entry arg->ch is the first character following the '%'.  On success
   arg->flags, arg->width and arg->prec describe the parsed options and
   arg->ch is the conversion character that ended the specification.
   ctx->fmtcnt and ctx->fmtpos are advanced past everything consumed.

   Return 0 on success.
   Raise an exception and return -1 on error. */
static int
unicode_format_arg_parse(struct unicode_formatter_t *ctx,
                         struct unicode_format_arg_t *arg)
{
#define FORMAT_READ(ctx) \
        PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)

    PyObject *v;

    if (arg->ch == '(') {
        /* Get argument value from a dictionary. Example: "%(name)s". */
        Py_ssize_t keystart;
        Py_ssize_t keylen;
        PyObject *key;
        int pcount = 1;

        if (ctx->dict == NULL) {
            PyErr_SetString(PyExc_TypeError,
                            "format requires a mapping");
            return -1;
        }
        /* Step over the opening parenthesis. */
        ++ctx->fmtpos;
        --ctx->fmtcnt;
        keystart = ctx->fmtpos;
        /* Skip over balanced parentheses */
        while (pcount > 0 && --ctx->fmtcnt >= 0) {
            arg->ch = FORMAT_READ(ctx);
            if (arg->ch == ')')
                --pcount;
            else if (arg->ch == '(')
                ++pcount;
            ctx->fmtpos++;
        }
        /* fmtpos is now one past the closing ')', hence the "- 1". */
        keylen = ctx->fmtpos - keystart - 1;
        if (ctx->fmtcnt < 0 || pcount > 0) {
            PyErr_SetString(PyExc_ValueError,
                            "incomplete format key");
            return -1;
        }
        key = PyUnicode_Substring(ctx->fmtstr,
                                  keystart, keystart + keylen);
        if (key == NULL)
            return -1;
        /* Drop the value looked up for a previous "%(name)" item, if any. */
        if (ctx->args_owned) {
            Py_DECREF(ctx->args);
            ctx->args_owned = 0;
        }
        ctx->args = PyObject_GetItem(ctx->dict, key);
        Py_DECREF(key);
        if (ctx->args == NULL)
            return -1;
        /* ctx->args is now a reference owned by the formatter; arglen and
           argidx get the same values PyUnicode_Format() uses for a
           non-tuple argument. */
        ctx->args_owned = 1;
        ctx->arglen = -1;
        ctx->argidx = -2;
    }

    /* Parse flags. Example: "%+i" => flags=F_SIGN. */
    while (--ctx->fmtcnt >= 0) {
        arg->ch = FORMAT_READ(ctx);
        ctx->fmtpos++;
        switch (arg->ch) {
        case '-': arg->flags |= F_LJUST; continue;
        case '+': arg->flags |= F_SIGN; continue;
        case ' ': arg->flags |= F_BLANK; continue;
        case '#': arg->flags |= F_ALT; continue;
        case '0': arg->flags |= F_ZERO; continue;
        }
        /* Not a flag character: leave it in arg->ch for the next stage. */
        break;
    }

    /* Parse width. Example: "%10s" => width=10 */
    if (arg->ch == '*') {
        /* The width is taken from the next argument. */
        v = unicode_format_getnextarg(ctx);
        if (v == NULL)
            return -1;
        if (!PyLong_Check(v)) {
            PyErr_SetString(PyExc_TypeError,
                            "* wants int");
            return -1;
        }
        arg->width = PyLong_AsSsize_t(v);
        if (arg->width == -1 && PyErr_Occurred())
            return -1;
        /* A negative width requests left justification. */
        if (arg->width < 0) {
            arg->flags |= F_LJUST;
            arg->width = -arg->width;
        }
        if (--ctx->fmtcnt >= 0) {
            arg->ch = FORMAT_READ(ctx);
            ctx->fmtpos++;
        }
    }
    else if (arg->ch >= '0' && arg->ch <= '9') {
        arg->width = arg->ch - '0';
        while (--ctx->fmtcnt >= 0) {
            arg->ch = FORMAT_READ(ctx);
            ctx->fmtpos++;
            if (arg->ch < '0' || arg->ch > '9')
                break;
            /* Since arg->ch is unsigned, the RHS would end up as unsigned,
               mixing signed and unsigned comparison. Since arg->ch is between
               '0' and '9', casting to int is safe. */
            if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
                PyErr_SetString(PyExc_ValueError,
                                "width too big");
                return -1;
            }
            arg->width = arg->width*10 + (arg->ch - '0');
        }
    }

    /* Parse precision. Example: "%.3f" => prec=3 */
    if (arg->ch == '.') {
        /* A bare "." means precision 0. */
        arg->prec = 0;
        if (--ctx->fmtcnt >= 0) {
            arg->ch = FORMAT_READ(ctx);
            ctx->fmtpos++;
        }
        if (arg->ch == '*') {
            /* The precision is taken from the next argument. */
            v = unicode_format_getnextarg(ctx);
            if (v == NULL)
                return -1;
            if (!PyLong_Check(v)) {
                PyErr_SetString(PyExc_TypeError,
                                "* wants int");
                return -1;
            }
            arg->prec = _PyLong_AsInt(v);
            if (arg->prec == -1 && PyErr_Occurred())
                return -1;
            /* A negative precision is treated as 0. */
            if (arg->prec < 0)
                arg->prec = 0;
            if (--ctx->fmtcnt >= 0) {
                arg->ch = FORMAT_READ(ctx);
                ctx->fmtpos++;
            }
        }
        else if (arg->ch >= '0' && arg->ch <= '9') {
            arg->prec = arg->ch - '0';
            while (--ctx->fmtcnt >= 0) {
                arg->ch = FORMAT_READ(ctx);
                ctx->fmtpos++;
                if (arg->ch < '0' || arg->ch > '9')
                    break;
                /* Same overflow guard as for the width, against INT_MAX. */
                if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
                    PyErr_SetString(PyExc_ValueError,
                                    "precision too big");
                    return -1;
                }
                arg->prec = arg->prec*10 + (arg->ch - '0');
            }
        }
    }

    /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
    if (ctx->fmtcnt >= 0) {
        if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
            if (--ctx->fmtcnt >= 0) {
                arg->ch = FORMAT_READ(ctx);
                ctx->fmtpos++;
            }
        }
    }
    /* A negative count means the format string ended before a conversion
       character was read. */
    if (ctx->fmtcnt < 0) {
        PyErr_SetString(PyExc_ValueError,
                        "incomplete format");
        return -1;
    }
    return 0;

#undef FORMAT_READ
}

/* Format one argument. Supported conversion specifiers:

   - "s", "r", "a": any type
   - "i", "d", "u": int or float
   - "o", "x", "X": int
   - "e", "E", "f", "F", "g", "G": float
   - "c": int or str (1 character)

   When possible, the output is written directly into the Unicode writer
   (ctx->writer). A string is created when padding is required.
   Return 0 if the argument has been formatted into *p_str,
          1 if the argument has been written into ctx->writer,
         -1 on error. */
static int
unicode_format_arg_format(struct unicode_formatter_t *ctx,
                          struct unicode_format_arg_t *arg,
                          PyObject **p_str)
{
    PyObject *v;
    _PyUnicodeWriter *writer = &ctx->writer;

    /* Nothing follows this item in the format string: stop overallocating
       the writer buffer. */
    if (ctx->fmtcnt == 0)
        ctx->writer.overallocate = 0;

    /* "%%" writes a literal percent sign and consumes no argument. */
    if (arg->ch == '%') {
        if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
            return -1;
        return 1;
    }

    v = unicode_format_getnextarg(ctx);
    if (v == NULL)
        return -1;


    switch (arg->ch) {
    case 's':
    case 'r':
    case 'a':
        if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
            /* Fast path */
            if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
                return -1;
            return 1;
        }

        if (PyUnicode_CheckExact(v) && arg->ch == 's') {
            /* An exact str formatted with %s is used as is. */
            *p_str = v;
            Py_INCREF(*p_str);
        }
        else {
            if (arg->ch == 's')
                *p_str = PyObject_Str(v);
            else if (arg->ch == 'r')
                *p_str = PyObject_Repr(v);
            else
                *p_str = PyObject_ASCII(v);
        }
        break;

    case 'i':
    case 'd':
    case 'u':
    case 'o':
    case 'x':
    case 'X':
    {
        /* mainformatlong() returns 1 (already written) or -1 (error), which
           are passed through; 0 means *p_str holds the digits. */
        int ret = mainformatlong(v, arg, p_str, writer);
        if (ret != 0)
            return ret;
        arg->sign = 1;
        break;
    }

    case 'e':
    case 'E':
    case 'f':
    case 'F':
    case 'g':
    case 'G':
        if (arg->width == -1 && arg->prec == -1
            && !(arg->flags & (F_SIGN | F_BLANK)))
        {
            /* Fast path */
            if (formatfloat(v, arg, NULL, writer) == -1)
                return -1;
            return 1;
        }

        arg->sign = 1;
        if (formatfloat(v, arg, p_str, NULL) == -1)
            return -1;
        break;

    case 'c':
    {
        /* formatchar() reports errors with (Py_UCS4)-1. */
        Py_UCS4 ch = formatchar(v);
        if (ch == (Py_UCS4) -1)
            return -1;
        if (arg->width == -1 && arg->prec == -1) {
            /* Fast path */
            if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
                return -1;
            return 1;
        }
        *p_str = PyUnicode_FromOrdinal(ch);
        break;
    }

    default:
        /* ctx->fmtpos is one past the offending character, hence "- 1". */
        PyErr_Format(PyExc_ValueError,
                     "unsupported format character '%c' (0x%x) "
                     "at index %zd",
                     (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
                     (int)arg->ch,
                     ctx->fmtpos - 1);
        return -1;
    }
    /* The conversions above leave NULL in *p_str when they fail. */
    if (*p_str == NULL)
        return -1;
    assert (PyUnicode_Check(*p_str));
    return 0;
}

/* Write the already formatted string str into ctx->writer, applying the
   width, precision, sign and alternate-form ("#") settings held in arg.

   arg->sign, arg->width and the local length/index counters are updated
   while the pieces (sign, numeric prefix, left padding, characters, right
   padding) are emitted, so the order of the steps below matters.
   The reference to str is not consumed.

   Return 0 on success, -1 on error. */
static int
unicode_format_arg_output(struct unicode_formatter_t *ctx,
                          struct unicode_format_arg_t *arg,
                          PyObject *str)
{
    Py_ssize_t len;
    enum PyUnicode_Kind kind;
    void *pbuf;
    Py_ssize_t pindex;
    Py_UCS4 signchar;
    Py_ssize_t buflen;
    Py_UCS4 maxchar;
    Py_ssize_t sublen;
    _PyUnicodeWriter *writer = &ctx->writer;
    Py_UCS4 fill;

    /* Zero padding is only used for conversions that set arg->sign. */
    fill = ' ';
    if (arg->sign && arg->flags & F_ZERO)
        fill = '0';

    if (PyUnicode_READY(str) == -1)
        return -1;

    len = PyUnicode_GET_LENGTH(str);
    if ((arg->width == -1 || arg->width <= len)
        && (arg->prec == -1 || arg->prec >= len)
        && !(arg->flags & (F_SIGN | F_BLANK)))
    {
        /* Fast path */
        if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
            return -1;
        return 0;
    }

    /* Truncate the string for "s", "r" and "a" formats
       if the precision is set */
    if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
        if (arg->prec >= 0 && len > arg->prec)
            len = arg->prec;
    }

    /* Adjust sign and width */
    kind = PyUnicode_KIND(str);
    pbuf = PyUnicode_DATA(str);
    pindex = 0;
    signchar = '\0';
    if (arg->sign) {
        Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
        if (ch == '-' || ch == '+') {
            /* The sign is emitted separately; skip it in the source. */
            signchar = ch;
            len--;
            pindex++;
        }
        else if (arg->flags & F_SIGN)
            signchar = '+';
        else if (arg->flags & F_BLANK)
            signchar = ' ';
        else
            arg->sign = 0;
    }
    if (arg->width < len)
        arg->width = len;

    /* Prepare the writer */
    maxchar = writer->maxchar;
    if (!(arg->flags & F_LJUST)) {
        /* Account for the fill character only if left padding will
           actually be written. */
        if (arg->sign) {
            if ((arg->width-1) > len)
                maxchar = Py_MAX(maxchar, fill);
        }
        else {
            if (arg->width > len)
                maxchar = Py_MAX(maxchar, fill);
        }
    }
    if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
        Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
        maxchar = Py_MAX(maxchar, strmaxchar);
    }

    /* One extra slot is needed when the sign does not fit in the width. */
    buflen = arg->width;
    if (arg->sign && len == arg->width)
        buflen++;
    if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
        return -1;

    /* Write the sign if needed */
    if (arg->sign) {
        /* With a non-space fill the sign goes before the padding; with
           spaces it is written after the padding, further below. */
        if (fill != ' ') {
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
            writer->pos += 1;
        }
        if (arg->width > len)
            arg->width--;
    }

    /* Write the numeric prefix for "x", "X" and "o" formats
       if the alternate form is used.
       For example, write "0x" for the "%#x" format. */
    if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
        assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
        assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
        if (fill != ' ') {
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
            writer->pos += 2;
            pindex += 2;
        }
        /* The two prefix characters are excluded from width and len in
           both cases; with space fill they are written further below. */
        arg->width -= 2;
        if (arg->width < 0)
            arg->width = 0;
        len -= 2;
    }

    /* Pad left with the fill character if needed */
    if (arg->width > len && !(arg->flags & F_LJUST)) {
        sublen = arg->width - len;
        FILL(writer->kind, writer->data, fill, writer->pos, sublen);
        writer->pos += sublen;
        arg->width = len;
    }

    /* If padding with spaces: write sign if needed and/or numeric prefix if
       the alternate form is used */
    if (fill == ' ') {
        if (arg->sign) {
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
            writer->pos += 1;
        }
        if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
            assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
            assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
            writer->pos += 2;
            pindex += 2;
        }
    }

    /* Write characters */
    if (len) {
        _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
                                      str, pindex, len);
        writer->pos += len;
    }

    /* Pad right if needed; right padding always uses spaces, whatever the
       fill character is */
    if (arg->width > len) {
        sublen = arg->width - len;
        FILL(writer->kind, writer->data, ' ', writer->pos, sublen);
        writer->pos += sublen;
    }
    return 0;
}

/* Helper of PyUnicode_Format(): format one arg.
   Return 0 on success, raise an exception and return -1 on error. */
static int
unicode_format_arg(struct unicode_formatter_t *ctx)
{
    struct unicode_format_arg_t arg;
    PyObject *str;
    int ret;

    /* ctx->fmtpos designates the character following the '%'. */
    arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
    arg.flags = 0;
    arg.width = -1;
    arg.prec = -1;
    arg.sign = 0;
    str = NULL;

    ret = unicode_format_arg_parse(ctx, &arg);
    if (ret == -1)
        return -1;

    ret = unicode_format_arg_format(ctx, &arg, &str);
    if (ret == -1)
        return -1;

    /* ret == 1: the text is already in the writer.  Otherwise str holds a
       new reference that still has to be padded and written. */
    if (ret != 1) {
        ret = unicode_format_arg_output(ctx, &arg, str);
        Py_DECREF(str);
        if (ret == -1)
            return -1;
    }

    if (ctx->dict && (ctx->argidx < ctx->arglen) && arg.ch != '%') {
        PyErr_SetString(PyExc_TypeError,
                        "not all arguments converted during string formatting");
        return -1;
    }
    return 0;
}

/* Format the arguments args according to the format string format, using
   the conversion specifiers handled by unicode_format_arg().

   args may be a tuple of values, a mapping (for "%(name)" items) or a
   single value.

   Return a new str object, or NULL with an exception set on error. */
PyObject *
PyUnicode_Format(PyObject *format, PyObject *args)
{
    struct unicode_formatter_t ctx;

    if (format == NULL || args == NULL) {
        PyErr_BadInternalCall();
        return NULL;
    }

    ctx.fmtstr = PyUnicode_FromObject(format);
    if (ctx.fmtstr == NULL)
        return NULL;
    if (PyUnicode_READY(ctx.fmtstr) == -1) {
        Py_DECREF(ctx.fmtstr);
        return NULL;
    }
    ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
    ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
    /* fmtcnt counts the characters left to read, fmtpos is the read index. */
    ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
    ctx.fmtpos = 0;

    _PyUnicodeWriter_Init(&ctx.writer);
    ctx.writer.min_length = ctx.fmtcnt + 100;
    ctx.writer.overallocate = 1;

    if (PyTuple_Check(args)) {
        ctx.arglen = PyTuple_Size(args);
        ctx.argidx = 0;
    }
    else {
        /* Single, non-tuple argument.  NOTE(review): these sentinel values
           are presumably interpreted by unicode_format_getnextarg(), which
           is not visible in this part of the file. */
        ctx.arglen = -1;
        ctx.argidx = -2;
    }
    ctx.args_owned = 0;
    if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
        ctx.dict = args;
    else
        ctx.dict = NULL;
    ctx.args = args;

    while (--ctx.fmtcnt >= 0) {
        if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
            /* Copy a whole run of literal characters at once. */
            Py_ssize_t nonfmtpos;

            nonfmtpos = ctx.fmtpos++;
            while (ctx.fmtcnt >= 0 &&
                   PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
                ctx.fmtpos++;
                ctx.fmtcnt--;
            }
            if (ctx.fmtcnt < 0) {
                /* The run reached the end of the format string: step back
                   by one and stop overallocating, this is the last write. */
                ctx.fmtpos--;
                ctx.writer.overallocate = 0;
            }

            if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
                                                nonfmtpos, ctx.fmtpos) < 0)
                goto onError;
        }
        else {
            /* Skip the '%' and format one argument. */
            ctx.fmtpos++;
            if (unicode_format_arg(&ctx) == -1)
                goto onError;
        }
    }

    /* With tuple (or single value) arguments, every one must be consumed. */
    if (ctx.argidx < ctx.arglen && !ctx.dict) {
        PyErr_SetString(PyExc_TypeError,
                        "not all arguments converted during string formatting");
        goto onError;
    }

    /* ctx.args is only owned when it was replaced by a "%(name)" lookup. */
    if (ctx.args_owned) {
        Py_DECREF(ctx.args);
    }
    Py_DECREF(ctx.fmtstr);
    return _PyUnicodeWriter_Finish(&ctx.writer);

  onError:
    Py_DECREF(ctx.fmtstr);
    _PyUnicodeWriter_Dealloc(&ctx.writer);
    if (ctx.args_owned) {
        Py_DECREF(ctx.args);
    }
    return NULL;
}

static PyObject *
unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);

/* tp_new slot of str: implements str(object='') and
   str(bytes_or_buffer[, encoding[, errors]]).
   Subtypes are delegated to unicode_subtype_new().
   Return a new reference, or NULL with an exception set. */
static PyObject *
unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
{
    PyObject *x = NULL;
    static char *kwlist[] = {"object", "encoding", "errors", 0};
    char *encoding = NULL;
    char *errors = NULL;

    if (type != &PyUnicode_Type)
        return unicode_subtype_new(type, args, kwds);
    if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
                                     kwlist, &x, &encoding, &errors))
        return NULL;
    /* No object given: return the empty string. */
    if (x == NULL)
        _Py_RETURN_UNICODE_EMPTY();
    /* Without encoding and errors this is a plain str() conversion;
       otherwise the object is decoded. */
    if (encoding == NULL && errors == NULL)
        return PyObject_Str(x);
else 14839 return PyUnicode_FromEncodedObject(x, encoding, errors); 14840} 14841 14842static PyObject * 14843unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds) 14844{ 14845 PyObject *unicode, *self; 14846 Py_ssize_t length, char_size; 14847 int share_wstr, share_utf8; 14848 unsigned int kind; 14849 void *data; 14850 14851 assert(PyType_IsSubtype(type, &PyUnicode_Type)); 14852 14853 unicode = unicode_new(&PyUnicode_Type, args, kwds); 14854 if (unicode == NULL) 14855 return NULL; 14856 assert(_PyUnicode_CHECK(unicode)); 14857 if (PyUnicode_READY(unicode) == -1) { 14858 Py_DECREF(unicode); 14859 return NULL; 14860 } 14861 14862 self = type->tp_alloc(type, 0); 14863 if (self == NULL) { 14864 Py_DECREF(unicode); 14865 return NULL; 14866 } 14867 kind = PyUnicode_KIND(unicode); 14868 length = PyUnicode_GET_LENGTH(unicode); 14869 14870 _PyUnicode_LENGTH(self) = length; 14871#ifdef Py_DEBUG 14872 _PyUnicode_HASH(self) = -1; 14873#else 14874 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode); 14875#endif 14876 _PyUnicode_STATE(self).interned = 0; 14877 _PyUnicode_STATE(self).kind = kind; 14878 _PyUnicode_STATE(self).compact = 0; 14879 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii; 14880 _PyUnicode_STATE(self).ready = 1; 14881 _PyUnicode_WSTR(self) = NULL; 14882 _PyUnicode_UTF8_LENGTH(self) = 0; 14883 _PyUnicode_UTF8(self) = NULL; 14884 _PyUnicode_WSTR_LENGTH(self) = 0; 14885 _PyUnicode_DATA_ANY(self) = NULL; 14886 14887 share_utf8 = 0; 14888 share_wstr = 0; 14889 if (kind == PyUnicode_1BYTE_KIND) { 14890 char_size = 1; 14891 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128) 14892 share_utf8 = 1; 14893 } 14894 else if (kind == PyUnicode_2BYTE_KIND) { 14895 char_size = 2; 14896 if (sizeof(wchar_t) == 2) 14897 share_wstr = 1; 14898 } 14899 else { 14900 assert(kind == PyUnicode_4BYTE_KIND); 14901 char_size = 4; 14902 if (sizeof(wchar_t) == 4) 14903 share_wstr = 1; 14904 } 14905 14906 /* Ensure we won't overflow the length. 
*/ 14907 if (length > (PY_SSIZE_T_MAX / char_size - 1)) { 14908 PyErr_NoMemory(); 14909 goto onError; 14910 } 14911 data = PyObject_MALLOC((length + 1) * char_size); 14912 if (data == NULL) { 14913 PyErr_NoMemory(); 14914 goto onError; 14915 } 14916 14917 _PyUnicode_DATA_ANY(self) = data; 14918 if (share_utf8) { 14919 _PyUnicode_UTF8_LENGTH(self) = length; 14920 _PyUnicode_UTF8(self) = data; 14921 } 14922 if (share_wstr) { 14923 _PyUnicode_WSTR_LENGTH(self) = length; 14924 _PyUnicode_WSTR(self) = (wchar_t *)data; 14925 } 14926 14927 Py_MEMCPY(data, PyUnicode_DATA(unicode), 14928 kind * (length + 1)); 14929 assert(_PyUnicode_CheckConsistency(self, 1)); 14930#ifdef Py_DEBUG 14931 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode); 14932#endif 14933 Py_DECREF(unicode); 14934 return self; 14935 14936onError: 14937 Py_DECREF(unicode); 14938 Py_DECREF(self); 14939 return NULL; 14940} 14941 14942PyDoc_STRVAR(unicode_doc, 14943"str(object='') -> str\n\ 14944str(bytes_or_buffer[, encoding[, errors]]) -> str\n\ 14945\n\ 14946Create a new string object from the given object. 
If encoding or\n\ 14947errors is specified, then the object must expose a data buffer\n\ 14948that will be decoded using the given encoding and error handler.\n\ 14949Otherwise, returns the result of object.__str__() (if defined)\n\ 14950or repr(object).\n\ 14951encoding defaults to sys.getdefaultencoding().\n\ 14952errors defaults to 'strict'."); 14953 14954static PyObject *unicode_iter(PyObject *seq); 14955 14956PyTypeObject PyUnicode_Type = { 14957 PyVarObject_HEAD_INIT(&PyType_Type, 0) 14958 "str", /* tp_name */ 14959 sizeof(PyUnicodeObject), /* tp_size */ 14960 0, /* tp_itemsize */ 14961 /* Slots */ 14962 (destructor)unicode_dealloc, /* tp_dealloc */ 14963 0, /* tp_print */ 14964 0, /* tp_getattr */ 14965 0, /* tp_setattr */ 14966 0, /* tp_reserved */ 14967 unicode_repr, /* tp_repr */ 14968 &unicode_as_number, /* tp_as_number */ 14969 &unicode_as_sequence, /* tp_as_sequence */ 14970 &unicode_as_mapping, /* tp_as_mapping */ 14971 (hashfunc) unicode_hash, /* tp_hash*/ 14972 0, /* tp_call*/ 14973 (reprfunc) unicode_str, /* tp_str */ 14974 PyObject_GenericGetAttr, /* tp_getattro */ 14975 0, /* tp_setattro */ 14976 0, /* tp_as_buffer */ 14977 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE | 14978 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */ 14979 unicode_doc, /* tp_doc */ 14980 0, /* tp_traverse */ 14981 0, /* tp_clear */ 14982 PyUnicode_RichCompare, /* tp_richcompare */ 14983 0, /* tp_weaklistoffset */ 14984 unicode_iter, /* tp_iter */ 14985 0, /* tp_iternext */ 14986 unicode_methods, /* tp_methods */ 14987 0, /* tp_members */ 14988 0, /* tp_getset */ 14989 &PyBaseObject_Type, /* tp_base */ 14990 0, /* tp_dict */ 14991 0, /* tp_descr_get */ 14992 0, /* tp_descr_set */ 14993 0, /* tp_dictoffset */ 14994 0, /* tp_init */ 14995 0, /* tp_alloc */ 14996 unicode_new, /* tp_new */ 14997 PyObject_Del, /* tp_free */ 14998}; 14999 15000/* Initialize the Unicode implementation */ 15001 15002int _PyUnicode_Init(void) 15003{ 15004 /* XXX - move this array to unicodectype.c ? 
*/ 15005 Py_UCS2 linebreak[] = { 15006 0x000A, /* LINE FEED */ 15007 0x000D, /* CARRIAGE RETURN */ 15008 0x001C, /* FILE SEPARATOR */ 15009 0x001D, /* GROUP SEPARATOR */ 15010 0x001E, /* RECORD SEPARATOR */ 15011 0x0085, /* NEXT LINE */ 15012 0x2028, /* LINE SEPARATOR */ 15013 0x2029, /* PARAGRAPH SEPARATOR */ 15014 }; 15015 15016 /* Init the implementation */ 15017 _Py_INCREF_UNICODE_EMPTY(); 15018 if (!unicode_empty) 15019 Py_FatalError("Can't create empty string"); 15020 Py_DECREF(unicode_empty); 15021 15022 if (PyType_Ready(&PyUnicode_Type) < 0) 15023 Py_FatalError("Can't initialize 'unicode'"); 15024 15025 /* initialize the linebreak bloom filter */ 15026 bloom_linebreak = make_bloom_mask( 15027 PyUnicode_2BYTE_KIND, linebreak, 15028 Py_ARRAY_LENGTH(linebreak)); 15029 15030 if (PyType_Ready(&EncodingMapType) < 0) 15031 Py_FatalError("Can't initialize encoding map type"); 15032 15033 if (PyType_Ready(&PyFieldNameIter_Type) < 0) 15034 Py_FatalError("Can't initialize field name iterator type"); 15035 15036 if (PyType_Ready(&PyFormatterIter_Type) < 0) 15037 Py_FatalError("Can't initialize formatter iter type"); 15038 15039 return 0; 15040} 15041 15042/* Finalize the Unicode implementation */ 15043 15044int 15045PyUnicode_ClearFreeList(void) 15046{ 15047 return 0; 15048} 15049 15050void 15051_PyUnicode_Fini(void) 15052{ 15053 int i; 15054 15055 Py_CLEAR(unicode_empty); 15056 15057 for (i = 0; i < 256; i++) 15058 Py_CLEAR(unicode_latin1[i]); 15059 _PyUnicode_ClearStaticStrings(); 15060 (void)PyUnicode_ClearFreeList(); 15061} 15062 15063void 15064PyUnicode_InternInPlace(PyObject **p) 15065{ 15066 PyObject *s = *p; 15067 PyObject *t; 15068#ifdef Py_DEBUG 15069 assert(s != NULL); 15070 assert(_PyUnicode_CHECK(s)); 15071#else 15072 if (s == NULL || !PyUnicode_Check(s)) 15073 return; 15074#endif 15075 /* If it's a subclass, we don't really know what putting 15076 it in the interned dict might do. 
*/ 15077 if (!PyUnicode_CheckExact(s)) 15078 return; 15079 if (PyUnicode_CHECK_INTERNED(s)) 15080 return; 15081 if (interned == NULL) { 15082 interned = PyDict_New(); 15083 if (interned == NULL) { 15084 PyErr_Clear(); /* Don't leave an exception */ 15085 return; 15086 } 15087 } 15088 /* It might be that the GetItem call fails even 15089 though the key is present in the dictionary, 15090 namely when this happens during a stack overflow. */ 15091 Py_ALLOW_RECURSION 15092 t = PyDict_GetItem(interned, s); 15093 Py_END_ALLOW_RECURSION 15094 15095 if (t) { 15096 Py_INCREF(t); 15097 Py_DECREF(*p); 15098 *p = t; 15099 return; 15100 } 15101 15102 PyThreadState_GET()->recursion_critical = 1; 15103 if (PyDict_SetItem(interned, s, s) < 0) { 15104 PyErr_Clear(); 15105 PyThreadState_GET()->recursion_critical = 0; 15106 return; 15107 } 15108 PyThreadState_GET()->recursion_critical = 0; 15109 /* The two references in interned are not counted by refcnt. 15110 The deallocator will take care of this */ 15111 Py_REFCNT(s) -= 2; 15112 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL; 15113} 15114 15115void 15116PyUnicode_InternImmortal(PyObject **p) 15117{ 15118 PyUnicode_InternInPlace(p); 15119 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) { 15120 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL; 15121 Py_INCREF(*p); 15122 } 15123} 15124 15125PyObject * 15126PyUnicode_InternFromString(const char *cp) 15127{ 15128 PyObject *s = PyUnicode_FromString(cp); 15129 if (s == NULL) 15130 return NULL; 15131 PyUnicode_InternInPlace(&s); 15132 return s; 15133} 15134 15135void 15136_Py_ReleaseInternedUnicodeStrings(void) 15137{ 15138 PyObject *keys; 15139 PyObject *s; 15140 Py_ssize_t i, n; 15141 Py_ssize_t immortal_size = 0, mortal_size = 0; 15142 15143 if (interned == NULL || !PyDict_Check(interned)) 15144 return; 15145 keys = PyDict_Keys(interned); 15146 if (keys == NULL || !PyList_Check(keys)) { 15147 PyErr_Clear(); 15148 return; 15149 } 15150 15151 /* Since 
_Py_ReleaseInternedUnicodeStrings() is intended to help a leak 15152 detector, interned unicode strings are not forcibly deallocated; 15153 rather, we give them their stolen references back, and then clear 15154 and DECREF the interned dict. */ 15155 15156 n = PyList_GET_SIZE(keys); 15157 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n", 15158 n); 15159 for (i = 0; i < n; i++) { 15160 s = PyList_GET_ITEM(keys, i); 15161 if (PyUnicode_READY(s) == -1) { 15162 assert(0 && "could not ready string"); 15163 fprintf(stderr, "could not ready string\n"); 15164 } 15165 switch (PyUnicode_CHECK_INTERNED(s)) { 15166 case SSTATE_NOT_INTERNED: 15167 /* XXX Shouldn't happen */ 15168 break; 15169 case SSTATE_INTERNED_IMMORTAL: 15170 Py_REFCNT(s) += 1; 15171 immortal_size += PyUnicode_GET_LENGTH(s); 15172 break; 15173 case SSTATE_INTERNED_MORTAL: 15174 Py_REFCNT(s) += 2; 15175 mortal_size += PyUnicode_GET_LENGTH(s); 15176 break; 15177 default: 15178 Py_FatalError("Inconsistent interned string state."); 15179 } 15180 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED; 15181 } 15182 fprintf(stderr, "total size of all interned strings: " 15183 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d " 15184 "mortal/immortal\n", mortal_size, immortal_size); 15185 Py_DECREF(keys); 15186 PyDict_Clear(interned); 15187 Py_CLEAR(interned); 15188} 15189 15190 15191/********************* Unicode Iterator **************************/ 15192 15193typedef struct { 15194 PyObject_HEAD 15195 Py_ssize_t it_index; 15196 PyObject *it_seq; /* Set to NULL when iterator is exhausted */ 15197} unicodeiterobject; 15198 15199static void 15200unicodeiter_dealloc(unicodeiterobject *it) 15201{ 15202 _PyObject_GC_UNTRACK(it); 15203 Py_XDECREF(it->it_seq); 15204 PyObject_GC_Del(it); 15205} 15206 15207static int 15208unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg) 15209{ 15210 Py_VISIT(it->it_seq); 15211 return 0; 15212} 15213 15214static PyObject * 
15215unicodeiter_next(unicodeiterobject *it) 15216{ 15217 PyObject *seq, *item; 15218 15219 assert(it != NULL); 15220 seq = it->it_seq; 15221 if (seq == NULL) 15222 return NULL; 15223 assert(_PyUnicode_CHECK(seq)); 15224 15225 if (it->it_index < PyUnicode_GET_LENGTH(seq)) { 15226 int kind = PyUnicode_KIND(seq); 15227 void *data = PyUnicode_DATA(seq); 15228 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index); 15229 item = PyUnicode_FromOrdinal(chr); 15230 if (item != NULL) 15231 ++it->it_index; 15232 return item; 15233 } 15234 15235 Py_DECREF(seq); 15236 it->it_seq = NULL; 15237 return NULL; 15238} 15239 15240static PyObject * 15241unicodeiter_len(unicodeiterobject *it) 15242{ 15243 Py_ssize_t len = 0; 15244 if (it->it_seq) 15245 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index; 15246 return PyLong_FromSsize_t(len); 15247} 15248 15249PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it))."); 15250 15251static PyObject * 15252unicodeiter_reduce(unicodeiterobject *it) 15253{ 15254 if (it->it_seq != NULL) { 15255 return Py_BuildValue("N(O)n", _PyObject_GetBuiltin("iter"), 15256 it->it_seq, it->it_index); 15257 } else { 15258 PyObject *u = PyUnicode_FromUnicode(NULL, 0); 15259 if (u == NULL) 15260 return NULL; 15261 return Py_BuildValue("N(N)", _PyObject_GetBuiltin("iter"), u); 15262 } 15263} 15264 15265PyDoc_STRVAR(reduce_doc, "Return state information for pickling."); 15266 15267static PyObject * 15268unicodeiter_setstate(unicodeiterobject *it, PyObject *state) 15269{ 15270 Py_ssize_t index = PyLong_AsSsize_t(state); 15271 if (index == -1 && PyErr_Occurred()) 15272 return NULL; 15273 if (it->it_seq != NULL) { 15274 if (index < 0) 15275 index = 0; 15276 else if (index > PyUnicode_GET_LENGTH(it->it_seq)) 15277 index = PyUnicode_GET_LENGTH(it->it_seq); /* iterator truncated */ 15278 it->it_index = index; 15279 } 15280 Py_RETURN_NONE; 15281} 15282 15283PyDoc_STRVAR(setstate_doc, "Set state information for unpickling."); 15284 
15285static PyMethodDef unicodeiter_methods[] = { 15286 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS, 15287 length_hint_doc}, 15288 {"__reduce__", (PyCFunction)unicodeiter_reduce, METH_NOARGS, 15289 reduce_doc}, 15290 {"__setstate__", (PyCFunction)unicodeiter_setstate, METH_O, 15291 setstate_doc}, 15292 {NULL, NULL} /* sentinel */ 15293}; 15294 15295PyTypeObject PyUnicodeIter_Type = { 15296 PyVarObject_HEAD_INIT(&PyType_Type, 0) 15297 "str_iterator", /* tp_name */ 15298 sizeof(unicodeiterobject), /* tp_basicsize */ 15299 0, /* tp_itemsize */ 15300 /* methods */ 15301 (destructor)unicodeiter_dealloc, /* tp_dealloc */ 15302 0, /* tp_print */ 15303 0, /* tp_getattr */ 15304 0, /* tp_setattr */ 15305 0, /* tp_reserved */ 15306 0, /* tp_repr */ 15307 0, /* tp_as_number */ 15308 0, /* tp_as_sequence */ 15309 0, /* tp_as_mapping */ 15310 0, /* tp_hash */ 15311 0, /* tp_call */ 15312 0, /* tp_str */ 15313 PyObject_GenericGetAttr, /* tp_getattro */ 15314 0, /* tp_setattro */ 15315 0, /* tp_as_buffer */ 15316 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */ 15317 0, /* tp_doc */ 15318 (traverseproc)unicodeiter_traverse, /* tp_traverse */ 15319 0, /* tp_clear */ 15320 0, /* tp_richcompare */ 15321 0, /* tp_weaklistoffset */ 15322 PyObject_SelfIter, /* tp_iter */ 15323 (iternextfunc)unicodeiter_next, /* tp_iternext */ 15324 unicodeiter_methods, /* tp_methods */ 15325 0, 15326}; 15327 15328static PyObject * 15329unicode_iter(PyObject *seq) 15330{ 15331 unicodeiterobject *it; 15332 15333 if (!PyUnicode_Check(seq)) { 15334 PyErr_BadInternalCall(); 15335 return NULL; 15336 } 15337 if (PyUnicode_READY(seq) == -1) 15338 return NULL; 15339 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type); 15340 if (it == NULL) 15341 return NULL; 15342 it->it_index = 0; 15343 Py_INCREF(seq); 15344 it->it_seq = seq; 15345 _PyObject_GC_TRACK(it); 15346 return (PyObject *)it; 15347} 15348 15349 15350size_t 15351Py_UNICODE_strlen(const Py_UNICODE *u) 15352{ 15353 int res 
= 0; 15354 while(*u++) 15355 res++; 15356 return res; 15357} 15358 15359Py_UNICODE* 15360Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2) 15361{ 15362 Py_UNICODE *u = s1; 15363 while ((*u++ = *s2++)); 15364 return s1; 15365} 15366 15367Py_UNICODE* 15368Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n) 15369{ 15370 Py_UNICODE *u = s1; 15371 while ((*u++ = *s2++)) 15372 if (n-- == 0) 15373 break; 15374 return s1; 15375} 15376 15377Py_UNICODE* 15378Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2) 15379{ 15380 Py_UNICODE *u1 = s1; 15381 u1 += Py_UNICODE_strlen(u1); 15382 Py_UNICODE_strcpy(u1, s2); 15383 return s1; 15384} 15385 15386int 15387Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2) 15388{ 15389 while (*s1 && *s2 && *s1 == *s2) 15390 s1++, s2++; 15391 if (*s1 && *s2) 15392 return (*s1 < *s2) ? -1 : +1; 15393 if (*s1) 15394 return 1; 15395 if (*s2) 15396 return -1; 15397 return 0; 15398} 15399 15400int 15401Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n) 15402{ 15403 Py_UNICODE u1, u2; 15404 for (; n != 0; n--) { 15405 u1 = *s1; 15406 u2 = *s2; 15407 if (u1 != u2) 15408 return (u1 < u2) ? 
-1 : +1; 15409 if (u1 == '\0') 15410 return 0; 15411 s1++; 15412 s2++; 15413 } 15414 return 0; 15415} 15416 15417Py_UNICODE* 15418Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c) 15419{ 15420 const Py_UNICODE *p; 15421 for (p = s; *p; p++) 15422 if (*p == c) 15423 return (Py_UNICODE*)p; 15424 return NULL; 15425} 15426 15427Py_UNICODE* 15428Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c) 15429{ 15430 const Py_UNICODE *p; 15431 p = s + Py_UNICODE_strlen(s); 15432 while (p != s) { 15433 p--; 15434 if (*p == c) 15435 return (Py_UNICODE*)p; 15436 } 15437 return NULL; 15438} 15439 15440Py_UNICODE* 15441PyUnicode_AsUnicodeCopy(PyObject *unicode) 15442{ 15443 Py_UNICODE *u, *copy; 15444 Py_ssize_t len, size; 15445 15446 if (!PyUnicode_Check(unicode)) { 15447 PyErr_BadArgument(); 15448 return NULL; 15449 } 15450 u = PyUnicode_AsUnicodeAndSize(unicode, &len); 15451 if (u == NULL) 15452 return NULL; 15453 /* Ensure we won't overflow the size. */ 15454 if (len > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) { 15455 PyErr_NoMemory(); 15456 return NULL; 15457 } 15458 size = len + 1; /* copy the null character */ 15459 size *= sizeof(Py_UNICODE); 15460 copy = PyMem_Malloc(size); 15461 if (copy == NULL) { 15462 PyErr_NoMemory(); 15463 return NULL; 15464 } 15465 memcpy(copy, u, size); 15466 return copy; 15467} 15468 15469/* A _string module, to export formatter_parser and formatter_field_name_split 15470 to the string.Formatter class implemented in Python. 
*/ 15471 15472static PyMethodDef _string_methods[] = { 15473 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split, 15474 METH_O, PyDoc_STR("split the argument as a field name")}, 15475 {"formatter_parser", (PyCFunction) formatter_parser, 15476 METH_O, PyDoc_STR("parse the argument as a format string")}, 15477 {NULL, NULL} 15478}; 15479 15480static struct PyModuleDef _string_module = { 15481 PyModuleDef_HEAD_INIT, 15482 "_string", 15483 PyDoc_STR("string helper module"), 15484 0, 15485 _string_methods, 15486 NULL, 15487 NULL, 15488 NULL, 15489 NULL 15490}; 15491 15492PyMODINIT_FUNC 15493PyInit__string(void) 15494{ 15495 return PyModule_Create(&_string_module); 15496} 15497 15498 15499#ifdef __cplusplus 15500} 15501#endif 15502